自用，一些工具代码。

pd读取excel

1
2
3
4
5
6
7


# Load the workbook into a DataFrame and walk it row by row.
data = pd.read_excel(excel_path)
for row in data.itertuples():
    id = row.uid
    # The column name is non-ASCII, so it is looked up by string.
    text = getattr(row, '文本')
    # Rows whose '文本' cell is missing (NaN) are skipped.
    if not pd.isna(text):
        # Collect every quoted value that follows the "重要" key.
        # NOTE(review): the pattern uses a full-width colon (：) after the
        # key -- confirm that this matches the real data.
        info = re.findall(r'"重要"："([^"]+)",', str(text))

pd输出excel

1
2
3
4
5
6
7
8


def output_excel(outputdata, result_path, title=None):
    """Write *outputdata* to sheet 'Sheet1' of an Excel file.

    Args:
        outputdata: Row data accepted by ``pd.DataFrame`` together with the
            given column headers.
        result_path: Destination path of the workbook.
        title: Column headers. Defaults to ``['姓名', '性别', '年龄']``.
    """
    if title is None:
        title = ['姓名', '性别', '年龄']

    df = pd.DataFrame(outputdata, columns=title)

    # ExcelWriter.save() was removed in pandas 2.0; leaving the ``with``
    # block saves and closes the workbook, also when to_excel raises.
    with pd.ExcelWriter(result_path) as writer:
        df.to_excel(writer, sheet_name='Sheet1', index=False)

读取json

1
2
3


# Parse every entry of data_dir as a UTF-8 encoded JSON document.
files = os.listdir(data_dir)
for jsonfile in files:
    # os.path.join builds a valid path whether or not data_dir ends with a
    # separator; the ``with`` block closes the handle after parsing.
    with open(os.path.join(data_dir, jsonfile), 'r', encoding='utf-8') as json_fp:
        json_data = json.load(json_fp)

输出json

1
2
3


def output_json(outputdata, result_path):
    """Serialize *outputdata* as JSON into the file at *result_path*.

    The file is written as UTF-8 with a 4-space indent, and non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(result_path, mode='w+', encoding='utf-8') as handle:
        json.dump(
            outputdata,
            handle,
            ensure_ascii=False,
            indent=4,
        )

xlrd读取excel

1
2
3
4
5
6
7


import xlrd
# NOTE: xlrd 2.0+ reads only the legacy .xls format; opening an .xlsx
# workbook needs xlrd < 2.0 or a different reader such as openpyxl.
file = xlrd.open_workbook('test.xlsx')
# Fixed typo: the workbook variable is ``file`` (was ``flie`` -> NameError).
sheet = file.sheets()[0]  # first worksheet
rows = sheet.nrows
cols = sheet.ncols
for i in range(rows):
    # Value of the first column in row i.
    val = sheet.cell_value(i, 0)

字典

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11


# Map a key to a list of [val1, val2] pairs.
User = {}
User[id] = [[val1, val2]]

# Build {first column -> set of second-column values} from tab-separated
# lines; both columns are parsed as integers.
userdic = {}
for line in data.readlines():
    fields = line.strip().split('\t')
    key, member = int(fields[0]), int(fields[1])
    userdic.setdefault(key, set()).add(member)

word格式转换（.doc 转 .docx）

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16


from win32com import client as wc

# Convert Word documents to .docx through the Word COM automation object.
word = wc.Dispatch("Word.Application")

try:
    # NOTE(review): file names are listed from ./result_new_2/ but opened
    # from C:\docdir\ -- confirm that both refer to the same directory.
    bookList = os.listdir(r'./result_new_2/')
    for file in bookList:
        #print(file)
        try:
            doc = word.Documents.Open("C:\\docdir\\" + file)
            try:
                # Save a copy with an "x" appended to the name (".doc" ->
                # ".docx"); the argument 12 selects the docx file format.
                doc.SaveAs("{}x".format("C:\\docxdir\\" + file), 12)
            finally:
                # Close the document even when SaveAs fails so it does not
                # stay open inside the Word instance.
                doc.Close()
        except Exception:
            # Best effort: report the file that failed and move on.
            print(file)
finally:
    # Always shut Word down, otherwise the process keeps running.
    word.Quit()
print("完成！")

re

1

# Collect every quoted value that follows a "type" key and precedes a comma.
text = re.compile(r'"type":"([^"]+)",').findall(str(longtext))

以前的笔记

Pandas自用笔记