自用,一些工具代码。


pd读取excel

1
2
3
4
5
6
7
# Read the workbook at ``excel_path`` (must be defined by the surrounding code)
# and walk it row by row, pulling the "重要" values out of the "文本" column.
# read_excel already returns a DataFrame, so no extra pd.DataFrame() wrapper is needed.
data = pd.read_excel(excel_path)

# Compiled once because the pattern does not change between rows.
important_pattern = re.compile(r'"重要":"([^"]+)",')

for row in data.itertuples():
    # Named ``uid`` rather than ``id`` so the builtin ``id`` is not shadowed.
    uid = getattr(row, 'uid')
    text = getattr(row, '文本')
    # Skip rows whose text cell is empty (NaN).
    if pd.isna(text):
        continue
    # Cell values may not be strings, hence the explicit str() conversion.
    info = important_pattern.findall(str(text))

pd输出excel

1
2
3
4
5
6
7
8
def output_excel(outputdata, result_path):
    """Write ``outputdata`` to an Excel workbook at ``result_path``.

    The rows are written to a sheet named ``Sheet1`` under the fixed header
    姓名 / 性别 / 年龄, without the DataFrame index.

    Args:
        outputdata: Data passed straight to ``pd.DataFrame`` together with the
            three fixed column names (presumably a list of 3-item rows —
            confirm against the caller).
        result_path: Destination path of the workbook.
    """
    title = ['姓名', '性别', '年龄']
    df = pd.DataFrame(outputdata, columns=title)

    # ExcelWriter.save() was removed in pandas 2.0; the context manager
    # saves and closes the workbook, also when to_excel raises.
    with pd.ExcelWriter(result_path) as writer:
        df.to_excel(writer, sheet_name='Sheet1', index=False)

读取json

1
2
3
# Parse every entry of ``data_dir`` (must be defined by the surrounding code)
# as a UTF-8 encoded JSON file.
files = os.listdir(data_dir)
for jsonfile in files:
    # os.path.join works whether or not data_dir ends with a path separator,
    # and the with-block closes the file handle after parsing.
    with open(os.path.join(data_dir, jsonfile), 'r', encoding='utf-8') as fp:
        json_data = json.load(fp)

输出json

1
2
3
def output_json(outputdata, result_path):
    """Serialize ``outputdata`` as JSON into the file at ``result_path``.

    The file is created or truncated and written as UTF-8 with a 4-space
    indent. Non-ASCII characters are written as-is rather than escaped.

    Args:
        outputdata: Any JSON-serializable object.
        result_path: Destination file path.
    """
    # Plain write mode is enough: the handle is never read back.
    with open(result_path, 'w', encoding='utf-8') as file:
        json.dump(outputdata, file, indent=4, ensure_ascii=False)

xlrd读取excel

1
2
3
4
5
6
7
import xlrd

# Open the workbook and read the first column of its first sheet.
# NOTE(review): xlrd 2.0+ only reads the legacy .xls format; opening an .xlsx
# file needs xlrd < 2.0 (or another reader such as pandas/openpyxl) — confirm
# which version is installed.
file = xlrd.open_workbook('test.xlsx')
# The original referenced the misspelled name ``flie`` here, which raised NameError.
sheet = file.sheets()[0]
rows = sheet.nrows
cols = sheet.ncols
for i in range(rows):
    # Value of column 0 in row i.
    val = sheet.cell_value(i, 0)

字典

1
2
3
4
5
6
7
8
9
10
11
# Pattern 1: a dict whose values are lists of [val1, val2] pairs.
# ``id``, ``val1`` and ``val2`` are placeholders that the surrounding code
# is expected to define.
User = dict()
User[id] = []
User[id].append([val1, val2])

# Pattern 2: group the second tab-separated integer of every line under the
# first one, collecting the values in a set so duplicates collapse.
# ``data`` is presumably an open text file — confirm against the caller.
userdic = dict()
for line in data.readlines():
    linestr = line.strip().split('\t')
    tmp1 = int(linestr[0])
    tmp2 = int(linestr[1])
    # setdefault creates the set on first sight of the key and always adds
    # the value, so later values for the same key are not dropped.
    userdic.setdefault(tmp1, set()).add(tmp2)

word批量转换(doc另存为docx)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
from win32com import client as wc

# Batch-convert Word documents to .docx through the Word COM automation
# interface (needs pywin32 and an installed Microsoft Word).
word = wc.Dispatch("Word.Application")

# NOTE(review): the file names are listed from ./result_new_2/ but the
# documents are opened from C:\docdir\ — confirm both refer to the same folder.
bookList = os.listdir(r'./result_new_2/')
try:
    for file in bookList:
        try:
            doc = word.Documents.Open("C:\\docdir\\" + file)
            try:
                # Save a copy with an "x" appended to the name (".doc" -> ".docx");
                # 12 is the docx file format (wdFormatXMLDocument).
                doc.SaveAs("{}x".format("C:\\docxdir\\" + file), 12)
            finally:
                # Close the document even when saving fails.
                doc.Close()
        except Exception as e:
            # Best effort: report the failing file together with the reason
            # and carry on with the next one.
            print(file, e)
finally:
    # Always shut the Word instance down, even if the loop is interrupted.
    word.Quit()
print("完成!")

re

1
# Capture the quoted value of every `"type":"...",` occurrence in the
# stringified ``longtext`` (which the surrounding code must define).
type_pattern = re.compile(r'"type":"([^"]+)",')
text = type_pattern.findall(str(longtext))

以前的笔记

Pandas自用笔记