pandas基本操作及PDF转Excel的方法
基本操作
import pandas as pd
# Walkthrough of basic pandas DataFrame construction and selection.
# The commented lines after each print() show the printed result
# (column-alignment whitespace omitted).

# A list of dicts: each dict becomes one row, the keys become the columns.
dict_list = [
    {"name": "Jack", "age": 22, "score": 100},
    {"name": "xuan", "age": 21, "score": 99},
    {"name": "Rose", "age": 18, "score": 60},
]
df1 = pd.DataFrame(dict_list)
print(df1)
# name age score
# 0 Jack 22 100
# 1 xuan 21 99
# 2 Rose 18 60

# A dict of scalars plus an explicit index: the scalars are repeated for
# every index label. (Named `record` so the builtin `dict` is not shadowed.)
record = {"name": "Jack", "age": 22, "score": 100}
df2 = pd.DataFrame(record, index=[3, 1, 6])
print(df2)
# name age score
# 3 Jack 22 100
# 1 Jack 22 100
# 6 Jack 22 100

# orient="index" turns the dict keys into row labels instead of columns.
df2 = pd.DataFrame.from_dict(record, orient="index", columns=['test'])
print(df2)
# test
# name Jack
# age 22
# score 100

# A flat list becomes a single column. (Named `names` so the builtin `list`
# is not shadowed.)
names = ['xuan', 'Jack', 'Rose', 'Luxi']
df3 = pd.DataFrame(names, columns=['name'], index=[1, 6, 7, 8])
print(df3)
# name
# 1 xuan
# 6 Jack
# 7 Rose
# 8 Luxi

# A list of lists: each inner list is one row.
list_list = [['name', 'age'], ['xuanRui1', 22], ['xuanRui2', 33], ['xuanRui3', 44]]
df4 = pd.DataFrame(list_list, columns=['姓名', '年龄'])
print(df4)
# The columns parameter supplies the column names, so the first inner list
# stays in the frame as an ordinary data row.
# 姓名 年龄
# 0 name age
# 1 xuanRui1 22
# 2 xuanRui2 33
# 3 xuanRui3 44

# Assigning a list to a new column label appends that column.
df4["性别"] = ["无", 'm', 'w', 'm']
print(df4)
# 姓名 年龄 性别
# 0 name age 无
# 1 xuanRui1 22 m
# 2 xuanRui2 33 w
# 3 xuanRui3 44 m

print(df4.iloc[:, [0, 2]])  # iloc: select by integer position; labels are not accepted
# 姓名 性别
# 0 name 无
# 1 xuanRui1 m
# 2 xuanRui2 w
# 3 xuanRui3 m

print(df4.iloc[[0, 2], :])
# 姓名 年龄 性别
# 0 name age 无
# 2 xuanRui2 33 w

print(df4.loc[:, ["姓名"]])  # loc: select by row/column label
# 姓名
# 0 name
# 1 xuanRui1
# 2 xuanRui2
# 3 xuanRui3

print(df4.loc[[2, 3], :])
# 姓名 年龄 性别
# 2 xuanRui2 33 w
# 3 xuanRui3 44 m

# Replace the default integer index with custom labels.
df4.index = ["一", 'er', "san", "si"]
print(df4)
# 姓名 年龄 性别
# 一 name age 无
# er xuanRui1 22 m
# san xuanRui2 33 w
# si xuanRui3 44 m

print(df4.loc[["er", "si"], ["姓名"]])
# 姓名
# er xuanRui1
# si xuanRui3
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
PDF格式转为Excel格式
import pandas as pd
import camelot.io as camelot
def pdfToExcel(pdf_path, max_pages=19, max_tables_per_page=5):
    """Extract tables from a PDF with camelot and save them to an .xlsx file.

    Pages 1..max_pages are read one at a time with camelot's "stream"
    flavor. Every table whose shape is not (1, 1) is written to its own
    sheet of a workbook stored next to the PDF (same path with the
    trailing ".pdf" replaced by ".xlsx"). The workbook is only created
    when at least one table was kept, so no empty file is produced.

    Args:
        pdf_path: Path of the PDF file to read.
        max_pages: Highest page number to try (default 19).
        max_tables_per_page: Maximum number of tables taken from one page
            (default 5).

    Returns:
        The DataFrame of the last table kept, or None when no table was
        found or when any step failed (the error is printed).
    """
    # Replace only a trailing ".pdf"; splitting on ".pdf" would cut the path
    # at its first occurrence, e.g. inside a directory name.
    if pdf_path.endswith(".pdf"):
        excel_path = pdf_path[:-len(".pdf")] + ".xlsx"
    else:
        excel_path = pdf_path + ".xlsx"

    sheets = []      # (sheet name, DataFrame) pairs in extraction order
    table_df = None  # stays None when no table is found
    try:
        for page_num in range(1, max_pages + 1):
            print(page_num)
            try:
                tables = camelot.read_pdf(pdf_path, flavor='stream', pages=str(page_num))
            except IndexError:
                # NOTE(review): an IndexError here is treated as "page number
                # past the end of the document" -- confirm against the
                # installed camelot version.
                break
            for table_num in range(max_tables_per_page):
                try:
                    table = tables[table_num]
                except IndexError:
                    # Fewer tables on this page than the cap.
                    break
                # Tables of shape (1, 1) are skipped.
                if table.shape != (1, 1):
                    table_df = table.df
                    # Page and table number keep sheet names unique when a
                    # page holds several tables.
                    sheets.append((f"数据表{page_num}_{table_num + 1}", table_df))
        if sheets:
            with pd.ExcelWriter(excel_path) as writer:
                for sheet_name, frame in sheets:
                    frame.to_excel(writer, sheet_name=sheet_name)
    except Exception as exc:
        # Best-effort conversion: report the cause and signal failure with None.
        print("PDF TO EXCEL ERROR ...", exc)
        return None
    return table_df
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
推荐阅读