WHCSRL 技术网

pandas基本操作及pdf转excel的方法

基本操作

import pandas as pd

# Build a DataFrame from a list of dicts: every dict becomes one row and the
# dict keys become the column names.
dict_list = [
    {"name": "Jack", "age": 22, "score": 100},
    {"name": "xuan", "age": 21, "score": 99},
    {"name": "Rose", "age": 18, "score": 60},
]
df1 = pd.DataFrame(dict_list)
print(df1)
#    name  age  score
# 0  Jack   22    100
# 1  xuan   21     99
# 2  Rose   18     60

# A dict of scalars needs an explicit index; the scalar values are repeated
# for every index label.  The variable is named `record` (not `dict`) so the
# builtin stays usable in the rest of the script.
record = {"name": "Jack", "age": 22, "score": 100}
df2 = pd.DataFrame(record, index=[3, 1, 6])
print(df2)
#    name  age  score
# 3  Jack   22    100
# 1  Jack   22    100
# 6  Jack   22    100

# orient="index" turns the dict keys into row labels instead of columns.
df2 = pd.DataFrame.from_dict(record, orient="index", columns=['test'])
print(df2)
#        test
# name   Jack
# age      22
# score   100

# Build a single-column DataFrame from a flat list with custom row labels.
# Named `names` (not `list`) to avoid shadowing the builtin.
names = ['xuan', 'Jack', 'Rose', 'Luxi']
df3 = pd.DataFrame(names, columns=['name'], index=[1, 6, 7, 8])
print(df3)
#    name
# 1  xuan
# 6  Jack
# 7  Rose
# 8  Luxi

# Build a DataFrame from a list of rows.  Because `columns` is passed
# explicitly, the first inner list (['name', 'age']) is stored as an ordinary
# data row rather than being used as the header.
list_list = [
    ['name', 'age'],
    ['xuanRui1', 22],
    ['xuanRui2', 33],
    ['xuanRui3', 44],
]
df4 = pd.DataFrame(data=list_list, columns=['姓名', '年龄'])
print(df4)
#          姓名   年龄
# 0      name  age
# 1  xuanRui1   22
# 2  xuanRui2   33
# 3  xuanRui3   44

# Assigning a list to a new key appends a column.
gender_values = ["无", 'm', 'w', 'm']
df4["性别"] = gender_values
print(df4)
#          姓名   年龄 性别
# 0      name  age  无
# 1  xuanRui1   22  m
# 2  xuanRui2   33  w
# 3  xuanRui3   44  m

# iloc selects by integer position only; column names are not accepted.
first_and_third_columns = df4.iloc[:, [0, 2]]
print(first_and_third_columns)
#          姓名 性别
# 0      name  无
# 1  xuanRui1  m
# 2  xuanRui2  w
# 3  xuanRui3  m
first_and_third_rows = df4.iloc[[0, 2], :]
print(first_and_third_rows)
#          姓名   年龄 性别
# 0      name  age  无
# 2  xuanRui2   33  w

# loc selects by label: column names and index labels.
name_column = df4.loc[:, ["姓名"]]
print(name_column)
# 姓名
# 0      name
# 1  xuanRui1
# 2  xuanRui2
# 3  xuanRui3
rows_two_and_three = df4.loc[[2, 3], :]
print(rows_two_and_three)
#          姓名  年龄 性别
# 2  xuanRui2  33  w
# 3  xuanRui3  44  m

# Replace the default integer index with string labels.
row_labels = ["一", 'er', "san", "si"]
df4.index = row_labels
print(df4)
#            姓名   年龄 性别
# 一        name  age  无
# er   xuanRui1   22  m
# san  xuanRui2   33  w
# si   xuanRui3   44  m

# With string labels in place, loc addresses rows by those labels.
labelled_names = df4.loc[["er", "si"], ["姓名"]]
print(labelled_names)
#          姓名
# er  xuanRui1
# si  xuanRui3
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82

PDF格式转为Excel格式

import pandas as pd
import camelot.io as camelot

def pdfToExcel(pdf_path, max_pages=19, max_tables_per_page=5):
    """Extract tables from a PDF with camelot and save them to an .xlsx file.

    Pages are read one at a time with camelot's ``stream`` flavor.  Every
    extracted table whose shape is not ``(1, 1)`` is written to its own
    worksheet in a workbook placed next to the PDF (same path, ``.xlsx``
    extension).

    Args:
        pdf_path: Path of the PDF file to read.
        max_pages: Highest page number to try (pages ``1..max_pages``).
        max_tables_per_page: Maximum number of tables taken from one page.

    Returns:
        The DataFrame of the last table that was kept, or ``None`` when no
        table was found or an error occurred.  No workbook is created when
        there is nothing to write.
    """
    import os  # local import: keeps the block self-contained in this file

    # splitext only strips the final extension, so a ".pdf" occurring in the
    # middle of the path no longer truncates the output name.
    excel_path = os.path.splitext(pdf_path)[0] + ".xlsx"

    # (sheet name, DataFrame) pairs, collected before the workbook is opened
    # so that an empty workbook is never saved.
    sheets = []
    try:
        for page_num in range(1, max_pages + 1):
            print(page_num)
            try:
                tables = camelot.read_pdf(pdf_path, flavor='stream', pages=str(page_num))
                for table_num in range(min(len(tables), max_tables_per_page)):
                    table = tables[table_num]
                    # Skip 1x1 tables, as the original did.
                    if table.shape != (1, 1):
                        # Page and table number keep sheet names unique when
                        # a page holds several tables.
                        sheets.append((f"数据表{page_num}_{table_num + 1}", table.df))
            except IndexError:
                # Treated as "ran past the last page", as in the original.
                # NOTE(review): confirm camelot raises IndexError here.
                break

        if not sheets:
            return None

        with pd.ExcelWriter(excel_path) as writer:
            for sheet_name, table_df in sheets:
                table_df.to_excel(writer, sheet_name=sheet_name)
    except Exception as exc:
        # Best-effort conversion: report the failure and signal it with None,
        # without swallowing KeyboardInterrupt/SystemExit.
        print("PDF TO EXCEL ERROR ...", exc)
        return None
    return sheets[-1][1]
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
推荐阅读