The task: use Python to pull the tables out of a PDF and write them to Excel. For someone who has only studied Python for about a week, I naturally wanted to see whether it could be done, and it turns out it can, even if the code is a bit rough.
The sample looks roughly like this:
I wrote this quite a while ago and have forgotten some of the details, but the comments were thorough at the time, so I'm just posting the code, partly so I have a backup somewhere in case my computer dies and the code is lost. Extraction with this module is really slow. I've heard that calling the Java program directly makes extraction about three times faster, but I haven't learned Java, so I didn't look into it; if I need this again it would be worth a closer look.
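On the speed point: tabula-py shells out to the tabula-java program under the hood, so every read_pdf call pays the cost of starting a JVM, and the script below calls it once per page. One way to cut that overhead (not something I have benchmarked here) is to ask for all pages in a single call and then loop over the returned tables. A minimal sketch, with 'sample.pdf' as a placeholder path:

from tabula import read_pdf

# One tabula call for the whole file; multiple_tables=True returns a list of
# DataFrames, one per table that tabula detected across the requested pages.
tables = read_pdf('sample.pdf', encoding='gbk', multiple_tables=True, pages='all')
for table in tables:
    print(table.shape)  # each entry could then be fed to data_process2 / data_process1 below

The full script follows.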
# -*- coding: utf-8 -*-
import os
import gc
from PyPDF2 import PdfFileReader
from tabula import read_pdf
import pandas as pd
from openpyxl import load_workbook, Workbook
import datetime
def data_process2(dataframe2):
    '''
    Three steps:
    1. Drop columns that are entirely empty or have at most one non-null cell.
    2. Starting from the first row, merge rows downward until reaching a row that is
       either fully populated or has only its first cell non-null, then move on.
    3. When a row has only its first cell non-null and the row two below it looks the
       same, merge those three rows into one.
    '''
    # This loop drops columns that are all empty or have at most one non-null cell,
    # so they do not interfere with the processing below.
    k = 0
    while True:
        if dataframe2.notnull().sum(axis=0)[k] <= 1:
            print('column %d is (nearly) empty\n' % k, dataframe2.notnull()[k])
            if k + 1 == dataframe2.columns.size:
                dataframe2 = dataframe2.iloc[0:, :k]
            else:
                dataframe_left = dataframe2.iloc[0:, :k]
                dataframe_right = dataframe2.iloc[0:, k + 1:]
                dataframe2 = pd.concat([dataframe_left, dataframe_right], axis=1, ignore_index=True)
            k = k - 1
        if k >= dataframe2.columns.size - 1:
            break
        k = k + 1
    i = 0
    t = 0
    print('after dropping empty columns\n', dataframe2)
    # Replace NaN with empty strings so that concatenating a row containing NaN
    # does not wipe out the whole merged value.
    dataframe2_copy = dataframe2.fillna('', inplace=False)
    # This loop rebuilds the (possibly multi-row) header.
    while True:
        if i == 0:
            if dataframe2.notnull().sum(axis=1)[0] == dataframe2.columns.size:
                break
            if dataframe2.notnull().sum(axis=1)[0] == 1 and dataframe2.notnull().iat[0, 0]:
                break
            i = i + 1
        else:
            if dataframe2.notnull().sum(axis=1)[i] == dataframe2.columns.size:
                t = t + 1
                break
            if dataframe2.notnull().sum(axis=1)[i] == 1 and dataframe2.notnull().iat[i, 0]:
                t = t + 1
                break
            dataframe2_copy.iloc[t] = dataframe2_copy.iloc[t] + dataframe2_copy.iloc[i]
            i = i + 1
        if i >= len(dataframe2):
            t = t + 1
            break
    print('rebuilding the header\n', dataframe2_copy)
    # Drop fully empty rows and reset the index.
    dataframe2_copy.dropna(axis=0, how='all', inplace=True)
    dataframe2_copy = dataframe2_copy.reset_index(drop=True)
    # This loop processes the data rows of the table.
    while i < len(dataframe2):
        if i + 2 >= len(dataframe2):
            for p in range(len(dataframe2) - i):
                dataframe2_copy.iloc[t] = dataframe2_copy.iloc[i + p]
                t = t + 1
            break
        elif dataframe2.notnull().sum(axis=1)[i] == 1 and dataframe2.notnull().iat[i, 0]:
            if dataframe2.notnull().sum(axis=1)[i + 2] == 1 and dataframe2.notnull().iat[i + 2, 0]:
                dataframe2_copy.iloc[t] = dataframe2_copy.iloc[i] + dataframe2_copy.iloc[i + 1] + dataframe2_copy.iloc[i + 2]
                i = i + 3
            elif i + 4 < len(dataframe2):
                if dataframe2.notnull().sum(axis=1)[i + 1] == 1 and dataframe2.notnull().sum(axis=1)[i + 3] == 1 and dataframe2.notnull().sum(axis=1)[i + 4] == 1 and dataframe2.notnull().iat[i + 1, 0] and dataframe2.notnull().iat[i + 3, 0] and dataframe2.notnull().iat[i + 4, 0]:
                    dataframe2_copy.iloc[t] = dataframe2_copy.iloc[i] + dataframe2_copy.iloc[i + 1] + dataframe2_copy.iloc[i + 2] + dataframe2_copy.iloc[i + 3] + dataframe2_copy.iloc[i + 4]
                    i = i + 5
                else:
                    dataframe2_copy.iloc[t] = dataframe2_copy.iloc[i]
                    i = i + 1
            else:
                dataframe2_copy.iloc[t] = dataframe2_copy.iloc[i]
                i = i + 1
        else:
            dataframe2_copy.iloc[t] = dataframe2_copy.iloc[i]
            i = i + 1
        t = t + 1
    print('data from one table\n', dataframe2_copy)
    return dataframe2_copy.iloc[:t]
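# Illustrative helper, not called anywhere in the script: it only shows the
# row-concatenation idea that data_process2 relies on when tabula has split a
# header across several rows. The toy frame is made up, not from the real PDFs.
def _demo_merge_header_rows():
    demo = pd.DataFrame([['Period', 'Principal', 'Interest'],
                         ['No.', '(CNY)', '(CNY)'],
                         ['1', '100.00', '10.00']]).fillna('')
    merged_header = demo.iloc[0] + demo.iloc[1]  # element-wise string concatenation of the two header rows
    print(merged_header)  # 'PeriodNo.', 'Principal(CNY)', 'Interest(CNY)'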
def data_process1(dataframes):
    '''
    Split column values on double spaces and merge the pieces back in.
    Only suitable for tables whose cells are all of str type:
    if a column is not str, the merged result is empty and the data is lost.
    '''
    dataframes.fillna('', inplace=True)
    print('data before processing:\n', dataframes)
    n = 0
    while True:
        try:
            dataframes[n].str.split('  ', expand=True)  # raises if the whole column is non-str, so it can be skipped
            dataframes[n] = dataframes[n].astype('str')  # handle partly non-str columns so their data is not lost
            over_data = dataframes[n].str.split('  ', expand=True)
            over_data.fillna('', inplace=True)
        except Exception:
            print('skipping a non-str column')
            n = n + 1
            if n >= dataframes.columns.size:
                break
            else:
                continue
        print('expanded columns:\n', over_data)
        if n - 1 < 0:
            dataframe_right = dataframes.iloc[0:, n + 1:]
            dataframes = pd.concat([over_data, dataframe_right], axis=1, ignore_index=True)
        elif n + 1 > dataframes.columns.size:
            dataframe_left = dataframes.iloc[0:, :n]
            dataframes = pd.concat([dataframe_left, over_data], axis=1, ignore_index=True)
        else:
            dataframe_left = dataframes.iloc[0:, :n]
            dataframe_right = dataframes.iloc[0:, n + 1:]
            dataframes = pd.concat([dataframe_left, over_data, dataframe_right], axis=1, ignore_index=True)
        n = n + over_data.columns.size
        if n >= dataframes.columns.size:
            break
    print('data after processing:\n', dataframes)
    return dataframes
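# Illustrative helper, not called anywhere in the script: it only shows what the
# double-space split in data_process1 does to a column where tabula has glued two
# source columns together. The toy column is made up, not from the real PDFs.
def _demo_split_on_double_space():
    demo = pd.DataFrame({0: ['2018-01  100.00', '2018-02  200.00']})
    expanded = demo[0].str.split('  ', expand=True)  # the single column becomes two
    print(expanded)  # column 0 holds the dates, column 1 the amounts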
def getCashflowAggregation(dataframe1):
    pass
def pdf_to_xlsx(folder):
    '''
    Extract the table data from every PDF in a folder,
    do a first cleanup pass on the data,
    pick the wanted data out of each dataframe and save it to its own sheet,
    and write an xlsx file with the same name as the PDF.
    '''
    files = os.listdir(folder)
    # Walk the folder and pick out the PDF files.
    pdfFile = [f for f in files if f.endswith('.pdf')]
    for pdfFiles in pdfFile:
        # Create an xlsx file with the same name as the PDF.
        pdfPath = os.path.join(folder, pdfFiles)
        xlsPath = pdfPath[:-3] + 'xlsx'
        # Create the workbook that the extracted tables will be written into;
        # every later save then goes into a different sheet of this workbook.
        book = Workbook()
        book.save(filename=xlsPath)
        # Get the number of pages in the PDF.
        pdf = PdfFileReader(open(pdfPath, 'rb'))
        page_counts = pdf.getNumPages()
        dataframe2 = pd.DataFrame()
        # Go through the PDF page by page and extract the table data.
        for page in range(1, page_counts + 1):
            try:
                pf = read_pdf(pdfPath, encoding='gbk', multiple_tables=True, pages=page)
                if len(pf) != 0:
                    for t in range(len(pf)):
                        dataframe1 = pf[t]
                        dataframe1 = data_process2(dataframe1)  # rebuild the header
                        dataframe1 = data_process1(dataframe1)  # split cells that were glued together, on double spaces
                        # CashflowAggregation = getCashflowAggregation(dataframe1)
                        # Tables with the same number of columns are concatenated,
                        # duplicates dropped, and the result saved.
                        if dataframe2.empty:
                            dataframe2 = dataframe1
                        elif dataframe1.columns.size == dataframe2.columns.size:
                            dataframe2 = pd.concat([dataframe2, dataframe1], ignore_index=True)
                            # Dropping duplicates breaks the pool-distribution matching,
                            # but works well for the static/dynamic pools and the cashflow aggregation.
                            # dataframe2.drop_duplicates(keep='first', inplace=True)  # drop duplicate rows in place
                            print(dataframe2)
                        else:
                            print('number of columns:', dataframe1.columns.size)
                            print(dataframe2)
                            # Save the collected table to its own sheet of the workbook.
                            writer = pd.ExcelWriter(xlsPath, engine='openpyxl')
                            book = load_workbook(writer.path)
                            writer.book = book
                            dataframe2.to_excel(writer, sheet_name='shet')
                            writer.close()
                            dataframe2 = dataframe1
                del(pf)
                gc.collect()
            except Exception:
                gc.collect()
                print('Error Pass')
                continue
        # Save the last collected table to another sheet.
        writer = pd.ExcelWriter(xlsPath, engine='openpyxl')
        book = load_workbook(writer.path)
        writer.book = book
        dataframe2.to_excel(writer, sheet_name='shet')
        writer.close()

start_time = datetime.datetime.now()
pdf_to_xlsx('D:\\2018暑假\\新建文件夾')
stop_time = datetime.datetime.now()
print('program run time:', stop_time - start_time)
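A side note on the ExcelWriter / load_workbook dance used above to append sheets: on a reasonably recent pandas (0.24 or later, if I remember right), openpyxl append mode does the same job without manually attaching the workbook. A minimal sketch, assuming a result.xlsx created earlier in the run (placeholder name) and a stand-in DataFrame:

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})  # stand-in for dataframe2
# mode='a' opens the existing workbook and adds a new sheet instead of overwriting the file.
with pd.ExcelWriter('result.xlsx', engine='openpyxl', mode='a') as writer:
    df.to_excel(writer, sheet_name='page_tables')

Whether that is faster I have not measured; the slow part is the tabula extraction, not the Excel writing.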