Python 从PDF中提取数据并导出到excel
标签:python, excel
几个月后,我编写了一个脚本来自动化以下流程:
1. 列出文件夹中的 .pdf 文件
2. 从每个 PDF 文件中提取数据
3. 将提取的数据保存到 Excel 表格中
当处理不超过 15 个 PDF 文件时,脚本可以完美地工作;但如果文件更多,脚本就无法工作。我猜是在第三步崩溃的,但不能确定。
我加了检查点(打印找到的文件数、打印提取的数据等),但为了能够保存含不间断空格(non-breaking space)的数据,我需要加入以下代码:
import sys
reload(sys)
sys.setdefaultencoding('Cp1252')
当我把这几行代码放到 Python shell 中运行时,看不到任何输出,所以我不知道脚本是在什么时候崩溃的。
我想可能是内存方面的问题,但我需要你们的帮助。
如果您能检查我的代码并给我建议,我将不胜感激
谢谢
我的完整脚本:
import sys
reload(sys)
sys.setdefaultencoding('Cp1252')
import os
from glob import glob
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import re
import xlsxwriter
import time
def find_ext(dr, ext):
    """Return the paths of the entries in directory ``dr`` ending in ``.ext``.

    dr  -- directory to search (not recursive).
    ext -- extension without the leading dot, e.g. "pdf".

    The result is whatever ``glob`` returns for the pattern ``<dr>/*.<ext>``:
    a list of paths prefixed with ``dr``, empty when nothing matches.
    """
    # Only ``os`` is imported by this file, so the join must go through
    # ``os.path``; the unqualified ``path.join`` raised NameError on any call.
    return glob(os.path.join(dr, "*.{}".format(ext)))
# Candidate invoices: regular files in the current working directory whose
# name ends in ".pdf" or ".PDF" (other case mixes are not matched).
files = [
    name
    for name in os.listdir('.')
    if os.path.isfile(name) and name.endswith(('.pdf', '.PDF'))
]
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path`` using pdfminer.

    path -- filesystem path of the PDF to read.

    Returns everything the ``TextConverter`` (created with the 'utf-8' codec)
    wrote into its buffer while the pages were processed, as one string.

    Exceptions raised while opening the file or by pdfminer propagate to the
    caller; the file, the converter and the buffer are closed either way, so
    a failing PDF does not leave handles open during a long batch run.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # ``open`` instead of the Python-2-only ``file`` builtin; the ``with``
        # block closes the handle even when a page fails to parse.
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(
                fp,
                set(),          # no explicit page numbers requested
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # The buffer is cumulative, so it is read once after the last page;
        # appending getvalue() after every page would repeat earlier pages.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
# Main script: for every PDF in ``files`` extract the shipping fields from its
# text and write one row per PDF to 'Expenses.xlsx' (row 0 holds the headers).


def _line_at(lines, idx):
    """Return ``lines[idx]``, or '' when ``idx`` is -1 (marker not found) or
    points past the end of ``lines`` (marker too close to the end of the text).
    """
    if idx == -1 or idx >= len(lines):
        return ''
    return lines[idx]


# Today's date as dd-mm-YYYY (same value as formatting with "/" and then
# replacing every "/" by "-").
fecha_de_hoy = time.strftime("%d-%m-%Y")

# Create a workbook and add a worksheet.
workbook = xlsxwriter.Workbook('Expenses.xlsx')
worksheet = workbook.add_worksheet()

# Start from the first cell. Rows and columns are zero indexed.
row = 0
col = 0

# Header row, one title per column.
headers = [
    "FECHA",
    "CLIENTE",
    "PROVEEDOR",
    "REF. CLIENTE",
    "REMITENTE",
    "DESTINATARIO",
    "DIRECCION DEST.",
    "CODIGO POSTAL DEST.",
    "POBLACION DEST.",
    "PROVINCIA DEST.",
    "Nº BULTOS",
    "PESO",
    "COSTE",
    "PVP",
    "E-mail CONFIRMACIÓN",
]
for offset, title in enumerate(headers):
    worksheet.write(row, col + offset, title)
row += 1

# Checkpoint output: number of PDFs found, then index and name of each one.
print(len(files))
for w, factura in enumerate(files):
    print(w)
    print(factura)

    # Convert each PDF only once; the conversion is the expensive step and the
    # previous second call per file produced a value that was never used.
    txtList = convert_pdf_to_txt(factura).splitlines()

    # Line indexes of the wanted fields; -1 means "marker not found".
    destinatarioIdx = direcionNumIdx = codigoNumIdx = -1
    poblacionIdx = provinciaIdx = pedidoIdx = bultosIdx = -1
    for idx, line in enumerate(txtList):
        # NOTE(review): this marker looks like a mis-encoded
        # "Destino Mercancía"; kept exactly as found -- confirm it matches
        # what pdfminer really returns.
        if line == "Destino MercancÃa":
            destinatarioIdx = idx + 1
            direcionNumIdx = idx + 2
            # Postal code and town are read from the same line and separated
            # below (digits -> postal code, the rest -> town).
            codigoNumIdx = idx + 3
            poblacionIdx = idx + 3
            provinciaIdx = idx + 4
        if line == "Nº de Pedido":
            pedidoIdx = idx + 1
        if "Bultos" in line:
            bultosIdx = idx + 2

    nombre_destinatario = _line_at(txtList, destinatarioIdx)
    # NOTE(review): pattern and replacement are identical as written, so this
    # substitution changes nothing; presumably an encoding repair whose
    # pattern got mangled -- confirm the intended pattern.
    nombre_destinatario = re.sub("É", "É", nombre_destinatario)
    direccion_destinatario = _line_at(txtList, direcionNumIdx)
    # Keep only the digits of the "postal code + town" line.
    codigo_destinatario = re.sub(r"\D", "", _line_at(txtList, codigoNumIdx))
    # Drop the digits, then every whitespace character, from the same line.
    # NOTE(review): this also joins the words of multi-word town names.
    poblacion_destinatario = re.sub("[0-9]", "", _line_at(txtList, poblacionIdx))
    poblacion_destinatario = re.sub(r"\s+", "", poblacion_destinatario, flags=re.UNICODE)
    provincia_destinatario = _line_at(txtList, provinciaIdx)
    pedido_destinatario = _line_at(txtList, pedidoIdx)
    bultos_destinatario = re.sub(r"\s+", "", _line_at(txtList, bultosIdx), flags=re.UNICODE)

    # TODO (was "ARREGLAR EXCEPCIONES" plus a disabled draft): some invoices
    # apparently spread the address over two lines (idx+2 and idx+3), which
    # would move postal code/town to idx+4 and province to idx+5. Not handled.

    # Checkpoint output of the extracted fields: label line, then value line.
    checkpoints = [
        ("Nombre Destinatario", nombre_destinatario),
        ("Direccion destinatario", direccion_destinatario),
        ("codigo destinatario", codigo_destinatario),
        ("poblacion destinatario", poblacion_destinatario),
        ("Provincia destinatario", provincia_destinatario),
        ("Nº pedido destinatario", pedido_destinatario),
        ("Nº bultos envío", bultos_destinatario),
    ]
    for label, value in checkpoints:
        print(label)
        print(value)

    # One spreadsheet row per PDF, in header order. Columns without extracted
    # data carry the same fixed placeholder strings as before.
    values = [
        fecha_de_hoy,
        "SIDAC",
        "PROVEEDOR",
        pedido_destinatario,
        "SIDAC",
        nombre_destinatario,
        direccion_destinatario,
        codigo_destinatario,
        poblacion_destinatario,
        provincia_destinatario,
        bultos_destinatario,
        "PESO",
        "COSTE",
        "PVP",
        "trafico@buendialogistica.com",
    ]
    for offset, value in enumerate(values):
        worksheet.write(row, col + offset, value)
    row += 1

workbook.close()
导入系统
重新加载(系统)
sys.setdefaultencoding('Cp1252')
导入操作系统
从全局导入全局
从pdfminer.pdfinterp导入PDFResourceManager、pdfpageexplorer
从pdfminer.converter导入文本转换器
从pdfminer.layout导入LAParams
从pdfminer.pdfpage导入pdfpage
从cStringIO导入StringIO
进口稀土
导入xlsxwriter
导入时间
def查找分机(dr,分机):
返回glob(path.join(dr,“*.{}”.format(ext)))
files=[f表示os.listdir('.')中的f,如果os.path.isfile(f)]
files=filter(lambda f:f.endswith(('.pdf','.pdf')),files)
def将pdf文件转换为txt文件(路径):
rsrcmgr=PDFResourceManager()
retstr=StringIO()
编解码器='utf-8'
laparams=laparams()
device=TextConverter(rsrcmgr、retstr、codec=codec、laparams=laparams)
fp=文件(路径“rb”)
解释器=PDFPAGE解释器(rsrcmgr,设备)
password=“”
maxpages=0
缓存=真
pagenos=set()
fstr=''
对于PDFPage.get_页面中的页面(fp,pagenos,maxpages=maxpages,password=password,caching=caching,check_extractable=True):
解释器。处理页面(第页)
str=retstr.getvalue()
fstr+=str
fp.close()
设备关闭()
retstr.close()
返回fstr
fecha_de_hoy=(time.strftime(“%d/%m/%Y”))
fecha_deu_hoy=re.sub(“/”,“-”,fecha_deu_hoy)
#创建工作簿并添加工作表。
工作簿=xlsxwriter.workbook('Expenses.xlsx')
工作表=工作簿。添加工作表()
#从第一个单元格开始。行和列的索引为零。
行=0
col=0
#迭代数据并逐行写出。
工作表。书写(行、列,“FECHA”)
工作表。书写(行、列+1,“客户”)
工作表。书写(行、列+2,“证明人”)
工作表。书写(行、列+3,“参考客户”)
工作表。书写(行、列+4,“汇款”)
工作表。书写(行、列+5,“目的”)
工作表。写入(行、列+6,“目录目的地”)
工作表。书写(行、列+7,“CODIGO邮政目的地”)
工作表。写(行,列+8,“POBLACION DEST.”)
工作表。书写(行、列+9,“省目的地”)
工作表。书写(行、列+10,“序号为BULTOS”)
工作表。填写(行、列+11,“比索”)
工作表。书写(行、列+12,“成本”)
工作表。书写(行、列+13,“PVP”)
工作表。书写(行、列+14,“电子邮件确认”)
行+=1
e=len(文件)
长度列表=e
w=0
打印e
而w