Python 从PDF中提取数据并导出到excel_Python_Excel

Python 从PDF中提取数据并导出到excel

python excel

Python 从PDF中提取数据并导出到excel,python,excel,Python,Excel,几个月后，我为自动化下一个过程编写了脚本列出文件夹中的.pdf文件从每个pdf文件中提取数据将提取的数据保存在excel表格中当处理多达15个pdf文件时，脚本可以完美地工作，但如果我尝试使用更多文件，则无法工作。我想第三个过程中会崩溃，但我不能确定我写检查点（打印找到的文件数、打印提取的数据等），但为了能够保存非中断空间数据，我需要输入以下代码： import sys reload(sys) sys.setdefaultencoding('Cp1252') 当我把这行代码放到p

几个月后，我为自动化下一个过程编写了脚本

列出文件夹中的.pdf文件

从每个pdf文件中提取数据

将提取的数据保存在excel表格中

当处理多达15个pdf文件时，脚本可以完美地工作，但如果我尝试使用更多文件，则无法工作。我想第三个过程中会崩溃，但我不能确定

我写检查点（打印找到的文件数、打印提取的数据等），但为了能够保存非中断空间数据，我需要输入以下代码：

import sys
reload(sys)  
sys.setdefaultencoding('Cp1252')

当我把这行代码放到pythonshell中时，我看不到任何东西，所以我不知道脚本何时崩溃

我想可能是关于记忆的，但我需要你的帮助

如果您能检查我的代码并给我建议，我将不胜感激

谢谢

我所有的剧本：

import sys
reload(sys)  
sys.setdefaultencoding('Cp1252')
import os
from glob import glob

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import re
import xlsxwriter
import time




def find_ext(dr, ext):
    return glob(path.join(dr,"*.{}".format(ext)))  

files = [f for f in os.listdir('.') if os.path.isfile(f)]
files = filter(lambda f: f.endswith(('.pdf','.PDF')), files)

def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()
    fstr = ''
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,    password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)

        str = retstr.getvalue()
        fstr += str

    fp.close()
    device.close()
    retstr.close()
    return fstr

fecha_de_hoy =(time.strftime("%d/%m/%Y"))
fecha_de_hoy = re.sub("/", "-", fecha_de_hoy)

# Create a workbook and add a worksheet.
workbook = xlsxwriter.Workbook('Expenses.xlsx')
worksheet = workbook.add_worksheet()



# Start from the first cell. Rows and columns are zero indexed.
row = 0
col = 0


# Iterate over the data and write it out row by row.

worksheet.write(row, col, "FECHA")
worksheet.write(row, col + 1, "CLIENTE")
worksheet.write(row, col + 2, "PROVEEDOR" )
worksheet.write(row, col + 3, "REF. CLIENTE" )
worksheet.write(row, col + 4, "REMITENTE")
worksheet.write(row, col + 5, "DESTINATARIO")
worksheet.write(row, col + 6, "DIRECCION DEST.")
worksheet.write(row, col + 7, "CODIGO POSTAL DEST.")
worksheet.write(row, col + 8, "POBLACION DEST.")
worksheet.write(row, col + 9, "PROVINCIA DEST.")
worksheet.write(row, col + 10, "Nº BULTOS")
worksheet.write(row, col + 11, "PESO")
worksheet.write(row, col + 12, "COSTE")
worksheet.write(row, col + 13, "PVP")
worksheet.write(row, col + 14, "E-mail CONFIRMACIÓN")

row+=1

e = len(files)
lengthlist = e
w=0
print e

while w < lengthlist:

    print w
    print files[w]

    factura = files[w]

    string = convert_pdf_to_txt(factura)


    txtList = convert_pdf_to_txt(factura).splitlines()
    destinatarioIdx, direcionNumIdx, codigoNumIdx, poblacionIdx, provinciaIdx, pedidoIdx, bultosIdx = -1, -1, -1, -1, -1, -1, -1

    for idx, line in enumerate(txtList):
        if line == "Destino MercancÃa":
            destinatarioIdx = idx +1
            direcionNumIdx = idx +2
            codigoNumIdx = idx +3  
            poblacionIdx = idx +3
            provinciaIdx = idx +4


        if line == "NÂº de Pedido":
            pedidoIdx = idx +1

        if "Bultos" in line:
            bultosIdx = idx + 2



    nombre_destinatario = txtList[destinatarioIdx] if destinatarioIdx != -1 else ''
    nombre_destinatario = re.sub("Ã‰", "É", nombre_destinatario)
    direccion_destinatario = txtList[direcionNumIdx] if direcionNumIdx != -1 else ''
    codigo_destinatario = txtList[codigoNumIdx] if codigoNumIdx != -1 else ''
    codigo_destinatario = re.sub("\D", "", codigo_destinatario)
    poblacion_destinatario = txtList[poblacionIdx] if poblacionIdx != -1 else ''
    poblacion_destinatario = re.sub("[0-9]", "", poblacion_destinatario)
    poblacion_destinatario = re.sub(r"\s+", "", poblacion_destinatario, flags=re.UNICODE)
    provincia_destinatario = txtList[provinciaIdx] if provinciaIdx != -1 else ''
    pedido_destinatario = txtList[pedidoIdx] if pedidoIdx != -1 else ''
    bultos_destinatario = txtList[bultosIdx] if bultosIdx != -1 else ''
    bultos_destinatario = re.sub(r"\s+", "", bultos_destinatario, flags=re.UNICODE)

    #ARREGLAR EXCEPCIONES

    '''for idx, line in enumerate(txtList):
        if line == "Destino MercancÃa":
            destinatarioIdx = idx +1
            direcionNumIdx = idx +2
            codigoNumIdx = idx +3

            if codigoNumIdx < 1000:
                direcion1 = idx +2
                direccion2 = idx +3
                direcionNumIdx = (direcion1, direccion2)
                codigoNumIdx = idx +4
                poblacionIdx = idx +4
                provinciaIdx = idx +5'''

    print "Nombre Destinatario"
    print nombre_destinatario
    print "Direccion destinatario"
    print direccion_destinatario
    print "codigo destinatario"
    print codigo_destinatario
    print "poblacion destinatario"
    print poblacion_destinatario
    print "Provincia destinatario"
    print provincia_destinatario
    print "Nº pedido destinatario"
    print pedido_destinatario
    print "Nº bultos envío"
    print bultos_destinatario

    # Iterate over the data and write it out row by row.

    worksheet.write(row, col, fecha_de_hoy)
    worksheet.write(row, col + 1, "SIDAC")
    worksheet.write(row, col + 2, "PROVEEDOR" )
    worksheet.write(row, col + 3, pedido_destinatario )
    worksheet.write(row, col + 4, "SIDAC")
    worksheet.write(row, col + 5, nombre_destinatario)
    worksheet.write(row, col + 6, direccion_destinatario)
    worksheet.write(row, col + 7, codigo_destinatario)
    worksheet.write(row, col + 8, poblacion_destinatario)
    worksheet.write(row, col + 9, provincia_destinatario)
    worksheet.write(row, col + 10, bultos_destinatario)
    worksheet.write(row, col + 11, "PESO")
    worksheet.write(row, col + 12, "COSTE")
    worksheet.write(row, col + 13, "PVP")
    worksheet.write(row, col + 14, "trafico@buendialogistica.com")




    w+=1
    row+=1


workbook.close()

导入系统重新加载（系统） sys.setdefaultencoding（'Cp1252'）导入操作系统从全局导入全局从pdfminer.pdfinterp导入PDFResourceManager、pdfpageexplorer 从pdfminer.converter导入文本转换器从pdfminer.layout导入LAParams 从pdfminer.pdfpage导入pdfpage 从cStringIO导入StringIO 进口稀土导入xlsxwriter 导入时间 def查找分机（dr，分机）：返回glob（path.join（dr，“*.{}”.format（ext））） files=[f表示os.listdir（'.'）中的f，如果os.path.isfile（f）] files=filter（lambda f:f.endswith（（'.pdf'，'.pdf'）），files） def将pdf文件转换为txt文件（路径）： rsrcmgr=PDFResourceManager（） retstr=StringIO（）编解码器='utf-8' laparams=laparams（） device=TextConverter（rsrcmgr、retstr、codec=codec、laparams=laparams） fp=文件（路径“rb”）解释器=PDFPAGE解释器（rsrcmgr，设备） password=“” maxpages=0 缓存=真 pagenos=set（） fstr='' 对于PDFPage.get_页面中的页面（fp，pagenos，maxpages=maxpages，password=password，caching=caching，check_extractable=True）：解释器。处理页面（第页） str=retstr.getvalue（） fstr+=str fp.close（）设备关闭（） retstr.close（）返回fstr fecha_de_hoy=（time.strftime（“%d/%m/%Y”）） fecha_deu_hoy=re.sub（“/”，“-”，fecha_deu_hoy） #创建工作簿并添加工作表。工作簿=xlsxwriter.workbook（'Expenses.xlsx'）工作表=工作簿。添加工作表（） #从第一个单元格开始。行和列的索引为零。行=0 col=0 #迭代数据并逐行写出。工作表。书写（行、列，“FECHA”）工作表。书写（行、列+1，“客户”）工作表。书写（行、列+2，“证明人”）工作表。书写（行、列+3，“参考客户”）工作表。书写（行、列+4，“汇款”）工作表。书写（行、列+5，“目的”）工作表。写入（行、列+6，“目录目的地”）工作表。书写（行、列+7，“CODIGO邮政目的地”）工作表。写（行，列+8，“POBLACION DEST.”）工作表。书写（行、列+9，“省目的地”）工作表。书写（行、列+10，“序号为BULTOS”）工作表。填写（行、列+11，“比索”）工作表。书写（行、列+12，“成本”）工作表。书写（行、列+13，“PVP”）工作表。书写（行、列+14，“电子邮件确认”）行+=1 e=len（文件）长度列表=e w=0 打印e 而w

[excel]相关文章推荐

随机文章推荐