Python 从PDF中提取数据并导出到excel

Python 从PDF中提取数据并导出到excel,python,excel,Python,Excel,几个月后,我为自动化下一个过程编写了脚本 列出文件夹中的.pdf文件 从每个pdf文件中提取数据 将提取的数据保存在excel表格中 当处理多达15个pdf文件时,脚本可以完美地工作,但如果我尝试使用更多文件,则无法工作。我想第三个过程中会崩溃,但我不能确定 我写检查点(打印找到的文件数、打印提取的数据等),但为了能够保存非中断空间数据,我需要输入以下代码: import sys reload(sys) sys.setdefaultencoding('Cp1252') 当我把这行代码放到p

几个月前,我编写了一个脚本来自动化以下流程:

  • 列出文件夹中的.pdf文件
  • 从每个pdf文件中提取数据
  • 将提取的数据保存在excel表格中
  • 当处理的PDF文件不超过15个时,脚本可以完美运行;但如果文件更多,脚本就无法工作。我猜它是在第三步(保存到Excel)时崩溃的,但我不能确定

    我加入了一些检查点(打印找到的文件数、打印提取到的数据等),但为了能够保存包含不间断空格(non-breaking space)的数据,我需要加入以下代码:

    import sys
    reload(sys)  
    sys.setdefaultencoding('Cp1252')
    
    但加入这几行代码之后,Python shell 中就不再显示任何输出,所以我无法知道脚本是在什么时候崩溃的

    我猜可能是内存方面的问题,但我需要你们的帮助

    如果您能检查我的代码并给我建议,我将不胜感激

    谢谢

    我的完整脚本:

    import sys
    reload(sys)  
    sys.setdefaultencoding('Cp1252')
    import os
    from glob import glob
    
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from cStringIO import StringIO
    import re
    import xlsxwriter
    import time
    
    
    
    
    def find_ext(dr, ext):
        """Return the paths of the files in directory *dr* whose extension is *ext*.

        Args:
            dr: Directory to search (not recursive).
            ext: Extension without the leading dot, e.g. ``"pdf"``.

        Returns:
            A list of paths matching ``<dr>/*.<ext>``; empty when nothing matches.
        """
        # The module imports `os`, not `path`, so the join must go through
        # os.path; the bare `path.join` raised NameError on every call.
        return glob(os.path.join(dr, "*.{}".format(ext)))
    
    # Regular files in the current directory whose name ends in .pdf or .PDF.
    files = [
        nombre
        for nombre in os.listdir('.')
        if os.path.isfile(nombre) and nombre.endswith(('.pdf', '.PDF'))
    ]
    
    def convert_pdf_to_txt(path):
        """Extract the text of a PDF file with pdfminer.

        Args:
            path: Filesystem path of the PDF to read.

        Returns:
            The text written by the TextConverter for all processed pages,
            in page order, using the 'utf-8' codec passed to the converter.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)

            # The converter writes every page into the same buffer, so the
            # buffer is read ONCE, after the loop. Reading it inside the loop
            # and appending (as before) repeated the earlier pages on every
            # iteration, so the returned text grew quadratically with the
            # page count.
            return retstr.getvalue()
        finally:
            # Release the converter and the buffer even if a page fails.
            device.close()
            retstr.close()
    
    # Today's date, with the slashes of DD/MM/YYYY swapped for dashes.
    fecha_de_hoy = time.strftime("%d/%m/%Y").replace("/", "-")

    # Create a workbook and add a worksheet.
    workbook = xlsxwriter.Workbook('Expenses.xlsx')
    worksheet = workbook.add_worksheet()

    # Rows and columns are zero indexed; start from the first cell.
    row = 0
    col = 0

    # Header row: one title per column, left to right.
    encabezados = [
        "FECHA",
        "CLIENTE",
        "PROVEEDOR",
        "REF. CLIENTE",
        "REMITENTE",
        "DESTINATARIO",
        "DIRECCION DEST.",
        "CODIGO POSTAL DEST.",
        "POBLACION DEST.",
        "PROVINCIA DEST.",
        "Nº BULTOS",
        "PESO",
        "COSTE",
        "PVP",
        "E-mail CONFIRMACIÓN",
    ]
    for desplazamiento, titulo in enumerate(encabezados):
        worksheet.write(row, col + desplazamiento, titulo)

    # Data rows start on the line below the header.
    row += 1
    
    e = len(files)
    lengthlist = e
    w=0
    print e
    
    while w < lengthlist:
    
        print w
        print files[w]
    
        factura = files[w]
    
        string = convert_pdf_to_txt(factura)
    
    
        txtList = convert_pdf_to_txt(factura).splitlines()
        destinatarioIdx, direcionNumIdx, codigoNumIdx, poblacionIdx, provinciaIdx, pedidoIdx, bultosIdx = -1, -1, -1, -1, -1, -1, -1
    
        for idx, line in enumerate(txtList):
            if line == "Destino Mercancía":
                destinatarioIdx = idx +1
                direcionNumIdx = idx +2
                codigoNumIdx = idx +3  
                poblacionIdx = idx +3
                provinciaIdx = idx +4
    
    
            if line == "Nº de Pedido":
                pedidoIdx = idx +1
    
            if "Bultos" in line:
                bultosIdx = idx + 2
    
    
    
        nombre_destinatario = txtList[destinatarioIdx] if destinatarioIdx != -1 else ''
        nombre_destinatario = re.sub("É", "É", nombre_destinatario)
        direccion_destinatario = txtList[direcionNumIdx] if direcionNumIdx != -1 else ''
        codigo_destinatario = txtList[codigoNumIdx] if codigoNumIdx != -1 else ''
        codigo_destinatario = re.sub("\D", "", codigo_destinatario)
        poblacion_destinatario = txtList[poblacionIdx] if poblacionIdx != -1 else ''
        poblacion_destinatario = re.sub("[0-9]", "", poblacion_destinatario)
        poblacion_destinatario = re.sub(r"\s+", "", poblacion_destinatario, flags=re.UNICODE)
        provincia_destinatario = txtList[provinciaIdx] if provinciaIdx != -1 else ''
        pedido_destinatario = txtList[pedidoIdx] if pedidoIdx != -1 else ''
        bultos_destinatario = txtList[bultosIdx] if bultosIdx != -1 else ''
        bultos_destinatario = re.sub(r"\s+", "", bultos_destinatario, flags=re.UNICODE)
    
        #ARREGLAR EXCEPCIONES
    
        '''for idx, line in enumerate(txtList):
            if line == "Destino Mercancía":
                destinatarioIdx = idx +1
                direcionNumIdx = idx +2
                codigoNumIdx = idx +3
    
                if codigoNumIdx < 1000:
                    direcion1 = idx +2
                    direccion2 = idx +3
                    direcionNumIdx = (direcion1, direccion2)
                    codigoNumIdx = idx +4
                    poblacionIdx = idx +4
                    provinciaIdx = idx +5'''
    
        print "Nombre Destinatario"
        print nombre_destinatario
        print "Direccion destinatario"
        print direccion_destinatario
        print "codigo destinatario"
        print codigo_destinatario
        print "poblacion destinatario"
        print poblacion_destinatario
        print "Provincia destinatario"
        print provincia_destinatario
        print "Nº pedido destinatario"
        print pedido_destinatario
        print "Nº bultos envío"
        print bultos_destinatario
    
        # Iterate over the data and write it out row by row.
    
        worksheet.write(row, col, fecha_de_hoy)
        worksheet.write(row, col + 1, "SIDAC")
        worksheet.write(row, col + 2, "PROVEEDOR" )
        worksheet.write(row, col + 3, pedido_destinatario )
        worksheet.write(row, col + 4, "SIDAC")
        worksheet.write(row, col + 5, nombre_destinatario)
        worksheet.write(row, col + 6, direccion_destinatario)
        worksheet.write(row, col + 7, codigo_destinatario)
        worksheet.write(row, col + 8, poblacion_destinatario)
        worksheet.write(row, col + 9, provincia_destinatario)
        worksheet.write(row, col + 10, bultos_destinatario)
        worksheet.write(row, col + 11, "PESO")
        worksheet.write(row, col + 12, "COSTE")
        worksheet.write(row, col + 13, "PVP")
        worksheet.write(row, col + 14, "trafico@buendialogistica.com")
    
    
    
    
        w+=1
        row+=1
    
    
    workbook.close()
    
    导入系统 重新加载(系统) sys.setdefaultencoding('Cp1252') 导入操作系统 从全局导入全局 从pdfminer.pdfinterp导入PDFResourceManager、pdfpageexplorer 从pdfminer.converter导入文本转换器 从pdfminer.layout导入LAParams 从pdfminer.pdfpage导入pdfpage 从cStringIO导入StringIO 进口稀土 导入xlsxwriter 导入时间 def查找分机(dr,分机): 返回glob(path.join(dr,“*.{}”.format(ext))) files=[f表示os.listdir('.')中的f,如果os.path.isfile(f)] files=filter(lambda f:f.endswith(('.pdf','.pdf')),files) def将pdf文件转换为txt文件(路径): rsrcmgr=PDFResourceManager() retstr=StringIO() 编解码器='utf-8' laparams=laparams() device=TextConverter(rsrcmgr、retstr、codec=codec、laparams=laparams) fp=文件(路径“rb”) 解释器=PDFPAGE解释器(rsrcmgr,设备) password=“” maxpages=0 缓存=真 pagenos=set() fstr='' 对于PDFPage.get_页面中的页面(fp,pagenos,maxpages=maxpages,password=password,caching=caching,check_extractable=True): 解释器。处理页面(第页) str=retstr.getvalue() fstr+=str fp.close() 设备关闭() retstr.close() 返回fstr fecha_de_hoy=(time.strftime(“%d/%m/%Y”)) fecha_deu_hoy=re.sub(“/”,“-”,fecha_deu_hoy) #创建工作簿并添加工作表。 工作簿=xlsxwriter.workbook('Expenses.xlsx') 工作表=工作簿。添加工作表() #从第一个单元格开始。行和列的索引为零。 行=0 col=0 #迭代数据并逐行写出。 工作表。书写(行、列,“FECHA”) 工作表。书写(行、列+1,“客户”) 工作表。书写(行、列+2,“证明人”) 工作表。书写(行、列+3,“参考客户”) 工作表。书写(行、列+4,“汇款”) 工作表。书写(行、列+5,“目的”) 工作表。写入(行、列+6,“目录目的地”) 工作表。书写(行、列+7,“CODIGO邮政目的地”) 工作表。写(行,列+8,“POBLACION DEST.”) 工作表。书写(行、列+9,“省目的地”) 工作表。书写(行、列+10,“序号为BULTOS”) 工作表。填写(行、列+11,“比索”) 工作表。书写(行、列+12,“成本”) 工作表。书写(行、列+13,“PVP”) 工作表。书写(行、列+14,“电子邮件确认”) 行+=1 e=len(文件) 长度列表=e w=0 打印e 而w