Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/python/325.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python pdfplumber无法提取相似单元格中多行的文本_Python - Fatal编程技术网

Python pdfplumber无法提取相似单元格中多行的文本

Python pdfplumber无法提取相似单元格中多行的文本,python,Python,我试图从pdf中提取信息,并将其加载到数据框中进行进一步处理。但是,我没有得到“地点”——“丢失地点(邮政编码)”——正确地附加了我想要提取的信息,即单个单元格中的多行。这是pdfplumber的限制还是某个地方的正则表达式出错了 import re import parse import pdfplumber import pandas as pd from collections import namedtuple import os police_report_re = re.compi

我试图从 PDF 中提取信息，并将其加载到数据框（DataFrame）中以便进一步处理。但是，“地点”字段——即“Place of Loss (Postcode)”（出险地点及邮政编码）——没有被正确提取：我想要提取的信息在单个单元格中占了多行。这是 pdfplumber 的限制，还是我的某个正则表达式写错了？

import re
import parse
import pdfplumber
import pandas as pd
from collections import namedtuple
import os

# --- Accident / loss section -------------------------------------------------
# "<label>: <report no> Party At Fault: <party>"; groups 2 and 3 carry the data.
police_report_re = re.compile(r"(.*): (.*) Party At Fault: (.*)")
# Group 1 is the collision counterpart, group 3 the text after the "Damage...:"
# label (group 2 is whatever sits between "Damage" and the colon).
collision_re = re.compile(r"Collision With: (.*) Damage(.*):(.*)")
thirdparty_re = re.compile(
    r"Third Party Involved\? (.*) TP Injury Involved\? (.*)"
)
lost_re = re.compile(r"Description of Accident\/Loss: (.*)")

# Catch-all: a line whose first character is not ">" and whose remaining
# characters (at least one) are only word characters or whitespace.
place_re = re.compile(r"(^[^>][\w\s]+)$", flags=re.MULTILINE)

# --- Vehicle section ---------------------------------------------------------
model_re = re.compile(r"Vehicle Model: (.*)")
engine_re = re.compile(r"Engine No: (.*) Chassis No: (.*)")
jpj_re = re.compile(r"JPJ Log Book No: (.*) JPJ Reg Date \(Man Yr\): (.*)")
assembly_re = re.compile(r"Assembly Type: (.*) Vehicle Class: (.*)")
vehtype_re = re.compile(
    r"Vehicle Type / Colour: (.*) Odometer Reading: (.*)"
)
condition_re = re.compile(
    r"General Condition: (.*) Condition of Damage: (.*)"
)
drive_re = re.compile(r"Vehicle Still Driveable\? (.*)")

file = 'claim.pdf'

# One output row per report.  The field order is the order in which the
# values are handed to the DataFrame.
Line = namedtuple('Line', [
    'pol_rep', 'party_fault', 'collission', 'damage', 'tp', 'injury',
    'accident', 'tempat', 'model', 'engine', 'carchasis', 'jpjlog', 'jpjreg',
    'assemblytype', 'vehclass', 'vehtypecolor', 'odo', 'gencon', 'condmg',
    'driveable',
])

# Each rule is (pattern, field names, regex group numbers, echo template).
# Rules are tried in this order and the first pattern that matches a line
# wins, so the order is significant.
#
# NOTE(review): place_re accepts any line made up only of word characters
# and whitespace, and every such line overwrites 'tempat', so only the last
# one seen before a record is completed is kept.  If the place cell spans
# several lines in the PDF this drops the earlier lines -- confirm against
# the real layout before changing it.
_FIELD_RULES = [
    (police_report_re, ('pol_rep', 'party_fault'), (2, 3),
     'Police Report No: {}  Party At Fault: {}\n'),
    (collision_re, ('collission', 'damage'), (1, 3),
     'Collision With: {}  Damage/Loss Type: {}\n'),
    (thirdparty_re, ('tp', 'injury'), (1, 2),
     'Third Party Involved? {}  TP Injury Involved? {}\n'),
    (lost_re, ('accident',), (1,),
     'Description of Accident/Loss: {}\n'),
    (place_re, ('tempat',), (1,),
     'Place of Loss (Postcode): {}\n'),
    (model_re, ('model',), (1,),
     'Vehicle Model: {}\n'),
    (engine_re, ('engine', 'carchasis'), (1, 2),
     'Engine No: {}  Chasis No: {}\n'),
    (jpj_re, ('jpjlog', 'jpjreg'), (1, 2),
     'JPJ Log Book No: {}  JPJ Reg Date (Man Yr): {}\n'),
    (assembly_re, ('assemblytype', 'vehclass'), (1, 2),
     'Assembly Type: {}  Vehicle Class: {}\n'),
    (vehtype_re, ('vehtypecolor', 'odo'), (1, 2),
     'Vehicle Type / Colour: {}  Odometer Reading: {}\n'),
    (condition_re, ('gencon', 'condmg'), (1, 2),
     'General Condition: {}  Condition of Damage: {}\n'),
    (drive_re, ('driveable',), (1,),
     'Vehicle Still Driveable? {}\n'),
]


def _apply_rules(line, record):
    """Store the fields found in *line* into *record* and echo them.

    Only the first matching rule in ``_FIELD_RULES`` is applied.  Returns
    True when that rule is the "Vehicle Still Driveable?" one, which is the
    last label of a report and therefore completes the record; returns
    False otherwise (including when nothing matched).
    """
    for pattern, names, groups, template in _FIELD_RULES:
        match = pattern.search(line)
        if match is None:
            continue
        values = [match.group(index) for index in groups]
        record.update(zip(names, values))
        print(template.format(*values))
        return pattern is drive_re
    return False


lines = []
total_check = 0

# Fields gathered so far for the report currently being read.
record = {}

with pdfplumber.open(file) as pdf:
    for page in pdf.pages:
        # A page without extractable text may yield no string at all;
        # treat that as an empty page instead of failing on .split().
        text = page.extract_text() or ''
        for line in text.split('\n'):
            if _apply_rules(line, record):
                # Labels that never appeared become None rather than raising
                # NameError or inheriting the previous report's value.
                lines.append(
                    Line(**{name: record.get(name) for name in Line._fields})
                )
                record.clear()

# Passing the column names keeps the header even when no report was found.
df = pd.DataFrame(lines, columns=list(Line._fields))
df.head()