Python pdfplumber无法提取相似单元格中多行的文本
我试图从 PDF 中提取信息,并将其加载到数据框(DataFrame)中以便进一步处理。但是,“地点”字段——即“Place of Loss (Postcode)”(丢失地点(邮政编码))——没有被正确地拼接上我想要提取的信息,也就是同一个单元格中的多行文本。这是 pdfplumber 的限制,还是某个地方的正则表达式写错了?标签:python。以下是我的代码:
import re
import parse
import pdfplumber
import pandas as pd
from collections import namedtuple
import os
# --- Accident / police-report section ------------------------------------
# "<label>: <report no> Party At Fault: <party>"; the label itself is group 1.
police_report_re = re.compile(r"(.*): (.*) Party At Fault: (.*)")
# Group 2 swallows whatever sits between "Damage" and the next colon.
collision_re = re.compile(r"Collision With: (.*) Damage(.*):(.*)")
thirdparty_re = re.compile(
    r"Third Party Involved\? (.*) TP Injury Involved\? (.*)"
)
lost_re = re.compile(r"Description of Accident\/Loss: (.*)")

# A line that does not start with '>' and otherwise contains only word
# characters and whitespace (no colon, slash or other punctuation).
place_re = re.compile(r'(^[^>][\w\s]+)$', flags=re.MULTILINE)

# --- Vehicle section -----------------------------------------------------
model_re = re.compile(r"Vehicle Model: (.*)")
engine_re = re.compile(r"Engine No: (.*) Chassis No: (.*)")
jpj_re = re.compile(r"JPJ Log Book No: (.*) JPJ Reg Date \(Man Yr\): (.*)")
assembly_re = re.compile(r"Assembly Type: (.*) Vehicle Class: (.*)")
vehtype_re = re.compile(
    r"Vehicle Type / Colour: (.*) Odometer Reading: (.*)"
)
condition_re = re.compile(
    r"General Condition: (.*) Condition of Damage: (.*)"
)
drive_re = re.compile(r"Vehicle Still Driveable\? (.*)")
# Path of the claim PDF whose text is parsed below.
file = 'claim.pdf'

# One row of extracted claim data. The original script called Line(...) without
# ever defining it; the field names and their order mirror that call.
Line = namedtuple('Line', [
    'pol_rep', 'party_fault', 'collission', 'damage', 'tp', 'injury',
    'accident', 'tempat', 'model', 'engine', 'carchasis', 'jpjlog', 'jpjreg',
    'assemblytype', 'vehclass', 'vehtypecolor', 'odo', 'gencon', 'condmg',
    'driveable',
])

lines = []
total_check = 0  # not used by the code below; retained from the original script


def _build_line(record, place_parts):
    """Return a Line built from *record*.

    The collected place-of-loss lines are joined with single spaces into the
    ``tempat`` field, so a cell that spans several text lines is kept whole
    instead of only its last line surviving.
    """
    values = dict(record)
    if place_parts:
        values['tempat'] = ' '.join(part.strip() for part in place_parts)
    return Line(**values)


# Fields of the record currently being assembled. Every field starts as None so
# that a label missing from the PDF yields an empty cell rather than a
# NameError on an unbound variable.
record = dict.fromkeys(Line._fields)
# Every line matched by place_re since the last stored record.
# NOTE(review): place_re accepts any line made only of word characters and
# whitespace, so unrelated label-free lines may be collected here too; confirm
# against the real PDF layout and tighten the pattern if needed.
place_parts = []

with pdfplumber.open(file) as pdf:
    for page in pdf.pages:
        # Guard against a page without extractable text.
        text = page.extract_text() or ''
        for line in text.split('\n'):
            # The patterns are tried in the original order; first match wins.
            match = police_report_re.search(line)
            if match:
                record['pol_rep'] = match.group(2)
                record['party_fault'] = match.group(3)
                print('Police Report No: ' + record['pol_rep'] + ' Party At Fault: ' + record['party_fault'] + '\n')
                continue

            match = collision_re.search(line)
            if match:
                record['collission'] = match.group(1)
                record['damage'] = match.group(3)
                print('Collision With: ' + record['collission'] + ' Damage/Loss Type: ' + record['damage'] + '\n')
                continue

            match = thirdparty_re.search(line)
            if match:
                record['tp'] = match.group(1)
                record['injury'] = match.group(2)
                print('Third Party Involved? ' + record['tp'] + ' TP Injury Involved? ' + record['injury'] + '\n')
                continue

            match = lost_re.search(line)
            if match:
                record['accident'] = match.group(1)
                print('Description of Accident/Loss: ' + record['accident'] + '\n')
                continue

            match = place_re.search(line)
            if match:
                # Accumulate instead of overwriting: the place cell may wrap
                # over several extracted text lines.
                place_parts.append(match.group(1))
                print('Place of Loss (Postcode): ' + match.group(1) + '\n')
                continue

            match = model_re.search(line)
            if match:
                record['model'] = match.group(1)
                print('Vehicle Model: ' + record['model'] + '\n')
                continue

            match = engine_re.search(line)
            if match:
                record['engine'] = match.group(1)
                record['carchasis'] = match.group(2)
                print('Engine No: ' + record['engine'] + ' Chasis No: ' + record['carchasis'] + '\n')
                continue

            match = jpj_re.search(line)
            if match:
                record['jpjlog'] = match.group(1)
                record['jpjreg'] = match.group(2)
                print('JPJ Log Book No: ' + record['jpjlog'] + ' JPJ Reg Date (Man Yr): ' + record['jpjreg'] + '\n')
                continue

            match = assembly_re.search(line)
            if match:
                record['assemblytype'] = match.group(1)
                record['vehclass'] = match.group(2)
                print('Assembly Type: ' + record['assemblytype'] + ' Vehicle Class: ' + record['vehclass'] + '\n')
                continue

            match = vehtype_re.search(line)
            if match:
                record['vehtypecolor'] = match.group(1)
                record['odo'] = match.group(2)
                print('Vehicle Type / Colour: ' + record['vehtypecolor'] + ' Odometer Reading: ' + record['odo'] + '\n')
                continue

            match = condition_re.search(line)
            if match:
                record['gencon'] = match.group(1)
                record['condmg'] = match.group(2)
                print('General Condition: ' + record['gencon'] + ' Condition of Damage: ' + record['condmg'] + '\n')
                continue

            match = drive_re.search(line)
            if match:
                record['driveable'] = match.group(1)
                print('Vehicle Still Driveable? ' + record['driveable'] + '\n')
                # NOTE(review): "Vehicle Still Driveable?" is the last field in
                # the Line(...) call, so it is treated as the end of a record;
                # confirm this matches the PDF layout.
                lines.append(_build_line(record, place_parts))
                # Start clean so values cannot leak into the next record.
                record = dict.fromkeys(Line._fields)
                place_parts = []

# Keep a trailing, partially filled record (e.g. the PDF ended before a
# "Vehicle Still Driveable?" line). Place lines alone do not count as a record.
if any(value is not None for value in record.values()):
    lines.append(_build_line(record, place_parts))

# Passing the columns explicitly keeps the schema stable even with zero rows.
df = pd.DataFrame(lines, columns=list(Line._fields))
df.head()