
Python: Trying to parse Word documents and getting PdfReadError: EOF marker not found

Tags: python, python-3.x

I am testing some Python code that loops through a folder of resumes, opens each one, parses it, and builds a comprehensive report from the contents of each resume. Below is the code I am running.

#importing all required libraries

import PyPDF2
import os
from os import listdir
from os.path import isfile, join
from io import StringIO
import pandas as pd
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()
from spacy.matcher import PhraseMatcher

#Function to read resumes from the folder one by one
mypath='C:\\path_to_resumes\\' #enter your path here where you saved the resumes
onlyfiles = [os.path.join(mypath, f) for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]

def pdfextract(file):
    fileReader = PyPDF2.PdfFileReader(open(file,'rb'))
    countpage = fileReader.getNumPages()
    count = 0
    text = []
    while count < countpage:    
        pageObj = fileReader.getPage(count)
        count +=1
        t = pageObj.extractText()
        print (t)
        text.append(t)
    return text

#function to read resume ends


#function that does phrase matching and builds a candidate profile
def create_profile(file):
    text = pdfextract(file) 
    text = str(text)
    text = text.replace("\\n", "")
    text = text.lower()
    #below is the csv where we have all the keywords, you can customize your own
    keyword_dict = pd.read_csv('D:/NLP_Resume/resume/template_new.csv')
    stats_words = [nlp(text) for text in keyword_dict['Statistics'].dropna(axis = 0)]
    NLP_words = [nlp(text) for text in keyword_dict['NLP'].dropna(axis = 0)]
    ML_words = [nlp(text) for text in keyword_dict['Machine Learning'].dropna(axis = 0)]
    DL_words = [nlp(text) for text in keyword_dict['Deep Learning'].dropna(axis = 0)]
    R_words = [nlp(text) for text in keyword_dict['R Language'].dropna(axis = 0)]
    python_words = [nlp(text) for text in keyword_dict['Python Language'].dropna(axis = 0)]
    Data_Engineering_words = [nlp(text) for text in keyword_dict['Data Engineering'].dropna(axis = 0)]

    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('Stats', None, *stats_words)
    matcher.add('NLP', None, *NLP_words)
    matcher.add('ML', None, *ML_words)
    matcher.add('DL', None, *DL_words)
    matcher.add('R', None, *R_words)
    matcher.add('Python', None, *python_words)
    matcher.add('DE', None, *Data_Engineering_words)
    doc = nlp(text)

    d = []  
    matches = matcher(doc)
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[match_id]  # get the unicode ID, i.e. 'COLOR'
        span = doc[start : end]  # get the matched slice of the doc
        d.append((rule_id, span.text))      
    keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i,j in Counter(d).items())

    ## converting string of keywords to dataframe
    df = pd.read_csv(StringIO(keywords),names = ['Keywords_List'])
    df1 = pd.DataFrame(df.Keywords_List.str.split(' ',1).tolist(),columns = ['Subject','Keyword'])
    df2 = pd.DataFrame(df1.Keyword.str.split('(',1).tolist(),columns = ['Keyword', 'Count'])
    df3 = pd.concat([df1['Subject'],df2['Keyword'], df2['Count']], axis =1) 
    df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")"))

    base = os.path.basename(file)
    filename = os.path.splitext(base)[0]

    name = filename.split('_')
    name2 = name[0]
    name2 = name2.lower()
    ## converting str to dataframe
    name3 = pd.read_csv(StringIO(name2),names = ['Candidate Name'])

    dataf = pd.concat([name3['Candidate Name'], df3['Subject'], df3['Keyword'], df3['Count']], axis = 1)
    dataf['Candidate Name'].fillna(dataf['Candidate Name'].iloc[0], inplace = True)

    return(dataf)

#function ends

#code to execute/call the above functions

final_database=pd.DataFrame()
i = 0 
while i < len(onlyfiles):
    file = onlyfiles[i]
    dat = create_profile(file)
    final_database = final_database.append(dat)
    i +=1
    print(final_database)


#code to count words under each category and visualize it through Matplotlib

final_database2 = final_database['Keyword'].groupby([final_database['Candidate Name'], final_database['Subject']]).count().unstack()
final_database2.reset_index(inplace = True)
final_database2.fillna(0,inplace=True)
new_data = final_database2.iloc[:,1:]
new_data.index = final_database2['Candidate Name']
#execute the below line if you want to see the candidate profile in a csv format
#sample2=new_data.to_csv('sample.csv')
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 10})
ax = new_data.plot.barh(title="Resume keywords by category", legend=False, figsize=(25,7), stacked=True)
labels = []
for j in new_data.columns:
    for i in new_data.index:
        label = str(j)+": " + str(new_data.loc[i][j])
        labels.append(label)
patches = ax.patches
for label, rect in zip(labels, patches):
    width = rect.get_width()
    if width > 0:
        x = rect.get_x()
        y = rect.get_y()
        height = rect.get_height()
        ax.text(x + width/2., y + height/2., label, ha='center', va='center')
plt.show()
In the folder, I have '.doc' and '.docx' files. Everything seems to work fine until just below; when the code reaches this point, it throws an error. Here is the troublesome code. The odd thing is that it looks like some kind of PDF error, even though I am only looping through '.doc' and '.docx' files.

final_database=pd.DataFrame()
i = 0 
while i < len(onlyfiles):
    file = onlyfiles[i]
    dat = create_profile(file)
    final_database = final_database.append(dat)
    i +=1
    print(final_database)
Here is the stack trace:

Traceback (most recent call last):

  File "<ipython-input-2-c63fca79d39f>", line 5, in <module>
    dat = create_profile(file)

  File "<ipython-input-1-cdc3bf75cd26>", line 34, in create_profile
    text = pdfextract(file)

  File "<ipython-input-1-cdc3bf75cd26>", line 17, in pdfextract
    fileReader = PyPDF2.PdfFileReader(open(file,'rb'))

  File "C:\Users\ryans\Anaconda3\lib\site-packages\PyPDF2\pdf.py", line 1084, in __init__
    self.read(stream)

  File "C:\Users\ryans\Anaconda3\lib\site-packages\PyPDF2\pdf.py", line 1696, in read
    raise utils.PdfReadError("EOF marker not found")

PdfReadError: EOF marker not found
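
For reference, here is a minimal sketch of the kind of extension guard I am considering, assuming the problem is that PyPDF2 should only ever see genuine PDFs, and that Word files need a separate extractor (docx2txt here is just one possibility, installed separately; it is not part of my current code):

#A minimal sketch of extension-based routing (assumption: docx2txt is
#installed via pip for .docx files; legacy binary .doc would still need another tool)
import os
import PyPDF2
import docx2txt

def extract_text(path):
    ext = os.path.splitext(path)[1].lower()
    if ext == '.pdf':
        #only hand real PDFs to PyPDF2; a .doc/.docx has no %%EOF trailer,
        #which is what triggers "EOF marker not found"
        with open(path, 'rb') as f:
            reader = PyPDF2.PdfFileReader(f)
            return [reader.getPage(i).extractText() for i in range(reader.getNumPages())]
    elif ext == '.docx':
        #docx2txt.process returns the document text as a single string
        return [docx2txt.process(path)]
    else:
        #skip anything neither reader understands
        return []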