Python: Processing Unstructured Data

I built the following data table from PDF, text, and Word files. I used a dictionary-based approach to classify the various headers into a Module. Now I need to extract the values from the sentences and structure the data.

Currently the data looks like this:

header                           Module
ADDITIONAL BONUS                 Payroll Pay Element
EMPLOYEE ID: 4564576             Employee ID
AMOUNT: 1200 USD                 Amount
EMPLOYEE NAME: ANDY CHEN         Employee Name
SPOT AWARD                       Payroll Pay Element
EMPLOYEE ID: 7463453             Employee ID
AMOUNT: 200 USD                  Amount
EMPLOYEE NAME: MICHAEL HISHAM    Employee Name
REALOCATION BONUS                Payroll Pay Element
EMPLOYEE ID: 7467673             Employee ID
AMOUNT: 1400 USD                 Amount
EMPLOYEE NAME: AYMAN HISHAM      Employee Name
REALOCATION BONUS                Payroll Pay Element
EMPLOYEE ID: 7467673             Employee ID
AMOUNT: 1400 USD                 Amount
EMPLOYEE NAME: AYMAN HISHAM      Employee Name
I'm not sure how to produce the desired output in a structured format like this:

Employee ID   Employee Name    Payroll Pay Element   Amount
4564576       ANDY CHEN        ADDITIONAL BONUS      1200 USD
7463453       MICHAEL HISHAM   SPOT AWARD            200 USD
7467673       AYMAN HISHAM     REALOCATION BONUS     1400 USD
7467673       AYMAN HISHAM     REALOCATION BONUS     1400 USD
I used the following code to pull the first table's data from the various documents:

import os
import re
import subprocess

import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize


####################################PATH TO SAVE FILES###########################################
os.chdir(r"C:\Analytics\Microsoft\One_Time_Payment")

###################Converting CVs from various formats to TXT format ###########

path_PDF = r"\pdf"
path_DOC = r"\word"
path_TXT = r"\txt"
path_ERROR=r"\ERROR"

try:
    # convert Word documents to plain text via docto.exe
    with subprocess.Popen([os.path.join(path_DOC, "docto.exe"), '-f', path_DOC, '-O', path_TXT, '-T', 'wdFormatTEXT'], stdout=subprocess.PIPE) as proc:
        print(proc.stdout.read())
except Exception:
    pass


all_pdf = os.listdir(path_PDF)

# convert each PDF to text via pdftotext.exe, preserving the layout
for each_file in all_pdf:
    print("DEL : ", each_file)
    if each_file.endswith('.pdf'):
#        cnd_id = re.findall('\d+', each_file)
        txt_filename = each_file.partition('.')[0] + ".txt"
        subprocess.call([os.path.join(path_PDF, "pdftotext.exe"), '-raw', '-eol', 'dos', '-layout',
                         os.path.join(path_PDF, each_file),
                         os.path.join(path_TXT, txt_filename)])



#####################Processing texts ###########################################

counter = 1

all_text = os.listdir(path_TXT)
#all_text = [os.path.splitext(filename)[0] for filename in os.listdir(path_TXT)]

# only in case an extra file is getting created

#error_file=pd.DataFrame(index=['sno'],columns=['msg'])


for each_file in all_text:

    try:
        print(os.path.join(path_TXT, each_file))
        # sep='\n' forces one line per row into a single column
        inpdata = pd.read_table(os.path.join(path_TXT, each_file), header=None, encoding="ISO-8859-1", sep='\n')
    except Exception:
        try:
            # fall back to the python engine and skip malformed lines
            # (error_bad_lines became on_bad_lines in newer pandas)
            inpdata = pd.read_table(os.path.join(path_TXT, each_file), header=None, engine='python', encoding="ISO-8859-1", sep='\n', error_bad_lines=False)
        except Exception as ec:
            print("Python Engine E ::>> ", ec)
            print("Could not process the file %s" % each_file)
            continue

    inpdata.columns = ["Sentences"]
    inpdata['Sentences'] = inpdata['Sentences'].str.upper()
#    inpdata['Module'] = ''
    inpdata_bck = inpdata
    inpdata_bck['sent_id'] = range(0,len(inpdata_bck))
#    inpdata_bck['cnd_id'] = str(cnd_id)
#    inpdata['num_words'] = 0
    inpdata['CID'] =   each_file.partition('.')[0]
    inpdata.dropna(subset=["Sentences"], inplace=True)
    inpdata = inpdata.reset_index()
    del inpdata['index']

    inpdata_bck.to_csv(each_file + '.csv')
    if (counter==1):
        all_info = inpdata
#        err_file=error_file
#        err_file1=error_file1
    else:
        all_info = pd.concat([all_info,inpdata])
#        err_file=pd.concat([err_file,error_file])
#        err_file1=pd.concat([err_file1,error_file1])
    counter = counter + 1   

# Removing leading special characters to avoid the #NAME import issue
all_info['Sentences'] = all_info['Sentences'].map(lambda x: x.lstrip('+-'))
all_info = all_info.reset_index(drop=True)


# note: sheetname was renamed to sheet_name in newer pandas versions
mod = pd.read_excel(r"Payroll_dict.xlsx", sheetname='Payroll Dictionary')

all_info['num_words'] = 0
all_info['Module'] = ''
all_info = all_info[pd.notnull(all_info['Sentences'])]
all_info = all_info.reset_index(drop=True)

# map each Module to its list of header keywords from the dictionary sheet
categoryDict = {}
modules = list(set(mod['Module']))
for module in modules:
    categoryDict[module] = mod.loc[mod['Module'] == module, 'Header']





# Function for Module assignment
def module_assign(inpdata):
    curr_sent_value = inpdata['Sentences']
    if inpdata['num_words'] <= 0:
        # sentences were upper-cased, so compare against upper-case prefixes
        if curr_sent_value.startswith(("EMPLOYEE ID:", "REFERENCE ID")):
            return 'Employee ID'
        if curr_sent_value.startswith(("AMOUNT:", "AMOUNTS")):
            return 'Amount'
    else:
        # dictionary lookup: the first keyword that matches wins
        for key in categoryDict.keys():
            for word in categoryDict[key]:
                match = re.findall(word, curr_sent_value, re.I | re.M)
                if len(match) > 0:
                    return key

for CID, cid_data in all_info.groupby('CID'):
    # note: cid_data is never used, so every pass reprocesses the full frame
    inpdata = all_info
    inpdata = inpdata.reset_index()
    inpdata['header'] = " "
    inpdata['num_words'] = inpdata['Sentences'].map(lambda x: len(x.split()))
    inpdata['header'] = np.where(inpdata['num_words'] <= 5, inpdata['Sentences'], '')
    inpdata = inpdata[inpdata['num_words'] > 0]
    inpdata['Module'] = inpdata.apply(module_assign, axis=1)


def split_the_sentence_to_words_rem_stopwrds(p_sentence):
    return [token for token in word_tokenize(p_sentence)]


inpdata['split_wrd'] = inpdata['Sentences'].map(split_the_sentence_to_words_rem_stopwrds)

You can pivot the table and strip the strings.
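For a self-contained test, here is a minimal sketch that rebuilds the question's classified table as the DataFrame `df` the snippet below expects (rows copied from the listing above; the column names `header` and `Module` are assumed from it):

import pandas as pd

# Rebuild the sample data: one (header, Module) pair per classified line.
rows = [
    ('ADDITIONAL BONUS', 'Payroll Pay Element'),
    ('EMPLOYEE ID: 4564576', 'Employee ID'),
    ('AMOUNT: 1200 USD', 'Amount'),
    ('EMPLOYEE NAME: ANDY CHEN', 'Employee Name'),
    ('SPOT AWARD', 'Payroll Pay Element'),
    ('EMPLOYEE ID: 7463453', 'Employee ID'),
    ('AMOUNT: 200 USD', 'Amount'),
    ('EMPLOYEE NAME: MICHAEL HISHAM', 'Employee Name'),
    ('REALOCATION BONUS', 'Payroll Pay Element'),
    ('EMPLOYEE ID: 7467673', 'Employee ID'),
    ('AMOUNT: 1400 USD', 'Amount'),
    ('EMPLOYEE NAME: AYMAN HISHAM', 'Employee Name'),
    ('REALOCATION BONUS', 'Payroll Pay Element'),
    ('EMPLOYEE ID: 7467673', 'Employee ID'),
    ('AMOUNT: 1400 USD', 'Amount'),
    ('EMPLOYEE NAME: AYMAN HISHAM', 'Employee Name'),
]
df = pd.DataFrame(rows, columns=['header', 'Module'])

With `df` in place, the pivot below produces the desired shape: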

new_df = df.pivot_table(index=df.Module.eq('Payroll Pay Element').cumsum(),
               columns='Module',
               values='header',
               aggfunc='first')

for col in ['Amount','Employee ID', 'Employee Name']:
    new_df[col] = new_df[col].str[len(col)+1:]

print(new_df)
Output:

Module     Amount Employee ID    Employee Name Payroll Pay Element
Module                                                            
1        1200 USD     4564576        ANDY CHEN    ADDITIONAL BONUS
2         200 USD     7463453   MICHAEL HISHAM          SPOT AWARD
3        1400 USD     7467673     AYMAN HISHAM   REALOCATION BONUS
4        1400 USD     7467673     AYMAN HISHAM   REALOCATION BONUS
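
The index is the key trick: `df.Module.eq('Payroll Pay Element').cumsum()` flags every pay-element row with True, and the running sum turns each block that starts with one into its own group id, which `pivot_table` then folds into a single row. A quick way to see the ids:

# Each 'Payroll Pay Element' row bumps the running total, so all rows of
# one employee block share the same group id.
group_id = df.Module.eq('Payroll Pay Element').cumsum()
print(group_id.tolist())   # [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4]

The strip loop then relies on every cell starting with the upper-cased column name plus a colon: `new_df[col].str[len(col)+1:]` cuts that prefix off, and chaining `.str.strip()` after it would also remove the leftover leading space.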