在Python中快速将多个HTML文件转换为CSV文件

在Python中快速将多模块HTML转换为CSV文件,python,pandas,csv,Python,Pandas,Csv,我需要从多模块html文件中提取数据,并将其转换为单个csv文件。由于html页面是完全非结构化的,因此任务变得单调乏味。完成任务后,1、2……的测试运行顺利。。。。10个文件,但之后,它开始需要很长时间。对于100多个文件,它几乎崩溃。我试了340个文件。它工作了,但至少花了3个小时,最后机器挂断了。粘贴下面的完整代码,并附上html文件示例(源代码)。有没有更好的处理方法? 注意:我已经检查过了,没有多大帮助。谢谢 import os from bs4 import BeautifulSou

我需要从多个HTML文件中提取数据,并将其转换为单个CSV文件。由于这些HTML页面完全没有结构,这项任务变得十分繁琐。代码写完后,用1个、2个……直到10个文件测试都运行顺利,但文件再多一些,耗时就开始变得很长。处理100多个文件时,程序几乎崩溃。我试过340个文件:能够跑完,但至少花了3个小时,最后机器还卡死了。下面贴出完整代码,并附上HTML文件示例(源代码)。有没有更好的处理方法? 注意:我已经查阅过相关的问答,但帮助不大。谢谢

import os
from bs4 import BeautifulSoup as bs
import pandas as pd
import glob
import datetime

# Directory that is searched recursively for ``*.html`` case pages.
root_dir = r'/home/some path'

# Column order of the CSV file that is written at the end of the script.
output_columns = [
    'CNR Number', 'Filing Number', 'Filing Date', 'First Hearing',
    'Next Hearing', 'Stage of Case', 'Registration Number', 'Year',
    'FIR Number', 'Police Station', 'Court Number and Judge', 'PoA', 'IPC',
    'PCR', 'PCSO', 'Any Other Act', 'Name of the Petitioner',
    'Name of the Advocate', 'Name of the Respondent',
]

# Output column -> lower-case phrase that identifies the act in the act table.
act_patterns = (
    ('IPC', 'indian penal code'),
    ('PoA', 'prevention of atrocities'),
    ('PCSO', 'protection of children from sexual'),
    ('PCR', 'protection of civil rights'),
)

# Errors raised when the positional ``.next`` navigation walks off the
# expected page layout (attribute access on None or on a plain string).
navigation_errors = (AttributeError, TypeError)


def to_plain_text(node):
    """Return ``node`` as a built-in ``str``; ``None`` is passed through.

    Beautiful Soup strings and tags keep a reference to the whole parse tree.
    Storing them in ``all_list`` would keep every parsed page in memory, so
    each stored value is converted to a plain string first.
    """
    if node is None:
        return None
    return str(node)


all_list = []
for newFile in glob.glob(os.path.join(root_dir, '**/*.html'), recursive=True):
    dictionary = {}
    # create soup; the ``with`` block closes the file as soon as it is parsed.
    with open(newFile) as openFile:
        soup = bs(openFile, 'html.parser')

    # section 1: Case Details
    # The four keys are only stored when the whole navigation succeeds.
    try:
        caseType = soup.find('span', {'class': 'case_details_table'})
        caseTypeChild = caseType.findChild()
        # ref for .next - https://stackoverflow.com/questions/5999407/extract-content-within-a-tag-with-beautifulsoup
        sessionsCase = caseTypeChild.next.next.next
        filing = sessionsCase.next.next
        filingNumberHeading = filing.find('label')
        filingNumber = filingNumberHeading.next.next
        filingDate = filingNumber.next.next.next.next
        registration = filingDate.next.next
        registrationNumberHeading = registration.find('label')
        registrationNumber = registrationNumberHeading.next.next.next
        cnrHeading = soup.find('b').find('label')
        cnrNumber = cnrHeading.next.next
        dictionary['Filing Number'] = to_plain_text(filingNumber)
        dictionary['Filing Date'] = to_plain_text(filingDate)
        dictionary['Registration Number'] = to_plain_text(registrationNumber)
        dictionary['CNR Number'] = to_plain_text(cnrNumber)
    except navigation_errors:
        pass

    # section 2: Case Status
    # Keys are stored one by one, so a failure keeps the values found so far.
    try:
        firstHearing = soup.find('strong')
        dictionary['First Hearing'] = firstHearing.next_sibling.text
        nextHearing = soup.find('strong', text='Next Hearing Date')
        dictionary['Next Hearing'] = nextHearing.next_sibling.text
        stageOfCase = soup.find('strong', text='Stage of Case')
        dictionary['Stage of Case'] = stageOfCase.next_sibling.text
        courtNumber = soup.find('strong', text='Court Number and Judge')
        dictionary['Court Number and Judge'] = (
            courtNumber.next_sibling.next_sibling.text.strip())
    except navigation_errors:
        pass

    # section 6: FIR Details
    # NOTE(review): the keys are the heading nodes found in the page, not
    # fixed names; they only reach the CSV if their text equals an entry of
    # ``output_columns`` exactly - confirm against a sample page.
    try:
        policeStationHeading = soup.find(
            'span', attrs={'class': 'FIR_details_table'}).next.next
        policeStation = policeStationHeading.next.next.next.next
        firnumberHeading = policeStation.next.next.next
        firNumber = policeStation.find_next('label').next
        firYearHeading = firNumber.next.next.next
        firYear = firNumber.find_next('span').find_next('label').next
        dictionary[to_plain_text(policeStationHeading)] = to_plain_text(policeStation)
        dictionary[to_plain_text(firnumberHeading)] = to_plain_text(firNumber)
        dictionary[to_plain_text(firYearHeading)] = to_plain_text(firYear)
    except navigation_errors:
        pass

    # section 3: Petitioner and Advocate
    try:
        petitioner = soup.find('span', attrs={'class': 'Petitioner_Advocate_table'})
        petitionerName = petitioner.next
        dictionary['Name of the Petitioner'] = to_plain_text(petitionerName)
        petitionerAdvocate = petitionerName.next.next
        dictionary['Name of the Advocate'] = to_plain_text(petitionerAdvocate)
        # section 4: Respondent and Advocate
        respondent = petitionerAdvocate.find_next('span')
        respondentName = respondent.next
        dictionary['Name of the Respondent'] = to_plain_text(respondentName)
    except navigation_errors:
        pass

    # section 5: Acts
    # Every act column starts as 'Not Applied'. Each row of the act table is
    # then compared with the known act phrases and, on a match, the row's
    # sections replace the default value of the matching column.
    acts = soup.select('#act_table td:nth-of-type(1)')
    sections = soup.select('#act_table td:nth-of-type(2)')
    dictionary['IPC'] = 'Not Applied'
    dictionary['PoA'] = 'Not Applied'
    dictionary['PCSO'] = 'Not Applied'
    dictionary['PCR'] = 'Not Applied'
    dictionary['Any Other Act'] = 'Not Applied'

    # Looping over the rows handles any number of acts and never touches a
    # variable left over from a previous file.
    for act_cell, section_cell in zip(acts, sections):
        act_text = str(tuple(act_cell.contents)).lower()
        for column, phrase in act_patterns:
            if phrase in act_text:
                dictionary[column] = tuple(
                    str(part) for part in section_cell.contents)
                break

    all_list.append(dictionary)

df = pd.DataFrame(all_list)
# ``reindex`` keeps the requested column order and creates empty columns for
# names that no page produced, where plain ``df[[...]]`` raises KeyError.
df = df.reindex(columns=output_columns)

# Read the clock once so the three date parts cannot straddle midnight.
now = datetime.datetime.now()
output_path = '/home/some path name/ file' + '{}_{}_{}'.format(
    now.day, now.month, now.year) + '.csv'
df.to_csv(output_path)
导入操作系统
从bs4导入BeautifulSoup作为bs
作为pd进口熊猫
导入glob
导入日期时间
root_dir=r'/home/some path'
所有_列表=[]
对于glob.glob(os.path.join(root_dir,***.html')中的新文件,recursive=True):
字典={}
#制作汤。
openFile=open(newFile)
soup=bs(openFile'html.parser')
#第一节:个案详情
尝试:
caseType=soup.find('span',{'class':'case\u details\u table'})
caseTypeChild=caseType.findChild()
#参考下一页-https://stackoverflow.com/questions/5999407/extract-content-within-a-tag-with-beautifulsoup
sessioncase=caseTypeChild.next.next.next
归档=会话case.next.next
FilingNumberReading=fileing.find('标签')
filingNumber=FilingNumberReading.next.next
filingDate=filingNumber.next.next.next.next.next
注册=提交日期。下一步。下一步
registrationNumberHeading=registration.find('label'))
registrationNumber=registrationNumberHeading.next.next.next
cnrHeading=soup.find('b')。find('label'))
cnrNumber=cnrHeading.next.next
字典['filingNumber']=文件编号
字典[‘提交日期]]=提交日期
字典['Registration Number']=注册号
字典['CNR编号']=CNR编号
除:
通过
#第2节:案件状况
尝试:
firstHearing=soup.find('strong')
firstHearingDate=firstHearing.next\u sibling.text
字典['First Hearing']=firstHearingDate
nextHearing=soup.find('strong',text='Next-harding Date')
nextHearingDate=nextHearing.next\u sibling.text
字典['Next Hearing']=nextHearingDate
stageOfCase=soup.find('strong',text='Stage of Case')
stageOfCaseText=stageOfCase.next\u sibling.text
字典['Stage of Case']=stageOfCaseText
courtNumber=soup.find('strong',text='Court Number and judget')
courtNumberText=courtNumber.next\u sibling.next\u sibling.text.strip()
字典[“法院号码和法官”]=法院号码文本
除:
通过
#第6节:详细信息
尝试:
policyStationHeading=soup.find('span',attrs={'class':'FIR\u details\u table'})
警察站=警察站heading.next.next.next.next
firnumberHeading=policeStation.next.next.next
firNumber=警察局。查找下一个(“标签”)。下一个
firYearHeading=firNumber.next.next.next
firYear=firNumber.find_next('span')。find_next('label')。next
#与前几节相同。
字典[警察站标题]=警察站
字典[firnumberHeading]=firNumber
字典[firYearHeading]=firYear
除:
通过
#第3节:小律师和辩护律师
尝试:
请愿者=soup.find('span',attrs={'class':'请愿者\倡导者\表格'})
请愿人姓名=请愿人
字典[‘申请人姓名]]=申请人姓名
请愿人地址=请愿人姓名.next.next
字典[‘倡导者的名字]]=请愿者建议
#第4节:被告和辩护人
答辩人=请愿人建议。查找下一个(“span”)
respondentName=响应者。下一步
字典[‘响应者的姓名]]=响应者姓名
除:
通过
#第5节:法案
“在第1节中。汤是从网页的act_table选项卡准备的
2.主字典的键被创建来定义ACT的标题。具有“未应用”值。
3.将为act的名称创建简短形式的变量。
4.acts列表与变量列表进行比较,并将部分替换为字典中的值。“”
acts=汤。选择(“#act_表td:n类型(1)”)
截面=汤。选择(“#act#U表td:n类型(2)”)
字典['IPC']=“未应用”
字典['PoA']=“未应用”
字典['PCSO']=“未应用”
字典['PCR']=“未应用”
字典[“任何其他行为”]=“不适用”
ipc=‘印度刑法’
poa=‘防止暴行’
pcso=‘保护儿童免受性侵犯’
pcr=‘公民权利保护’
尝试:
act1=元组(acts[0]。内容)
sections1=元组(节[0]。内容)
string=str(act1)
除:
通过
尝试:
act2=元组(acts[1]。内容)
sections2=元组(sections[1]。内容)
除:
通过
尝试:
act3=元组(acts[2]。内容)
sections3=元组(sections[2]。内容)
除:
通过
尝试:
act4=元组(acts[3]。内容)
sections4=元组(sections[3]。目录)
除:
通过
#使用if和not for循环则不需要actSession
#名单上的第一幕
如果len(acts)<2:
如果ipc在string.lower()中:
字典['IPC']=章节1
elif poa在string.lower()中:
字典['PoA']=章节1
elif pcso在string.lower()中:
字典['PCSO']=章节1
elif pcr在string.lower()中:
字典['PCR']=章节1
其他:
通过
#名单中的第二幕
elif len(acts)=2:
如果ipc在string.lower()中:
字典['IPC']=章节1
elif poa在string.lower()中:
措辞
(?=...) 
# Append the row of the current case to ``output.csv`` right away instead of
# collecting all cases in memory first. ``dictionary`` is the per-file result
# built by the parsing code.
import csv

columns = ['CNR Number', 'Filing Number', 'Filing Date', 'First Hearing', 'Next Hearing', 'Stage of Case', 'Registration Number', 'Year', 'FIR Number', 'Police Station', 'Court Number and Judge',  'PoA', 'IPC', 'PCR', 'PCSO', 'Any Other Act', 'Name of the Petitioner', 'Name of the Advocate', 'Name of the Respondent']
# A dict cannot be indexed with a list of keys; look each column up
# separately and leave the cell empty when the page did not provide it.
one_case = [dictionary.get(column, '') for column in columns]
# ``csv.writer`` quotes values that contain commas or line breaks, which a
# hand-built comma-joined string would not.
with open('output.csv', 'a', newline='') as outputFile:
    csv.writer(outputFile).writerow(one_case)
import os
from bs4 import BeautifulSoup as bs
import pandas as pd
import glob
import datetime

# Ordered column names for the CSV output.
csv_header = [
    'CNR Number',
    'Filing Number',
    'Filing Date',
    'First Hearing',
    'Next Hearing',
    'Stage of Case',
    'Registration Number',
    'Year',
    'FIR Number',
    'Police Station',
    'Court Number and Judge',
    'PoA',
    'IPC',
    'PCR',
    'PCSO',
    'Any Other Act',
    'Name of the Petitioner',
    'Name of the Advocate',
    'Name of the Respondent',
]

# Directory that is searched recursively for ``*.html`` case files.
root_dir = r'/home/some path'

def convert_html_case_files_to_csv():
    """Write one CSV row per HTML case file found below ``root_dir``.

    The output file name is built from the current day, month and year. Each
    file is parsed with ``process_case_file`` and its row is written
    immediately, so only one parsed case is held at a time.
    """
    # Local import: the surrounding import block does not import ``csv``.
    import csv

    # Read the clock once so the three date parts cannot straddle midnight.
    now = datetime.datetime.now()
    output_filename = '/home/some path name/ file' + '{}_{}_{}'.format(
        now.day, now.month, now.year) + '.csv'

    with open(output_filename, 'w', newline='') as f:
        # Rows may carry keys that are not CSV columns (the FIR section uses
        # page headings as keys); 'ignore' drops them instead of raising
        # ValueError. Columns missing from a row are written as empty cells.
        writer = csv.DictWriter(f, csv_header, extrasaction='ignore')
        writer.writeheader()
        for newFile in glob.glob(os.path.join(root_dir, '**/*.html'), recursive=True):
            writer.writerow(process_case_file(newFile))

def _plain_text(node):
    """Return ``node`` as a built-in ``str``; ``None`` is passed through."""
    if node is None:
        return None
    return str(node)


def process_case_file(filename):
    """Read and parse one HTML case file and return its CSV row as a dict.

    Args:
        filename: Path of the HTML file to parse.

    Returns:
        A dict with the values that could be extracted. The act columns
        ('IPC', 'PoA', 'PCSO', 'PCR', 'Any Other Act') are always present and
        default to 'Not Applied'; every other key is present only when its
        section of the page could be navigated. Values taken from the parse
        tree are converted to plain strings (act sections to tuples of plain
        strings).

    Extraction is best effort: a section whose expected layout is missing is
    skipped and the remaining sections are still processed.
    """
    # Errors raised when the positional ``.next`` navigation walks off the
    # expected page layout (attribute access on None or on a plain string).
    navigation_errors = (AttributeError, TypeError)
    # Output column -> lower-case phrase that identifies the act.
    act_patterns = (
        ('IPC', 'indian penal code'),
        ('PoA', 'prevention of atrocities'),
        ('PCSO', 'protection of children from sexual'),
        ('PCR', 'protection of civil rights'),
    )

    dictionary = {}
    # create soup; the ``with`` block closes the file as soon as it is parsed.
    with open(filename) as html_file:
        soup = bs(html_file, 'html.parser')

    # Beautiful Soup strings and tags reference the whole parse tree, so every
    # stored value goes through ``_plain_text`` to let the tree be released
    # once this function returns.

    # section 1: Case Details
    # The four keys are only stored when the whole navigation succeeds.
    try:
        caseType = soup.find('span', {'class': 'case_details_table'})
        caseTypeChild = caseType.findChild()
        # ref for .next - https://stackoverflow.com/questions/5999407/extract-content-within-a-tag-with-beautifulsoup
        sessionsCase = caseTypeChild.next.next.next
        filing = sessionsCase.next.next
        filingNumberHeading = filing.find('label')
        filingNumber = filingNumberHeading.next.next
        filingDate = filingNumber.next.next.next.next
        registration = filingDate.next.next
        registrationNumberHeading = registration.find('label')
        registrationNumber = registrationNumberHeading.next.next.next
        cnrHeading = soup.find('b').find('label')
        cnrNumber = cnrHeading.next.next
        dictionary['Filing Number'] = _plain_text(filingNumber)
        dictionary['Filing Date'] = _plain_text(filingDate)
        dictionary['Registration Number'] = _plain_text(registrationNumber)
        dictionary['CNR Number'] = _plain_text(cnrNumber)
    except navigation_errors:
        pass

    # section 2: Case Status
    # Keys are stored one by one, so a failure keeps the values found so far.
    try:
        firstHearing = soup.find('strong')
        dictionary['First Hearing'] = firstHearing.next_sibling.text
        nextHearing = soup.find('strong', text='Next Hearing Date')
        dictionary['Next Hearing'] = nextHearing.next_sibling.text
        stageOfCase = soup.find('strong', text='Stage of Case')
        dictionary['Stage of Case'] = stageOfCase.next_sibling.text
        courtNumber = soup.find('strong', text='Court Number and Judge')
        dictionary['Court Number and Judge'] = (
            courtNumber.next_sibling.next_sibling.text.strip())
    except navigation_errors:
        pass

    # section 6: FIR Details
    # NOTE(review): the keys are the heading nodes found in the page, not
    # fixed names; whether their text matches the CSV column names depends on
    # the page markup - confirm against a sample page.
    try:
        policeStationHeading = soup.find(
            'span', attrs={'class': 'FIR_details_table'}).next.next
        policeStation = policeStationHeading.next.next.next.next
        firnumberHeading = policeStation.next.next.next
        firNumber = policeStation.find_next('label').next
        firYearHeading = firNumber.next.next.next
        firYear = firNumber.find_next('span').find_next('label').next
        dictionary[_plain_text(policeStationHeading)] = _plain_text(policeStation)
        dictionary[_plain_text(firnumberHeading)] = _plain_text(firNumber)
        dictionary[_plain_text(firYearHeading)] = _plain_text(firYear)
    except navigation_errors:
        pass

    # section 3: Petitioner and Advocate
    try:
        petitioner = soup.find('span', attrs={'class': 'Petitioner_Advocate_table'})
        petitionerName = petitioner.next
        dictionary['Name of the Petitioner'] = _plain_text(petitionerName)
        petitionerAdvocate = petitionerName.next.next
        dictionary['Name of the Advocate'] = _plain_text(petitionerAdvocate)
        # section 4: Respondent and Advocate
        respondent = petitionerAdvocate.find_next('span')
        respondentName = respondent.next
        dictionary['Name of the Respondent'] = _plain_text(respondentName)
    except navigation_errors:
        pass

    # section 5: Acts
    # Every act column starts as 'Not Applied'. Each row of the act table is
    # then compared with the known act phrases and, on a match, the row's
    # sections replace the default value of the matching column.
    acts = soup.select('#act_table td:nth-of-type(1)')
    sections = soup.select('#act_table td:nth-of-type(2)')
    dictionary['IPC'] = 'Not Applied'
    dictionary['PoA'] = 'Not Applied'
    dictionary['PCSO'] = 'Not Applied'
    dictionary['PCR'] = 'Not Applied'
    dictionary['Any Other Act'] = 'Not Applied'

    # Looping over the rows handles any number of acts, including none.
    for act_cell, section_cell in zip(acts, sections):
        act_text = str(tuple(act_cell.contents)).lower()
        for column, phrase in act_patterns:
            if phrase in act_text:
                dictionary[column] = tuple(
                    str(part) for part in section_cell.contents)
                break

    return dictionary

# Run the conversion only when the file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    convert_html_case_files_to_csv()
import multiprocessing as mp

def convert_html_case_files_to_csv(processes=4):
    """Parse all HTML case files below ``root_dir`` in parallel into one CSV.

    Args:
        processes: Number of worker processes in the pool. Defaults to 4,
            which is a guess rather than a tuned value; pass ``None`` to let
            ``multiprocessing`` use the machine's CPU count.

    The files are parsed by ``process_case_file`` in worker processes. Rows
    are written in the order the workers finish, not in the order the files
    were found. The output file name is built from the current day, month
    and year.
    """
    # Local import: the surrounding import block does not import ``csv``.
    import csv

    # Read the clock once so the three date parts cannot straddle midnight.
    now = datetime.datetime.now()
    output_filename = '/home/some path name/ file' + '{}_{}_{}'.format(
        now.day, now.month, now.year) + '.csv'
    html_files = glob.glob(os.path.join(root_dir, '**/*.html'), recursive=True)

    with mp.Pool(processes) as pool:
        with open(output_filename, 'w', newline='') as f:
            # Rows may carry keys that are not CSV columns (the FIR section
            # uses page headings as keys); 'ignore' drops them instead of
            # raising ValueError.
            writer = csv.DictWriter(f, csv_header, extrasaction='ignore')
            writer.writeheader()
            # The iterator is consumed inside the ``with`` block, before the
            # pool is shut down on exit.
            writer.writerows(pool.imap_unordered(process_case_file, html_files))