Python:将网页表格中的数据导出为CSV

Python 从表格到CSV的Web数据,python,csv,web-scraping,beautifulsoup,Python,Csv,Web Scraping,Beautifulsoup,代码如下: import requests from bs4 import BeautifulSoup import re url = 'https://olps.cgtransport.org/OLTP/Tax/VehicleStatus.aspx' reg_number = ['CG04DS7961'] for i in reg_number: reg1 = i[:-4] reg2 = i[-4:] payload = { '__VIEWSTATEFIELDC

代码如下:

import requests
from bs4 import BeautifulSoup
import re

# Vehicle-status search page; the form on it is submitted with a POST.
url = 'https://olps.cgtransport.org/OLTP/Tax/VehicleStatus.aspx'

# Registration numbers to look up.
reg_number = ['CG04DS7961']

# Hidden ASP.NET form fields that are sent unchanged with every search.
# NOTE(review): presumably copied from a browser session of the page --
# confirm they are still accepted by the server.
hidden_fields = {
    '__VIEWSTATEFIELDCOUNT': '3',
    '__VIEWSTATE': '5Rx7Jezv02wRDXtT58JN6uHfoZf2BCTkLyrML9D/7VLW1gz5HhU8sjA2R/7tOPruA/C5yDKTBJBtetPEAxUAPV6iDKZ9TrCt+JTtG9yZisuK5rgWRPQQ9iCqmEFBIGT9K/pVMPJVr2BE+S/S/wtmyTiZRL5zAnbBXZ+Z6xTQcmMj1VSq8vlwmx+0jsZpOHSu46nUZhurNclrV469rApFvORQTcnI2iyS4moLgwH6muz/umtBfTw31jzVsP/3R0u',
    '__VIEWSTATE1': 'pFQlf7Tpik2lCjknuojNbZw9FEYHiUYYGzxOYiwOGcSqt8nHzrZpJW8fGseyQWsG2+r12CzsbOEsxEyBh73/YHGDyK52IHBN1JLYgV45SkLp2jJqaDSbeSE6/3Xfibfd8PXX0SzoyztUTYb30K0Y9X1zTBKl6yP08Ui4I9Wuks7+4qRBDhOLedsrjBCrlWZLgUTIUgiye9UeIfQ/Q8sTR9NOM1N91b38x4+C7kaXhqn/ayrrVxJJm1uXE1ua48z',
    '__VIEWSTATE2': 'SYo3Su3gkp4339oFMeN+Q+/7XFFqlTTs4RAHi08VV252mno3weI5t9jg6ns4mhcrRQLa0bOM2Q/y/qEgkGPXoRxh1QBC/DyfGlLyVc/umb8WOdA1DDypkEt+oRRmI48fX1L6/scDrVZKUQWtF2Pm87WPQcYLP19h5vHXqGIvTHOIdoLzjC',
    '__VIEWSTATEGENERATOR': '34956357',
    '__EVENTVALIDATION': 'ygss/i7NxWFitcgCI9h84GSJJl8UM4sb1apUvzZIv1T1PL/JHswnbZ01G31EtP5I3zrr3rZRL0Hb6aAnrgkmqg7B70FsbNrF9hZ9eFjIGJKw7YBq+G+6hHXE1hYZu3i23uu0Lhdkm+S2An6ptxA+dW5P7+o=',
}

for registration in reg_number:
    # The form takes the number in two parts: everything before the last
    # four characters, and the last four characters.
    payload = dict(hidden_fields)
    payload['ctl00$ContentPlaceHolder1$txtregPart1'] = registration[:-4]
    payload['ctl00$ContentPlaceHolder1$txtregPart2'] = registration[-4:]
    payload['ctl00$ContentPlaceHolder1$btnshow'] = 'Search'

    response = requests.post(url, data=payload)
    page = BeautifulSoup(response.content, 'html5lib')

    # Take the whole text of the result table, labels and values alike.
    permit_table = page.find('table', id='ctl00_ContentPlaceHolder1_tbPermit')
    table_text = permit_table.text

    # Every run of characters ending in a colon; collected but not used below.
    label_matches = re.findall('.+:', table_text)
    print(table_text)
上面的代码输出的是整张表格的文本(包含标题)。我想要的输出只是CSV格式的值,不需要标题。

我已经在StackOverflow上搜索了一个解决方案

但是它对我不起作用,因为我不能使用其中用到的模块(Pandas):我想在不支持Pandas的Pythonista上运行这段代码。另外,其他帖子中的表格格式与这个特定网站的表格格式不同。

我只希望值为:


CG04DS7961,20/09/2010,RAIPUR,LIFE TIME,PRATIK DEWANGAN,…

根据您的代码,您可以找到特定元素并使用BeautifulSoup获取文本,然后将文本附加到字符串列表,然后将列表写入csv文件,下面是修改后的代码:

import requests
from bs4 import BeautifulSoup
import re
import csv #import csv

# Vehicle-status search page; the form on it is submitted with a POST.
url = 'https://olps.cgtransport.org/OLTP/Tax/VehicleStatus.aspx'

# Registration numbers to look up; each one becomes one row of output.csv.
reg_number = ['CG04DS7961']

# One list of values per registration number, written out after the loop.
rows = []

for reg in reg_number:
    # The form takes the number in two parts: everything before the last
    # four characters, and the last four characters.
    reg1 = reg[:-4]
    reg2 = reg[-4:]

    # Hidden ASP.NET form fields plus the two search boxes and the button.
    # NOTE(review): the __VIEWSTATE*/__EVENTVALIDATION values were presumably
    # copied from a browser session -- confirm the server still accepts them.
    payload = {
        '__VIEWSTATEFIELDCOUNT': '3',
        '__VIEWSTATE': '5Rx7Jezv02wRDXtT58JN6uHfoZf2BCTkLyrML9D/7VLW1gz5HhU8sjA2R/7tOPruA/C5yDKTBJBtetPEAxUAPV6iDKZ9TrCt+JTtG9yZisuK5rgWRPQQ9iCqmEFBIGT9K/pVMPJVr2BE+S/S/wtmyTiZRL5zAnbBXZ+Z6xTQcmMj1VSq8vlwmx+0jsZpOHSu46nUZhurNclrV469rApFvORQTcnI2iyS4moLgwH6muz/umtBfTw31jzVsP/3R0u',
        '__VIEWSTATE1': 'pFQlf7Tpik2lCjknuojNbZw9FEYHiUYYGzxOYiwOGcSqt8nHzrZpJW8fGseyQWsG2+r12CzsbOEsxEyBh73/YHGDyK52IHBN1JLYgV45SkLp2jJqaDSbeSE6/3Xfibfd8PXX0SzoyztUTYb30K0Y9X1zTBKl6yP08Ui4I9Wuks7+4qRBDhOLedsrjBCrlWZLgUTIUgiye9UeIfQ/Q8sTR9NOM1N91b38x4+C7kaXhqn/ayrrVxJJm1uXE1ua48z',
        '__VIEWSTATE2': 'SYo3Su3gkp4339oFMeN+Q+/7XFFqlTTs4RAHi08VV252mno3weI5t9jg6ns4mhcrRQLa0bOM2Q/y/qEgkGPXoRxh1QBC/DyfGlLyVc/umb8WOdA1DDypkEt+oRRmI48fX1L6/scDrVZKUQWtF2Pm87WPQcYLP19h5vHXqGIvTHOIdoLzjC',
        '__VIEWSTATEGENERATOR': '34956357',
        '__EVENTVALIDATION': 'ygss/i7NxWFitcgCI9h84GSJJl8UM4sb1apUvzZIv1T1PL/JHswnbZ01G31EtP5I3zrr3rZRL0Hb6aAnrgkmqg7B70FsbNrF9hZ9eFjIGJKw7YBq+G+6hHXE1hYZu3i23uu0Lhdkm+S2An6ptxA+dW5P7+o=',
        'ctl00$ContentPlaceHolder1$txtregPart1': reg1,
        'ctl00$ContentPlaceHolder1$txtregPart2': reg2,
        'ctl00$ContentPlaceHolder1$btnshow': 'Search',
    }

    # A timeout keeps the script from hanging forever on an unresponsive server.
    r = requests.post(url, data=payload, timeout=30)

    soup = BeautifulSoup(r.content, 'lxml')

    table = soup.find('table', attrs={'id': 'ctl00_ContentPlaceHolder1_tbPermit'})
    if table is None:
        # Without this guard a page lacking the result table would fail
        # below with an opaque AttributeError on None.
        print('No result table found for {}'.format(reg))
        continue

    # resultList stores the values found for this registration number.
    resultList = []
    # The values are read from the <font> child of each element that
    # carries align="left".
    for cell in table.find_all(attrs={"align": "left"}):
        if cell.font is None:
            continue
        bElement = cell.font.text
        if bElement != "":
            resultList.append(str(bElement))

    rows.append(resultList)
    print(resultList)  # print the resultList

# Write output.csv once, after the loop, so that every registration number
# keeps its own row (reopening the file in write mode per iteration would
# leave only the last one). On Python 3 the csv module needs a text-mode
# file opened with newline=''.
with open("output.csv", 'w', newline='', encoding='utf-8') as resultFile:
    wr = csv.writer(resultFile, dialect='excel')
    wr.writerows(rows)
resultList的输出为:

['CG04DS7961', '20/09/2010', 'RAIPUR', 'LIFE TIME', 'PRATIK DEWANGAN', '.', 'NON-TRANSPORT VEHICLE', 'MCYCLE MOTOR CYCLE', 'TVS MOTORS LTD', 'SCOOTYPEP+', '9/2010', '2', '95', '0', 'OG3FA2172150', 'MD626BG39A2F97895', '19/09/2025']

非常感谢。这正是我想要的。事实上,我试过 for i in table.find_all(attrs={"align":"left"}),但不确定如何获取这些元素。谢谢你花时间给我指引了正确的方向。
['CG04DS7961', '20/09/2010', 'RAIPUR', 'LIFE TIME', 'PRATIK DEWANGAN', '.', 'NON-TRANSPORT VEHICLE', 'MCYCLE MOTOR CYCLE', 'TVS MOTORS LTD', 'SCOOTYPEP+', '9/2010', '2', '95', '0', 'OG3FA2172150', 'MD626BG39A2F97895', '19/09/2025']