Beautifulsoup抓取javascript表问题csv
我目前有一个脚本,它将从一个基本URL生成一个URL和名称列表,还有一个脚本,它将使用URL列表中的链接为我提供所需的数据,但我正在努力创建一个从第一行获取URL的循环,运行第二个脚本,然后将该文件保存为包含URL的第一个文件中的第2+3列 这是我的两个脚本和它们输出的数据Beautifulsoup抓取javascript表问题csv,javascript,python,csv,web,scrape,Javascript,Python,Csv,Web,Scrape,我目前有一个脚本,它将从一个基本URL生成一个URL和名称列表,还有一个脚本,它将使用URL列表中的链接为我提供所需的数据,但我正在努力创建一个从第一行获取URL的循环,运行第二个脚本,然后将该文件保存为包含URL的第一个文件中的第2+3列 这是我的两个脚本和它们输出的数据 import io import requests import operator import csv from bs4 import BeautifulSoup from requests import get url
import io
import requests
import operator
import csv
from bs4 import BeautifulSoup
from requests import get
url = 'http://arizonascaleracers.liverc.com/results'
response = get(url)
print(response.text[:500])
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)
race_tables = html_soup.find_all('table', {'class':'table table-hover entry_list_data'})[2]
output_rows = []
for row in race_tables.find('tbody').find_all("tr")[1:]:
col = row.find_all("td")
output_row = []
for td in col:
if td.find(racename=''):
racename = ''.join('blank')
if td.find(suburl=''):
suburl = ''.join('blank')
if td.find(time=''):
time = ''.join('blank')
else:
suburl = row.find("a")['href']
racename = col[0].text
time = col[1].text
output_row.append(td)
output_rows.append(output_row)
with open('output.csv', 'w') as csvfile:
writer = csv.writer(csvfile, delimiter=',')
writer.writerows(output_rows)
此脚本的输出为
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2227665""><i class=""fa fa-trophy""></i> Race 6: Modified Touring Car (Modified Touring Car A-Main)</a></td>","<td>Nov 23, 2019 at 2:39pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2227664""><i class=""fa fa-trophy""></i> Race 5: 17.5 Super Stock Touring Car (17.5 Super Stock Touring Car A-Main)</a></td>","<td>Nov 23, 2019 at 2:31pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2227666""><i class=""fa fa-trophy""></i> Race 4: Pro GT (Pro GT A-Main)</a></td>","<td>Nov 23, 2019 at 2:17pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2227662""><i class=""fa fa-trophy""></i> Race 3: USGT 21.5 (USGT 21.5 A-Main)</a></td>","<td>Nov 23, 2019 at 2:10pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2227663""><i class=""fa fa-trophy""></i> Race 2: 1-12 Open Modified (1-12 Open Modified A-Main)</a></td>","<td>Nov 23, 2019 at 2:03pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2227661""><i class=""fa fa-trophy""></i> Race 1: VTA (VTA A-Main)</a></td>","<td>Nov 23, 2019 at 1:52pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2227120""><i class=""fa fa-trophy""></i> Race 6: Modified Touring Car (Modified Touring Car (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 1:22pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2227119""><i class=""fa fa-trophy""></i> Race 5: 17.5 Super Stock Touring Car (17.5 Super Stock Touring Car (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 1:14pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2227118""><i class=""fa fa-trophy""></i> Race 4: Pro GT (Pro GT (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 1:06pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2227117""><i class=""fa fa-trophy""></i> Race 3: USGT 21.5 (USGT 21.5 (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 12:58pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2227116""><i class=""fa fa-trophy""></i> Race 2: 1-12 Open Modified (1-12 Open Modified (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 12:51pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2227115""><i class=""fa fa-trophy""></i> Race 1: VTA (VTA (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 12:40pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2226732""><i class=""fa fa-trophy""></i> Race 6: Modified Touring Car (Modified Touring Car (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 12:18pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2226731""><i class=""fa fa-trophy""></i> Race 5: 17.5 Super Stock Touring Car (17.5 Super Stock Touring Car (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 12:11pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2226730""><i class=""fa fa-trophy""></i> Race 4: Pro GT (Pro GT (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 12:03pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2226729""><i class=""fa fa-trophy""></i> Race 3: USGT 21.5 (USGT 21.5 (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 11:55am</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2226728""><i class=""fa fa-trophy""></i> Race 2: 1-12 Open Modified (1-12 Open Modified (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 11:47am</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2226727""><i class=""fa fa-trophy""></i> Race 1: VTA (VTA (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 11:37am</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2226250""><i class=""fa fa-trophy""></i> Race 6: Modified Touring Car (Modified Touring Car (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 11:16am</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2226249""><i class=""fa fa-trophy""></i> Race 5: 17.5 Super Stock Touring Car (17.5 Super Stock Touring Car (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 11:08am</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2226251""><i class=""fa fa-trophy""></i> Race 4: Pro GT (Pro GT (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 11:00am</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2226247""><i class=""fa fa-trophy""></i> Race 3: USGT 21.5 (USGT 21.5 (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 10:52am</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2226248""><i class=""fa fa-trophy""></i> Race 2: 1-12 Open Modified (1-12 Open Modified (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 10:42am</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2226246""><i class=""fa fa-trophy""></i> Race 1: VTA (VTA (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 10:31am</td>"
这里的输出是
1 1 GABE HARVELL 24/5:04.408 --- 12.481 12.83912.839 12.59312.593
2 3 JOHNATHAN LEE 24/5:09.287 4.879 12.583 13.01113.011 12.65512.655
3 4 BRAD TOFFELMIRE 24/5:12.110 2.823 12.520 13.11813.118 12.71012.710
4 2 JACK KLOEBER 23/5:09.212 1 Lap 13.028 13.61013.610 13.09713.097
5 5 BILL CLINE 22/5:02.867 1 Lap 13.177 13.89813.898 13.28713.287
6 6 TIMOTHY SCHMUCK 22/5:03.815 0.948 12.919 13.92713.927 13.13913.139
7 7 CRAIG NELSON 21/5:08.304 1 Lap 13.713 14.82414.824 13.95713.957
现在我想我需要创建一个循环,并通过调用输入和输出的列来循环这些,比如
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
contents = []
with open('output.csv','r') as csvf:
urls = csv.reader(csvf)[:0]
for url in urls:
html = urllib2.urlopen('http://arizonascaleracers.liverc.com'+(url))
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', {"class":"table table-striped race_result"})
for row in table.find('tbody').find_all("tr", recursive=False):
col = row.find_all("td")
FinishPos = col[0].text.strip()
DriverInfo = col[1].text.strip()
DI = row.find('span', class_ = 'driver_name')
Qual = col[2].text.strip()
test = col[3].text
LapsTime = col[4].text
Behind = col[5].text
Fastest = col[6].text
Avg_Lap = col[7].text
Avg_Top_5 = col[8].text
Avg_Top_10 = col[9].text
Avg_Top_15 = col[10].text
Top_3_Consecutive = col[11].text
DI = DI.text
output_rows.append(output_row)
with open('col[1]+[2].csv', 'w') as csvfile:
writer = csv.writer(csvfile, delimiter=',')
writer.writerows(output_rows)
显然,这最后一块不工作,我似乎找不到任何答案,如何做到这一点
我想要得到的是这些数据
1 1 GABE HARVELL 24/5:04.408 --- 12.481 12.83912.839 12.59312.593
2 3 JOHNATHAN LEE 24/5:09.287 4.879 12.583 13.01113.011 12.65512.655
3 4 BRAD TOFFELMIRE 24/5:12.110 2.823 12.520 13.11813.118 12.71012.710
4 2 JACK KLOEBER 23/5:09.212 1 Lap 13.028 13.61013.610 13.09713.097
5 5 BILL CLINE 22/5:02.867 1 Lap 13.177 13.89813.898 13.28713.287
6 6 TIMOTHY SCHMUCK 22/5:03.815 0.948 12.919 13.92713.927 13.13913.139
7 7 CRAIG NELSON 21/5:08.304 1 Lap 13.713 14.82414.824 13.95713.957
但是对于每个url。这些数据只针对一个url
"<td><a class=""block"" href=""/results/?p=view_race_result&id=2227665""><i class=""fa fa-trophy""></i> Race 6: Modified Touring Car (Modified Touring Car A-Main)</a></td>","<td>Nov 23, 2019 at 2:39pm</td>"
谢谢你的帮助 这里有一点重构,可以帮助您将刮取的数据转换成CSV表单。它消除了临时CSV文件,只需在每个竞赛条目URL之间循环,提取文本,然后将结果数据放入namedtuple数据结构中,以便于修改
#!/usr/bin/env python
# functionality of first script
import sys
import requests
import re
from bs4 import BeautifulSoup
from collections import namedtuple
base_url = 'http://arizonascaleracers.liverc.com'
response = requests.get('{}/results/?p=view_event&id={}'.format(base_url, sys.argv[1]))
#response = requests.get('{}/results'.format(base_url))
html_soup = BeautifulSoup(response.text, 'html.parser')
race_tables = html_soup.find_all('table', {'class':'table table-hover entry_list_data'})[2]
REGEX_RACENAME = re.compile(' ?Race \d: [\w \.\-]+(\([\w \.\-\(\/\)]+\))')
Car = namedtuple('Car', ['FinishPos', 'DriverInfo', 'Qual', 'LapsTime', 'Fastest', 'AvgLap', 'AvgTop5', 'AvgTop10', 'AvgTop15', 'Top3Consecutive'])
races = []
class Race:
def __init__(self, name, time, race_id):
self.name = newname
self.time = time
self.id = race_id
self.positions = []
def __str__(self):
retval = ''
for p in self.positions:
line = '{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|\n'.format(p.FinishPos, p.Qual, p.DriverInfo, p.LapsTime, p.Fastest, p.AvgLap, p.AvgTop10, p.AvgTop15, p.Top3Consecutive, self.name, self.time)
retval += line
return retval
for row in race_tables.find('tbody').find_all("tr")[1:]:
col = row.find("a", href=True)
try:
racename = col.text
newname = REGEX_RACENAME.match(racename).group(1)[1:][:-1]
except AttributeError:
continue
else:
href = col.get('href')
timestamp = row.find_all("td")[-1].text
r = Race(newname, timestamp, href)
response = requests.get('{}{}'.format(base_url, href))
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', {"class":"table table-striped race_result"})
for entry in table.find('tbody').find_all("tr", recursive=False):
col = entry.find_all("td")
FinishPos = col[0].text.strip()
DriverInfo = entry.find('span', class_ = 'driver_name').text
Qual = col[2].text.strip()
LapsTime = col[4].text
Fastest = col[6].text
Avg_Lap = col[7].text
Avg_Top_5 = col[8].text
Avg_Top_10 = col[9].text
Avg_Top_15 = col[10].text
Top_3_Consecutive = col[11].text
c = Car(FinishPos, DriverInfo, Qual, LapsTime, Fastest, Avg_Lap, Avg_Top_5, Avg_Top_10, Avg_Top_15, Top_3_Consecutive)
r.positions.append(c)
races.append(r)
# second script
with open('output', 'a') as outputfile:
print(len(races))
for r in races:
#print(r.name)
#print(str(r))
outputfile.write(str(r)) # add in different fields to be outputted to CSV as desired
祝你的项目好运 你能更清楚地描述一下你需要的数据吗?预期的结果是什么?谢谢。我继续添加了一些额外的细节,让我知道这是否更有意义。谢谢看起来脚本2生成了这个输出,对吗?脚本2生成了,但是我想自动滚动URL并生成数据,我需要从脚本1中提取数据以填充脚本2作为输入和输出。
Race 6: Modified Touring Car (Modified Touring Car A-Main)Nov 23, 2019.csv
#!/usr/bin/env python
# functionality of first script
import sys
import requests
import re
from bs4 import BeautifulSoup
from collections import namedtuple
base_url = 'http://arizonascaleracers.liverc.com'
response = requests.get('{}/results/?p=view_event&id={}'.format(base_url, sys.argv[1]))
#response = requests.get('{}/results'.format(base_url))
html_soup = BeautifulSoup(response.text, 'html.parser')
race_tables = html_soup.find_all('table', {'class':'table table-hover entry_list_data'})[2]
REGEX_RACENAME = re.compile(' ?Race \d: [\w \.\-]+(\([\w \.\-\(\/\)]+\))')
Car = namedtuple('Car', ['FinishPos', 'DriverInfo', 'Qual', 'LapsTime', 'Fastest', 'AvgLap', 'AvgTop5', 'AvgTop10', 'AvgTop15', 'Top3Consecutive'])
races = []
class Race:
def __init__(self, name, time, race_id):
self.name = newname
self.time = time
self.id = race_id
self.positions = []
def __str__(self):
retval = ''
for p in self.positions:
line = '{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|\n'.format(p.FinishPos, p.Qual, p.DriverInfo, p.LapsTime, p.Fastest, p.AvgLap, p.AvgTop10, p.AvgTop15, p.Top3Consecutive, self.name, self.time)
retval += line
return retval
for row in race_tables.find('tbody').find_all("tr")[1:]:
col = row.find("a", href=True)
try:
racename = col.text
newname = REGEX_RACENAME.match(racename).group(1)[1:][:-1]
except AttributeError:
continue
else:
href = col.get('href')
timestamp = row.find_all("td")[-1].text
r = Race(newname, timestamp, href)
response = requests.get('{}{}'.format(base_url, href))
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', {"class":"table table-striped race_result"})
for entry in table.find('tbody').find_all("tr", recursive=False):
col = entry.find_all("td")
FinishPos = col[0].text.strip()
DriverInfo = entry.find('span', class_ = 'driver_name').text
Qual = col[2].text.strip()
LapsTime = col[4].text
Fastest = col[6].text
Avg_Lap = col[7].text
Avg_Top_5 = col[8].text
Avg_Top_10 = col[9].text
Avg_Top_15 = col[10].text
Top_3_Consecutive = col[11].text
c = Car(FinishPos, DriverInfo, Qual, LapsTime, Fastest, Avg_Lap, Avg_Top_5, Avg_Top_10, Avg_Top_15, Top_3_Consecutive)
r.positions.append(c)
races.append(r)
# second script
with open('output', 'a') as outputfile:
print(len(races))
for r in races:
#print(r.name)
#print(str(r))
outputfile.write(str(r)) # add in different fields to be outputted to CSV as desired