Python-Beautifulsoup-仅来自最终刮取链接的数据被输出到文本文件
我试图从一个网站上的多个链接中获取体育日程安排。URL被正确地找到和打印,但只有来自上一个被刮取的URL的数据被输出到控制台和文本文件 我的代码如下:Python-Beautifulsoup-仅来自最终刮取链接的数据被输出到文本文件,python,python-3.x,web-scraping,beautifulsoup,Python,Python 3.x,Web Scraping,Beautifulsoup,我试图从一个网站上的多个链接中获取体育日程安排。URL被正确地找到和打印,但只有来自上一个被刮取的URL的数据被输出到控制台和文本文件 我的代码如下: import requests import time from bs4 import BeautifulSoup def makesoup(url): cookies = {'mycountries' : '101,28,3,102,42,10,18,4,2,22', 'user_time_zone': 'Europe/London'
import requests
import time
from bs4 import BeautifulSoup
def makesoup(url):
cookies = {'mycountries' : '101,28,3,102,42,10,18,4,2,22', 'user_time_zone': 'Europe/London', 'user_time_zone_id': '1'}
r = requests.post(url, cookies=cookies)
return BeautifulSoup(r.text,"lxml")
def linkscrape(links, savefile):
baseurl = "https://sport-tv-guide.live"
urllist = []
for link in links:
finalurl = (baseurl+ link['href'])
urllist.append(finalurl)
print(finalurl)
for singleurl in urllist:
soup2=makesoup(url=singleurl)
g_data=soup2.find_all('div', {'id': 'channelInfo'})
c_data=soup2.find_all('div', {'class': 'liveOtherStations clearfix'})
with open(savefile ,"w") as text_file:
for match in g_data:
try:
event = match.find('div', class_='title full').text.strip()
except:
event = ""
try:
extrainfo= match.find_previous('div', class_='pt-5 text-center').text.strip()
except:
extrainfo = ""
try:
startime = match.find('div', class_='time full').text.strip()
print("Time; ", startime)
except:
startime = "Time element not found"
try:
dateandtime = match.find('div', class_='date full').text.strip()
print("Date:", dateandtime)
except:
dateandtime = "Date not found"
try:
sport = match.find_previous('div', class_='sportTitle').text.strip()
print("Sport:", sport)
except:
sport = "Sport element not found"
try:
singlechannel = match.find('div', class_='station full').text.strip()
print("Main Channel:", singlechannel)
print("-----")
except:
singlechannel = "Single Channel element not found"
extra_channels = []
for channel in c_data:
try:
channels = match.find('div', class_='stationLive active col-wrap')
print("Extra Channels:", channel.text)
extra_channels.append(channel.text)
except:
channels = "No channels found"
print(channels)
extra_channels.append(channel.text)
print("-------")
if extra_channels:
for channel in extra_channels:
text_file.writelines("__**Sport:**__" +':' + ' '+ sport +" \n"+"__**Competitors:**__" +':' + ' '+ event+" \n"+ "__**Extra Info:**__" +':' + ' '+ extrainfo+" \n"+"__**Match Date:**__" +':' + ' ' +dateandtime +" \n"+"__**Match Time:**__"+':' + ' ' +startime +" \n"+ "__**Main Channel**__"+':' + ' '+singlechannel+" \n" + "__**Channels**__"+':' + ' '+channel+" \n"+'-' *20 + " \n")
else:
text_file.writelines("__**Sport:**__" +':' + ' '+ sport +" \n"+"__**Competitors:**__" +':' + ' '+ event+" \n"+ "__**Extra Info:**__" +':' + ' '+ extrainfo+" \n"+"__**Match Date:**__" +':' + ' ' +dateandtime +" \n"+"__**Match Time:**__"+':' + ' ' +startime +" \n"+ "__**Main Channel**__"+':' + ' '+singlechannel+" \n" + "__**Channels**__"+':' + " \n"+'-' *20 + " \n")
def matches():
dict = {"https://sport-tv-guide.live/live/darts":"/var/scripts/output/darts.txt",
"https://sport-tv-guide.live/live/mma":"/var/scripts/output/mma.txt",
"https://sport-tv-guide.live/live/wwe":"/var/scripts/output/wrestling.txt",
"https://sport-tv-guide.live/live/motorsport":"/var/scripts/output/motorsport.txt",
"https://sport-tv-guide.live/live/rugby-union":"/var/scripts/output/rugbyunion.txt",
"https://sport-tv-guide.live/live/rugby-league":"/var/scripts/output/rugbyleague.txt",
"https://sport-tv-guide.live/live/cricket":"/var/scripts/output/cricket.txt",
"https://sport-tv-guide.live/live/tennis":"/var/scripts/output/tennis.txt",
"https://sport-tv-guide.live/live/snooker":"/var/scripts/output/snooker.txt",
"https://sport-tv-guide.live/live/golf":"/var/scripts/output/golf.txt",
"https://sport-tv-guide.live/live/netball":"/var/scripts/output/netball.txt",
"https://sport-tv-guide.live/live/basketball":"/var/scripts/output/nba.txt",
"https://sport-tv-guide.live/live/baseball":"/var/scripts/output/mlb.txt",
"https://sport-tv-guide.live/live/ice-hockey":"/var/scripts/output/nhl.txt",
"https://sport-tv-guide.live/live/nfl":"/var/scripts/output/nfl.txt",
"https://sport-tv-guide.live/live/boxing":"/var/scripts/output/boxing.txt"}
for key, value in dict.items():
soup=makesoup(url=key)
game_check = soup.find('div', class_='alert alert-info')
if game_check is not None:
with open(value ,"w") as text_file:
text_file.writelines("No games found for event")
else:
linkscrape(links=soup.find_all('a', {'class': 'article flag', 'href': True}), savefile = value)
matches()
这为我提供了以下输出
我有下面的旧代码,它工作正常,并从主页上的每个抓取的URL输出所有数据
import requests
from bs4 import BeautifulSoup
def makesoup(url):
cookies = {'mycountries' : '101,28,3,102,42,10,18,4,2,22', 'user_time_zone': 'Europe/London', 'user_time_zone_id': '1'}
r = requests.post(url, cookies=cookies)
return BeautifulSoup(r.text,"lxml")
def linkscrape(links):
baseurl = "https://sport-tv-guide.live"
urllist = []
for link in links:
finalurl = (baseurl+ link['href'])
urllist.append(finalurl)
# print(finalurl)
for singleurl in urllist:
soup2=makesoup(url=singleurl)
g_data=soup2.find_all('div', {'id': 'channelInfo'})
c_data=soup2.find_all('div', {'class': 'liveOtherStations clearfix'})
for match in g_data:
try:
hometeam = match.find_previous('div', class_='cell40 text-center teamName1').text.strip()
awayteam = match.find_previous('div', class_='cell40 text-center teamName2').text.strip()
print("Competitors; ", hometeam +" "+ "vs" +" "+ awayteam)
except:
hometeam = "Home Team element not found"
awayteam = "Away Team element not found"
try:
startime = match.find('div', class_='time full').text.strip()
print("Time; ", startime)
except:
startime = "Time element not found"
try:
event= match.find('div', class_='title full').text.strip()
print("Event:", event)
except:
event = "Event element not found"
try:
dateandtime = match.find('div', class_='date full').text.strip()
print("Date:", dateandtime)
except:
dateandtime = "Date not found"
try:
sport = match.find('div', class_='text full').text.strip()
print("Sport:", sport)
except:
sport = "Sport element not found"
try:
singlechannel = match.find('div', class_='station full').text.strip()
print("Main Channel:", singlechannel)
print("-----")
except:
singlechannel = "Single Channel element not found"
for channel in c_data:
try:
channels = match.find('div', class_='stationLive active col-wrap')
print("Extra Channels:", channel.text)
except:
channels = "No channels found"
print(channels)
print("-------")
def matches():
soup=makesoup(url = "https://sport-tv-guide.live/live/mma")
linkscrape(links= soup.find_all('a', {'class': 'article flag', 'href' : True}))
matches()
我认为问题可能在于以下方面:
with open(savefile ,"w") as text_file:
因此,我尝试将其移动到for links in links:
for循环中的链接中,但输出是相同的(仅从正在输出的主页上刮取最后一个URL的数据)
感谢所有能够为解决此问题提供帮助的人。您是对的,问题确实存在于以下代码行中:
打开(保存文件,“w”)作为文本文件:
然后,“w”
在这行代码中意味着打开文件进行“写入”,这看起来像是您想要的,但实际上您想要的是“append”的“a”
。“写入”将擦除文件并从开头开始写入,但“附加”将将其添加到文件末尾。试试这个:
打开(保存文件,“a”)作为文本文件:
谢谢。这起作用了。我只需要将while代码移动到urlist:``循环中的`` for singleurl中,以获得正确的输出。