Python beautifulsoup代码未正确循环元素
我不确定是什么问题。但我有一个使用Selenium和Beautifulsoup4访问和解析 下面的代码不循环Python beautifulsoup代码未正确循环元素,python,selenium,for-loop,web-scraping,beautifulsoup,Python,Selenium,For Loop,Web Scraping,Beautifulsoup,我不确定是什么问题。但我有一个使用Selenium和Beautifulsoup4访问和解析 下面的代码不循环联盟 对于game\u data.league.append(count[1].text),行号为[1],但该值对该网页重复,而不是对每一行重复 我的代码: import pandas as pd from selenium import webdriver from datetime import datetime from bs4 import BeautifulSoup as bs
联盟
对于game\u data.league.append(count[1].text)
,行号为[1]
,但该值对该网页重复,而不是对每一行重复
我的代码:
import pandas as pd
from selenium import webdriver
from datetime import datetime
from bs4 import BeautifulSoup as bs
from math import nan
browser = webdriver.Chrome()
class GameData:
def __init__(self):
self.score = []
self.date = []
self.time = []
self.country = []
self.league = []
self.game = []
self.home_odds = []
self.draw_odds = []
self.away_odds = []
def append(self, score):
pass
def get_urls(browser, landing_page):
browser.get(landing_page)
urls = [i.get_attribute('href') for i in
browser.find_elements_by_css_selector(
'.next-games-date > a:nth-child(1), .next-games-date > a:nth-child(n+3)')]
return urls
def parse_data(html):
df = pd.read_html(html, header=0)[0]
html = browser.page_source
soup = bs(html, "lxml")
cont = soup.find('div', {'id': 'wrap'})
content = cont.find('div', {'id': 'col-content'})
content = content.find('table', {'class': 'table-main'}, {'id': 'table-matches'})
main = content.find('th', {'class': 'first2 tl'})
if main is None:
return None
count = main.findAll('a')
country = count[0].text
game_data = GameData()
game_date = datetime.strptime(soup.select_one('.bold')['href'].split('/')[-2], '%Y%m%d').date()
for row in df.itertuples():
if not isinstance(row[1], str):
continue
elif ':' not in row[1]:
country = row[1].split('»')[0]
continue
game_time = row[1]
score = row[3] if row[3] else nan
game_data.date.append(game_date)
game_data.time.append(game_time)
game_data.country.append(country)
game_data.league.append(count[1].text)
game_data.game.append(row[2])
game_data.score.append(score)
game_data.home_odds.append(row[4])
game_data.draw_odds.append(row[5])
game_data.away_odds.append(row[6])
return game_data
if __name__ == '__main__':
start_url = "https://www.oddsportal.com/matches/soccer/"
urls = []
browser = webdriver.Chrome()
results = None
urls = get_urls(browser, start_url)
urls.insert(0, start_url)
for number, url in enumerate(urls):
if number > 0:
browser.get(url)
html = browser.page_source
game_data = parse_data(html)
if game_data is None:
continue
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
else:
results = results.append(result, ignore_index=True)
结果:
+-----+-------------------------+------------+--------+-----------+---------------+-------------------------+-------------+-------------+-------------+
| | score | date | time | country | league | game | home_odds | draw_odds | away_odds |
+=====+=========================+============+========+===========+===============+=========================+=============+=============+=============+
| 496 | Inter Turku - Mariehamn | 2021-06-10 | 15:00 | Finland | Veikkausliiga | Inter Turku - Mariehamn | 1.4 | 4.6 | 7.49 |
+-----+-------------------------+------------+--------+-----------+---------------+-------------------------+-------------+-------------+-------------+
| 497 | KTP - HIFK | 2021-06-10 | 15:30 | Finland | Veikkausliiga | KTP - HIFK | 3.42 | 3.17 | 2.18 |
+-----+-------------------------+------------+--------+-----------+---------------+-------------------------+-------------+-------------+-------------+
| 498 | Haka - HJK | 2021-06-10 | 15:30 | Finland | Veikkausliiga | Haka - HJK | 6.56 | 4.25 | 1.47 |
+-----+-------------------------+------------+--------+-----------+---------------+-------------------------+-------------+-------------+-------------+
| 499 | SJK - KuPS | 2021-06-10 | 15:30 | Finland | Veikkausliiga | SJK - KuPS | 3.34 | 3.25 | 2.18 |
+-----+-------------------------+------------+--------+-----------+---------------+-------------------------+-------------+-------------+-------------+
| 500 | Lahti - Ilves | 2021-06-10 | 15:30 | Finland | Veikkausliiga | Lahti - Ilves | 2.5 | 3.08 | 2.93 |
+-----+-------------------------+------------+--------+-----------+---------------+-------------------------+-------------+-------------+-------------+
如何为每一行循环正确的值,而不是为整个页面循环相同的值?要回答您的特定问题,而不是解决我看到的其他问题,您需要更改逻辑以确定何时添加
if n == 0 or '»' in row[1]:
league = leagues[n]
n+=1
我还将检索联赛作为其自己的列表:
leagues = [i.text for i in soup.select('.first2 > a:last-child')]
嗨,你能带我去哪里更好地学习网页抓取吗?我似乎遇到了同样的问题,我不得不四处提问。是的,我知道这段代码有一些问题,例如,
分数
正确循环到可用的位置,然后获得“游戏”值。嗨,我认为这可能有助于简化问题。如果你对联盟价值有疑问,那么只需关注决定联盟价值的逻辑。将代码复制到另一个文件中,并删除不属于当前问题的代码。我每次在循环中打印行
,因为逻辑应用于此,检查了测试,还查看了计数
的内容。我认为这是一个逻辑缺陷,而不是缺乏网络抓取知识。在学习更好的网络抓取方面,我会通过类似的问题,挑战自己回答这些问题,而不看现有答案,然后与现有答案进行比较。是的。这是我要做的事。谢谢。你可以随时来找我,和我讨论一些偶然的问题。我现在不总是在那里,但如果我在那里,而且也不是经常在那里,我很乐意帮助讨论一些事情。
import pandas as pd
from selenium import webdriver
from datetime import datetime
from bs4 import BeautifulSoup as bs
from math import nan
browser = webdriver.Chrome()
class GameData:
def __init__(self):
self.score = []
self.date = []
self.time = []
self.country = []
self.league = []
self.game = []
self.home_odds = []
self.draw_odds = []
self.away_odds = []
def append(self, score):
pass
def get_urls(browser, landing_page):
browser.get(landing_page)
urls = [i.get_attribute('href') for i in
browser.find_elements_by_css_selector(
'.next-games-date > a:nth-child(1), .next-games-date > a:nth-child(n+3)')]
return urls
def parse_data(html):
df = pd.read_html(html, header=0)[0]
html = browser.page_source
soup = bs(html, "lxml")
cont = soup.find('div', {'id': 'wrap'})
content = cont.find('div', {'id': 'col-content'})
content = content.find('table', {'class': 'table-main'}, {'id': 'table-matches'})
main = content.find('th', {'class': 'first2 tl'})
if main is None:
return None
count = main.findAll('a')
country = count[0].text
game_data = GameData()
game_date = datetime.strptime(soup.select_one('.bold')['href'].split('/')[-2], '%Y%m%d').date()
leagues = [i.text for i in soup.select('.first2 > a:last-child')]
n = 0
for row in df.itertuples():
if n == 0 or '»' in row[1]:
league = leagues[n]
n+=1
if not isinstance(row[1], str):
continue
elif ':' not in row[1]:
country = row[1].split('»')[0]
continue
game_time = row[1]
score = row[3] if row[3] else nan
game_data.date.append(game_date)
game_data.time.append(game_time)
game_data.country.append(country)
game_data.league.append(league)
game_data.game.append(row[2])
game_data.score.append(score)
game_data.home_odds.append(row[4])
game_data.draw_odds.append(row[5])
game_data.away_odds.append(row[6])
return game_data
if __name__ == '__main__':
start_url = "https://www.oddsportal.com/matches/soccer/"
urls = []
browser = webdriver.Chrome()
results = None
urls = get_urls(browser, start_url)
urls.insert(0, start_url)
for number, url in enumerate(urls):
if number > 0:
browser.get(url)
html = browser.page_source
game_data = parse_data(html)
if game_data is None:
continue
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
else:
results = results.append(result, ignore_index=True)