Python 如何使用Beautifulsoup在Xpath之间进行迭代?

Python 如何使用Beautifulsoup在Xpath之间进行迭代?,python,selenium-webdriver,xpath,beautifulsoup,Python,Selenium Webdriver,Xpath,Beautifulsoup,我试图在上的xpath之间进行迭代 我测试了代码,它适用于元素。单击(),如下所示: from selenium import webdriver browser = webdriver.Chrome() browser.get("https://www.oddsportal.com/matches/soccer/") element = browser.find_element_by_xpath("/html/body/div[1]/div/div[2]/div[

我试图在上的
xpath
之间进行迭代

我测试了代码,它适用于 element.click(),如下所示:

from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get("https://www.oddsportal.com/matches/soccer/")
# BUGFIX: find_element_by_xpath was deprecated in Selenium 3.x and removed in
# Selenium 4 -- the supported API is find_element(By.XPATH, ...).
element = browser.find_element(By.XPATH, "/html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[4]/div/div/span/a[3]")
element.click()
import pandas as pd
from selenium import webdriver
from datetime import datetime
from bs4 import BeautifulSoup as bs

# Single shared Chrome session, reused by parse_data() for every URL.
browser = webdriver.Chrome()

# Pages to scrape. NOTE: this is a set literal, so iteration order is arbitrary.
urls = {
    "https://www.oddsportal.com/matches/soccer/"
}


class GameData:
    """Accumulator for scraped match attributes, one list entry per match."""

    def __init__(self):
        # Country of the league each scraped match belongs to;
        # parse_data() appends one value per match row.
        self.country = []


def parse_data(url):
    """Load *url* in the shared browser and scrape per-match country names.

    Returns a GameData whose ``country`` list holds one entry per match row,
    or None when the expected table structure is missing from the page.
    """
    browser.get(url)
    html = browser.page_source  # fetch once; reuse for both pandas and bs4
    df = pd.read_html(html, header=0)[0]
    soup = bs(html, "lxml")
    # BUGFIX: both attribute filters must go into ONE dict. The original
    # passed {'id': 'table-matches'} as find()'s third positional argument,
    # which is `recursive`, so the id filter was silently ignored.
    table = soup.find('table', {'class': 'table-main', 'id': 'table-matches'})
    if table is None:
        return None
    header = table.find('th', {'class': 'first2 tl'})
    if header is None:
        return None
    links = header.find_all('a')  # find_all is the modern spelling of findAll
    country = links[0].text
    game_data = GameData()
    for row in df.itertuples():
        # Non-string first column => spacer/advert row; skip it.
        if not isinstance(row[1], str):
            continue
        # Rows without a kickoff time (no ':') are "Country » League"
        # section headers: remember the country for the rows that follow.
        if ':' not in row[1]:
            country = row[1].split('»')[0]
            continue
        game_data.country.append(country)

    return game_data


if __name__ == '__main__':

    # Collect one DataFrame per URL, then combine them once at the end.
    frames = []
    for url in urls:
        game_data = parse_data(url)
        if game_data is None:
            continue
        frames.append(pd.DataFrame(game_data.__dict__))
    # BUGFIX: DataFrame.append was deprecated in pandas 1.4 and removed in
    # pandas 2.0; pd.concat is the supported way to combine frames.
    results = pd.concat(frames, ignore_index=True) if frames else None
我希望
URL
在其中迭代的Xpath是:

/html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[3]/div/div/span/a[2]
/html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[3]/div/div/span/a[3]
/html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[3]/div/div/span/a[4]
/html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[3]/div/div/span/a[5]
/html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[3]/div/div/span/a[6]
/html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[3]/div/div/span/a[7]
/html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[3]/div/div/span/a[8]
我有一个代码,可以刮取任何给定的
URL集,如下所示:

from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get("https://www.oddsportal.com/matches/soccer/")
# BUGFIX: find_element_by_xpath was deprecated in Selenium 3.x and removed in
# Selenium 4 -- the supported API is find_element(By.XPATH, ...).
element = browser.find_element(By.XPATH, "/html/body/div[1]/div/div[2]/div[6]/div[1]/div/div[1]/div[2]/div[1]/div[4]/div/div/span/a[3]")
element.click()
import pandas as pd
from selenium import webdriver
from datetime import datetime
from bs4 import BeautifulSoup as bs

# Single shared Chrome session, reused by parse_data() for every URL.
browser = webdriver.Chrome()

# Pages to scrape. NOTE: this is a set literal, so iteration order is arbitrary.
urls = {
    "https://www.oddsportal.com/matches/soccer/"
}


class GameData:
    """Holds the scraped attributes of the matches on one page."""

    def __init__(self):
        # League country per match; filled in by parse_data(), one
        # append per match row of the page's table.
        self.country = []


def parse_data(url):
    """Load *url* in the shared browser and scrape per-match country names.

    Returns a GameData whose ``country`` list holds one entry per match row,
    or None when the expected table structure is missing from the page.
    """
    browser.get(url)
    html = browser.page_source  # fetch once; reuse for both pandas and bs4
    df = pd.read_html(html, header=0)[0]
    soup = bs(html, "lxml")
    # BUGFIX: both attribute filters must go into ONE dict. The original
    # passed {'id': 'table-matches'} as find()'s third positional argument,
    # which is `recursive`, so the id filter was silently ignored.
    table = soup.find('table', {'class': 'table-main', 'id': 'table-matches'})
    if table is None:
        return None
    header = table.find('th', {'class': 'first2 tl'})
    if header is None:
        return None
    links = header.find_all('a')  # find_all is the modern spelling of findAll
    country = links[0].text
    game_data = GameData()
    for row in df.itertuples():
        # Non-string first column => spacer/advert row; skip it.
        if not isinstance(row[1], str):
            continue
        # Rows without a kickoff time (no ':') are "Country » League"
        # section headers: remember the country for the rows that follow.
        if ':' not in row[1]:
            country = row[1].split('»')[0]
            continue
        game_data.country.append(country)

    return game_data


if __name__ == '__main__':

    # Collect one DataFrame per URL, then combine them once at the end.
    frames = []
    for url in urls:
        game_data = parse_data(url)
        if game_data is None:
            continue
        frames.append(pd.DataFrame(game_data.__dict__))
    # BUGFIX: DataFrame.append was deprecated in pandas 1.4 and removed in
    # pandas 2.0; pd.concat is the supported way to combine frames.
    results = pd.concat(frames, ignore_index=True) if frames else None
如何将
xpath
集成到此代码中


我尝试了讨论过的解决方案,但是我一事无成,或者可能我进入学习曲线有点早。

你可以根据今天的日期进行减法或加法来构建它们。但是,您也可以使用第n个子节点提取相关节点,指定第一个(昨天)锚定标记,然后指定从明天开始获取的第n个子范围;将它们与或语法相结合。您不需要指定今天,因为这是登录页。然后,您可以
在返回的列表上循环,用 browser.get 打开每个提取出的链接:

from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get('https://www.oddsportal.com/matches/soccer/')
# BUGFIX: find_elements_by_css_selector was removed in Selenium 4; use
# find_elements(By.CSS_SELECTOR, ...) instead.
# Selector: first anchor (yesterday) plus anchors from the 3rd child on
# (tomorrow onwards), skipping today's link -- today is the landing page.
other_days = [a.get_attribute('href')
              for a in browser.find_elements(By.CSS_SELECTOR, '.next-games-date > a:nth-child(1), .next-games-date > a:nth-child(n+3)')]
print(other_days)
for a_day in other_days:
    browser.get(a_day)
    # do something with each day's page here

与注释中共享的代码集成(意味着重写一些现有类):


对我来说,xpath与任何东西都不匹配。你到底想点击然后做什么?是导航到indiv matches页面吗?xpath对我来说有点棘手。我必须为
element.click() 使用完整的 xpath
才能工作。您可以尝试xpath,即
'//*[@id="col-content"]/div[3]/div/div/span/a[3]'
我正在尝试在“明天”和其他路径之间进行迭代。举个例子:这太棒了!我可以在不同的日子之间重复。谢谢现在,正如您看到的代码一样,browser.get在
def parse_data(url)
下使用,然后数据框被附加在
中,用于url中的url
,我如何在我的代码中使用您的方法?返回的列表是否应该输入到
中,用于url中的url
?是的,符合SO的原则,我没有包括dataframe,dataframe定义了许多其他属性,这些属性随后被附加到dataframe中,dataframe为url中的url提供
,并附加到dataframe中。因此
url
每天都会出现,这怎么样?