Python 3.x 使用BeautifulSoup、Requests和Pandas从<div>中的<span>中抓取数据

Python 3.x 使用BeautifulSoup和Requests和Pandas从中的中刮取数据,python-3.x,pandas,beautifulsoup,python-requests-html,Python 3.x,Pandas,Beautifulsoup,Python Requests Html,我试图从这段HTML代码中的中间提取T和0-0以及2 OT。我开始写下面的代码,但太多的新手,无法理解它。谢谢你的帮助 <div class ="sidearm-schedule-game-details flex item-1 columns"> == $0 <div class="sidearm-schedule-game-result text-italic"> == $0 <span></span

我想从下面这段HTML代码的<span>标签中提取"T"、"0-0"和"2 OT"。我已经开始编写下面的代码,但我还是个新手,不知道该如何继续。谢谢你的帮助!

    <div class ="sidearm-schedule-game-details flex item-1 columns"> == $0
        <div class="sidearm-schedule-game-result text-italic"> == $0
            <span>(2 OT)</span>


import requests
import pandas as pd
from pandas import ExcelWriter
from bs4 import BeautifulSoup

# Placeholder: set this to the URL of the schedule page to scrape.
url = ''
# A timeout keeps the script from hanging indefinitely on an unresponsive
# server; raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
school = response.text
soup = BeautifulSoup(school, 'lxml')

# Collect every <div> whose class attribute matches this class string.
rows = soup.find_all('div', class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")

# Accumulate plain strings and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# copied the whole frame on every iteration.
results = []
for row in rows:
    result_div = row.find('div', class_="sidearm-schedule-game-result")
    if result_div is None:
        # Keep one entry per row so the index still lines up with `rows`.
        results.append('')
        continue
    # The pieces of the result are separated by newlines in the page text;
    # join them with ", " to get e.g. "T, 0-0 (2 OT)".
    results.append(result_div.text.strip().replace('\n', ', '))

sheet = pd.DataFrame({'result': results})

0          L, 1-2
1     L, 1-2 (OT)
2          W, 1-0
3          W, 1-0
4          L, 1-2
5   W, 1-0 (2 OT)
6   T, 0-0 (2 OT)
7          W, 3-0
8     L, 2-3 (OT)
9     W, 2-1 (OT)
10         W, 1-0
11         W, 1-0
12         L, 0-1
13  T, 0-0 (2 OT)
14         L, 0-1
15         W, 1-0
16         L, 0-1
17         W, 3-1
18         L, 1-2


import requests
import pandas as pd
from pandas import ExcelWriter
from bs4 import BeautifulSoup

# Placeholder: set this to the URL of the schedule page to scrape.
url = ''
# A timeout keeps the script from hanging indefinitely on an unresponsive
# server; raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
school = response.text
soup = BeautifulSoup(school, 'lxml')

# Collect every <div> whose class attribute matches this class string.
rows = soup.find_all('div', class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")

# Accumulate plain strings and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# copied the whole frame on every iteration.
results = []
for row in rows:
    result_div = row.find('div', class_="sidearm-schedule-game-result")
    if result_div is None:
        # Keep one entry per row so the index still lines up with `rows`.
        results.append('')
        continue
    # The pieces of the result are separated by newlines in the page text;
    # join them with ", " to get e.g. "T, 0-0 (2 OT)".
    results.append(result_div.text.strip().replace('\n', ', '))

sheet = pd.DataFrame({'result': results})

0          L, 1-2
1     L, 1-2 (OT)
2          W, 1-0
3          W, 1-0
4          L, 1-2
5   W, 1-0 (2 OT)
6   T, 0-0 (2 OT)
7          W, 3-0
8     L, 2-3 (OT)
9     W, 2-1 (OT)
10         W, 1-0
11         W, 1-0
12         L, 0-1
13  T, 0-0 (2 OT)
14         L, 0-1
15         W, 1-0
16         L, 0-1
17         W, 3-1
18         L, 1-2


    # Select every <div> whose class attribute contains
    # "sidearm-schedule-game-result". contains() is required because the
    # attribute also carries other classes (e.g. "text-italic"), and a bare
    # '@class, "..."' predicate is not valid XPath.
    # NOTE(review): `html` is assumed to be an already-parsed lxml element/tree.
    a = html.xpath('//div[contains(@class, "sidearm-schedule-game-result")]')
    for each in a:
        # './/' limits the search to descendants of the current <div>, and
        # text() returns the text nodes of each <span> found there.
        b = each.xpath('.//span/text()')


    # Select every <div> whose class attribute contains
    # "sidearm-schedule-game-result". contains() is required because the
    # attribute also carries other classes (e.g. "text-italic"), and a bare
    # '@class, "..."' predicate is not valid XPath.
    # NOTE(review): `html` is assumed to be an already-parsed lxml element/tree.
    a = html.xpath('//div[contains(@class, "sidearm-schedule-game-result")]')
    for each in a:
        # './/' limits the search to descendants of the current <div>, and
        # text() returns the text nodes of each <span> found there.
        b = each.xpath('.//span/text()')


import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Placeholder: set this to the URL of the schedule page to scrape.
url = ''
# A timeout keeps the script from hanging indefinitely on an unresponsive
# server; raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
school = response.text
soup = BeautifulSoup(school, 'lxml')

# Collect every <div> whose class attribute matches this class string.
rows = soup.find_all('div', class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")

# Three groups: a single capital letter, the score (digits and dashes), and
# whatever trails it, e.g. "T, 0-0 (2 OT)" -> ("T", "0-0", "(2 OT)").
# Compiled once here instead of being looked up on every loop iteration.
result_pattern = re.compile(r'([A-Z]),\s+([\d-]+)\s*(.*)')

data = []
for row in rows:
    # Drop the last whitespace-separated word of the logo's alt text.
    # NOTE(review): presumably a trailing "Logo" suffix; confirm on the page.
    opponent = row.select_one('.sidearm-schedule-game-opponent-logo img')['alt'].rsplit(maxsplit=1)[0]
    name_date = row.select_one('.sidearm-schedule-game-opponent-name a')['aria-label']

    # A row without a result element, or with text the pattern does not
    # match, yields three empty fields instead of raising IndexError.
    result_tag = row.select_one('.sidearm-schedule-game-result')
    result_text = result_tag.get_text(strip=True, separator=' ') if result_tag is not None else ''
    match = result_pattern.search(result_text)
    result = match.groups() if match else ('', '', '')

    data.append([opponent, *result, name_date])

df = pd.DataFrame(data, columns=['Name', 'Result', 'Score', 'OT', 'Info'])

                            Name Result Score      OT                                             Info
0      University of Connecticut      L   1-2                                UConn on August 24 7 p.m.
1              Drexel University      L   1-2    (OT)                       Drexel on August 27 7 p.m.
2   George Washington University      W   1-0                  George Washington on September 1 4 p.m.
3          St. John's University      W   1-0                      St. John's on September 4 7:30 p.m.
4          Binghamton University      L   1-2                         Binghamton on September 7 8 p.m.
5               Rider University      W   1-0  (2 OT)                     Rider on September 11 7 p.m.
6     University of Pennsylvania      T   0-0  (2 OT)                      Penn on September 15 6 p.m.
7                           Army      W   3-0                              Army on September 22 7 p.m.
8             Cornell University      L   2-3    (OT)                   Cornell on September 25 7 p.m.
9              Boston University      W   2-1    (OT)                  Boston U on September 29 4 p.m.
10            Colgate University      W   1-0                              Colgate on October 3 7 p.m.
11   United States Naval Academy      W   1-0                                 Navy on October 6 6 p.m.
12             Lafayette College      L   0-1                          Lafayette on October 13 12 p.m.
13             Dartmouth College      T   0-0  (2 OT)                   Dartmouth on October 16 6 p.m.
14           American University      L   0-1                            American on October 20 6 p.m.
15           Bucknell University      W   1-0                            Bucknell on October 24 7 p.m.
16       Loyola University (Md.)      L   0-1                        Loyola (Md.) on October 27 3 p.m.
17                    Holy Cross      W   3-1                          Holy Cross on November 3 6 p.m.
18            Colgate University      L   1-2          No. 3 Colgate (Semifinals) on November 9 7 p.m.


import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Placeholder: set this to the URL of the schedule page to scrape.
url = ''
# A timeout keeps the script from hanging indefinitely on an unresponsive
# server; raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
school = response.text
soup = BeautifulSoup(school, 'lxml')

# Collect every <div> whose class attribute matches this class string.
rows = soup.find_all('div', class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")

# Three groups: a single capital letter, the score (digits and dashes), and
# whatever trails it, e.g. "T, 0-0 (2 OT)" -> ("T", "0-0", "(2 OT)").
# Compiled once here instead of being looked up on every loop iteration.
result_pattern = re.compile(r'([A-Z]),\s+([\d-]+)\s*(.*)')

data = []
for row in rows:
    # Drop the last whitespace-separated word of the logo's alt text.
    # NOTE(review): presumably a trailing "Logo" suffix; confirm on the page.
    opponent = row.select_one('.sidearm-schedule-game-opponent-logo img')['alt'].rsplit(maxsplit=1)[0]
    name_date = row.select_one('.sidearm-schedule-game-opponent-name a')['aria-label']

    # A row without a result element, or with text the pattern does not
    # match, yields three empty fields instead of raising IndexError.
    result_tag = row.select_one('.sidearm-schedule-game-result')
    result_text = result_tag.get_text(strip=True, separator=' ') if result_tag is not None else ''
    match = result_pattern.search(result_text)
    result = match.groups() if match else ('', '', '')

    data.append([opponent, *result, name_date])

df = pd.DataFrame(data, columns=['Name', 'Result', 'Score', 'OT', 'Info'])

                            Name Result Score      OT                                             Info
0      University of Connecticut      L   1-2                                UConn on August 24 7 p.m.
1              Drexel University      L   1-2    (OT)                       Drexel on August 27 7 p.m.
2   George Washington University      W   1-0                  George Washington on September 1 4 p.m.
3          St. John's University      W   1-0                      St. John's on September 4 7:30 p.m.
4          Binghamton University      L   1-2                         Binghamton on September 7 8 p.m.
5               Rider University      W   1-0  (2 OT)                     Rider on September 11 7 p.m.
6     University of Pennsylvania      T   0-0  (2 OT)                      Penn on September 15 6 p.m.
7                           Army      W   3-0                              Army on September 22 7 p.m.
8             Cornell University      L   2-3    (OT)                   Cornell on September 25 7 p.m.
9              Boston University      W   2-1    (OT)                  Boston U on September 29 4 p.m.
10            Colgate University      W   1-0                              Colgate on October 3 7 p.m.
11   United States Naval Academy      W   1-0                                 Navy on October 6 6 p.m.
12             Lafayette College      L   0-1                          Lafayette on October 13 12 p.m.
13             Dartmouth College      T   0-0  (2 OT)                   Dartmouth on October 16 6 p.m.
14           American University      L   0-1                            American on October 20 6 p.m.
15           Bucknell University      W   1-0                            Bucknell on October 24 7 p.m.
16       Loyola University (Md.)      L   0-1                        Loyola (Md.) on October 27 3 p.m.
17                    Holy Cross      W   3-1                          Holy Cross on November 3 6 p.m.
18            Colgate University      L   1-2          No. 3 Colgate (Semifinals) on November 9 7 p.m.