Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/python/308.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181

Warning: file_get_contents(/data/phpspider/zhask/data//catemap/8/python-3.x/17.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
使用Python在Beautifulsoup4中进行刮片-初学者_Python_Python 3.x_Pandas_Beautifulsoup - Fatal编程技术网

使用Python和Beautifulsoup4进行网页抓取——初学者

使用Python在Beautifulsoup4中进行刮片-初学者,python,python-3.x,pandas,beautifulsoup,Python,Python 3.x,Pandas,Beautifulsoup,感谢您对最后一个问题的帮助()。然而,我目前正忙于准备我的最终数据帧。我已经能够从原始表格中提取所有数据,并以我想要的方式显示出来,但是我现在想将主队和客队添加到df中,但似乎无法理解。这是我目前所拥有的,我想从中吸取教训 from urllib.request import urlopen # import the library from bs4 import BeautifulSoup # Import BS from bs4 import SoupStrainer # Imp

感谢您对最后一个问题的帮助()。然而,我目前正忙于准备我的最终数据帧。我已经能够从原始表格中提取所有数据,并以我想要的方式显示出来,但是我现在想将主队和客队添加到df中,但似乎无法理解。这是我目前所拥有的,我想从中吸取教训

from urllib.request import urlopen  # import the library
from bs4 import BeautifulSoup   # Import BS
from bs4 import SoupStrainer    # Import Soup Strainer
import pandas as pd    # import pandas as a package

# Scrape player statistics for one AFL match from footywire.com and print
# them as a pandas DataFrame, one row per player.

basescrape = 'http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid = '6172'

scrapeweb1 = basescrape + matchid

# BUG FIX: urlopen() returns a one-shot response stream.  The original code
# fed the *same* stream to two BeautifulSoup() calls; the first parse
# exhausted it, so the second parse saw an empty document and the team-name
# loop below never matched anything.  Read the payload once and parse the
# same bytes with both strainers.
page = urlopen(scrapeweb1)  # access the website
html = page.read()

only_tables = SoupStrainer('table', attrs={"width": "583"})  # player stats tables only
soup = BeautifulSoup(html, 'html.parser', parse_only=only_tables)   # parse the html

only_teams = SoupStrainer('table', attrs={"width": "376"})  # quarter-score table only
soup2 = BeautifulSoup(html, 'html.parser', parse_only=only_teams)   # parse the html


# Only rows that carry player data have this onmouseover handler.
table = soup.find_all("tr", attrs={"onmouseover": "this.bgColor='#cbcdd0';"})


# Robustness: default the team names to empty strings.  The original
# initialised these as lists and then rebound them to strings inside the
# loop; when the loop matched nothing the empty lists made the DataFrame
# constructor raise (mismatched column lengths).
hometeam = ''
awayteam = ''

# One list per statistic, in the source table's column order.
player = []
kicks = []
handballs = []
disposals = []
marks = []
goals = []
behinds = []
tackles = []
hitouts = []
inside50s = []
freesfor = []
freesagainst = []
fantasy = []
supercoach = []

# Parallel to the first 14 <td> cells of each player row.
stat_lists = [player, kicks, handballs, disposals, marks, goals, behinds,
              tackles, hitouts, inside50s, freesfor, freesagainst,
              fantasy, supercoach]

# For each player row, append the stripped text of each cell to the
# matching statistic list (replaces 14 copy-pasted extraction stanzas).
for row in table:
    cells = row.find_all('td')
    for values, cell in zip(stat_lists, cells):
        values.append(cell.string.strip())

# Team names: take the two cells of the last row with class "leftbold".
# NOTE(review): the question's own edit shows the score table contains
# malformed HTML (<th ...>Team</td>), so this selector may still find
# nothing on the live page — confirm, or use the width-375 table lookup
# from the accepted fix instead.
for row in soup2.find_all("tr", class_="leftbold"):
    col2 = row.find_all('td')
    if len(col2) >= 2:
        hometeam = col2[0].string.strip()
        awayteam = col2[1].string.strip()


# Assemble the column data; scalar values (match id, team names) are
# broadcast across all player rows by pandas.
columns = {'match_id': matchid, 'home_team': hometeam, 'away_team': awayteam,
           'player': player, 'kicks': kicks, 'handballs': handballs,
           'disposals': disposals, 'marks': marks, 'goals': goals,
           'behinds': behinds, 'tackles': tackles, 'hitouts': hitouts,
           'inside_50s': inside50s, 'frees_for': freesfor,
           'frees_against': freesagainst, 'fantasy': fantasy,
           'supercoach': supercoach}

# Create a dataframe from the columns variable, with an explicit column order.
df = pd.DataFrame(columns, columns=['match_id', 'home_team', 'away_team',
                                    'player', 'kicks', 'handballs',
                                    'disposals', 'marks', 'goals', 'behinds',
                                    'tackles', 'hitouts', 'inside_50s',
                                    'frees_for', 'frees_against', 'fantasy',
                                    'supercoach'])

print(df)

# print(soup.prettify())

# print(table)
编辑:所以我检查了源代码,发现表中有一些不正确的HTML

<table border="0" cellspacing="0" cellpadding="0" width="376" id="matchscoretable">
<tr>
<th class="leftbold" height="23" width="100">Team</td>

团队
它使用“/td”而不是“/th”,当通过beautiful soup解析时,会导致table标记关闭

[<table border="0" cellpadding="0" cellspacing="0" id="matchscoretable" width="376">
<tr>
<th class="leftbold" height="23" width="100">Team</th></tr></table>]
[
团队]

我可能需要寻找另一种方法来获得主客场球队的名字

我设法解决了这个问题,下面是完成的代码

from urllib.request import urlopen  # import the library
from bs4 import BeautifulSoup   # Import BS
from bs4 import SoupStrainer    # Import Soup Strainer
import pandas as pd    # import pandas as a package

# Scrape one AFL match from footywire.com: player statistics plus the
# home/away team names, printed as a single pandas DataFrame.

basescrape = 'http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid = '6172'

scrapeweb1 = basescrape + matchid

# IMPROVEMENT: the original fetched the same URL twice (page and page2)
# just to parse it two ways.  One request is enough — read the bytes once
# and hand them to both parsers.  This is also safer: two fetches could in
# principle return different documents.
html = urlopen(scrapeweb1).read()  # access the website once

only_tables = SoupStrainer('table', attrs={"width": "583"})  # player stats tables only

soup = BeautifulSoup(html, 'html.parser', parse_only=only_tables)   # parse the html
soup2 = BeautifulSoup(html, 'html.parser')  # full document, for team names


# Only rows that carry player data have this onmouseover handler.
table = soup.find_all("tr", attrs={"onmouseover": "this.bgColor='#cbcdd0';"})


# The second width-375 table holds the team names in its width-124 cells
# (works around the malformed <th ...>Team</td> markup in the score table).
Table1 = soup2.find_all('table', attrs={'width': "375"})[1]

team_cells = Table1.find_all('td', attrs={'width': "124"})
hometeam = team_cells[0].string.strip()
awayteam = team_cells[1].string.strip()


# One list per statistic, in the source table's column order.
player = []
kicks = []
handballs = []
disposals = []
marks = []
goals = []
behinds = []
tackles = []
hitouts = []
inside50s = []
freesfor = []
freesagainst = []
fantasy = []
supercoach = []

# Parallel to the first 14 <td> cells of each player row.
stat_lists = [player, kicks, handballs, disposals, marks, goals, behinds,
              tackles, hitouts, inside50s, freesfor, freesagainst,
              fantasy, supercoach]

# For each player row, append the stripped text of each cell to the
# matching statistic list (replaces 14 copy-pasted extraction stanzas).
for row in table:
    cells = row.find_all('td')
    for values, cell in zip(stat_lists, cells):
        values.append(cell.string.strip())


# Assemble the column data; scalar values (match id, team names) are
# broadcast across all player rows by pandas.
columns = {'match_id': matchid, 'home_team': hometeam, 'away_team': awayteam,
           'player': player, 'kicks': kicks, 'handballs': handballs,
           'disposals': disposals, 'marks': marks, 'goals': goals,
           'behinds': behinds, 'tackles': tackles, 'hitouts': hitouts,
           'inside_50s': inside50s, 'frees_for': freesfor,
           'frees_against': freesagainst, 'fantasy': fantasy,
           'supercoach': supercoach}

# Create a dataframe from the columns variable, with an explicit column order.
df = pd.DataFrame(columns, columns=['match_id', 'home_team', 'away_team',
                                    'player', 'kicks', 'handballs',
                                    'disposals', 'marks', 'goals', 'behinds',
                                    'tackles', 'hitouts', 'inside_50s',
                                    'frees_for', 'frees_against', 'fantasy',
                                    'supercoach'])

print(df)
从urllib.request导入urlopen#导入库
从bs4导入BeautifulSoup#导入BS
从bs4进口滤汤器#进口滤汤器
作为pd导入熊猫#作为包装导入熊猫
basescrape='http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid='6172'
scrapeweb1=basescrape+matchid
page=urlopen(scrapeweb1)#访问网站
page2=urlopen(scrapeweb1)#访问网站
only_tables=SoupStrainer('table',attrs={“width”:“583”})#解析时只解析表元素
soup=BeautifulSoup(页面'html.parser',parse_only=only_tables)#解析html
soup2=BeautifulSoup(第2页,“html.parser”)#解析html
#仅在中包含播放机数据的有效行
table=soup.find_all(“tr”,attrs={“onmouseover”:“this.bgColor='#cbcdd0';”)
#创建变量以保存数据
Table1=soup2.find_all('table',attrs={'width':“375”})[1]
hometeam=Table1.find_all('td',attrs={'width':“124”})[0].string.strip()
awayteam=Table1.find_all('td',attrs={'width':“124”})[1].string.strip()
玩家=[]
踢=[]
手球=[]
处置=[]
分数=[]
目标=[]
落后=[]
铲球=[]
命中率=[]
inside50s=[]
freesfor=[]
免责声明=[]
幻想=[]
超级通道=[]
#找到所有标记对,跳过第一个标记对,然后为每个标记对跳过第一个标记对。
对于soup.find_all(“tr”,attrs={“onmouseover”:“this.bgColor='#cbcdd0';”})中的行:
#创建每个标记对中所有标记对的变量,
col=行。查找所有('td'))
#在第一个标记对内创建字符串变量,
列_1=col[0]。string.strip()
#并将其附加到player变量
player.append(第1列)
#在第二个标记对内创建字符串变量,
列_2=col[1]。string.strip()
#并将其附加到kicks变量
kicks.append(第2列)
#在第三个标记对内创建字符串变量,
列_3=col[2]。string.strip()
#并将其附加到handballs变量
手球.附加(第3列)
#在第4个标记对内创建字符串变量,
列_4=col[3]。string.strip()
#并将其附加到disposals变量
处置。追加(第4列)
#在第5个标记对内创建字符串变量,
列_5=col[4]。string.strip()
#并将其附加到marks变量
标记。追加(第5列)
#在第5个标记对内创建字符串变量,
列_6=col[5]。string.strip()
#并将其附加到目标变量
目标。追加(第6列)
#在第5个标记对内创建字符串变量,
列_7=col[6]。string.strip()
#并将其附加到behinds变量
后面追加(第7列)
#在第5个标记对内创建字符串变量,
列_8=col[7]。string.strip()
#并将其附加到tackles变量
铲球。附加(第8列)
#在第5个标记对内创建字符串变量,
列_9=col[8]。string.strip()
#并将其附加到hitouts变量
暂停。追加(第9列)
#在第5个标记对内创建字符串变量,
列_10=col[9]。string.strip()
#并将其附加到inside50s变量
inside50s.追加(第10列)
#在第5个标记对内创建字符串变量,
列_11=col[10]。string.strip()
#并将其附加到freesfo变量
freesfor.append(第11列)
#在第5个标记对内创建字符串变量,
列_12=col[11]。string.strip()
#并将其附加到变量
附加(第12列)
#在第5个标记对内创建字符串变量,
列_13=col[12]。string.strip()
#并将其附加到fantasy变量
fantasy.append(第13列)
#在第5个标记对内创建字符串变量,
列_14=col[13]。string.strip()
#并将其附加到Supercach变量
supercach.append(第14列)
#创建列值的变量
列={'match\u id':matchid,'home\u team':home\u team,'客场\u team':awayteam,'player':player,'kicks':kicks,'handball':手球,'disposals':处置,'marks':marks,'goals':goals,'behinds':behinds,'stackles':铲球,'hittouts':hittouts,'inside50,'frees':freesfor':freesfor':freesfor,'frees,'freesfuncy'ach':超ach}
#从columns变量-n创建数据帧
df=pd.DataFrame(列,列=[‘比赛id’、‘主队’、‘客队’、‘球员’、‘踢’、‘手球’、‘处置’、‘得分’、‘进球’、‘落后’、‘铲球’、‘出局’、‘内线50’、‘出局’、‘出局’、‘出局’、‘幻想’、‘超级教练’])
打印(df)

以下是一种方法:

from urllib.request import urlopen  # stdlib HTTP client
from bs4 import BeautifulSoup   # HTML parser
from bs4 import SoupStrainer    # restricts parsing to selected tags
import pandas as pd    # DataFrame output

# Scrape both teams' player-statistics tables from a footywire match page
# and print them as one DataFrame with an added 'team' column.

basescrape = 'http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid = '6172'
url = ''.join([basescrape, matchid])

# Width 585 captures the outer tables, including the title row that
# carries each team's name.
only_tables = SoupStrainer('table', attrs={"width" : "585"}) # parse only table elements when parsing
soup = BeautifulSoup(urlopen(url), 'html.parser', parse_only=only_tables)   # parse the html

# The left-aligned inner title cells serve as anchor points, one per team.
teams = soup.find_all('td', attrs={'class':'innertbtitle', 'align':'left'})

# Accumulates one dict per player across both teams.
player_list = []
for team in teams:
    # The team name is the first word of the title text.
    team_name = team.text.strip().split(' ', maxsplit=1)[0]
    # The statistics table (width 583) follows the anchor in document order.
    rows = team.find_next('table', attrs={'width':583}).find_all('tr')
    # The first row is the header and supplies the column labels;
    # the remaining rows are the players.
    header, body = rows[0], rows[1:]
    labels = [cell.text for cell in header.find_all('td')]
    for row in body:
        # Pair each header label with the text of the matching cell.
        record = dict(zip(labels, (cell.text for cell in row.find_all('td'))))
        record['team'] = team_name
        player_list.append(record)

# One DataFrame row per player; columns come from the table headers + 'team'.
df = pd.DataFrame(player_list)
print(df)

非常感谢你花时间来做这件事,据我所知,这会将所有的数据刮到一个很好的数据框中?编辑:我刚刚设法抽出一些时间来运行它,哇,这太整洁了!非常感谢你的帮助。很抱歉让你这么痛苦,但你能帮我理解每一行吗?我正在拼命地努力提高我的理解力y python编码。我到目前为止的理解(使用“players_list=[]”作为第1行)1.创建一个名为“players_list”的空变量2.查看“teams”BS元素3中的每个“team”。team_名称由标题分配
from urllib.request import urlopen  # import the library
from bs4 import BeautifulSoup   # Import BS
from bs4 import SoupStrainer    # Import Soup Strainer
import pandas as pd    # import pandas as a package

# Scrape one AFL match from footywire.com: player statistics plus the
# home/away team names, printed as a single pandas DataFrame.

basescrape = 'http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid = '6172'

scrapeweb1 = basescrape + matchid

# IMPROVEMENT: the original fetched the same URL twice (page and page2)
# just to parse it two ways.  One request is enough — read the bytes once
# and hand them to both parsers.  This is also safer: two fetches could in
# principle return different documents.
html = urlopen(scrapeweb1).read()  # access the website once

only_tables = SoupStrainer('table', attrs={"width": "583"})  # player stats tables only

soup = BeautifulSoup(html, 'html.parser', parse_only=only_tables)   # parse the html
soup2 = BeautifulSoup(html, 'html.parser')  # full document, for team names


# Only rows that carry player data have this onmouseover handler.
table = soup.find_all("tr", attrs={"onmouseover": "this.bgColor='#cbcdd0';"})


# The second width-375 table holds the team names in its width-124 cells
# (works around the malformed <th ...>Team</td> markup in the score table).
Table1 = soup2.find_all('table', attrs={'width': "375"})[1]

team_cells = Table1.find_all('td', attrs={'width': "124"})
hometeam = team_cells[0].string.strip()
awayteam = team_cells[1].string.strip()


# One list per statistic, in the source table's column order.
player = []
kicks = []
handballs = []
disposals = []
marks = []
goals = []
behinds = []
tackles = []
hitouts = []
inside50s = []
freesfor = []
freesagainst = []
fantasy = []
supercoach = []

# Parallel to the first 14 <td> cells of each player row.
stat_lists = [player, kicks, handballs, disposals, marks, goals, behinds,
              tackles, hitouts, inside50s, freesfor, freesagainst,
              fantasy, supercoach]

# For each player row, append the stripped text of each cell to the
# matching statistic list (replaces 14 copy-pasted extraction stanzas).
for row in table:
    cells = row.find_all('td')
    for values, cell in zip(stat_lists, cells):
        values.append(cell.string.strip())


# Assemble the column data; scalar values (match id, team names) are
# broadcast across all player rows by pandas.
columns = {'match_id': matchid, 'home_team': hometeam, 'away_team': awayteam,
           'player': player, 'kicks': kicks, 'handballs': handballs,
           'disposals': disposals, 'marks': marks, 'goals': goals,
           'behinds': behinds, 'tackles': tackles, 'hitouts': hitouts,
           'inside_50s': inside50s, 'frees_for': freesfor,
           'frees_against': freesagainst, 'fantasy': fantasy,
           'supercoach': supercoach}

# Create a dataframe from the columns variable, with an explicit column order.
df = pd.DataFrame(columns, columns=['match_id', 'home_team', 'away_team',
                                    'player', 'kicks', 'handballs',
                                    'disposals', 'marks', 'goals', 'behinds',
                                    'tackles', 'hitouts', 'inside_50s',
                                    'frees_for', 'frees_against', 'fantasy',
                                    'supercoach'])

print(df)
from urllib.request import urlopen  # stdlib HTTP client
from bs4 import BeautifulSoup   # HTML parser
from bs4 import SoupStrainer    # restricts parsing to selected tags
import pandas as pd    # DataFrame output

# Scrape both teams' player-statistics tables from a footywire match page
# and print them as one DataFrame with an added 'team' column.

basescrape = 'http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid = '6172'
url = ''.join([basescrape, matchid])

# Width 585 captures the outer tables, including the title row that
# carries each team's name.
only_tables = SoupStrainer('table', attrs={"width" : "585"}) # parse only table elements when parsing
soup = BeautifulSoup(urlopen(url), 'html.parser', parse_only=only_tables)   # parse the html

# The left-aligned inner title cells serve as anchor points, one per team.
teams = soup.find_all('td', attrs={'class':'innertbtitle', 'align':'left'})

# Accumulates one dict per player across both teams.
player_list = []
for team in teams:
    # The team name is the first word of the title text.
    team_name = team.text.strip().split(' ', maxsplit=1)[0]
    # The statistics table (width 583) follows the anchor in document order.
    rows = team.find_next('table', attrs={'width':583}).find_all('tr')
    # The first row is the header and supplies the column labels;
    # the remaining rows are the players.
    header, body = rows[0], rows[1:]
    labels = [cell.text for cell in header.find_all('td')]
    for row in body:
        # Pair each header label with the text of the matching cell.
        record = dict(zip(labels, (cell.text for cell in row.find_all('td'))))
        record['team'] = team_name
        player_list.append(record)

# One DataFrame row per player; columns come from the table headers + 'team'.
df = pd.DataFrame(player_list)
print(df)