Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/python/308.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181

Warning: file_get_contents(/data/phpspider/zhask/data//catemap/8/python-3.x/17.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
使用Python在Beautifulsoup4中进行刮片-初学者_Python_Python 3.x_Pandas_Beautifulsoup - Fatal编程技术网

使用Python和Beautifulsoup4进行网页抓取——初学者

使用Python在Beautifulsoup4中进行刮片-初学者,python,python-3.x,pandas,beautifulsoup,Python,Python 3.x,Pandas,Beautifulsoup,感谢您对最后一个问题的帮助()。然而,我目前正忙于准备我的最终数据帧。我已经能够从原始表格中提取所有数据,并以我想要的方式显示出来,但是我现在想将主队和客队添加到df中,但似乎无法理解。这是我目前所拥有的,我想从中吸取教训 from urllib.request import urlopen # import the library from bs4 import BeautifulSoup # Import BS from bs4 import SoupStrainer # Imp

感谢您对最后一个问题的帮助()。然而,我目前正忙于准备我的最终数据帧。我已经能够从原始表格中提取所有数据,并以我想要的方式显示出来,但是我现在想将主队和客队添加到df中,但似乎无法理解。这是我目前所拥有的,我想从中吸取教训

from urllib.request import urlopen  # import the library
from bs4 import BeautifulSoup   # Import BS
from bs4 import SoupStrainer    # Import Soup Strainer
import pandas as pd    # import pandas as a package

# Scrape player statistics for one AFL match from footywire.com and print
# them as a pandas DataFrame, one row per player.

basescrape = 'http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid = '6172'

scrapeweb1 = basescrape + matchid

# BUG FIX: urlopen() returns a one-shot response stream.  The original code
# fed the *same* stream to two BeautifulSoup() calls; the first parse
# exhausted it, so the second parse saw an empty document and the team-name
# loop below never matched anything.  Read the payload once and parse the
# same bytes with both strainers.
page = urlopen(scrapeweb1)  # access the website
html = page.read()

only_tables = SoupStrainer('table', attrs={"width": "583"})  # player stats tables only
soup = BeautifulSoup(html, 'html.parser', parse_only=only_tables)   # parse the html

only_teams = SoupStrainer('table', attrs={"width": "376"})  # quarter-score table only
soup2 = BeautifulSoup(html, 'html.parser', parse_only=only_teams)   # parse the html


# Only rows that carry player data have this onmouseover handler.
table = soup.find_all("tr", attrs={"onmouseover": "this.bgColor='#cbcdd0';"})


# Robustness: default the team names to empty strings.  The original
# initialised these as lists and then rebound them to strings inside the
# loop; when the loop matched nothing the empty lists made the DataFrame
# constructor raise (mismatched column lengths).
hometeam = ''
awayteam = ''

# One list per statistic, in the source table's column order.
player = []
kicks = []
handballs = []
disposals = []
marks = []
goals = []
behinds = []
tackles = []
hitouts = []
inside50s = []
freesfor = []
freesagainst = []
fantasy = []
supercoach = []

# Parallel to the first 14 <td> cells of each player row.
stat_lists = [player, kicks, handballs, disposals, marks, goals, behinds,
              tackles, hitouts, inside50s, freesfor, freesagainst,
              fantasy, supercoach]

# For each player row, append the stripped text of each cell to the
# matching statistic list (replaces 14 copy-pasted extraction stanzas).
for row in table:
    cells = row.find_all('td')
    for values, cell in zip(stat_lists, cells):
        values.append(cell.string.strip())

# Team names: take the two cells of the last row with class "leftbold".
# NOTE(review): the question's own edit shows the score table contains
# malformed HTML (<th ...>Team</td>), so this selector may still find
# nothing on the live page — confirm, or use the width-375 table lookup
# from the accepted fix instead.
for row in soup2.find_all("tr", class_="leftbold"):
    col2 = row.find_all('td')
    if len(col2) >= 2:
        hometeam = col2[0].string.strip()
        awayteam = col2[1].string.strip()


# Assemble the column data; scalar values (match id, team names) are
# broadcast across all player rows by pandas.
columns = {'match_id': matchid, 'home_team': hometeam, 'away_team': awayteam,
           'player': player, 'kicks': kicks, 'handballs': handballs,
           'disposals': disposals, 'marks': marks, 'goals': goals,
           'behinds': behinds, 'tackles': tackles, 'hitouts': hitouts,
           'inside_50s': inside50s, 'frees_for': freesfor,
           'frees_against': freesagainst, 'fantasy': fantasy,
           'supercoach': supercoach}

# Create a dataframe from the columns variable, with an explicit column order.
df = pd.DataFrame(columns, columns=['match_id', 'home_team', 'away_team',
                                    'player', 'kicks', 'handballs',
                                    'disposals', 'marks', 'goals', 'behinds',
                                    'tackles', 'hitouts', 'inside_50s',
                                    'frees_for', 'frees_against', 'fantasy',
                                    'supercoach'])

print(df)

# print(soup.prettify())

# print(table)
编辑:所以我检查了源代码,发现表中有一些不正确的HTML

<table border="0" cellspacing="0" cellpadding="0" width="376" id="matchscoretable">
<tr>
<th class="leftbold" height="23" width="100">Team</td>

团队
它使用“/td”而不是“/th”,当通过beautiful soup解析时,会导致table标记关闭

[<table border="0" cellpadding="0" cellspacing="0" id="matchscoretable" width="376">
<tr>
<th class="leftbold" height="23" width="100">Team</th></tr></table>]
[
团队]

我可能需要寻找另一种方法来获得主客场球队的名字

我设法解决了这个问题,下面是完成的代码

from urllib.request import urlopen  # import the library
from bs4 import BeautifulSoup   # Import BS
from bs4 import SoupStrainer    # Import Soup Strainer
import pandas as pd    # import pandas as a package

# Scrape one AFL match from footywire.com: player statistics plus the
# home/away team names, printed as a single pandas DataFrame.

basescrape = 'http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid = '6172'

scrapeweb1 = basescrape + matchid

# IMPROVEMENT: the original fetched the same URL twice (page and page2)
# just to parse it two ways.  One request is enough — read the bytes once
# and hand them to both parsers.  This is also safer: two fetches could in
# principle return different documents.
html = urlopen(scrapeweb1).read()  # access the website once

only_tables = SoupStrainer('table', attrs={"width": "583"})  # player stats tables only

soup = BeautifulSoup(html, 'html.parser', parse_only=only_tables)   # parse the html
soup2 = BeautifulSoup(html, 'html.parser')  # full document, for team names


# Only rows that carry player data have this onmouseover handler.
table = soup.find_all("tr", attrs={"onmouseover": "this.bgColor='#cbcdd0';"})


# The second width-375 table holds the team names in its width-124 cells
# (works around the malformed <th ...>Team</td> markup in the score table).
Table1 = soup2.find_all('table', attrs={'width': "375"})[1]

team_cells = Table1.find_all('td', attrs={'width': "124"})
hometeam = team_cells[0].string.strip()
awayteam = team_cells[1].string.strip()


# One list per statistic, in the source table's column order.
player = []
kicks = []
handballs = []
disposals = []
marks = []
goals = []
behinds = []
tackles = []
hitouts = []
inside50s = []
freesfor = []
freesagainst = []
fantasy = []
supercoach = []

# Parallel to the first 14 <td> cells of each player row.
stat_lists = [player, kicks, handballs, disposals, marks, goals, behinds,
              tackles, hitouts, inside50s, freesfor, freesagainst,
              fantasy, supercoach]

# For each player row, append the stripped text of each cell to the
# matching statistic list (replaces 14 copy-pasted extraction stanzas).
for row in table:
    cells = row.find_all('td')
    for values, cell in zip(stat_lists, cells):
        values.append(cell.string.strip())


# Assemble the column data; scalar values (match id, team names) are
# broadcast across all player rows by pandas.
columns = {'match_id': matchid, 'home_team': hometeam, 'away_team': awayteam,
           'player': player, 'kicks': kicks, 'handballs': handballs,
           'disposals': disposals, 'marks': marks, 'goals': goals,
           'behinds': behinds, 'tackles': tackles, 'hitouts': hitouts,
           'inside_50s': inside50s, 'frees_for': freesfor,
           'frees_against': freesagainst, 'fantasy': fantasy,
           'supercoach': supercoach}

# Create a dataframe from the columns variable, with an explicit column order.
df = pd.DataFrame(columns, columns=['match_id', 'home_team', 'away_team',
                                    'player', 'kicks', 'handballs',
                                    'disposals', 'marks', 'goals', 'behinds',
                                    'tackles', 'hitouts', 'inside_50s',
                                    'frees_for', 'frees_against', 'fantasy',
                                    'supercoach'])

print(df)
从urllib.request导入urlopen#导入库
从bs4导入BeautifulSoup#导入BS
从bs4进口滤汤器#进口滤汤器
作为pd导入熊猫#作为包装导入熊猫
basescrape='http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid='6172'
scrapeweb1=basescrape+matchid
page=urlopen(scrapeweb1)#访问网站
page2=urlopen(scrapeweb1)#访问网站
only_tables=SoupStrainer('table',attrs={“width”:“583”})#解析时只解析表元素
soup=BeautifulSoup(页面'html.parser',parse_only=only_tables)#解析html
soup2=BeautifulSoup(第2页,“html.parser”)#解析html
#仅在中包含播放机数据的有效行
table=soup.find_all(“tr”,attrs={“onmouseover”:“this.bgColor='#cbcdd0';”)
#创建变量以保存数据
Table1=soup2.find_all('table',attrs={'width':“375”})[1]
hometeam=Table1.find_all('td',attrs={'width':“124”})[0].string.strip()
awayteam=Table1.find_all('td',attrs={'width':“124”})[1].string.strip()
玩家=[]
踢=[]
手球=[]
处置=[]
分数=[]
目标=[]
落后=[]
铲球=[]
命中率=[]
inside50s=[]
freesfor=[]
免责声明=[]
幻想=[]
超级通道=[]
#找到所有标记对,跳过第一个标记对,然后为每个标记对跳过第一个标记对。
对于soup.find_all(“tr”,attrs={“onmouseover”:“this.bgColor='#cbcdd0';”})中的行:
#创建每个标记对中所有标记对的变量,
col=行。查找所有('td'))
#在第一个标记对内创建字符串变量,
列_1=col[0]。string.strip()
#并将其附加到player变量
player.append(第1列)
#在第二个标记对内创建字符串变量,
列_2=col[1]。string.strip()
#并将其附加到kicks变量
kicks.append(第2列)
#在第三个标记对内创建字符串变量,
列_3=col[2]。string.strip()
#并将其附加到handballs变量
手球.附加(第3列)
#在第4个标记对内创建字符串变量,
列_4=col[3]。string.strip()
#并将其附加到disposals变量
处置。追加(第4列)
#在第5个标记对内创建字符串变量,
列_5=col[4]。string.strip()
#并将其附加到marks变量
标记。追加(第5列)
#在第5个标记对内创建字符串变量,
列_6=col[5]。string.strip()
#并将其附加到目标变量
目标。追加(第6列)
#在第5个标记对内创建字符串变量,
列_7=col[6]。string.strip()
#并将其附加到behinds变量
后面追加(第7列)
#在第5个标记对内创建字符串变量,
列_8=col[7]。string.strip()
#并将其附加到tackles变量
铲球。附加(第8列)
#在第5个标记对内创建字符串变量,
列_9=col[8]。string.strip()
#并将其附加到hitouts变量
暂停。追加(第9列)
#在第5个标记对内创建字符串变量,
列_10=col[9]。string.strip()
#并将其附加到inside50s变量
inside50s.追加(第10列)
#在第5个标记对内创建字符串变量,
列_11=col[10]。string.strip()
#并将其附加到freesfo变量
freesfor.append(第11列)
#在第5个标记对内创建字符串变量,
列_12=col[11]。string.strip()
#并将其附加到变量
附加(第12列)
#在第5个标记对内创建字符串变量,
列_13=col[12]。string.strip()
#并将其附加到fantasy变量
fantasy.append(第13列)
#在第5个标记对内创建字符串变量,
列_14=col[13]。string.strip()
#并将其附加到Supercach变量
supercach.append(第14列)
#创建列值的变量
列={'match\u id':matchid,'home\u team':home\u team,'客场\u team':awayteam,'player':player,'kicks':kicks,'handball':手球,'disposals':处置,'marks':marks,'goals':goals,'behinds':behinds,'stackles':铲球,'hittouts':hittouts,'inside50,'frees':freesfor':freesfor':freesfor,'frees,'freesfuncy'ach':超ach}
#从columns变量-n创建数据帧
df=pd.DataFrame(列,列=[‘比赛id’、‘主队’、‘客队’、‘球员’、‘踢’、‘手球’、‘处置’、‘得分’、‘进球’、‘落后’、‘铲球’、‘出局’、‘内线50’、‘出局’、‘出局’、‘出局’、‘幻想’、‘超级教练’])
打印(df)

以下是一种方法:

from urllib.request import urlopen  # stdlib HTTP client
from bs4 import BeautifulSoup   # HTML parser
from bs4 import SoupStrainer    # restricts parsing to selected tags
import pandas as pd    # DataFrame output

# Scrape both teams' player-statistics tables from a footywire match page
# and print them as one DataFrame with an added 'team' column.

basescrape = 'http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid = '6172'
url = ''.join([basescrape, matchid])

# Width 585 captures the outer tables, including the title row that
# carries each team's name.
only_tables = SoupStrainer('table', attrs={"width" : "585"}) # parse only table elements when parsing
soup = BeautifulSoup(urlopen(url), 'html.parser', parse_only=only_tables)   # parse the html

# The left-aligned inner title cells serve as anchor points, one per team.
teams = soup.find_all('td', attrs={'class':'innertbtitle', 'align':'left'})

# Accumulates one dict per player across both teams.
player_list = []
for team in teams:
    # The team name is the first word of the title text.
    team_name = team.text.strip().split(' ', maxsplit=1)[0]
    # The statistics table (width 583) follows the anchor in document order.
    rows = team.find_next('table', attrs={'width':583}).find_all('tr')
    # The first row is the header and supplies the column labels;
    # the remaining rows are the players.
    header, body = rows[0], rows[1:]
    labels = [cell.text for cell in header.find_all('td')]
    for row in body:
        # Pair each header label with the text of the matching cell.
        record = dict(zip(labels, (cell.text for cell in row.find_all('td'))))
        record['team'] = team_name
        player_list.append(record)

# One DataFrame row per player; columns come from the table headers + 'team'.
df = pd.DataFrame(player_list)
print(df)

非常感谢你花时间来做这件事,据我所知,这会将所有的数据刮到一个很好的数据框中?编辑:我刚刚设法抽出一些时间来运行它,哇,这太整洁了!非常感谢你的帮助。很抱歉让你这么痛苦,但你能帮我理解每一行吗?我正在拼命地努力提高我的理解力y python编码。我到目前为止的理解(使用“players_list=[]”作为第1行)1.创建一个名为“players_list”的空变量2.查看“teams”BS元素3中的每个“team”。team_名称由标题分配
from urllib.request import urlopen  # import the library
from bs4 import BeautifulSoup   # Import BS
from bs4 import SoupStrainer    # Import Soup Strainer
import pandas as pd    # import pandas as a package

# Scrape one AFL match from footywire.com: player statistics plus the
# home/away team names, printed as a single pandas DataFrame.

basescrape = 'http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid = '6172'

scrapeweb1 = basescrape + matchid

# IMPROVEMENT: the original fetched the same URL twice (page and page2)
# just to parse it two ways.  One request is enough — read the bytes once
# and hand them to both parsers.  This is also safer: two fetches could in
# principle return different documents.
html = urlopen(scrapeweb1).read()  # access the website once

only_tables = SoupStrainer('table', attrs={"width": "583"})  # player stats tables only

soup = BeautifulSoup(html, 'html.parser', parse_only=only_tables)   # parse the html
soup2 = BeautifulSoup(html, 'html.parser')  # full document, for team names


# Only rows that carry player data have this onmouseover handler.
table = soup.find_all("tr", attrs={"onmouseover": "this.bgColor='#cbcdd0';"})


# The second width-375 table holds the team names in its width-124 cells
# (works around the malformed <th ...>Team</td> markup in the score table).
Table1 = soup2.find_all('table', attrs={'width': "375"})[1]

team_cells = Table1.find_all('td', attrs={'width': "124"})
hometeam = team_cells[0].string.strip()
awayteam = team_cells[1].string.strip()


# One list per statistic, in the source table's column order.
player = []
kicks = []
handballs = []
disposals = []
marks = []
goals = []
behinds = []
tackles = []
hitouts = []
inside50s = []
freesfor = []
freesagainst = []
fantasy = []
supercoach = []

# Parallel to the first 14 <td> cells of each player row.
stat_lists = [player, kicks, handballs, disposals, marks, goals, behinds,
              tackles, hitouts, inside50s, freesfor, freesagainst,
              fantasy, supercoach]

# For each player row, append the stripped text of each cell to the
# matching statistic list (replaces 14 copy-pasted extraction stanzas).
for row in table:
    cells = row.find_all('td')
    for values, cell in zip(stat_lists, cells):
        values.append(cell.string.strip())


# Assemble the column data; scalar values (match id, team names) are
# broadcast across all player rows by pandas.
columns = {'match_id': matchid, 'home_team': hometeam, 'away_team': awayteam,
           'player': player, 'kicks': kicks, 'handballs': handballs,
           'disposals': disposals, 'marks': marks, 'goals': goals,
           'behinds': behinds, 'tackles': tackles, 'hitouts': hitouts,
           'inside_50s': inside50s, 'frees_for': freesfor,
           'frees_against': freesagainst, 'fantasy': fantasy,
           'supercoach': supercoach}

# Create a dataframe from the columns variable, with an explicit column order.
df = pd.DataFrame(columns, columns=['match_id', 'home_team', 'away_team',
                                    'player', 'kicks', 'handballs',
                                    'disposals', 'marks', 'goals', 'behinds',
                                    'tackles', 'hitouts', 'inside_50s',
                                    'frees_for', 'frees_against', 'fantasy',
                                    'supercoach'])

print(df)
from urllib.request import urlopen  # stdlib HTTP client
from bs4 import BeautifulSoup   # HTML parser
from bs4 import SoupStrainer    # restricts parsing to selected tags
import pandas as pd    # DataFrame output

# Scrape both teams' player-statistics tables from a footywire match page
# and print them as one DataFrame with an added 'team' column.

basescrape = 'http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid = '6172'
url = ''.join([basescrape, matchid])

# Width 585 captures the outer tables, including the title row that
# carries each team's name.
only_tables = SoupStrainer('table', attrs={"width" : "585"}) # parse only table elements when parsing
soup = BeautifulSoup(urlopen(url), 'html.parser', parse_only=only_tables)   # parse the html

# The left-aligned inner title cells serve as anchor points, one per team.
teams = soup.find_all('td', attrs={'class':'innertbtitle', 'align':'left'})

# Accumulates one dict per player across both teams.
player_list = []
for team in teams:
    # The team name is the first word of the title text.
    team_name = team.text.strip().split(' ', maxsplit=1)[0]
    # The statistics table (width 583) follows the anchor in document order.
    rows = team.find_next('table', attrs={'width':583}).find_all('tr')
    # The first row is the header and supplies the column labels;
    # the remaining rows are the players.
    header, body = rows[0], rows[1:]
    labels = [cell.text for cell in header.find_all('td')]
    for row in body:
        # Pair each header label with the text of the matching cell.
        record = dict(zip(labels, (cell.text for cell in row.find_all('td'))))
        record['team'] = team_name
        player_list.append(record)

# One DataFrame row per player; columns come from the table headers + 'team'.
df = pd.DataFrame(player_list)
print(df)