
Python Scraping and Looping Meta Tags with Beautiful Soup


Below is a web scraper that uses Beautiful Soup to scrape a team's roster from this website. Each column of data is put into an array and then looped into a CSV file. I want to scrape the team name as well ("Team" in my code), but I'm having trouble incorporating the meta tag (HTML shown below) into my CSV writerow loop:

<meta property="og:site_name" content="Tampa Bay Rays" />
but I need the Team array (the last array) to match the length of the first two arrays, like this:

[Player A, Player B, Player C]
[46,36,33]
[Tampa Bay Rays, Tampa Bay Rays, Tampa Bay Rays]
Does anyone know how to make this meta tag adjustment work inside my writerow CSV loop? Thanks in advance.

import requests
import csv
from bs4 import BeautifulSoup

page=requests.get('http://m.rays.mlb.com/roster/')
soup=BeautifulSoup(page.text, 'html.parser')

#Remove Unwanted Links
last_links=soup.find(class_='nav-tabset-container')
last_links.decompose()
side_links=soup.find(class_='column secondary span-5 right')
side_links.decompose()

#Generate CSV
f=csv.writer(open('MLB_Active_Roster.csv','w',newline=''))
f.writerow(['Name','Number','Hand','Height','Weight','DOB','Team'])

#Find Player Name Links
player_list=soup.find(class_='layout layout-roster')
player_list_items=player_list.find_all('a')

#Extract Player Name Text
names=[player_name.contents[0] for player_name in player_list_items]

#Find Player Number
number_list=soup.find(class_='layout layout-roster')
number_list_items=number_list.find_all('td',index='0')


#Extract Player Number Text
number=[player_number.contents[0] for player_number in number_list_items]

#Find B/T
hand_list=soup.find(class_='layout layout-roster')
hand_list_items=hand_list.find_all('td',index='3')

#Extract B/T
handedness=[player_hand.contents[0] for player_hand in hand_list_items]

#Find Height
height_list=soup.find(class_='layout layout-roster')
height_list_items=hand_list.find_all('td',index='4')

#Extract Height
height=[player_height.contents[0] for player_height in height_list_items]

#Find Weight
weight_list=soup.find(class_='layout layout-roster')
weight_list_items=weight_list.find_all('td',index='5')

#Extract Weight
weight=[player_weight.contents[0] for player_weight in weight_list_items]

#Find DOB
DOB_list=soup.find(class_='layout layout-roster')
DOB_list_items=DOB_list.find_all('td',index='6')

#Extract DOB
DOB=[player_DOB.contents[0] for player_DOB in DOB_list_items]

#Find Team Name
team_list=soup.find('meta',property='og:site_name')
Team=[team_name.contents[0] for team_name in team_list]
print(Team)

#Loop Excel Rows
for i in range(len(names)):
    f.writerow([names[i],number[i],handedness[i],height[i],weight[i],DOB[i],Team[i]])

The problem lies in the way you are using the find function.

Instead of using this:

player_list=soup.find(class_='layout layout-roster')
you should use this:

player_list=soup.find({"class":"layout layout-roster"})
(this change should be applied to all of your find calls)


Your final script should look like this:

import requests
import csv
from bs4 import BeautifulSoup

page=requests.get('http://m.rays.mlb.com/roster/')
soup=BeautifulSoup(page.text, 'html.parser')

#Remove Unwanted Links
last_links=soup.find({"class":'nav-tabset-container'})
last_links.decompose()
side_links=soup.find({"class":'column secondary span-5 right'})
side_links.decompose()

#Generate CSV
f=csv.writer(open('MLB_Active_Roster.csv','w',newline=''))
f.writerow(['Name','Number','Hand','Height','Weight','DOB','Team'])

#Find Player Name Links
player_list=soup.find({"class":'layout layout-roster'})
player_list_items=player_list.find_all('a')

#Extract Player Name Text
names=[player_name.contents[0] for player_name in player_list_items]

#Find Player Number
number_list=soup.find({"class":'layout layout-roster'})
number_list_items=number_list.find_all('td',{"index":'0'})


#Extract Player Number Text
number=[player_number.contents[0] for player_number in number_list_items]

#Find B/T
hand_list=soup.find({"class":'layout layout-roster'})
hand_list_items=hand_list.find_all('td',{"index":'3'})

#Extract B/T
handedness=[player_hand.contents[0] for player_hand in hand_list_items]

#Find Height
height_list=soup.find({"class":'layout layout-roster'})
height_list_items=hand_list.find_all('td',{"index":'4'})

#Extract Height
height=[player_height.contents[0] for player_height in height_list_items]

#Find Weight
weight_list=soup.find({"class":'layout layout-roster'})
weight_list_items=weight_list.find_all('td',{"index":'5'})

#Extract Weight
weight=[player_weight.contents[0] for player_weight in weight_list_items]

#Find DOB
DOB_list=soup.find({"class":'layout layout-roster'})
DOB_list_items=DOB_list.find_all('td',{"index":'6'})

#Extract DOB
DOB=[player_DOB.contents[0] for player_DOB in DOB_list_items]

#Find Team Name
team_list=soup.find('meta',{"property":'og:site_name'})
Team=[team_name.contents[0] for team_name in team_list]
print(Team)

#Loop Excel Rows
for i in range(len(names)):
    f.writerow([names[i],number[i],handedness[i],height[i],weight[i],DOB[i],Team[i]])

The change is simple. Change the #Find Team Name part to:

#Find Team Name
team_list=soup.find('meta',property='og:site_name')
Team = [team_list['content'] for _ in names]
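For reference, here is a small standalone illustration of why this works (a sketch that uses only the single meta tag quoted in the question, with toy player names). find() returns a Tag object whose attributes are read like dictionary keys, so team_list['content'] gives the team name once, and the comprehension repeats it once per player, which makes the Team list the same length as names:

from bs4 import BeautifulSoup

# Standalone illustration: only the meta tag quoted in the question.
html = '<meta property="og:site_name" content="Tampa Bay Rays" />'
tag = BeautifulSoup(html, 'html.parser').find('meta', property='og:site_name')

print(tag['content'])                          # Tampa Bay Rays
names = ['Player A', 'Player B', 'Player C']   # toy data from the question
print([tag['content'] for _ in names])         # one copy per player, same length as names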
Full program:

import requests
import csv
from bs4 import BeautifulSoup

page=requests.get('http://m.rays.mlb.com/roster/')
soup=BeautifulSoup(page.text, 'html.parser')

#Remove Unwanted Links
last_links=soup.find(class_='nav-tabset-container')
last_links.decompose()
side_links=soup.find(class_='column secondary span-5 right')
side_links.decompose()

#Generate CSV
f=csv.writer(open('MLB_Active_Roster.csv','w',newline=''))
f.writerow(['Name','Number','Hand','Height','Weight','DOB','Team'])

#Find Player Name Links
player_list=soup.find(class_='layout layout-roster')
player_list_items=player_list.find_all('a')

#Extract Player Name Text
names=[player_name.contents[0] for player_name in player_list_items]

#Find Player Number
number_list=soup.find(class_='layout layout-roster')
number_list_items=number_list.find_all('td',index='0')


#Extract Player Number Text
number=[player_number.contents[0] for player_number in number_list_items]

#Find B/T
hand_list=soup.find(class_='layout layout-roster')
hand_list_items=hand_list.find_all('td',index='3')

#Extract B/T
handedness=[player_hand.contents[0] for player_hand in hand_list_items]

#Find Height
height_list=soup.find(class_='layout layout-roster')
height_list_items=hand_list.find_all('td',index='4')

#Extract Height
height=[player_height.contents[0] for player_height in height_list_items]

#Find Weight
weight_list=soup.find(class_='layout layout-roster')
weight_list_items=weight_list.find_all('td',index='5')

#Extract Weight
weight=[player_weight.contents[0] for player_weight in weight_list_items]

#Find DOB
DOB_list=soup.find(class_='layout layout-roster')
DOB_list_items=DOB_list.find_all('td',index='6')

#Extract DOB
DOB=[player_DOB.contents[0] for player_DOB in DOB_list_items]

#Find Team Name
team_list=soup.find('meta',property='og:site_name')
Team = [team_list['content'] for _ in names]

for i in range(len(names)):
    f.writerow([names[i],number[i],handedness[i],height[i],weight[i],DOB[i],Team[i]])
The results, as they appear in the CSV file:

Name,Number,Hand,Height,Weight,DOB,Team
Jose Alvarado,46,L/L,"6'2""",245lbs,5/21/95,Tampa Bay Rays
Matt Andriese,35,R/R,"6'2""",225lbs,8/28/89,Tampa Bay Rays
Chris Archer,22,R/R,"6'2""",195lbs,9/26/88,Tampa Bay Rays
Diego Castillo,63,R/R,"6'3""",240lbs,1/18/94,Tampa Bay Rays
Nathan Eovaldi,24,R/R,"6'2""",225lbs,2/13/90,Tampa Bay Rays
Chih-Wei Hu,58,R/R,"6'0""",220lbs,11/4/93,Tampa Bay Rays
Andrew Kittredge,36,R/R,"6'1""",200lbs,3/17/90,Tampa Bay Rays
Adam Kolarek,56,L/L,"6'3""",205lbs,1/14/89,Tampa Bay Rays
Sergio Romo,54,R/R,"5'11""",185lbs,3/4/83,Tampa Bay Rays
Jaime Schultz,57,R/R,"5'10""",200lbs,6/20/91,Tampa Bay Rays
Blake Snell,4,L/L,"6'4""",200lbs,12/4/92,Tampa Bay Rays
Ryne Stanek,55,R/R,"6'4""",215lbs,7/26/91,Tampa Bay Rays
Hunter Wood,61,R/R,"6'1""",165lbs,8/12/93,Tampa Bay Rays
Ryan Yarbrough,48,R/L,"6'5""",205lbs,12/31/91,Tampa Bay Rays
Wilson Ramos,40,R/R,"6'1""",245lbs,8/10/87,Tampa Bay Rays
Jesus Sucre,45,R/R,"6'0""",200lbs,4/30/88,Tampa Bay Rays
Jake Bauers,9,L/L,"6'1""",195lbs,10/6/95,Tampa Bay Rays
Ji-Man Choi,26,L/R,"6'1""",230lbs,5/19/91,Tampa Bay Rays
C.J. Cron,44,R/R,"6'4""",235lbs,1/5/90,Tampa Bay Rays
Matt Duffy,5,R/R,"6'2""",170lbs,1/15/91,Tampa Bay Rays
Adeiny Hechavarria,11,R/R,"6'0""",195lbs,4/15/89,Tampa Bay Rays
Daniel Robertson,28,R/R,"5'11""",200lbs,3/22/94,Tampa Bay Rays
Joey Wendle,18,L/R,"6'1""",190lbs,4/26/90,Tampa Bay Rays
Carlos Gomez,27,R/R,"6'3""",220lbs,12/4/85,Tampa Bay Rays
Kevin Kiermaier,39,L/R,"6'1""",215lbs,4/22/90,Tampa Bay Rays
Mallex Smith,0,L/R,"5'10""",180lbs,5/6/93,Tampa Bay Rays

There is a lot of repetition in your code. Try to avoid copy-and-paste programming.

That said, you can build a list of identical items: ['foo']*3 gives ['foo','foo','foo']. This is handy for the team name, which is the same for every team member.

You can use zip() and writerows() to write all of the lists to the CSV in a single line of code.
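For illustration only (toy data taken from the arrays in the question), zip() pairs the parallel lists element by element, and writerows() then writes each resulting tuple as one CSV row:

names = ['Player A', 'Player B', 'Player C']
number = ['46', '36', '33']
team = ['Tampa Bay Rays'] * len(names)   # the ['foo']*3 trick described above

# zip() yields one tuple per player, ready for csv.writer.writerows()
print(list(zip(names, number, team)))
# [('Player A', '46', 'Tampa Bay Rays'),
#  ('Player B', '36', 'Tampa Bay Rays'),
#  ('Player C', '33', 'Tampa Bay Rays')]

Applying both ideas to the scraper gives the full program below: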

import requests
import csv
from bs4 import BeautifulSoup

page = requests.get('http://m.rays.mlb.com/roster/')
soup = BeautifulSoup(page.text, 'html.parser')

soup.find(class_='nav-tabset-container').decompose()
soup.find(class_='column secondary span-5 right').decompose()

roster = soup.find(class_='layout layout-roster')
names = [n.contents[0] for n in roster.find_all('a')]
number = [n.contents[0] for n in roster.find_all('td', index='0')]
handedness = [n.contents[0] for n in roster.find_all('td', index='3')]
height = [n.contents[0] for n in roster.find_all('td', index='4')]
weight = [n.contents[0] for n in roster.find_all('td', index='5')]
DOB = [n.contents[0] for n in roster.find_all('td', index='6')]
team = [soup.find('meta',property='og:site_name')['content']] * len(names)

with open('MLB_Active_Roster.csv', 'w', newline='') as fp:
    f = csv.writer(fp)
    f.writerow(['Name','Number','Hand','Height','Weight','DOB','Team'])
    f.writerows(zip(names, number, handedness, height, weight, DOB, team))

Thanks for the suggestions, Tomalak! I'm new to Python, so this is extremely helpful!
Thanks for the feedback, I'm glad you found it useful. Another approach you could try is to loop over the table rows and write one row of data for each one; there are many ways to solve this task.
Good idea. I'll definitely try that to get some more practice. Thanks again!
Thanks for your help, Fozoro! I kept running into a 'NoneType' error on the find_all('a') line under 'Find Player Name Links'. I eventually worked it out; thanks again for the guidance.
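Below is a minimal sketch of the row-by-row alternative mentioned in the comments above. It is an illustration under stated assumptions, not code from either answer: it assumes each roster entry is a tr row whose a tag holds the player name and whose td cells carry the same index attributes used throughout this thread.

import csv
import requests
from bs4 import BeautifulSoup

page = requests.get('http://m.rays.mlb.com/roster/')
soup = BeautifulSoup(page.text, 'html.parser')
soup.find(class_='nav-tabset-container').decompose()
soup.find(class_='column secondary span-5 right').decompose()

roster = soup.find(class_='layout layout-roster')
team = soup.find('meta', property='og:site_name')['content']

with open('MLB_Active_Roster.csv', 'w', newline='') as fp:
    writer = csv.writer(fp)
    writer.writerow(['Name', 'Number', 'Hand', 'Height', 'Weight', 'DOB', 'Team'])
    # One pass per table row: skip rows without a player link (e.g. headers),
    # then pull each cell by its index attribute, as the answers above do.
    for tr in roster.find_all('tr'):
        link = tr.find('a')
        if link is None:
            continue
        cells = {td.get('index'): td.get_text(strip=True) for td in tr.find_all('td')}
        writer.writerow([link.get_text(strip=True), cells.get('0'), cells.get('3'),
                         cells.get('4'), cells.get('5'), cells.get('6'), team])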