Python - combining all scraped output into a single Excel/CSV
I followed someone else's code to scrape one year of game data for a single basketball player, and I've automated it so I can scrape Y years of data for player X, but it outputs one Excel file per year. Is there any way to scrape it all and end up with everything in a single Excel/CSV?
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from random import randint
from datetime import date
import numpy as np

poop = []

# NBA season and player we will be analyzing. Basketball-reference's html code is weird so inputs are in place to extract
year = input("Year End Season: ")
initial = input("Initial of Last Name: ")
last_name = input("First 5 letters of last name: ")
first_name = input("First 2 letters of first name (and 01 at the end unless multiple players with that name): ")
Number_pages = int(input("How many years to scrape? "))
years = range(Number_pages)

for adds in years:
    # URL of the page we will be scraping, built from the inputs above
    url = "https://www.basketball-reference.com/players/" + str(initial) + "/" + str(last_name) + str(first_name) + "/gamelog/" + str(int(year) + int(adds))
    # fetch and parse the HTML from the given URL
    html = urlopen(url)
    soup = BeautifulSoup(html, features="lxml")
    sleep(randint(2, 10))
    print(soup.div.h1.text.strip())
    # use findAll() to get the column headers and getText() to extract the text we need into a list
    container = soup.find('div', class_='table_outer_container')
    headers = [th.getText() for th in container.findAll('tr', limit=2)[0].findAll('th')]
    # exclude the first column as we will not need the ranking order from Basketball Reference for the analysis
    headers = headers[1:]
    # skip the first (header) row
    rows = container.findAll('tr')[1:]
    player_stats = [[td.getText() for td in rows[i].findAll('td')]
                    for i in range(len(rows))]
    stats = pd.DataFrame(player_stats, columns=headers)
    poop.append(stats)
    stats.to_csv(soup.div.h1.text.strip() + '.csv')
Yes - you already have what you need in the poop list:
# ... your imports ...
poop = []

# ... your logic ...
for adds in years:
    # ... your logic ...
    stats = pd.DataFrame(player_stats, columns=headers)
    # here, you append each year's dataframe to the 'poop' list
    poop.append(stats)

# now, concatenate all the dataframes into one
all_stats = pd.concat(poop)
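From there a single write produces the one file the question asks for. A minimal sketch, assuming the loop above has already run (the file names are illustrative, and to_excel needs an engine such as openpyxl installed):

# write the combined dataframe out once instead of once per year
all_stats = pd.concat(poop, ignore_index=True)  # drop the per-year indices
all_stats.to_csv('all_stats.csv', index=False)
# or, for a single Excel workbook:
# all_stats.to_excel('all_stats.xlsx', index=False)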
Encapsulate your process in a defined method with a return object, then build the list of dataframes via a list or dict comprehension:

Method
def bball_scrape(adds):
    # URL of the page we will be scraping, built from the inputs above
    url = "https://www.basketball-reference.com/players/{0}/{1}{2}/gamelog/{3}"
    url = url.format(str(initial), str(last_name), str(first_name),
                     str(int(year) + int(adds)))
    # fetch and parse the HTML from the given URL
    html = urlopen(url)
    soup = BeautifulSoup(html, features="lxml")
    sleep(randint(2, 10))
    print(soup.div.h1.text.strip())
    # use findAll() to get the column headers and getText() to extract the text we need into a list
    container = soup.find('div', class_='table_outer_container')
    headers = [th.getText() for th in container.findAll('tr', limit=2)[0].findAll('th')]
    # exclude the first column...
    headers = headers[1:]
    # skip the first (header) row
    rows = container.findAll('tr')[1:]
    player_stats = [[td.getText() for td in rows[i].findAll('td')]
                    for i in range(len(rows))]
    stats = pd.DataFrame(player_stats, columns=headers)
    stats.to_csv(soup.div.h1.text.strip() + '.csv')
    return stats
DataFrames
df_list = [bball_scrape(adds) for adds in years]
# df_dict = {adds: bball_scrape(adds) for adds in years}
final_df = pd.concat(df_list, ignore_index=True)
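As a side note, the commented-out dict variant is handy if you want each row tagged with the scrape it came from: handed a mapping, pd.concat turns the keys into the outer level of a MultiIndex. A sketch, assuming bball_scrape as defined above (the level name 'offset' is just an illustrative choice):

# keys become the outer index level, recording which scrape produced each row
df_dict = {adds: bball_scrape(adds) for adds in years}
final_df = pd.concat(df_dict, names=['offset', 'row'])
final_df = final_df.reset_index(level='offset')  # expose the label as a column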
Why not just use pandas to parse the tables here? You can also simplify some of the inputs:
import pandas as pd
from time import sleep
from random import randint

poop = []

# NBA season and player we will be analyzing. Basketball-reference's html code is weird so inputs are in place to extract
year = input("Year End Season: ")
last_name = input("Enter last name: ")
initial = last_name[0]
last_name = last_name[0:5]
first_name = input("Enter first name: ")
first_name = first_name[0:2] + '01'
Number_pages = int(input("How many years to scrape? "))
years = range(Number_pages)

for adds in years:
    # URL of the page we will be scraping, built from the inputs above
    url = "https://www.basketball-reference.com/players/" + str(initial) + "/" + str(last_name) + str(first_name) + "/gamelog/" + str(int(year) + int(adds))
    # let pandas parse the game-log table straight from the page
    stats = pd.read_html(url)[-1]
    poop.append(stats)
    sleep(randint(2, 10))

all_stats = pd.concat(poop, sort=False).reset_index(drop=True)
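One caveat worth hedging: basketball-reference game logs repeat their header row inside the table body every 20 games or so, and read_html keeps those as ordinary data rows. If that shows up in all_stats, a filter like this sketch (assuming the table has its usual 'Rk' rank column) cleans it out before the final write:

# drop the repeated in-body header rows (assumes the usual 'Rk' column)
all_stats = all_stats[all_stats['Rk'] != 'Rk'].reset_index(drop=True)
# then write everything to one file
all_stats.to_csv('all_stats.csv', index=False)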
Legend - this is exactly what I was looking for. Sorry, I was just typing gibberish.