Python 与 BeautifulSoup 抓取足球比分_Python_Beautifulsoup

Python 与 BeautifulSoup 抓取足球比分

python

Python 巨蟒与美女刮痧足球,python,beautifulsoup,Python,Beautifulsoup,在查看一个足球抓取python程序时，在stackoverflow上发现了这个适合我需要的程序，我试图做的是修改它，只返回特定日期或一组日期的分数，只是不知道如何做到这一点。但这只是返回每个日期的每个结果。谢谢，马尔您需要从标题标签中检索日期，并添加一些逻辑来对照所需日期列表进行检查使用正则表达式从标题标记字符串中检索日期，并将其转换为datetime对象： date_pattern = re.compile(r'(\d{1,2})\w{2}\s([\w]+\s\d{4})') #

在查看一个足球抓取python程序时，在stackoverflow上发现了这个适合我需要的程序，

我试图做的是修改它，只返回特定日期或一组日期的分数，只是不知道如何做到这一点。但这只是返回每个日期的每个结果。谢谢，马尔

您需要从标题标签中检索日期，并添加一些逻辑来对照所需日期列表进行检查

使用正则表达式从标题标记字符串中检索日期，并将其转换为datetime对象：

# Matches a date such as "9th January 2017": group 1 captures the day number,
# the two-character ordinal suffix is skipped, group 2 captures "<month> <year>".
date_pattern = re.compile(r'(\d{1,2})\w{2}\s([\w]+\s\d{4})') # pattern

# findall returns one (day, "month year") tuple per date found in the text of played_tag
played_date = re.findall(date_pattern, played_tag.get_text()) # find pattern in caption tag string played_tag
played_date = ' '.join(played_date[0]) # first match only, e.g. "9 January 2017"; IndexError if nothing matched
played_date = datetime.strptime(played_date, '%d %B %Y') # convert it into datetime object

并定义一个要抓取的日期列表：

list_of_dates = ['2017-01-07', '2017-01-06'] # put all dates to be print here

def dates_to_datetime(dates):
    """Parse every 'YYYY-MM-DD' string in *dates* into a datetime.

    Returns a new list of datetime objects in the same order as the input;
    datetime.strptime raises ValueError for a string that does not match
    the '%Y-%m-%d' format.
    """
    iso_format = '%Y-%m-%d'
    return [datetime.strptime(text, iso_format) for text in dates]

list_of_dates = dates_to_datetime(list_of_dates) # converts list_of_dates to list of datetime objects

最后，您需要添加一条if语句，以检查从标题标记字符串检索到的日期是否在日期列表中：

# check if retrieved date is in list_of_dates
if played_date in list_of_dates:

以下是完整的示例：

import re
from datetime import datetime
from bs4 import BeautifulSoup
import urllib2
import csv


list_of_dates = ['2017-01-07', '2017-01-06'] # put all dates to be print here


def dates_to_datetime(dates):
    """Parse every 'YYYY-MM-DD' string in *dates* into a datetime.

    Returns a new list of datetime objects in the same order as the input;
    datetime.strptime raises ValueError for a string that does not match
    the '%Y-%m-%d' format.
    """
    iso_format = '%Y-%m-%d'
    return [datetime.strptime(text, iso_format) for text in dates]


list_of_dates = dates_to_datetime(list_of_dates) # converts list_of_dates to list of datetime objects

date_pattern = re.compile(r'(\d{1,2})\w{2}\s([\w]+\s\d{4})') # date pattern in <caption> tag i.e.: 9th January 2017

url = 'http://www.bbc.co.uk/sport/football/fa-cup/results'
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page)

league =  'FA'

for games in soup.find_all('table', class_='table-stats'):
    played_tag = games.find('caption')

    # retrieve date from caption tag
    played_date = re.findall(date_pattern, played_tag.get_text())
    played_date = ' '.join(played_date[0])
    played_date = datetime.strptime(played_date, '%d %B %Y')

    # check if retrieved date is in list_of_dates
    if played_date in list_of_dates:

        played = played_tag and ''.join(played_tag.stripped_strings)
        print played

        for match in soup.find_all('td', class_='match-details'):
            home_tag = match.find('span', class_='team-home')
            home = home_tag and ''.join(home_tag.stripped_strings)
            score_tag = match.find('span', class_='score')
            score = score_tag and ''.join(score_tag.stripped_strings)
            away_tag = match.find('span', class_='team-away')
            away = away_tag and ''.join(away_tag.stripped_strings)

            if ( score.split('-')[0] > score.rsplit('-')[1] ):
                home_win = 1
            else:
                home_win = 0

            if (score.rsplit('-')[1] > score.split('-')[0] ):
                away_win = 1
            else:
                away_win = 0


            if home and score and away:
                print league,',',home,',',home_win,',',score
                print league,',',away,',',away_win,',',score
    else:
        pass

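import re
from datetime import datetime
from bs4 import BeautifulSoup
import urllib2
import csv
list_of_dates = ['2017-01-07', '2017-01-06'] # put all dates to be print here
def dates_to_datetime(dates):
    """ Converts a list of date strings to a list of datetime objects """
    datetime_objs = []
    for d in dates:
        datetime_objs.append(datetime.strptime(d, '%Y-%m-%d'))
    return datetime_objs
list_of_dates = dates_to_datetime(list_of_dates) # converts list_of_dates to list of datetime objects
date_pattern = re.compile(r'(\d{1,2})\w{2}\s([\w]+\s\d{4})') # date pattern in <caption> tag i.e.: 9th January 2017
url = 'http://www.bbc.co.uk/sport/football/fa-cup/results'
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page)
league =  'FA'
for games in soup.find_all('table', class_='table-stats'):
    played_tag = games.find('caption')
    # retrieve date from caption tag
    played_date = re.findall(date_pattern, played_tag.get_text())
    played_date = ' '.join(played_date[0])
    played_date = datetime.strptime(played_date, '%d %B %Y')
    # check if retrieved date is in list_of_dates
    if played_date in list_of_dates:
        played = played_tag and ''.join(played_tag.stripped_strings)
        print played
        for match in soup.find_all('td', class_='match-details'):
            home_tag = match.find('span', class_='team-home')
            home = home_tag and ''.join(home_tag.stripped_strings)
            score_tag = match.find('span', class_='score')
            score = score_tag and ''.join(score_tag.stripped_strings)
            away_tag = match.find('span', class_='team-away')
            away = away_tag and ''.join(away_tag.stripped_strings)
            if ( score.split('-')[0] > score.rsplit('-')[1] ):
                home_win = 1
            else:
                home_win = 0
            if (score.rsplit('-')[1] > score.split('-')[0] ):
                away_win = 1
            else:
                away_win = 0
            if home and score and away:
                print league,',',home,',',home_win,',',score
                print league,',',away,',',away_win,',',score
    else:
        pass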

感谢@Benjamin让我走上了正确的轨道，对他的答案进行了一点修改，以检查第二个循环中的日期，我知道这是低效的，因为它将循环遍历每个选定日期的所有数据，但它确实实现了我的最终目标

import re
from datetime import datetime
from bs4 import BeautifulSoup
import urllib2
import csv


list_of_dates = ['2016-11-06', '2016-11-05'] # put all dates to be print here


def dates_to_datetime(dates):
    """Parse every 'YYYY-MM-DD' string in *dates* into a datetime.

    Returns a new list of datetime objects in the same order as the input;
    datetime.strptime raises ValueError for a string that does not match
    the '%Y-%m-%d' format.
    """
    iso_format = '%Y-%m-%d'
    return [datetime.strptime(text, iso_format) for text in dates]


list_of_dates = dates_to_datetime(list_of_dates) # converts list_of_dates to list of datetime objects

date_pattern = re.compile(r'(\d{1,2})\w{2}\s([\w]+\s\d{4})') # date pattern in <caption> tag i.e.: 9th January 2017

url = 'http://www.bbc.co.uk/sport/football/fa-cup/results'
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page)

league =  'FA'

for games in soup.find_all('table', class_='table-stats'):
    played_tag = games.find('caption')

    # retrieve date from caption tag
    played_date = re.findall(date_pattern, played_tag.get_text())
    played_date = ' '.join(played_date[0])
    played_date = datetime.strptime(played_date, '%d %B %Y')

    # check if retrieved date is in list_of_dates
    if played_date in list_of_dates:

        for match in soup.find_all('td', class_='match-details'):

        # get parent match date            
            match_date = match.parent.parent.parent.caption
            if match_date == played_tag:
                home_tag = match.find('span', class_='team-home')
                home = home_tag and ''.join(home_tag.stripped_strings)
                score_tag = match.find('span', class_='score')
                score = score_tag and ''.join(score_tag.stripped_strings)
                away_tag = match.find('span', class_='team-away')
                away = away_tag and ''.join(away_tag.stripped_strings)

                if ( score.split('-')[0] > score.rsplit('-')[1] ):
                    home_win = 1
                else:
                    home_win = 0

                if (score.rsplit('-')[1] > score.split('-')[0] ):
                    away_win = 1
                else:
                    away_win = 0


                if home and score and away:
                    print played_date, ',', league,',',home,',',home_win
                    print played_date, ',', league,',',away,',',away_win
        else:
            pass

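import re
from datetime import datetime
from bs4 import BeautifulSoup
import urllib2
import csv
list_of_dates = ['2016-11-06', '2016-11-05'] # put all dates to be print here
def dates_to_datetime(dates):
    """ Converts a list of date strings to a list of datetime objects """
    datetime_objs = []
    for d in dates:
        datetime_objs.append(datetime.strptime(d, '%Y-%m-%d'))
    return datetime_objs
list_of_dates = dates_to_datetime(list_of_dates) # converts list_of_dates to list of datetime objects
date_pattern = re.compile(r'(\d{1,2})\w{2}\s([\w]+\s\d{4})') # date pattern in <caption> tag i.e.: 9th January 2017
url = 'http://www.bbc.co.uk/sport/football/fa-cup/results'
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page)
league =  'FA'
for games in soup.find_all('table', class_='table-stats'):
    played_tag = games.find('caption')
    # retrieve date from caption tag
    played_date = re.findall(date_pattern, played_tag.get_text())
    played_date = ' '.join(played_date[0])
    played_date = datetime.strptime(played_date, '%d %B %Y')
    # check if retrieved date is in list_of_dates
    if played_date in list_of_dates:
        for match in soup.find_all('td', class_='match-details'):
            # get parent match date
            match_date = match.parent.parent.parent.caption
            if match_date == played_tag:
                home_tag = match.find('span', class_='team-home')
                home = home_tag and ''.join(home_tag.stripped_strings)
                score_tag = match.find('span', class_='score')
                score = score_tag and ''.join(score_tag.stripped_strings)
                away_tag = match.find('span', class_='team-away')
                away = away_tag and ''.join(away_tag.stripped_strings)
                if ( score.split('-')[0] > score.rsplit('-')[1] ):
                    home_win = 1
                else:
                    home_win = 0
                if (score.rsplit('-')[1] > score.split('-')[0] ):
                    away_win = 1
                else:
                    away_win = 0
                if home and score and away:
                    print played_date, ',', league,',',home,',',home_win
                    print played_date, ',', league,',',away,',',away_win
        else:
            pass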

这样更好，因为它不会遍历所有比赛，只遍历所需日期的那些比赛

import re
from datetime import datetime
from bs4 import BeautifulSoup
import urllib2
import csv

f = open('/FACup.csv', 'wt')
writer = csv.writer(f)

list_of_dates = ['2017-01-09', '2017-01-08', '2017-01-07', '2017-01-06'] # put all dates to be print here


def dates_to_datetime(dates):
    """Parse every 'YYYY-MM-DD' string in *dates* into a datetime.

    Returns a new list of datetime objects in the same order as the input;
    datetime.strptime raises ValueError for a string that does not match
    the '%Y-%m-%d' format.
    """
    iso_format = '%Y-%m-%d'
    return [datetime.strptime(text, iso_format) for text in dates]


list_of_dates = dates_to_datetime(list_of_dates) # converts list_of_dates to list of datetime objects

# Date as written in a <caption> tag, e.g. "9th January 2017": group 1 is the
# day number (the ordinal suffix is skipped), group 2 is "<month name> <year>".
date_pattern = re.compile(r'(\d{1,2})\w{2}\s([\w]+\s\d{4})')

url = 'http://www.bbc.co.uk/sport/football/fa-cup/results'

league =  'FA'

# f and writer are created earlier in the script; everything that can fail
# runs inside try so the CSV file is closed even when the download or the
# parsing raises.
try:
    page = urllib2.urlopen(url).read()
    # Name the parser explicitly so the parse tree does not depend on which
    # parser libraries happen to be installed.
    soup = BeautifulSoup(page, 'html.parser')

    for games in soup.find_all('table', class_='table-stats'):
        played_tag = games.find('caption')
        if played_tag is None:
            # table without a caption: no date to filter on
            continue

        # retrieve date from caption tag
        date_match = date_pattern.search(played_tag.get_text())
        if date_match is None:
            # caption does not contain a recognisable date
            continue
        played_date = datetime.strptime(' '.join(date_match.groups()), '%d %B %Y')

        # check if retrieved date is in list_of_dates
        if played_date not in list_of_dates:
            continue

        date_text = played_date.strftime('%Y-%m-%d')

        for match in games.find_all('td', class_='match-details'):
            home_tag = match.find('span', class_='team-home')
            home = home_tag and ''.join(home_tag.stripped_strings)
            score_tag = match.find('span', class_='score')
            score = score_tag and ''.join(score_tag.stripped_strings)
            away_tag = match.find('span', class_='team-away')
            away = away_tag and ''.join(away_tag.stripped_strings)

            # Skip incomplete rows before touching score, which may be None.
            if not (home and score and away):
                continue

            # Compare goals as integers; as strings '10' > '9' is False.
            try:
                home_goals, away_goals = [int(goals) for goals in score.split('-')]
            except ValueError:
                # not a numeric "H-A" score, so no winner can be derived
                continue

            home_win = 1 if home_goals > away_goals else 0
            away_win = 1 if away_goals > home_goals else 0

            # One row per team: league, date, team, win flag, home/away marker.
            writer.writerow( (league, date_text, home, home_win, 'H') )
            writer.writerow( (league, date_text, away, away_win, 'A') )
finally:
    f.close()

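import re
from datetime import datetime
from bs4 import BeautifulSoup
import urllib2
import csv
f = open('/FACup.csv', 'wt')
writer = csv.writer(f)
list_of_dates = ['2017-01-09', '2017-01-08', '2017-01-07', '2017-01-06'] # put all dates to be print here
def dates_to_datetime(dates):
    """ Converts a list of date strings to a list of datetime objects """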
import re
from datetime import datetime
from bs4 import BeautifulSoup
import urllib2
import csv

f = open('/FACup.csv', 'wt')
writer = csv.writer(f)

list_of_dates = ['2017-01-09', '2017-01-08', '2017-01-07', '2017-01-06'] # put all dates to be print here


def dates_to_datetime(dates):
    """Parse every 'YYYY-MM-DD' string in *dates* into a datetime.

    Returns a new list of datetime objects in the same order as the input;
    datetime.strptime raises ValueError for a string that does not match
    the '%Y-%m-%d' format.
    """
    iso_format = '%Y-%m-%d'
    return [datetime.strptime(text, iso_format) for text in dates]


list_of_dates = dates_to_datetime(list_of_dates) # converts list_of_dates to list of datetime objects

# Date as written in a <caption> tag, e.g. "9th January 2017": group 1 is the
# day number (the ordinal suffix is skipped), group 2 is "<month name> <year>".
date_pattern = re.compile(r'(\d{1,2})\w{2}\s([\w]+\s\d{4})')

url = 'http://www.bbc.co.uk/sport/football/fa-cup/results'

league =  'FA'

# f and writer are created earlier in the script; everything that can fail
# runs inside try so the CSV file is closed even when the download or the
# parsing raises.
try:
    page = urllib2.urlopen(url).read()
    # Name the parser explicitly so the parse tree does not depend on which
    # parser libraries happen to be installed.
    soup = BeautifulSoup(page, 'html.parser')

    for games in soup.find_all('table', class_='table-stats'):
        played_tag = games.find('caption')
        if played_tag is None:
            # table without a caption: no date to filter on
            continue

        # retrieve date from caption tag
        date_match = date_pattern.search(played_tag.get_text())
        if date_match is None:
            # caption does not contain a recognisable date
            continue
        played_date = datetime.strptime(' '.join(date_match.groups()), '%d %B %Y')

        # check if retrieved date is in list_of_dates
        if played_date not in list_of_dates:
            continue

        date_text = played_date.strftime('%Y-%m-%d')

        for match in games.find_all('td', class_='match-details'):
            home_tag = match.find('span', class_='team-home')
            home = home_tag and ''.join(home_tag.stripped_strings)
            score_tag = match.find('span', class_='score')
            score = score_tag and ''.join(score_tag.stripped_strings)
            away_tag = match.find('span', class_='team-away')
            away = away_tag and ''.join(away_tag.stripped_strings)

            # Skip incomplete rows before touching score, which may be None.
            if not (home and score and away):
                continue

            # Compare goals as integers; as strings '10' > '9' is False.
            try:
                home_goals, away_goals = [int(goals) for goals in score.split('-')]
            except ValueError:
                # not a numeric "H-A" score, so no winner can be derived
                continue

            home_win = 1 if home_goals > away_goals else 0
            away_win = 1 if away_goals > home_goals else 0

            # One row per team: league, date, team, win flag, home/away marker.
            writer.writerow( (league, date_text, home, home_win, 'H') )
            writer.writerow( (league, date_text, away, away_win, 'A') )
finally:
    f.close()