Web scraping 寻找Alpha scraping电话会议记录问题
我试图收集一个研究项目(我是一名博士生)的 Seeking Alpha 电话会议记录。现在,我在网上找到了一段代码,可以提取成绩单并将其存储在 .json 文件中。我已经调整了代码以轮换用户代理(User-Agent)。但是,代码仅提取电话会议记录的第一页,问题出在以下两行:
body = response.css('div#a-body p.p1')
chunks = body.css('p.p1')
页面由一系列
元素表示,类.p1
.p2
.p3
等表示页码。我已经尝试了很多方法,例如用以下代码替换上述代码:
response.xpath('//div[@id="a-body"]/p')
但我无法提取完整的电话会议记录(只有第一页)。以下是完整代码:
import scrapy
# This enum lists the stages of each transcript.
from enum import Enum
import random
# SRC: https://developers.whatismybrowser.com/useragents/explore/
user_agent_list = [
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/37.0.2062.94 Chrome/37.0.2062.94 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
#Firefox
'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]
Stage = Enum('Stage', 'preamble execs analysts body')
# Some transcript preambles are concatenated on a single line. This list is used
# To separate the title and date sections of the string.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
transcripts = {}
class TranscriptSpider(scrapy.Spider):
name = 'transcripts'
custom_settings = {
'DOWNLOAD_DELAY': 2 # 0.25 == 250 ms of delay, 1 == 1000ms of delay, etc.
}
start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']
def parse(self, response):
# Follows each transcript page's link from the given index page.
for href in response.css('.dashboard-article-link::attr(href)').extract():
user_agent = random.choice(user_agent_list)
yield scrapy.Request(response.urljoin(href), callback=self.parse_transcript,headers={'User-Agent': user_agent})
# Follows the pagination links at the bottom of given index page.
next_page = response.css('li.next a::attr(href)').extract_first()
if next_page is not None:
next_page = response.urljoin(next_page)
yield scrapy.Request(next_page, callback=self.parse)
def parse_transcript(self, response):
i = 0
transcript = {}
details = {}
execs = []
analysts = []
script = []
mode = 1
# As the pages are represented by a series of `<p>` elements we have to do this the
# old-fashioned way - breaking it into chunks and iterating over them.
body = response.css('div#a-body p.p1')
chunks = body.css('p.p1')
while i < len(chunks):
# If the current line is a heading and we're not currently going
# through the transcript body (where headings represent speakers),
# change the current section flag to the next section.
if (len(chunks[i].css('strong::text').extract()) == 0) or (mode == 4):
currStage = Stage(mode)
# If we're on the preamble stage, each bit of data is extracted
# separately as they all have their own key in the JSON.
if currStage == Stage['preamble']:
# If we're on the first line of the preamble, that's the
# company name, stock exchange and ticker acroynm (or should
# be - see below)
if i == 0:
# Checks to see if the second line is a heading. If not,
# everything is fine.
if len(chunks[1].css('strong::text').extract()) == 0:
details['company'] = chunks[i].css('p::text').extract_first()
if " (" in details['company']:
details['company'] = details['company'].split(' (')[0]
# If a specific stock exchange is not listed, it
# defaults to NYSE
details['exchange'] = "NYSE"
details['ticker'] = chunks.css('a::text').extract_first()
if ":" in details['ticker']:
ticker = details['ticker'].split(':')
details['exchange'] = ticker[0]
details['ticker'] = ticker[1]
# However, if it is, that means this line contains the
# full, concatenated preamble, so everything must be
# extracted here
else:
details['company'] = chunks[i].css('p::text').extract_first()
if " (" in details['company']:
details['company'] = details['company'].split(' (')[0]
# if a specific stock exchange is not listed, default to NYSE
details['exchange'] = "NYSE"
details['ticker'] = chunks.css('a::text').extract_first()
if ":" in details['ticker']:
ticker = details['ticker'].split(':')
details['exchange'] = ticker[0]
details['ticker'] = ticker[1]
titleAndDate = chunks[i].css('p::text').extract[1]
for date in months:
if date in titleAndDate:
splits = titleAndDate.split(date)
details['title'] = splits[0]
details['date'] = date + splits[1]
# Otherwise, we're onto the title line.
elif i == 1:
title = chunks[i].css('p::text').extract_first()
# This should never be the case, but just to be careful
# I'm leaving it in.
if len(title) <= 0:
title = "NO TITLE"
details['title'] = title
# Or the date line.
elif i == 2:
details['date'] = chunks[i].css('p::text').extract_first()
# If we're onto the 'Executives' section, we create a list of
# all of their names, positions and company name (from the
# preamble).
elif currStage == Stage['execs']:
anExec = chunks[i].css('p::text').extract_first().split(" - ")
# This covers if the execs are separated with an em- rather
# than an en-dash (see above).
if len(anExec) <= 1:
anExec = chunks[i].css('p::text').extract_first().split(" – ")
name = anExec[0]
if len(anExec) > 1:
position = anExec[1]
# Again, this should never be the case, as an Exec-less
# company would find it hard to get much done.
else:
position = ""
execs.append((name,position,details['company']))
# This does the same, but with the analysts (which never seem
# to be separated by em-dashes for some reason).
elif currStage == Stage['analysts']:
name = chunks[i].css('p::text').extract_first().split(" - ")[0]
company = chunks[i].css('p::text').extract_first().split(" - ")[1]
analysts.append((name,company))
# This strips the transcript body of everything except simple
# HTML, and stores that.
elif currStage == Stage['body']:
line = chunks[i].css('p::text').extract_first()
html = "p>"
if line is None:
line = chunks[i].css('strong::text').extract_first()
html = "h1>"
script.append("<"+html+line+"</"+html)
else:
mode += 1
i += 1
# Adds the various arrays to the dictionary for the transcript
details['exec'] = execs
details['analysts'] = analysts
details['transcript'] = ''.join(script)
# Adds this transcript to the dictionary of all scraped
# transcripts, and yield that for the output
transcript["entry"] = details
yield transcript
import scrapy
#此枚举列出了每个成绩单的阶段。
从枚举导入枚举
随机输入
#SRC:https://developers.whatismybrowser.com/useragents/explore/
用户\代理\列表=[
“Mozilla/5.0(X11;Linux x86_64)AppleWebKit/537.36(KHTML,类似Gecko)Ubuntu Chromium/37.0.2062.94 Chrome/37.0.2062.94 Safari/537.36”,
“Mozilla/5.0(Windows NT 6.1;WOW64)AppleWebKit/537.36(KHTML,类似Gecko)Chrome/45.0.2454.85 Safari/537.36”,
“Mozilla/5.0(Windows NT 6.1;WOW64;Trident/7.0;rv:11.0)就像Gecko一样”,
“Mozilla/5.0(Windows NT 6.1;WOW64;rv:40.0)Gecko/20100101 Firefox/40.0”,
“Mozilla/5.0(X11;Linux x86_64)AppleWebKit/537.36(KHTML,类似Gecko)Chrome/44.0.2403.157 Safari/537.36”,
“Mozilla/5.0(Windows NT 6.3;Win64;x64)AppleWebKit/537.36(KHTML,类似Gecko)Chrome/60.0.3112.113 Safari/537.36”,
“Mozilla/5.0(Windows NT 10.0;Win64;x64)AppleWebKit/537.36(KHTML,类似Gecko)Chrome/57.0.2987.133 Safari/537.36”,
“Mozilla/5.0(Windows NT 6.1;Win64;x64)AppleWebKit/537.36(KHTML,类似Gecko)Chrome/57.0.2987.133 Safari/537.36”,
“Mozilla/5.0(Windows NT 10.0;Win64;x64)AppleWebKit/537.36(KHTML,类似Gecko)Chrome/55.0.2883.87 Safari/537.36”,
“Mozilla/5.0(Windows NT 6.1;Win64;x64)AppleWebKit/537.36(KHTML,类似Gecko)Chrome/55.0.2883.87 Safari/537.36”,
#火狐
“Mozilla/4.0(兼容;MSIE 9.0;Windows NT 6.1)”,
“Mozilla/5.0(Windows NT 6.1;WOW64;Trident/7.0;rv:11.0)就像Gecko一样”,
“Mozilla/5.0(兼容;MSIE 9.0;Windows NT 6.1;WOW64;Trident/5.0)”,
“Mozilla/5.0(Windows NT 6.1;Trident/7.0;rv:11.0)就像Gecko一样”,
“Mozilla/5.0(Windows NT 6.2;WOW64;Trident/7.0;rv:11.0)就像Gecko一样”,
“Mozilla/5.0(Windows NT 10.0;WOW64;Trident/7.0;rv:11.0)就像Gecko一样”,
“Mozilla/5.0(兼容;MSIE 9.0;Windows NT 6.0;Trident/5.0)”,
“Mozilla/5.0(Windows NT 6.3;WOW64;Trident/7.0;rv:11.0)就像Gecko一样”,
“Mozilla/5.0(兼容;MSIE 9.0;Windows NT 6.1;Trident/5.0)”,
“Mozilla/5.0(Windows NT 6.1;Win64;x64;Trident/7.0;rv:11.0)就像Gecko一样”,
“Mozilla/5.0(兼容;MSIE 10.0;Windows NT 6.1;WOW64;Trident/6.0)”,
“Mozilla/5.0(兼容;MSIE 10.0;Windows NT 6.1;Trident/6.0)”,
“Mozilla/4.0(兼容;MSIE 8.0;Windows NT 5.1;Trident/4.0;.NET CLR 2.0.50727;.NET CLR 3.0.4506.2152;.NET CLR 3.5.30729)”
]
Stage=Enum('Stage','preamble execs-body')
#一些转录本的前言连在一行上。使用此列表
#分隔字符串的标题和日期部分。
月份=[“一月”、“二月”、“三月”、“四月”、“五月”、“六月”、“七月”、“八月”、“九月”、“十月”、“十一月”、“十二月”]
转录本={}
类转录本蜘蛛(scrapy.Spider):
姓名=‘成绩单’
自定义设置={
“下载延迟”:2#0.25==250毫秒的延迟,1==1000毫秒的延迟,等等。
}
起始URL=['http://seekingalpha.com/earnings/earnings-call-transcripts/1']
def解析(自我,响应):
#跟踪给定索引页中每个成绩单页的链接。
for href in response.css('.dashboard article link::attr(href)')。extract():
用户代理=随机。选择(用户代理列表)
产生scrapy.Request(response.urljoin(href),callback=self.parse_transcript,headers={'User-Agent':User_-Agent})
#遵循给定索引页底部的分页链接。
next_page=response.css('li.next a::attr(href')).extract_first()
如果下一页不是“无”:
下一页=response.urljoin(下一页)
生成scrapy.Request(下一页,callback=self.parse)
def parse_转录本(自我,回复):
i=0
转录本={}
详细信息={}
execs=[]
分析师=[]
脚本=[]
模式=1
#由于页面由一系列``元素表示,因此我们必须按照
#老式的方法-将其分解成块并迭代。
body=response.css('div#a-body p.p1')
chunks=body.css('p.p1')
而 i < len(chunks): …(机器翻译的代码副本在此中断)

回答:在 parse_transcript 方法末尾也要跟随成绩单页面底部的分页链接,例如:
# Follows the pagination links at the bottom of transcript page.
next_page = response.css(YOUR CSS SELECTOR GOES HERE).extract_first()
if next_page is not None:
next_page = response.urljoin(next_page)
yield scrapy.Request(next_page, callback=self.parse_transcript)
body = response.css('div#a-body p.p1')
chunks = body.css('p.p1')