Python Scrapy Webcrawler和数据提取器
我试图用 Scrapy 创建一个 webcrawler,使用的是我以前用过的模板,但我似乎无法让它解析 URL。我可以看到它进入 YouTube,然后进入 watch 页面,但它不会从那里提取标题、描述或任何内容,因为解析总是失败。代码如下:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy import log
from krakenkrawler.items import KrakenItem
class AttractionSpider(CrawlSpider):
name = "thekraken"
allowed_domains = ["youtube.com"]
start_urls = [
"http://www.youtube.com/?gl=GB&hl=en-GB"
]
rules = ()
def __init__(self, name=None, **kwargs):
super(AttractionSpider, self).__init__(name, **kwargs)
self.items_buffer = {}
self.base_url = "http://www.youtube.com"
from scrapy.conf import settings
settings.overrides['DOWNLOAD_TIMEOUT'] = 360
def parse(self, response):
print "Start scrapping Attractions...."
try:
hxs = HtmlXPathSelector(response)
links = hxs.select("//h3[@class='yt-lockup-title']//a/@href")
if not links:
return
log.msg("No Data to scrap")
for link in links:
v_url = ''.join( link.extract() )
if not v_url:
continue
else:
_url = self.base_url + v_url
yield Request( url= _url, callback=self.parse_details )
except Exception as e:
log.msg("Parsing failed for URL {%s}"%format(response.request.url))
raise
def parse_details(self, response):
print "Start scrapping Detailed Info...."
try:
hxs = HtmlXPathSelector(response)
l_venue = KrakenItem()
v_name = hxs.select("//*[@id='eow-title'].text").extract()
if not v_name:
v_name = hxs.select("//*[@id='eow-title'].text").extract()
l_venue["name"] = v_name[0].strip()
base = hxs.select("//*[@id='content']/div[7]")
if base.extract()[0].strip() == "<div style=\"clear:both\"></div>":
base = hxs.select("//*[@id='content']/div[8]")
elif base.extract()[0].strip() == "<div style=\"padding-top:10px;margin-top:10px;border-top:1px dotted #DDD;\">\n You must be logged in to add a tip\n </div>":
base = hxs.select("//*[@id='content']/div[6]")
x_datas = base.select("div[1]/b").extract()
v_datas = base.select("div[1]/text()").extract()
i_d = 0;
if x_datas:
for x_data in x_datas:
print "data is:" + x_data.strip()
if x_data.strip() == "<b>Address:</b>":
l_venue["address"] = v_datas[i_d].strip()
if x_data.strip() == "<b>Contact:</b>":
l_venue["contact"] = v_datas[i_d].strip()
if x_data.strip() == "<b>Operating Hours:</b>":
l_venue["hours"] = v_datas[i_d].strip()
if x_data.strip() == "<b>Website:</b>":
l_venue["website"] = (base.select("//*[@id='watch-actions-share-panel']/div/div[2]/div[2]/span[1]/input/text()").extract())[0].strip()
i_d += 1
v_photo = base.select("img/@src").extract()
if v_photo:
l_venue["photo"] = v_photo[0].strip()
v_desc = base.select("div[3]/text()").extract()
if v_desc:
desc = ""
for dsc in v_desc:
desc += dsc
l_venue["desc"] = desc.strip()
v_video = hxs.select("//*[@id='content']/iframe/@src").extract()
if v_video:
l_venue["video"] = v_video[0].strip()
yield l_venue
except Exception as e:
log.msg("Parsing failed for URL {%s}"%format(response.request.url))
raise
提前非常感谢。

回答:问题是,您正在查找的结构 //h3[@class='yt-lockup-title']//a/@href 并不存在于所有页面中。我修改了您的代码,以验证打开了哪些页面以及提取了哪些数据:
class AttractionSpider(CrawlSpider):
name = "thekraken"
bot_name = 'kraken'
allowed_domains = ["youtube.com"]
start_urls = ["http://www.youtube.com/?gl=GB&hl=en-GB"]
rules = (
Rule(SgmlLinkExtractor(allow=('')), callback='parse_items',follow= True),
)
def parse_items(self, response):
print "Start scrapping Attractions...."
print response.url
try :
hxs = HtmlXPathSelector(response)
links = hxs.select("//h3[@class='yt-lockup-title']//a/@href")
for link in links:
v_url = ''.join( link.extract() )
print v_url
if not links:
log.msg("No Data to scrap")
except :
pass
结果是这样的:
Start scrapping Attractions....
Start scrapping Attractions....
Start scrapping Attractions....
Start scrapping Attractions....
/watch?v=TdICODRvAhc&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=CDGzm5edrlw&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=F2or5KS54JM&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=LHRzOIvqmQI&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=F4iqiM6h-2U&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=ug3UPIvWlvU&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=msiZs6lIZ9w&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=Jh6A3DoOLBg&list=UUrqsNpKuDQZreGaxBL_a5Jg
在没有搜索结果的内部页面中,不存在 yt-lockup-title 这个 class。
简言之,你必须改进你的爬虫(spider)。