Python 我的scrapy crawler不会从amazon.com返回结果
伙计们,我已经为基于刮擦的网络爬虫编写代码好几个星期了。他们似乎在按预期工作。我成了一个狂热的粉丝。但在过去的几天里,我最新的“刮痧爬虫”拒绝在亚马逊网站上爬行。我没有得到任何结果。我也没有得到任何错误代码。我甚至试过刮壳。它只是不返回任何结果。我怀疑问题出在xpath或css表达式中,但我无法找到答案。任何帮助都将不胜感激 这是我的蜘蛛的样子 我的代码打印xxxxx,之后什么也不打印Python 我的scrapy crawler不会从amazon.com返回结果,python,scrapy,web-crawler,amazon,Python,Scrapy,Web Crawler,Amazon,伙计们,我已经为基于刮擦的网络爬虫编写代码好几个星期了。他们似乎在按预期工作。我成了一个狂热的粉丝。但在过去的几天里,我最新的“刮痧爬虫”拒绝在亚马逊网站上爬行。我没有得到任何结果。我也没有得到任何错误代码。我甚至试过刮壳。它只是不返回任何结果。我怀疑问题出在xpath或css表达式中,但我无法找到答案。任何帮助都将不胜感激 这是我的蜘蛛的样子 我的代码打印xxxxx,之后什么也不打印 import scrapy from scrapy.spiders import CrawlSpider,
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from amazon.items import LowesItem
from amazon.items import SwatchcolorItem
class SattySpider(scrapy.Spider):
    """Scrape Amazon bathroom-faucet search results.

    For every product tile on a result page, builds a LowesItem, requests
    the product-detail page, and fills in the model number and spec-table
    fields in ``parse_productdetail``.

    NOTE(review): ``rules`` are only honoured by ``CrawlSpider`` subclasses;
    on a plain ``scrapy.Spider`` they are silently ignored.  Kept unchanged
    for backward compatibility.

    NOTE(review): Amazon rejects requests carrying Scrapy's default
    User-Agent, which is the most likely reason no results come back —
    set ``USER_AGENT`` in settings.py (or use the official API).
    """

    name = "faucets"
    allowed_domains = ["amazon.com"]
    start_urls = [
        "https://www.amazon.com/s?ie=UTF8&page=1&rh=n%3A228013%2Ck%3Abathroom%20faucets"
    ]
    rules = (
        # Raw string so the backslash in the regex is not treated as a
        # (deprecated) Python string escape.
        Rule(LinkExtractor(allow=r'amazon\.com/[A-Z][a-zA-Z_/]+$'),
             'parse_category', follow=True,
             ),
    )

    def parse(self, response):
        """Parse one search-result page.

        Yields one Request per product detail page (carrying a partially
        filled LowesItem in ``meta``) plus a Request for the next page.
        """
        # Parenthesised print works under both Python 2 and Python 3.
        print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
        # NOTE(review): this CSS selector yields href *attribute values*,
        # so the nested .xpath() calls below select inside an attribute
        # node and extract nothing.  If per-tile extraction is wanted,
        # iterate over the <li> elements instead — confirm intent.
        for sel in response.css("li.s-result-item.celwidget.s-hidden-sponsored-item > div.s-item-container > div > div > a::attr('href')"):
            prodDesc = sel.xpath('.//div[@class="s-item-container"]//div[@class="a-row a-spacing-none"]//a[@title]').extract()
            print(prodDesc)
            produrls = sel.xpath('.//@data-producturl').extract()
            urls = sel.xpath('.//@data-productimg').extract()
            lowi = LowesItem()
            lowi['swatcharray'] = {}
            # Collect one swatch-colour label (img/@alt) per colour option.
            for idx, swatch in enumerate(sel.xpath('.//div[@class="product-container js-product-container"]//a//div[@class="pvs pvs-options-height v-spacing-small"]//ul/li')):
                swatchcolor = swatch.xpath('.//img//@alt').extract()
                lowi['swatcharray'][idx] = swatchcolor
            for idx1, url in enumerate(urls):
                url_prod_det = response.urljoin(produrls[idx1])
                # BUG FIX: the original referenced the undefined name
                # ``prod`` (NameError at runtime); the extracted product
                # titles live in ``prodDesc``.
                yield scrapy.Request(url_prod_det,
                                     meta={'lowes': LowesItem(prod=prodDesc[idx1],
                                                              swatcharray=lowi['swatcharray'],
                                                              file_urls=['http:' + url])},
                                     callback=self.parse_productdetail)
        # Follow pagination.
        for next in response.css("div.grid-parent.v-spacing-extra-large > nav > ul > li.page-next > a::attr('href')"):
            url_next = response.urljoin(next.extract())
            print(" url_next : " + url_next)
            yield scrapy.Request(url_next, callback=self.parse)

    def parse_productdetail(self, response):
        """Complete the LowesItem passed via ``response.meta['lowes']``
        with the model number and spec-table values, then yield it."""
        print('Testing....')
        for model in response.xpath('//div[@class="pd-numbers grid-50 tablet-grid-100"]//p[@class="secondary-text small-type"]'):
            modelname = model.xpath('./text()').extract()
        lowesItem = response.meta['lowes']
        # NOTE(review): ``modelname`` is only bound if the loop above
        # matched at least once, and [1] assumes the model number is the
        # second text node — confirm against the page markup.
        lowesItem['model'] = modelname[1]
        lowesItem['category'] = 'default'
        lowesItem['subcategory'] = 'default'
        lowesItem['vendor'] = 'Lowes'
        # Map each spec-table row label onto the matching item field.
        for namevals in response.xpath('//div[@id="collapseSpecs"]//div[@class="panel-body"]//div[@class="grid-100 grid-parent"]//div[@class="grid-50"]//table[@class="table full-width no-borders"]//tbody//tr'):
            name = namevals.xpath('.//th/text()').extract()
            val = namevals.xpath('.//td//span/text()').extract()
            if 'Faucet Type' in name:
                lowesItem['faucettype'] = val[0]
            elif 'Number of Faucet Handles' in name:
                lowesItem['numofhandles'] = val[0]
            elif 'ADA Compliant' in name:
                lowesItem['ada'] = val[0]
            elif 'Built-In Water Filter' in name:
                lowesItem['builtinwaterfilter'] = val[0]
            elif 'Mounting Location' in name:
                lowesItem['mountingloc'] = val[0]
            elif 'Color/Finish Family' in name:
                lowesItem['color'] = val[0]
            elif 'Manufacturer Color/Finish' in name:
                lowesItem['manufacturercolor'] = val[0]
            elif 'Collection Name' in name:
                lowesItem['collection'] = val[0]
            elif 'Soap or Lotion Dispenser' in name:
                lowesItem['soapdispenser'] = val[0]
            elif 'Spout Height (Inches)' in name:
                lowesItem['spoutheight'] = val[0]
            elif 'Max Flow Rate' in name:
                lowesItem['maxflowrate'] = val[0]
        yield lowesItem
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from amazon.items import LowesItem
from amazon.items import SwatchcolorItem
class SattySpider(scrapy.Spider):
    name = "faucets"
    allowed_domains = ["amazon.com"]
    start_urls = [
        "https://www.amazon.com/s?ie=UTF8&page=1&rh=n%3A228013%2Ck%3Abathroom%20faucets"
    ]
    rules = (
        Rule(LinkExtractor(allow='amazon\.com/[A-Z][a-zA-Z_/]+$'),
            'parse_category', follow=True,
        ),
    )
    def parse(self, response):
        print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        #####
        # I even tried xpath
        #for sel in response.xpath('.//li[@class="s-result-item celwidget s-hidden-sponsored-item"]'):
        #    prodDesc= sel.xpath('.//div[@class="s-item-container"]//div[@class="a-row a-spacing-none"]//a[@title]').extract()
        #####
        for sel in response.css("li.s-result-item.celwidget.s-hidden-sponsored-item > div.s-item-container > div > div > a::attr('href')"):
        #for sel in response.xpath('.//li[@class="s-result-item celwidget s-hidden-sponsored-item"]'):
            prodDesc= sel.xpath('.//div[@class="s-item-container"]//div[@class="a-row a-spacing-none"]//a[@title]').extract()
            print prodDesc
            produrls = sel.xpath('.//@data-producturl').extract()
            urls = sel.xpath('.//@data-productimg').extract()
            #prod_url_det = response.urljoin(produrl.extract())
            lowi= LowesItem()
            lowi['swatcharray'] = {}
            for idx,swatch in enumerate(sel.xpath('.//div[@class="product-container js-product-container"]//a//div[@class="pvs pvs-options-height v-spacing-small"]//ul/li')):
                swatchcolor = swatch.xpath('.//img//@alt').extract()
                lowi['swatcharray'][idx] =swatchcolor
            #yield lowi
            #url_prod_det = response.urljoin(produrl)
            for idx1,url in enumerate(urls):
                url_prod_det = response.urljoin(produrls[idx1])
                yield scrapy.Request(url_prod_det,
                        meta={'lowes': LowesItem(prod=prod[idx1], swatcharray=lowi['swatcharray'], file_urls=['http:' + url])},
                        callback=self.parse_productdetail)
        for next in response.css("div.grid-parent.v-spacing-extra-large > nav > ul > li.page-next > a::attr('href')"):
            url_next = response.urljoin(next.extract())
            print " url_next : " + url_next
            yield scrapy.Request(url_next, callback=self.parse)
    def parse_productdetail(self, response):
        print 'Testing....'
        # for model in response.xpath('//div[@class="pd-numbers grid-50 tablet-grid-100"]//p[@class="secondary-text small-type"]').re('<strong> Model # </strong>'):
        for model in response.xpath('//div[@class="pd-numbers grid-50 tablet-grid-100"]//p[@class="secondary-text small-type"]'):
            #print model.extract()
            modelname = model.xpath('./text()').extract()
            #print modelname
        #yield lowesItem
        lowesItem = response.meta['lowes']
        lowesItem['model']=modelname[1]
        lowesItem['category']='default'
        lowesItem['subcategory']='default'
        lowesItem['vendor']='Lowes'
        for namevals in response.xpath('//div[@id="collapseSpecs"]//div[@class="panel-body"]//div[@class="grid-100 grid-parent"]//div[@class="grid-50"]//table[@class="table full-width no-borders"]//tbody//tr'):
            #print namevals
            name = namevals.xpath('.//th/text()').extract()
            val = namevals.xpath('.//td//span/text()').extract()
            if 'Faucet Type' in name:
                lowesItem['faucettype']=val[0]
            elif 'Number of Faucet Handles' in name:
                lowesItem['numofhandles']=val[0]
            elif 'ADA Compliant' in name:
                lowesItem['ada']=val[0]
            elif 'Built-In Water Filter' in name:
                lowesItem['builtinwaterfilter']=val[0]
            elif 'Mounting Location' in name:
                lowesItem['mountingloc']=val[0]
            elif 'Color/Finish Family' in name:
                lowesItem['color']=val[0]
            elif 'Manufacturer Color/Finish' in name:
                lowesItem['manufacturercolor']=val[0]
            elif 'Collection Name' in name:
                lowesItem['collection']=val[0]
            elif 'Soap or Lotion Dispenser' in name:
                lowesItem['soapdispenser']=val[0]
            elif 'Spout Height (Inches)' in name:
                lowesItem['spoutheight']=val[0]
            elif 'Max Flow Rate' in name:
                lowesItem['maxflowrate']=val[0]
        yield lowesItem
不要抓取亚马逊,请使用官方API。我敢打赌你没有正确设置User-Agent
请求头,所以亚马逊拒绝了你的请求。他们不喜欢被抓取。你不应该抓取它们,而应该使用API。无论如何,他们都很容易检测并屏蔽抓取程序。请把你的settings.py内容补充到帖子里。
什么是API?你能分享链接吗?@Amir 你说得对,我没有设置User-Agent。请问如何设置User-Agent?不要抓取亚马逊,请使用官方API。我敢打赌你没有正确设置User-Agent
请求头,所以亚马逊拒绝了你的请求。他们不喜欢被抓取。你不应该抓取它们,而应该使用API。