Scrapy 从具有不同值的同一url获取重复请求
我试图抓取一些数据作为我的辅助项目,但在收集数据时遇到了问题。我已经试了两天,运气不好 第一个问题: 当我从主页抓取帖子时,我得到了一个错误的标记 第二个问题: 我已经阅读并尝试实现获取电话号码,但徒劳无功, 还是这个答案 第三个问题: 如何实现下一页(注释掉gumtree.py中的代码) 第四个问题: 我现在可以获得电话号码,但我收到了对同一url的重复请求,请求的值不同,[参见结果] 如果有人能给我指点方向,我将不胜感激。 我的主要目标是抓取有电话号码的帖子 我试图搜索stackoverflow,但找不到合适的帖子。 非常感谢 setting.pyScrapy 从具有不同值的同一url获取重复请求,scrapy,token,Scrapy,Token,我试图抓取一些数据作为我的辅助项目,但在收集数据时遇到了问题。我已经试了两天,运气不好 第一个问题: 当我从主页抓取帖子时,我得到了一个错误的标记 第二个问题: 我已经阅读并尝试实现获取电话号码,但徒劳无功, 还是这个答案 第三个问题: 如何实现下一页(注释掉gumtree.py中的代码) 第四个问题: 我现在可以获得电话号码,但我收到了对同一url的重复请求,请求的值不同,[参见结果] 如果有人能给我指点方向,我将不胜感激。 我的主要目标是抓取有电话号码的帖子 我试图搜索stackoverf
BOT_NAME = 'crawler'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'enter code here
USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
TELNETCONSOLE_ENABLED = False
gumtree.py[更新]
# -*- coding: utf-8 -*-
import re
import json
import scrapy
from scrapy import Request, Item, Field, Selector
def complete_link(string):
return string
class MyItem(Item):
token = Field()
post_id = Field()
post_url = Field()
phone_num = Field()
phone_url = Field()
class GumtreeSpider(scrapy.Spider):
name = "gumtree"
allowed_domains = ["gumtree.com"]
start_urls = [
'https://www.gumtree.com/search?search_category=cars',
]
def parse(self, response):
item = MyItem()
for href in response.css('a.listing-link::attr(href)').extract():
domain = 'https://www.gumtree.com' + href
request = Request(domain, callback=self.parse_post, meta={'domain':domain,'item':item})
yield request
# next_page = response.css('li.pagination-next a::attr("href")').extract_first()
# if next_page is not None:
# next_page = response.urljoin(next_page)
# yield Request(next_page, callback=self.parse)
def parse_post(self, response):
item = response.meta['item']
item['post_url'] = response.meta['domain']
post_id = re.match('.*?([0-9]+)$', item['post_url'])
if post_id:
item['post_id'] = post_id.group(1)
token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
arr_token = re.findall(r'"([^"]*)"', str(token))
if len(arr_token) == 15:
item['token'] = arr_token[-2]
request = Request('https://www.gumtree.com/ajax/account/seller/reveal/number/' + item['post_id'], headers={'X-GUMTREE-TOKEN':item['token']}, callback=self.parse_phone, meta={'item':item})
yield request
def parse_phone(self, response):
item = response.meta['item']
phone = json.loads(response.body_as_unicode())
item['phone_num'] = phone['data']
return item
结果:[scrapy crawl gumtree-o..\result.json]
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "01527853397"},
您是否检查了meta['item']是否实际被传递到parse_token() 我会做以下几件事:
meta={'item':item}
request=request(response.urljoin(href),meta=meta,callback=self.parse\u令牌)
让步请求
我找到了解决方案
# -*- coding: utf-8 -*-
import re, json, scrapy
from crawler.items import CrawlerItem
from scrapy import Request, Item, Field, Selector
gumtree = 'https://www.gumtree.com'
getphone = 'https://www.gumtree.com/ajax/account/seller/reveal/number/'
class GumtreeSpider(scrapy.Spider):
name = "gumtree"
allowed_domains = ["gumtree.com"]
start_urls = [
'https://www.gumtree.com/search?search_category=cars',
]
def parse(self, response):
item = CrawlerItem()
pid = []
arr_url = []
for href in response.css('a.listing-link::attr(href)').extract():
if len(href) > 0:
post_id = u''.join(href).encode('utf-8').strip()
post_id = re.match('.*?([0-9]+)$', post_id)
if post_id:
pid.append(post_id.group(1))
domain = gumtree + href
arr_url.append(domain)
i = 0
while i < len(arr_url):
url = u''.join(arr_url[i]).encode('utf-8').strip()
request = Request(url, callback=self.parse_post, meta={'url':url,'item':item,'pid':pid[i]}, headers={'Referer':gumtree})
i += 1
yield request
next_page = response.css('li.pagination-next a::attr("href")').extract_first()
if next_page is not None:
next_page = response.urljoin(next_page)
yield Request(next_page, callback=self.parse)
def parse_post(self, response):
item = response.meta['item']
item['post_id'] = response.meta['pid']
item['post_url'] = response.meta['url']
token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
arr_token = re.findall(r'"([^"]*)"', str(token))
if len(arr_token) == 15:
item['token'] = arr_token[-2]
ref = item['post_url']
req = Request(getphone + item['post_id'], callback=self.parse_phone, headers={'X-GUMTREE-TOKEN':item['token'], 'Referer':ref}, meta={'url':response.meta['url'],'item':item})
return req
def parse_phone(self, response):
item = response.meta['item']
item['post_url'] = response.meta['url']
phone = json.loads(response.body_as_unicode())
item['phone_num'] = u''.join(phone['data']).encode('utf-8').strip()
return item
#-*-编码:utf-8-*-
导入re、json、scrapy
从crawler.items导入CrawlerItem
从碎片导入请求、项目、字段、选择器
树胶树https://www.gumtree.com'
getphonehttps://www.gumtree.com/ajax/account/seller/reveal/number/'
GumtreeSpider类(刮毛蜘蛛):
name=“gumtree”
允许的_域=[“gumtree.com”]
起始URL=[
'https://www.gumtree.com/search?search_category=cars',
]
def解析(自我,响应):
item=CrawlerItem()
pid=[]
arr_url=[]
for href in response.css('a.listing-link::attr(href')。extract():
如果len(href)>0:
post_id=u“”。加入(href).encode('utf-8').strip()
post_id=re.match('.*([0-9]+)$',post_id)
如果是post_id:
pid.append(后标识组(1))
domain=gumtree+href
arr_url.append(域)
i=0
而我
Hello and Jelx,感谢您的重播,但我能够修复令牌部分,这就是为什么它有一个“线通过”(第一个问题)的原因。它有什么?请给我一个基本上我需要的示例:“我现在可以获得电话号码,但我收到对相同url的重复请求,具有不同的值。“从主线程上的结果可以看出,我收到了三个对同一url的请求,但电话号码不同。我不明白为什么。非常感谢