Python: When using proxies and fake user agents in Scrapy, is there a way to bypass the PHPSESSID and __cfduid cookies?
So far I have tried the commented-out parts of the code below. With the default settings I can crawl a few pages, and more than 100 pages if I raise the download delay to 30 seconds.

I think the problem is that, for each combination of proxy and user agent, PHPSESSID is set only once at the very start, while __cfduid is set for the lifetime of the crawl for that combination. Below is the code I used to try to set new cookies for each new request:
import scrapy
from scrapy import Request
from NPM.items import NPMItem
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Identity, MapCompose, Join, Compose
import re
import cfscrape
from scrapy_splash import SplashRequest
# Leftover debug check of cookies on a plain requests session:
# import requests
# session = requests.Session()
# response = session.get('http://google.com')
# print(session.cookies.get_dict())
script = """
function main(splash)
splash:init_cookies(splash.args.cookies)
assert(splash:go{
splash.args.url,
headers=splash.args.headers,
http_method=splash.args.http_method,
body=splash.args.body,
})
assert(splash:wait(0.5))
local entries = splash:history()
local last_response = entries[#entries].response
return {
url = splash:url(),
headers = last_response.headers,
http_status = last_response.status,
cookies = splash:get_cookies(),
html = splash:html(),
}
end
"""
class ExampleCrawler(scrapy.Spider):
    name = 'Example'
    custom_settings = {
        'RETRY_TIMES': 5,
        'DOWNLOAD_DELAY': 3,
        'CONCURRENT_REQUESTS': 20,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 20,
        'CONCURRENT_REQUESTS_PER_IP': 20,
        'AUTOTHROTTLE_ENABLED': True,
        'COOKIES_ENABLED': True,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
        'PROXY_LIST': 'EXAMPLE/proxy.txt'
    }
    allowed_domains = ['example.com']
    start_urls = ['https://example/real-estate/']

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse,
                                endpoint='execute',
                                cache_args=['lua_source'],
                                args={'lua_source': script},
                                headers={'X-My-Header': 'value'},
                                )
    def parse(self, response):
        properties = response.xpath('//*[@id="mbuzz"]/following-sibling::table')[0:-1]
        for property in properties:
            links = property.xpath('.//@href').extract_first()
            urlo = response.urljoin(links)
            link = urlo.replace('/real-estate', '')
            # head = response.headers
            #
            # token, u_a = cfscrape.get_tokens(link)
            # cfduid = token['__cfduid']
            #
            # cook = response.headers.getlist('Set-Cookie')
            # HEAD = Request.meta
            # cook = str(cook)
            # if re.search('PHPSESSID=(.*);', cook):
            #     cookie = re.search('PHPSESSID=(.*);', cook).group(1)
            #     if cookie:
            #         yield SplashRequest(link, cookies={'__cfduid': cfduid, 'PHPSESSID': cookie},
            #                             headers={'USER_AGENT': u_a},
            #                             callback=self.parse_property, meta={'URL': link})
            #     else:
            #         pass
            # else:
            #     yield Request(link, cookies={'__cfduid': cfduid}, headers={'USER_AGENT': u_a},
            #                   callback=self.parse_property, meta={'URL': link})
            # print(u_a)
            yield Request(link, callback=self.parse_property, meta={'URL': link})
            # yield Request(link, cookies={'__cfduid': cfduid}, headers={'USER_AGENT': u_a},
            #               callback=self.parse_property, meta={'URL': link})
            # yield SplashRequest(link, self.parse_property,
            #                     endpoint='execute',
            #                     cache_args=['lua_source'],
            #                     args={'lua_source': script},
            #                     headers={'X-My-Header': 'value'},
            #                     )
        rel_next_page = response.xpath('//u[contains (text(), "Next")]/text()/ancestor::a/@href').extract_first()
        next_page = response.urljoin(rel_next_page)
        yield Request(next_page, callback=self.parse)
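For comparison, the commented-out cfscrape idea, cleaned up, would look roughly like the following inside the loop. This is a sketch: cfscrape.get_tokens() does return a (cookies, user_agent) pair, but parse_property and the page layout come from the question; also note that the header Scrapy sends is named 'User-Agent', so the 'USER_AGENT' key in the commented code would set a nonstandard header.

            # sketch: solve the Cloudflare challenge once per link, then attach
            # the resulting cookies and matching user agent to the follow-up request
            tokens, user_agent = cfscrape.get_tokens(link)
            yield Request(link,
                          cookies=tokens,  # includes __cfduid (and cf_clearance when challenged)
                          headers={'User-Agent': user_agent},
                          callback=self.parse_property,
                          meta={'URL': link})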
See the cookiejar request meta key: Scrapy can keep multiple independent cookie sessions in a single spider, one jar per request chain:
def parse(self, response):
    properties = response.xpath('//*[@id="buzz"]/following-sibling::table')[0:-1]
    for i, property in enumerate(properties):
        links = property.xpath('.//@href').extract_first()
        urls = response.urljoin(links)
        # one cookie jar per listing, so PHPSESSID/__cfduid are negotiated per request
        yield Request(urls, callback=self.parse_property,
                      meta={'URL': urls, 'cookiejar': i})
    rel_next_page = response.xpath('//u[contains (text(), "Next")]/text()/ancestor::a/@href').extract_first()
    next_page = response.urljoin(rel_next_page)
    # carry the current jar over to the next listing page; looping with
    # enumerate(next_page) would iterate over the characters of the URL string
    yield Request(next_page, callback=self.parse,
                  meta={'cookiejar': response.meta.get('cookiejar')})
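Since the crawl also rotates proxies and user agents, the same meta dict can carry a per-request proxy for Scrapy's built-in HttpProxyMiddleware. The pairing below is an assumption, not part of the original setup: it presumes `import random` at module level and a `self.proxies` list loaded from the PROXY_LIST file mentioned in the settings.

        # hypothetical: pair each fresh cookie jar with one proxy, so the
        # PHPSESSID/__cfduid issued for that jar stay tied to a single exit IP;
        # the 'proxy' meta key is read by Scrapy's built-in HttpProxyMiddleware
        yield Request(urls, callback=self.parse_property,
                      meta={'URL': urls,
                            'cookiejar': i,
                            'proxy': random.choice(self.proxies)})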