Python: Is there a way to bypass the PHPSESSID and __cfduid cookies when using proxies and fake user agents in Scrapy?

So far I have tried the commented-out portions of the code below.

I can crawl a few pages with the default settings, and I can push the crawl above 100 pages with the delay set to 30 seconds.


I think the problem is that, for each combination of proxy and user agent, PHPSESSID is set only once at the very start, while __cfduid is set for that combination for the lifetime of the crawl.
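
If that theory is right, one way to "bypass" the two cookies would be to stop Scrapy from storing them at all. Below is a minimal, untested sketch: a downloader middleware that strips PHPSESSID and __cfduid from every Set-Cookie header before the built-in CookiesMiddleware can save them. The class name, module path and the priority 701 are my own assumptions; the priority only has to be higher than CookiesMiddleware's 700 so this middleware sees the response first.

class DropSessionCookiesMiddleware:
    """Drop PHPSESSID / __cfduid from responses so they are never replayed."""

    BLOCKED = (b'PHPSESSID', b'__cfduid')

    def process_response(self, request, response, spider):
        cookies = response.headers.getlist('Set-Cookie')
        kept = [c for c in cookies if not c.startswith(self.BLOCKED)]
        if len(kept) != len(cookies):
            # rebuild the Set-Cookie header list without the blocked cookies
            response.headers.pop('Set-Cookie', None)
            for c in kept:
                response.headers.appendlist('Set-Cookie', c)
        return response

# enabled via, e.g. (module path is hypothetical):
# DOWNLOADER_MIDDLEWARES = {'myproject.middlewares.DropSessionCookiesMiddleware': 701}

The obvious trade-off is that the site may refuse to serve pages without a session cookie at all, in which case per-proxy cookie jars (see the sketch at the end of this post) look like the better direction.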

I am using

Here is the code that sets new cookies for each new request:


import scrapy
from scrapy import Request
from NPM.items import NPMItem
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Identity, MapCompose, Join, Compose
import re
import cfscrape
from scrapy_splash import SplashRequest

# import requests
# session=requests.Session()
# print(session.cookies.get_dict())
# response = session.get('http://google.com')
# print(session.cookies.get_dict())
# Lua script (the cookie-handling pattern from the scrapy-splash docs): it feeds
# Scrapy's cookies into Splash and returns Splash's cookies and headers along
# with the rendered HTML.
script = """
function main(splash)
  splash:init_cookies(splash.args.cookies)
  assert(splash:go{
    splash.args.url,
    headers=splash.args.headers,
    http_method=splash.args.http_method,
    body=splash.args.body,
    })
  assert(splash:wait(0.5))

  local entries = splash:history()
  local last_response = entries[#entries].response
  return {
    url = splash:url(),
    headers = last_response.headers,
    http_status = last_response.status,
    cookies = splash:get_cookies(),
    html = splash:html(),
  }
end
"""

class ExampleCrawler(scrapy.Spider):
    name = 'Example'
    custom_settings = {
      'RETRY_TIMES': 5,
      'DOWNLOAD_DELAY': 3,
      'CONCURRENT_REQUESTS': 20,
      'CONCURRENT_REQUESTS_PER_DOMAIN': 20,
      'CONCURRENT_REQUESTS_PER_IP': 20,
      'AUTOTHROTTLE_ENABLED': True,
      'COOKIES_ENABLED': True,
      'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
      'PROXY_LIST': 'EXAMPLE/proxy.txt'
      }
    allowed_domains = ['example.com']
    start_urls = ['https://example/real-estate/']

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse,
                                endpoint='execute',
                                cache_args=['lua_source'],
                                args={'lua_source': script},
                                headers={'X-My-Header': 'value'},
                                )

    def parse(self, response):
        properties = response.xpath('//*[@id="mbuzz"]/following-sibling::table')[0:-1]
        for property in properties:
            links = property.xpath('.//@href').extract_first()
            urlo = response.urljoin(links)
            link = urlo.replace('/real-estate', '')
            # head=response.headers
            #
            # token,u_a=cfscrape.get_tokens(link)
            # cfduid=token['__cfduid']
            #
            # cook=response.headers.getlist('Set-Cookie')
            # # HEAD=Request.meta
            # cook=str(cook)
            # if re.search('PHPSESSID=(.*);',cook):
            #     cookie=re.search('PHPSESSID=(.*);', cook).group(1)
            #     if cookie:
            #         cookie=cookie
            #         yield SplashRequest(link, cookies={'__cfduid':cfduid,'PHPSESSID':cookie},headers={'USER_AGENT':u_a},callback=self.parse_property, meta={'URL':link})
            #     else :
            #         pass
            # else:
            #     yield Request(link, cookies={'__cfduid':cfduid},headers={'USER_AGENT':u_a},callback=self.parse_property, meta={'URL':link})
            # print(u_a)
            yield Request(link, callback=self.parse_property, meta={'URL': link})
            # yield Request(link, cookies={'__cfduid':cfduid},headers={'USER_AGENT':u_a},callback=self.parse_property, meta={'URL':link})
            # yield SplashRequest(link, self.parse_property,
            #                     endpoint='execute',
            #                     cache_args=['lua_source'],
            #                     args={'lua_source': script},
            #                     headers={'X-My-Header': 'value'},
            #                     )
        rel_next_page = response.xpath('//u[contains (text(), "Next")]/text()/ancestor::a/@href').extract_first()
        next_page = response.urljoin(rel_next_page)
        yield Request(next_page, callback=self.parse)
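
For reference, Scrapy can log every Cookie / Set-Cookie header it sends and receives; COOKIES_DEBUG is a standard setting, and turning it on is how I would confirm which cookies each proxy keeps re-sending:

    custom_settings = {
        # ...the settings shown above, plus:
        'COOKIES_DEBUG': True,  # log Cookie / Set-Cookie headers for every request
    }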

Please see:
    def parse(self, response):
        properties = response.xpath('//*[@id="buzz"]/following-sibling::table')[0:-1]
        for i, property in enumerate(properties):
            links = property.xpath('.//@href').extract_first()
            urls = response.urljoin(links)
            yield Request(urls, callback=self.parse_property, meta={'URL': urls, 'cookiejar': i})

        rel_next_page = response.xpath('//u[contains (text(), "Next")]/text()/ancestor::a/@href').extract_first()
        next_page = response.urljoin(rel_next_page)
        # keep the jar already associated with this page for the pagination request
        yield Request(next_page, callback=self.parse, meta={'cookiejar': response.meta.get('cookiejar', 0)})
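
Building on the cookiejar idea above, what I think I actually need is one jar per proxy/user-agent combination rather than one jar per listing. A small downloader middleware could do that; the sketch below is untested, and the class name, module path and priority are my own assumptions (it has to run after the proxy middleware has set request.meta['proxy'] and before the built-in CookiesMiddleware at priority 700):

class CookieJarPerProxyMiddleware:
    """Give every proxy its own cookiejar, so each proxy keeps its own
    PHPSESSID / __cfduid instead of all requests sharing a single jar."""

    def process_request(self, request, spider):
        proxy = request.meta.get('proxy')
        if proxy and 'cookiejar' not in request.meta:
            request.meta['cookiejar'] = proxy  # the proxy URL doubles as the jar key

# enabled via, e.g. (module path is hypothetical):
# DOWNLOADER_MIDDLEWARES = {'myproject.middlewares.CookieJarPerProxyMiddleware': 690}

With something like this in place, the per-request 'cookiejar': i values in the parse() above would no longer be needed.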