Python Scrapy:CrawlSpider 不解析响应

Python Scrapy:CrawlSpider 不解析响应,python,scrapy-spider,Python,Scrapy Spider,我以前成功地使用过 CrawlSpider。但是当我为了与 Redis 集成而修改代码,并添加我自己的中间件来设置 User-Agent 和 Cookie 时,爬虫不再解析响应,因此不会生成新的请求,爬虫在启动后很快就关闭了 即使我编写了以下代码: def parse_start_url(self, response): return self.parse_item(response) 它也只解析来自第一个 URL 的响应 这是我的代码: 爬虫(Spider): 我认为重要的是: SCHEDULER = "scrapy_redis.scheduler.Scheduler" DUPE

我以前成功地使用过 CrawlSpider。但是当我为了与 Redis 集成而修改代码,并添加我自己的中间件来设置 User-Agent 和 Cookie 时,爬虫不再解析响应,因此不会生成新的请求,爬虫在启动后很快就关闭了

即使我编写了以下代码: def parse_start_url(self, response): return self.parse_item(response) 它也只解析来自第一个 URL 的响应

这是我的代码: 爬虫(Spider):

我认为重要的是:

# Route all requests through the scrapy-redis scheduler (shared Redis queue).
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Redis-backed request-fingerprint dedup filter. NOTE: fingerprints persist in
# Redis across runs, so URLs seen in an earlier crawl stay filtered until the
# Redis key is cleared — this is exactly what made the spider close early here.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
DOWNLOADER_MIDDLEWARES = {
# Custom middlewares (defined below): random User-Agent, then login cookies.
'yydzh.middlewares.UserAgentmiddleware': 500,
'yydzh.middlewares.CookieMiddleware': 600
}
# Keep Scrapy's built-in CookiesMiddleware on so request.cookies set by
# CookieMiddleware is actually attached to outgoing requests.
COOKIES_ENABLED = True
中间件: UserAgentmiddleware随机更改用户代理以避免被服务器注意到

CookieMiddleware将cookies添加到请求登录扫描的页面的请求中

logger = logging.getLogger(__name__)


class UserAgentmiddleware(UserAgentMiddleware):
    """Downloader middleware that stamps every outgoing request with a
    randomly chosen User-Agent string (drawn from the module-level ``agents``
    pool) so the target server sees varied clients."""

    def process_request(self, request, spider):
        # Overwrite whatever UA the request carried with a fresh random pick.
        request.headers["User-Agent"] = random.choice(agents)

class CookieMiddleware(RetryMiddleware):
    """Attach a logged-in session cookie (stored in a Redis cookie pool) to
    each request, and refresh the cookie + retry when the response shows the
    session is invalid.

    Redis key layout (inferred from the lookups below): keys look like
    ``<spider.name>:Cookies:<account>`` and hold a JSON-encoded cookie dict.
    """

    def __init__(self, settings, crawler):
        RetryMiddleware.__init__(self, settings)
        # db=1 holds the cookie pool; decode_responses=True so keys/values
        # come back as str (the string checks below rely on that).
        self.rconn = redis.Redis(host=REDIS_HOST, port=REDIS_PORT,
                                 password=REDIS_PASS, db=1, decode_responses=True)
        # NOTE(review): crawler.spider may not be populated yet at middleware
        # construction time on some Scrapy versions — confirm this is safe.
        init_cookie(self.rconn, crawler.spider.name)

    @classmethod
    def from_crawler(cls, crawler):
        # Alternate constructor Scrapy calls; passes the crawler through so
        # __init__ can reach crawler.spider.name.
        return cls(crawler.settings, crawler)

    def process_request(self, request, spider):
        # Pick a random cookie belonging to this spider from the Redis pool.
        # Non-matching keys are dropped from the local copy until a match is
        # found or the pool is exhausted (then the request goes out cookieless).
        redisKeys = self.rconn.keys()
        while len(redisKeys) > 0:
            elem = random.choice(redisKeys)
            if spider.name + ':Cookies' in elem:
                cookie = json.loads(self.rconn.get(elem))
                request.cookies = cookie
                # Remember which account this cookie belongs to so
                # process_response can refresh the right one.
                request.meta["accountText"] = elem.split("Cookies:")[-1]
                break
            else:
                redisKeys.remove(elem)

    def process_response(self, request, response, spider):
        # Server says we are not logged in: refresh this account's cookie in
        # Redis and re-issue the request.
        if('您没有登录或者您没有权限访问此页面' in str(response.body)):
            accountText = request.meta["accountText"]
            remove_cookie(self.rconn, spider.name, accountText)
            update_cookie(self.rconn, spider.name, accountText)
            logger.warning("更新Cookie成功!(账号为:%s)" % accountText)
            # BUGFIX: this URL's fingerprint is already in the scrapy-redis
            # RFPDupeFilter, so returning the request unchanged gets it
            # silently dropped by the scheduler (which is why the spider
            # stopped crawling). dont_filter=True bypasses the dupe filter
            # for the retry.
            return request.replace(dont_filter=True)

        return response

发现问题:所有的URL在以前的请求之前都已经被Redis服务器过滤过,重新启动它可以解决问题

logger = logging.getLogger(__name__)


class UserAgentmiddleware(UserAgentMiddleware):
    """Randomizes the User-Agent header of every request so successive
    requests do not present a single identifiable client to the server."""

    def process_request(self, request, spider):
        chosen = random.choice(agents)
        request.headers["User-Agent"] = chosen

class CookieMiddleware(RetryMiddleware):
    """Downloader middleware that pulls a login cookie for this spider from a
    Redis cookie pool, attaches it to outgoing requests, and — when a response
    indicates the session expired — refreshes the cookie and retries.

    Keys in Redis appear to follow ``<spider.name>:Cookies:<account>`` with a
    JSON-encoded cookie dict as the value (inferred from the code below).
    """

    def __init__(self, settings, crawler):
        RetryMiddleware.__init__(self, settings)
        # Cookie pool lives in Redis db 1; decode_responses=True keeps keys
        # and values as str so the substring checks below work.
        self.rconn = redis.Redis(host=REDIS_HOST, port=REDIS_PORT,
                                 password=REDIS_PASS, db=1, decode_responses=True)
        # NOTE(review): on some Scrapy versions crawler.spider is still None
        # when middlewares are built — verify init_cookie is reached safely.
        init_cookie(self.rconn, crawler.spider.name)

    @classmethod
    def from_crawler(cls, crawler):
        # Standard Scrapy hook; forwards the crawler for spider-name access.
        return cls(crawler.settings, crawler)

    def process_request(self, request, spider):
        # Choose a random cookie key belonging to this spider; prune
        # non-matching keys locally until one matches or none remain.
        redisKeys = self.rconn.keys()
        while len(redisKeys) > 0:
            elem = random.choice(redisKeys)
            if spider.name + ':Cookies' in elem:
                cookie = json.loads(self.rconn.get(elem))
                request.cookies = cookie
                # Track the owning account so process_response can refresh it.
                request.meta["accountText"] = elem.split("Cookies:")[-1]
                break
            else:
                redisKeys.remove(elem)

    def process_response(self, request, response, spider):
        # "Not logged in / no permission" page: rotate this account's cookie
        # and retry the same request.
        if('您没有登录或者您没有权限访问此页面' in str(response.body)):
            accountText = request.meta["accountText"]
            remove_cookie(self.rconn, spider.name, accountText)
            update_cookie(self.rconn, spider.name, accountText)
            logger.warning("更新Cookie成功!(账号为:%s)" % accountText)
            # BUGFIX: the scrapy-redis RFPDupeFilter already holds this URL's
            # fingerprint, so a plain re-return is dropped by the scheduler
            # (the root cause of the spider closing early). dont_filter=True
            # lets the retry through the dupe filter.
            return request.replace(dont_filter=True)

        return response