Python Scrapy 遵守 rel=nofollow_Python_Web Crawler_Scrapy

Python Scrapy 遵守 rel=nofollow
python web-crawler scrapy
Python Scrapy 遵守 rel=nofollow,python,web-crawler,scrapy,Python,Web Crawler,Scrapy,Scrapy 可以忽略带有 rel=nofollow 的链接吗？在 scrapy 0.22 的源码中查看，它看起来是支持的：如何启用它？Paul 说得很对，我就是这样做的： rules = ( # Extract all pages, follow links, call method 'parse_page' for response callback, before processing links call method links_processor Rule(LinkExtractor(allow=('','/')),follow=T
Scrapy 可以忽略带有 rel=nofollow 的链接吗？
在 scrapy 0.22 的源码中查看，它看起来是支持的：
如何启用它？
Paul 说得很对，我就是这样做的：
rules = (
    # Extract all pages, follow links, call method 'parse_page' for response callback,
    # before processing links call method links_processor
    Rule(
        LinkExtractor(allow=('', '/')),
        follow=True,
        callback='parse_page',
        process_links='links_processor',
    ),
)

这就是实际的函数（我是python新手，我相信有一种更好的方法可以在不创建新列表的情况下将项目从for循环中移除）
def links_processor(self, links):
    """Drop links flagged as "nofollow" so the crawler does not follow them.

    Intended to be referenced by name from a ``Rule`` through its
    ``process_links`` argument.

    Args:
        links: Extracted link objects, each exposing a ``nofollow``
            attribute. May be empty or ``None``.

    Returns:
        A new list holding, in their original order, only the links whose
        ``nofollow`` attribute is falsy. Empty when ``links`` is empty or
        ``None``.
    """
    # A hook into the links processing from an existing page, done in order to not follow "nofollow" links
    if not links:
        return []
    return [link for link in links if not link.nofollow]

很简单。
Paul 说得很对，我就是这样做的：
rules = (
    # Extract all pages, follow links, call method 'parse_page' for response callback,
    # before processing links call method links_processor
    Rule(
        LinkExtractor(allow=('', '/')),
        follow=True,
        callback='parse_page',
        process_links='links_processor',
    ),
)

这就是实际的函数（我是python新手，我相信有一种更好的方法可以在不创建新列表的情况下将项目从for循环中移除）
def links_processor(self, links):
    """Drop links flagged as "nofollow" so the crawler does not follow them.

    Intended to be referenced by name from a ``Rule`` through its
    ``process_links`` argument.

    Args:
        links: Extracted link objects, each exposing a ``nofollow``
            attribute. May be empty or ``None``.

    Returns:
        A new list holding, in their original order, only the links whose
        ``nofollow`` attribute is falsy. Empty when ``links`` is empty or
        ``None``.
    """
    # A hook into the links processing from an existing page, done in order to not follow "nofollow" links
    if not links:
        return []
    return [link for link in links if not link.nofollow]

很简单。Itamar Gero的答案是正确的。
对于我自己的博客，我实现了一个 CrawlSpider，它使用基于 LinkExtractor 的规则从我的博客页面中提取所有相关链接：
# -*- coding: utf-8 -*-

'''
*   This program is free software: you can redistribute it and/or modify
*   it under the terms of the GNU General Public License as published by
*   the Free Software Foundation, either version 3 of the License, or
*   (at your option) any later version.
*
*   This program is distributed in the hope that it will be useful,
*   but WITHOUT ANY WARRANTY; without even the implied warranty of
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*   GNU General Public License for more details.
*
*   You should have received a copy of the GNU General Public License
*   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*
*   @author Marcel Lange <info@ask-sheldon.com>
*   @package ScrapyCrawler 
 '''


from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

import Crawler.settings
from Crawler.items import PageCrawlerItem


class SheldonSpider(CrawlSpider):
    """Crawl spider that skips links carrying the nofollow attribute.

    The spider name, allowed domains, start URLs, allow/deny patterns and
    CSS restrictions are all read from ``Crawler.settings``.
    """

    name = Crawler.settings.CRAWLER_NAME
    allowed_domains = Crawler.settings.CRAWLER_DOMAINS
    start_urls = Crawler.settings.CRAWLER_START_URLS
    rules = (
        Rule(
            LinkExtractor(
                allow_domains=Crawler.settings.CRAWLER_DOMAINS,
                allow=Crawler.settings.CRAWLER_ALLOW_REGEX,
                deny=Crawler.settings.CRAWLER_DENY_REGEX,
                restrict_css=Crawler.settings.CSS_SELECTORS,
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback='parse_item',
            # Every batch of extracted links passes through filter_links first.
            process_links='filter_links'
        ),
    )

    def filter_links(self, links):
        """Filter out links with the nofollow attribute.

        Args:
            links: Extracted link objects exposing ``nofollow`` and ``url``
                attributes. May be empty or ``None``.

        Returns:
            A new list with the links whose ``nofollow`` attribute is falsy,
            in their original order. Each dropped link is logged at DEBUG
            level.
        """
        if not links:
            return []

        kept_links = []
        for link in links:
            if link.nofollow:
                # Pass the URL as a logger argument so the message is only
                # formatted when DEBUG output is actually emitted.
                self.logger.debug('Dropped link %s because nofollow attribute was set.', link.url)
            else:
                kept_links.append(link)
        return kept_links

    def parse_item(self, response):
        """Build a ``PageCrawlerItem`` describing a crawled response.

        Args:
            response: The response handed to this rule callback.

        Returns:
            A ``PageCrawlerItem`` with ``status``, ``title``, ``url`` and
            ``headers`` filled in. ``title`` is ``None`` when the page has
            no ``<title>`` text.
        """
        # self.logger.info('Parsed URL: %s with STATUS %s', response.url, response.status)
        item = PageCrawlerItem()
        item['status'] = response.status
        # extract_first() returns None for an empty selection, whereas
        # indexing with [0] raised IndexError on pages without a <title>.
        item['title'] = response.xpath('//title/text()').extract_first()
        item['url'] = response.url
        item['headers'] = response.headers
        return item

#-*-编码：utf-8-*-
'''
*此程序是免费软件：您可以重新发布和/或修改它
*它是根据GNU通用公共许可证的条款发布的
*自由软件基金会，或者许可证的第3版，或者
*（由您选择）任何更高版本。
*
*这个节目的发布是希望它会有用，
*但没有任何保证；甚至没有对
*适销性或适用于特定用途。请参阅
*有关更多详细信息，请参阅GNU通用公共许可证。
*
*您应该已经收到GNU通用公共许可证的副本
*与此程序一起使用。如果没有，请参阅。
*
*@作者马塞尔·兰格
*@package ScrapyCrawler
'''
从scrapy.spider导入爬行蜘蛛，规则
从scrapy.LinkExtractor导入LinkExtractor
导入爬虫程序设置
从Crawler.items导入页面crawleritem
谢尔顿蜘蛛类（爬行蜘蛛）：
名称=Crawler.settings.Crawler\u名称
允许的\u域=Crawler.settings.Crawler\u域
开始\u URL=Crawler.settings.Crawler\u开始\u URL
规则=(
统治(
链接抽取器(
允许\u域=Crawler.settings.Crawler\u域，
allow=Crawler.settings.Crawler\u allow\u REGEX，
deny=Crawler.settings.Crawler\u deny\u REGEX，
restrict\u css=Crawler.settings.css\u选择器，
规范化=真，
唯一=真
),
follow=True，
callback='parse_item'，
处理链接=“过滤链接”
),
)
#使用nofollow属性筛选链接
def过滤器链接（自身、链接）：
return_links=list（）
如果链接：
对于链接中的链接：
如果不是link.nofollow：
返回链接。追加（链接）
其他：
self.logger.debug（'由于设置了nofollow属性而丢弃了链接%s。'%link.url）
返回链接
def解析_项（自身、响应）：
#self.logger.info（'解析的URL:%s，状态为%s'，response.URL，response.STATUS）
item=PageCrawleItem（）
项目['status']=响应状态
项['title']=response.xpath（'//title/text（）'）[0].extract（）
项['url']=response.url
item['headers']=response.headers
退货项目

在我的博客上，我详细描述了如何实现一个网站爬虫来预热我的 WordPress 全页缓存。Itamar Gero的回答是正确的。
对于我自己的博客，我实现了一个 CrawlSpider，它使用基于 LinkExtractor 的规则从我的博客页面中提取所有相关链接：
# -*- coding: utf-8 -*-

'''
*   This program is free software: you can redistribute it and/or modify
*   it under the terms of the GNU General Public License as published by
*   the Free Software Foundation, either version 3 of the License, or
*   (at your option) any later version.
*
*   This program is distributed in the hope that it will be useful,
*   but WITHOUT ANY WARRANTY; without even the implied warranty of
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*   GNU General Public License for more details.
*
*   You should have received a copy of the GNU General Public License
*   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*
*   @author Marcel Lange <info@ask-sheldon.com>
*   @package ScrapyCrawler 
 '''


from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

import Crawler.settings
from Crawler.items import PageCrawlerItem


class SheldonSpider(CrawlSpider):
    """Crawl spider that skips links carrying the nofollow attribute.

    The spider name, allowed domains, start URLs, allow/deny patterns and
    CSS restrictions are all read from ``Crawler.settings``.
    """

    name = Crawler.settings.CRAWLER_NAME
    allowed_domains = Crawler.settings.CRAWLER_DOMAINS
    start_urls = Crawler.settings.CRAWLER_START_URLS
    rules = (
        Rule(
            LinkExtractor(
                allow_domains=Crawler.settings.CRAWLER_DOMAINS,
                allow=Crawler.settings.CRAWLER_ALLOW_REGEX,
                deny=Crawler.settings.CRAWLER_DENY_REGEX,
                restrict_css=Crawler.settings.CSS_SELECTORS,
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback='parse_item',
            # Every batch of extracted links passes through filter_links first.
            process_links='filter_links'
        ),
    )

    def filter_links(self, links):
        """Filter out links with the nofollow attribute.

        Args:
            links: Extracted link objects exposing ``nofollow`` and ``url``
                attributes. May be empty or ``None``.

        Returns:
            A new list with the links whose ``nofollow`` attribute is falsy,
            in their original order. Each dropped link is logged at DEBUG
            level.
        """
        if not links:
            return []

        kept_links = []
        for link in links:
            if link.nofollow:
                # Pass the URL as a logger argument so the message is only
                # formatted when DEBUG output is actually emitted.
                self.logger.debug('Dropped link %s because nofollow attribute was set.', link.url)
            else:
                kept_links.append(link)
        return kept_links

    def parse_item(self, response):
        """Build a ``PageCrawlerItem`` describing a crawled response.

        Args:
            response: The response handed to this rule callback.

        Returns:
            A ``PageCrawlerItem`` with ``status``, ``title``, ``url`` and
            ``headers`` filled in. ``title`` is ``None`` when the page has
            no ``<title>`` text.
        """
        # self.logger.info('Parsed URL: %s with STATUS %s', response.url, response.status)
        item = PageCrawlerItem()
        item['status'] = response.status
        # extract_first() returns None for an empty selection, whereas
        # indexing with [0] raised IndexError on pages without a <title>.
        item['title'] = response.xpath('//title/text()').extract_first()
        item['url'] = response.url
        item['headers'] = response.headers
        return item

#-*-编码：utf-8-*-
'''
*此程序是免费软件：您可以重新发布和/或修改它
*它是根据GNU通用公共许可证的条款发布的
*自由软件基金会，或者许可证的第3版，或者
*（由您选择）任何更高版本。
*
*这个节目的发布是希望它会有用，
*但没有任何保证；甚至没有对
*适销性或适用于特定用途。请参阅
*有关更多详细信息，请参阅GNU通用公共许可证。
*
*您应该已经收到GNU通用公共许可证的副本
*与此程序一起使用。如果没有，请参阅。
*
*@作者马塞尔·兰格
*@package ScrapyCrawler
'''
从scrapy.spider导入爬行蜘蛛，规则
从scrapy.LinkExtractor导入LinkExtractor
导入爬虫程序设置
从Crawler.items导入页面crawleritem
谢尔顿蜘蛛类（爬行蜘蛛）：
名称=Crawler.settings.Crawler\u名称
允许的\u域=Crawler.settings.Crawler\u域
开始\u URL=Crawler.settings.Crawler\u开始\u URL
规则=(
统治(
链接抽取器(
允许\u域=Crawler.settings.Crawler\u域，
allow=Crawler.settings.Crawler\u allow\u REGEX，
deny=Crawler.settings.Crawler\u deny\u REGEX，
restrict\u css=Crawler.settings.css\u选择器，
规范化=真，
唯一=真
),
follow=True，
callback='parse_item'，
处理链接=“过滤链接”
),
)
#使用nofollow属性筛选链接
def过滤器链接（自身、链接）：
return_links=list（）
如果链接：
对于链接中的链接：
如果不是link.nofollow：
返回链接。追加（链接）
其他：
self.logger.debug（'由于设置了nofollow属性而丢弃了链接%s。'%link.url）
返回链接
def解析_项（自身、响应）：
#self.logger.info（'解析的URL:%s，状态为%s'，response.URL，response.STATUS）
item=PageCrawleItem（）
项目['status']=响应状态
项['title']=response.xpath（'//title/text（）'）[0].extract（）
项['url']=response.url
item['headers']=response.headers
退货项目

在我的博客上，我已经详细描述了如何实现一个网站爬虫来预热我的 WordPress 全页缓存。
sgmlLinkedExtractor
将提取Link
属性设置为True
或<