Python scraper not working as expected


This is my Scrapy spider. I'm trying to scrape some data from the web, but I can't figure out how to make Scrapy follow links recursively. Where is my mistake?

import re
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from YellowPagesOfMoldova.items import YellowpagesofmoldovaItem
from scrapy.item import Item

class YellowSpider(CrawlSpider):
    name = 'yellow'
    allowed_domains = ['yellowpages.md']
    start_urls = [ 'http://www.yellowpages.md/eng/companies/info/8939-arc-publishing-house']
    rules = (
        Rule(SgmlLinkExtractor(allow=('eng.+')), follow=True),
        )

    def parse(self, response):

        sel = Selector(response)
        i = YellowpagesofmoldovaItem()
        i['url']            = response.url
        i['locality']       = sel.xpath("//tr[3]/td/p[1]/span[1]/text()").extract()
        i['title']          = sel.xpath('//title/text()').extract()
        i['title2']         = sel.xpath("//td/h1/text()").extract()
        i['website']        = sel.xpath("//p[2]/a/text()").extract()
        i['activity']       = sel.xpath("//tbody/tr[4]/td/p/text()").extract()
        i['street']         = sel.xpath("//tr/td/p[1]/span[2]/text()").extract()
        return i
Thanks.

I solved the problem. Now it works fine. It looks like this:

import re
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from YellowPagesOfMoldova.items import YellowpagesofmoldovaItem
from scrapy.item import Item

class YellowSpider(CrawlSpider):
    name = 'yellow'
    allowed_domains = ['yellowpages.md']
    start_urls = [ 'http://www.yellowpages.md/eng/companies/info/8939-arc-publishing-house']
    rules = (
        # Use a custom callback name; CrawlSpider needs its own parse() to drive the crawl
        Rule(SgmlLinkExtractor(allow=('eng.+')), callback='parse_items', follow=True),
    )

    def parse_items(self, response):

        sel = Selector(response)
        i = YellowpagesofmoldovaItem()
        i['url']            = response.url
        i['locality']       = sel.xpath("//tr[3]/td/p[1]/span[1]/text()").extract()
        i['title']          = sel.xpath('//title/text()').extract()
        i['title2']         = sel.xpath("//td/h1/text()").extract()
        i['website']        = sel.xpath("//p[2]/a/text()").extract()
        i['activity']       = sel.xpath("//tbody/tr[4]/td/p/text()").extract()
        i['street']         = sel.xpath("//tr/td/p[1]/span[2]/text()").extract()
        return i

CrawlSpider's parse method should not be overridden, since that's where all the "magic" happens (see the warning in the docs).


Change def parse to def parse_page and reference this callback in your rules:

Rule(SgmlLinkExtractor(allow=('eng.+')), callback='parse_page', follow=True),
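
As a side note, the scrapy.contrib import paths and SgmlLinkExtractor used in this thread were deprecated in Scrapy 1.0 and later removed. On a current Scrapy install, the same spider would be written along these lines (a sketch using today's import paths; the XPaths are unchanged from the question):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class YellowSpider(CrawlSpider):
    name = 'yellow'
    allowed_domains = ['yellowpages.md']
    start_urls = ['http://www.yellowpages.md/eng/companies/info/8939-arc-publishing-house']
    rules = (
        # LinkExtractor replaces SgmlLinkExtractor; the callback still must not be named 'parse'
        Rule(LinkExtractor(allow=('eng.+',)), callback='parse_items', follow=True),
    )

    def parse_items(self, response):
        # Responses expose .xpath() directly; no explicit Selector is needed
        yield {
            'url':      response.url,
            'locality': response.xpath('//tr[3]/td/p[1]/span[1]/text()').extract(),
            'title':    response.xpath('//title/text()').extract(),
            'title2':   response.xpath('//td/h1/text()').extract(),
            'website':  response.xpath('//p[2]/a/text()').extract(),
            'activity': response.xpath('//tbody/tr[4]/td/p/text()').extract(),
            'street':   response.xpath('//tr/td/p[1]/span[2]/text()').extract(),
        }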

Oh, I posted my answer while you were editing. Cool that you figured it out.