How do I scrape multiple pages with Scrapy?

Tags: scrapy, web-crawler

I am trying to scrape a table that spans several pages. With the following code I can print the data of the first page:

import scrapy
from scrapy.http.request import Request
from indicators.items import EducationIndicators

class mySpider(scrapy.Spider):
    name = "education2"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
        )

    def parse(self, response):
        return Request(
            url='http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
            callback=self.parse_table
        )

    def parse_table(self,response):
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item =  EducationIndicators()
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract() 
            print(item)
            yield item
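
For reference, EducationIndicators is imported from indicators.items but never shown in the question; a minimal sketch of what indicators/items.py could look like, assuming it only declares the two fields the spiders actually use (country and years):

import scrapy

class EducationIndicators(scrapy.Item):
    # assumed item definition: one table row, the country name plus the per-year values
    country = scrapy.Field()
    years = scrapy.Field()
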
I then wrote the following code to download all of the pages, based on other posts I have read:

import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider,Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
        )

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse_table", follow= True),)

    def parse_table(self,response):
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item =  EducationIndicators()
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract() 
            print(item)
            yield item

When I try to print all of the pages I get nothing at all. Can anyone help me see what the mistake is?

Scrapy needs the parse callback first. Alternatively, simply override the start_requests method with a different callback:

import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
        )

    # follow the "Next" paging link and hand every page it reaches to parse_table
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse_table", follow=True),)

    def start_requests(self):
        # send the start URL to parse_table instead of the default parse callback
        for url in self.start_urls:
            yield Request(url, callback=self.parse_table)

    def parse_table(self, response):
        # every <tr> of the data table becomes one item
        for tr in response.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
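
The reason for the start_requests override: without it, the response for the start URL is handled by CrawlSpider's built-in parse callback, which only applies the rules and by default extracts nothing from that first page. Routing the initial request to parse_table makes sure the first page of the table is parsed as well.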
And here is the code that crawls all of the pages:

import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

from w3lib.url import add_or_replace_parameter

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
        )

    # paginated data handler endpoint that serves the table shown on Data.aspx
    api_url = 'http://data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=series:NER_1&DataMartId=UNESCO'

    def parse(self, response):
        # read the total page count from the pager, then request every page of the handler
        max_page = int(response.xpath('//*[@id="spanPageCountB"]/text()').re_first(r'\d+', '0'))
        for page in range(1, max_page + 1):
            yield Request(
                url=add_or_replace_parameter(self.api_url, 'Page', page),
                callback=self.parse_table)

    def parse_table(self, response):
        # the handler pages only need the plain //table/tr selector to reach the rows
        for tr in response.xpath('//table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
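
The pagination here relies on two helpers: re_first pulls the total page count out of the spanPageCountB element (falling back to '0' when the pattern is missing), and w3lib's add_or_replace_parameter rewrites the Page query parameter of the handler URL. A quick standalone sketch of the latter, reusing the spider's api_url:

from w3lib.url import add_or_replace_parameter

# same handler URL as the spider's api_url attribute
api_url = ('http://data.un.org/Handlers/DataHandler.ashx'
           '?Service=page&Page=3&DataFilter=series:NER_1&DataMartId=UNESCO')

# the existing Page=3 value is replaced; the other query parameters are kept
print(add_or_replace_parameter(api_url, 'Page', 2))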

I only receive the first page. I was trying to use the crawler to get several pages, but I have read that I should define another method rather than use parse, because it is used internally, and instead try their API, http://data.un.org/Handlers/DataHandler.ashx?Service=page&Page=2&DataFilter=series:NER_1&DataMartId=UNESCO&UserQuery=&c=2,3,5,7,9,10&s=参考区域名称:asc,时间段:desc&RequestId=607. There is a Page parameter in it. How can I change the page number in the url to scrape the next page? The RequestId changes as well, but I don't know whether that matters or whether the url ends up back on the first page.

I have tried changing the url (the page number), but when it runs it only prints the last page.