How do I scrape multiple pages with Scrapy?

Tags: scrapy, web-crawler

I am trying to scrape a table that spans several pages. With the following code I can print the data of the first page:

import scrapy
from scrapy.http.request import Request
from indicators.items import EducationIndicators

class mySpider(scrapy.Spider):
    name = "education2"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
        )

    def parse(self, response):
        return Request(
            url='http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
            callback=self.parse_table
        )

    def parse_table(self,response):
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item =  EducationIndicators()
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract() 
            print(item)
            yield item
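
For reference, EducationIndicators is imported from indicators.items but never shown in the question; a minimal sketch of what indicators/items.py could look like, assuming it only declares the two fields the spiders actually use (country and years):

import scrapy

class EducationIndicators(scrapy.Item):
    # assumed item definition: one table row, the country name plus the per-year values
    country = scrapy.Field()
    years = scrapy.Field()
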
I then wrote the following code to download all of the pages, based on other posts I have read:

import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider,Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
        )

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse_table", follow= True),)

    def parse_table(self,response):
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item =  EducationIndicators()
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract() 
            print(item)
            yield item

When I try to print all of the pages I get nothing at all. Can anyone help me see what the mistake is?

Scrapy needs the parse callback first. Alternatively, simply override the start_requests method with a different callback:

import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
        )

    # follow the "Next" paging link and hand every page it reaches to parse_table
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse_table", follow=True),)

    def start_requests(self):
        # send the start URL to parse_table instead of the default parse callback
        for url in self.start_urls:
            yield Request(url, callback=self.parse_table)

    def parse_table(self, response):
        # every <tr> of the data table becomes one item
        for tr in response.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
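
The reason for the start_requests override: without it, the response for the start URL is handled by CrawlSpider's built-in parse callback, which only applies the rules and by default extracts nothing from that first page. Routing the initial request to parse_table makes sure the first page of the table is parsed as well.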
And here is the code that crawls all of the pages:

import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

from w3lib.url import add_or_replace_parameter

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
        )

    # paginated data handler endpoint that serves the table shown on Data.aspx
    api_url = 'http://data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=series:NER_1&DataMartId=UNESCO'

    def parse(self, response):
        # read the total page count from the pager, then request every page of the handler
        max_page = int(response.xpath('//*[@id="spanPageCountB"]/text()').re_first(r'\d+', '0'))
        for page in range(1, max_page + 1):
            yield Request(
                url=add_or_replace_parameter(self.api_url, 'Page', page),
                callback=self.parse_table)

    def parse_table(self, response):
        # the handler pages only need the plain //table/tr selector to reach the rows
        for tr in response.xpath('//table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
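
The pagination here relies on two helpers: re_first pulls the total page count out of the spanPageCountB element (falling back to '0' when the pattern is missing), and w3lib's add_or_replace_parameter rewrites the Page query parameter of the handler URL. A quick standalone sketch of the latter, reusing the spider's api_url:

from w3lib.url import add_or_replace_parameter

# same handler URL as the spider's api_url attribute
api_url = ('http://data.un.org/Handlers/DataHandler.ashx'
           '?Service=page&Page=3&DataFilter=series:NER_1&DataMartId=UNESCO')

# the existing Page=3 value is replaced; the other query parameters are kept
print(add_or_replace_parameter(api_url, 'Page', 2))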

I only receive the first page. I was trying to use the crawler to get several pages, but I have read that I should define another method rather than use parse, because it is used internally, and instead try their API, http://data.un.org/Handlers/DataHandler.ashx?Service=page&Page=2&DataFilter=series:NER_1&DataMartId=UNESCO&UserQuery=&c=2,3,5,7,9,10&s=参考区域名称:asc,时间段:desc&RequestId=607. There is a Page parameter in it. How can I change the page number in the url to scrape the next page? The RequestId changes as well, but I don't know whether that matters or whether the url ends up back on the first page.

I have tried changing the url (the page number), but when it runs it only prints the last page.