Python Scrapy：跟随分页链接以抓取数据_Python_Xpath_Web Scraping_Scrapy

Python Scrapy：跟随分页链接以抓取数据

python xpath web-scraping scrapy

Python Scrapy：跟随分页链接以抓取数据。标签：python, xpath, web-scraping, scrapy, Python, Xpath, Web Scraping, Scrapy。我正在尝试从页面中抓取数据，并继续沿着分页链接进行抓取。我正在尝试抓取的页面是 -->（链接缺失）。问题：代码无法跟随分页链接。你能帮忙修改代码，使其能够跟随分页链接吗？回答：它不起作用，因为 url 无效。如果您想继续使用 scrapy.Request，您可以使用： next_page_url = response.xpath('//link[@rel="next"]/@href').extract_first()；if next_page_url: next_page_url = response.urljoin(next_page_url) ……

我正在尝试从页面中抓取数据，并继续按照分页链接进行抓取

我正在尝试抓取的页面是：（原帖中的链接在此处缺失）

问题

代码无法跟随分页链接

你能帮忙吗

修改代码以跟随分页链接

它不起作用，因为 url 无效。如果您想继续使用 scrapy.Request，可以这样写：

# Fragment: uses `yield` and `self`, so it belongs inside a spider callback.
# Read the href of the page's <link rel="next"> element, if there is one.
next_page_url = response.xpath('//link[@rel="next"]/@href').extract_first()
if next_page_url:
    # Resolve the href against the current response URL first, so that
    # scrapy.Request is given a full URL rather than a relative one.
    next_page_url = response.urljoin(next_page_url)
    # Schedule the next page and handle it with the same callback.
    yield scrapy.Request(url=next_page_url, callback=self.parse)

# Fragment: uses `yield`, so it belongs inside a spider callback.
# Read the href of the page's <link rel="next"> element, if there is one.
next_page_url = response.xpath('//link[@rel="next"]/@href').extract_first()
if next_page_url:
    # response.follow accepts the href as-is; no explicit urljoin step.
    # NOTE(review): no callback is passed -- per the Scrapy docs the request
    # is then handled by the spider's parse() method; confirm that is intended.
    yield response.follow(next_page_url)

它不起作用，因为 url 无效。如果您想继续使用 scrapy.Request，可以这样写：

# Fragment: uses `yield` and `self`, so it belongs inside a spider callback.
# Read the href of the page's <link rel="next"> element, if there is one.
next_page_url = response.xpath('//link[@rel="next"]/@href').extract_first()
if next_page_url:
    # Resolve the href against the current response URL first, so that
    # scrapy.Request is given a full URL rather than a relative one.
    next_page_url = response.urljoin(next_page_url)
    # Schedule the next page and handle it with the same callback.
    yield scrapy.Request(url=next_page_url, callback=self.parse)

# Fragment: uses `yield`, so it belongs inside a spider callback.
# Read the href of the page's <link rel="next"> element, if there is one.
next_page_url = response.xpath('//link[@rel="next"]/@href').extract_first()
if next_page_url:
    # response.follow accepts the href as-is; no explicit urljoin step.
    # NOTE(review): no callback is passed -- per the Scrapy docs the request
    # is then handled by the spider's parse() method; confirm that is intended.
    yield response.follow(next_page_url)

使用 response.follow() 的完整示例：

import scrapy

class AlibabaSpider(scrapy.Spider):
    """Spider that scrapes product cards from an Alibaba catalog listing.

    Starts at page 1 of the catalog URL in ``start_urls`` and keeps following
    the page's ``<link rel="next">`` element for as long as one is present.
    """

    name = 'alibaba'
    allowed_domains = ['alibaba.com']
    start_urls = ['https://www.alibaba.com/catalog/agricultural-growing-media_cid144?page=1']

    def parse(self, response):
        """Yield one dict per product card, then a request for the next page.

        Each yielded dict has the keys ``product_name``, ``price``,
        ``min_order``, ``company_name``, ``prod_detail_link`` and
        ``response_rate``. ``price`` and ``response_rate`` fall back to an
        empty string when their node is missing (and are stripped); the
        other fields are left as whatever ``extract_first()`` returns.
        """
        cards = response.xpath('//div[contains(@class, "m-gallery-product-item-wrap")]')
        for card in cards:
            title = card.xpath('.//h2/a/@title').extract_first()
            # Default of '' keeps .strip() safe when the node is absent.
            price = card.xpath('.//div[@class="price"]/b/text()').extract_first('')
            min_order = card.xpath('.//div[@class="min-order"]/b/text()').extract_first()
            company = card.xpath('.//div[@class="stitle util-ellipsis"]/a/@title').extract_first()
            detail_link = card.xpath('.//div[@class="item-img-inner"]/a/@href').extract_first()
            rate = card.xpath('.//i[@class="ui2-icon ui2-icon-skip"]/text()').extract_first('')
            # NOTE: an 'image_url' field is not extracted yet.
            yield {
                'product_name': title,
                'price': price.strip(),
                'min_order': min_order,
                'company_name': company,
                'prod_detail_link': detail_link,
                'response_rate': rate.strip(),
            }

        # Follow the pagination link; stop when the page advertises no next page.
        next_href = response.xpath('//link[@rel="next"]/@href').extract_first()
        if not next_href:
            return
        yield response.follow(url=next_href, callback=self.parse)

使用 response.follow() 的完整示例：

import scrapy

class AlibabaSpider(scrapy.Spider):
    """Spider that scrapes product cards from an Alibaba catalog listing.

    Starts at page 1 of the catalog URL in ``start_urls`` and keeps following
    the page's ``<link rel="next">`` element for as long as one is present.
    """

    name = 'alibaba'
    allowed_domains = ['alibaba.com']
    start_urls = ['https://www.alibaba.com/catalog/agricultural-growing-media_cid144?page=1']

    def parse(self, response):
        """Yield one dict per product card, then a request for the next page.

        Each yielded dict has the keys ``product_name``, ``price``,
        ``min_order``, ``company_name``, ``prod_detail_link`` and
        ``response_rate``. ``price`` and ``response_rate`` fall back to an
        empty string when their node is missing (and are stripped); the
        other fields are left as whatever ``extract_first()`` returns.
        """
        cards = response.xpath('//div[contains(@class, "m-gallery-product-item-wrap")]')
        for card in cards:
            title = card.xpath('.//h2/a/@title').extract_first()
            # Default of '' keeps .strip() safe when the node is absent.
            price = card.xpath('.//div[@class="price"]/b/text()').extract_first('')
            min_order = card.xpath('.//div[@class="min-order"]/b/text()').extract_first()
            company = card.xpath('.//div[@class="stitle util-ellipsis"]/a/@title').extract_first()
            detail_link = card.xpath('.//div[@class="item-img-inner"]/a/@href').extract_first()
            rate = card.xpath('.//i[@class="ui2-icon ui2-icon-skip"]/text()').extract_first('')
            # NOTE: an 'image_url' field is not extracted yet.
            yield {
                'product_name': title,
                'price': price.strip(),
                'min_order': min_order,
                'company_name': company,
                'prod_detail_link': detail_link,
                'response_rate': rate.strip(),
            }

        # Follow the pagination link; stop when the page advertises no next page.
        next_href = response.xpath('//link[@rel="next"]/@href').extract_first()
        if not next_href:
            return
        yield response.follow(url=next_href, callback=self.parse)