Python 3.x: Navigating to the next page with response.css


I have a function that extracts the articles from a single page, but I cannot navigate to the next page to scrape all of the pages.

Here is what I tried:

import scrapy


class MedicalSpider(scrapy.Spider):
    name = 'medical'
    allowed_domains = ['blogs.webmd.com']  # only the domain, not the full URL
    start_urls = ['https://blogs.webmd.com/diabetes/default.htm']

    def parse(self, response):
        # Collect the links to the individual article pages
        article_links = response.css('.posts-list-post-content a ::attr(href)')
        for link in article_links:
            url = link.get()
            if url:
                yield response.follow(url=url, callback=self.parse_article)

    def parse_article(self, response):
        headline = response.css('.blog-header-container h1::text').get()
        # Concatenate the text of every paragraph in the article body
        article_sections = response.css('.article-body .article-page section p::text')
        body = ""
        for section in article_sections:
            body += section.get() + "\n"

        yield {
            'headline': headline,
            'body': body
        }

        # Attempt to follow the "next page" link from the article page
        next_page = response.css('.next a ::attr(href)').get()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

Please help me navigate to the next page correctly.

You need to move the next-page logic into the parse function, because the "next page" button lives on the listing page defined in start_urls, not on the individual article pages:

import scrapy


class MedicalSpider(scrapy.Spider):
    name = 'medical'
    allowed_domains = ['blogs.webmd.com']  # Only the domain, not the URL
    start_urls = ['https://blogs.webmd.com/diabetes/default.htm']

    def parse(self, response):
        # Collect the links to the individual article pages
        article_links = response.css('.posts-list-post-content a ::attr(href)')
        for link in article_links:
            url = link.get()
            if url:
                yield response.follow(url=url, callback=self.parse_article)

        # The pagination control lives on this listing page, so the
        # next-page request must be issued from here as well
        next_page = response.css('.next a ::attr(href)').get()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_article(self, response):
        headline = response.css('.blog-header-container h1::text').get()
        # Concatenate the text of every paragraph in the article body
        article_sections = response.css('.article-body .article-page section p::text')
        body = ""
        for section in article_sections:
            body += section.get() + "\n"

        yield {
            'headline': headline,
            'body': body
        }
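For completeness, here is one way to run the spider as a standalone script and export the scraped items. This is a minimal sketch assuming a recent Scrapy version (the FEEDS setting requires Scrapy 2.1+; the output filename is just an example):

from scrapy.crawler import CrawlerProcess

# Run the spider in-process and write the scraped items to a JSON feed;
# "articles.json" is an arbitrary example filename
process = CrawlerProcess(settings={
    'FEEDS': {'articles.json': {'format': 'json'}},
})
process.crawl(MedicalSpider)
process.start()

Inside a regular Scrapy project, running scrapy crawl medical -o articles.json achieves the same thing. Note also that yield response.follow(next_page, callback=self.parse) would work as a shorter alternative to scrapy.Request, since response.follow accepts relative URLs directly.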