Scrapy-从下一页提取数据_Scrapy

Scrapy-从下一页提取数据

scrapy

Scrapy-从下一页提取数据,scrapy,Scrapy,我需要你们的帮助，翻页和提取元素。这是我的蜘蛛 import json import scrapy class YPSpider(scrapy.Spider): name = 'yp' start_urls = ['https://www.infobel.com/fr/france/business/50000/informatique_internet/'] def parse(self, response): next_page_url = respon

我需要你们的帮助：如何翻到下一页并提取其中的元素。这是我的爬虫（spider）：

    import json
import scrapy


class YPSpider(scrapy.Spider):
    """Crawl the paginated infobel.com business listing.

    Every listing page yields one dict per business that has a phone entry
    (keys: ``title``, ``address``, ``village``, ``phone``).  When the phone
    number is not directly readable on the page, a follow-up request to the
    site's ``Search/Decrypt`` endpoint is issued and the item is yielded once
    that response arrives.
    """

    name = 'yp'
    start_urls = ['https://www.infobel.com/fr/france/business/50000/informatique_internet/']

    def parse(self, response):
        """Handle both listing pages and phone-decrypt responses.

        Yields:
            Item dicts, a request for the next listing page, and decrypt
            requests for businesses whose phone number is encrypted.
        """
        # Decrypt responses carry the half-built item in ``meta``; they are
        # JSON, so there is nothing to paginate or scrape on them.
        if response.meta.get('has_phone'):
            item = response.meta['item']
            payload = json.loads(response.body)
            item['phone'] = payload['result']
            yield item
            return

        # Follow the link in the first <li> after the active pagination entry.
        next_page_url = response.xpath(
            '//ul[@class="pagination"]/li[@class="active"]'
            '/following-sibling::li[1]/a/@href'
        ).extract_first()
        if next_page_url:
            yield response.follow(next_page_url, callback=self.parse)

        for box in response.xpath('//*[contains(@class, "customer-box")]'):
            address_lines = [
                line.replace('\r', '').replace('\t', '').strip()
                for line in box.xpath(
                    './/span[contains(@class, "fa-map-marker")]/../span[@class="detail-text"]//text()'
                ).extract()
            ]
            # extract_first() returns None when the node is missing.
            title = (
                box.xpath('.//h2[@class="customer-item-name"]/a/text()').extract_first() or ''
            ).strip()
            phone = box.xpath(
                './/span[contains(@class, "icon-phone")]/../span[@class="detail-text"]/text()'
            ).extract()

            item = {
                'title': title,
                'address': address_lines[0] if address_lines else '',
                # Index 1 only exists when there are at least two lines.
                'village': address_lines[1] if len(address_lines) > 1 else '',
                'phone': phone,
            }

            if not phone:
                continue

            if phone[0].isnumeric():
                item['phone'] = phone[0]
                yield item
            elif len(phone) >= 2:
                # The second text node is sent to the decrypt endpoint; the
                # item travels along in ``meta`` and is completed in parse().
                yield scrapy.Request(
                    'https://www.infobel.com/fr/france/Search/Decrypt?encryptedString={}'.format(phone[1]),
                    callback=self.parse,
                    meta={'item': item, 'has_phone': True},
                )

我该如何让爬虫转到下一页并抓取该页中的元素？感谢你的帮助。

提前感谢

您需要将此代码添加到您的 `parse` 方法中：

# Pagination step: insert these lines into the body of parse(), indented to
# the method's level (the `yield` below is only valid inside a function).
# The XPath takes the href of the link in the first <li> that follows the
# <li class="active"> entry of the pagination list.
next_page_url = response.xpath('//ul[@class="pagination"]/li[@class="active"]/following-sibling::li[1]/a/@href').extract_first()
if next_page_url:
    # Request the next page with parse() as its callback.
    yield response.follow(next_page_url, callback=self.parse)

在这里提问之前，您需要先阅读一下Python语法：

def parse(self, response):
    """Handle both listing pages and phone-decrypt responses.

    Listing pages yield a request for the next page (if any), one item dict
    per business with a directly readable phone number, and one decrypt
    request per business whose phone number is encrypted.  Decrypt responses
    (flagged with ``meta['has_phone']``) complete the item carried in
    ``meta['item']`` and yield it.
    """
    # Decrypt responses are JSON: there is nothing to paginate or scrape.
    if response.meta.get('has_phone'):
        item = response.meta['item']
        payload = json.loads(response.body)
        item['phone'] = payload['result']
        yield item
        return

    # Follow the link in the first <li> after the active pagination entry.
    next_page_url = response.xpath(
        '//ul[@class="pagination"]/li[@class="active"]'
        '/following-sibling::li[1]/a/@href'
    ).extract_first()
    if next_page_url:
        yield response.follow(next_page_url, callback=self.parse)

    for box in response.xpath('//*[contains(@class, "customer-box")]'):
        address_lines = [
            line.replace('\r', '').replace('\t', '').strip()
            for line in box.xpath(
                './/span[contains(@class, "fa-map-marker")]/../span[@class="detail-text"]//text()'
            ).extract()
        ]
        # extract_first() returns None when the node is missing.
        title = (
            box.xpath('.//h2[@class="customer-item-name"]/a/text()').extract_first() or ''
        ).strip()
        phone = box.xpath(
            './/span[contains(@class, "icon-phone")]/../span[@class="detail-text"]/text()'
        ).extract()

        item = {
            'title': title,
            'address': address_lines[0] if address_lines else '',
            # Index 1 only exists when there are at least two lines.
            'village': address_lines[1] if len(address_lines) > 1 else '',
            'phone': phone,
        }

        if not phone:
            continue

        if phone[0].isnumeric():
            item['phone'] = phone[0]
            yield item
        elif len(phone) >= 2:
            # The second text node is sent to the decrypt endpoint; the item
            # travels along in ``meta`` and is completed by this same method.
            yield scrapy.Request(
                'https://www.infobel.com/fr/france/Search/Decrypt?encryptedString={}'.format(phone[1]),
                callback=self.parse,
                meta={'item': item, 'has_phone': True},
            )

谢谢。你能告诉我应该把它放在哪一行吗？非常感谢，先生。我开始爬取时出现了错误。