Using Scrapy & Python, the correct values are not being appended
I am trying to save some scraped data to a CSV file with Scrapy; my spider runs three scraping steps in sequence. However, the CSV file generated by the crawl only contains the data extracted by the first callback, "parse", while what I actually need to save is the data from the third callback. I even tried generating the CSV with pandas, but Scrapy seems to have trouble with pandas. I run the crawl with:
scrapy crawl goodmans -o goodmans.csv -t csv
What can I do? Here is my code:
import os
import scrapy
from ..items import TutorialItem
import pandas as pd
from scrapy.selector import Selector
from scrapy.http import Request

BASE_DIR = os.path.dirname(os.path.abspath(__file__))


class GoodmanSpider(scrapy.Spider):
    name = "goodmans"
    start_urls = ['http://www.goodmans.net/d/1706/brands.htm']
    supplier_urls = []

    def parse(self, response):
        items = TutorialItem()
        all_data = response.css('.SubDepartments')
        for data in all_data:
            category = data.css('.SubDepartments a::text').extract()
            category_url = data.css('.SubDepartments a::attr(href)').extract()
            items['category'] = category
            items['category_url'] = category_url
            for cat, url in zip(category, category_url):
                item = dict(category=cat, category_url=url)
                yield item
                yield Request(url, callback=self.parse_following_urls, dont_filter=True)
            # yield {'Category': category, 'url': category_url}

    def parse_following_urls(self, response):
        items = TutorialItem()
        all_data = response.css('.SubDepartments')
        for data in all_data:
            article_url = data.css('.SubDepartments a::attr(href)').extract()
            items['article_url'] = article_url
            for url in article_url:
                item = dict(article_url=url)
                # yield item
                yield Request(url, callback=self.parse_following_urls_2, dont_filter=True)

    def parse_following_urls_2(self, response):
        items = TutorialItem()
        all_data = response.css('.recin')
        for data in all_data:
            article_url_2 = data.css('.recit a').xpath('@href').extract()
            article_name = data.css('.recit span::text').extract()
            article_price = data.css('.price span::text').extract()
            article_item_num = data.css('.itemno span::text').extract()
            article_image = data.css('.linkit img').xpath('@src').extract()
            items['article_url_2'] = article_url_2
            items['article_name'] = article_name
            items['article_price'] = article_price
            items['article_item_num'] = article_item_num
            items['article_image'] = article_image
            for au, an, ap, ain, ai in zip(article_url_2, article_name, article_price, article_item_num, article_image):
                item = dict(article_url_2=au, article_name=an, article_price=ap, article_item_num=ain, article_image=ai)
                df_result = pd.DataFrame.from_dict(item)
                path_file = os.path.join(BASE_DIR, 'csv/supplier_product_database.csv')
                my_csv = df_result.to_csv(path_file, index=None, header=True)
                yield item
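As an aside on why the pandas attempt misbehaves (this is a sketch of a fix, not part of the original post): pd.DataFrame.from_dict(item) raises a ValueError when every value in the dict is a scalar, and to_csv called without mode='a' overwrites the file on every item, so at best only the last row would survive. A minimal append-style helper, assuming the same flat item dict as above:

import os
import pandas as pd


def append_item_to_csv(item, path_file):
    # Wrap the dict in a list so pandas builds a one-row frame;
    # DataFrame.from_dict on all-scalar values raises ValueError otherwise.
    df = pd.DataFrame([item])
    # Write the header only on the first call, then append rows.
    write_header = not os.path.exists(path_file)
    df.to_csv(path_file, mode='a', index=False, header=write_header)

That said, the simpler route is to drop pandas entirely and let Scrapy's feed exporter (the -o flag above) collect every yielded item.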
My items.py file:
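The actual contents of items.py were not included in the post; a minimal definition that would satisfy the fields assigned in the spider above would look something like this (a guess, not the poster's file):

import scrapy


class TutorialItem(scrapy.Item):
    category = scrapy.Field()
    category_url = scrapy.Field()
    article_url = scrapy.Field()
    article_url_2 = scrapy.Field()
    article_name = scrapy.Field()
    article_price = scrapy.Field()
    article_item_num = scrapy.Field()
    article_image = scrapy.Field()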
I used response.xpath; I hope this helps you.
class GoodManSpider(scrapy.Spider):
    name = "goodmans"

    def start_requests(self):
        url = 'http://www.goodmans.net/d/1706/brands.htm'
        yield scrapy.Request(url=url, callback=self.parse)

    ### scrape all brand links listed on goodman's page
    def parse(self, response):
        department_links = response.xpath('//ul[@class="SubDepartments"]/li/a/@href').extract()
        for link in department_links:
            yield scrapy.Request(url=link, callback=self.parse_contents)

    ### scrape all product links plus the category name and category url,
    ### carried over to the next callback via meta as a dictionary
    def parse_contents(self, response):
        category_url = response.url
        category = response.xpath('//div[@id="YouAreHere"]/a/text()').extract()[-1]
        products = response.xpath('//div[@class="name rname recit"]/a/@href').extract()
        for product_link in products:
            yield scrapy.Request(url=product_link, callback=self.parse_products,
                                 meta={'category_name': category, 'category_link': category_url})

    ### scrape all necessary data and yield the item
    def parse_products(self, response):
        category = response.meta['category_name']
        category_url = response.meta['category_link']
        article_url = response.url
        article_name = response.xpath('//h1[@class="SectionTitle"]/text()').extract()[0]
        article_price = response.xpath('//div[@id="ProdInfoL"]/div/span/text()').extract()[0]
        article_item_number = response.xpath('//span[@itemprop="productID"]/text()').extract()[0]
        article_image = response.xpath('//div[@id="FullPic"]/a/@href').extract()[0]
        items = TutorialItem()
        items['category'] = category
        items['category_url'] = category_url
        items['article_url'] = article_url
        items['article_name'] = article_name
        items['article_price'] = article_price
        items['article_item_num'] = article_item_number  # fixed: referenced an undefined name
        items['article_image'] = article_image
        yield items  # fixed: was `yield item`, which is undefined here
As far as I can tell from your post (I am not sure exactly what "article" refers to), the fields map as follows:
category = company name
category_url = company page link in goodman
article_name = product name
article_url = product_link
article_price = product price
article_item_number = product item number
article_image = product image
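A side note on the meta hand-off: it works, but on Scrapy 1.7+ the documented way to pass values between callbacks is cb_kwargs, which delivers them as plain keyword arguments and keeps meta free for middleware use. A sketch of the same hand-off, meant as a drop-in replacement for the two callbacks above:

# inside the same spider class as above (requires Scrapy 1.7+)
def parse_contents(self, response):
    category_url = response.url
    category = response.xpath('//div[@id="YouAreHere"]/a/text()').extract()[-1]
    for product_link in response.xpath('//div[@class="name rname recit"]/a/@href').extract():
        yield scrapy.Request(
            url=product_link,
            callback=self.parse_products,
            cb_kwargs={'category': category, 'category_url': category_url},
        )

def parse_products(self, response, category, category_url):
    # category and category_url arrive as ordinary keyword arguments
    ...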
Based on @bonifacio_kid's code and some small modifications, this is how it works now:
class GoodManSpider2(scrapy.Spider):
    name = "goodmans_2"

    def start_requests(self):
        url = 'http://www.goodmans.net/d/1706/brands.htm'
        yield scrapy.Request(url=url, callback=self.parse)

    ### scrape all brand links listed on goodman's page
    def parse(self, response):
        department_links = response.xpath('//ul[@class="SubDepartments"]/li/a/@href').extract()
        for link in department_links:
            yield scrapy.Request(url=link, callback=self.parse_contents)

    ### scrape all product links plus the category name and category url,
    ### carried over to the next callback via meta as a dictionary
    def parse_contents(self, response):
        category_url = response.url
        category = response.xpath('//div[@id="YouAreHere"]/a/text()').extract()[-1]
        products = response.xpath('//div[@class="name rname recit"]/a/@href').extract()
        for product_link in products:
            yield scrapy.Request(url=product_link, callback=self.parse_products,
                                 meta={'category_name': category, 'category_link': category_url})

    ### scrape all necessary data and yield the items
    def parse_products(self, response):
        items = TutorialItem()
        category = response.meta['category_name']
        category_url = response.meta['category_link']
        article_url = response.url
        article_name = response.xpath('//h1[@class="SectionTitle"]/text()').extract()
        article_price = response.xpath('//div[@id="ProdInfoL"]/div/span/text()').extract()
        article_item_num = response.xpath('//span[@itemprop="productID"]/text()').extract()
        article_image = response.xpath('//div[@id="FullPic"]/a/@href').extract()
        items['category'] = category
        items['category_url'] = category_url
        items['article_url'] = article_url
        items['article_name'] = article_name
        items['article_price'] = article_price
        items['article_item_num'] = article_item_num
        items['article_image'] = article_image
        # article_url is a single string (response.url), so it is not zipped;
        # zipping a string would iterate over its characters
        for an, ap, ain, ai in zip(article_name, article_price, article_item_num, article_image):
            item = dict(supplier_url=article_url, supplier_item_name=an, min_price=ap,
                        max_price=ap, article_item_num=ain, article_image=ai)
            yield item
What is the image used for? The image is for image recognition, which is very useful for developing the code further! I corrected a few small details and now it works fine.
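For completeness, the working spider can be exported the same way as the original attempt; the feed exporter writes every item yielded from any callback, and the -t csv flag is redundant once the output filename ends in .csv:

scrapy crawl goodmans_2 -o supplier_product_database.csv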