Using Scrapy & Python, the correct values are not being appended
I am trying to save some scraped data to a CSV file with Scrapy; my spider runs three scraping steps in sequence. However, the CSV file generated by the crawl only contains the data extracted by the first callback, "parse", while what I actually need to save is the data from the third callback. I even tried generating the CSV with pandas, but Scrapy seems to have trouble with pandas. I run the crawl with:
scrapy crawl goodmans -o goodmans.csv -t csv
What can I do? Here is my code:
import os
import scrapy
from ..items import TutorialItem
import pandas as pd
from scrapy.selector import Selector
from scrapy.http import Request

BASE_DIR = os.path.dirname(os.path.abspath(__file__))


class GoodmanSpider(scrapy.Spider):
    name = "goodmans"
    start_urls = ['http://www.goodmans.net/d/1706/brands.htm']
    supplier_urls = []

    def parse(self, response):
        items = TutorialItem()
        all_data = response.css('.SubDepartments')
        for data in all_data:
            category = data.css('.SubDepartments a::text').extract()
            category_url = data.css('.SubDepartments a::attr(href)').extract()
            items['category'] = category
            items['category_url'] = category_url
            for cat, url in zip(category, category_url):
                item = dict(category=cat, category_url=url)
                yield item
                yield Request(url, callback=self.parse_following_urls, dont_filter=True)
            # yield {'Category': category, 'url': category_url}

    def parse_following_urls(self, response):
        items = TutorialItem()
        all_data = response.css('.SubDepartments')
        for data in all_data:
            article_url = data.css('.SubDepartments a::attr(href)').extract()
            items['article_url'] = article_url
            for url in article_url:
                item = dict(article_url=url)
                # yield item
                yield Request(url, callback=self.parse_following_urls_2, dont_filter=True)

    def parse_following_urls_2(self, response):
        items = TutorialItem()
        all_data = response.css('.recin')
        for data in all_data:
            article_url_2 = data.css('.recit a').xpath('@href').extract()
            article_name = data.css('.recit span::text').extract()
            article_price = data.css('.price span::text').extract()
            article_item_num = data.css('.itemno span::text').extract()
            article_image = data.css('.linkit img').xpath('@src').extract()
            items['article_url_2'] = article_url_2
            items['article_name'] = article_name
            items['article_price'] = article_price
            items['article_item_num'] = article_item_num
            items['article_image'] = article_image
            for au, an, ap, ain, ai in zip(article_url_2, article_name, article_price, article_item_num, article_image):
                item = dict(article_url_2=au, article_name=an, article_price=ap, article_item_num=ain, article_image=ai)
                df_result = pd.DataFrame.from_dict(item)
                path_file = os.path.join(BASE_DIR, 'csv/supplier_product_database.csv')
                my_csv = df_result.to_csv(path_file, index=None, header=True)
                yield item
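As an aside on why the pandas attempt misbehaves (this is a sketch of a fix, not part of the original post): pd.DataFrame.from_dict(item) raises a ValueError when every value in the dict is a scalar, and to_csv called without mode='a' overwrites the file on every item, so at best only the last row would survive. A minimal append-style helper, assuming the same flat item dict as above:

import os
import pandas as pd


def append_item_to_csv(item, path_file):
    # Wrap the dict in a list so pandas builds a one-row frame;
    # DataFrame.from_dict on all-scalar values raises ValueError otherwise.
    df = pd.DataFrame([item])
    # Write the header only on the first call, then append rows.
    write_header = not os.path.exists(path_file)
    df.to_csv(path_file, mode='a', index=False, header=write_header)

That said, the simpler route is to drop pandas entirely and let Scrapy's feed exporter (the -o flag above) collect every yielded item.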
My items.py file:
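The actual contents of items.py were not included in the post; a minimal definition that would satisfy the fields assigned in the spider above would look something like this (a guess, not the poster's file):

import scrapy


class TutorialItem(scrapy.Item):
    category = scrapy.Field()
    category_url = scrapy.Field()
    article_url = scrapy.Field()
    article_url_2 = scrapy.Field()
    article_name = scrapy.Field()
    article_price = scrapy.Field()
    article_item_num = scrapy.Field()
    article_image = scrapy.Field()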
I used response.xpath; I hope this helps you.
class GoodManSpider(scrapy.Spider):
    name = "goodmans"

    def start_requests(self):
        url = 'http://www.goodmans.net/d/1706/brands.htm'
        yield scrapy.Request(url=url, callback=self.parse)

    ### scrape all brand links listed on goodman's page
    def parse(self, response):
        department_links = response.xpath('//ul[@class="SubDepartments"]/li/a/@href').extract()
        for link in department_links:
            yield scrapy.Request(url=link, callback=self.parse_contents)

    ### scrape all product links plus the category name and category url,
    ### carried over to the next callback via meta as a dictionary
    def parse_contents(self, response):
        category_url = response.url
        category = response.xpath('//div[@id="YouAreHere"]/a/text()').extract()[-1]
        products = response.xpath('//div[@class="name rname recit"]/a/@href').extract()
        for product_link in products:
            yield scrapy.Request(url=product_link, callback=self.parse_products,
                                 meta={'category_name': category, 'category_link': category_url})

    ### scrape all necessary data and yield the item
    def parse_products(self, response):
        category = response.meta['category_name']
        category_url = response.meta['category_link']
        article_url = response.url
        article_name = response.xpath('//h1[@class="SectionTitle"]/text()').extract()[0]
        article_price = response.xpath('//div[@id="ProdInfoL"]/div/span/text()').extract()[0]
        article_item_number = response.xpath('//span[@itemprop="productID"]/text()').extract()[0]
        article_image = response.xpath('//div[@id="FullPic"]/a/@href').extract()[0]
        items = TutorialItem()
        items['category'] = category
        items['category_url'] = category_url
        items['article_url'] = article_url
        items['article_name'] = article_name
        items['article_price'] = article_price
        items['article_item_num'] = article_item_number  # fixed: referenced an undefined name
        items['article_image'] = article_image
        yield items  # fixed: was `yield item`, which is undefined here
As far as I can tell from your post (I am not sure exactly what "article" refers to), the fields map as follows:
category = company name
category_url = company page link in goodman
article_name = product name
article_url = product_link
article_price = product price
article_item_number = product item number
article_image = product image
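A side note on the meta hand-off: it works, but on Scrapy 1.7+ the documented way to pass values between callbacks is cb_kwargs, which delivers them as plain keyword arguments and keeps meta free for middleware use. A sketch of the same hand-off, meant as a drop-in replacement for the two callbacks above:

# inside the same spider class as above (requires Scrapy 1.7+)
def parse_contents(self, response):
    category_url = response.url
    category = response.xpath('//div[@id="YouAreHere"]/a/text()').extract()[-1]
    for product_link in response.xpath('//div[@class="name rname recit"]/a/@href').extract():
        yield scrapy.Request(
            url=product_link,
            callback=self.parse_products,
            cb_kwargs={'category': category, 'category_url': category_url},
        )

def parse_products(self, response, category, category_url):
    # category and category_url arrive as ordinary keyword arguments
    ...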
Based on @bonifacio_kid's code and some small modifications, this is how it works now:
class GoodManSpider2(scrapy.Spider):
    name = "goodmans_2"

    def start_requests(self):
        url = 'http://www.goodmans.net/d/1706/brands.htm'
        yield scrapy.Request(url=url, callback=self.parse)

    ### scrape all brand links listed on goodman's page
    def parse(self, response):
        department_links = response.xpath('//ul[@class="SubDepartments"]/li/a/@href').extract()
        for link in department_links:
            yield scrapy.Request(url=link, callback=self.parse_contents)

    ### scrape all product links plus the category name and category url,
    ### carried over to the next callback via meta as a dictionary
    def parse_contents(self, response):
        category_url = response.url
        category = response.xpath('//div[@id="YouAreHere"]/a/text()').extract()[-1]
        products = response.xpath('//div[@class="name rname recit"]/a/@href').extract()
        for product_link in products:
            yield scrapy.Request(url=product_link, callback=self.parse_products,
                                 meta={'category_name': category, 'category_link': category_url})

    ### scrape all necessary data and yield the items
    def parse_products(self, response):
        items = TutorialItem()
        category = response.meta['category_name']
        category_url = response.meta['category_link']
        article_url = response.url
        article_name = response.xpath('//h1[@class="SectionTitle"]/text()').extract()
        article_price = response.xpath('//div[@id="ProdInfoL"]/div/span/text()').extract()
        article_item_num = response.xpath('//span[@itemprop="productID"]/text()').extract()
        article_image = response.xpath('//div[@id="FullPic"]/a/@href').extract()
        items['category'] = category
        items['category_url'] = category_url
        items['article_url'] = article_url
        items['article_name'] = article_name
        items['article_price'] = article_price
        items['article_item_num'] = article_item_num
        items['article_image'] = article_image
        # article_url is a single string (response.url), so it is not zipped;
        # zipping a string would iterate over its characters
        for an, ap, ain, ai in zip(article_name, article_price, article_item_num, article_image):
            item = dict(supplier_url=article_url, supplier_item_name=an, min_price=ap,
                        max_price=ap, article_item_num=ain, article_image=ai)
            yield item
What is the image used for? The image is for image recognition, which is very useful for developing the code further! I corrected a few small details and now it works fine.
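For completeness, the working spider can be exported the same way as the original attempt; the feed exporter writes every item yielded from any callback, and the -t csv flag is redundant once the output filename ends in .csv:

scrapy crawl goodmans_2 -o supplier_product_database.csv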