Python: how to split the output of a URL list in Scrapy
I am trying to generate one csv file per scraped URL from a list of URLs in Scrapy. I understand that I need to modify pipeline.py, but so far all my attempts have failed. I don't understand how to pass the scraped URL to the pipeline, use it as the name of the output file, and split the output accordingly. Any help? Thanks. Here are the spider and the pipeline:
from scrapy import Spider
from scrapy.selector import Selector
from vApp.items import fItem


class VappSpider(Spider):

    name = "vApp"
    allowed_domains = ["google.co.uk"]
    # one start URL per line in the input file
    start_urls = [l.strip() for l in open('data/listOfUrls.txt').readlines()]

    def parse(self, response):
        trs = Selector(response).xpath('//*[@id="incdiv"]/table/tbody/tr')
        for tr in trs:
            item = fItem()
            try:
                # first cell of the row, or 'null' if the row is empty
                item['item'] = tr.xpath('td/text()').extract()[0]
            except IndexError:
                item['item'] = 'null'
            yield item
Pipeline:
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter


class VappPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('results/%s.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['item']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
I think you should do all of this in batch, as a post-processing step once the crawl has finished, rather than per item, but here is a draft of how to do what you want:
from scrapy import Spider
from scrapy.selector import Selector
from vApp.items import fItem


class VappSpider(Spider):

    name = "vApp"
    allowed_domains = ["google.co.uk"]
    start_urls = [l.strip() for l in open('data/listOfUrls.txt').readlines()]

    def parse(self, response):
        trs = Selector(response).xpath('//*[@id="incdiv"]/table/tbody/tr')
        for tr in trs:
            item = fItem()
            try:
                item['item'] = tr.xpath('td/text()').extract()[0]
            except IndexError:
                item['item'] = 'null'
            # record which URL the row came from so the pipeline can split on it
            item['url'] = response.url
            yield item
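Note that for item['url'] to be populated, the Item class in items.py also needs a url field. A minimal sketch, assuming fItem currently declares only the item field:

from scrapy.item import Item, Field


class fItem(Item):
    item = Field()
    url = Field()   # assumed addition: holds response.url so the pipeline can split the output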
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter
from urlparse import urlparse


class VappPipeline(object):
    def __init__(self):
        self.files = {}
        self.exporter = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def process_item(self, item, spider):
        url = item['url']
        parsed_uri = urlparse(url)
        domain = parsed_uri.netloc

        # lazily open one file and exporter per domain the first time it is seen
        if domain not in self.exporter:
            file = open('results/%s.csv' % domain, 'w+b')
            self.files[domain] = file
            self.exporter[domain] = CsvItemExporter(file)
            self.exporter[domain].fields_to_export = ['item']
            self.exporter[domain].start_exporting()

        assert domain in self.exporter

        self.exporter[domain].export_item(item)
        return item

    def spider_closed(self, spider):
        # close every per-domain exporter and file when the crawl ends
        for domain, exporter in self.exporter.iteritems():
            exporter.finish_exporting()
            self.files[domain].close()
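Whichever version of the pipeline you use, it also has to be enabled in settings.py or process_item will never be called. A minimal sketch, assuming the pipeline class lives in vApp/pipelines.py:

ITEM_PIPELINES = {
    'vApp.pipelines.VappPipeline': 300,   # assumed module path; the number is the pipeline's priority
}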
Thanks @neverlastn. I tried your solution but it gave me lots of errors. I think you are right: I should post-process the data after the crawl, especially since I found out that the rows are not exported in order. The rows of the scraped table do not seem to be scraped sequentially; rather, after the crawler has moved on to another URL, it still exports some rows from the previous address. So basically my table looks like row1url1, row1url2, row2url1, row2url1...
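Since the comment above settles on post-processing, a simpler route may be to keep the url field on each item, dump everything into a single file with the built-in feed export (for example scrapy crawl vApp -o all.csv -t csv), and split that file afterwards. A rough Python 2 sketch, where all.csv, the results/ directory, and the item/url column names are assumptions:

import csv
import os
from urlparse import urlparse

# read the combined feed export produced by the crawl
with open('all.csv', 'rb') as f:    # 'all.csv' is an assumed file name
    rows = list(csv.DictReader(f))

outputs = {}
try:
    for row in rows:
        # build a file-system-friendly name from the source URL
        parsed = urlparse(row['url'])
        name = (parsed.netloc + parsed.path).strip('/').replace('/', '_')
        if name not in outputs:
            out = open(os.path.join('results', '%s.csv' % name), 'wb')
            writer = csv.DictWriter(out, fieldnames=['item', 'url'])
            writer.writeheader()
            outputs[name] = (out, writer)
        outputs[name][1].writerow(row)
finally:
    for out, _ in outputs.values():
        out.close()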