Python: how to split the output of a list of URLs in Scrapy


I am trying to generate one csv file for each scraped url from a list of urls in Scrapy. I do understand that I need to modify pipeline.py, but all my attempts so far have failed. I don't understand how to pass the scraped url into the pipeline, use it as the name of the output file, and split the output accordingly.

Any help?

Thanks

Here are the spider and the pipeline:

from scrapy import Spider
from scrapy.selector import Selector 
from vApp.items import fItem


class VappSpider(Spider):

    name = "vApp"
    allowed_domains = ["google.co.uk"]
    start_urls = [l.strip() for l in open('data/listOfUrls.txt').readlines()]


    def parse(self, response):

        trs = Selector(response).xpath('//*[@id="incdiv"]/table/tbody/tr')
        for tr in trs:
            item = fItem()

            try:
                item['item'] = tr.xpath('td/text()').extract()[0]
            except IndexError:
                item['item'] = 'null'

            yield item
The pipeline:

from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter

class VappPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('results/%s.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['item']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

I think you should do all of this in one batch as a post-processing step once the crawl has finished, rather than per item, but here is a draft of how to do what you want:

from scrapy import Spider
from scrapy.selector import Selector 
from vApp.items import fItem


class VappSpider(Spider):

    name = "vApp"
    allowed_domains = ["google.co.uk"]
    start_urls = [l.strip() for l in open('data/listOfUrls.txt').readlines()]


    def parse(self, response):

        trs = Selector(response).xpath('//*[@id="incdiv"]/table/tbody/tr')
        for tr in trs:
            item = fItem()

            try:
                item['item'] = tr.xpath('td/text()').extract()[0]
            except IndexError:
                item['item'] = 'null'
            # Carry the source url on the item so the pipeline can see it
            # (fItem needs a 'url' field declared for this to work).
            item['url'] = response.url
            yield item


from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter
from urlparse import urlparse

class VappPipeline(object):
    def __init__(self):
        self.files = {}
        self.exporter = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def process_item(self, item, spider):
        url = item['url']
        parsed_uri = urlparse(url)
        domain = parsed_uri.netloc
        if domain not in self.exporter:
            file = open('results/%s.csv' % domain, 'w+b')
            self.files[domain] = file
            self.exporter[domain] = CsvItemExporter(file)
            self.exporter[domain].fields_to_export = ['item']
            self.exporter[domain].start_exporting()

        assert domain in self.exporter

        self.exporter[domain].export_item(item)

        return item

    def spider_closed(self, spider):
        for domain, exporter in self.exporter.iteritems():
            exporter.finish_exporting()
            self.files[domain].close()
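
One thing worth noting about the draft above: it keys the exporters on the domain (parsed_uri.netloc), so if all of the start urls live on the same host (as allowed_domains = ["google.co.uk"] suggests) you would still end up with a single file. Below is a minimal sketch of a variant that keys on the full url instead, which is closer to the "one csv per scraped url" the question asks for. The PerUrlCsvPipeline name and the url-to-filename sanitising are my own assumptions, not part of the original answer; the scrapy.contrib.exporter import mirrors the answer's old-Scrapy style (recent versions expose it as scrapy.exporters.CsvItemExporter), and the pipeline still has to be enabled through ITEM_PIPELINES in settings.py.

import re
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter

class PerUrlCsvPipeline(object):
    """Hypothetical variant of the answer's pipeline: one csv per scraped url."""

    def __init__(self):
        self.files = {}
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def _key(self, url):
        # Turn the url into something safe to use as a file name.
        return re.sub(r'[^0-9A-Za-z]+', '_', url).strip('_')

    def process_item(self, item, spider):
        key = self._key(item['url'])
        if key not in self.exporters:
            # Lazily open one file/exporter per url the first time we see it.
            f = open('results/%s.csv' % key, 'w+b')
            self.files[key] = f
            exporter = CsvItemExporter(f)
            exporter.fields_to_export = ['item']
            exporter.start_exporting()
            self.exporters[key] = exporter
        self.exporters[key].export_item(item)
        return item

    def spider_closed(self, spider):
        # Flush and close every per-url file when the spider finishes.
        for key, exporter in self.exporters.items():
            exporter.finish_exporting()
            self.files[key].close()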


Thanks @neverlastn. I tried your solution but it gave me lots of errors. I think you're right, I should post-process the data after the crawl, especially now that I've found the rows are not exported in order. The rows of a scraped table don't seem to be scraped sequentially; rather, after the crawler has moved on to another url, some rows from the other addresses get exported. So basically my table looks like row1url1, row1url2, row2url1, row2url1...
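
For the post-processing route mentioned in this comment, a minimal sketch could look like the one below. It assumes the crawl was run into a single combined feed (for example scrapy crawl vApp -o all_items.csv) and that each row carries the url field set by the answer's spider; the all_items.csv name and the file naming scheme are illustrative only. Grouping after the crawl also side-steps the out-of-order export problem, because the grouping does not depend on the order in which the rows arrived.

import csv
import re
from collections import defaultdict

def safe_name(url):
    # Turn a url into something usable as a file name (illustrative only).
    return re.sub(r'[^0-9A-Za-z]+', '_', url).strip('_')

# Group the rows of the combined feed by the url they were scraped from.
rows_by_url = defaultdict(list)
with open('all_items.csv') as f:  # assumed combined output of the crawl
    for row in csv.DictReader(f):
        rows_by_url[row['url']].append(row)

# Write one csv per url, keeping only the 'item' column as in the pipelines above.
for url, rows in rows_by_url.items():
    with open('results/%s.csv' % safe_name(url), 'w') as out:
        writer = csv.DictWriter(out, fieldnames=['item'])
        writer.writeheader()
        for row in rows:
            writer.writerow({'item': row['item']})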