Python: 我能够使用请求/回调成功浏览多个网站,但写入 CSV 时得到重复的项目

标签: python, scrapy

我的代码执行以下操作:

  • 通过原始网站Finviz.com进行解析,并删除一些项目,如P/E等
  • 在Yahoo finance中通过两个单独的URL请求回调和解析,并提取更多信息
  • 将请求的项返回到一个包含finviz信息和yahoo信息的干净字典值中

  • 我似乎已经成功地做到了。但是,我的输出有问题。输出同时写入finviz信息,如P/E、marketcap,并输出新访问的信息,即现在的finviz+yahoo集合(

    在混乱中,我发现解决方案是在我的第二个请求中发布一个请求

    差不多:

    class FinvizSpider(CrawlSpider):
        """Crawl the Finviz screener and enrich each row with data from two
        Yahoo Finance pages (key statistics, then cash flow).

        Bug fixed: the original ``parse_items`` both collected every item into
        a list that it returned AND yielded a follow-up ``Request`` carrying
        the same item in ``meta``.  Scrapy therefore exported each ticker
        twice -- once bare (no BBY/Enterprise) and once enriched -- which is
        exactly the duplicate-CSV symptom.  (Mixing ``yield`` with
        ``return items`` in one function is also a SyntaxError on Python 2
        generators.)  Now ``parse_items`` yields only Requests, and the single
        fully-populated item is emitted from the last callback, LinkParse2.
        """
        name = "finviz"
        allowed_domains = ["finviz.com", "finance.yahoo.com"]
        start_urls = ["http://finviz.com/screener.ashx?v=152&f=cap_smallover&ft=4&c=0,1,2,6,7,10,11,13,14,45,65"]

        rules = (
            # Raw string for the regex so '\d' is not an escape-sequence trap.
            Rule(LxmlLinkExtractor(allow=(r'r=\d+'),
                                   restrict_xpaths='//a[@class="tab-link"]'),
                 callback="parse_items", follow=True),
        )

        def parse_start_url(self, response):
            # CrawlSpider rules skip the first response; parse it explicitly.
            return self.parse_items(response)

        def parse_items(self, response):
            """Extract one partial item per screener row and chain a Yahoo
            key-statistics request for each ticker.

            Yields Requests only -- never items -- so each ticker reaches the
            feed exporter exactly once, via LinkParse2.
            """
            hxs = HtmlXPathSelector(response)
            trs = hxs.select('//table[@bgcolor="#d3d3d3"]/tr')
            for tr in trs[1:]:  # skip the header row
                item = StockfundamentalsItem()
                item['ticker'] = tr.select('td[2]/a/text()').extract()
                item["marketcap"] = tr.select("td[4]//text()").extract()
                item["pEarnings"] = tr.select("td[5]//text()").extract()
                item["pSales"] = tr.select("td[6]//text()").extract()
                item["pBook"] = tr.select("td[7]//text()").extract()
                item["pFCF"] = tr.select("td[8]//text()").extract()
                item["Div"] = tr.select("td[9]//text()").extract()

                newurl = ("http://finance.yahoo.com/q/ks?s="
                          + item['ticker'][0] + "+Key+Statistics")
                # Hand the partial item to the next callback through meta.
                # Do NOT also return/append it here -- that was the source of
                # the duplicate CSV rows.
                yield Request(newurl, meta={'item': item}, callback=self.LinkParse)

        def LinkParse(self, response):
            """Scrape enterprise value, then chain to the cash-flow page."""
            hxs = HtmlXPathSelector(response)
            enterprise = hxs.select('//table[@class="yfnc_datamodoutline1"]//tr[9]/td[2]/text()').extract()
            item = response.meta['item']
            # Safe slice: an empty list (page layout changed / cell missing)
            # yields [] instead of raising IndexError and killing the crawl.
            item['Enterprise'] = enterprise[:1]
            newurl2 = "http://finance.yahoo.com/q/cf?s=" + item['ticker'][0] + "&ql=1"
            yield Request(newurl2, meta={'item': item}, callback=self.LinkParse2)

        def LinkParse2(self, response):
            """Sum the stock-repurchase row of the cash-flow table and emit
            the final item -- the only place an item leaves this spider."""
            hxs = HtmlXPathSelector(response)
            stockpurchases = hxs.select('//table[@class="yfnc_tabledata1"]//tr[23]')
            runningtot = 0
            tds = stockpurchases.select("./td/text()").extract()
            for cell in tds[1:]:
                # Yahoo formatting: '-' means zero, '(x)' means negative,
                # commas are thousands separators.
                val = float(cell.strip().replace('-', '0').replace(',', '').replace('(', '-').replace(')', ''))
                runningtot += val
            item = response.meta['item']
            item['BBY'] = [runningtot]
            return item
    

    但是,这似乎不是解决此问题的正确方法…有没有一种方法可以正确执行多个请求?

    实际上，发起链式请求本身是正确的方法。重复行的真正原因在于 parse_items 既把每个 item 追加到列表并 return items，又通过 meta 把同一个 item 传给后续回调再次输出——删除 items.append(item) 和 return items，让 parse_items 只 yield Request，由最后一个回调（LinkParse2）返回唯一的完整 item 即可。您也可以尝试使用 Scrapy 的 inline-requests 扩展来简化链式请求。
    pFCF,pBook,pEarnings,BBY,Enterprise,marketcap,Div,ticker,pSales
    14.44,3.24,33.45,,10.66,13.70B,0.98%,A,2.17
    14.44,3.24,33.45,17000,10.66,13.70B,0.98%,A,2.17
    .
    .
    .
    
    class FinvizSpider(CrawlSpider):
        """Finviz screener spider; each row is enriched via two chained Yahoo
        Finance requests (key statistics -> cash flow).

        Defect repaired: ``parse_items`` previously returned a list of the
        partial items while ALSO yielding a Request that carried each item in
        ``meta``, so the exporter wrote every ticker twice (one row without
        BBY/Enterprise, one with).  ``yield`` plus ``return items`` in the
        same function is additionally a SyntaxError on Python 2.  The rewrite
        makes ``parse_items`` a pure Request generator; the completed item is
        returned exactly once, from LinkParse2.
        """
        name = "finviz"
        allowed_domains = ["finviz.com", "finance.yahoo.com"]
        start_urls = ["http://finviz.com/screener.ashx?v=152&f=cap_smallover&ft=4&c=0,1,2,6,7,10,11,13,14,45,65"]

        rules = (
            # r'...' avoids the invalid-escape pitfall in '\d'.
            Rule(LxmlLinkExtractor(allow=(r'r=\d+'),
                                   restrict_xpaths='//a[@class="tab-link"]'),
                 callback="parse_items", follow=True),
        )

        def parse_start_url(self, response):
            # Ensure the first screener page is parsed too (rules only fire
            # on followed links).
            return self.parse_items(response)

        def parse_items(self, response):
            """Build a partial item per table row, then yield the key-stats
            Request for it.  No item is emitted here, which is what prevents
            the duplicate CSV rows."""
            hxs = HtmlXPathSelector(response)
            rows = hxs.select('//table[@bgcolor="#d3d3d3"]/tr')
            for row in rows[1:]:  # first <tr> is the header
                item = StockfundamentalsItem()
                item['ticker'] = row.select('td[2]/a/text()').extract()
                item["marketcap"] = row.select("td[4]//text()").extract()
                item["pEarnings"] = row.select("td[5]//text()").extract()
                item["pSales"] = row.select("td[6]//text()").extract()
                item["pBook"] = row.select("td[7]//text()").extract()
                item["pFCF"] = row.select("td[8]//text()").extract()
                item["Div"] = row.select("td[9]//text()").extract()

                stats_url = ("http://finance.yahoo.com/q/ks?s="
                             + item['ticker'][0] + "+Key+Statistics")
                # The in-progress item travels in meta; it is only emitted by
                # the final callback in the chain.
                yield Request(stats_url, meta={'item': item}, callback=self.LinkParse)

        def LinkParse(self, response):
            """Add enterprise value and chain to the cash-flow page."""
            hxs = HtmlXPathSelector(response)
            enterprise = hxs.select('//table[@class="yfnc_datamodoutline1"]//tr[9]/td[2]/text()').extract()
            item = response.meta['item']
            # enterprise[:1] degrades to [] if the cell is absent, instead of
            # raising IndexError for one malformed page.
            item['Enterprise'] = enterprise[:1]
            cf_url = "http://finance.yahoo.com/q/cf?s=" + item['ticker'][0] + "&ql=1"
            yield Request(cf_url, meta={'item': item}, callback=self.LinkParse2)

        def LinkParse2(self, response):
            """Total the buyback row and return the one complete item."""
            hxs = HtmlXPathSelector(response)
            buyback_row = hxs.select('//table[@class="yfnc_tabledata1"]//tr[23]')
            total = 0
            cells = buyback_row.select("./td/text()").extract()
            for cell in cells[1:]:
                # Normalise Yahoo's number formatting: '-' -> 0, '(x)' -> -x,
                # strip thousands commas.
                total += float(cell.strip().replace('-', '0').replace(',', '').replace('(', '-').replace(')', ''))
            item = response.meta['item']
            item['BBY'] = [total]
            return item