Python Scrapy-在一个Scrapy脚本中抓取不同的网页
我正在创建一个 web 应用程序,从不同的网站上收集一长串鞋子。以下是我的两个单独的 Scrapy 脚本:我怎样才能把它们合并到一起呢?我已经阅读了 Scrapy 的文档,没有看到其中提到这一点,文档只提到了如何从一个根地址抓取两个地址。
谢谢。把两个域都放进 allowed_domains,把两个 URL 都放进 start_urls,然后用一个简单的 if-else 来确定要执行哪部分代码:
from scrapy import Spider
from scrapy.http import Request
class ShoesSpider(Spider):
    """Scrape clearance soccer shoes from Nike and Dick's Sporting Goods.

    A single spider covers both sites; each callback branches on
    ``response.url`` to pick the site-specific selectors.
    """

    name = "shoes"
    allowed_domains = ["store.nike.com", "dickssportinggoods.com"]
    start_urls = [
        'http://store.nike.com/us/en_us/pw/mens-clearance-soccer-shoes/47Z7puZ896Zoi3',
        'http://www.dickssportinggoods.com/products/clearance-soccer-cleats.jsp',
    ]

    def parse(self, response):
        """Extract product links from a listing page and follow each one.

        Yields a :class:`scrapy.http.Request` per product, handled by
        :meth:`parse_shoes`.
        """
        if "store.nike.com" in response.url:
            shoes = response.xpath('//*[@class="grid-item-image-wrapper sprite-sheet sprite-index-0"]/a/@href').extract()
        elif "dickssportinggoods.com" in response.url:
            shoes = response.xpath('//*[@class="fplpTitle header4"]/a/@href').extract()
        else:
            # Defensive default: without this branch an unexpected domain
            # left `shoes` unbound and the loop below raised NameError.
            shoes = []
        for shoe in shoes:
            yield Request(shoe, callback=self.parse_shoes)

    def parse_shoes(self, response):
        """Yield one item dict (url, name, price, sizes, shoe_type) per product page."""
        url = response.url
        if "store.nike.com" in response.url:
            name = response.xpath('//*[@itemprop="name"]/text()').extract_first()
            price = response.xpath('//*[@itemprop="price"]/text()').extract_first()
            # extract_first() returns None when the node is missing; the
            # original crashed on None.replace in that case.
            if price is not None:
                price = price.replace('$', '')
            shoe_type = response.css('.exp-product-subtitle::text').extract_first()
            sizes = response.xpath('//*[@class="nsg-form--drop-down exp-pdp-size-dropdown exp-pdp-dropdown two-column-dropdown"]/option')
            # Keep only in-stock sizes: out-of-stock <option> elements carry
            # the "exp-pdp-size-not-in-stock selectBox-disabled" class.
            sizes = sizes.xpath('text()[not(parent::option/@class="exp-pdp-size-not-in-stock selectBox-disabled")]').extract()
            sizes = [s.strip() for s in sizes]
            yield {
                'url': url,
                'name': name,
                'price': price,
                'sizes': sizes,
                'shoe_type': shoe_type,
            }
        elif "dickssportinggoods.com" in response.url:
            sizes = response.xpath('//*[@class="swatches clearfix"]/input/@value').extract()
            # NOTE(review): the original `if sizes == []: pass` was a no-op;
            # the check only makes sense as an intended skip of products
            # without sizes, so actually skip them here.
            if not sizes:
                return
            name = response.xpath('.//*[@id="PageHeading_3074457345618261107"]/h1/text()').extract_first()
            price = response.xpath('.//*[@itemprop="price"]/text()').extract_first()
            # shoe_type has no known selector on this site; emit empty string.
            yield {
                'url': url,
                'name': name,
                'price': price,
                'sizes': sizes,
                'shoe_type': '',
            }
您不必指定 allowed_domains 变量。您可以省略 allowed_domains 变量,这样就没有域限制。@tadm123 太棒了!我没有运行您的代码,如果我的代码有任何语法错误或任何其他问题,请用您的完整可运行代码编辑我的答案,这样它可能会帮助未来的读者。只是有一处 `elif` 拼写错误(写成了 ellif),但您现在已经修复了它。谢谢你展示了如何做到这一点,我有 10 个不同的抓鞋脚本,正试图把它们整合在一起。
from scrapy import Spider
from scrapy.http import Request
class ShoesSpider(Spider):
    """Scrape clearance soccer cleats from Dick's Sporting Goods only."""

    name = "shoes"
    allowed_domains = ["dickssportinggoods.com"]
    start_urls = ['http://www.dickssportinggoods.com/products/clearance-soccer-cleats.jsp']

    def parse(self, response):
        """Extract product links from the listing page and follow each one."""
        shoes = response.xpath('//*[@class="fplpTitle header4"]/a/@href').extract()
        for shoe in shoes:
            yield Request(shoe, callback=self.parse_shoes)

    def parse_shoes(self, response):
        """Yield one item dict (url, name, price, sizes, shoe_type) per product page."""
        sizes = response.xpath('//*[@class="swatches clearfix"]/input/@value').extract()
        # NOTE(review): the original `if sizes == []: pass` was a no-op;
        # the check only makes sense as an intended skip of products
        # without sizes, so actually skip them here.
        if not sizes:
            return
        url = response.url
        name = response.xpath('.//*[@id="PageHeading_3074457345618261107"]/h1/text()').extract_first()
        price = response.xpath('.//*[@itemprop="price"]/text()').extract_first()
        # shoe_type has no known selector on this site; emit empty string.
        yield {
            'url': url,
            'name': name,
            'price': price,
            'sizes': sizes,
            'shoe_type': '',
        }
from scrapy import Spider
from scrapy.http import Request
class ShoesSpider(Spider):
    """Scrape clearance soccer shoes from Nike and Dick's Sporting Goods.

    A single spider covers both sites; each callback branches on
    ``response.url`` to pick the site-specific selectors.
    """

    name = "shoes"
    allowed_domains = ["store.nike.com", "dickssportinggoods.com"]
    start_urls = [
        'http://store.nike.com/us/en_us/pw/mens-clearance-soccer-shoes/47Z7puZ896Zoi3',
        'http://www.dickssportinggoods.com/products/clearance-soccer-cleats.jsp',
    ]

    def parse(self, response):
        """Extract product links from a listing page and follow each one.

        Yields a :class:`scrapy.http.Request` per product, handled by
        :meth:`parse_shoes`.
        """
        if "store.nike.com" in response.url:
            shoes = response.xpath('//*[@class="grid-item-image-wrapper sprite-sheet sprite-index-0"]/a/@href').extract()
        elif "dickssportinggoods.com" in response.url:
            shoes = response.xpath('//*[@class="fplpTitle header4"]/a/@href').extract()
        else:
            # Defensive default: without this branch an unexpected domain
            # left `shoes` unbound and the loop below raised NameError.
            shoes = []
        for shoe in shoes:
            yield Request(shoe, callback=self.parse_shoes)

    def parse_shoes(self, response):
        """Yield one item dict (url, name, price, sizes, shoe_type) per product page."""
        url = response.url
        if "store.nike.com" in response.url:
            name = response.xpath('//*[@itemprop="name"]/text()').extract_first()
            price = response.xpath('//*[@itemprop="price"]/text()').extract_first()
            # extract_first() returns None when the node is missing; the
            # original crashed on None.replace in that case.
            if price is not None:
                price = price.replace('$', '')
            shoe_type = response.css('.exp-product-subtitle::text').extract_first()
            sizes = response.xpath('//*[@class="nsg-form--drop-down exp-pdp-size-dropdown exp-pdp-dropdown two-column-dropdown"]/option')
            # Keep only in-stock sizes: out-of-stock <option> elements carry
            # the "exp-pdp-size-not-in-stock selectBox-disabled" class.
            sizes = sizes.xpath('text()[not(parent::option/@class="exp-pdp-size-not-in-stock selectBox-disabled")]').extract()
            sizes = [s.strip() for s in sizes]
            yield {
                'url': url,
                'name': name,
                'price': price,
                'sizes': sizes,
                'shoe_type': shoe_type,
            }
        elif "dickssportinggoods.com" in response.url:
            sizes = response.xpath('//*[@class="swatches clearfix"]/input/@value').extract()
            # NOTE(review): the original `if sizes == []: pass` was a no-op;
            # the check only makes sense as an intended skip of products
            # without sizes, so actually skip them here.
            if not sizes:
                return
            name = response.xpath('.//*[@id="PageHeading_3074457345618261107"]/h1/text()').extract_first()
            price = response.xpath('.//*[@itemprop="price"]/text()').extract_first()
            # shoe_type has no known selector on this site; emit empty string.
            yield {
                'url': url,
                'name': name,
                'price': price,
                'sizes': sizes,
                'shoe_type': '',
            }