Scrapy ends after the first result


I've been searching around and can't find the answer I'm looking for. My crawler (Scrapy) is returning results close to what I want; what I'm trying to do now is have it pull multiple results from the page. Currently it pulls the first one and stops. If I take off extract_first(), it extracts all of the data but groups it together (illustrated in the short sketch below). So either one of two answers would work:

1) Keep crawling through the results instead of ending, or 2) ungroup each item onto its own result row.
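For reference, a minimal sketch (not part of the original post) of the extract_first()/extract() behaviour described above, using made-up markup standing in for a two-result page:

from scrapy.selector import Selector

# Hypothetical HTML standing in for a results page with two job cards.
html = '''
<div class="job_content"><span class="just_job_title">Operations Manager</span></div>
<div class="job_content"><span class="just_job_title">Warehouse Manager</span></div>
'''

titles = Selector(text=html).xpath('//span[@class="just_job_title"]/text()')
print(titles.extract_first())  # 'Operations Manager' -- only the first match
print(titles.extract())        # ['Operations Manager', 'Warehouse Manager'] -- every match, grouped into one list

So the fix is not choosing between the two calls, but looping over one selector per result and calling extract_first() inside the loop.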

Here is my code:

import scrapy
from scrapy.selector import Selector
from urlparse import urlparse
from urlparse import urljoin
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
#from scrappy.http import HtmlResponse

class MySpider(CrawlSpider):
    name = "ziprecruiter"

    def start_requests(self):
        allowed_domains = ["https://www.ziprecruiter.com/"]     
        urls = [
            'https://www.ziprecruiter.com/candidate/search?search=operations+manager&location=San+Francisco%2C+CA'
            ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for houses in response.xpath('/html/body'):
            yield {

                'Job_title:' : houses.xpath('.//span[@class="just_job_title"]//text()[1]').extract_first(),
                'Company:' : houses.xpath('.//a[@class="t_org_link name"]//text()[1]').extract_first(),
                'Location:' : houses.xpath('.//a[@class="t_location_link location"]//text()[1]').extract_first(),
                'FT/PT:' : houses.xpath('.//span[@class="data_item"]//text()[1]').extract_first(),

                'Link' : houses.xpath('/html/body/main/div/section/div/div[2]/div/div[2]/div[1]/article[4]/div[1]/button[1]/text()').extract_first(),
                'Link' : houses.xpath('.//a/@href[1]').extract_first(),
                'pay' : houses.xpath('./section[@class="perks_item"]/span[@class="data_item"]//text()[1]').extract_first()

                }
Thank you in advance.

EDIT:: After more research, I redefined the container to crawl, and that gives me all the right answers. Now my problem is how to get every item on the page, not just the first result... it just doesn't loop. Here is my code:

import scrapy
from scrapy.selector import Selector
from urlparse import urlparse
from urlparse import urljoin
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
#from scrappy.http import HtmlResponse

class MySpider(CrawlSpider):
    name = "ziprecruiter"

    def start_requests(self):
        allowed_domains = ["https://www.ziprecruiter.com/"]     
        urls = [
            'https://www.ziprecruiter.com/candidate/search?search=operations+manager&location=San+Francisco%2C+CA'
            ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for houses in response.xpath('/html/body/main/div/section/div/div[2]/div/div[2]/div[1]/article[1]/div[2]'):
            yield {

                'Job_title:' : houses.xpath('.//span[@class="just_job_title"]//text()').extract(),
                'Company:' : houses.xpath('.//a[@class="t_org_link name"]//text()').extract(),
                'Location:' : houses.xpath('.//a[@class="t_location_link location"]//text()').extract(),
                'FT/PT:' : houses.xpath('.//span[@class="data_item"]//text()').extract(),
                'Link' : houses.xpath('.//a/@href').extract(),
                'pay' : houses.xpath('./section[@class="perks_item"]/span[@class="data_item"]//text()').extract()

                }

It seems to me that you should be using the following xpath:

//div[@class="job_content"]

Because that's the div you're looking for. When I run it against this page, 20 div elements are returned. However, you may need to add some filtering to the xpath query in case there are other divs with that class name that you don't want to parse.
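To make that concrete, here is a minimal sketch of what the parse method could look like with that container. It reuses the class-based field selectors from the question; the live page may have changed, so treat the exact class names as assumptions:

def parse(self, response):
    # One iteration per job card, so every job yields its own result row.
    for job in response.xpath('//div[@class="job_content"]'):
        yield {
            'Job_title': job.xpath('.//span[@class="just_job_title"]//text()').extract_first(),
            'Company': job.xpath('.//a[@class="t_org_link name"]//text()').extract_first(),
            'Location': job.xpath('.//a[@class="t_location_link location"]//text()').extract_first(),
            'FT/PT': job.xpath('.//span[@class="data_item"]//text()').extract_first(),
            'Link': job.xpath('.//a/@href').extract_first(),
            # Loosened from the question's './section[...]' to a descendant search.
            'pay': job.xpath('.//section[@class="perks_item"]//span[@class="data_item"]//text()').extract_first(),
        }

Because the loop variable is now a single job card rather than the whole body, the relative .// selectors only see one job's fields, and extract_first() no longer throws away the other results.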

I only get one div element back from the xpath you provided, which explains why the houses loop only runs once. It looks to me like the problem is in your xpath.

Thanks! I just switched over to this and it looks like it does the trick. I'll do more testing shortly, but I think that's the fix!

Sorry to bother you again, but I'm hitting the same problem on another crawler. Could you link me to more information on finding the "successful" container?

I'm not sure what you mean by container.

What I mean is: in this example, how would I go about finding the right xpath for the container, the way you came up with //div[@class="job_content"]? I'm still new at this, so it's a bit confusing, and I can't seem to phrase the right question in a Google search :/
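(Added for context, not from the original thread.) A common workflow for finding a container like //div[@class="job_content"]: open the page in a browser, right-click one job result, choose Inspect, and walk up the element tree until you find the node that wraps exactly one result and repeats once per result. Then confirm the match count in scrapy shell before putting it in the spider. A sketch, with illustrative output:

scrapy shell 'https://www.ziprecruiter.com/candidate/search?search=operations+manager&location=San+Francisco%2C+CA'

# Inside the shell: how many nodes does the candidate container xpath match?
# You want one per job card -- the answer above saw 20 on this page.
>>> len(response.xpath('//div[@class="job_content"]'))
20

# Spot-check that a single container holds the fields you need:
>>> response.xpath('//div[@class="job_content"]')[0].xpath(
...     './/span[@class="just_job_title"]//text()').extract_first()
u'Operations Manager'    # illustrative output, not captured from the live page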