Scrapy 第一个结果后,瘙痒结束
我一直在四处寻找,找不到我要找的答案。我让我的爬虫(scrapy)返回的结果接近我要寻找的结果。所以我现在要做的是让它从页面中提取多个结果。目前,它拉动第一个并停止。如果我先取下extract_(),那么它会提取所有数据并对它们进行分组。因此,寻找两个答案中的一个是可行的 1) 继续爬网结果,不要结束 2) 将每个项目解组到新的结果行上 这是我的密码:Scrapy 第一个结果后,瘙痒结束,scrapy,scrapy-spider,Scrapy,Scrapy Spider,我一直在四处寻找,找不到我要找的答案。我让我的爬虫(scrapy)返回的结果接近我要寻找的结果。所以我现在要做的是让它从页面中提取多个结果。目前,它拉动第一个并停止。如果我先取下extract_(),那么它会提取所有数据并对它们进行分组。因此,寻找两个答案中的一个是可行的 1) 继续爬网结果,不要结束 2) 将每个项目解组到新的结果行上 这是我的密码: import scrapy from scrapy.selector import Selector from urlparse imp
import scrapy
from scrapy.selector import Selector
from urlparse import urlparse
from urlparse import urljoin
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
#from scrappy.http import HtmlResponse
class MySpider(CrawlSpider):
name = "ziprecruiter"
def start_requests(self):
allowed_domains = ["https://www.ziprecruiter.com/"]
urls = [
'https://www.ziprecruiter.com/candidate/search?search=operations+manager&location=San+Francisco%2C+CA'
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
for houses in response.xpath('/html/body'):
yield {
'Job_title:' : houses.xpath('.//span[@class="just_job_title"]//text()[1]').extract_first(),
'Company:' : houses.xpath('.//a[@class="t_org_link name"]//text()[1]').extract_first(),
'Location:' : houses.xpath('.//a[@class="t_location_link location"]//text()[1]').extract_first(),
'FT/PT:' : houses.xpath('.//span[@class="data_item"]//text()[1]').extract_first(),
'Link' : houses.xpath('/html/body/main/div/section/div/div[2]/div/div[2]/div[1]/article[4]/div[1]/button[1]/text()').extract_first(),
'Link' : houses.xpath('.//a/@href[1]').extract_first(),
'pay' : houses.xpath('./section[@class="perks_item"]/span[@class="data_item"]//text()[1]').extract_first()
}
import scrapy
from scrapy.selector import Selector
from urlparse import urlparse
from urlparse import urljoin
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
#from scrappy.http import HtmlResponse
class MySpider(CrawlSpider):
name = "ziprecruiter"
def start_requests(self):
allowed_domains = ["https://www.ziprecruiter.com/"]
urls = [
'https://www.ziprecruiter.com/candidate/search?search=operations+manager&location=San+Francisco%2C+CA'
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
for houses in response.xpath('/html/body/main/div/section/div/div[2]/div/div[2]/div[1]/article[1]/div[2]'):
yield {
'Job_title:' : houses.xpath('.//span[@class="just_job_title"]//text()').extract(),
'Company:' : houses.xpath('.//a[@class="t_org_link name"]//text()').extract(),
'Location:' : houses.xpath('.//a[@class="t_location_link location"]//text()').extract(),
'FT/PT:' : houses.xpath('.//span[@class="data_item"]//text()').extract(),
'Link' : houses.xpath('.//a/@href').extract(),
'pay' : houses.xpath('./section[@class="perks_item"]/span[@class="data_item"]//text()').extract()
}
提前谢谢你
编辑::
经过更多的研究,我重新定义了要爬行的容器,这给了我所有正确的答案。现在我的问题是如何获得页面上的每个项目,而不仅仅是第一个结果。。。它只是不循环。这是我的密码:
import scrapy
from scrapy.selector import Selector
from urlparse import urlparse
from urlparse import urljoin
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
#from scrappy.http import HtmlResponse
class MySpider(CrawlSpider):
name = "ziprecruiter"
def start_requests(self):
allowed_domains = ["https://www.ziprecruiter.com/"]
urls = [
'https://www.ziprecruiter.com/candidate/search?search=operations+manager&location=San+Francisco%2C+CA'
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
for houses in response.xpath('/html/body'):
yield {
'Job_title:' : houses.xpath('.//span[@class="just_job_title"]//text()[1]').extract_first(),
'Company:' : houses.xpath('.//a[@class="t_org_link name"]//text()[1]').extract_first(),
'Location:' : houses.xpath('.//a[@class="t_location_link location"]//text()[1]').extract_first(),
'FT/PT:' : houses.xpath('.//span[@class="data_item"]//text()[1]').extract_first(),
'Link' : houses.xpath('/html/body/main/div/section/div/div[2]/div/div[2]/div[1]/article[4]/div[1]/button[1]/text()').extract_first(),
'Link' : houses.xpath('.//a/@href[1]').extract_first(),
'pay' : houses.xpath('./section[@class="perks_item"]/span[@class="data_item"]//text()[1]').extract_first()
}
import scrapy
from scrapy.selector import Selector
from urlparse import urlparse
from urlparse import urljoin
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
#from scrappy.http import HtmlResponse
class MySpider(CrawlSpider):
name = "ziprecruiter"
def start_requests(self):
allowed_domains = ["https://www.ziprecruiter.com/"]
urls = [
'https://www.ziprecruiter.com/candidate/search?search=operations+manager&location=San+Francisco%2C+CA'
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
for houses in response.xpath('/html/body/main/div/section/div/div[2]/div/div[2]/div[1]/article[1]/div[2]'):
yield {
'Job_title:' : houses.xpath('.//span[@class="just_job_title"]//text()').extract(),
'Company:' : houses.xpath('.//a[@class="t_org_link name"]//text()').extract(),
'Location:' : houses.xpath('.//a[@class="t_location_link location"]//text()').extract(),
'FT/PT:' : houses.xpath('.//span[@class="data_item"]//text()').extract(),
'Link' : houses.xpath('.//a/@href').extract(),
'pay' : houses.xpath('./section[@class="perks_item"]/span[@class="data_item"]//text()').extract()
}
在我看来,您应该使用以下xpath:
//div[@class="job_content"]
因为这就是你要找的那个部门。当我为这个页面执行它时,返回了20个div元素。但是,您可能需要在xpath查询中添加一些过滤,以防有其他具有该类名的div不需要解析。我只从您提供的xpath中获得一个div元素,这解释了houses循环只执行一次的原因。在我看来,您的xpath似乎有问题。谢谢!我刚换成这个。看起来这很有帮助。我会很快做更多的测试,但我认为这就是解决办法!抱歉打扰了,我在另一个爬虫上也遇到了同样的问题。你能给我链接到关于查找“成功”容器的更多信息吗?我不确定你所说的容器是什么意思。我的意思是,在这个例子中,我将如何为允许的区域找到正确的xpath,你是如何给我//div[@class=job\u content“]。因为我还是新手,这有点让人困惑,似乎无法在谷歌搜索中提出正确的问题:/