Python 需要使用 scrapy 提取子页面的内容

我对 scrapy 还是个新手,但我已经做过一些简单的爬虫。
import scrapy
from ..items import remoteworkhub_jobs
class remoteworkhub(scrapy.Spider):
    """Crawl jobs.remoteworkhub.com: collect every job link from the
    listing page, follow each one, and yield a scraped item per job.
    """

    name = 'remoteworkhub'
    # BUG FIX: the original value ['www.remoteworkhub.com'] did not cover the
    # jobs.* subdomain, so every detail request was dropped by the offsite
    # middleware ("Filtered offsite request to 'jobs.remoteworkhub.com'").
    # Listing the bare registered domain allows all of its subdomains.
    allowed_domains = ['remoteworkhub.com']
    start_urls = ['https://jobs.remoteworkhub.com']

    def parse(self, response):
        """Extract each job-detail URL from the listing page and schedule
        a request for it, handled by :meth:`parsejobpage`.

        ``response.urljoin`` resolves relative *and* absolute hrefs
        correctly, replacing the hand-rolled base-URL concatenation.
        """
        links = response.xpath("//*[contains(@class,'job-listing')]//*[@class='jobList-title']/@href").extract()
        for href in links:
            yield scrapy.Request(response.urljoin(href), callback=self.parsejobpage)

    def parsejobpage(self, response):
        """Scrape a single job-detail page into a ``remoteworkhub_jobs`` item.

        Class-anchored XPaths replace the brittle absolute
        ``/html/body/div[4]/...`` paths, and ``.get()`` yields one string
        instead of the one-element list ``.extract()`` returns.
        """
        item = remoteworkhub_jobs()
        item['title'] = response.xpath("//*[@class='jobDetail-headerIntro']/h1/text()").get()
        # NOTE(review): company/category/worktype/description were extracted
        # but never stored in the original; add them here once the matching
        # fields exist on remoteworkhub_jobs.
        yield item
我正试图更进一步:先从一个页面获取所有链接,再抓取这些子页面的内容。我已经阅读了一些不同的示例和问答,但似乎无法让这些代码为我工作。
import scrapy
from ..items import remoteworkhub_jobs
class remoteworkhub(scrapy.Spider):
    """Spider that walks the job listing on jobs.remoteworkhub.com and
    visits every individual job page it links to.
    """

    name = 'remoteworkhub'
    allowed_domains = ['www.remoteworkhub.com']
    start_urls = ['https://jobs.remoteworkhub.com']

    def parse(self, response):
        # Queue one request per job-detail href found on the listing page;
        # each response is handled by parsejobpage().
        base_url = 'https://jobs.remoteworkhub.com'
        for href in response.xpath('//a[@class="jobList-title"]/@href').extract():
            yield scrapy.Request(base_url + href, callback=self.parsejobpage)

    def parsejobpage(self, response):
        # Pull the raw text nodes of interest from the detail page.
        titles = response.xpath('//h1[@class="u-mv--remove u-textH2"]/text()').extract()
        companys = response.xpath('/html/body/div[4]/div/div/div[1]/div[1]/div[1]/div[2]/div[2]/div/div[1]/strong/a/text()').extract()
        categories = response.xpath('/html/body/div[4]/div/div/div[1]/div[1]/div[1]/div[3]/ul/li/a/text()').extract()
        worktype = response.xpath('/html/body/div[4]/div/div/div[1]/div[1]/div[1]/div[5]/div[2]/span/text()').extract()
        job_decription = response.xpath('//div[@class="job-body"]//text()').extract()

        # Only the title is stored on the item for now; the other fields are
        # extracted but deliberately left unassigned.
        item = remoteworkhub_jobs()
        item['title'] = titles
        yield item
查看下面的实现,它应该允许您从该站点解析职务及其相关的公司名称。您定义XPath的方式很容易出错。但是,我已经修改了它们,以便它们能够以正确的方式工作。试一试:
import scrapy
class remoteworkhub(scrapy.Spider):
    """Scrape the job title and company name of every listing on
    jobs.remoteworkhub.com.
    """

    name = 'remoteworkhub'
    start_urls = ['https://jobs.remoteworkhub.com']

    def parse(self, response):
        # Follow every job link present on the listing page.
        job_links = response.xpath("//*[contains(@class,'job-listing')]//*[@class='jobList-title']/@href").extract()
        for link in job_links:
            yield scrapy.Request(response.urljoin(link), callback=self.parsejobpage)

    def parsejobpage(self, response):
        # Emit a plain dict per job; Scrapy accepts dicts as scraped items.
        yield {
            'title': response.xpath("//*[@class='jobDetail-headerIntro']/h1/text()").get(),
            'company': response.xpath("//*[@class='jobDetail-headerIntro']//strong//text()").get(),
        }
如果我使用打印而不是产量,我可以在控制台中看到这种输出:
{'title': 'Sr Full Stack Developer, Node/React - Remote', 'company': 'Clevertech'}
{'title': 'Subject Matter Expert, Customer Experience - Remote', 'company': 'Qualtrics'}
{'title': 'Employee Experience Enterprise Account Executive - Academic and Government - Remote', 'company': 'Qualtrics'}
{'title': 'Senior Solutions Consultant, Brand Experience - Remote', 'company': 'Qualtrics'}
{'title': 'Data Analyst - Remote', 'company': 'Railsware'}
{'title': 'Recruitment Manager - Remote', 'company': 'Railsware'}
评论:
— 感谢您对 XPath 的介绍。但当我运行上面的代码时出了问题,我能看到的唯一输出是:`[scrapy.spidermiddleware.offsite] DEBUG: Filtered offsite request to 'jobs.remoteworkhub.com'`,spider 在这行输出之后就结束了。
— 我想不出脚本在您那边会有其他行为的任何原因。您是按原样运行的吗?
— 太棒了!谢谢您的帮助。之前的错误是因为我忘了把设置改回来。