Python Scrapy not loading the whole page? Or do I have bad code...


I'm running into a pagination problem with the code below.

The spider starts, but it doesn't find any links on the first page. That's because the page actually returns partial results... I know it sounds strange, but it's true: when I visit the page I see the jobs listed, but when the bot visits, no jobs are listed.

As far as I knew, Scrapy loads the whole page, JS or AJAX included, but I'm starting to have my doubts...
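
One quick way to check what Scrapy actually receives is Scrapy's interactive shell (a sketch; the XPath is the same one parse_listings uses below):

scrapy shell "https://ngc.taleo.net/careersection/ngc_pro/jobsearch.ftl?lang=en"
>>> response.xpath('//th/div/div/span/a/@href').extract()
[]
# an empty list here means the job links are not in the HTML Scrapy gets;
# they are filled in client-side by JavaScript, which Scrapy does not run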

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http.request import Request
from northrop.items import NorthropItem

class NorthropSpider(CrawlSpider):
  name = "northropJobStart"

  start_urls = ['https://ngc.taleo.net/careersection/ngc_pro/jobsearch.ftl?lang=en#']
  allowed_domains = ["ngc.taleo.net"]

  rules = (
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="next"]/a',)), callback="parse_listings", follow= True),
  )

  def parse_start_url(self, response):
    return self.parse_listings(response)

  def parse_listings(self, response):
    sel = Selector(response)
    # There are no jobs listed.. I am lost.....
    jobs = sel.xpath('//th/div/div/span/a/@href').extract()
    for job_url in jobs:
      job_url = self.__normalise(job_url)
      job_url = self.__to_absolute_url(response.url,job_url)
      yield Request(job_url, callback=self.parse_details)

  def parse_details(self, response):
    sel = Selector(response)
    job = sel.xpath('//*[@id="mainbody-jobs"]')
    item = NorthropItem()
    # Populate job fields
    item['title'] = job.xpath('//*[@id="mainbody-jobs"]/h1/text()').extract()
    item['location'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[3]/div[2]/text()').extract()
    item['applink'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[1]/a/@href').extract()
    item['description'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[2]/div[1]/div[2]').extract()
    item['travel'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[5]/div[2]/text()').extract()
    item['job_category'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[2]/div[2]/text()').extract()
    item['clearance_have'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[8]/div[2]/text()').extract()
    # NOTE: identical XPath to clearance_have -- one of the two almost
    # certainly needs a different div index
    item['clearance_get'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[8]/div[2]/text()').extract()
    item['job_number'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[1]/div[2]/text()').extract()
    item['page_url'] = response.url
    item = self.__normalise_item(item, response.url)
    return item



  def __normalise_item(self, item, base_url):
    '''
    Standardise and format item fields
    '''
    # Loop over the item fields to sanitise data and standardise data types
    for key in list(item.keys()):
      item[key] = self.__normalise(item[key])
    # Convert job URL from relative to absolute URL
    #item['job_url'] = self.__to_absolute_url(base_url, item['job_url'])
    return item

  def __normalise(self, value):
    # Convert list to string
    value = value if type(value) is not list else ' '.join(value)
    # Trim leading and trailing special characters (Whitespaces, newlines, spaces, tabs, carriage returns)
    value = value.strip()
    return value

  def __to_absolute_url(self, base_url, link):
    '''
    Convert relative URL to absolute URL
    '''
    from urllib.parse import urljoin
    link = urljoin(base_url, link)
    return link

  def __to_int(self, value):
    '''
    Convert value to integer type
    '''
    try:
      value = int(value)
    except ValueError:
      value = 0
    return value

  def __to_float(self, value):
    '''
    Convert value to float type
    '''
    try:
      value = float(value)
    except ValueError:
      value = 0.0
    return value

Unfortunately the search form is buried pretty deep, but you can see it if you inspect the Network tab in your browser. It turns out the page sends a JSON payload with a full set of default search parameters, so you just need to copy it and increment pageNo. I couldn't help solving this, and before I knew it I had written a whole spider, so here it is; let me know if any part is unclear:

import json
import scrapy


class TaleoSpider(scrapy.Spider):
    name = 'taleo'
    start_urls = ['https://ngc.taleo.net/careersection/ngc_pro/jobsearch.ftl?lang=en#']
    # baseform with base search values
    base_form = {'advancedSearchFiltersSelectionParam':
        {'searchFilterSelections': [
            {'id': 'ORGANIZATION', 'selectedValues': []},
            {'id': 'LOCATION', 'selectedValues': []},
            {'id': 'JOB_FIELD', 'selectedValues': []},
            {'id': 'URGENT_JOB', 'selectedValues': []},
            {'id': 'EMPLOYEE_STATUS', 'selectedValues': []},
            {'id': 'STUDY_LEVEL', 'selectedValues': []},
            {'id': 'WILL_TRAVEL', 'selectedValues': []},
            {'id': 'JOB_SHIFT', 'selectedValues': []},
            {'id': 'JOB_NUMBER', 'selectedValues': []}]},
        'fieldData': {'fields': {'JOB_TITLE': '', 'KEYWORD': '', 'LOCATION': ''},
                      'valid': True},
        'filterSelectionParam': {'searchFilterSelections': [{'id': 'POSTING_DATE',
                                                             'selectedValues': []},
                                                            {'id': 'LOCATION', 'selectedValues': []},
                                                            {'id': 'JOB_FIELD', 'selectedValues': []},
                                                            {'id': 'JOB_TYPE', 'selectedValues': []},
                                                            {'id': 'JOB_SCHEDULE', 'selectedValues': []},
                                                            {'id': 'JOB_LEVEL', 'selectedValues': []}]},
        'multilineEnabled': False,
        'pageNo': 1,  # <--- change this for pagination
        'sortingSelection': {'ascendingSortingOrder': 'false',
                             'sortBySelectionParam': '3'}}

    def parse(self, response):
        # we got cookies from first start url now lets request into the search api
        # copy base form for the first request
        form = self.base_form.copy()
        yield scrapy.Request('https://ngc.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=2160420105',
                             body=json.dumps(form),
                             # add headers to indicate we are sending a json package
                             headers={'Content-Type': 'application/json',
                                      'X-Requested-With': 'XMLHttpRequest'},
                             # scrapy.Request defaults to 'GET', but we want 'POST' here
                             method='POST',
                             # load our form into meta so we can reuse it later
                             meta={'form': form},
                             callback=self.parse_items)

    def parse_items(self, response):
        data = json.loads(response.body)
        # scrape data
        for item in data['requisitionList']:
            yield item

        # next page
        # get our form back and update the page number in it
        form = response.meta['form']
        form['pageNo'] += 1
        # check if paging is over: is our next page beyond the last page?
        # use ceiling division so a partially filled last page is still fetched
        paging = data['pagingData']
        max_page = -(-paging['totalCount'] // paging['pageSize'])
        if form['pageNo'] > max_page:
            return
        yield scrapy.Request('https://ngc.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=2160420105',
                             body=json.dumps(form),
                             headers={'Content-Type': 'application/json',
                                      'X-Requested-With': 'XMLHttpRequest'},
                             method='POST',
                             meta={'form': form},
                             callback=self.parse_items)
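
If you want to poke at the search endpoint before wiring it into a spider, the same POST can be reproduced outside Scrapy. A minimal sketch using the requests library, assuming the TaleoSpider class above is importable (otherwise paste the base_form dict in directly); the cookie-priming GET is an assumption about what the site expects:

import json
import requests

SEARCH_URL = ('https://ngc.taleo.net/careersection/rest/jobboard/searchjobs'
              '?lang=en&portal=2160420105')

session = requests.Session()
# hit the search page first so the session picks up any cookies it needs
session.get('https://ngc.taleo.net/careersection/ngc_pro/jobsearch.ftl?lang=en')

form = dict(TaleoSpider.base_form)  # shallow copy of the payload shown above
form['pageNo'] = 2                  # ask for the second page of results
resp = session.post(SEARCH_URL,
                    data=json.dumps(form),
                    headers={'Content-Type': 'application/json',
                             'X-Requested-With': 'XMLHttpRequest'})
data = resp.json()
print(data['pagingData']['totalCount'], len(data['requisitionList']))

Running the spider itself is just scrapy runspider taleo.py -o jobs.jl (assuming the spider is saved as taleo.py).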

Looks like you just parsed the JSON data of the search-results page. I'd need to grab the links out of that data and feed them into a loop that pulls the details from each job page. I'll work on this tonight and let you know how it goes. By the way, I've put this on GitHub in case more help is needed. I won't be able to get to it until tonight, once I'm off work. I actually think I can simply build the URL from the jobId field and send it into another def to grab the details I need... hopefully I can get to work on it now!
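
For reference, that could look something like the sketch below: build the detail URL from each requisition in the JSON and hand the listing along in meta. The 'jobId' key and the jobdetail.ftl URL pattern are assumptions that need checking against the actual JSON and the site's links:

    def parse_items(self, response):
        data = json.loads(response.body)
        for item in data['requisitionList']:
            # build the detail-page URL from the requisition's id
            # ('jobId' is a guess -- inspect the JSON for the real key)
            detail_url = ('https://ngc.taleo.net/careersection/ngc_pro/'
                          'jobdetail.ftl?lang=en&job=%s' % item['jobId'])
            yield scrapy.Request(detail_url,
                                 meta={'listing': item},
                                 callback=self.parse_details)
        # ... pagination requests as in the spider above ...

    def parse_details(self, response):
        listing = response.meta['listing']
        # enrich the listing with whatever the detail page exposes
        listing['description'] = ' '.join(
            response.xpath('//*[@id="mainbody-jobs"]//text()').extract()).strip()
        yield listing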