Javascript 无法爬网ally.com_Javascript_Node.js_Node Modules

Javascript 无法爬网ally.com

javascript node.js

Javascript 无法爬网ally.com,javascript,node.js,node-modules,Javascript,Node.js,Node Modules,我可以在nature.com、flipkart.com等网站上爬行。它运行得很好。但当我尝试爬ally.com时，nike.com。它返回状态代码403并表示未定义。这是我的密码 // crawlerqueue.js var request = require('request'); var cheerio = require('cheerio'); var URL = require('url-parse'); var pa11y=require('pa11y'); var START_U

我可以在nature.com、flipkart.com等网站上爬行。它运行得很好。但当我尝试爬ally.com时，nike.com。它返回状态代码403并表示未定义。这是我的密码

// crawlerqueue.js

var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
var pa11y=require('pa11y');

var START_URL = "http://www.nature.com/";
//var SEARCH_WORD = "stemming";
var MAX_PAGES_TO_VISIT = 100;

var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
var url = new URL(START_URL);
var baseUrl = url.protocol + "//" + url.hostname;

pagesToVisit.push(START_URL);
crawl();

function crawl() {
  if(numPagesVisited >= MAX_PAGES_TO_VISIT) {
    console.log("Reached max limit of number of pages to visit.");
    return;
  }
  var nextPage = pagesToVisit.pop();
  if (nextPage in pagesVisited) {
    // We've already visited this page, so repeat the crawl
    crawl();
  } else {
    // New page we haven't visited
    visitPage(nextPage, crawl);
  }
}

function visitPage(url, callback) {
  // Add page to our set
  pagesVisited[url] = true;
  numPagesVisited++;

  // Make the request
  console.log("Visiting page " + url);
  request(url, function(error, response, body) {
     // Check status code (200 is HTTP OK)
     console.log("Status code: " + response.statusCode);
     if(response.statusCode !== 200) {
       callback();
       return;
     }
     // Parse the document body
     var $ = cheerio.load(body);
     /*var isWordFound = searchForWord($, SEARCH_WORD);
     if(isWordFound) {
       console.log('Word ' + SEARCH_WORD + ' found at page ' + url);
     } else*/ {
       collectInternalLinks($);
       // In this short program, our callback is just calling crawl()
       callback();
     }
  });
}

function searchForWord($, word) {
  var bodyText = $('html > body').text().toLowerCase();
  return(bodyText.indexOf(word.toLowerCase()) !== -1);
}

function collectInternalLinks($) {
    var relativeLinks = $("a[href^='/']");
    console.log("Found " + relativeLinks.length + " relative links on page");
    relativeLinks.each(function() {
        pagesToVisit.push(baseUrl + $(this).attr('href'));
    });
}

我通过命令行运行这段代码。nature.com的输出如下：

Visiting page http://www.nature.com/
Status code: 200
Found 23 relative links on page
Visiting page http://www.nature.com/scitable/sponsors
Status code: 200
Found 22 relative links on page
Visiting page http://www.nature.com/scitable/pressnews
Status code: 200
Found 26 relative links on page
Visiting page http://www.nature.com/scitable/contact
Status code: 200
Found 26 relative links on page
Visiting page http://www.nature.com/scitable/about
Status code: 200
Found 25 relative links on page
Visiting page http://www.nature.com/scitable/my-profile/social-settings
Status code: 200
Found 22 relative links on page
Visiting page http://www.nature.com/scitable/photocredit
Status code: 200
Found 22 relative links on page
Visiting page http://www.nature.com/scitable/presscontact
Status code: 200
Found 26 relative links on page
Visiting page http://www.nature.com/scitable/presskit
Status code: 200
Found 26 relative links on page
Visiting page http://www.nature.com/scitable/pressroom
Status code: 200
Found 26 relative links on page
Visiting page http://www.nature.com/scitable/sponsorship
Status code: 200
Found 22 relative links on page
Visiting page http://www.nature.com/scitable/topicpage/copy-number-
Status code: 200
Found 89 relative links on page
Reached max limit of number of pages to visit.

但当我尝试爬网nike.com或ally.com时，我会看到以下错误

Visiting page http://www.ally.com
Status code: 403
Visiting page undefined
C:\Users\dashboard-master\node_modules\request\index.js:45
    throw new Error('undefined is not a valid uri or options object.')
    ^

Error: undefined is not a valid uri or options object.
    at request (C:\Users\dashboard-master\node_modules\request\
index.js:45:11)
    at visitPage (C:\Users\dashboard-master\config\crawlqueue.j
s:41:3)
    at crawl (C:\Users\dashboard-master\config\crawlqueue.js:30
:5)
    at Request._callback (C:\Users\dashboard-master\config\crawlqueue.js:45:8)
    at Request.self.callback (C:\Users\dashboard-master\node_modules\request\request.js:188:22)
    at emitTwo (events.js:106:13)
    at Request.emit (events.js:191:7)
    at Request.<anonymous> (C:\Users\dashboard-master\node_modules\request\request.js:1171:10)
    at emitOne (events.js:96:13)
    at Request.emit (events.js:188:7)

访问页面http://www.ally.com
状态代码：403
访问页面未定义
C:\Users\dashboard master\node\u modules\request\index.js:45
抛出新错误（“undefined不是有效的uri或options对象”。）
^
错误：undefined不是有效的uri或选项对象。
请求时（C:\Users\dashboard master\node\u modules\request\
索引:js:45:11)
在visitPage（C:\Users\dashboard master\config\crawlqueue.j
s:41:3）
爬网时（C:\Users\dashboard master\config\crawlqueue.js:30
:5)
应请求。\回调（C:\Users\dashboard master\config\crawlqueue.js:45:8）
at Request.self.callback（C:\Users\dashboard master\node\u modules\Request\Request.js:188:22）
两点钟（events.js:106:13）
at Request.emit（events.js:191:7）
应要求。（C:\Users\dashboard master\node\u modules\request\request.js:1171:10）
在emitOne（events.js:96:13）
at Request.emit（events.js:188:7）

它返回状态代码403

ally.com

位于Akamai Ghost Server之后，Akamai以某种方式防止爬行，并向您提供错误参考。您可以在响应正文中检查此信息，或在X-reference-error下返回标题。对我来说，它看起来是这样的18.5fcxx917.148981xxxx.dacxsd6。如果您想深入挖掘，可以查看他们的API以转换错误引用

并且说没有定义

首先，在进行请求调用时检查错误。您直接检查的是

response.statusCode

，您不知道您得到的是响应还是未定义的值

在您的情况下，调用

crawl

函数，如果没有

成功，则返回执行，这意味着您将没有nextpage进行爬网

var nextPage = pagesToVisit.pop();

这里您弹出一个空数组（pagesToVisit是空的，因为您没有收集任何链接），因此

nextPage

将

未定义，然后您将与uri
相同的内容传递给请求模块，请求模块将抛出错误
只有当数组的长度大于0或检查下一页的值时，才能执行弹出操作
if(nextPage){
    if (nextPage in pagesVisited) {