使用phantomJS抓取javascript网页

使用phantomJS抓取javascript网页,javascript,phantomjs,Javascript,Phantomjs,我正试图用phantomJS刮取一个动态网页。下面是我正在努力获取的代码和url。该代码适用于其他URL,但此URL始终作为空白html文档返回。有人知道如何解决这个问题吗 我对javascript不太熟悉,所以这段代码是从其他地方复制的。我已经将超时时间从2.5秒增加到了30秒,但这并没有产生任何影响 var url ='https://www.amazon.com/gp/profile/amzn1.account.AFJ6MBZ5CSY4R6K4USNMQ7JWEQCA/'; var pag

我正试图用 PhantomJS 抓取一个动态网页。下面是我使用的代码和想要抓取的 URL。这段代码对其他 URL 有效,但对这个 URL 总是返回空白的 HTML 文档。有人知道如何解决这个问题吗?

我对javascript不太熟悉,所以这段代码是从其他地方复制的。我已经将超时时间从2.5秒增加到了30秒,但这并没有产生任何影响

// Target page to fetch, the PhantomJS page object used to load it, and the
// filesystem module used later to write the result to disk.
var url ='https://www.amazon.com/gp/profile/amzn1.account.AFJ6MBZ5CSY4R6K4USNMQ7JWEQCA/';
var page = new WebPage();
var fs = require('fs');


// Load the page. Only when the load succeeded is the delayed dump scheduled;
// a failed load is reported and ends the run with a non-zero exit code
// instead of waiting and then writing an empty document.
page.open(url, function (status) {
    if (status !== 'success') {
        console.log('Failed to load ' + url + ' (status: ' + status + ')');
        phantom.exit(1);
        return;
    }
    just_wait();
});

/**
 * Waits for a fixed delay, then writes the current page markup to
 * 'page.html' and terminates PhantomJS.
 *
 * @param {number} [delayMs=30000] Milliseconds to wait before dumping the
 *     page. Anything that is not a non-negative number falls back to the
 *     30000 ms default, so existing `just_wait()` calls behave as before.
 */
function just_wait(delayMs) {
    var waitMs = (typeof delayMs === 'number' && delayMs >= 0) ? delayMs : 30000;
    setTimeout(function() {
        // page.content is read only when the timer fires, so it reflects
        // whatever the page has rendered up to that moment.
        fs.write('page.html', page.content, 'w');
        phantom.exit();
    }, waitMs);
}

我就是这样解决这些问题的

app.js

// Target page to load and dump.
var url ='https://www.amazon.com/gp/profile/amzn1.account.AFJ6MBZ5CSY4R6K4USNMQ7JWEQCA/';
// Ordered list of step functions; reassigned with the actual steps further down.
var steps=[];
// Index into `steps` of the next step to run.
var testindex = 0;
// Set to true while a page is still loading (toggled by the page load listeners).
var loadInProgress = false;

/*********SETTINGS*********************/
// Local settings module: supplies the viewport size and user-agent string used below.
var settings   = require('./settings');
var webPage = require('webpage');
var page = webPage.create();
var fs = require('fs');
// Present the desktop browser user agent from the settings module.
page.settings.userAgent = settings.userAgents.desktop;
page.settings.javascriptEnabled = true;
// Per the original author, the script is much faster with image loading turned off.
page.settings.loadImages = false;
phantom.cookiesEnabled = true;
// NOTE(review): `javascriptEnabled` is not among the documented properties of
// the `phantom` object -- confirm this assignment has any effect.
phantom.javascriptEnabled = true;

// Size the virtual browser window from the desktop viewport settings.
page.viewportSize = {
  width: settings.viewport.desktop.width,
  height: settings.viewport.desktop.height
};

/*********SETTINGS END*****************/
console.log('All settings Loaded, Start With Execution');
/**********DEFINE STEPS THAT PHANTOM SHOULD DO***********************/
// Actions to perform, in order. Each entry is a zero-argument function that is
// looked up by index and invoked by the step scheduler.
steps = [
    // Step 1: start loading the target URL. page.open is asynchronous, so this
    // function returns before the load completes.
    function(){
      console.log("Step 1 - Load Page => "+url);
      page.open(url, function(status){
        if(status === 'success'){
          console.log('Loaded');
        }else{
          console.log('Error Loading Page. Try Logging In Again');
          // Exit with a non-zero code so a calling shell or script can tell
          // that the load failed.
          phantom.exit(1);
        }
      });
    },
    // Step 2: render the loaded page to a PNG file.
    function(){
      page.render('./test.png');
    }
];

/**********END STEPS THAT PHANTOM SHOULD DO***********************/
// Timer that drives the step list: one tick every 3000 ms. Declared with
// `var` so it is not an implicit global.
var interval = setInterval(executeRequestsStepByStep, 3000);

/**
 * Runs once per timer tick.
 *
 * - While a page load is in progress, nothing happens: the next step is not
 *   started and the final dump is not written.
 * - Otherwise, if a step remains, it is invoked and the step index advances.
 * - Otherwise, the timer is stopped, the page markup is written to
 *   'page.html', and PhantomJS exits with code 0.
 */
function executeRequestsStepByStep(){
  // Wait out any load in flight so neither a step nor the final dump acts on
  // a page that is only partially loaded.
  if (loadInProgress) {
    return;
  }

  if (typeof steps[testindex] === "function") {
    steps[testindex]();
    testindex++;
    return;
  }

  console.log("Quiting");
  clearInterval(interval);
  fs.write('page.html', page.content, 'w');
  phantom.exit(0);
}
/*
 * Page listeners that maintain the loadInProgress flag.
 * The flag is raised when a load starts and lowered when it finishes, which
 * lets the step scheduler tell whether the page is still loading before it
 * runs the next step.
 */

page.onLoadStarted = function markLoadStarted() {
    loadInProgress = true;
};

page.onLoadFinished = function markLoadFinished() {
    loadInProgress = false;
};

// Console output from the page is deliberately discarded; log `msg` here
// when debugging the page's own scripts.
page.onConsoleMessage = function ignorePageConsole(msg) {
};

/**
 * Global PhantomJS error handler: prints the error message followed by one
 * line per stack frame (when a trace is supplied), then exits with code 1.
 *
 * @param {string} msg   Error message.
 * @param {Array}  trace Stack frames; each frame is read for `file` or
 *                       `sourceURL`, `line`, and optionally `function`.
 */
phantom.onError = function(msg, trace) {
  // Formats a single stack frame as " -> <location>: <line>[ (in function <name>)]".
  function describeFrame(frame) {
    var location = frame.file || frame.sourceURL;
    var owner = frame.function ? ' (in function ' + frame.function +')' : '';
    return ' -> ' + location + ': ' + frame.line + owner;
  }

  var report = ['PHANTOM ERROR: ' + msg];
  var hasTrace = trace && trace.length;
  if (hasTrace) {
    report.push('TRACE:');
    trace.forEach(function(frame) {
      report.push(describeFrame(frame));
    });
  }

  console.error(report.join('\n'));
  phantom.exit(1);
};
module.exports = {
  viewport: {
    desktop: {
      height: 663,
      width: 1200
    }
  },
  userAgents: {
    desktop: 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
  }
};
settings.js

// Target page to load and dump.
var url ='https://www.amazon.com/gp/profile/amzn1.account.AFJ6MBZ5CSY4R6K4USNMQ7JWEQCA/';
// Ordered list of step functions; reassigned with the actual steps further down.
var steps=[];
// Index into `steps` of the next step to run.
var testindex = 0;
// Set to true while a page is still loading (toggled by the page load listeners).
var loadInProgress = false;

/*********SETTINGS*********************/
// Local settings module: supplies the viewport size and user-agent string used below.
var settings   = require('./settings');
var webPage = require('webpage');
var page = webPage.create();
var fs = require('fs');
// Present the desktop browser user agent from the settings module.
page.settings.userAgent = settings.userAgents.desktop;
page.settings.javascriptEnabled = true;
// Per the original author, the script is much faster with image loading turned off.
page.settings.loadImages = false;
phantom.cookiesEnabled = true;
// NOTE(review): `javascriptEnabled` is not among the documented properties of
// the `phantom` object -- confirm this assignment has any effect.
phantom.javascriptEnabled = true;

// Size the virtual browser window from the desktop viewport settings.
page.viewportSize = {
  width: settings.viewport.desktop.width,
  height: settings.viewport.desktop.height
};

/*********SETTINGS END*****************/
console.log('All settings Loaded, Start With Execution');
/**********DEFINE STEPS THAT PHANTOM SHOULD DO***********************/
// Actions to perform, in order. Each entry is a zero-argument function that is
// looked up by index and invoked by the step scheduler.
steps = [
    // Step 1: start loading the target URL. page.open is asynchronous, so this
    // function returns before the load completes.
    function(){
      console.log("Step 1 - Load Page => "+url);
      page.open(url, function(status){
        if(status === 'success'){
          console.log('Loaded');
        }else{
          console.log('Error Loading Page. Try Logging In Again');
          // Exit with a non-zero code so a calling shell or script can tell
          // that the load failed.
          phantom.exit(1);
        }
      });
    },
    // Step 2: render the loaded page to a PNG file.
    function(){
      page.render('./test.png');
    }
];

/**********END STEPS THAT PHANTOM SHOULD DO***********************/
// Timer that drives the step list: one tick every 3000 ms. Declared with
// `var` so it is not an implicit global.
var interval = setInterval(executeRequestsStepByStep, 3000);

/**
 * Runs once per timer tick.
 *
 * - While a page load is in progress, nothing happens: the next step is not
 *   started and the final dump is not written.
 * - Otherwise, if a step remains, it is invoked and the step index advances.
 * - Otherwise, the timer is stopped, the page markup is written to
 *   'page.html', and PhantomJS exits with code 0.
 */
function executeRequestsStepByStep(){
  // Wait out any load in flight so neither a step nor the final dump acts on
  // a page that is only partially loaded.
  if (loadInProgress) {
    return;
  }

  if (typeof steps[testindex] === "function") {
    steps[testindex]();
    testindex++;
    return;
  }

  console.log("Quiting");
  clearInterval(interval);
  fs.write('page.html', page.content, 'w');
  phantom.exit(0);
}
/*
 * Page listeners that maintain the loadInProgress flag.
 * The flag is raised when a load starts and lowered when it finishes, which
 * lets the step scheduler tell whether the page is still loading before it
 * runs the next step.
 */

page.onLoadStarted = function markLoadStarted() {
    loadInProgress = true;
};

page.onLoadFinished = function markLoadFinished() {
    loadInProgress = false;
};

// Console output from the page is deliberately discarded; log `msg` here
// when debugging the page's own scripts.
page.onConsoleMessage = function ignorePageConsole(msg) {
};

/**
 * Global PhantomJS error handler: prints the error message followed by one
 * line per stack frame (when a trace is supplied), then exits with code 1.
 *
 * @param {string} msg   Error message.
 * @param {Array}  trace Stack frames; each frame is read for `file` or
 *                       `sourceURL`, `line`, and optionally `function`.
 */
phantom.onError = function(msg, trace) {
  // Formats a single stack frame as " -> <location>: <line>[ (in function <name>)]".
  function describeFrame(frame) {
    var location = frame.file || frame.sourceURL;
    var owner = frame.function ? ' (in function ' + frame.function +')' : '';
    return ' -> ' + location + ': ' + frame.line + owner;
  }

  var report = ['PHANTOM ERROR: ' + msg];
  var hasTrace = trace && trace.length;
  if (hasTrace) {
    report.push('TRACE:');
    trace.forEach(function(frame) {
      report.push(describeFrame(frame));
    });
  }

  console.error(report.join('\n'));
  phantom.exit(1);
};
module.exports = {
  viewport: {
    desktop: {
      height: 663,
      width: 1200
    }
  },
  userAgents: {
    desktop: 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
  }
};

我已经用这个进行了测试,效果很好。

好的。这样确实能把网页下载下来,但其中并不包含我想抓取的“社区活动”部分的内容。不过这肯定比我之前更接近目标了。——把 interval = setInterval(executeRequestsStepByStep, 3000); 这一行的时间间隔调大,比如改成 20000,所有内容就都会被加载。如果你在生成的 HTML 页面中搜索,会发现所有内容都在里面,只是出于某种原因这些数据被隐藏了。——我现在看到了,谢谢!——不客气。如果这解决了你的问题,请给这个答案投票,让其他人也能受益。祝好。