Javascript 使用nodejs和phantomjs动态抓取

Javascript 使用nodejs和phantomjs动态抓取,javascript,node.js,web-scraping,phantomjs,Javascript,Node.js,Web Scraping,Phantomjs,首先,我已经成功地安装了PhantomJs及其npm接口。我已经将代码设置为使用新语法加载我的页面(这里发布的所有其他问题都基于旧的代码语法,或者我遗漏了一些东西) 现在,右边的边栏,一个在“Comune”附近有假的select,另一个是动态生成的,我不明白为什么phantomjs没有拾取它们。我的代码如下: var sito = "http://bicincitta.tobike.it/"; var sitepage = null; var phInstance = null; var pha

首先,我已经成功地安装了PhantomJs及其npm接口。我已经将代码设置为使用新语法加载我的页面(这里发布的所有其他问题都基于旧的代码语法,或者我遗漏了一些东西)

现在,右边的边栏,一个在“Comune”附近有假的select,另一个是动态生成的,我不明白为什么phantomjs没有拾取它们。我的代码如下:

// Load a page through the `phantom` npm bridge and print its HTML.
const sito = "http://bicincitta.tobike.it/";
// Kept at script scope so that later steps of the promise chain, and the
// error handler, can reach the page and the PhantomJS instance.
let sitepage = null;
let phInstance = null;
const phantom = require('phantom');

phantom.create()
    .then((instance) => {
        phInstance = instance;
        return instance.createPage();
    })
    .then((page) => {
        sitepage = page;
        return page.open(sito);
    })
    .then((status) => {
        console.log(status);
        // NOTE(review): the content is read as soon as open() resolves; markup
        // that the page's own scripts add afterwards may not be present yet.
        return sitepage.property('content');
    })
    .then((content) => {
        console.log(content);
        sitepage.close();
        phInstance.exit();
    })
    .catch((error) => {
        console.log(error);
        // phInstance is still null when phantom.create() itself rejected.
        // Calling exit() on null would throw inside this handler and turn the
        // real failure into an unhandled rejection.
        if (phInstance) {
            phInstance.exit();
        }
    });
我现在急得直想拿头撞墙。我是否应该以某种方式获取站点的脚本并执行它们?还是我遗漏了某条指令?


另外,顺便提一句:如果 page 的作用域只在第二个“.then”回调之内,我还不清楚该如何把其他方法链接到这个 page 上。

在过去的一周里,我一直在使用 PhantomJS,试图让它对包含 Angular 渲染数据的页面进行快照。我发现最简单的方法是对任何本地脚本使用
page.injectJs('../script.js')
,以及
page.includeJs('http://jquery.com...')
用于任何外部脚本。因为Phantom是沙盒,所以它不会在它捕获的页面上执行javascript,除非您给它JS来执行。这将允许您截屏一个页面,该页面包含用javascript呈现的数据。

HTML 底部有一段 CDATA 脚本,phantom 无法解析该脚本。那些项目正是由这段脚本填充的

<script type="text/javascript">
//<![CDATA[
// Inline startup script quoted from the scraped page.
// Sys, $create, $get, Telerik and OnClientClosePortal are not defined in this
// snippet; presumably they are provided by other scripts on the page.
// After Sys.Application.initialize(), each add_init callback calls $create for
// one Telerik.Web.UI type (RadAjaxManager, three RadAjaxPanel instances,
// RadWindow, RadWindowManager), passing a settings object and the element
// returned by $get(<id>).
// NOTE(review): presumably these callbacks run during client-side application
// init -- confirm against the ASP.NET AJAX documentation.
Sys.Application.initialize();
Sys.Application.add_init(function() {
    $create(Telerik.Web.UI.RadAjaxManager, {"_updatePanels":"","ajaxSettings":[],"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"defaultLoadingPanelID":"","enableAJAX":true,"enableHistory":false,"links":[],"styles":[],"uniqueID":"RadAjaxManager1","updatePanelsRenderMode":0}, null, null, $get("RadAjaxManager1"));
});
Sys.Application.add_init(function() {
    $create(Telerik.Web.UI.RadAjaxPanel, {"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"enableAJAX":true,"enableHistory":false,"links":[],"loadingPanelID":"","styles":[],"uniqueID":"ajCheckLoginUser"}, null, null, $get("ajCheckLoginUser"));
});
Sys.Application.add_init(function() {
    $create(Telerik.Web.UI.RadAjaxPanel, {"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"enableAJAX":true,"enableHistory":false,"links":[],"loadingPanelID":"","styles":[],"uniqueID":"ajCheckLoginAdmin"}, null, null, $get("ajCheckLoginAdmin"));
});
Sys.Application.add_init(function() {
    $create(Telerik.Web.UI.RadAjaxPanel, {"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"enableAJAX":true,"enableHistory":false,"links":[],"loadingPanelID":"","styles":[],"uniqueID":"ajLogoutUser"}, null, null, $get("ajLogoutUser"));
});
Sys.Application.add_init(function() {
    // The only $create call here that passes an events object (third argument): "close" -> OnClientClosePortal.
    $create(Telerik.Web.UI.RadWindow, {"_dockMode":false,"behaviors":0,"clientStateFieldID":"radPortal_ClientState","destroyOnClose":true,"formID":"form1","height":"180px","iconUrl":"","left":"","minimizeIconUrl":"","modal":true,"name":"radPortal","reloadOnShow":true,"showContentDuringLoad":false,"skin":"Office2007","top":"","visibleStatusbar":false,"width":"450px"}, {"close":OnClientClosePortal}, null, $get("radPortal"));
});
Sys.Application.add_init(function() {
    // Passes {"child":"radPortal"} as the fourth argument, naming the RadWindow created above.
    $create(Telerik.Web.UI.RadWindowManager, {"behaviors":4,"clientStateFieldID":"windowManagerPortal_ClientState","destroyOnClose":true,"formID":"form1","iconUrl":"","left":"","minimizeIconUrl":"","modal":true,"name":"windowManagerPortal","reloadOnShow":true,"showContentDuringLoad":false,"skin":"Office2007","top":"","visibleStatusbar":false,"windowControls":"['radPortal']"}, null, {"child":"radPortal"}, $get("windowManagerPortal"));
    });
//]]>
</script>

//
一旦您离开与此站点服务器的通信,这些项目也将被销毁。有一些方法可以解决这个问题,但我认为你最好尝试其他方法。我使用npm cheerio加载CDATA html