Node.js——健壮的HTML解析+对HTML中javascript函数的访问

Node.js——健壮的HTML解析+对HTML中javascript函数的访问,javascript,parsing,node.js,jsdom,Javascript,Parsing,Node.js,Jsdom,我是node的新手,希望从以下示例页面中提取javascript信息: engined.html: <html> <head> <title> This is a contrived example </title> <script type="text/javascript"> var filenames = new Array

我是node的新手,希望从以下示例页面中提取javascript信息:

contrived.html:

<html>
    <head>
        <title>
            This is a contrived example          
        </title>
        <script type="text/javascript">
    // Global array of image URLs; each assignment below appends one entry
    // by writing to index `filenames.length`.
    var filenames = new Array()
    filenames[filenames.length] = "http://domainwhatever.s3.amazonaws.com/780BONNYVILLECOLDLAKECHRYSLER/4431716.jpg";
    filenames[filenames.length] = "http://domainwhatever.s3.amazonaws.com/780BONNYVILLECOLDLAKECHRYSLER/4431716_1.jpg";
    filenames[filenames.length] = "http://domainwhatever.s3.amazonaws.com/780BONNYVILLECOLDLAKECHRYSLER/4431716_2.jpg";
    filenames[filenames.length] = "http://domainwhatever.s3.amazonaws.com/780BONNYVILLECOLDLAKECHRYSLER/4431716_3.jpg";
    filenames[filenames.length] = "http://domainwhatever.s3.amazonaws.com/780BONNYVILLECOLDLAKECHRYSLER/4431716_4.jpg";

      // Hides the element with id 'idLoading' (if present) and returns an
      // XML-like <pixplosion> configuration string listing image URLs.
      // NOTE(review): the double-quoted string literal below contains raw
      // line breaks with no backslash continuation, which is not valid
      // JavaScript; a strict parser is expected to reject this script.
      function pixplosion_Content()
      {
        var eElement = document.getElementById('idLoading');
        if( eElement ) eElement.style.display = 'none';

        return "<pixplosion test=\"test\" flashGasket=\"http://www.realelivepeople.com/pixplosion/assets/flashGasket.swf?contentPath=\" ytBridge=\"/images/image.php?pixplosion=ytbridge\"><tab test=\"test\" label=\"Photos (%1)\" icon=\"Image\" autoIterate=\"false\"   ><tab test=\"test\" label=\"Vehicle Photos (%1)\" icon=\"\" autoIterate=\"true\" startFocused=\"true\"  >
    <image>http://s3.domainwhatever_2.com/vehicles/photos/3726/1300025.jpg</image>
    <image>http://s3.domainwhatever_2.com/vehicles/photos/3726/1300025/35102537.jpg</image>
    <image>http://s3.domainwhatever_2.com/vehicles/photos/3726/1300025/35102538.jpg</image>
    <image>http://s3.domainwhatever_2.com/vehicles/photos/3726/1300025/35102539.jpg</image>
    <image>http://s3.domainwhatever_2.com/vehicles/photos/3726/1300025/35102540.jpg</image>
    <image>http://s3.domainwhatever_2.com/vehicles/photos/3726/1300025/35102541.jpg</image>
    <image>http://s3.domainwhatever_2.com/vehicles/photos/3726/1300025/35102542.jpg</image>
    <image>http://s3.domainwhatever_2.com/vehicles/photos/3726/1300025/35102543.jpg</image><image>http://s3.domainwhatever_2.com/vehicles/photos/3726/1300025/35102544.jpg</image><image>http://s3.domainwhatever_2.com/vehicles/photos/3726/1300025/35102545.jpg</image><image>http://s3.domainwhatever_2.com/vehicles/photos/3726/1300025/35102546.jpg</image></tab></tab></pixplosion>";
      }

        </script>

    </head>
    <body>
    </body>
</html>
使用默认解析器时,会给我一个数组

使用HTML5解析器时,我得到的是:

undefined
这是我的代码:

// Third-party DOM implementation, Node's file-system module, and the
// third-party HTML5 parser used in place of jsdom's default parser.
var jsdom = require("jsdom"),
    fs = require('fs'),
    HTML5 = require('html5');

// Reads contrived.html as UTF-8, builds a jsdom document/window from it,
// injects a jQuery <script> element, and logs two globals that the page's
// inline script is expected to define.
fs.readFile('contrived.html', 'utf-8', function(err, data) {
  if (err) {
    throw err;
  }
  // Build the document from the file contents using the HTML5 parser.
  var document = jsdom.jsdom(data, null, {parser: HTML5});

  // HTML data should be in document creation call
  var script = document.createElement("script");
  // HTML data SHOULD NOT be in window creation call
  var window = document.createWindow();
  // Feed the same markup to an HTML5 parser bound to the window's document.
  // NOTE(review): `data` was already passed to jsdom.jsdom() above, so it is
  // handed to a parser twice — confirm this is intended.
  var parser = new HTML5.Parser({document: window.document});
  parser.parse(data);

  script.src = 'http://code.jquery.com/jquery-1.4.2.js';
  // NOTE(review): the parameter named `window` shadows the outer `window`
  // inside this handler, so the lookups below read properties of whatever
  // argument the loader passes (presumably an event object — confirm),
  // not of the jsdom window created above.
  script.onload = function(window) {
                      console.log('this is a test');
                      console.log(window.filenames);
                      console.log(window.pixplosion_Content);
  }
  // Attaching the element to <head> is what makes the script part of the
  // document; the onload handler above is set before this point.
  document.head.appendChild(script);
});
我是否遗漏了什么,还是此功能根本不可用?


非常感谢。

看起来 pixplosion_Content 返回的是一个多行字符串?这似乎不对。您需要使用 \ 来转义换行符。另外,我认为 onload 处理程序不会收到 window,因此声明该参数会遮蔽外层的 window 变量。试试 script.onload = function() {…} 吧?谢谢你的回答,Linus。“contrived.html”中的代码不是我的,所以我无法清理它,至少不能直接清理。至于 window,我检查过了,可以在 window 上运行 jQuery 函数,没有任何问题,只是在获取 javascript 数据结构时遇到了问题。@LinusGThiel:我的道歉和感谢。关于“window”你是对的。然而,它仍然会产生同样的问题。啊,那不是你的代码,我也是这么想的。那太糟糕了!好在 Aria 的 html5 解析器能接受它,这对您来说是好事。尝试将 onload 处理程序放在 document 上?+1,因为你把它发到了 Elance 上,所以……
// Third-party DOM implementation, Node's file-system module, and the
// third-party HTML5 parser used in place of jsdom's default parser.
var jsdom = require("jsdom"),
    fs = require('fs'),
    HTML5 = require('html5');

/**
 * Reads contrived.html as UTF-8, builds a jsdom document/window from it,
 * injects a jQuery <script> element, and, once that script has loaded,
 * logs the page-defined globals `filenames` and `pixplosion_Content`.
 *
 * Throws the read error from inside the callback if the file cannot be read.
 */
fs.readFile('contrived.html', 'utf-8', function(err, data) {
  if (err) {
    throw err;
  }
  var document = jsdom.jsdom(data, null, {parser: HTML5});

  // HTML data should be in document creation call
  var script = document.createElement("script");
  // HTML data SHOULD NOT be in window creation call
  var window = document.createWindow();
  var parser = new HTML5.Parser({document: window.document});
  parser.parse(data);

  script.src = 'http://code.jquery.com/jquery-1.4.2.js';
  // The handler deliberately takes no `window` parameter: a parameter with
  // that name would shadow the jsdom window created above, and the lookups
  // below would then read from the handler's argument instead of the page's
  // global object.
  script.onload = function() {
    console.log('this is a test');
    console.log(window.filenames);
    console.log(window.pixplosion_Content);
  };
  // Assign onload before attaching the element so the handler is in place
  // when loading starts.
  document.head.appendChild(script);
});