Javascript 选择器解析html时的Cheerio问题
标签:javascript, jquery, node.js, selector, cheerio
网页位于 http://www.gatherproxy.com/proxylist/anonymity/?t=Elite 。我想得到这个元素,它的选择器是 #tblproxy>tbody>tr.loading-row。我在谷歌浏览器的控制台上也试过同样的选择器。我的代码如下:
// Downloads the proxy-list page and prints the class attribute of the row
// matched by the selector under discussion.
var request = require('request');
var cheerio = require('cheerio');
request('http://www.gatherproxy.com/proxylist/anonymity/?t=Elite', function (error, response, html) {
  // Report failures explicitly instead of silently printing nothing.
  if (error) {
    console.error('Request failed: ' + error.message);
    return;
  }
  if (response.statusCode !== 200) {
    console.error('Unexpected HTTP status: ' + response.statusCode);
    return;
  }
  // cheerio parses the HTML string exactly as it was downloaded; it does not
  // run the page's scripts.
  var $ = cheerio.load(html);
  var temp = $('#tblproxy tbody tr.loading-row');
  // attr() yields undefined when the selection is empty.
  console.log(temp.attr('class'));
});
但是它在 cheerio 环境中不起作用,程序输出的是 undefined,有人知道原因吗?
回答:从页面的源代码来看,在
#tblproxy
中没有 tbody
,因此请将它从选择器中删除,即改用 $('#tblproxy tr.loading-row')。作为对照,下面是提问者在浏览器控制台中使用原选择器时得到的输出:
var s = $('#tblproxy > tbody > tr.loading-row')
undefined
s.attr('class')
"loading-row"
更新
根据 bublik42 的评论,如果 tbody 有时存在、有时不存在,您可以使用 find():
var temp = $('#tblproxy').find('tr.loading-row');
从页面的源代码来看,在
#tblproxy
中没有 tbody
,因此请将它从选择器中删除,即改用 $('#tblproxy tr.loading-row')。作为对照,下面是提问者在浏览器控制台中使用原选择器时得到的输出:
var s = $('#tblproxy > tbody > tr.loading-row')
undefined
s.attr('class')
"loading-row"
更新
根据 bublik42 的评论,如果 tbody 有时存在、有时不存在,您可以使用 find():
var temp = $('#tblproxy').find('tr.loading-row');
我注意到您试图查询的元素
tbody
是异步加载的。这超出了 request
模块的功能范围。您可以使用 PhantomJS 以无头(headless)方式模拟网页,并通过它的 webpage 模块获取 html。如果您想创建更多自定义的网页模块,可以参考 PhantomJS 的文档。
您也可以 fork 这个仓库(原文中的链接已丢失)。
首先,创建一个网页模块来获取特定页面的html
phantom/request.js
// Query the row with find() so it is matched as any descendant of #tblproxy,
// with or without an intermediate <tbody>.
// NOTE(review): this line uses `$`, which none of the scripts that follow
// define; it appears to be the find() example from the earlier answer —
// confirm placement.
var temp = $('#tblproxy').find('tr.loading-row');
// PhantomJS page script: opens the URL passed as the first command-line
// argument and prints the loaded document's HTML to stdout.
'use strict';
var page = require('webpage').create();
var system = require('system');

// system.args[0] is the script itself; the URL must follow it.
if (system.args.length < 2) {
  system.stderr.writeLine('Usage: phantomjs request.js <url>');
  phantom.exit(1);
} else {
  page.open(system.args[1], function(status) {
    // page.open reports 'success' or 'fail'; do not print markup for a page
    // that never loaded, and signal the failure through stderr + exit code.
    if (status !== 'success') {
      system.stderr.writeLine('Unable to load ' + system.args[1]);
      phantom.exit(1);
      return;
    }
    // evaluate() runs the function inside the page and returns its result.
    console.log(page.evaluate(function() {
      return document.documentElement.innerHTML;
    }));
    phantom.exit();
  });
}
// Node wrapper around the PhantomJS command-line binary. The module itself is
// a function that runs the binary; one extra method is attached per script
// found in the sibling ../phantom directory.
'use strict';
var path = require('path');
var spawn = require('child_process').spawn;
var phantomjs = require('phantomjs');
var fs = require('fs');
var binPath = phantomjs.path;
var slice = Array.prototype.slice;
// Directory that holds the PhantomJS page scripts.
var phantomPath = path.join(
  __dirname,
  '..',
  'phantom'
);

/**
 * Runs the PhantomJS binary.
 *
 * All arguments except the last are passed to the binary; the last argument
 * is a callback `(err, output)`. The callback is invoked exactly once, after
 * the process has closed its streams: with `{ message }` when the process
 * could not be started, wrote to stderr, or exited with a non-zero code,
 * otherwise with `null` and everything written to stdout as one string.
 */
exports = module.exports = function() {
  var args = slice.call(arguments);
  var callback = args.pop();
  var command = spawn(binPath, args);
  var stdoutChunks = [];
  var stderrChunks = [];
  var finished = false;

  // Guard so that 'error' followed by 'close' cannot call back twice.
  function finish(err, output) {
    if (finished) {
      return;
    }
    finished = true;
    callback(err, output);
  }

  // 'data' may fire many times for one page; collect the chunks instead of
  // handing partial output to the caller.
  command.stdout.on('data', function(data) {
    stdoutChunks.push(data);
  });
  command.stderr.on('data', function(data) {
    stderrChunks.push(data);
  });
  command.on('error', function(err) {
    finish({ message: err.message }, null);
  });
  command.on('close', function(code) {
    var errorOutput = Buffer.concat(stderrChunks).toString();
    if (errorOutput) {
      finish({ message: errorOutput }, null);
    } else if (code !== 0) {
      finish({ message: 'phantomjs exited with code ' + code }, null);
    } else {
      finish(null, Buffer.concat(stdoutChunks).toString());
    }
  });
};

// Create one method per script in the ./phantom directory, named after the
// file without its ".js" extension. forEach is used because nothing is being
// accumulated; a reduce whose callback returns nothing loses its accumulator
// after the first file.
fs.readdirSync(phantomPath).forEach(function(filename) {
  var index = path.basename(filename, '.js');
  exports[index] = function() {
    exports.apply(null, [path.join(phantomPath, filename)].concat(slice.call(arguments)));
  };
});
其次,为
phantom
目录中的所有网页模块创建一个phantomjs cli包装器
lib/phantom.js
// Query the row with find() so it is matched as any descendant of #tblproxy,
// with or without an intermediate <tbody>.
// NOTE(review): this line uses `$`, which none of the scripts that follow
// define; it appears to be the find() example from the earlier answer —
// confirm placement.
var temp = $('#tblproxy').find('tr.loading-row');
// PhantomJS page script: opens the URL passed as the first command-line
// argument and prints the loaded document's HTML to stdout.
'use strict';
var page = require('webpage').create();
var system = require('system');

// system.args[0] is the script itself; the URL must follow it.
if (system.args.length < 2) {
  system.stderr.writeLine('Usage: phantomjs request.js <url>');
  phantom.exit(1);
} else {
  page.open(system.args[1], function(status) {
    // page.open reports 'success' or 'fail'; do not print markup for a page
    // that never loaded, and signal the failure through stderr + exit code.
    if (status !== 'success') {
      system.stderr.writeLine('Unable to load ' + system.args[1]);
      phantom.exit(1);
      return;
    }
    // evaluate() runs the function inside the page and returns its result.
    console.log(page.evaluate(function() {
      return document.documentElement.innerHTML;
    }));
    phantom.exit();
  });
}
// Node wrapper around the PhantomJS command-line binary. The module itself is
// a function that runs the binary; one extra method is attached per script
// found in the sibling ../phantom directory.
'use strict';
var path = require('path');
var spawn = require('child_process').spawn;
var phantomjs = require('phantomjs');
var fs = require('fs');
var binPath = phantomjs.path;
var slice = Array.prototype.slice;
// Directory that holds the PhantomJS page scripts.
var phantomPath = path.join(
  __dirname,
  '..',
  'phantom'
);

/**
 * Runs the PhantomJS binary.
 *
 * All arguments except the last are passed to the binary; the last argument
 * is a callback `(err, output)`. The callback is invoked exactly once, after
 * the process has closed its streams: with `{ message }` when the process
 * could not be started, wrote to stderr, or exited with a non-zero code,
 * otherwise with `null` and everything written to stdout as one string.
 */
exports = module.exports = function() {
  var args = slice.call(arguments);
  var callback = args.pop();
  var command = spawn(binPath, args);
  var stdoutChunks = [];
  var stderrChunks = [];
  var finished = false;

  // Guard so that 'error' followed by 'close' cannot call back twice.
  function finish(err, output) {
    if (finished) {
      return;
    }
    finished = true;
    callback(err, output);
  }

  // 'data' may fire many times for one page; collect the chunks instead of
  // handing partial output to the caller.
  command.stdout.on('data', function(data) {
    stdoutChunks.push(data);
  });
  command.stderr.on('data', function(data) {
    stderrChunks.push(data);
  });
  command.on('error', function(err) {
    finish({ message: err.message }, null);
  });
  command.on('close', function(code) {
    var errorOutput = Buffer.concat(stderrChunks).toString();
    if (errorOutput) {
      finish({ message: errorOutput }, null);
    } else if (code !== 0) {
      finish({ message: 'phantomjs exited with code ' + code }, null);
    } else {
      finish(null, Buffer.concat(stdoutChunks).toString());
    }
  });
};

// Create one method per script in the ./phantom directory, named after the
// file without its ".js" extension. forEach is used because nothing is being
// accumulated; a reduce whose callback returns nothing loses its accumulator
// after the first file.
fs.readdirSync(phantomPath).forEach(function(filename) {
  var index = path.basename(filename, '.js');
  exports[index] = function() {
    exports.apply(null, [path.join(phantomPath, filename)].concat(slice.call(arguments)));
  };
});
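最后,使用
lib/phantom.js
导出的 request
方法获取 html 页面(该方法以 phantom/request.js 的文件名命名,最后一个参数是回调函数),例如:require('./lib/phantom').request(url, function (err, html) { /* 在这里用 cheerio.load(html) 解析 */ });
index.js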
// Query the row with find() so it is matched as any descendant of #tblproxy,
// with or without an intermediate <tbody>.
// NOTE(review): this line uses `$`, which none of the scripts that follow
// define; it appears to be the find() example from the earlier answer —
// confirm placement.
var temp = $('#tblproxy').find('tr.loading-row');
// PhantomJS page script: opens the URL passed as the first command-line
// argument and prints the loaded document's HTML to stdout.
'use strict';
var page = require('webpage').create();
var system = require('system');

// system.args[0] is the script itself; the URL must follow it.
if (system.args.length < 2) {
  system.stderr.writeLine('Usage: phantomjs request.js <url>');
  phantom.exit(1);
} else {
  page.open(system.args[1], function(status) {
    // page.open reports 'success' or 'fail'; do not print markup for a page
    // that never loaded, and signal the failure through stderr + exit code.
    if (status !== 'success') {
      system.stderr.writeLine('Unable to load ' + system.args[1]);
      phantom.exit(1);
      return;
    }
    // evaluate() runs the function inside the page and returns its result.
    console.log(page.evaluate(function() {
      return document.documentElement.innerHTML;
    }));
    phantom.exit();
  });
}
// Node wrapper around the PhantomJS command-line binary. The module itself is
// a function that runs the binary; one extra method is attached per script
// found in the sibling ../phantom directory.
'use strict';
var path = require('path');
var spawn = require('child_process').spawn;
var phantomjs = require('phantomjs');
var fs = require('fs');
var binPath = phantomjs.path;
var slice = Array.prototype.slice;
// Directory that holds the PhantomJS page scripts.
var phantomPath = path.join(
  __dirname,
  '..',
  'phantom'
);

/**
 * Runs the PhantomJS binary.
 *
 * All arguments except the last are passed to the binary; the last argument
 * is a callback `(err, output)`. The callback is invoked exactly once, after
 * the process has closed its streams: with `{ message }` when the process
 * could not be started, wrote to stderr, or exited with a non-zero code,
 * otherwise with `null` and everything written to stdout as one string.
 */
exports = module.exports = function() {
  var args = slice.call(arguments);
  var callback = args.pop();
  var command = spawn(binPath, args);
  var stdoutChunks = [];
  var stderrChunks = [];
  var finished = false;

  // Guard so that 'error' followed by 'close' cannot call back twice.
  function finish(err, output) {
    if (finished) {
      return;
    }
    finished = true;
    callback(err, output);
  }

  // 'data' may fire many times for one page; collect the chunks instead of
  // handing partial output to the caller.
  command.stdout.on('data', function(data) {
    stdoutChunks.push(data);
  });
  command.stderr.on('data', function(data) {
    stderrChunks.push(data);
  });
  command.on('error', function(err) {
    finish({ message: err.message }, null);
  });
  command.on('close', function(code) {
    var errorOutput = Buffer.concat(stderrChunks).toString();
    if (errorOutput) {
      finish({ message: errorOutput }, null);
    } else if (code !== 0) {
      finish({ message: 'phantomjs exited with code ' + code }, null);
    } else {
      finish(null, Buffer.concat(stdoutChunks).toString());
    }
  });
};

// Create one method per script in the ./phantom directory, named after the
// file without its ".js" extension. forEach is used because nothing is being
// accumulated; a reduce whose callback returns nothing loses its accumulator
// after the first file.
fs.readdirSync(phantomPath).forEach(function(filename) {
  var index = path.basename(filename, '.js');
  exports[index] = function() {
    exports.apply(null, [path.join(phantomPath, filename)].concat(slice.call(arguments)));
  };
});
我注意到您试图查询的元素
tbody
是异步加载的。这超出了 request
模块的功能范围。您可以使用 PhantomJS 以无头(headless)方式模拟网页,并通过它的 webpage 模块获取 html。如果您想创建更多自定义的网页模块,可以参考 PhantomJS 的文档。
您也可以 fork 这个仓库(原文中的链接已丢失)。
首先,创建一个网页模块来获取特定页面的html
phantom/request.js
// Query the row with find() so it is matched as any descendant of #tblproxy,
// with or without an intermediate <tbody>.
// NOTE(review): this line uses `$`, which none of the scripts that follow
// define; it appears to be the find() example from the earlier answer —
// confirm placement.
var temp = $('#tblproxy').find('tr.loading-row');
// PhantomJS page script: opens the URL passed as the first command-line
// argument and prints the loaded document's HTML to stdout.
'use strict';
var page = require('webpage').create();
var system = require('system');

// system.args[0] is the script itself; the URL must follow it.
if (system.args.length < 2) {
  system.stderr.writeLine('Usage: phantomjs request.js <url>');
  phantom.exit(1);
} else {
  page.open(system.args[1], function(status) {
    // page.open reports 'success' or 'fail'; do not print markup for a page
    // that never loaded, and signal the failure through stderr + exit code.
    if (status !== 'success') {
      system.stderr.writeLine('Unable to load ' + system.args[1]);
      phantom.exit(1);
      return;
    }
    // evaluate() runs the function inside the page and returns its result.
    console.log(page.evaluate(function() {
      return document.documentElement.innerHTML;
    }));
    phantom.exit();
  });
}
// Node wrapper around the PhantomJS command-line binary. The module itself is
// a function that runs the binary; one extra method is attached per script
// found in the sibling ../phantom directory.
'use strict';
var path = require('path');
var spawn = require('child_process').spawn;
var phantomjs = require('phantomjs');
var fs = require('fs');
var binPath = phantomjs.path;
var slice = Array.prototype.slice;
// Directory that holds the PhantomJS page scripts.
var phantomPath = path.join(
  __dirname,
  '..',
  'phantom'
);

/**
 * Runs the PhantomJS binary.
 *
 * All arguments except the last are passed to the binary; the last argument
 * is a callback `(err, output)`. The callback is invoked exactly once, after
 * the process has closed its streams: with `{ message }` when the process
 * could not be started, wrote to stderr, or exited with a non-zero code,
 * otherwise with `null` and everything written to stdout as one string.
 */
exports = module.exports = function() {
  var args = slice.call(arguments);
  var callback = args.pop();
  var command = spawn(binPath, args);
  var stdoutChunks = [];
  var stderrChunks = [];
  var finished = false;

  // Guard so that 'error' followed by 'close' cannot call back twice.
  function finish(err, output) {
    if (finished) {
      return;
    }
    finished = true;
    callback(err, output);
  }

  // 'data' may fire many times for one page; collect the chunks instead of
  // handing partial output to the caller.
  command.stdout.on('data', function(data) {
    stdoutChunks.push(data);
  });
  command.stderr.on('data', function(data) {
    stderrChunks.push(data);
  });
  command.on('error', function(err) {
    finish({ message: err.message }, null);
  });
  command.on('close', function(code) {
    var errorOutput = Buffer.concat(stderrChunks).toString();
    if (errorOutput) {
      finish({ message: errorOutput }, null);
    } else if (code !== 0) {
      finish({ message: 'phantomjs exited with code ' + code }, null);
    } else {
      finish(null, Buffer.concat(stdoutChunks).toString());
    }
  });
};

// Create one method per script in the ./phantom directory, named after the
// file without its ".js" extension. forEach is used because nothing is being
// accumulated; a reduce whose callback returns nothing loses its accumulator
// after the first file.
fs.readdirSync(phantomPath).forEach(function(filename) {
  var index = path.basename(filename, '.js');
  exports[index] = function() {
    exports.apply(null, [path.join(phantomPath, filename)].concat(slice.call(arguments)));
  };
});
其次,为
phantom
目录中的所有网页模块创建一个phantomjs cli包装器
lib/phantom.js
// Query the row with find() so it is matched as any descendant of #tblproxy,
// with or without an intermediate <tbody>.
// NOTE(review): this line uses `$`, which none of the scripts that follow
// define; it appears to be the find() example from the earlier answer —
// confirm placement.
var temp = $('#tblproxy').find('tr.loading-row');
// PhantomJS page script: opens the URL passed as the first command-line
// argument and prints the loaded document's HTML to stdout.
'use strict';
var page = require('webpage').create();
var system = require('system');

// system.args[0] is the script itself; the URL must follow it.
if (system.args.length < 2) {
  system.stderr.writeLine('Usage: phantomjs request.js <url>');
  phantom.exit(1);
} else {
  page.open(system.args[1], function(status) {
    // page.open reports 'success' or 'fail'; do not print markup for a page
    // that never loaded, and signal the failure through stderr + exit code.
    if (status !== 'success') {
      system.stderr.writeLine('Unable to load ' + system.args[1]);
      phantom.exit(1);
      return;
    }
    // evaluate() runs the function inside the page and returns its result.
    console.log(page.evaluate(function() {
      return document.documentElement.innerHTML;
    }));
    phantom.exit();
  });
}
// Node wrapper around the PhantomJS command-line binary. The module itself is
// a function that runs the binary; one extra method is attached per script
// found in the sibling ../phantom directory.
'use strict';
var path = require('path');
var spawn = require('child_process').spawn;
var phantomjs = require('phantomjs');
var fs = require('fs');
var binPath = phantomjs.path;
var slice = Array.prototype.slice;
// Directory that holds the PhantomJS page scripts.
var phantomPath = path.join(
  __dirname,
  '..',
  'phantom'
);

/**
 * Runs the PhantomJS binary.
 *
 * All arguments except the last are passed to the binary; the last argument
 * is a callback `(err, output)`. The callback is invoked exactly once, after
 * the process has closed its streams: with `{ message }` when the process
 * could not be started, wrote to stderr, or exited with a non-zero code,
 * otherwise with `null` and everything written to stdout as one string.
 */
exports = module.exports = function() {
  var args = slice.call(arguments);
  var callback = args.pop();
  var command = spawn(binPath, args);
  var stdoutChunks = [];
  var stderrChunks = [];
  var finished = false;

  // Guard so that 'error' followed by 'close' cannot call back twice.
  function finish(err, output) {
    if (finished) {
      return;
    }
    finished = true;
    callback(err, output);
  }

  // 'data' may fire many times for one page; collect the chunks instead of
  // handing partial output to the caller.
  command.stdout.on('data', function(data) {
    stdoutChunks.push(data);
  });
  command.stderr.on('data', function(data) {
    stderrChunks.push(data);
  });
  command.on('error', function(err) {
    finish({ message: err.message }, null);
  });
  command.on('close', function(code) {
    var errorOutput = Buffer.concat(stderrChunks).toString();
    if (errorOutput) {
      finish({ message: errorOutput }, null);
    } else if (code !== 0) {
      finish({ message: 'phantomjs exited with code ' + code }, null);
    } else {
      finish(null, Buffer.concat(stdoutChunks).toString());
    }
  });
};

// Create one method per script in the ./phantom directory, named after the
// file without its ".js" extension. forEach is used because nothing is being
// accumulated; a reduce whose callback returns nothing loses its accumulator
// after the first file.
fs.readdirSync(phantomPath).forEach(function(filename) {
  var index = path.basename(filename, '.js');
  exports[index] = function() {
    exports.apply(null, [path.join(phantomPath, filename)].concat(slice.call(arguments)));
  };
});
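最后,使用
lib/phantom.js
导出的 request
方法获取 html 页面(该方法以 phantom/request.js 的文件名命名,最后一个参数是回调函数),例如:require('./lib/phantom').request(url, function (err, html) { /* 在这里用 cheerio.load(html) 解析 */ });
index.js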
// Query the row with find() so it is matched as any descendant of #tblproxy,
// with or without an intermediate <tbody>.
// NOTE(review): this line uses `$`, which none of the scripts that follow
// define; it appears to be the find() example from the earlier answer —
// confirm placement.
var temp = $('#tblproxy').find('tr.loading-row');
// PhantomJS page script: opens the URL passed as the first command-line
// argument and prints the loaded document's HTML to stdout.
'use strict';
var page = require('webpage').create();
var system = require('system');

// system.args[0] is the script itself; the URL must follow it.
if (system.args.length < 2) {
  system.stderr.writeLine('Usage: phantomjs request.js <url>');
  phantom.exit(1);
} else {
  page.open(system.args[1], function(status) {
    // page.open reports 'success' or 'fail'; do not print markup for a page
    // that never loaded, and signal the failure through stderr + exit code.
    if (status !== 'success') {
      system.stderr.writeLine('Unable to load ' + system.args[1]);
      phantom.exit(1);
      return;
    }
    // evaluate() runs the function inside the page and returns its result.
    console.log(page.evaluate(function() {
      return document.documentElement.innerHTML;
    }));
    phantom.exit();
  });
}
// Node wrapper around the PhantomJS command-line binary. The module itself is
// a function that runs the binary; one extra method is attached per script
// found in the sibling ../phantom directory.
'use strict';
var path = require('path');
var spawn = require('child_process').spawn;
var phantomjs = require('phantomjs');
var fs = require('fs');
var binPath = phantomjs.path;
var slice = Array.prototype.slice;
// Directory that holds the PhantomJS page scripts.
var phantomPath = path.join(
  __dirname,
  '..',
  'phantom'
);

/**
 * Runs the PhantomJS binary.
 *
 * All arguments except the last are passed to the binary; the last argument
 * is a callback `(err, output)`. The callback is invoked exactly once, after
 * the process has closed its streams: with `{ message }` when the process
 * could not be started, wrote to stderr, or exited with a non-zero code,
 * otherwise with `null` and everything written to stdout as one string.
 */
exports = module.exports = function() {
  var args = slice.call(arguments);
  var callback = args.pop();
  var command = spawn(binPath, args);
  var stdoutChunks = [];
  var stderrChunks = [];
  var finished = false;

  // Guard so that 'error' followed by 'close' cannot call back twice.
  function finish(err, output) {
    if (finished) {
      return;
    }
    finished = true;
    callback(err, output);
  }

  // 'data' may fire many times for one page; collect the chunks instead of
  // handing partial output to the caller.
  command.stdout.on('data', function(data) {
    stdoutChunks.push(data);
  });
  command.stderr.on('data', function(data) {
    stderrChunks.push(data);
  });
  command.on('error', function(err) {
    finish({ message: err.message }, null);
  });
  command.on('close', function(code) {
    var errorOutput = Buffer.concat(stderrChunks).toString();
    if (errorOutput) {
      finish({ message: errorOutput }, null);
    } else if (code !== 0) {
      finish({ message: 'phantomjs exited with code ' + code }, null);
    } else {
      finish(null, Buffer.concat(stdoutChunks).toString());
    }
  });
};

// Create one method per script in the ./phantom directory, named after the
// file without its ".js" extension. forEach is used because nothing is being
// accumulated; a reduce whose callback returns nothing loses its accumulator
// after the first file.
fs.readdirSync(phantomPath).forEach(function(filename) {
  var index = path.basename(filename, '.js');
  exports[index] = function() {
    exports.apply(null, [path.join(phantomPath, filename)].concat(slice.call(arguments)));
  };
});
评论:tbody 在页面源代码中,但它是 #tblproxy 中唯一的元素。Cheerio 忽略了它。
评论:奇怪的是,我刚刚检查了 Iron(Chromium 的一个分支)、IE11、Firefox 和 Cheerio 下载到的 html,都看不到 tbody。