Javascript Node Jsdom 抓取 Google 的反向图像搜索

Javascript Node Jsdom 抓取 Google 的反向图像搜索,javascript,node.js,httprequest,scrape,jsdom,Javascript,Node.js,Httprequest,Scrape,Jsdom,我想通过编程找到一个类似图片的URL列表,给出一个图片URL。我找不到任何免费的图像搜索API,所以我试图通过抓取谷歌的API来实现这一点 比如说,如果我有一个图像URL,那么导航到会给出相关的图像和信息 如何生成浏览器从上述URL获取的HTML 以下是我尝试过的代码(CoffeeScript): 你可以看到HTML与我们想要的不匹配。这是Jsdom的HTTP头的问题吗?问题是Jsdom的用户代理HTTP头。一旦设置好,一切(几乎)都能正常工作: 这给了我们一个机会。现在唯一的问题是Jsdom在返回结果后抛出错误:

我想通过编程找到一个类似图片的URL列表,给出一个图片URL。我找不到任何免费的图像搜索API,所以我试图通过抓取谷歌的API来实现这一点

比如说,如果我有一个图像URL,那么导航到会给出相关的图像和信息

如何生成浏览器从上述URL获取的HTML

以下是我尝试过的代码(CoffeeScript):


你可以看到HTML与我们想要的不匹配。这是Jsdom的HTTP头的问题吗?

问题是Jsdom的用户代理HTTP头。一旦设置好,一切(几乎)都能正常工作:

这给了我们一个机会。现在唯一的问题是Jsdom在返回结果后抛出错误:

timers.js:103
            if (!process.listeners('uncaughtException').length) throw e;
                                                                      ^
TypeError: Cannot call method 'call' of undefined
    at new <anonymous> (/project-root/node_modules/jsdom/lib/jsdom/browser/index.js:54:13)
    at _.Zl (https://www.google.com/xjs/_/js/s/c,sb,cr,cdos,jsa,ssb,sf,tbpr,tbui,rsn,qi,ob,mb,lc,hv,cfm,klc,kat,aut,esp,bihu,amcl,kp,lu,m,rtis,shb,sfa,hsm,pcc,csi/rt=j/ver=3w99aWPP0po.en_US./d=1/sv=1/rs=AItRSTPrAylXrfkOPyRRY-YioThBMqxW2A:1238:93)
    at _.jm (https://www.google.com/xjs/_/js/s/c,sb,cr,cdos,jsa,ssb,sf,tbpr,tbui,rsn,qi,ob,mb,lc,hv,cfm,klc,kat,aut,esp,bihu,amcl,kp,lu,m,rtis,shb,sfa,hsm,pcc,csi/rt=j/ver=3w99aWPP0po.en_US./d=1/sv=1/rs=AItRSTPrAylXrfkOPyRRY-YioThBMqxW2A:1239:399)
    at _.km (https://www.google.com/xjs/_/js/s/c,sb,cr,cdos,jsa,ssb,sf,tbpr,tbui,rsn,qi,ob,mb,lc,hv,cfm,klc,kat,aut,esp,bihu,amcl,kp,lu,m,rtis,shb,sfa,hsm,pcc,csi/rt=j/ver=3w99aWPP0po.en_US./d=1/sv=1/rs=AItRSTPrAylXrfkOPyRRY-YioThBMqxW2A:1241:146)
    at Object._onTimeout (https://www.google.com/xjs/_/js/s/c,sb,cr,cdos,jsa,ssb,sf,tbpr,tbui,rsn,qi,ob,mb,lc,hv,cfm,klc,kat,aut,esp,bihu,amcl,kp,lu,m,rtis,shb,sfa,hsm,pcc,csi/rt=j/ver=3w99aWPP0po.en_US./d=1/sv=1/rs=AItRSTPrAylXrfkOPyRRY-YioThBMqxW2A:1248:727)
    at Timer.list.ontimeout (timers.js:101:19)
timers.js:103
            if (!process.listeners('uncaughtException').length) throw e;
                                                                      ^
TypeError: Cannot call method 'call' of undefined
    at new <anonymous> (/project-root/node_modules/jsdom/lib/jsdom/browser/index.js:54:13)
    at _.Zl (https://www.google.com/xjs/_/js/s/c,sb,cr,cdos,jsa,ssb,sf,tbpr,tbui,rsn,qi,ob,mb,lc,hv,cfm,klc,kat,aut,esp,bihu,amcl,kp,lu,m,rtis,shb,sfa,hsm,pcc,csi/rt=j/ver=3w99aWPP0po.en_US./d=1/sv=1/rs=AItRSTPrAylXrfkOPyRRY-YioThBMqxW2A:1238:93)
    at _.jm (https://www.google.com/xjs/_/js/s/c,sb,cr,cdos,jsa,ssb,sf,tbpr,tbui,rsn,qi,ob,mb,lc,hv,cfm,klc,kat,aut,esp,bihu,amcl,kp,lu,m,rtis,shb,sfa,hsm,pcc,csi/rt=j/ver=3w99aWPP0po.en_US./d=1/sv=1/rs=AItRSTPrAylXrfkOPyRRY-YioThBMqxW2A:1239:399)
    at _.km (https://www.google.com/xjs/_/js/s/c,sb,cr,cdos,jsa,ssb,sf,tbpr,tbui,rsn,qi,ob,mb,lc,hv,cfm,klc,kat,aut,esp,bihu,amcl,kp,lu,m,rtis,shb,sfa,hsm,pcc,csi/rt=j/ver=3w99aWPP0po.en_US./d=1/sv=1/rs=AItRSTPrAylXrfkOPyRRY-YioThBMqxW2A:1241:146)
    at Object._onTimeout (https://www.google.com/xjs/_/js/s/c,sb,cr,cdos,jsa,ssb,sf,tbpr,tbui,rsn,qi,ob,mb,lc,hv,cfm,klc,kat,aut,esp,bihu,amcl,kp,lu,m,rtis,shb,sfa,hsm,pcc,csi/rt=j/ver=3w99aWPP0po.en_US./d=1/sv=1/rs=AItRSTPrAylXrfkOPyRRY-YioThBMqxW2A:1248:727)
    at Timer.list.ontimeout (timers.js:101:19)
对于这样的任务,我发现 request + cheerio 比 jsdom 更容易使用。我知道你已经找到了答案,但我想我会把它作为一个替代方案提出来

例如:

// Example: fetch Google's "search by image" results page with `request`
// and parse the returned HTML with `cheerio` (instead of jsdom).
var request = require('request'),
    cheerio = require('cheerio');

// Search endpoint and the image whose look-alikes we want to find.
var google = 'https://www.google.com/searchbyimage';
var image = 'http://i.imgur.com/oLmwq.png';

var options = {
  url: google,
  // Appended to the URL as the query string: ?image_url=<image>
  qs: { image_url: image },
  // Send a browser-like User-Agent; per the discussion above, the HTML
  // returned without one did not match what a browser receives.
  headers: { 'user-agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11' }
};

request(options, function (err, res, body) {
  // On a transport failure `body` is undefined; bail out instead of
  // handing it to cheerio.
  if (err) {
    console.error('Reverse image search request failed:', err);
    return;
  }

  var $ = cheerio.load(body);
  // … (elided in the original answer: query the parsed page through `$` here)
});

你实际得到的输出是什么?下面是答案。我正在寻找一个ID为#topstuff的元素,但它只包含与首页相关的信息,如“我感觉很幸运”。附带提示:如果您坚持使用jsdom,并且没有像@ximo建议的那样切换到cheerio,请不要忘了在处理页面后使用window.close(),以避免内存泄漏
timers.js:103
            if (!process.listeners('uncaughtException').length) throw e;
                                                                      ^
TypeError: Cannot call method 'call' of undefined
    at new <anonymous> (/project-root/node_modules/jsdom/lib/jsdom/browser/index.js:54:13)
    at _.Zl (https://www.google.com/xjs/_/js/s/c,sb,cr,cdos,jsa,ssb,sf,tbpr,tbui,rsn,qi,ob,mb,lc,hv,cfm,klc,kat,aut,esp,bihu,amcl,kp,lu,m,rtis,shb,sfa,hsm,pcc,csi/rt=j/ver=3w99aWPP0po.en_US./d=1/sv=1/rs=AItRSTPrAylXrfkOPyRRY-YioThBMqxW2A:1238:93)
    at _.jm (https://www.google.com/xjs/_/js/s/c,sb,cr,cdos,jsa,ssb,sf,tbpr,tbui,rsn,qi,ob,mb,lc,hv,cfm,klc,kat,aut,esp,bihu,amcl,kp,lu,m,rtis,shb,sfa,hsm,pcc,csi/rt=j/ver=3w99aWPP0po.en_US./d=1/sv=1/rs=AItRSTPrAylXrfkOPyRRY-YioThBMqxW2A:1239:399)
    at _.km (https://www.google.com/xjs/_/js/s/c,sb,cr,cdos,jsa,ssb,sf,tbpr,tbui,rsn,qi,ob,mb,lc,hv,cfm,klc,kat,aut,esp,bihu,amcl,kp,lu,m,rtis,shb,sfa,hsm,pcc,csi/rt=j/ver=3w99aWPP0po.en_US./d=1/sv=1/rs=AItRSTPrAylXrfkOPyRRY-YioThBMqxW2A:1241:146)
    at Object._onTimeout (https://www.google.com/xjs/_/js/s/c,sb,cr,cdos,jsa,ssb,sf,tbpr,tbui,rsn,qi,ob,mb,lc,hv,cfm,klc,kat,aut,esp,bihu,amcl,kp,lu,m,rtis,shb,sfa,hsm,pcc,csi/rt=j/ver=3w99aWPP0po.en_US./d=1/sv=1/rs=AItRSTPrAylXrfkOPyRRY-YioThBMqxW2A:1248:727)
    at Timer.list.ontimeout (timers.js:101:19)
// Example: fetch Google's "search by image" results page with `request`
// and parse the returned HTML with `cheerio` (instead of jsdom).
var request = require('request'),
    cheerio = require('cheerio');

// Search endpoint and the image whose look-alikes we want to find.
var google = 'https://www.google.com/searchbyimage';
var image = 'http://i.imgur.com/oLmwq.png';

var options = {
  url: google,
  // Appended to the URL as the query string: ?image_url=<image>
  qs: { image_url: image },
  // Send a browser-like User-Agent; per the discussion above, the HTML
  // returned without one did not match what a browser receives.
  headers: { 'user-agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11' }
};

request(options, function (err, res, body) {
  // On a transport failure `body` is undefined; bail out instead of
  // handing it to cheerio.
  if (err) {
    console.error('Reverse image search request failed:', err);
    return;
  }

  var $ = cheerio.load(body);
  // … (elided in the original answer: query the parsed page through `$` here)
});