Javascript NodeJS爬虫登录到站点_Javascript_Node.js_Login_Web Crawler

Javascript NodeJS爬虫登录到站点

javascript node.js login web-crawler

Javascript NodeJS爬虫登录到站点,javascript,node.js,login,web-crawler,Javascript,Node.js,Login,Web Crawler,我想抓取geocaching.com，但某些数据（如coords）仅适用于登录用户。我正在使用npm的“crawler”，但不知道如何使用crawler登录，不过我已经得到了登录表单的名称： ctl00$ContentBody$tbUsername:用户 ctl00$ContentBody$tbPassword:password ctl00$ContentBody$btnSignIn:“登录” 以下是我目前的代码： var Crawler = require("crawler"); v

我想抓取geocaching.com，但某些数据（如坐标coords）仅对已登录的用户可见。我正在使用npm的“crawler”包，但不知道如何用crawler登录；不过我已经拿到了登录表单的字段名称：

ctl00$ContentBody$tbUsername:用户
ctl00$ContentBody$tbPassword:password
ctl00$ContentBody$btnSignIn:“登录”

以下是我目前的代码：

var Crawler = require("crawler");
var url = require('url');
var mongoose = require("mongoose");
var Cache = require("./models/cache.js");

// Open the default mongoose connection to the "Cache" database on localhost:27017.
// NOTE(review): the connection string has no "mongodb://" scheme — confirm the
// installed mongoose version accepts this form.
mongoose.connect("localhost:27017/Cache");

/**
 * Removes every `<...>` sequence (anything that looks like a markup tag)
 * from the string form of the given value.
 *
 * @param {*} text - Value to clean; it is coerced with `String()` first.
 * @returns {string} The coerced string with all tag-like sequences removed.
 */
var removeTags = function (text) {
    var tagPattern = /(<([^>]+)>)/ig;
    var asString = String(text);
    var stripped = asString.replace(tagPattern, '');
    return stripped;
};
// Crawler instance: for every fetched page it (1) checks whether a geocache
// detail page is already stored and (2) queues every link found on HTML pages.
var c = new Crawler({
    maxConnections: 10,
    skipDuplicates: true,

    /**
     * Invoked by the crawler once per finished request.
     *
     * @param {?Error} error - Set when the request failed.
     * @param {Object} result - Response object; this code reads
     *     `result.request.uri.href` and `result.headers`.
     * @param {Function} $ - Selector function for the fetched document
     *     (used here as `$(selector)`, `.each`, `.attr`).
     */
    callback: function (error, result, $) {
        // A failed request has no usable response; bail out instead of
        // dereferencing `result` below and crashing the whole crawl.
        if (error || !result) {
            console.error("Crawl request failed:", error);
            return;
        }

        var pageUrl = result.request.uri.href;

        if (pageUrl.startsWith("http://www.geocaching.com/geocache/")) {
            var cache = new Cache();
            var id = removeTags($(".CoordInfoCode"));
            Cache.count({
                "_id": id
            }, function (err, count) {
                if (err) {
                    // Report the lookup failure rather than dropping it silently.
                    console.error("Cache lookup failed for " + id + ":", err);
                    return;
                }
                if (count < 1) {
                    //Saving the data
                }
            });
        }

        if (result.headers['content-type'] === "text/html; charset=utf-8") {
            // `.each` on an empty selection is a no-op, so no length check is needed.
            $('a').each(function (index, a) {
                var href = $(a).attr('href');
                if (!href) {
                    // Anchor without an href: nothing to follow.
                    return;
                }

                // Links are frequently relative; make them absolute against
                // the page they were found on before queueing.
                var toQueueUrl = url.resolve(pageUrl, href);
                if (!/^https?:\/\//i.test(toQueueUrl)) {
                    // Skip mailto:, javascript: and other non-fetchable links.
                    return;
                }

                // Defer queueing so it happens after the current callback returns.
                process.nextTick(function () {
                    c.queue(toQueueUrl);
                });
            });
        }
    }
});

// Seed the crawl with the first page to fetch; the crawler's callback queues
// further links from each HTML page it receives.
c.queue('http://www.geocaching.com/seek/nearest.aspx?ul=Die_3sten_3');

var Crawler=require（“Crawler”）；
var url=require（'url'）；
var mongoose=要求（“mongoose”）；
var Cache=require（“./models/Cache.js”）；
connect（“localhost:27017/Cache”）；
var removeTags=函数（文本）{
返回字符串（文本）。替换（/（]+）>）/ig'，）；
};
var c=新的爬虫程序({
最大连接数：10，
skipDuplicates:对，
回调：函数（错误，结果，$）{
if（result.request.uri.href.startsWith（）http://www.geocaching.com/geocache/")) {
var cache=new cache（）；
var id=removeTags（$（“.coordinfo”）；
Cache.count({
“_id”：id
}，函数（错误，计数）{
如果（错误）
返回；
否则如果（计数<1）{
//保存数据
}
});
}
if（result.headers['content-type']==“text/html；charset=utf-8”）{
如果（$（'a'）。长度！=0）{
$（'a'）。每个函数（索引，a）{
var toQueueUrl=$（a）.attr（'href'）；
process.nextTick（函数（）{
process.nextTick（函数（）{
c、 队列（toQueueUrl）；
})
});
});
}
}
}
});
c、 队列（'http://www.geocaching.com/seek/nearest.aspx?ul=Die_3sten_3');

我在github上制作了一个javascript爬虫示例

它是事件驱动的，并使用内存队列存储所有资源（即URL）

如何在Node.js环境中使用

// Minimal usage example: build a crawler for a start URL and start it.
var Crawler = require('../lib/crawler')
var crawler = new Crawler('http://www.someUrl.com');

// Optional settings, left disabled here. Their exact semantics are defined in
// ../lib/crawler, which is not shown — NOTE(review): confirm before enabling.
// crawler.maxDepth = 4;
// crawler.crawlInterval = 10;
// crawler.maxListenerCurrency = 10;
// crawler.redisQueue = true;
crawler.start();

这里我只是向您展示javascript爬虫的两个核心方法

/**
 * Starts the crawl loop.
 *
 * Marks the crawler as running and emits 'start' right away; on the next tick
 * it installs a repeating timer that calls `crawl()` every `crawlInterval`
 * and performs one immediate `crawl()`. The timer handle is stored on
 * `crawlerIntervalId`.
 */
Crawler.prototype.run = function() {
  const self = this;
  const crawlOnce = () => {
    self.crawl();
  };

  process.nextTick(() => {
    // Repeating loop; the handle is kept on the instance.
    self.crawlerIntervalId = setInterval(crawlOnce, self.crawlInterval);
    // Do not wait for the first interval to elapse.
    crawlOnce();
  });

  self.running = true;
  self.emit('start');
};


/**
 * Performs one step of the crawl loop.
 *
 * Does nothing when `_openRequests` has reached `maxListenerCurrency`.
 * Otherwise asks the queue for its oldest unfetched item and fetches it.
 * When there is no such item and no request is open, it compares the queue's
 * completed count with its length; if they match it emits "complete", clears
 * the interval stored in `crawlerIntervalId` and sets `running` to false.
 *
 * Errors reported by `queue.complete` / `queue.getLength` are thrown.
 */
Crawler.prototype.crawl = function() {
  const self = this;

  const atCapacity = self._openRequests >= self.maxListenerCurrency;
  if (atCapacity) {
    return;
  }

  // Stops the crawler once every queued item has been completed.
  const stopIfDrained = (doneCount) => {
    self.queue.getLength((lengthErr, total) => {
      if (lengthErr) {
        throw lengthErr;
      }
      if (total !== doneCount) {
        return;
      }
      // No open request and no unfetched item left: stop the crawler.
      self.emit("complete", doneCount);
      clearInterval(self.crawlerIntervalId);
      self.running = false;
    });
  };

  // Go get the next item.
  self.queue.oldestUnfetchedItem((lookupErr, item, position) => {
    if (item) {
      // Got an item: start the fetch.
      self.fetchQueueItem(item, position);
      return;
    }
    if (self._openRequests !== 0) {
      return;
    }
    self.queue.complete((completeErr, doneCount) => {
      if (completeErr) {
        throw completeErr;
      }
      stopIfDrained(doneCount);
    });
  });
};

这里是github链接。它是一个javascript网络爬虫，用1000行代码编写。这会让你走上正轨