Javascript node.js/jquery上的简单scraper应用程序_Javascript_Jquery_Node.js

Javascript node.js/jquery上的简单scraper应用程序

javascript jquery node.js

Javascript node.js/jquery上的简单scraper应用程序,javascript,jquery,node.js,Javascript,Jquery,Node.js,我正在尝试使用jQuery在Node.js上构建一个小应用程序，它将请求所提供位置的内容。如果URL指向非HTML的资源，它将输出返回的内容类型。否则，它将读取HTML数据并输出页面标题和描述，然后是页面上的可单击链接列表。这是我的密码： var request = require('request'), jsdom = require('jsdom'), fs = require('

我正在尝试使用jQuery在Node.js上构建一个小应用程序，它将请求所提供位置的内容。如果URL指向非HTML的资源，它将输出返回的内容类型。否则，它将读取HTML数据并输出页面标题和描述，然后是页面上的可单击链接列表。这是我的代码：

var request             = require('request'),
    jsdom               = require('jsdom'),
    fs                  = require('fs'),
    url                 = require('url'),
    colors              = require('colors'),
    argv                = require('optimist').argv;
    
    // Register the theme names c1 (blue), c2 (red) and c3 (inverse) with the
    // `colors` module; the log calls in this file apply them as string
    // properties, e.g. 'text'.c1.
    colors.setTheme({ c1: 'blue', c2: 'red', c3: 'inverse' });

/**
 * Requests `link` and prints a short report about the response to the console.
 *
 * - On a request error or a non-200 status, an error line is logged.
 * - When the Content-Type response header names something other than HTML,
 *   only that content type is logged and the body is not parsed.
 * - Otherwise the body is loaded into jsdom with jQuery injected, and the page
 *   title, the meta description and the resolved href of every anchor are
 *   logged.
 *
 * @param {string} link - URL to request.
 * @param {Function} cb - Called with no arguments exactly once when reporting
 *     has finished, on both the success and the failure paths.
 */
var myFunc = function( link, cb ){

    console.log( 'requesting page: '.c3 + link.c3 );

    // Step 1 - request to the page
    request({
        uri: link
    }, function (err, response, body) {

        // Handle response issues (a missing response is treated as a failure
        // even when no error object was supplied).
        if ( err || !response || response.statusCode !== 200 ) {
            if ( !response ){
                console.log( 'Ooops! page doesn`t exist or wrong URL format'.c2 );
            } else {
                console.log( 'error: ' + response.statusCode );
            }
            cb();
            return;
        }

        console.log( 'response code: ' + response.statusCode );

        // Step 2 - decide from the Content-Type header whether the body is HTML.
        // A missing header falls back to attempting an HTML parse.
        var contentType = ( response.headers && response.headers['content-type'] ) || '';
        var isHtml = contentType === '' ||
            /^\s*(text\/html|application\/xhtml\+xml)\b/i.test( contentType );

        console.log( '\nThis page is:\n'.c1 + ( contentType || 'unknown' ) );

        if ( !isHtml ) {
            // Non-HTML resource: the content type is all there is to report.
            cb();
            return;
        }

        // Step 3 - invoking jsdom and jQuery
        jsdom.env({
            html: body,
            src: [
                fs.readFileSync(__dirname + "/lib/jquery-1.9.1.min.js").toString()
            ],
            done: function (domErr, window) {

                if ( domErr ) {
                    // Report the parse failure instead of finishing silently.
                    console.log( 'error: ' + domErr );
                    cb();
                    return;
                }

                var $ = window.$;
                var description = $('head meta[name="description"]').attr('content');

                // Step 4, final part - parse content with jQuery selectors
                console.log( '\nPage title: \n'.c1 + $('title').text().trim() );
                console.log( description !== undefined
                    ? '\nPage description: \n'.c1 + description + '\n'
                    : '\nPage description: \n'.c1 + 'No description on the page\n'.c2 );
                console.log( '\nClickable links on the page: \n'.c1 );

                $('a').each(function () {
                    var href = $(this).attr('href');
                    if ( href !== undefined ) {
                        // Resolve relative hrefs against the requested URL;
                        // absolute hrefs are returned unchanged.
                        console.log( url.resolve( link, href ) );
                    }
                });

                cb();
            }
        });
    });

};

因此，它可以完美地抓取HTML页面，但我不知道如何实现这一部分：

如果URL指向非HTML的资源，它将输出返回的内容类型

请分享一个如何做这部分的想法。提前谢谢

我从来没有使用过jQuery，而且我是一个JavaScript新手，但我会这样做：获取字符串形式的URL，删除最后一个点之前的所有内容（例如："http://example.com/index.php" => "php"），将其与"html"进行比较，然后打印出来，因为它就是内容类型。

编辑：

/**
 * Tells whether a URL's path ends in the `.html` file extension.
 *
 * The query string and fragment are ignored and the comparison is
 * case-insensitive. Note that this inspects the URL text only; it says
 * nothing about the content type a server actually returns.
 *
 * @param {string} url - URL to inspect.
 * @returns {boolean} true when the text after the last dot of the path is
 *     "html"; false otherwise, including for non-string input or a path
 *     with no dot.
 */
function validate (url) {
    if (typeof url !== 'string') {
        return false;
    }

    // Drop "?query" and "#fragment" so they are not mistaken for the extension.
    var path = url.split(/[?#]/)[0];
    var dotIndex = path.lastIndexOf('.');
    if (dotIndex === -1) {
        return false;
    }

    // +1 skips the dot itself; without it the result is ".html", never "html".
    return path.slice(dotIndex + 1).toLowerCase() === 'html';
}