Warning: file_get_contents(/data/phpspider/zhask/data//catemap/9/javascript/368.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Javascript 如何在phantomjs中创建实体?_Javascript_Phantomjs - Fatal编程技术网

Javascript 如何在phantomjs中创建实体?

Javascript 如何在phantomjs中创建实体?,javascript,phantomjs,Javascript,Phantomjs,我正在尝试为谷歌搜索结果编写抓取程序。以下是我写的: var system = require('system'); var args = system.args; var webPage = require('webpage'); var page = webPage.create(); var useragent = []; useragent.push('Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KH

我正在尝试为谷歌搜索结果编写抓取程序。以下是我写的:

var system = require('system');
var args = system.args;
var webPage = require('webpage');
var page = webPage.create();

var useragent = [];
useragent.push('Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25');
useragent.push('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0');
useragent.push('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36 OPR/27.0.1689.76');
useragent.push('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36');
useragent.push('Mozilla/5.0 (Windows NT 6.3; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0');

page.settings.userAgent = useragent[Math.floor(Math.random() * useragent.length)]; 

var request = "search?q=",
    newPage = "&start=",
    localInfo,
    depth;

var gUrl = 'http://google.com/',
    yaUrl = 'http://yahoo.com/',
    url = '';

var searchPages = [],
    links;

    request += args[1];
    url += gUrl + request;
    depth = args[2];

    function pageHandler(uri) {
        page.open(uri, function (status) {
            if (status === 'success') {

                page.injectJs('./libs/jquery-2.1.3.min.js');

                links = page.evaluate(function () {
                    return $("li.g h3 a").map(function () {
                        return this.href;
                    }).get();
                });
                localInfo = page.evaluate(function() {
                   return $("#swml_addr").text();
                });
                console.log(links.join('\n'));
                console.log(links.length);
                console.log(localInfo);
                setTimeout(nextPage, 1000);
            }
        });
    }

    function prepareSearchPages() {
        for (var numPage = 0; numPage < depth; numPage++) {
            url += newPage+10*numPage;
            searchPages.push(url);
            url = url.substr(0,url.indexOf(newPage));
        }
        nextPage();
    }

    var nextPage = function() {
        var file = searchPages.shift();
        if(!file) phantom.exit();
        pageHandler(file);
    };


    prepareSearchPages();
很可能所有人都认为这看起来很可怕。但效果很好。所以,我决定为搜索引擎分配一个单独的实体

var webPage = require('webpage');
var page = webPage.create();


function searchEngine(engConfig) {
    var _engineUrl = engConfig.rootDomain;
    var _engineRequest = engConfig.requestPrefix;
    var _engineNewPage = engConfig.newPagePrefix;
    var _linkWrapperSelector = engConfig.linkWrapperSelector;
    var _locSelector = engConfig.locSelector;
    var _localInfo;
    var _searchPagesUrls = [];
    var _resultLinks;

    var pageHandler = function(uri) {
        page.open(uri, function (status) {
            if (status === 'success') {

                page.injectJs('./libs/jquery-2.1.3.min.js');

                _resultLinks = page.evaluate(function(_linkWrapperSelector) {
                    return $(_linkWrapperSelector).map(function () {
                        return this.href;
                    }).get();
                });
                _localInfo = page.evaluate(function(_locSelector) {
                    return $(_locSelector).text();
                });
                console.log(_resultLinks.join('\n'));
                console.log(_resultLinks.length);
                console.log(_localInfo);

                setTimeout(nextPage, 1000);
            }
        });
    };

    var nextPage = function() {
        var file = _searchPagesUrls.shift();
        if(!file) phantom.exit();
        pageHandler(file);
    };

    this.runSearch = function(keyPhrase, depthSearch) {
        var url = _engineUrl+_engineRequest+keyPhrase;
        for (var numPage = 0; numPage < depthSearch; numPage++) {
            url += _engineNewPage+10*numPage;
            _searchPagesUrls.push(url);
            url = url.substr(0,url.indexOf(_engineNewPage));
        }
        nextPage();
    };

    this.showLinks = function() {
        return _resultLinks.join('\n');
    };


}

var googleOptions = {
    rootDomain: 'http://google.ru/',
    requestPrefix: 'search?q=',
    newPagePrefix: '&start=',
    linkWrapperSelector: 'li.g h3 a',
    locSelector: '#swml_addr'
};

var useragent = [];
useragent.push('Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25');
useragent.push('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0');
useragent.push('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36 OPR/27.0.1689.76');
useragent.push('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36');
useragent.push('Mozilla/5.0 (Windows NT 6.3; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0');

page.settings.userAgent = useragent[Math.floor(Math.random() * useragent.length)];

var google = new searchEngine(googleOptions);

google.runSearch('Hello', 1);
不幸的是,它不起作用。我不明白为什么。也许我搞砸了范围界定

请注意,此代码的第一个版本工作正常,并在控制台中显示所有链接。第二个版本的代码只输出0,但传递给pageHandler函数的uri是正确的。甚至不显示“未定义”或类似的内容。

page.evaluate是沙盒页面上下文。它无法访问外部定义的变量。您必须显式地将_linkWrapperSelector传递给它:

这同样适用于_locSelector:

_resultLinks = page.evaluate(function(_linkWrapperSelector) {
    return $(_linkWrapperSelector).map(function () {
        return this.href;
    }).get();
}, _linkWrapperSelector); // this here
_localInfo = page.evaluate(function(_locSelector) {
    return $(_locSelector).text();
}, _locSelector); // this here