JavaScript:使用 CasperJS 和 PhantomJS 抓取多个页面

JavaScript:使用 CasperJS 和 PhantomJS 抓取多个页面,javascript,phantomjs,casperjs,Javascript,Phantomjs,Casperjs,我正在尝试抓取一些具有标准格式的页面。我已经能够使用 PhantomJS 成功地抓取单个页面,但是当我尝试迭代多个页面时,异步处理会使程序挂起。让 Casper/Phantom 等待的正确方法是什么? var page = require('webpage').create(); var fs = require('fs'); page.onConsoleMessage = function(msg) { phantom.outputEncoding = "utf-8"; console.log(msg); }; // 这将覆盖上一个输出文件 f = fs.o…

我正在尝试抓取一些具有标准格式的页面。我已经能够使用 PhantomJS 成功地抓取单个页面,但是当我尝试迭代多个页面时,异步处理会使程序挂起。让 Casper/Phantom 等待的正确方法是什么?


var page=require('webpage').create();
var fs=需要('fs');
page.onConsolleMessage=函数(msg){
phantom.outpunecoding=“utf-8”;
控制台日志(msg);
};
//这将覆盖上一个输出文件
f=fs.open(“lat_long.txt”,“w”);
f、 写(“—”;
f、 close();
//这是位置的唯一标识符。现在,我只有三个数据点
var EPAID=[“KYD980501076”、“ME8170022018”、“MEN000103584”];
///此代码将用于在不同位置循环。现在,只看一个。
对于(q=0;q<1;q++){
var处理=假;
//我们构造目标url
变量url=”http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=“+EPAID[0]+”&fac\u search\u type=start+With&postal\u code=&location\u address=&add\u search\u type=start+With&city\u name=&country\u name=&state\u code=&program\u search=1&report=2&page\u no=1&output\u sql\u switch=TRUE&database\u type=CERCLIS”;
页面打开(url);
page.onLoadFinished=功能(状态){
如果(状态==“成功”){
第页,包括http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js“,函数(){
var str=page.evaluate(函数(){
$value=[];
$Object=$(“.result tr”);
对于(i=0;i<10;i++){
$value.push($Object.find('td').html(),$Object.find('td').next().next().html());
$Object=$Object.next();
} 
$string=“{EPAID:”+$value[0]+”,“+
“名称:”+$value[1]+“,“+
城市:“+$value[4]+”+
状态:“+$value[6]+”+
ZipCode:“+$value[8]+”,“+
纬度:“+$value[14]+”+
“经度:”+$value[16]+“}”;
返回$string;
});
f=fs.open(“lat_long.txt”,“a”);
f、 书写(str);
f、 close();
处理=真;
log(“写入文件”);
phantom.exit();
});
}
//就在这里,它应该延迟到上一页完成
//当(!处理){
//setTimeout(函数(){console.log(“waiting…”);},1000);
//    }
};
}
控制台日志(“完成所有页面”);

如果切换到使用casperJS,只需将
page.open()
更改为
page.thenOpen()
。(这个问题看起来很像你的?)

如果您想继续使用PhantomJS,您需要在上一次加载的onSuccess回调中启动下一次页面加载。这很乏味,需要小心避免占用大量内存。(我做了一两次,但现在只是简单地使用CasperJS。)


另一种方法是在循环中为每个 URL 创建一个新的
page
对象。然而,这并不能完全回答你的问题,因为这些页面将并行加载。但是,如果您有数百个 URL,可以使用
setTimeout
将各个 URL 的加载时间错开,以避免突发的大量请求

以下是最终有效的代码(使用超时方法,因为我无法让成功回调更好地工作)

安装了casperjs后,我将该文件命名为“process.js”,并能够从命令行将其运行为“casperjs process.js”


var page=require('webpage').create();
var fs=需要('fs');
page.onConsolleMessage=函数(msg){
phantom.outpunecoding=“utf-8”;
控制台日志(msg);
};
//这将覆盖以前的输出f
//这是位置的唯一标识符。
变量EPAID=[“NED981713837”、…、“FLD049985302”、“NJD986643153”];
f=fs.open(“lat_long.txt”,“w”);
f、 写(“—”;
f、 close();
var计数=0;
var目标=1400;
var writed=[];
函数yourFunction(){
如果(计数<目标){
进程(计数);
计数++;
设置超时(函数,5000);
}否则{
控制台日志(“退出”);
phantom.exit();
返回;
}    
}
函数进程(计数器){
var处理=假;
console.log(“开始记录#”+计数器);
//我们构造目标url
变量url=”http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=“+EPAID[counter]+”&fac\u search\u type=begin+With&postal\u code=&location\u address=&add\u search\u type=begin+With&city\u name=&country\u name=&state\u code=&program\u search=1&report=2&page\u no=1&output\u sql\u switch=TRUE&database\u type=CERCLIS”;
页面打开(url);
page.onLoadFinished=功能(状态){
如果(状态==“成功”){
第页,包括http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js“,函数(){
var str=page.evaluate(函数(){
$value=[];
$Object=$(“.result tr”);
对于(i=0;i<10;i++){
$value.push($Object.find('td').html(),$Object.find('td').next().next().html());
$Object=$Object.next();
} 
$string=“{\'EPAID\”:\”+$value[0]+“\”,“+
“\”名称\“:\”+$value[1]+“\”,“+
“\”城市\“:\”+$value[4]+“\”,”+
“\”状态\:\”+$value[6]+“\”,”+
“\”ZipCode\“:\”+$value[8]+“\”,“+
““纬度”:“+$value[14]+”+
“\”经度\:“+$value[16]+”}”;
返回$string;
// PhantomJS script: scrape EPA CERCLIS facility pages one after another and
// append one record per facility to lat_long.txt.
//
// Pages are processed strictly sequentially: the next page is only opened
// from inside the completion callback of the previous one, and PhantomJS is
// only told to exit once the whole chain has finished (or a page failed and
// the chain moved past it). A synchronous loop cannot do this, because
// page.open() returns before the page has loaded.
var page = require('webpage').create();
var fs = require('fs');

// Relay console output produced inside the loaded page to the terminal.
page.onConsoleMessage = function(msg) {
    phantom.outputEncoding = "utf-8";
    console.log(msg);
};

var OUTPUT_FILE = "lat_long.txt";
var JQUERY_URL = "http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js";

// this overwrites the previous output file
var f = fs.open(OUTPUT_FILE, "w");
f.write("--");
f.close();

// this is the unique identifier for the locations. For now, I just have three datapoints
var EPAID = ["KYD980501076","ME8170022018", "MEN000103584"];

// Number of EPAID entries to scrape. Kept at 1 to match the original
// "look at only one" setting; raise it (up to EPAID.length) to scrape more.
var pagesToScrape = 1;

/**
 * Build the CERCLIS query URL for a single facility.
 * @param {string} epaId - EPA site identifier taken from EPAID.
 * @returns {string} the target URL.
 */
function buildUrl(epaId) {
    return "http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=" + epaId + "&fac_search_type=Beginning+With&postal_code=&location_address=&add_search_type=Beginning+With&city_name=&county_name=&state_code=&program_search=1&report=2&page_no=1&output_sql_switch=TRUE&database_type=CERCLIS";
}

/**
 * Append one scraped record to the output file.
 * @param {string} text - record text returned from the page context.
 */
function appendRecord(text) {
    var out = fs.open(OUTPUT_FILE, "a");
    try {
        out.write(text);
    } finally {
        out.close();
    }
}

/**
 * Scrape EPAID[index], then continue with the next index; exits PhantomJS
 * once the configured number of pages has been handled.
 * @param {number} index - position in EPAID of the page to scrape next.
 */
function scrapeNext(index) {
    if (index >= Math.min(pagesToScrape, EPAID.length)) {
        console.log("finished all pages");
        phantom.exit();
        return;
    }

    // The callback form of page.open() is used so that completion of THIS
    // load drives the next step.
    page.open(buildUrl(EPAID[index]), function(status) {
        if (status !== "success") {
            // Do not hang forever on a failed load: report it and move on.
            console.log("failed to load page for " + EPAID[index] + " (status: " + status + ")");
            scrapeNext(index + 1);
            return;
        }

        page.includeJs(JQUERY_URL, function() {
            // This function runs inside the loaded page, so it can only use
            // what exists there (jQuery was just injected); locals are
            // declared with var so nothing leaks into the page's globals.
            var str = page.evaluate(function() {
                var values = [];
                var rows = $(".result tr");
                for (var i = 0; i < 10; i++) {
                    values.push(rows.find('td').html(), rows.find('td').next().next().html());
                    rows = rows.next();
                }

                return "{ EPAID: " + values[0] + ", " +
                       "Name: " + values[1] + ", " +
                       "City: " + values[4] + ", " +
                       "State: " + values[6] + ", " +
                       "ZipCode: " + values[8] + ", " +
                       "Latitude: " + values[14] + ", " +
                       "Longitude: " + values[16] + " }";
            });

            appendRecord(str);
            console.log("writing to file");
            scrapeNext(index + 1);
        });
    });
}

scrapeNext(0);
// Set-up for the timer-driven scraper: one shared PhantomJS page object and
// the fs module, both used by process() further down.
var page = require('webpage').create();
var fs = require('fs');

// Relay console messages emitted inside the loaded page to the terminal.
page.onConsoleMessage = function(msg) {
    phantom.outputEncoding = "utf-8";
    console.log(msg);
};


// Unique identifiers of the locations to scrape.
// NOTE(review): the "..." below is an elision left in by the author of the post,
// not valid JavaScript — the full list of IDs has to be filled in before this runs.
    var EPAID = ["NED981713837",... , "FLD049985302", "NJD986643153"]; 


// Opening with "w" overwrites the previous output file; "-<>-" is written as its initial content.
f = fs.open("lat_long.txt", "w");
f.write("-<>-");
f.close();


// count:   index of the next record handed to process() by yourFunction().
// target:  yourFunction() stops and exits once count reaches this value.
// written: written[i] is set to true by process() after record i was appended,
//          so the same record is not written twice.
var count = 0;
var target = 1400;
var written = [];

/**
 * Timer-driven scheduler: starts scraping of the next record and re-arms
 * itself to run again 5 seconds later, until every record has been started.
 *
 * Reads and advances the file-level `count`. The number of records is bounded
 * by `target` AND by the length of `EPAID`, so a `target` larger than the ID
 * list can no longer make process() request a page for an undefined ID.
 * When the bound is reached it logs "exiting" and terminates PhantomJS.
 */
function yourFunction(){

   var limit = Math.min(target, EPAID.length);

   if (count >= limit) {
       console.log("exiting");
       phantom.exit();
       return;
   }

   process(count);
   count++;
   // Fixed 5 s spacing between page loads; this does not wait for the
   // previous page to actually finish.
   setTimeout(yourFunction, 5000);
}




/**
 * Load the CERCLIS result page for EPAID[counter], extract the facility
 * fields with jQuery inside the page, and append them as one JSON-style
 * record (followed by a comma) to lat_long.txt.
 *
 * Uses the file-level `page`, `fs`, `EPAID` and `written`. `written[counter]`
 * is set to true after the record has been appended, so a load-finished
 * event that fires again for the same counter does not write a duplicate.
 *
 * @param {number} counter - index into EPAID of the record to scrape.
 */
function process(counter){

    console.log("Beginning record #" + counter);

    //we construct the target url
    var url  = "http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=" + EPAID[counter]  + "&fac_search_type=Beginning+With&postal_code=&location_address=&add_search_type=Beginning+With&city_name=&county_name=&state_code=&program_search=1&report=2&page_no=1&output_sql_switch=TRUE&database_type=CERCLIS" ;

    // Install the handler before starting the load so it cannot be missed.
    page.onLoadFinished = function(status) {
        if (status !== "success") {
            // Previously a failed load was skipped without any trace.
            console.log("Failed to load record #" + counter + " (status: " + status + ")");
            return;
        }

        page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
            // Runs inside the loaded page: only page-side names ($, JSON) are
            // available, and locals use var so nothing leaks into the page.
            var str = page.evaluate(function() {
                var values = [];
                var rows = $(".result tr");
                // Quote and escape scraped HTML so embedded quotes, backslashes
                // or newlines cannot break the record. For plain text this
                // produces exactly the same bytes as the old hand-built quoting.
                var quote = function(value) {
                    return JSON.stringify(String(value));
                };

                for (var i = 0; i < 10; i++) {
                    values.push(rows.find('td').html(), rows.find('td').next().next().html());
                    rows = rows.next();
                }

                return "{ \"EPAID\": " + quote(values[0]) + ", " +
                       "\"Name\": " + quote(values[1]) + ", " +
                       "\"City\": " + quote(values[4]) + ", " +
                       "\"State\": " + quote(values[6]) + ", " +
                       "\"ZipCode\": " + quote(values[8]) + ", " +
                       "\"Latitude\": " + values[14] + ", " +
                       "\"Longitude\": " + values[16] + " },";
            });

            if (written[counter] === undefined) {
                var out = fs.open("lat_long.txt", "a");
                try {
                    out.write(str);
                } finally {
                    out.close();
                }
                written[counter] = true;
                console.log("Writing to file #" + counter);
            }
        });
    };

    page.open(url);
}

 // Entry point: announce start-up, then start yourFunction(), which keeps
 // re-scheduling itself with setTimeout until it calls phantom.exit().
 console.log("Start...");

yourFunction();