Javascript CasperJS-内存耗尽
当我通过命令行运行这个脚本时,它会持续运行一两个小时,然后命令行显示“内存耗尽”。我不知道发生了什么事。

另外,我也希望得到一些关于如何使这个项目更具可读性或可修改性的一般性建议,因为我将在一个月内完成这个项目。

标签:javascript, memory, memory-leaks, web-scraping, casperjs

脚本的完整代码如下:
// ---- Modules ----
var fs = require('fs');
var utils = require('utils');

// ---- Configuration ----
// CSV file the scraped rows are appended to.
var targetFile = "CMQphysicians.csv";
// Highest permit number that will be looked up.
var permitMax = 99999;

// ---- Run state ----
// Wall-clock time at script start, in milliseconds since the epoch.
var startTime = new Date().getTime();
// Permit number currently being looked up; -1 until a lookup has started.
var permitNumber = -1;
// Permit number the current run started from (set when an existing CSV is read).
var firstLicense = 0;
// Permit numbers read back from an already existing CSV file.
var alreadyParsed = [];
// Fields of the physician row currently being assembled.
var currentPhysician = [];
// Only its length is reported at the end of the run.
var physicianData = [];
/**
 * Adds `contains(s)` to every string: returns true when `s` occurs anywhere
 * in the receiver, false otherwise.
 *
 * The method is installed with Object.defineProperty so that it is
 * non-enumerable; a plain assignment would make `contains` show up in every
 * `for...in` loop over a String object.
 *
 * @param {string} s - Substring to look for.
 * @returns {boolean} Whether the substring is present.
 */
Object.defineProperty(String.prototype, 'contains', {
    value: function (s) {
        return this.indexOf(s) !== -1;
    },
    writable: true,
    configurable: true,
    enumerable: false
});
// Create the CasperJS instance that drives the whole scrape.
var casper = require('casper').create({
    logLevel: "info",
    verbose: true,
    pageSettings: {
        // Skip NPAPI plugins (Flash, Silverlight, ...) and images.
        loadPlugins: false,
        loadImages: false
    }
});
/**
 * Formats a permit number as a zero-padded string.
 *
 * Called without arguments it formats the global `permitNumber` to five
 * characters, exactly as before. Both arguments are optional so the helper
 * can also format an arbitrary number without touching global state.
 *
 * @param {number} [number] - Value to format; defaults to the global `permitNumber`.
 * @param {number} [width] - Minimum length of the result; defaults to 5.
 * @returns {string} The value left-padded with '0'. Values already at least
 *     `width` characters long are returned unpadded.
 */
function getPermitNumberString(number, width) {
    var digits = String(number === undefined ? permitNumber : number);
    var targetWidth = width === undefined ? 5 : width;
    var padding = '';
    while (padding.length + digits.length < targetWidth) {
        padding += '0';
    }
    return padding + digits;
}
/**
 * Collects the text of every cell in the details grid of the current document.
 * Passed to casper.evaluate, so it only relies on `document`.
 *
 * @returns {string[]} The innerText of each matched <td>, in document order.
 */
function getDetailsData() {
    var cells = document.querySelectorAll('#content-html > table.griddetails > tbody > tr > td');
    var texts = [];
    for (var idx = 0; idx < cells.length; idx++) {
        texts.push(cells[idx].innerText);
    }
    return texts;
}
/**
 * Counts the links found in the first cell of the second row of #GViewList.
 * Passed to casper.evaluate, so it only relies on `document`.
 *
 * @returns {number} Number of matching <a> elements.
 */
function getPhysicianCount() {
    var links = document.querySelectorAll("#GViewList > tbody > tr:nth-child(2) > td:nth-child(1) > a");
    return links.length;
}
/**
 * Reports failed resource requests and aborts the run on malformed errors.
 *
 * Errors for URLs containing 'google' are not echoed. When the error string
 * contains 'undefined' the run is stopped with casper.die(); the previous
 * empty `while` loop never terminated, so the process hung at full CPU
 * instead of exiting.
 *
 * @param {Object} resourceError - Error descriptor; `url`, `errorCode`,
 *     `errorString` and `id` are read from it.
 */
casper.on("resource.error", function (resourceError) {
    // Fall back to '' so a missing field cannot throw inside the handler.
    var url = resourceError.url || '';
    var errorString = resourceError.errorString || '';

    // indexOf is used directly so the handler does not depend on the
    // String.prototype.contains patch being installed.
    if (url.indexOf('google') === -1) {
        this.echo("Resource error: " + "Error code: " + resourceError.errorCode + " ErrorString: " + resourceError.errorString + " url: " + resourceError.url + " id: " + resourceError.id, "ERROR");
    }
    if (errorString.indexOf('undefined') !== -1) {
        this.die('Aborting: resource error string contains "undefined" for url: ' + url, 1);
    }
});
// Placeholder listener for 'load.started': it currently does nothing; the
// commented-out echo below is a disabled debug trace.
casper.on('load.started', function () {
//casper.echo('load started');
});
// Placeholder listener for 'navigation.requested': it receives the requested
// url, the navigation type, the lock flag and the main-frame flag, but
// currently does nothing; the commented-out echoes are disabled debug traces.
casper.on('navigation.requested', function (url, navigationType, navigationLocked, isMainFrame) {
//casper.echo('navigation requested');
//casper.echo(navigationType);
});
// Echo every message received through the 'remote.message' event, prefixed
// so it can be told apart from the script's own output.
casper.on('remote.message', function (msg) {
    var relayed = 'from within remote page DOM' + msg;
    this.echo(relayed);
});

// Open the initial page; the 'load.finished' listener takes over from there.
casper.start('https://www.google.ca/?gws_rd=ssl', function announceStart() {
    casper.echo('Starting!');
});
// URL the run starts on; also the fallback stage for URLs without '.aspx'.
var START_URL = 'https://www.google.ca/?gws_rd=ssl';
// Search form of the directory.
var INDEX_URL = 'http://www.cmq.org/bottin/index.aspx?lang=en&a=1';
// Element every directory stage waits for before acting on the page.
var READY_SELECTOR = '#___gcse_0 > div > form > table.gsc-search-box > tbody > tr > td.gsc-search-button > input';
// Link clicked on the list page when exactly one physician was found.
var FIRST_RESULT_LINK = '#GViewList > tbody > tr:nth-child(2) > td:nth-child(1) > a';
// Header cell of the details grid; its text before '(' is "Last, First".
var DETAILS_HEADER = '#content-html > table.griddetails > tbody > tr:nth-child(1) > th';
// First line written to a freshly created CSV file (after the BOM).
var CSV_HEADER = 'Permit Number,Last Name,First Name,Gender,Permit,Status,Specialty,Activity,Authorization,Address,Phone\n';

/**
 * Reads the permit numbers already stored in a CSV file.
 *
 * The first line (header) is skipped and reading stops at the first empty
 * line. Every numeric permit is stored in the global `alreadyParsed`, which
 * is emptied first so repeated calls do not make it grow.
 *
 * @param {string} path - CSV file to read.
 * @returns {?number} Highest permit number found, or null when there is none.
 */
function readHighestParsedPermit(path) {
    var highest = null;
    var isHeader = true;
    var stream = fs.open(path, 'r');
    alreadyParsed.length = 0;
    try {
        var line = stream.readLine();
        while (line) {
            if (!isHeader) {
                var permit = Number(line.substring(0, line.indexOf(',')));
                if (!isNaN(permit)) {
                    alreadyParsed.push(permit);
                    // Tracked in the loop: Math.max.apply(null, array) yields
                    // -Infinity for an empty array and can exceed the engine's
                    // argument limit for very large ones.
                    if (highest === null || permit > highest) {
                        highest = permit;
                    }
                }
            }
            isHeader = false;
            line = stream.readLine();
        }
    } finally {
        // Close the stream even when reading throws.
        stream.close();
    }
    return highest;
}

/**
 * Turns one scraped value into a CSV-safe field: commas and newlines become ';'.
 * Missing values become an empty field instead of throwing.
 *
 * @param {*} value - Raw cell value.
 * @returns {string} Sanitized field text.
 */
function toCsvField(value) {
    var text = (value === undefined || value === null) ? '' : String(value);
    return text.replace(/,/g, ';').replace(/\n/g, ';');
}

/**
 * Releases the page's in-memory resource cache when the engine exposes
 * `page.clearMemoryCache()`; otherwise does nothing.
 *
 * NOTE(review): added as a mitigation for the reported "memory exhausted"
 * failure after hours of running — confirm its effect on the engine in use.
 */
function releasePageCache() {
    if (casper.page && typeof casper.page.clearMemoryCache === 'function') {
        casper.page.clearMemoryCache();
    }
}

/** Start stage: work out where to resume (or create the CSV), then open the search form. */
function handleStartPage() {
    casper.echo('on google');
    if (fs.exists(targetFile)) {
        var highest = readHighestParsedPermit(targetFile);
        if (highest !== null) {
            // The index stage increments before searching, so resuming AT the
            // highest saved permit makes highest + 1 the next one looked up.
            // (Resuming at highest + 1 silently skipped one permit per restart.)
            permitNumber = highest;
        }
        firstLicense = permitNumber;
        casper.echo(permitNumber);
    } else {
        fs.write(targetFile, "\uFEFF" + CSV_HEADER, 'a');
    }
    casper.thenOpen(INDEX_URL);
}

/** Index stage: advance to the next permit number and submit the search form. */
function handleIndexPage() {
    casper.waitForSelector(READY_SELECTOR, function () {
        casper.echo('index stage');
        releasePageCache();
        permitNumber++;
        if (permitNumber > permitMax) {
            // Nothing further is queued, so the run ends here.
            casper.echo('Permit number maxed out');
            return;
        }
        casper.echo('going to list');
        casper.sendKeys('#txbNoPermis', getPermitNumberString());
        casper.echo('sent keys, now clicking');
        casper.thenClick('#btSubmit');
        casper.echo('after the click');
    });
}

/**
 * List stage: no result -> back to the index, one result -> open its details,
 * anything else -> abort the run with the offending permit number.
 */
function handleListPage() {
    casper.waitForSelector(READY_SELECTOR, function () {
        casper.echo('list stage');
        var matches = casper.evaluate(getPhysicianCount);
        var permit = getPermitNumberString();
        if (matches === 0) {
            casper.echo('No physicians for license ' + permit);
            casper.echo('going to index');
            casper.thenClick('#btSubmit');
        } else if (matches === 1) {
            casper.echo('Physician exists for license ' + permit);
            casper.echo('going to details');
            casper.thenClick(FIRST_RESULT_LINK);
        } else if (matches > 1) {
            // Previously the permit was concatenated outside echo() (so it was
            // never printed) and an empty infinite loop froze the process.
            casper.die('a > 1 at ' + permit, 1);
        } else {
            casper.die('negative a at ' + permit, 1);
        }
    });
}

/** Details stage: append the physician as one CSV row, then start a new search. */
function handleDetailsPage() {
    casper.waitForSelector(READY_SELECTOR, function () {
        casper.echo('details stage');

        var header = casper.getHTML(DETAILS_HEADER);
        var cut = header.indexOf('(');
        var nameParts = (cut === -1 ? header : header.substring(0, cut)).trim().split(',');
        var cells = casper.evaluate(getDetailsData) || [];

        // Column order: permit number (cell 4), last name, first name, then
        // every other even-indexed cell starting at 2.
        var row = [cells[4], (nameParts[0] || '').trim(), (nameParts[1] || '').trim()];
        for (var idx = 2; idx < cells.length; idx += 2) {
            if (idx !== 4) {
                row.push(cells[idx]);
            }
        }
        for (var f = 0; f < row.length; f++) {
            row[f] = toCsvField(row[f]);
        }

        casper.echo('writing to file!');
        fs.write(targetFile, row.join(',') + '\n', 'a');

        casper.echo(casper.exists('#btNewsearch'));
        casper.echo('going to index');
        casper.thenClick('#btNewsearch');
    });
}

// Stage handlers keyed by the upper-cased URL prefix (everything before '.aspx').
var STAGE_HANDLERS = {};
STAGE_HANDLERS[START_URL.toUpperCase()] = handleStartPage;
STAGE_HANDLERS['http://www.cmq.org/bottin/index'.toUpperCase()] = handleIndexPage;
STAGE_HANDLERS['http://www.cmq.org/bottin/list'.toUpperCase()] = handleListPage;
STAGE_HANDLERS['http://www.cmq.org/bottin/details'.toUpperCase()] = handleDetailsPage;

/**
 * Drives the scrape: after every page load, picks the stage from the current
 * URL and runs its handler. URLs without '.aspx' are treated as the start
 * page; unknown '.aspx' pages trigger a step back in history.
 *
 * NOTE(review): every stage queues new steps (waitForSelector / thenClick /
 * thenOpen) from inside this event handler. If CasperJS keeps completed steps
 * for the lifetime of the run, that queue grows with each permit and is a
 * likely contributor to the memory exhaustion — confirm, and consider
 * restructuring the scrape into bounded batches.
 *
 * @param {string} status - Load status passed by the event; not used.
 */
casper.on('load.finished', function (status) {
    var currentUrl = this.getCurrentUrl();
    var urlPrefix = currentUrl.substring(0, currentUrl.indexOf('.aspx'));
    if (urlPrefix.length === 0) {
        casper.echo('undefined');
        urlPrefix = START_URL;
    }

    var stageKey = urlPrefix.toUpperCase();
    if (Object.prototype.hasOwnProperty.call(STAGE_HANDLERS, stageKey)) {
        STAGE_HANDLERS[stageKey]();
    } else {
        casper.echo("Wrong URL!");
        casper.back();
    }
});
/**
 * Runs the queued steps and reports when they are all done.
 *
 * When a completion callback is passed to run(), CasperJS no longer exits on
 * its own, so the callback must call exit() explicitly; without it the
 * process stays alive after printing the final messages.
 */
casper.run(function () {
    casper.echo('ending!');
    casper.echo(physicianData.length);
    casper.exit();
});
var fs = require('fs');
var currentPhysician = [];
var physicianData = [];
var permitMax = 99999;
var alreadyParsed = [];
var targetFile = "CMQphysicians.csv";
var startTime = new Date().getTime();
var permitNumber = -1;
var firstLicense = 0;
var utils = require('utils');
String.prototype.contains = function (s) {
return (this.indexOf(s) != -1);
}
var casper = require('casper').create({
verbose : true,
logLevel : "info",
pageSettings : {
loadImages : false, // do not load images
loadPlugins : false // do not load NPAPI plugins (Flash, Silverlight, ...)
}
});
function getPermitNumberString() {
var pn = permitNumber.toString();
var l = pn.length;
var i;
var leadingZeros = '';
for (i = 0; i < (5 - pn.length); i++) {
leadingZeros = leadingZeros + '0';
}
return leadingZeros + pn;
}
function getDetailsData() {
var details = document.querySelectorAll('#content-html > table.griddetails > tbody > tr > td');
return Array.prototype.map.call(details, function (e) {
return e.innerText;
});
}
function getPhysicianCount() {
return document.querySelectorAll("#GViewList > tbody > tr:nth-child(2) > td:nth-child(1) > a").length;
}
casper.on("resource.error", function (resourceError) {
if (!resourceError.url.contains('google')) {
this.echo("Resource error: " + "Error code: " + resourceError.errorCode + " ErrorString: " + resourceError.errorString + " url: " + resourceError.url + " id: " + resourceError.id, "ERROR");
}
while (resourceError.errorString.contains('undefined')) {}
});
casper.on('load.started', function () {
//casper.echo('load started');
});
casper.on('navigation.requested', function (url, navigationType, navigationLocked, isMainFrame) {
//casper.echo('navigation requested');
//casper.echo(navigationType);
});
casper.on('remote.message', function (msg) {
this.echo('from within remote page DOM' + msg);
});
casper.start('https://www.google.ca/?gws_rd=ssl', function () { // Loads the initial page.
casper.echo('Starting!');
});
casper.on('load.finished',函数(状态){
//echo('load finished');
变量日期=新日期();
var hours=date.getHours();
var minutes=date.getMinutes();
//echo(hours.toString()+':'+minutes.toString()+''+this.getCurrentUrl().toUpperCase());
var urlPrefix=this.getCurrentUrl().substring(0,this.getCurrentUrl().indexOf('.aspx'));
如果(urlPrefix.length==0){
casper.echo(“未定义”);
URL前缀=https://www.google.ca/?gws_rd=ssl“.toUpperCase();
}
开关(urlPrefix.toUpperCase()){
案例https://www.google.ca/?gws_rd=ssl'.toUpperCase():
echo(“在谷歌上”);
如果(fs.exists('cmqphysicans.csv')){
stream=fs.open('cmqphysicans.csv','r');
line=stream.readLine();
var i=0;
while(行){
如果(i>0){
alreadyParsed.push(数字(行.子字符串(0,行.索引of(','))));
}
line=stream.readLine();
i++;
}
stream.close();
permitNumber=Math.max.apply(null,alreadyParsed)+1;
firstLicense=许可证编号;
casper.echo(许可号码);
}否则{
fs.write(targetFile,“\uFEFF”+”许可证编号、姓氏、名字、性别、许可证、状态、专业、活动、授权、地址、电话\n、'a');
}
卡斯珀,然后打开http://www.cmq.org/bottin/index.aspx?lang=en&a=1');
打破
案例http://www.cmq.org/bottin/index'.toUpperCase():
casper.waitForSelector('#uuu gcse_0>div>form>table.gsc-search-box>tbody>tr>td.gsc-search-button>input',函数(){
var finishedSoFar=permitNumber-firstLicense;
var timeSoFar=new Date().getTime()-startTime;
var licensesToDo=许可证最大值-许可证编号;
var msPerLicense=飞行时间/完成飞行时间;
var minutesToGo=(licensesToDo*msPerLicense)/1000/60;
//casper.echo(licensesToDo+‘许可证可以使用’++msPerLicense.toString()++‘ms per license’++minutesToGo.toString()++‘剩余分钟’);
echo(‘索引阶段’);
permitNumber++;
如果(permitNumber>permitMax){
casper.echo(‘许可证号最大化’);
}否则{
var permitNumberString=getPermitNumberString();
echo(‘将要上市’);
casper.sendKeys('#txbNoPermis',permitNumber字符串);
//等一下(100);
echo('sent key,now clicking');
casper.然后单击('btSubmit');
echo('after the click');
}
});
打破
案例http://www.cmq.org/bottin/list'.toUpperCase():
casper.waitForSelector('#uuu gcse_0>div>form>table.gsc-search-box>tbody>tr>td.gsc-search-button>input',函数(){
echo(“列表阶段”);
//三个案例:
//没有结果,一个结果,很多结果
//无结果:返回(00000)
//一个结果:前进(82365)
//许多结果:崩溃(???)
a=casper.evaluate(getPhysicianCount);
如果(a==0){
echo('无医师执照'+getPermitNumberString());
echo(‘进入索引’);
casper.然后单击('btSubmit');
//等一下(1000);
}如果(a==1),则为else{
echo('医师存在于许可证'+GetPermitNumber字符串());
echo(‘进入细节’);
casper.然后单击(“#GViewList>tbody>tr:nth child(2)>td:nth child(1)>a”);