在运行下一页之前,是否等待Javascript Web抓取功能完成?
我正在尝试创建一个网页抓取程序(web scraper,基于 node.js),它从站点中提取信息并将其写入文件。针对单个页面的抓取函数我已经写好,并且可以正常工作;但是当我在 for 循环中调用这个函数、遍历多场游戏时,所有游戏得到的都是错误的数据。我知道这与 JavaScript 的异步特性有关,也读过有关回调函数的资料,但我不知道如何把它应用到我的代码中。如有任何帮助,将不胜感激。
标签:javascript, jquery, json, node.js
代码如下:
// Iterate over a few gameIDs, used in URL for request.
// `x` is declared explicitly: assigning to an undeclared name creates an
// implicit global and throws a ReferenceError in strict mode / ES modules.
// NOTE: scrapeGame is called back-to-back without waiting for the previous
// call's asynchronous work to finish.
for (var x = 4648; x < 4650; x++) {
  scrapeGame(x);
}
/**
 * Placeholder for scraping a single game; the body is intentionally elided
 * and only outlines the intended steps.
 *
 * @param gameId - id of the game to scrape (used in the request URL).
 */
function scrapeGame(gameId){
// Step 1: request the page from the URL and scrape the HTML into arrays as necessary.
// Step 2: write the final array to a file.
}
//迭代几个游戏ID,在URL中用于请求。
对于(x=4648;x<4650;x++){
//作为匿名函数传入回调。
//下面我将传递id和要执行的函数。
//并且,将我期望的结果定义为传入参数。
ScrapGame(x,函数(ScrapResult,err){
//这将*不会*执行*直到*您在下面的函数中调用它。
//这意味着for循环的执行将暂停。
//此函数接收传入的状态,
//在这种情况下,布尔值为true/false和错误(如果有)。
如果(结果){
//刮是真的,没什么可做的。
//for循环现在将进入下一个迭代。
console.log('Scrape Successful');
}否则{
//Scrape为false,将错误输出到console.log和
//中断循环以处理错误。
log('Scrape ERROR::'+err);
//请注意,我们在通话中呼叫break
//回调函数的作用域
//如果要继续,请删除打断
//下一个游戏ID并没有停止循环
打破
}
});
}
//此函数现在接受两个参数。
游戏功能(gameId,回调){
// ************************************************
//**在这里工作**
//从URL请求,根据需要将HTML刮到数组。
//将最终数组写入文件。
//创建文件后,执行回调并传递bool
//状态(真/假)。
// ************************************************
var request=require('request'),
cheerio=需要('cheerio'),
fs=需要('fs'),
类别=[],
类别列表=[],
ids=[],
线索=[],
值=[
'0',
'$200',
'$400',
'$600',
'$800',
'$1000',
'$400',
'$800',
'$1200',
'$1600',
'$2000'
],
valuelist=[],
答案=[],
数组=[],
文件=[],
showGameURL=http://www.j-archive.com/showgame.php?game_id=“+配子体,
showAnswerURLhttp://www.j-archive.com/showgameresponses.php?game_id=“+配子体;
请求(showGameURL、函数(err、resp、body){
如果(!err&&resp.statusCode==200){
var$=总负荷(车身);
//向类别添加一行以避免从0开始
类别。推送(“类别列表”);
//拉取所有类别以供以后使用
$('td.category_name')。每个(函数(){
var category=$(this.text();
类别。推送(类别);
});
//拉取所有线索ID(坐标),存储到1d数组
//拉动任何“卡在”字符串中的id,以防止重复
$(“[id*='stacked']”)。每个(函数(){
var id=$(this.attr('id');
id=id.toString();
id=id.substring(0,id.length-6);
id.push(id);
//如果是单J,则选择类别1-6
if(id.indexOf(“\u J”)!=-1){
变量catid=id.charAt(7);
categorylist.push(categories[catid]);
var valId=id.charAt(9);
valuelist.push(值[有效]);
}
//如果是双J,则选择类别7-12
else if(id.indexOf(“\u DJ”)!=-1){
var catid=parseInt(id.charAt(8))+6;
categorylist.push(categories[catid]);
var valId=parseInt(id.charAt(10))+5;
valuelist.push(值[有效]);
}
//如果是最终J,选择类别13
否则{
类别列表推送(类别[13]);
}
});
//提取所有线索文本,存储到1d数组
$('td.cule_text')。每个(函数(){
var clue=$(this.text();
线索。推(线索);
});
//将拉取值推送到大数组
array.push(ids);
array.push(categorylist);
array.push(valuelist);
数组。推送(线索);
//对不同URL的新请求以获取响应
请求(showAnswerURL、函数(err、resp、body){
如果(!err&&resp.statusCode==200){
var$=总负荷(车身);
$('.correct_response')。每个(函数(){
var answer=$(this.text();
回答。推(回答);
});
//将答案推送到大数组
数组。推送(应答);
//将数组合并到一维数组中以准备写入文件
对于(var i=0;i// Iterate over a few gameIDs, used in URL for request.
/**
 * Scrape the game ids in [firstId, endId) strictly one after another.
 *
 * A plain `for` loop cannot wait for an asynchronous call, and `break` is a
 * syntax error inside a callback (it cannot cross a function boundary).
 * Instead, the next scrape is started from inside the callback of the
 * previous one, and "breaking" simply means not starting the next scrape.
 *
 * @param {number} firstId - first game id to scrape (inclusive).
 * @param {number} endId - game id at which to stop (exclusive).
 */
function scrapeGamesInSequence(firstId, endId) {
  if (firstId >= endId) {
    // Nothing left to scrape.
    return;
  }
  // Pass in the callback as an anonymous function. It receives the status
  // reported by scrapeGame: a boolean and, on failure, an error.
  scrapeGame(firstId, function(scrapeResult, err) {
    // This does not execute until scrapeGame calls it.
    if (scrapeResult) {
      // Scrape was true; move on to the next game id.
      console.log('Scrape Successful');
      scrapeGamesInSequence(firstId + 1, endId);
    } else {
      // Scrape was false; report the error and stop the sequence.
      // To skip the failed game and keep going instead, also call
      // scrapeGamesInSequence(firstId + 1, endId) here.
      console.log('Scrape ERROR :: ' + err);
    }
  });
}

// Iterate over a few gameIDs, used in URL for request.
scrapeGamesInSequence(4648, 4650);
// This function now accepts two arguments.
function scrapeGame(gameId, callback) {
// ************************************************
// ** Do Your Work Here **
// Request from URL, scrape HTML to arrays as necessary.
// Write final array to file.
// After file creation, execute the callback and pass bool
// status (true/false).
// ************************************************
var request = require('request'),
cheerio = require('cheerio'),
fs = require('fs'),
categories = [],
categorylist = [],
ids = [],
clues = [],
values = [
'0',
'$200',
'$400',
'$600',
'$800',
'$1000',
'$400',
'$800',
'$1200',
'$1600',
'$2000'
],
valuelist = [],
answers = [],
array = [],
file = [],
showGameURL = 'http://www.j-archive.com/showgame.php?game_id=' + gameId,
showAnswerURL = 'http://www.j-archive.com/showgameresponses.php?game_id=' + gameId;
request(showGameURL, function(err, resp, body) {
if (!err && resp.statusCode === 200) {
var $ = cheerio.load(body);
//add a row to categories to avoid starting at 0
categories.push('Category List');
//pull all categories to use for later
$('td.category_name').each(function() {
var category = $(this).text();
categories.push(category);
});
//pull all clue IDs (coordinates), store to 1d array
//pull any id that has "stuck" in the string, to prevent duplicates
$("[id*='stuck']").each(function() {
var id = $(this).attr('id');
id = id.toString();
id = id.substring(0, id.length - 6);
ids.push(id);
//if single J, pick category 1-6
if (id.indexOf("_J_") !== -1) {
var catid = id.charAt(7);
categorylist.push(categories[catid]);
var valId = id.charAt(9);
valuelist.push(values[valId]);
}
//if double J, pick category 7-12
else if (id.indexOf("_DJ_") !== -1) {
var catid = parseInt(id.charAt(8)) + 6;
categorylist.push(categories[catid]);
var valId = parseInt(id.charAt(10)) + 5;
valuelist.push(values[valId]);
}
//if final J, pick category 13
else {
categorylist.push(categories[13]);
}
});
//pull all clue texts, store to 1d array
$('td.clue_text').each(function() {
var clue = $(this).text();
clues.push(clue);
});
//push pulled values to big array
array.push(ids);
array.push(categorylist);
array.push(valuelist);
array.push(clues);
//new request to different URL to pull responses
request(showAnswerURL, function(err, resp, body) {
if (!err && resp.statusCode === 200) {
var $ = cheerio.load(body);
$('.correct_response').each(function() {
var answer = $(this).text();
answers.push(answer);
});
//push answers to big array
array.push(answers);
//combine arrays into 1-d array to prep for writing to file
for (var i = 0; i < array[0].length; i++) {
var print = array[0][i] + "|" + array[1][i] + "|" + array[2][i] + "|" + array[3][i] + "|" + array[4][i];
var stringPrint = print.toString();
file.push(stringPrint);
}
//update string, add newlines, etc.
var stringFile = JSON.stringify(file);
stringFile = stringFile.split('\\').join('');
stringFile = stringFile.split('","').join('\n');
//write to file, eventually will append to end of one big file
fs.writeFile('J_GAME_' + gameId + '.txt', stringFile, function(err) {
//clear arrays used
valuelist = [];
answers = [];
categories = [];
categorylist = [];
ids = [];
clues = [];
array = [];
file = [];
if (err) {
// ******************************************
// Callback false with error.
callback(false, err);
// ******************************************
} else {
console.log("Game #" + gameId + " has been scraped.");
// ******************************************
// Callback true with no error.
callback(true);
// ******************************************
}
});
}
});
}
});
}
// Scrape game ids 4648 and 4649 one at a time: each scrape is started from
// the completion callback of the previous one.
var x = 4648;
// The callback re-invokes scrapeGame directly. Storing scrapeGame's return
// value and calling that would fail, because the stored value is not a
// function.
scrapeGame(x, function cb(){
  // Advance first, then test the bound, so the last id scraped is 4649
  // (same range as `for (x = 4648; x < 4650; x++)`).
  x++;
  if(x >= 4650){
    return;
  }
  return scrapeGame(x, cb);
});
/**
 * Placeholder for scraping a single game; the body is intentionally elided
 * and only outlines the intended steps.
 *
 * Only `gameId` is declared, so any extra argument a caller passes (such as
 * a completion callback) is ignored by this version.
 *
 * @param gameId - id of the game to scrape (used in the request URL).
 */
function scrapeGame(gameId){
// Step 1: request the page from the URL and scrape the HTML into arrays as necessary.
// Step 2: write the final array to a file.
}
/**
 * Fetch one game's page over HTTP and signal completion through `cb`.
 *
 * @param gameId - id of the game to scrape.
 * @param {function(Error=)} cb - Called with no arguments once the whole
 *   response body has been received, or with the error if the request fails.
 */
function scrapeGame(gameId, cb){
  //your code and set options
  // NOTE(review): `http` and `options` are not defined in this snippet; they
  // are assumed to be provided by the surrounding code (e.g. the Node `http`
  // module and request options built from `gameId`). Confirm before use.
  var req = http.request(options, function(response){
    var result = "";
    // Decode chunks as text so multi-byte characters split across chunk
    // boundaries are not corrupted by string concatenation.
    response.setEncoding('utf8');
    response.on('data', function (chunk) {
      result += chunk;
    });
    response.on('end',function(){
      //write data here;
      //do the callback
      cb();
    });
  });
  // Without a listener, an 'error' event on the request would be thrown as
  // an uncaught exception and `cb` would never run.
  req.on('error', function (err) {
    cb(err);
  });
  // http.request() only sends the request once end() is called.
  req.end();
}