Javascript 为什么我的for循环会弄乱所有参数?
我正在尝试使用javascript解析来自多个网页的一些数据。为此,我编写了一个小型解析器。算法如下所示:Javascript 为什么我的for循环会弄乱所有参数?,javascript,parsing,for-loop,timeout,Javascript,Parsing,For Loop,Timeout,我正在尝试使用javascript解析来自多个网页的一些数据。为此,我编写了一个小型解析器。算法如下所示: URL 1 + data from URL 1 (correct line) URL 2 + data from URL 2 (correct line) URL 3 + data from URL 3 (correct line) URL 4 + data from URL 4 (correct line) URL 6(wrong URL) + data from another
URL 1 + data from URL 1 (correct line)
URL 2 + data from URL 2 (correct line)
URL 3 + data from URL 3 (correct line)
URL 4 + data from URL 4 (correct line)
URL 6(wrong URL) + data from another URL
URL 5(wrong URL) + data from another URL
URL 7 + data from URL 7 (correct line)
URL 8 + data from URL 8 (correct line)
URL 9 + data from URL 9 (correct line)
URL
数据
URL
和data
保存到json文件URL 1 + data from URL 1 (correct line)
URL 2 + data from URL 2 (correct line)
URL 3 + data from URL 3 (correct line)
URL 4 + data from URL 4 (correct line)
URL 6(wrong URL) + data from another URL
URL 5(wrong URL) + data from another URL
URL 7 + data from URL 7 (correct line)
URL 8 + data from URL 8 (correct line)
URL 9 + data from URL 9 (correct line)
我认为问题在于一些页面加载时间过长,这会打乱整个过程。但我仍然不明白为什么它有时会保存错误的数据。
这是我的密码:
var request = require('request');
var cheerio = require('cheerio');
var cloudscraper = require('cloudscraper');
var fs = require('fs');
var path = require('path');
var csvjson = require('csvjson');
//First, we read .csv file with our URL list
function getTheList() {
urlList = fs.readFileSync(path.join(__dirname, 'data.csv'), { encoding : 'utf8'});
var options = {
delimiter : ';', // optional
quote : '"' // optional
};
urlList = csvjson.toObject(urlList, options);
end = urlList.length;
logs = [];
//here we start the loop reading and saving data from each url
for (let p = 0; p < end; p += 1){
grabTheData(urlList, p)
}
}
//this code extracts the data from the page and saves it to a json file
function grabTheData(urlList, p){
setTimeout(function() {
url = url[p].ItemLink;
cloudscraper.get(url, function(err, res, body){
if (err) {
console.log(other.Time() + colors.yellow('Warn: ') + '- something went wrong with item ' + url);
callback();
} else {
var $ = cheerio.load(body);
/*
here are the lines which extract the data I need
dataIneed = ...;
*/
logs.push({
url, dataINeed
});
fs.writeFileSync('./logs.json', JSON.stringify(logs, null, 4));
}
});
//here I set a 2 seconds delay between each URL
}, 2000 * p);
}
getTheList()
var request=require('request');
var cheerio=需要('cheerio');
var cloudscraper=require('cloudscraper');
var fs=需要('fs');
var path=require('path');
var csvjson=require('csvjson');
//首先,我们读取带有URL列表的.csv文件
函数getTheList(){
urlList=fs.readFileSync(path.join(uu dirname,'data.csv'),{encoding:'utf8'});
变量选项={
分隔符:“;”,//可选
引号:“”//可选
};
urlList=csvjson.toObject(urlList,选项);
end=urlist.length;
日志=[];
//在这里,我们开始从每个url读取和保存数据的循环
对于(设p=0;p
发生这种情况的原因是回调结果与grabTheData中的url变量之间存在潜在的不匹配
现在有一个非常快速的解决方法,简单地更改url变量的范围,如下所示:
function grabTheData(urlList, p){
setTimeout(function() {
// Set scope of url variable to block
let url = url[p].ItemLink;
cloudscraper.get(url, function(err, res, body){
if (err) {
console.log(other.Time() + colors.yellow('Warn: ') + '- something went wrong with item ' + url);
callback();
} else {
var $ = cheerio.load(body);
/*
here are the lines which extract the data I need
dataIneed = ...;
*/
logs.push({
url, dataINeed
});
fs.writeFileSync('./logs.json', JSON.stringify(logs, null, 4));
}
});
//here I set a 2 seconds delay between each URL
}, 2000 * p);
}
这会让你的结果井然有序
下面是另一个(IMHO更好)选项,使用承诺并避免使用setTimeout来分隔调用。这应该避免任何潜在的争用情况,因为Promise.all调用将保持顺序:
async function getTheList() {
urlList = fs.readFileSync(path.join(__dirname, 'data.csv'), { encoding : 'utf8'});
var options = {
delimiter : ';', // optional
quote : '"' // optional
};
urlList = csvjson.toObject(urlList, options);
let promiseList = urlList.map(urlEntry => grabTheDataUpdated(urlEntry.ItemLink));
let logs = await Promise.all(promiseList);
fs.writeFileSync('./new_logs.json', JSON.stringify(logs, null, 4));
}
// Promisified version of cloudscraper.get
function getCloudScraperData(url) {
return new Promise((resolve, reject) => {
cloudscraper.get(url, (err, res, body) => {
if (err) {
reject(err);
} else {
resolve ( { url, res, body });
}
})
})
}
function getDataINeed(url, body) {
// Use cheerio to process data..
// Return mock data for now.. replace with actual data processed by cheerio..
return `data from ${url}`;
}
async function grabTheDataUpdated(url) {
try {
let result = await getCloudScraperData(url);
let dataINeed = getDataINeed(result.url, result.body);
return { url, dataINeed };
} catch (error) {
return { url, dataINeed: "Error occurred: " + error.message };
}
}
乍一看似乎是对setTimeout()的典型误用。我实际上使用了
setTimeout()
,但我想我是做错了吗?