JavaScript Puppeteer:迭代CSV文件并为每一行的URL截屏?
我想迭代一个CSV文件,并使用Puppeter截屏CSV文件中每一行的URL 我有以下代码,工作正常,但每个请求都要等待前一个请求完成,因此运行需要很长时间:Javascript 木偶演员:迭代CSV文件和每行的屏幕截图?,javascript,node.js,async-await,puppeteer,Javascript,Node.js,Async Await,Puppeteer,我想迭代一个CSV文件,并使用Puppeter截屏CSV文件中每一行的URL 我有以下代码,工作正常,但每个请求都要等待前一个请求完成,因此运行需要很长时间: const csv = require('csv-parser'); const fs = require('fs'); const puppeteer = require('puppeteer'); (async () => { const browser = await puppeteer.launch();
const csv = require('csv-parser');
const fs = require('fs');
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();

    // Navigate to the Facebook ad-library page for `rowId` and save a
    // screenshot of its <body> element to `path`. Each call opens its
    // own page so it is independent of other calls.
    const getFile = async function(rowId, path) {
        const page = await browser.newPage();
        try {
            // setViewport returns a promise — await it so the viewport
            // is applied before navigation.
            await page.setViewport({ width: 1000, height: 1500, deviceScaleFactor: 1 });
            let url = 'https://www.facebook.com/ads/library/?id=' + rowId;
            await page.goto(url, { waitUntil: 'networkidle2' });
            await page.waitFor(3000);
            const body = await page.$('body');
            // page.$() resolves to null when the selector matches nothing;
            // guard it to avoid "Cannot read property 'screenshot' of null".
            if (body) {
                await body.screenshot({
                    path: path
                });
            } else {
                console.error('No <body> element found for id ' + rowId);
            }
        } finally {
            // Always close (and await) the page so pages never leak,
            // even when navigation or the screenshot throws.
            await page.close();
        }
    };

    let fname = 'ids.csv'
    const csvPipe = fs.createReadStream(fname).pipe(csv());
    csvPipe.on('data', async (row) => {
        let id = row.ad_id;
        console.log(id);
        let path = './images/' + id + '.png';
        // Pause the stream while this row is processed so rows are
        // handled strictly one at a time.
        csvPipe.pause();
        await getFile(id, path);
        csvPipe.resume();
    }).on('end', () => {
        console.log('CSV file successfully processed');
    });
})();
如何使请求并行运行以加快速度
如果我删除了pause()
和resume()
行,则每次函数运行时都会出现此错误:
(node:18610) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). (rejection id: 14)
(node:18610) UnhandledPromiseRejectionWarning: TypeError: Cannot read property 'screenshot' of null
at getFile (/Users/me/Dropbox/Projects/scrape/index.js:29:12)
at <anonymous>
at process._tickCallback (internal/process/next_tick.js:189:7)
(节点:18610)未处理的PromisejectionWarning:未处理的承诺拒绝。此错误源于在没有catch块的异步函数中抛出,或者拒绝未使用.catch()处理的承诺。(拒绝id:14)
(节点:18610)未处理的PromisejectionWarning:TypeError:无法读取null的属性“screenshot”
在getFile(/Users/me/Dropbox/Projects/scrap/index.js:29:12)
在
在进程中。_tick回调(内部/process/next_tick.js:189:7)
您可以通过
步骤1:您必须创建准备好使用的页面:
const pages = await Promise.all([browser.newPage(), browser.newPage()]);
步骤2:您可以解析csv文件,根据在步骤1中创建的页面数量生成URL块。
您不需要加载数据,只需解析csv并获得结果。
生成结果数组,如下所示:
const rows=[url1,url2,…等]
然后,根据已初始化页面的数量将其转换为块。大概是这样的:
// Split `rows` into chunks of at most `pages.length` URLs each.
// NOTE: the original condition was inverted — it started a new pack
// whenever the last pack was NOT yet full, producing one-element
// chunks. Start a new pack only when there is none or the last is full.
const rowPacks = rows.reduce((acc, cur) => {
  if (!acc.length || acc[acc.length - 1].length >= pages.length) {
    acc.push([cur]);
    return acc;
  }
  acc[acc.length - 1].push(cur);
  return acc;
}, []);
只需处理超时和页面实例数量,即可防止对目标URL的DDOS攻击,并防止内存扩展问题。您可以通过
步骤1:您必须创建准备好使用的页面:
const pages = await Promise.all([browser.newPage(), browser.newPage()]);
步骤2:您可以解析csv文件,根据在步骤1中创建的页面数量生成URL块。
您不需要加载数据,只需解析csv并获得结果。
生成结果数组,如下所示:
const rows=[url1,url2,…等]
然后,根据已初始化页面的数量将其转换为块。大概是这样的:
// Split `rows` into chunks of at most `pages.length` URLs each.
// NOTE: the original condition was inverted — it started a new pack
// whenever the last pack was NOT yet full, producing one-element
// chunks. Start a new pack only when there is none or the last is full.
const rowPacks = rows.reduce((acc, cur) => {
  if (!acc.length || acc[acc.length - 1].length >= pages.length) {
    acc.push([cur]);
    return acc;
  }
  acc[acc.length - 1].push(cur);
  return acc;
}, []);
只需处理超时和页面实例的数量,以防止对目标URL的DDOS攻击,并防止内存扩展问题。以下是一个方案,它可以并行运行用户可控制数量的
getFile()
操作。您可以将maxInFlight
变量设置为希望并行运行的页面数量(这可能只是内存使用或facebook可能应用的任何速率限制的问题)。你必须通过实验来决定设置什么。我最初将其设置为10,以允许10页同时处于“飞行”状态
这里的总体思路是,getFile()
递增/递减inFlightCntr
作为一次打开多少页的度量,然后基于该计数器暂停或恢复csvPipe
const csv = require('csv-parser');
const fs = require('fs');
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const maxInFlight = 10; // set this value to control how many pages run in parallel
    let inFlightCntr = 0;   // number of pages currently open
    let paused = false;     // whether csvPipe is currently paused

    // Screenshot one ad-library URL to `path`, tracking the number of
    // in-flight pages via inFlightCntr (incremented on entry,
    // decremented in finally).
    async function getFile(rowId, path) {
        let page; // declared outside try so the catch block can close it
                  // (the original `const page` inside try made
                  // `page.close()` in catch a ReferenceError)
        try {
            ++inFlightCntr;
            page = await browser.newPage();
            await page.setViewport({ width: 1000, height: 1500, deviceScaleFactor: 1 });
            let url = 'https://www.facebook.com/ads/library/?id=' + rowId;
            await page.goto(url, { waitUntil: 'networkidle2' });
            await page.waitFor(3000);
            const body = await page.$('body');
            // page.$() resolves to null when the selector matches nothing
            if (body) {
                await body.screenshot({
                    path: path
                });
            }
            await page.close();
        } catch(e) {
            console.log(e);
            if (page) {
                await page.close();
            }
        } finally {
            --inFlightCntr;
        }
    }

    let fname = 'ids.csv'
    const csvPipe = fs.createReadStream(fname).pipe(csv());
    csvPipe.on('data', (row) => {
        let id = row.ad_id;
        console.log(id);
        let path = './images/' + id + '.png';
        getFile(id, path).finally(() => {
            // resume the stream once we drop below the concurrency limit
            if (paused && inFlightCntr < maxInFlight) {
                csvPipe.resume(); // was "cvsPipe" — typo, ReferenceError
                paused = false;
            }
        });
        // pause the stream once the concurrency limit is reached
        if (!paused && inFlightCntr >= maxInFlight) {
            csvPipe.pause(); // was "cvsPipe" — typo, ReferenceError
            paused = true;
        }
    }).on('end', () => {
        console.log('CSV file successfully processed');
    });
})();
const csv=require('csv-parser');
常数fs=要求('fs');
const puppeter=require('puppeter');
(异步()=>{
const browser=wait puppeter.launch();
const maxInFlight=10;//设置此值以控制并行运行的页面数
设inFlightCntr=0;
让暂停=假;
异步函数getFile(rowId,path){
试一试{
++inFlightCntr;
const page=wait browser.newPage();
page.setViewport({宽度:1000,高度:1500,设备比例因子:1});
let url = 'https://www.facebook.com/ads/library/?id=' + rowId;
const response=wait page.goto(url,{waitill:'networkidle2'});
等待页面。等待(3000);
const body=等待页面。$(“body”);
等待身体。截图({
路径:路径
});
等待页面。关闭();
}捕获(e){
控制台日志(e);
page.close();
}最后{
--inFlightCntr;
}
}
让fname='ids.csv'
const csvPipe=fs.createReadStream(fname).pipe(csv());
csvPipe.on('data',异步(行)=>{
设id=row.ad_id;
console.log(id);
让路径='./images/'+id+'.png';
getFile(id,path).finally(()=>{
如果(暂停(&inFlightCntr=maxInFlight){
cvsPipe.pause();
暂停=真;
}
}).on('end',()=>{
console.log('CSV文件已成功处理');
});
})();
如果只运行csvPipe将所有行收集到一个数组中(在处理任何行之前),代码可能会简单一些。然后,您可以使用任意数量的promise并发函数来处理阵列,同时控制并行运行的数量。有关在并行处理阵列时可用于管理并发性的许多函数,请参见“从昨天开始”。下面是该实现的外观:
const csv = require('csv-parser');
const fs = require('fs');
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const maxInFlight = 10; // set this value to control how many pages run in parallel
    const fname = 'ids.csv'
    const csvPipe = fs.createReadStream(fname).pipe(csv());
    const rowIDs = [];

    // Fetch one ad-library page and screenshot its <body> to `path`.
    async function getFile(rowId, path) {
        let page; // declared outside try so finally can close it;
                  // the original `const page` inside try made
                  // `page.close()` in finally a ReferenceError
        try {
            page = await browser.newPage();
            await page.setViewport({ width: 1000, height: 1500, deviceScaleFactor: 1 });
            let url = 'https://www.facebook.com/ads/library/?id=' + rowId;
            await page.goto(url, { waitUntil: 'networkidle2' });
            await page.waitFor(3000);
            const body = await page.$('body');
            // page.$() resolves to null when the selector matches nothing
            if (body) {
                await body.screenshot({
                    path: path
                });
            }
        } catch(e) {
            console.log(e);
        } finally {
            if (page) {
                await page.close();
            }
        }
    }

    // Collect every ID first, then process the whole array with
    // bounded concurrency via pMap.
    csvPipe.on('data', row => {
        rowIDs.push(row.ad_id);
    }).on('end', () => {
        // all rowIDs in the array now
        pMap(rowIDs, (id) => {
            let path = './images/' + id + '.png';
            return getFile(id, path);
        }, maxInFlight).then(() => {
            console.log("all items processed"); // all done now
        }).catch(err => {
            console.log(err); // was `console.log(e)` — `e` is undefined here
        });
    });
})();
// Utility function for processing an array asynchronously with no more
// than `limit` items "in flight" at the same time.
//
// @param {Array} array - items to process
// @param {Function} fn - called as fn(item), must return a promise
// @param {number} limit - maximum number of concurrent fn() calls
// @returns {Promise<Array>} results in the same order as `array`;
//   rejects with the first error and launches no further calls.
function pMap(array, fn, limit) {
    return new Promise(function(resolve, reject) {
        // Resolve immediately for empty input: the completion check
        // below only runs inside a .then() handler, so with no items
        // the original implementation never resolved at all.
        if (array.length === 0) {
            resolve([]);
            return;
        }
        var index = 0, cnt = 0, stop = false, results = new Array(array.length);
        function run() {
            // launch work until the limit or the end of the array is reached
            while (!stop && index < array.length && cnt < limit) {
                (function(i) {
                    ++cnt;
                    ++index;
                    fn(array[i]).then(function(data) {
                        results[i] = data;
                        --cnt;
                        // see if we are done or should run more requests
                        if (cnt === 0 && index === array.length) {
                            resolve(results);
                        } else {
                            run();
                        }
                    }, function(err) {
                        // set stop flag so no more requests will be sent
                        stop = true;
                        --cnt;
                        reject(err);
                    });
                })(index);
            }
        }
        run();
    });
}
const csv=require('csv-parser');
常数fs=要求('fs');
const puppeter=require('puppeter');
(异步()=>{
const browser=wait puppeter.launch();
const maxInFlight=10;//设置此值以控制并行运行的页面数
const fname='ids.csv'
const csvPipe=fs.createReadStream(fname).pipe(csv());
常量rowIDs=[];
异步函数getFile(rowId,path){
试一试{
const page=wait browser.newPage();
page.setViewport({宽度:1000,高度:1500,设备比例因子:1});
让url为空https://www.facebook.com/ads/library/?id="罗维德,;
const response=wait page.goto(url,{waitUnti