Web scraping 木偶演员的执行环境被破坏,很可能是因为导航。当刮取大量内容时
我用的是木偶演员 当我刮刀较少的追随者/用户,它的工作原理就像16k,20k。但当我试图通过以下错误删除拥有更多追随者的用户时: 木偶演员执行上下文被破坏,很可能是因为导航的缘故 这是我的密码Web scraping 木偶演员的执行环境被破坏,很可能是因为导航。当刮取大量内容时,web-scraping,puppeteer,Web Scraping,Puppeteer,我用的是木偶演员 当我刮刀较少的追随者/用户,它的工作原理就像16k,20k。但当我试图通过以下错误删除拥有更多追随者的用户时: 木偶演员执行上下文被破坏,很可能是因为导航的缘故 这是我的密码 const puppeteer = require("puppeteer"); const agent = require("secure-random-user-agent"); const { DateTime } = require(&quo
const puppeteer = require("puppeteer");
const agent = require("secure-random-user-agent");
const { DateTime } = require("luxon");
const argv = require("yargs").argv;
const fsConstants = require("fs").constants;
const fs = require("fs").promises;
const path = require( "path" );
const { performance } = require("perf_hooks");
async function getProfile ( page )
{
console.log('get profile')
try
{
let data = await page.evaluate(() => {
const id = document.querySelector(".ProfileNav");
const username = document.querySelector(
".ProfileHeaderCard > h2 > a > span > b"
);
const fullname = document.querySelector(".ProfileHeaderCard > h1 > a");
const bio = document.querySelector(".ProfileHeaderCard > p");
const location = document.querySelector(
".ProfileHeaderCard-locationText.u-dir"
);
const url = document.querySelector(
".ProfileHeaderCard-urlText.u-dir > a"
);
const avatar = document.querySelector(
".ProfileCanopy-avatar > div > a > img"
);
const background = document.querySelector(
".ProfileCanopy-headerBg > img"
);
const verified = document.querySelector(
".ProfileHeaderCard > h1 > span > a > span"
);
const tweets = document.querySelector(
"#react-root > div > div > div.css-1dbjc4n.r-18u37iz.r-13qz1uu.r-417010 > main > div > div > div > div.css-1dbjc4n.r-yfoy6g.r-18bvks7.r-1ljd8xs.r-13l2t4g.r-1phboty.r-1jgb5lz.r-11wrixw.r-61z16t.r-1ye8kvj.r-13qz1uu.r-184en5c > div > div.css-1dbjc4n.r-aqfbo4.r-yfoy6g.r-1ila09b.r-rull8r.r-qklmqi.r-gtdqiz.r-ipm5af.r-1g40b8q > div.css-1dbjc4n.r-1loqt21.r-136ojw6 > div > div > div > div > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1pi2tsx.r-1777fci > div > div"
);
const following = document.querySelector(
".ProfileNav-item--following > a > span.ProfileNav-value"
);
const followers = document.querySelector(
".ProfileNav-item--followers > a > span.ProfileNav-value"
);
const likes = document.querySelector(
".ProfileNav-item--favorites > a > span.ProfileNav-value"
);
const date = document.querySelector(
".ProfileHeaderCard-joinDateText.js-tooltip.u-dir"
);
return {
id: id ? Number(id.dataset.userId) : null,
username: username ? username.innerText : "",
fullname: fullname ? fullname.innerText : "",
bio: bio ? bio.innerText : "",
location: location ? location.innerText : "",
url: url ? url.title : "",
avatar: avatar ? avatar.src : "",
background: background ? background.src : "",
verified: verified ? true : false,
tweets: tweets ? Number(tweets.dataset.count) : 0,
following: following ? Number(following.dataset.count) : 0,
followers: followers ? Number(followers.dataset.count) : 0,
likes: likes ? Number(likes.dataset.count) : 0,
date: date ? date.title : "",
};
});
if (data.date) {
data.date = DateTime.fromFormat(data.date, "H:m a - d MMM yyyy").toISO();
}
return data;
} catch (e) {
console.log(e);
}
}
async function getDataFromPage ( page )
{
console.log('get data from page ')
try {
const records = [];
// await page.waitForNavigation();
const users = await page.$$(".user-item .username");
for (let user of users) {
const username = await user.evaluate((e) => e.innerText.replace("@", ""));
records.push(username);
}
return records;
} catch (e) {
console.log(e);
}
}
async function scroll ( page, fn )
{
console.log("scroll")
return new Promise(async (resolve, reject) => {
try {
const records = [];
const interval = setInterval(async () => {
try {
const data = await fn(page);
records.push(...data);
const moreButton = await page.$("div.user-list > div > a");
if (!moreButton) {
resolve(records);
return clearInterval(interval);
}
await moreButton.click();
}
catch ( e )
{
reject(e);
}
}, 2000 );
await Promise.all([
page.$( "div.user-list > div > a" ),
page.waitForNavigation( { waitUntil: "networkidle0" } ),
]);
} catch (e) {
reject(e);
}
}
);
// await Promise.race([user1, page.waitForNavigation()]);
}
async function fsExists(path) {
let exists = true;
try {
await fs.access(path, fsConstants.F_OK);
} catch (e) {
exists = false;
}
return exists;
}
async function getUser ( config, depth = 0, blacklist = [] )
{
console.log('get user')
try {
const { browser, page } = config;
if (blacklist.includes(config.username)) {
return;
}
const filename = path.join(config.dataFolder, `${config.username}.json`);
let isCached = await fsExists(filename);
let data = {
username: config.username,
profile: {},
following: [],
followers: [],
};
if (isCached) {
data = JSON.parse(await fs.readFile(filename));
} else {
await page.goto(`https://twitter.com/${config.username}?lang=en`, {
waitUntil: "load",
timeout: 0,
});
data.profile = await getProfile(page);
const userAgent = await browser.userAgent();
await page.setUserAgent(agent());
if (config.following) {
await page.goto(
`https://mobile.twitter.com/${config.username}/following?lang=en`,
{ waitUntil: "load", timeout: 0 }
);
data.following = await scroll(page, await getDataFromPage);
}
if (config.followers) {
await page.goto(
`https://mobile.twitter.com/${config.username}/followers?lang=en`,
{ waitUntil: "load", timeout: 0 }
);
data.followers = await scroll(page, await getDataFromPage);
}
await page.setUserAgent(userAgent);
await fs.writeFile(filename, JSON.stringify(data, null, 2));
}
blacklist.push(data.username);
// console.log( "one",data )
// console.log( "two",data.followers )
// console.log( "three",data.followers.length )
if (data.following.length > 0 && depth < config.depth - 1) {
for (const username of data.following) {
config.username = username;
await getUser(config, depth + 1, blacklist);
}
}
} catch (e) {
throw new Error(e.message);
}
}
async function main ( argv )
{
const t0 = performance.now();
try
{
if ( !argv.username )
{
return;
}
const dataFolder = path.join( process.cwd(), "data" );
const isDataFolderCreated = await fsExists( dataFolder );
if ( !isDataFolderCreated )
{
try
{
await fs.mkdir( dataFolder );
} catch ( err )
{
throw new Error( err.message );
}
}
const config = {
depth: argv.depth || 2,
username: "MuftiKifayatJUI",
followers: argv.followers || false,
following: argv.following || false,
dataFolder: dataFolder,
};
let args = [
"--no-sandbox",
"--disable-setuid-sandbox",
"--enable-features=NetworkService",
];
const browser = await puppeteer.launch( {
defaultViewport: null,
headless: true,
args: args,
browserContext: "default",
} );
const page = ( await browser.pages() )[ 0 ];
await page.setDefaultNavigationTimeout(0);
config.browser = browser;
config.page = page;
await getUser( config );
await browser.close();
} catch ( err )
{
throw err;
}
const t1 = performance.now();
times = t1 - t0;
seconds= (times/1000 % 60)
minutes= (times/1000 / 60)
console.log( `Call to doSomething took ${ times } milliseconds.` );
console.log( `Call to doSomething took ${ seconds } seconds.` );
console.log(`Call to doSomething took ${minutes} minutes.`);
}
main(argv)
.then(process.exit)
.catch((e) => {
console.log(e);
process.exit(-1);
});
// run this node abc.js --username username --depth 2 --following --depth 2 --followers
我用的是木偶演员
当我刮刀较少的追随者/用户,它的工作原理就像16k,20k。但当我试图通过以下错误删除拥有更多追随者的用户时:
木偶演员执行上下文被破坏,很可能是因为导航的缘故
这是我的密码
const puppeteer = require("puppeteer");
const agent = require("secure-random-user-agent");
const { DateTime } = require("luxon");
const argv = require("yargs").argv;
const fsConstants = require("fs").constants;
const fs = require("fs").promises;
const path = require( "path" );
const { performance } = require("perf_hooks");
async function getProfile ( page )
{
console.log('get profile')
try
{
let data = await page.evaluate(() => {
const id = document.querySelector(".ProfileNav");
const username = document.querySelector(
".ProfileHeaderCard > h2 > a > span > b"
);
const fullname = document.querySelector(".ProfileHeaderCard > h1 > a");
const bio = document.querySelector(".ProfileHeaderCard > p");
const location = document.querySelector(
".ProfileHeaderCard-locationText.u-dir"
);
const url = document.querySelector(
".ProfileHeaderCard-urlText.u-dir > a"
);
const avatar = document.querySelector(
".ProfileCanopy-avatar > div > a > img"
);
const background = document.querySelector(
".ProfileCanopy-headerBg > img"
);
const verified = document.querySelector(
".ProfileHeaderCard > h1 > span > a > span"
);
const tweets = document.querySelector(
"#react-root > div > div > div.css-1dbjc4n.r-18u37iz.r-13qz1uu.r-417010 > main > div > div > div > div.css-1dbjc4n.r-yfoy6g.r-18bvks7.r-1ljd8xs.r-13l2t4g.r-1phboty.r-1jgb5lz.r-11wrixw.r-61z16t.r-1ye8kvj.r-13qz1uu.r-184en5c > div > div.css-1dbjc4n.r-aqfbo4.r-yfoy6g.r-1ila09b.r-rull8r.r-qklmqi.r-gtdqiz.r-ipm5af.r-1g40b8q > div.css-1dbjc4n.r-1loqt21.r-136ojw6 > div > div > div > div > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1pi2tsx.r-1777fci > div > div"
);
const following = document.querySelector(
".ProfileNav-item--following > a > span.ProfileNav-value"
);
const followers = document.querySelector(
".ProfileNav-item--followers > a > span.ProfileNav-value"
);
const likes = document.querySelector(
".ProfileNav-item--favorites > a > span.ProfileNav-value"
);
const date = document.querySelector(
".ProfileHeaderCard-joinDateText.js-tooltip.u-dir"
);
return {
id: id ? Number(id.dataset.userId) : null,
username: username ? username.innerText : "",
fullname: fullname ? fullname.innerText : "",
bio: bio ? bio.innerText : "",
location: location ? location.innerText : "",
url: url ? url.title : "",
avatar: avatar ? avatar.src : "",
background: background ? background.src : "",
verified: verified ? true : false,
tweets: tweets ? Number(tweets.dataset.count) : 0,
following: following ? Number(following.dataset.count) : 0,
followers: followers ? Number(followers.dataset.count) : 0,
likes: likes ? Number(likes.dataset.count) : 0,
date: date ? date.title : "",
};
});
if (data.date) {
data.date = DateTime.fromFormat(data.date, "H:m a - d MMM yyyy").toISO();
}
return data;
} catch (e) {
console.log(e);
}
}
async function getDataFromPage ( page )
{
console.log('get data from page ')
try {
const records = [];
// await page.waitForNavigation();
const users = await page.$$(".user-item .username");
for (let user of users) {
const username = await user.evaluate((e) => e.innerText.replace("@", ""));
records.push(username);
}
return records;
} catch (e) {
console.log(e);
}
}
async function scroll ( page, fn )
{
console.log("scroll")
return new Promise(async (resolve, reject) => {
try {
const records = [];
const interval = setInterval(async () => {
try {
const data = await fn(page);
records.push(...data);
const moreButton = await page.$("div.user-list > div > a");
if (!moreButton) {
resolve(records);
return clearInterval(interval);
}
await moreButton.click();
}
catch ( e )
{
reject(e);
}
}, 2000 );
await Promise.all([
page.$( "div.user-list > div > a" ),
page.waitForNavigation( { waitUntil: "networkidle0" } ),
]);
} catch (e) {
reject(e);
}
}
);
// await Promise.race([user1, page.waitForNavigation()]);
}
async function fsExists(path) {
let exists = true;
try {
await fs.access(path, fsConstants.F_OK);
} catch (e) {
exists = false;
}
return exists;
}
async function getUser ( config, depth = 0, blacklist = [] )
{
console.log('get user')
try {
const { browser, page } = config;
if (blacklist.includes(config.username)) {
return;
}
const filename = path.join(config.dataFolder, `${config.username}.json`);
let isCached = await fsExists(filename);
let data = {
username: config.username,
profile: {},
following: [],
followers: [],
};
if (isCached) {
data = JSON.parse(await fs.readFile(filename));
} else {
await page.goto(`https://twitter.com/${config.username}?lang=en`, {
waitUntil: "load",
timeout: 0,
});
data.profile = await getProfile(page);
const userAgent = await browser.userAgent();
await page.setUserAgent(agent());
if (config.following) {
await page.goto(
`https://mobile.twitter.com/${config.username}/following?lang=en`,
{ waitUntil: "load", timeout: 0 }
);
data.following = await scroll(page, await getDataFromPage);
}
if (config.followers) {
await page.goto(
`https://mobile.twitter.com/${config.username}/followers?lang=en`,
{ waitUntil: "load", timeout: 0 }
);
data.followers = await scroll(page, await getDataFromPage);
}
await page.setUserAgent(userAgent);
await fs.writeFile(filename, JSON.stringify(data, null, 2));
}
blacklist.push(data.username);
// console.log( "one",data )
// console.log( "two",data.followers )
// console.log( "three",data.followers.length )
if (data.following.length > 0 && depth < config.depth - 1) {
for (const username of data.following) {
config.username = username;
await getUser(config, depth + 1, blacklist);
}
}
} catch (e) {
throw new Error(e.message);
}
}
async function main ( argv )
{
const t0 = performance.now();
try
{
if ( !argv.username )
{
return;
}
const dataFolder = path.join( process.cwd(), "data" );
const isDataFolderCreated = await fsExists( dataFolder );
if ( !isDataFolderCreated )
{
try
{
await fs.mkdir( dataFolder );
} catch ( err )
{
throw new Error( err.message );
}
}
const config = {
depth: argv.depth || 2,
username: "MuftiKifayatJUI",
followers: argv.followers || false,
following: argv.following || false,
dataFolder: dataFolder,
};
let args = [
"--no-sandbox",
"--disable-setuid-sandbox",
"--enable-features=NetworkService",
];
const browser = await puppeteer.launch( {
defaultViewport: null,
headless: true,
args: args,
browserContext: "default",
} );
const page = ( await browser.pages() )[ 0 ];
await page.setDefaultNavigationTimeout(0);
config.browser = browser;
config.page = page;
await getUser( config );
await browser.close();
} catch ( err )
{
throw err;
}
const t1 = performance.now();
times = t1 - t0;
seconds= (times/1000 % 60)
minutes= (times/1000 / 60)
console.log( `Call to doSomething took ${ times } milliseconds.` );
console.log( `Call to doSomething took ${ seconds } seconds.` );
console.log(`Call to doSomething took ${minutes} minutes.`);
}
main(argv)
.then(process.exit)
.catch((e) => {
console.log(e);
process.exit(-1);
});
// run this node abc.js --username username --depth 2 --following --depth 2 --followers
const puppeter=require(“木偶演员”);
const agent=require(“安全随机用户代理”);
const{DateTime}=require(“luxon”);
常数argv=要求(“码”).argv;
常量fsConstants=需要(“fs”)。常量;
const fs=要求(“fs”).承诺;
常量路径=要求(“路径”);
const{performance}=require(“perf_hooks”);
异步函数getProfile(第页)
{
console.log('get profile')
尝试
{
让数据=等待页面。评估(()=>{
const id=document.querySelector(“.ProfileNav”);
const username=document.querySelector(
“.ProfileHeaderCard>h2>a>span>b”
);
const fullname=document.querySelector(“.ProfileHeaderCard>h1>a”);
const bio=document.querySelector(“.ProfileHeaderCard>p”);
const location=document.querySelector(
“.ProfileHeaderCard locationText.u-dir”
);
const url=document.querySelector(
“.ProfileHeaderCard-urlText.u-dir>a”
);
const avatar=document.querySelector(
“.profiler>avatar>div>a>img”
);
const background=document.querySelector(
“.ProfileCanopy headerBg>img”
);
const verified=document.querySelector(
“.ProfileHeaderCard>h1>span>a>span”
);
const tweets=document.querySelector(
(3)反应根,反应根,反应根,反应根,反应根,反应根,反应根,反应根,反应根,反应根,反应根,div,div,div,反应根,反应根,反应根,反应根,div,div,Di,反应根,反应根,反应根,反应根,反应根,反应根,反应根,反应根,反应根,根,根,根,根,根,根,根,根,根,根,r-13QZ1.r-13QZ1.r-13QZ1.r-13QZ1.r-13QZZ1.r-1.r-1.r-1.r-1.r-1.r-1.r-611.r-Z1.r-1-1-6.r-Z1-6.r-1-1-1-1-1-1-6.r-1-1-1-1-6.r-1-6.1-1-1-1-1-1-6.1 r-qklmqi.r-gtdqiz.r-ipm5af.r-1g40b8q>div.css-1dbjc4n.r-1LOKT21。r-136ojw6>div>div>div>div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1pi2tsx.r-1777fci>div>div“
);
const following=document.querySelector(
“.ProfileNav项--跟随>a>span.ProfileNav值”
);
const followers=document.querySelector(
“.ProfileNav项--跟随者>a>span.ProfileNav值”
);
const likes=document.querySelector(
“.ProfileNav项--收藏夹>a>span.ProfileNav值”
);
const date=document.querySelector(
“.ProfileHeaderCard joinDateText.js tooltip.u-dir”
);
返回{
id:id?编号(id.dataset.userId):null,
用户名:username?username.innerText:“”,
全名:全名?全名。内部文本:“”,
bio:bio?bio.innerText:“”,
位置:位置?位置。内部文本:“”,
url:url?url.title:“”,
阿凡达:阿凡达?avatar.src:“”,
背景:背景?background.src:“”,
已验证:已验证?正确:错误,
tweets:tweets?数量(tweets.dataset.count):0,
following:following?编号(following.dataset.count):0,
关注者:关注者?数量(followers.dataset.count):0,
喜欢:喜欢?数量(喜欢.数据集.计数):0,
日期:日期?日期。标题:“”,
};
});
如果(数据日期){
data.date=DateTime.fromFormat(data.date,“H:m a-d MMM yyyy”).toISO();
}
返回数据;
}捕获(e){
控制台日志(e);
}
}
异步函数getDataFromPage(第页)
{
console.log('从页面获取数据')
试一试{
常量记录=[];
//等待page.waitForNavigation();
const users=等待页面。$$(“.user item.username”);
for(让用户对用户){
const username=wait user.evaluate((e)=>e.innerText.replace(“@”和“);
记录。推送(用户名);
}
退货记录;
}捕获(e){
控制台日志(e);
}
}
异步功能滚动(第页,fn)
{
console.log(“滚动”)
返回新承诺(异步(解析、拒绝)=>{
试一试{
常量记录=[];
const interval=setInterval(异步()=>{
试一试{
常数数据=等待fn(第页);
记录。推送(…数据);
const moreButton=等待页面。$(“div.user-list>div>a”);
如果(!moreButton){
解决(记录);
返回clearInterval(间隔);
}
等待更多按钮。单击();
}
捕获(e)
{
拒绝(e);
}
}, 2000 );
等待承诺([
第.$页(“div.user-list>div>a”),
page.waitForNavigation({waitUntil:“networkidle0”}),
]);
}捕获(e){
拒绝(e);
}
}
);
//wait Promise.race([user1,page.waitForNavigation()]);
}
异步函数fsExists(路径){
让存在=真实;
试一试{
等待fs.access(路径,fsConstants.F_OK);
}