Web scraping 木偶演员的执行环境被破坏,很可能是因为导航。当刮取大量内容时

Web scraping 木偶演员的执行环境被破坏,很可能是因为导航。当刮取大量内容时,web-scraping,puppeteer,Web Scraping,Puppeteer,我用的是木偶演员 当我刮刀较少的追随者/用户,它的工作原理就像16k,20k。但当我试图通过以下错误删除拥有更多追随者的用户时: 木偶演员执行上下文被破坏,很可能是因为导航的缘故 这是我的密码 const puppeteer = require("puppeteer"); const agent = require("secure-random-user-agent"); const { DateTime } = require(&quo

我用的是木偶演员

当我刮刀较少的追随者/用户,它的工作原理就像16k,20k。但当我试图通过以下错误删除拥有更多追随者的用户时:

木偶演员执行上下文被破坏,很可能是因为导航的缘故

这是我的密码

 const puppeteer = require("puppeteer");
    const agent = require("secure-random-user-agent");
    const { DateTime } = require("luxon");
    const argv = require("yargs").argv;
    const fsConstants = require("fs").constants;
    const fs = require("fs").promises;
    const path = require( "path" );
    const { performance } = require("perf_hooks");
    
    async function getProfile ( page )
    {
    
      console.log('get profile')
    
  
      try
      {
        
    
        let data = await page.evaluate(() => {
          const id = document.querySelector(".ProfileNav");
          const username = document.querySelector(
            ".ProfileHeaderCard > h2 > a > span > b"
          );
          const fullname = document.querySelector(".ProfileHeaderCard > h1 > a");
          const bio = document.querySelector(".ProfileHeaderCard > p");
          const location = document.querySelector(
            ".ProfileHeaderCard-locationText.u-dir"
          );
          const url = document.querySelector(
            ".ProfileHeaderCard-urlText.u-dir > a"
          );
          const avatar = document.querySelector(
            ".ProfileCanopy-avatar > div > a > img"
          );
          const background = document.querySelector(
            ".ProfileCanopy-headerBg > img"
          );
          const verified = document.querySelector(
            ".ProfileHeaderCard > h1 > span > a > span"
          );
          const tweets = document.querySelector(
            "#react-root > div > div > div.css-1dbjc4n.r-18u37iz.r-13qz1uu.r-417010 > main > div > div > div > div.css-1dbjc4n.r-yfoy6g.r-18bvks7.r-1ljd8xs.r-13l2t4g.r-1phboty.r-1jgb5lz.r-11wrixw.r-61z16t.r-1ye8kvj.r-13qz1uu.r-184en5c > div > div.css-1dbjc4n.r-aqfbo4.r-yfoy6g.r-1ila09b.r-rull8r.r-qklmqi.r-gtdqiz.r-ipm5af.r-1g40b8q > div.css-1dbjc4n.r-1loqt21.r-136ojw6 > div > div > div > div > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1pi2tsx.r-1777fci > div > div"
          );
          const following = document.querySelector(
            ".ProfileNav-item--following > a > span.ProfileNav-value"
          );
          const followers = document.querySelector(
            ".ProfileNav-item--followers > a > span.ProfileNav-value"
          );
          const likes = document.querySelector(
            ".ProfileNav-item--favorites > a > span.ProfileNav-value"
          );
          const date = document.querySelector(
            ".ProfileHeaderCard-joinDateText.js-tooltip.u-dir"
          );
    
          return {
            id: id ? Number(id.dataset.userId) : null,
            username: username ? username.innerText : "",
            fullname: fullname ? fullname.innerText : "",
            bio: bio ? bio.innerText : "",
            location: location ? location.innerText : "",
            url: url ? url.title : "",
            avatar: avatar ? avatar.src : "",
            background: background ? background.src : "",
            verified: verified ? true : false,
            tweets: tweets ? Number(tweets.dataset.count) : 0,
            following: following ? Number(following.dataset.count) : 0,
            followers: followers ? Number(followers.dataset.count) : 0,
            likes: likes ? Number(likes.dataset.count) : 0,
            date: date ? date.title : "",
          };
        });
    
        if (data.date) {
          data.date = DateTime.fromFormat(data.date, "H:m a - d MMM yyyy").toISO();
        }
    
        return data;
      } catch (e) {
        console.log(e);
      }
    }
    
    async function getDataFromPage ( page )
    {
      
      console.log('get data from page ')
    
      try {
        const records = [];
        // await page.waitForNavigation();
        const users = await page.$$(".user-item .username");
    
        for (let user of users) {
          const username = await user.evaluate((e) => e.innerText.replace("@", ""));
    
          records.push(username);
        }
    
        return records;
      } catch (e) {
        console.log(e);
      }
    }
    
    async function scroll ( page, fn )
    {
      console.log("scroll")
    
       return  new Promise(async (resolve, reject) => {
        try {
          const records = [];
          const interval = setInterval(async () => {
            try {
              const data = await fn(page);
              records.push(...data);
              const moreButton = await page.$("div.user-list > div > a");
    
              if (!moreButton) {
                resolve(records);
                return clearInterval(interval);
              }
    
              await moreButton.click();
    
            }
                    
            catch ( e )
            {
              reject(e);
            }
          }, 2000 );
    
    await Promise.all([
      page.$( "div.user-list > div > a" ),
      page.waitForNavigation( { waitUntil: "networkidle0" } ),
    ]);
    
        } catch (e) {
          reject(e);
        }
    
       }
    
       );
      
        //  await Promise.race([user1, page.waitForNavigation()]);
    
    
    }
    
    
    
    async function fsExists(path) {
      let exists = true;
    
      try {
        await fs.access(path, fsConstants.F_OK);
      } catch (e) {
        exists = false;
      }
    
      return exists;
    }
    
    async function getUser ( config, depth = 0, blacklist = [] )
    {
      
      console.log('get user')
      try {
        const { browser, page } = config;
    
        if (blacklist.includes(config.username)) {
          return;
        }
    
        const filename = path.join(config.dataFolder, `${config.username}.json`);
        let isCached = await fsExists(filename);
    
        let data = {
          username: config.username,
          profile: {},
          following: [],
          followers: [],
        };
    
        if (isCached) {
          data = JSON.parse(await fs.readFile(filename));
        } else {
          await page.goto(`https://twitter.com/${config.username}?lang=en`, {
            waitUntil: "load",
            timeout: 0,
          });
          data.profile = await getProfile(page);
    
          const userAgent = await browser.userAgent();
          await page.setUserAgent(agent());
    
          if (config.following) {
            await page.goto(
              `https://mobile.twitter.com/${config.username}/following?lang=en`,
              { waitUntil: "load", timeout: 0 }
            );
            data.following = await scroll(page, await getDataFromPage);
          }
    
          if (config.followers) {
            await page.goto(
              `https://mobile.twitter.com/${config.username}/followers?lang=en`,
              { waitUntil: "load", timeout: 0 }
            );
            data.followers = await scroll(page, await getDataFromPage);
          }
    
          await page.setUserAgent(userAgent);
          await fs.writeFile(filename, JSON.stringify(data, null, 2));
        }
    
        blacklist.push(data.username);
    
        // console.log( "one",data )
        // console.log( "two",data.followers )
        // console.log( "three",data.followers.length )
        
    
        if (data.following.length > 0 && depth < config.depth - 1) {
          for (const username of data.following) {
            config.username = username;
            await getUser(config, depth + 1, blacklist);
          }
          
        }
      } catch (e) {
        throw new Error(e.message);
      }
    }
    
    async function main ( argv )
    {
      const t0 = performance.now();
    
      try
      {
        if ( !argv.username )
        {
          return;
        }
    
        const dataFolder = path.join( process.cwd(), "data" );
        const isDataFolderCreated = await fsExists( dataFolder );
    
        if ( !isDataFolderCreated )
        {
          try
          {
            await fs.mkdir( dataFolder );
          } catch ( err )
          {
            throw new Error( err.message );
          }
        }
    
    
        const config = {
          depth: argv.depth || 2,
          username: "MuftiKifayatJUI",
          followers: argv.followers || false,
          following: argv.following || false,
          dataFolder: dataFolder,
        };
    
        let args = [
          "--no-sandbox",
          "--disable-setuid-sandbox",
          "--enable-features=NetworkService",
        ];
    
        const browser = await puppeteer.launch( {
          defaultViewport: null,
          headless: true,
          args: args,
          browserContext: "default",
        } );
    
        const page = ( await browser.pages() )[ 0 ];
        await page.setDefaultNavigationTimeout(0); 
    
        config.browser = browser;
        config.page = page;
    
    
    
        await getUser( config );
    
    
        await browser.close();
      } catch ( err )
      {
        throw err;
      }
    
      const t1 = performance.now();
      times = t1 - t0;
      seconds= (times/1000 % 60) 
      minutes= (times/1000 / 60) 
      console.log( `Call to doSomething took ${ times } milliseconds.` );
      console.log( `Call to doSomething took ${ seconds } seconds.` );
      console.log(`Call to doSomething took ${minutes} minutes.`);
    }
    
    main(argv)
      .then(process.exit)
      .catch((e) => {
        console.log(e);
        process.exit(-1);
      });
    
    
    
    
      // run this node abc.js --username username --depth 2 --following --depth 2 --followers 
我用的是木偶演员

当我刮刀较少的追随者/用户,它的工作原理就像16k,20k。但当我试图通过以下错误删除拥有更多追随者的用户时:

木偶演员执行上下文被破坏,很可能是因为导航的缘故

这是我的密码

 const puppeteer = require("puppeteer");
    const agent = require("secure-random-user-agent");
    const { DateTime } = require("luxon");
    const argv = require("yargs").argv;
    const fsConstants = require("fs").constants;
    const fs = require("fs").promises;
    const path = require( "path" );
    const { performance } = require("perf_hooks");
    
    async function getProfile ( page )
    {
    
      console.log('get profile')
    
  
      try
      {
        
    
        let data = await page.evaluate(() => {
          const id = document.querySelector(".ProfileNav");
          const username = document.querySelector(
            ".ProfileHeaderCard > h2 > a > span > b"
          );
          const fullname = document.querySelector(".ProfileHeaderCard > h1 > a");
          const bio = document.querySelector(".ProfileHeaderCard > p");
          const location = document.querySelector(
            ".ProfileHeaderCard-locationText.u-dir"
          );
          const url = document.querySelector(
            ".ProfileHeaderCard-urlText.u-dir > a"
          );
          const avatar = document.querySelector(
            ".ProfileCanopy-avatar > div > a > img"
          );
          const background = document.querySelector(
            ".ProfileCanopy-headerBg > img"
          );
          const verified = document.querySelector(
            ".ProfileHeaderCard > h1 > span > a > span"
          );
          const tweets = document.querySelector(
            "#react-root > div > div > div.css-1dbjc4n.r-18u37iz.r-13qz1uu.r-417010 > main > div > div > div > div.css-1dbjc4n.r-yfoy6g.r-18bvks7.r-1ljd8xs.r-13l2t4g.r-1phboty.r-1jgb5lz.r-11wrixw.r-61z16t.r-1ye8kvj.r-13qz1uu.r-184en5c > div > div.css-1dbjc4n.r-aqfbo4.r-yfoy6g.r-1ila09b.r-rull8r.r-qklmqi.r-gtdqiz.r-ipm5af.r-1g40b8q > div.css-1dbjc4n.r-1loqt21.r-136ojw6 > div > div > div > div > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1pi2tsx.r-1777fci > div > div"
          );
          const following = document.querySelector(
            ".ProfileNav-item--following > a > span.ProfileNav-value"
          );
          const followers = document.querySelector(
            ".ProfileNav-item--followers > a > span.ProfileNav-value"
          );
          const likes = document.querySelector(
            ".ProfileNav-item--favorites > a > span.ProfileNav-value"
          );
          const date = document.querySelector(
            ".ProfileHeaderCard-joinDateText.js-tooltip.u-dir"
          );
    
          return {
            id: id ? Number(id.dataset.userId) : null,
            username: username ? username.innerText : "",
            fullname: fullname ? fullname.innerText : "",
            bio: bio ? bio.innerText : "",
            location: location ? location.innerText : "",
            url: url ? url.title : "",
            avatar: avatar ? avatar.src : "",
            background: background ? background.src : "",
            verified: verified ? true : false,
            tweets: tweets ? Number(tweets.dataset.count) : 0,
            following: following ? Number(following.dataset.count) : 0,
            followers: followers ? Number(followers.dataset.count) : 0,
            likes: likes ? Number(likes.dataset.count) : 0,
            date: date ? date.title : "",
          };
        });
    
        if (data.date) {
          data.date = DateTime.fromFormat(data.date, "H:m a - d MMM yyyy").toISO();
        }
    
        return data;
      } catch (e) {
        console.log(e);
      }
    }
    
    async function getDataFromPage ( page )
    {
      
      console.log('get data from page ')
    
      try {
        const records = [];
        // await page.waitForNavigation();
        const users = await page.$$(".user-item .username");
    
        for (let user of users) {
          const username = await user.evaluate((e) => e.innerText.replace("@", ""));
    
          records.push(username);
        }
    
        return records;
      } catch (e) {
        console.log(e);
      }
    }
    
    async function scroll ( page, fn )
    {
      console.log("scroll")
    
       return  new Promise(async (resolve, reject) => {
        try {
          const records = [];
          const interval = setInterval(async () => {
            try {
              const data = await fn(page);
              records.push(...data);
              const moreButton = await page.$("div.user-list > div > a");
    
              if (!moreButton) {
                resolve(records);
                return clearInterval(interval);
              }
    
              await moreButton.click();
    
            }
                    
            catch ( e )
            {
              reject(e);
            }
          }, 2000 );
    
    await Promise.all([
      page.$( "div.user-list > div > a" ),
      page.waitForNavigation( { waitUntil: "networkidle0" } ),
    ]);
    
        } catch (e) {
          reject(e);
        }
    
       }
    
       );
      
        //  await Promise.race([user1, page.waitForNavigation()]);
    
    
    }
    
    
    
    async function fsExists(path) {
      let exists = true;
    
      try {
        await fs.access(path, fsConstants.F_OK);
      } catch (e) {
        exists = false;
      }
    
      return exists;
    }
    
    async function getUser ( config, depth = 0, blacklist = [] )
    {
      
      console.log('get user')
      try {
        const { browser, page } = config;
    
        if (blacklist.includes(config.username)) {
          return;
        }
    
        const filename = path.join(config.dataFolder, `${config.username}.json`);
        let isCached = await fsExists(filename);
    
        let data = {
          username: config.username,
          profile: {},
          following: [],
          followers: [],
        };
    
        if (isCached) {
          data = JSON.parse(await fs.readFile(filename));
        } else {
          await page.goto(`https://twitter.com/${config.username}?lang=en`, {
            waitUntil: "load",
            timeout: 0,
          });
          data.profile = await getProfile(page);
    
          const userAgent = await browser.userAgent();
          await page.setUserAgent(agent());
    
          if (config.following) {
            await page.goto(
              `https://mobile.twitter.com/${config.username}/following?lang=en`,
              { waitUntil: "load", timeout: 0 }
            );
            data.following = await scroll(page, await getDataFromPage);
          }
    
          if (config.followers) {
            await page.goto(
              `https://mobile.twitter.com/${config.username}/followers?lang=en`,
              { waitUntil: "load", timeout: 0 }
            );
            data.followers = await scroll(page, await getDataFromPage);
          }
    
          await page.setUserAgent(userAgent);
          await fs.writeFile(filename, JSON.stringify(data, null, 2));
        }
    
        blacklist.push(data.username);
    
        // console.log( "one",data )
        // console.log( "two",data.followers )
        // console.log( "three",data.followers.length )
        
    
        if (data.following.length > 0 && depth < config.depth - 1) {
          for (const username of data.following) {
            config.username = username;
            await getUser(config, depth + 1, blacklist);
          }
          
        }
      } catch (e) {
        throw new Error(e.message);
      }
    }
    
    async function main ( argv )
    {
      const t0 = performance.now();
    
      try
      {
        if ( !argv.username )
        {
          return;
        }
    
        const dataFolder = path.join( process.cwd(), "data" );
        const isDataFolderCreated = await fsExists( dataFolder );
    
        if ( !isDataFolderCreated )
        {
          try
          {
            await fs.mkdir( dataFolder );
          } catch ( err )
          {
            throw new Error( err.message );
          }
        }
    
    
        const config = {
          depth: argv.depth || 2,
          username: "MuftiKifayatJUI",
          followers: argv.followers || false,
          following: argv.following || false,
          dataFolder: dataFolder,
        };
    
        let args = [
          "--no-sandbox",
          "--disable-setuid-sandbox",
          "--enable-features=NetworkService",
        ];
    
        const browser = await puppeteer.launch( {
          defaultViewport: null,
          headless: true,
          args: args,
          browserContext: "default",
        } );
    
        const page = ( await browser.pages() )[ 0 ];
        await page.setDefaultNavigationTimeout(0); 
    
        config.browser = browser;
        config.page = page;
    
    
    
        await getUser( config );
    
    
        await browser.close();
      } catch ( err )
      {
        throw err;
      }
    
      const t1 = performance.now();
      times = t1 - t0;
      seconds= (times/1000 % 60) 
      minutes= (times/1000 / 60) 
      console.log( `Call to doSomething took ${ times } milliseconds.` );
      console.log( `Call to doSomething took ${ seconds } seconds.` );
      console.log(`Call to doSomething took ${minutes} minutes.`);
    }
    
    main(argv)
      .then(process.exit)
      .catch((e) => {
        console.log(e);
        process.exit(-1);
      });
    
    
    
    
      // run this node abc.js --username username --depth 2 --following --depth 2 --followers 
const puppeter=require(“木偶演员”);
const agent=require(“安全随机用户代理”);
const{DateTime}=require(“luxon”);
常数argv=要求(“码”).argv;
常量fsConstants=需要(“fs”)。常量;
const fs=要求(“fs”).承诺;
常量路径=要求(“路径”);
const{performance}=require(“perf_hooks”);
异步函数getProfile(第页)
{
console.log('get profile')
尝试
{
让数据=等待页面。评估(()=>{
const id=document.querySelector(“.ProfileNav”);
const username=document.querySelector(
“.ProfileHeaderCard>h2>a>span>b”
);
const fullname=document.querySelector(“.ProfileHeaderCard>h1>a”);
const bio=document.querySelector(“.ProfileHeaderCard>p”);
const location=document.querySelector(
“.ProfileHeaderCard locationText.u-dir”
);
const url=document.querySelector(
“.ProfileHeaderCard-urlText.u-dir>a”
);
const avatar=document.querySelector(
“.profiler>avatar>div>a>img”
);
const background=document.querySelector(
“.ProfileCanopy headerBg>img”
);
const verified=document.querySelector(
“.ProfileHeaderCard>h1>span>a>span”
);
const tweets=document.querySelector(
(3)反应根,反应根,反应根,反应根,反应根,反应根,反应根,反应根,反应根,反应根,反应根,div,div,div,反应根,反应根,反应根,反应根,div,div,Di,反应根,反应根,反应根,反应根,反应根,反应根,反应根,反应根,反应根,根,根,根,根,根,根,根,根,根,根,r-13QZ1.r-13QZ1.r-13QZ1.r-13QZ1.r-13QZZ1.r-1.r-1.r-1.r-1.r-1.r-1.r-611.r-Z1.r-1-1-6.r-Z1-6.r-1-1-1-1-1-1-6.r-1-1-1-1-6.r-1-6.1-1-1-1-1-1-6.1 r-qklmqi.r-gtdqiz.r-ipm5af.r-1g40b8q>div.css-1dbjc4n.r-1LOKT21。r-136ojw6>div>div>div>div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1pi2tsx.r-1777fci>div>div“
);
const following=document.querySelector(
“.ProfileNav项--跟随>a>span.ProfileNav值”
);
const followers=document.querySelector(
“.ProfileNav项--跟随者>a>span.ProfileNav值”
);
const likes=document.querySelector(
“.ProfileNav项--收藏夹>a>span.ProfileNav值”
);
const date=document.querySelector(
“.ProfileHeaderCard joinDateText.js tooltip.u-dir”
);
返回{
id:id?编号(id.dataset.userId):null,
用户名:username?username.innerText:“”,
全名:全名?全名。内部文本:“”,
bio:bio?bio.innerText:“”,
位置:位置?位置。内部文本:“”,
url:url?url.title:“”,
阿凡达:阿凡达?avatar.src:“”,
背景:背景?background.src:“”,
已验证:已验证?正确:错误,
tweets:tweets?数量(tweets.dataset.count):0,
following:following?编号(following.dataset.count):0,
关注者:关注者?数量(followers.dataset.count):0,
喜欢:喜欢?数量(喜欢.数据集.计数):0,
日期:日期?日期。标题:“”,
};
});
如果(数据日期){
data.date=DateTime.fromFormat(data.date,“H:m a-d MMM yyyy”).toISO();
}
返回数据;
}捕获(e){
控制台日志(e);
}
}
异步函数getDataFromPage(第页)
{
console.log('从页面获取数据')
试一试{
常量记录=[];
//等待page.waitForNavigation();
const users=等待页面。$$(“.user item.username”);
for(让用户对用户){
const username=wait user.evaluate((e)=>e.innerText.replace(“@”和“);
记录。推送(用户名);
}
退货记录;
}捕获(e){
控制台日志(e);
}
}
异步功能滚动(第页,fn)
{
console.log(“滚动”)
返回新承诺(异步(解析、拒绝)=>{
试一试{
常量记录=[];
const interval=setInterval(异步()=>{
试一试{
常数数据=等待fn(第页);
记录。推送(…数据);
const moreButton=等待页面。$(“div.user-list>div>a”);
如果(!moreButton){
解决(记录);
返回clearInterval(间隔);
}
等待更多按钮。单击();
}
捕获(e)
{
拒绝(e);
}
}, 2000 );
等待承诺([
第.$页(“div.user-list>div>a”),
page.waitForNavigation({waitUntil:“networkidle0”}),
]);
}捕获(e){
拒绝(e);
}
}
);
//wait Promise.race([user1,page.waitForNavigation()]);
}
异步函数fsExists(路径){
让存在=真实;
试一试{
等待fs.access(路径,fsConstants.F_OK);
}