Javascript 我如何解决这个分页网页与木偶刮?正确地定位节点,但在for循环中单击它会不断抛出错误
因此,我试图从一个网站(这是一个有公共信息的网站)上搜集一些基本的公司数据。我正在使用node和puppeter来实现这一点。下面的工作代码成功地抓取了第一页,但是当点击第二页时,我得到了Javascript 我如何解决这个分页网页与木偶刮?正确地定位节点,但在for循环中单击它会不断抛出错误,javascript,node.js,web-scraping,puppeteer,Javascript,Node.js,Web Scraping,Puppeteer,因此,我试图从一个网站(这是一个有公共信息的网站)上搜集一些基本的公司数据。我正在使用node和puppeter来实现这一点。下面的工作代码成功地抓取了第一页,但是当点击第二页时,我得到了错误:执行上下文被破坏,很可能是因为导航。,现在我得到一个错误,说我的函数不是函数 有谁能指出我做错了什么,以及刮去所有28页的最佳方法是什么 成功地刮取第一页 const puppeteer = require("puppeteer"); // var fs = require("fs"); const fs
错误:执行上下文被破坏,很可能是因为导航。
,现在我得到一个错误,说我的函数不是函数
有谁能指出我做错了什么,以及刮去所有28页的最佳方法是什么
成功地刮取第一页
const puppeteer = require("puppeteer");
// var fs = require("fs");
const fsp = require("fs").promises;
const fs = require("fs");
let pageCount = 1; // 21 full pages of content
let companyRows;
function delay(time) {
return new Promise(function(resolve) {
setTimeout(resolve, time);
});
}
(async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
page.on("console", msg => {
for (let i = 0; i < msg.args().length; ++i)
console.log(`${i}: ${msg.args()[i]}`);
});
await page.goto(
"http://dpsstnet.state.or.us/IRIS_PublicInquiry/PrivateSecurity/SMSAgcyTable.aspx"
);
//Clicks a tag by id
await page.click("#btnNaLL");
await page.waitFor(1000);
const result = await page.evaluate(() => {
let row = document.querySelectorAll("tr");
let companyData = [];
row.forEach(el => {
let company = {};
let count = 0;
for (data of el.cells) {
switch (count) {
case 0:
company.name = data.innerText.trim();
case 1:
company.primaryContact = data.innerText.trim();
case 2:
company.address = data.innerText.trim();
case 3:
company.phone = data.innerText.trim();
case 4:
company.county = data.innerText.trim();
case 5:
company.status = data.innerText.trim();
default:
company.default = data.innerText.trim();
}
count++;
companyData.push(company);
//GOT SOME STUUFFFF
console.log(JSON.stringify(companyData));
}
});
// await page.waitFor(3000);
// await fsp.writeFile("./json/file.json", result.stringify());
companyData = companyData.filter((a, b) => companyData.indexOf(a) === b);
companyData = companyData.filter(e => e.status === "Active");
return companyData;
});
// fsp.writeFile(
// "./json/file.json",
// JSON.stringify(companyData, null, 2),
// err =>
// err
// ? console.error("Data not written!", err)
// : console.log("Data Written")
// );
await fsp.writeFile(
"./json/file.json",
JSON.stringify(result, null, 2),
err =>
err
? console.error("Data not written!", err)
: console.log("Data Written")
);
await page.screenshot({
path: "./screenshots/page1.png"
});
await page.pdf({ path: "./pdfs/page1.pdf" });
await browser.close();
return result;
} catch (error) {
console.log(error);
}
})();
const puppeter=require(“木偶演员”);
//var fs=要求(“fs”);
const fsp=要求(“fs”)承诺;
常数fs=要求(“fs”);
让pageCount=1;//21整页内容
让同伴来;
功能延迟(时间){
返回新承诺(函数(解析){
setTimeout(解析,时间);
});
}
(异步()=>{
试一试{
const browser=wait puppeter.launch();
const page=wait browser.newPage();
第页(“控制台”,msg=>{
for(设i=0;i{
let row=document.querySelectorAll(“tr”);
让companyData=[];
row.forEach(el=>{
让公司={};
让计数=0;
用于(el.单元格的数据){
开关(计数){
案例0:
company.name=data.innerText.trim();
案例1:
company.primaryContact=data.innerText.trim();
案例2:
company.address=data.innerText.trim();
案例3:
company.phone=data.innerText.trim();
案例4:
company.county=data.innerText.trim();
案例5:
company.status=data.innerText.trim();
违约:
company.default=data.innerText.trim();
}
计数++;
companyData.push(公司);
//有一些笨蛋
log(JSON.stringify(companyData));
}
});
//等待页面。等待(3000);
//等待fsp.writeFile(“./json/file.json”,result.stringify());
companyData=companyData.filter((a,b)=>companyData.indexOf(a)==b);
companyData=companyData.filter(e=>e.status==“活动”);
返回公司数据;
});
//写文件(
//“/json/file.json”,
//stringify(companyData,null,2),
//错误=>
//错误
//?控制台错误(“数据未写入!”,错误)
//:console.log(“数据写入”)
// );
等待fsp.writeFile(
“/json/file.json”,
stringify(结果,null,2),
错误=>
犯错误
?控制台错误(“数据未写入!”,错误)
:console.log(“数据写入”)
);
等待页面。屏幕截图({
路径:“./screenshots/page1.png”
});
等待page.pdf({路径:“./pdfs/page1.pdf”});
等待浏览器关闭();
返回结果;
}捕获(错误){
console.log(错误);
}
})();
重写代码以浏览页面(不工作)
目前,我在运行此程序时得到“clickLink不是一个函数”
const puppeteer = require("puppeteer");
const fsp = require("fs").promises;
const fs = require("fs");
let pageCount = 1; // 21 full pages of content
let companyRows;
let pageToClick;
function delay(time) {
return new Promise(function(resolve) {
setTimeout(resolve, time);
});
}
(async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
const clickLink = link => {
page.click(link);
page.waitFor(1000);
};
page.on("console", msg => {
for (let i = 0; i < msg.args().length; ++i)
console.log(`${i}: ${msg.args()[i]}`);
});
await page.goto(
"http://dpsstnet.state.or.us/IRIS_PublicInquiry/PrivateSecurity/SMSAgcyTable.aspx"
);
//Clicks a tag by id
await page.click("#btnNaLL");
await page.waitFor(1000);
let fullResult = [];
let result;
result = await page.evaluate(
(fullResult, clickLink => {
let row = document.querySelectorAll("tr");
let companyData = [];
let pageList = document.querySelectorAll("b > a");
for (let step = 0; step < 2; step++) {
row.forEach(el => {
let company = {};
let count = 0;
for (data of el.cells) {
switch (count) {
case 0:
company.name = data.innerText.trim();
case 1:
company.primaryContact = data.innerText.trim();
case 2:
company.address = data.innerText.trim();
case 3:
company.phone = data.innerText.trim();
case 4:
company.county = data.innerText.trim();
case 5:
company.status = data.innerText.trim();
default:
company.default = data.innerText.trim();
}
count++;
companyData.push(company);
//GOT SOME STUUFFFF
console.log(JSON.stringify(companyData));
}
});
companyData = companyData.filter(
(a, b) => companyData.indexOf(a) === b
);
companyData = companyData.filter(e => e.status === "Active");
fullResult = [...fullResult, ...companyData];
// console.log(JSON.stringify(pageList[step].innerText));
clickLink(pageList[step]);
}
return fullResult;
},
fullResult,
clickLink
);
await fsp.writeFile(
"./json/file.json",
JSON.stringify(result, null, 2),
err =>
err
? console.error("Data not written!", err)
: console.log("Data Written")
);
//*
await page.screenshot({
path: "./screenshots/page1.png"
});
await page.pdf({ path: "./pdfs/page1.pdf" });
await browser.close();
return result;
} catch (error) {
console.log(error);
}
})();
const puppeter=require(“木偶演员”);
const fsp=要求(“fs”)承诺;
常数fs=要求(“fs”);
让pageCount=1;//21整页内容
让同伴来;
让pageToClick;
功能延迟(时间){
返回新承诺(函数(解析){
setTimeout(解析,时间);
});
}
(异步()=>{
试一试{
const browser=wait puppeter.launch();
const page=wait browser.newPage();
const clickLink=link=>{
页面。单击(链接);
第页等待(1000);
};
第页(“控制台”,msg=>{
for(设i=0;i{
let row=document.querySelectorAll(“tr”);
让companyData=[];
让pageList=document.querySelectorAll(“b>a”);
对于(步骤=0;步骤<2;步骤++){
row.forEach(el=>{
让公司={};
让计数=0;
用于(el.单元格的数据){
开关(计数){
案例0:
company.name=data.innerText.trim();
案例1:
company.primaryContact=data.innerText.trim();
案例2:
company.address=data.innerText.trim();
案例3:
company.phone=data.innerText.trim();
案例4:
company.county=data.innerText.trim();
案例5:
company.status=data.innerText.trim();
违约:
company.default=data.innerText.trim();
}
计数++;
companyData.push(公司);
//有一些笨蛋
log(JSON.stringify(companyData));
}
});
companyData=companyData.filter(
(a,b)=>companyData.indexOf(a)==b
);
companyData=companyData.filter(e=>e.status==“活动”);
fullResult=[…fullResult,…companyData];
//log(JSON.stringify(pageList[step].innerText));
单击链接(页面列表[步骤]);
}
返回完整结果;
},