使用javascript获取页面上的所有元素
我编写了一个网页爬虫程序,该程序使用使用javascript获取页面上的所有元素,javascript,node.js,ecmascript-6,puppeteer,Javascript,Node.js,Ecmascript 6,Puppeteer,我编写了一个网页爬虫程序,该程序使用puppeter成功地抓取了一个网页并抓取了一个职位发布的数据。我现在正尝试使用.opportunity类获取页面上的所有元素,然后将其传递到一个函数中,该函数将获取特定职位公告的数据。但是,GetElementsByCassName列表返回的是空对象吗 const puppeteer = require('puppeteer'); const fs = require('fs'); async function crawlOpo(opo) { c
puppeter
成功地抓取了一个网页并抓取了一个职位发布的数据。我现在正尝试使用.opportunity
类获取页面上的所有元素,然后将其传递到一个函数中,该函数将获取特定职位公告的数据。但是,GetElementsByCassName
列表返回的是空对象吗
const puppeteer = require('puppeteer');
const fs = require('fs');
async function crawlOpo(opo) {
const opportunity = {
title: '',
desc: '',
category: '',
reqName: '',
hours: '',
postingDate: '',
locationName: '',
address: ''
};
const title = await page.evaluate(() => {
try {
return opo.querySelector('.row .col-lg-20 h3 a').innerText
} catch(err) {
return err
}
});
const desc = await page.evaluate(() => {
try {
return opo.querySelector('.hidden-xs.paragraph').innerText
} catch(err) {
return err
}
});
const category = await page.evaluate(() => {
try {
return opo.querySelector('.row.paragraph .col-sm-18 .row .col-md-8 .label-with-icon span').innerText
} catch(err) {
return err
}
});
const reqName = await page.evaluate(() => {
try {
return opo.querySelector('.row.paragraph .col-sm-18 .row .col-md-8:nth-of-type(2) .label-with-icon span').innerText
} catch(err) {
return err
}
});
const hours = await page.evaluate(() => {
try {
return opo.querySelector('.row.paragraph .col-sm-18 .row .col-md-8:nth-of-type(3) .label-with-icon span').innerText
} catch(err) {
return err
}
});
const postingDate = await page.evaluate(() => {
try {
return opo.querySelector('.row .col-lg-4 h3 small').innerText
} catch(err) {
return err
}
});
const locationName = await page.evaluate(() => {
try {
return opo.querySelector('.row.paragraph:nth-of-type(2) .col-lg-20 div div candidate-physical-location address span:nth-of-type(2) span').innerText
} catch(err) {
return err
}
});
opportunity.title = title;
opportunity.desc = desc;
opportunity.category = category;
opportunity.reqName = reqName;
opportunity.hours = hours;
opportunity.postingDate = postingDate;
opportunity.locationName = locationName;
opportunities.push(opportunity)
console.log(opportunities);
browser.close();
}
(async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
const opportunities = [];
let url = "https://recruiting2.ultipro.com/PUB1004PSCU/JobBoard/d433f5c3-37c8-4bcf-a3af-248a707c7d31/?q=&o=postedDateDesc"
await page.goto(url, {timeout: 0, waitUntil: 'networkidle0'});
const oportunitiesDOM = await page.evaluate(() => {
return document.getElementsByClassName('opportunity');
});
oportunitiesDOM.forEach(opo => {
await crawlOpo(opo)
});
} catch (err) {
console.error(err)
}
})()
这里的逻辑是,它运行一个
async
箭头函数,该函数将启动浏览器->加载页面->评估页面->获取类.opportunity
->循环列表中的所有元素,并将每个opportunity传递到crawlOpo
函数中,然后获取该opportunity所需的特定数据并然后将该对象分配给一个数组。在document.getElementsByClassName('.opportunity')
的参数中,您有CSS选择器'.opportunity'
该方法将类的名称作为参数,而不是CSS选择器
很可能应该更正为
document.getElementsByClassName('opportunity')
删除所选文档中的
。getElementsByClassName('opportunity')
是,现在获取数据,但数据不是预期的:{'0':{ku ko 1568043156424:'ko169',1':{uuuu ko_uu1568043156424:'ko187','2':{uuu ko_uu1568043156424:'ko205','3':{uu ko_u1568043156424:'ko223',}