JavaScript Puppeteer:从表格中抓取网页数据(字符串数组?)
我试图从一个表中提取数据,我可以提取Javascript 木偶手:在桌子上刮网(字符串数组?),javascript,node.js,web-scraping,puppeteer,Javascript,Node.js,Web Scraping,Puppeteer,我试图从一个表中提取数据,我可以提取updateDate,但是我遇到了列和行的问题 我正在尝试清理的表嵌套在id为col2的td中 我的问题: // Find Table Rows console.log('Searching for COVID-19 Data from Orange County'); // Table Rows let tableRows = await page.$$('#col2 > div > table > tbody > tr'); //
我试图从一个表中提取数据,我可以提取 `updateDate`,但是我遇到了列和行的问题。
我正在尝试抓取的表嵌套在 id 为 `col2` 的 `td` 中。
我的问题:
// Scrape the Orange County COVID-19 table: log the "as of" update date and
// the first cell of the row the author labels "Cases".
console.log('Searching for COVID-19 Data from Orange County');

// All rows of the table nested inside the element with id "col2".
const tableRows = await page.$$('#col2 > div > table > tbody > tr');

if (tableRows.length > 0) {
  console.log('Table Rows found');

  // Update date: read from the first cell of the third row.
  // $() resolves to null when nothing matches, whereas $$() resolves to an
  // array that is truthy even when empty, so $() is the meaningful guard.
  const updateRow = tableRows[2];
  if (updateRow && (await updateRow.$('tr > td'))) {
    const updateField = await updateRow.$eval('tr > td', td => td.innerText.trim());
    // Capture the date itself instead of slicing at a hard-coded offset,
    // which breaks as soon as the surrounding sentence changes length.
    const updateMatch = updateField.match(/as of ([0-9][0-9]\/[0-9][0-9]\/[0-9][0-9][0-9][0-9])/);
    if (updateMatch) {
      const updateDate = updateMatch[1];
      console.log(`Update Date: ${updateDate}`);
    } else {
      throw new Error('Error: Update Date doesn\'t match format');
    }
  }

  // Cases: read from the first cell of the sixth row.
  const casesRow = tableRows[5];
  if (casesRow && (await casesRow.$('tr > td'))) {
    const totalCasesField = await casesRow.$eval('tr > td', td => td.innerText.trim());
    console.log(totalCasesField);
  }
}
我似乎不知道如何正确地查询行,以便得到所有的数字数据(每行一个字符串数组)。
表格(来自检查员):
// Scrape the Orange County COVID-19 table: log the "as of" update date and
// the first cell of the row the author labels "Cases".
console.log('Searching for COVID-19 Data from Orange County');

// All rows of the table nested inside the element with id "col2".
const tableRows = await page.$$('#col2 > div > table > tbody > tr');

if (tableRows.length > 0) {
  console.log('Table Rows found');

  // Update date: read from the first cell of the third row.
  // $() resolves to null when nothing matches, whereas $$() resolves to an
  // array that is truthy even when empty, so $() is the meaningful guard.
  const updateRow = tableRows[2];
  if (updateRow && (await updateRow.$('tr > td'))) {
    const updateField = await updateRow.$eval('tr > td', td => td.innerText.trim());
    // Capture the date itself instead of slicing at a hard-coded offset,
    // which breaks as soon as the surrounding sentence changes length.
    const updateMatch = updateField.match(/as of ([0-9][0-9]\/[0-9][0-9]\/[0-9][0-9][0-9][0-9])/);
    if (updateMatch) {
      const updateDate = updateMatch[1];
      console.log(`Update Date: ${updateDate}`);
    } else {
      throw new Error('Error: Update Date doesn\'t match format');
    }
  }

  // Cases: read from the first cell of the sixth row.
  const casesRow = tableRows[5];
  if (casesRow && (await casesRow.$('tr > td'))) {
    const totalCasesField = await casesRow.$eval('tr > td', td => td.innerText.trim());
    console.log(totalCasesField);
  }
}
我的代码:
// Scrape the Orange County COVID-19 table: log the "as of" update date and
// the first cell of the row the author labels "Cases".
console.log('Searching for COVID-19 Data from Orange County');

// All rows of the table nested inside the element with id "col2".
const tableRows = await page.$$('#col2 > div > table > tbody > tr');

if (tableRows.length > 0) {
  console.log('Table Rows found');

  // Update date: read from the first cell of the third row.
  // $() resolves to null when nothing matches, whereas $$() resolves to an
  // array that is truthy even when empty, so $() is the meaningful guard.
  const updateRow = tableRows[2];
  if (updateRow && (await updateRow.$('tr > td'))) {
    const updateField = await updateRow.$eval('tr > td', td => td.innerText.trim());
    // Capture the date itself instead of slicing at a hard-coded offset,
    // which breaks as soon as the surrounding sentence changes length.
    const updateMatch = updateField.match(/as of ([0-9][0-9]\/[0-9][0-9]\/[0-9][0-9][0-9][0-9])/);
    if (updateMatch) {
      const updateDate = updateMatch[1];
      console.log(`Update Date: ${updateDate}`);
    } else {
      throw new Error('Error: Update Date doesn\'t match format');
    }
  }

  // Cases: read from the first cell of the sixth row.
  const casesRow = tableRows[5];
  if (casesRow && (await casesRow.$('tr > td'))) {
    const totalCasesField = await casesRow.$eval('tr > td', td => td.innerText.trim());
    console.log(totalCasesField);
  }
}
可以像这样做:
const puppeteer = require('puppeteer');

// Open the Orange County page in a headless browser and log the numeric cells
// of the statistics table as one array of strings per row.
(async function main() {
  try {
    const browser = await puppeteer.launch();
    try {
      const [page] = await browser.pages();

      await page.goto('https://www.ochealthinfo.com/phs/about/epidasmt/epi/dip/prevention/novel_coronavirus');

      // Runs inside the page: only serializable values (here, nested arrays
      // of strings) can be returned to Node.
      const data = await page.evaluate(() => {
        // The second of two adjacent tables under #col2 > div.
        const table = document.querySelector('#col2 > div > table + table');
        // Keep rows 3..8 and skip the first cell of each kept row.
        const rowsWithNumbers = [...table.rows].slice(3, 9);
        const numbers = rowsWithNumbers.map(
          row => [...row.cells].slice(1).map(cell => cell.innerText)
        );
        return numbers;
      });
      console.log(data);
    } finally {
      // Close the browser even when navigation or evaluation fails, so the
      // browser process is not left running.
      await browser.close();
    }
  } catch (err) {
    console.error(err);
  }
})();
结果:
[
['42', '26', '16', '0', '21', '13', '8'],
['22', '13', '9', '0', '10', '8', '4'],
['7', '6', '1', '0', '5', '2', '0'],
['12', '7', '5', '0', '5', '3', '4'],
['1', '0', '1', '0', '1', '0', '0'],
['0', '0', '0', '0', '0', '0', '0']
]