Web scraping 如何使用Puppeteer添加输入
我一直收到“未找到节点”的错误。我转而尝试jQuery，也没有成功。现在我来这里求助。page.text不起作用，我仍然得到节点错误。我试图通过提供一个案件编号来获取案件信息和文件。 Web scraping 如何使用Puppeteer添加输入, web-scraping, puppeteer, apify, Web Scraping, Puppeteer, Apify。 我一直收到“未找到节点”的错误。我转而尝试jQuery，也没有成功。现在我来这里求助。page.text不起作用，我仍然得到节点错误。我试图通过提供一个案件编号来获取案件信息和文件。 const Apify = require('apify'); const { puppeteer } = Apify.utils; const saveScreen = async(page, key = 'debug-screen') => { const screenshotBuffer = awai
const Apify = require('apify');
const {
puppeteer
} = Apify.utils;
/**
 * Takes a full-page screenshot of `page` and stores it through
 * `Apify.setValue` with a PNG content type.
 *
 * @param {object} page - Object exposing an async `screenshot(options)` method.
 * @param {string} [key='debug-screen'] - Key under which the screenshot is stored.
 * @returns {Promise<void>} Resolves once the screenshot has been stored.
 */
const saveScreen = async (page, key = 'debug-screen') => {
    const screenshotBuffer = await page.screenshot({
        fullPage: true,
    });
    await Apify.setValue(key, screenshotBuffer, {
        contentType: 'image/png',
    });
};
// Entry point: opens the ECF portal page, clicks the "Case Information" element,
// fills in a case number with jQuery, clicks the search button and stores the
// resulting <body> HTML under the 'OUTPUT' key.
Apify.main(async () => {
    // Launch Puppeteer
    const browser = await Apify.launchPuppeteer();
    try {
        const page = await browser.newPage();
        await page.goto('https://web6.seattle.gov/courts/ECFPortal/Default.aspx');

        // Inject jQuery so the in-page callbacks below can use $(...).
        await page.addScriptTag({
            url: 'https://code.jquery.com/jquery-3.2.1.min.js',
        });
        await page.waitForFunction(() => window.jQuery);

        // Every page.evaluate call is awaited so the following steps cannot race it
        // and a rejection is not left unhandled. The callbacks use a block body so
        // no jQuery object is returned to Node (page.evaluate would otherwise try to
        // serialize it).
        await page.evaluate(() => {
            $('span:contains("Case Information")').click();
        });
        // NOTE: page.waitForNavigation() is deliberately not awaited here; a fixed
        // delay is used instead.
        await page.waitFor(4000);

        const input = await Apify.getInput();
        console.log('json stringify input: ' + JSON.stringify(input));
        // Fall back to the test case number when no input (or no court_case) is
        // given, instead of throwing on a null input.
        const caseNumber = String((input && input.court_case) || '585344');
        console.log('CASE NUMBER: ' + caseNumber);

        // Snapshot of the page before the search, stored for debugging.
        const searchPageHtml = await page.$eval('body', (e) => e.outerHTML);
        await Apify.setValue('HTMltestOUTPUT', {
            html: searchPageHtml,
            crawledAt: new Date(),
        });
        console.log('html to test.');

        // Type the case number taken from the input rather than a hard-coded literal.
        // NOTE(review): this selector uses "CaseDocuments1"; confirm it matches the
        // form that is actually displayed after the click above.
        await page.evaluate((value) => {
            $('#ContentPlaceHolder1_CaseDocuments1_CaseSearch1_txtCaseNumber').val(value);
        }, caseNumber);
        await saveScreen(page, 'test-screen');
        await page.waitFor(1000);
        console.log('Attempted to enter case number');

        await page.evaluate(() => {
            $('#ContentPlaceHolder1_CaseDocuments1_CaseSearch1_btnSearch').click();
        });
        console.log('Attempted to click button');
        // NOTE: awaiting page.waitForNavigation() at this point timed out, so it is
        // not called.
        console.log('Attempted to wait for navigation');

        // Get cookies (fetched but not used further in this script).
        const cookies = await page.cookies();
        console.log('Attempted to wait for cookies');

        // And then save output
        const resultHtml = await page.$eval('body', (e) => e.outerHTML);
        const output = {
            html: resultHtml,
            crawledAt: new Date(),
        };
        console.log('My output:');
        console.dir(output);
        await Apify.setValue('OUTPUT', output);
    } finally {
        // Close the browser even when one of the steps above throws.
        await browser.close();
    }
    console.log('Done.');
});
代码的主要问题是：该网站是一个单页aspx应用程序，不进行任何页面导航，所有内容都通过XHR请求加载。因此，每个page.waitForNavigation调用都会超时。您可以通过等待页面上的元素变为可见，或者跟踪网络请求来解决此问题。考虑到这一点，我重写了您的代码，做了一个同时使用这两种方法的可运行版本。希望这对您有所帮助：
const Apify = require('apify');
const { puppeteer } = Apify.utils;
/**
 * Captures the whole page as a screenshot and persists it via `Apify.setValue`.
 *
 * @param {object} page - Object exposing an async `screenshot(options)` method.
 * @param {string} [key='debug-screen'] - Storage key for the screenshot.
 * @returns {Promise<void>} Resolves after the screenshot has been stored.
 */
const saveScreen = async (page, key = 'debug-screen') => {
    const captureOptions = { fullPage: true };
    const storeOptions = { contentType: 'image/png' };

    const pngBytes = await page.screenshot(captureOptions);
    await Apify.setValue(key, pngBytes, storeOptions);
};
/**
 * Reads the outer HTML of the page's <body>, pairs it with the current time and
 * stores the pair via `Apify.setValue`.
 *
 * @param {object} page - Object exposing an async `$eval(selector, fn)` method.
 * @param {string} [key='output'] - Storage key for the record.
 * @param {boolean} [logOutput=false] - When true, the record is also printed to the console.
 * @returns {Promise<*>} Whatever `Apify.setValue` resolves to.
 */
const saveHtml = async (page, key = 'output', logOutput = false) => {
    const record = {
        html: await page.$eval('body', (bodyEl) => bodyEl.outerHTML),
        crawledAt: new Date(),
    };

    if (logOutput) {
        console.log('My output:');
        console.dir(record);
    }

    return Apify.setValue(key, record);
};
// Entry point: reads the case number from the actor input, opens the ECF portal
// page, opens the case-information search form, submits the case number and
// stores screenshots and HTML snapshots before and after the search.
Apify.main(async () => {
    const input = await Apify.getInput();
    console.log(`json stringify input: ${JSON.stringify(input)}`);

    // Get case number from input or use default (for testing). It is coerced to a
    // string because page.type() expects text, while a JSON input may carry the
    // case number as a number.
    const caseNumber = String((input && input.court_case) || '585344');
    console.log(`CASE NUMBER: ${caseNumber}`);

    // Launch Puppeteer
    const browser = await Apify.launchPuppeteer();
    try {
        const page = await browser.newPage();
        await page.goto('https://web6.seattle.gov/courts/ECFPortal/Default.aspx');
        console.log('Page opened');

        // Wait for the link in menu to appear and then click on it
        const menuLinkSelector = '#ctl00_ContentPlaceHolder1_rtsECFPortal li:nth-child(4) a span';
        await page.waitForSelector(menuLinkSelector);
        await page.click(menuLinkSelector);
        console.log('Redirecting to case information');

        // Wait for the new content to load and the input to become visible
        const caseInputSelector = '#ContentPlaceHolder1_CaseInfo1_CaseSearch1_txtCaseNumber';
        await page.waitForSelector(caseInputSelector, { visible: true });
        console.log('Inputing case number');

        // Input the case number
        await page.type(caseInputSelector, caseNumber, { delay: 20 });

        // Save current html and screenshot for debugging
        await saveScreen(page, 'search-screen');
        await saveHtml(page, 'search-html');

        // Prepare waitForResponse promise, we need to do it here, because after
        // clicking on the button it might be too late.
        const waitForResponsePromise = page.waitForResponse((response) => {
            return response.url().includes('courts/ECFPortal/Default.aspx');
        });

        console.log('clicking on search');
        // Click on the search button
        await page.click('#ContentPlaceHolder1_CaseInfo1_CaseSearch1_btnSearch');

        // Wait for the xhr request to finish, this means that the case information
        // should be loaded
        await waitForResponsePromise;
        // Short fixed pause after the response before taking the snapshots.
        await page.waitFor(500);
        console.log('Case information loaded');

        // Save current html and screenshot for debugging
        await saveScreen(page, 'output-screen');
        await saveHtml(page, 'output', true);
    } finally {
        // Close the browser even when one of the steps above throws.
        await browser.close();
    }
    console.log('Done.');
});
我可能弄错了，但你实际上并没有输入案件编号，是吗？在网站上，可以输入的是法院案件编号、被告或律师。我的意思是，我不确定你在代码中输入的是不是案件编号：page.evaluate(() => $('#ContentPlaceHolder1_CaseDocuments1_CaseSearch1_txtCaseNumber').val("585344"));