Web scraping 如何使用 Puppeteer 添加输入

Web scraping 如何使用 Puppeteer 添加输入,web-scraping,puppeteer,apify,Web Scraping,Puppeteer,Apify,我一直收到“未找到节点”的错误。我转而尝试 jQuery,仍然不行,所以来这里求助。page.type 不起作用,同样报节点错误。我试图通过提供一个案件编号来获取案件信息和文件 const Apify = require('apify'); const { puppeteer } = Apify.utils; const saveScreen = async (page, key = 'debug-screen') => { const screenshotBuffer = await …

我一直收到“未找到节点”(node not found)的错误。于是我转而尝试 jQuery,仍然不行,所以来这里求助。page.type 不起作用,同样报节点错误。我想通过提供一个案件编号来获取案件信息和相关文件。

const Apify = require('apify');
const {
    puppeteer
} = Apify.utils;
/**
 * Takes a full-page screenshot of the given page and stores it via
 * `Apify.setValue` as a PNG.
 *
 * @param {object} page - Puppeteer page; only `page.screenshot` is called on it.
 * @param {string} [key='debug-screen'] - Key the screenshot is stored under.
 * @returns {Promise<void>} Resolves once the screenshot has been stored.
 */
const saveScreen = async (page, key = 'debug-screen') => {
    const screenshotBuffer = await page.screenshot({
        fullPage: true,
    });
    await Apify.setValue(key, screenshotBuffer, {
        contentType: 'image/png',
    });
};
// Entry point of the question's script: opens the court portal, injects jQuery,
// clicks the "Case Information" tab, fills in a case number, clicks the search
// button and stores the resulting page HTML.
Apify.main(async () => {
    // Launch Puppeteer
    const browser = await Apify.launchPuppeteer();
    try {
        const page = await browser.newPage();
        await page.goto('https://web6.seattle.gov/courts/ECFPortal/Default.aspx');

        // jQuery is injected so the `:contains()` selector can be used in the page.
        await page.addScriptTag({
            url: 'https://code.jquery.com/jquery-3.2.1.min.js',
        });
        await page.waitForFunction(() => window.jQuery);

        // Every evaluate() is awaited so the in-page action has really run before
        // the next step. The callbacks use a block body so that no jQuery object
        // is returned to (and serialized by) Puppeteer.
        await page.evaluate(() => {
            $('span:contains("Case Information")').click();
        });
        // page.waitForNavigation() is intentionally not used here; a fixed delay
        // is used instead.
        await page.waitFor(4000);

        const input = await Apify.getInput();
        console.log('json stringify input: ' + JSON.stringify(input));
        const caseNumber = input['court_case'];
        console.log('CASE NUMBER: ' + caseNumber);

        // Dump the page HTML before searching, for debugging.
        const htmlBeforeSearch = await page.$eval('body', (e) => e.outerHTML);
        const output2 = {
            html: htmlBeforeSearch,
            crawledAt: new Date(),
        };
        await Apify.setValue('HTMltestOUTPUT', output2);
        console.log('html to test.');

        // NOTE: the value typed here is a hard-coded test number; `caseNumber`
        // from the input is only logged above.
        await page.evaluate(() => {
            $('#ContentPlaceHolder1_CaseDocuments1_CaseSearch1_txtCaseNumber').val('585344');
        });
        await saveScreen(page, 'test-screen');
        await page.waitFor(1000);
        console.log('Attempted to enter case number');

        await page.evaluate(() => {
            $('#ContentPlaceHolder1_CaseDocuments1_CaseSearch1_btnSearch').click();
        });
        console.log('Attempted to click button');
        // page.waitForNavigation() timed out at this point, so it stays disabled.
        console.log('Attempted to wait for navigation');

        // Get cookies (fetched but not used by the rest of the script).
        await page.cookies();
        console.log('Attempted to wait for cookies');

        // And then save output
        const htmlAfterSearch = await page.$eval('body', (e) => e.outerHTML);
        const output = {
            html: htmlAfterSearch,
            crawledAt: new Date(),
        };
        console.log('My output:');
        console.dir(output);
        await Apify.setValue('OUTPUT', output);
    } finally {
        // Close the browser even when a step above throws.
        await browser.close();
    }
    console.log('Done.');
});

代码的主要问题在于:该网站是一个单页 ASPX 应用,不会发生任何页面导航,所有内容都通过 XHR 请求加载。因此,每一次 page.waitForNavigation 调用都必然会超时。

您可以通过等待页面上的元素变为可见,或者通过跟踪网络请求来解决此问题。我按照这个思路重写了您的代码,做了一个同时使用这两种方法的可运行版本。希望对您有所帮助:

const Apify = require('apify');
const { puppeteer } = Apify.utils;

/**
 * Stores a full-page PNG screenshot of `page` via `Apify.setValue`.
 *
 * @param {object} page - Puppeteer page; only `page.screenshot` is called on it.
 * @param {string} [key='debug-screen'] - Key the image is stored under.
 * @returns {Promise<void>} Resolves after the image has been stored.
 */
const saveScreen = async (page, key = 'debug-screen') => {
    const storeOptions = { contentType: 'image/png' };
    const image = await page.screenshot({ fullPage: true });
    await Apify.setValue(key, image, storeOptions);
};

/**
 * Reads the outer HTML of the page's `<body>` and stores it, together with the
 * current timestamp, via `Apify.setValue`.
 *
 * @param {object} page - Puppeteer page; only `page.$eval` is called on it.
 * @param {string} [key='output'] - Key the record is stored under.
 * @param {boolean} [logOutput=false] - When truthy, the record is also printed
 *   to the console before it is stored.
 * @returns {Promise<*>} Whatever `Apify.setValue` returns.
 */
const saveHtml = async (page, key = 'output', logOutput = false) => {
    const snapshot = {
        html: await page.$eval('body', (body) => body.outerHTML),
        crawledAt: new Date(),
    };

    if (!logOutput) {
        return Apify.setValue(key, snapshot);
    }

    console.log('My output:');
    console.dir(snapshot);
    return Apify.setValue(key, snapshot);
};

// Entry point: opens the court portal, switches to the case-information tab,
// searches for a case number and stores the resulting HTML and screenshots.
// The site loads content via XHR without navigating, so the script waits on
// selectors and on a network response instead of page.waitForNavigation().
Apify.main(async () => {
    // Selectors used more than once are named here so they cannot drift apart.
    const CASE_INFO_TAB = '#ctl00_ContentPlaceHolder1_rtsECFPortal li:nth-child(4) a span';
    const CASE_NUMBER_INPUT = '#ContentPlaceHolder1_CaseInfo1_CaseSearch1_txtCaseNumber';
    const SEARCH_BUTTON = '#ContentPlaceHolder1_CaseInfo1_CaseSearch1_btnSearch';

    const input = await Apify.getInput();
    console.log(`json stringify input: ${JSON.stringify(input)}`);
    // Get case number from input or use default (for testing).
    // Coerced to a string because JSON input may carry it as a number, while
    // page.type() expects text.
    const caseNumber = String((input && input.court_case) || '585344');
    console.log(`CASE NUMBER: ${caseNumber}`);

    // Launch Puppeteer
    const browser = await Apify.launchPuppeteer();
    try {
        const page = await browser.newPage();
        await page.goto('https://web6.seattle.gov/courts/ECFPortal/Default.aspx');

        console.log('Page opened');

        // Wait for the link in menu to appear and then click on it
        await page.waitForSelector(CASE_INFO_TAB);
        await page.click(CASE_INFO_TAB);

        console.log('Redirecting to case information');

        // Wait for the new page to load and input to appear
        await page.waitForSelector(CASE_NUMBER_INPUT, { visible: true });

        console.log('Inputing case number');

        // Input the case number
        await page.type(CASE_NUMBER_INPUT, caseNumber, { delay: 20 });

        // Save current html and screenshot for debugging
        await saveScreen(page, 'search-screen');
        await saveHtml(page, 'search-html');

        console.log('clicking on search');

        // The response listener is registered before the click (array elements
        // are evaluated left to right), because after clicking it might be too
        // late. Promise.all also keeps the listener's rejection handled if the
        // click itself fails.
        await Promise.all([
            page.waitForResponse((response) => response.url().includes('courts/ECFPortal/Default.aspx')),
            page.click(SEARCH_BUTTON),
        ]);
        // Short extra delay after the XHR response before capturing the page.
        await page.waitFor(500);

        console.log('Case information loaded');

        // Save current html and screenshot for debugging
        await saveScreen(page, 'output-screen');
        await saveHtml(page, 'output', true);
    } finally {
        // Always release the browser, even when a step above throws.
        await browser.close();
    }
    console.log('Done.');
});

我可能弄错了,但你实际上并没有输入案件编号,是吗?在该网站上,你可以输入的是法院案件编号、被告或律师。我的意思是,我不确定你在代码中输入的案件编号是什么:page.evaluate(() => $('#ContentPlaceHolder1_CaseDocuments1_CaseSearch1_txtCaseNumber').val("585344"));