JavaScript:如何使用 Apify 和 Puppeteer 构建正确的网页抓取结果?

Javascript 如何使用Apify和Puppeter构建成功的web刮取结果?,javascript,html,web-scraping,puppeteer,apify,Javascript,Html,Web Scraping,Puppeteer,Apify,使用Apify和Puppeter,我想在以下URL处刮取数据表: 我希望结果是一个对象数组。数组的每个元素都应该表示原始数据源表的每一行,并且是具有以下属性的JS对象 { firmName, firmUrl, hq, hqUrl, aum, } 其中: firmName是每行第一个元素的.innerText() firmUrl是每行第一个元素的href属性 hq是。每行第二个元素的innerText() hqUrl是每行第二个元素的href属性 aum是。每行第三个元素的innerText

使用 Apify 和 Puppeteer,我想抓取以下 URL 处的数据表:https://en.wikipedia.org/wiki/List_of_hedge_funds

我希望结果是一个对象数组。数组的每个元素都应该表示原始数据源表的每一行,并且是具有以下属性的JS对象

{ firmName, firmUrl, hq, hqUrl, aum, }
其中:

  • firmName
    是每行第一个
    元素的
    .innerText()
  • firmUrl
    是每行第一个
    元素的
    href
    属性
  • hq
    是每行第二个
    元素的
    .innerText()
  • hqUrl
    是每行第二个
    元素的
    href
    属性
  • aum
    是每行第三个
    元素的
    .innerText()
具体来说,例如,我希望看到以下对象返回给我

我想看到的是备选方案A:
[{
  "url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
  "pageTitle": "List of hedge funds - Wikipedia",
  "links": [
    {
      firmName: "Bridgewater Associates",
      firmUrl: "/wiki/Bridgewater_Associates",
      hq: "Westport, Connecticut",
      hqUrl: "/wiki/Westport,_Connecticut",
      aum: "$132,050",
    }
    // ...x39 more times
  ]
}]
或者,对象可以如下(我不知道哪种可能,这是我困惑的一部分)

我想看到的是备选方案B:
[
  {
    "url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
    "pageTitle": "List of hedge funds - Wikipedia",
    "links": {
      firmName: "Bridgewater Associates",
      firmUrl: "/wiki/Bridgewater_Associates",
      hq: "Westport, Connecticut",
      hqUrl: "/wiki/Westport,_Connecticut",
      aum: "$132,050",
    },  
  },
  // ...x39 more times
]
但实际上,我看到了以下结果

我实际看到的是:
[{
  "url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
  "pageTitle": "List of hedge funds - Wikipedia",
  "links": [
    "/wiki/Bridgewater_Associates",
    "/wiki/Westport,_Connecticut",
    "/wiki/Renaissance_Technologies",
    "/wiki/East_Setauket,_New_York",
    "/wiki/Man_Group",
    "/wiki/London",
    "/wiki/AQR_Capital_Management",
    "/wiki/Greenwich,_Connecticut",
    "/wiki/Two_Sigma_Investments",
    "/wiki/New_York_City,_New_York",
    "/wiki/Millennium_Management,_LLC",
    "/wiki/New_York_City,_New_York",
    "/wiki/Elliott_Management",
    "/wiki/New_York_City,_New_York",
    "/wiki/BlackRock",
    "/wiki/New_York_City,_New_York",
    "/wiki/Citadel_LLC",
    "/wiki/Chicago,_IL",
    "/wiki/Davidson_Kempner_Capital_Management",
    "/wiki/New_York_City,_New_York",
    "/wiki/Viking_Global_Investors",
    "/wiki/Greenwich,_Connecticut",
    "/wiki/Baupost_Group",
    "/wiki/Boston,_MA",
    "/wiki/D.E._Shaw_%26_Co.",
    "/wiki/New_York_City,_New_York",
    "/wiki/Farallon_Capital",
    "/wiki/San_Francisco,_CA",
    "/wiki/Marshall_Wace",
    "/wiki/London",
    "/wiki/The_Children%27s_Investment_Fund_Management",
    "/wiki/London",
    "/wiki/Wellington_Management_Company",
    "/wiki/Boston,_MA",
    "/wiki/Winton_Group",
    "/wiki/London",
    "/wiki/Capula_Investment_Management",
    "/wiki/London",
    "/wiki/York_Capital_Management",
    "/wiki/New_York_City,_NY"
  ]
}]
我正在使用以下代码作为我的
pageFunction

页面功能
我需要如何更改代码?

整体思路没有问题,您只需要更改对表格数据的解析方式。下面是一个可以正常工作的 pageFunction 示例:

/**
 * Apify Web Scraper page function that turns every data row of the
 * `.wikitable` tables on the current page into a structured record.
 *
 * For the full list of properties available on `context`, see
 * https://apify.com/apify/web-scraper#page-function
 *
 * @param {object} context - Context object supplied by the scraper. This
 *   function reads `context.jQuery` (requires the "Inject jQuery" option to
 *   be enabled) and `context.request.url`.
 * @returns {Promise<{url: string, pageTitle: string, links: Array<{
 *   firmName: string, firmUrl: string, hq: string,
 *   hqUrl: (string|undefined), aum: string}>}>}
 *   The page URL, the page title, and one entry per table row that has a
 *   link in its firm cell.
 */
async function pageFunction( context ) {
    const TITLE_SELECTOR = 'title';
    const LINE_SELECTOR = '.wikitable tr';

    // jQuery is handy for finding DOM elements and extracting data from them.
    const $ = context.jQuery;
    const pageTitle = $( TITLE_SELECTOR ).first().text();
    const links = [];

    $( LINE_SELECTOR ).each((index, item) => {
        const columns = $(item).find('td');
        // Cell 0 is skipped; presumably it holds the rank column of the
        // table - confirm against the target page. Cells 1..3 are read as
        // firm, headquarters and assets under management.
        const firmCell = columns.eq(1);
        const hqCell = columns.eq(2);
        const aumCell = columns.eq(3);

        const link = {
            firmName: firmCell.text().trim(),
            firmUrl: firmCell.find('a').eq(0).attr('href'),
            hq: hqCell.text().trim(),
            hqUrl: hqCell.find('a').eq(0).attr('href'),
            aum: aumCell.text().trim(),
        };

        // Rows without a firm link (e.g. header rows, which contain no <td>)
        // yield an undefined firmUrl and are dropped.
        if (link.firmUrl) {
            links.push(link);
        }
    });

    return {
        url: context.request.url,
        pageTitle,
        links,
    };
}
/**
 * Apify Web Scraper page function: extracts one record per data row from
 * the `.wikitable` tables on the page being processed.
 *
 * For the full list of properties available on `context`, see
 * https://apify.com/apify/web-scraper#page-function
 *
 * @param {object} context - Context object supplied by the scraper. Only
 *   `context.jQuery` (available when "Inject jQuery" is enabled) and
 *   `context.request.url` are used here.
 * @returns {Promise<{url: string, pageTitle: string, links: Array<{
 *   firmName: string, firmUrl: string, hq: string,
 *   hqUrl: (string|undefined), aum: string}>}>}
 *   The page URL, the page title, and the rows that carry a firm link.
 */
async function pageFunction( context ) {
    const TITLE_SELECTOR = 'title';
    const LINE_SELECTOR = '.wikitable tr';

    // Reads the text of a cell with surrounding whitespace removed.
    const cellText = (cell) => cell.text().trim();
    // Reads the href of the first anchor inside a cell (undefined if none).
    const cellHref = (cell) => cell.find('a').eq(0).attr('href');

    const $ = context.jQuery;
    const pageTitle = $( TITLE_SELECTOR ).first().text();
    const links = [];

    for (const row of $( LINE_SELECTOR ).toArray()) {
        const columns = $(row).find('td');
        const firmUrl = cellHref(columns.eq(1));

        // Header rows (no <td>) and rows whose firm cell has no link are
        // skipped.
        if (!firmUrl) {
            continue;
        }

        // Cell 0 is not read; presumably it is the rank column - confirm
        // against the target page.
        links.push({
            firmName: cellText(columns.eq(1)),
            firmUrl,
            hq: cellText(columns.eq(2)),
            hqUrl: cellHref(columns.eq(2)),
            aum: cellText(columns.eq(3)),
        });
    }

    return {
        url: context.request.url,
        pageTitle,
        links,
    };
}