Node.js 使用Cheerio从抓取页面获取信息_Node.js_Puppeteer_Cheerio_Html Parser

Node.js 使用Cheerio从抓取页面获取信息

node.js

Node.js 使用Cheerio从抓取页面获取信息,node.js,puppeteer,cheerio,html-parser,Node.js,Puppeteer,Cheerio,Html Parser,我正在使用Puppeter和Cheerio实现web抓取节点服务器我有一个前端React应用程序使用Soundcloud小部件来播放音乐，但问题是API只能在正确的Soundcloud URL下工作因此，我正在进行UI输入，以便通过Scraper发送SoundCloud搜索请求，并从中获取结果因为我所关心的是URL和歌曲名，从中获取它在子span标记（在其“class”attr处）中保存名称，URL为href，就足够了问题是我越来越这是span/song名称的默认输出，和h

我正在使用 Puppeteer 和 Cheerio 实现一个用于网页抓取的 Node 服务器

我有一个前端React应用程序使用Soundcloud小部件来播放音乐，但问题是API只能在正确的Soundcloud URL下工作

因此，我正在进行UI输入，以便通过Scraper发送SoundCloud搜索请求，并从中获取结果

因为我所关心的是URL和歌曲名，从中获取

它在子span标记（在其“class”attr处）中保存名称，URL为href，就足够了

问题是，我得到的结果是：

这是 span（歌曲名）的默认输出，而 href URL 得到的则是 undefined

即使我尝试使用 .text() / .attr('class') 等方法，得到的也分别是 undefined、上面这种默认输出，或者报错“text() is not a function”

这是我的代码：

我用devtools附加soundcloud的屏幕截图，也许我在那里做错了什么

提前谢谢

您不能单独使用 cheerio 来抓取像 SoundCloud 这样的 Web 应用程序，因为该网站主要通过 JavaScript 和 XHR JSON 请求（AJAX）工作。推荐的替代方案是 Puppeteer，因为这种无头浏览器几乎可以运行任何 JavaScript（它就是没有 GUI 的 Chrome 浏览器，当然也有一些限制）

SoundCloud 的歌曲标题只有在页面滚动时才会加载，因此需要使用 setInterval 定时运行一个自动滚动（autoscroll）函数

您可以通过选择选择器查询结果，或者通过侦听页面请求事件并“监视”结果来查询结果。我最好选择第二个，因为它工作平稳，没有任何选择器

因为你们只需要歌曲的URL和标题，所以我制作了这个脚本。尝试一下，并根据您的喜好进行修改

您可以在上面的代码中更改搜索查询和最大结果数，也可以把搜索查询作为参数传给这个脚本。例如：$ node search_soundcloud.js "iris goo goo dolls"。如果没有传入参数，脚本会使用默认值 "goo goo dolls" 进行搜索

const puppeteer = require('puppeteer')
;(async () => {
    const input = process.argv.length < 3 ? 'goo goo dolls' : process.argv[2]
    const maximumSongs = 100
    const titlesArray = []
    const searchquery = input.split(' ').join('%20')
    const url = `https://soundcloud.com/search?q=${searchquery}`
    const browser = await puppeteer.launch({
        headless: true,
        devtools: false
    })
    const [page] = await browser.pages()
    page.setDefaultNavigationTimeout(0)
    page.setRequestInterception(true)
    page.on('request', async request => {
        if (request.resourceType() === 'font' ||
            request.resourceType() === 'image' ||
            request.resourceType() === 'media' ){
            request.abort()
        } else {
            request.continue()
        }
    })
    page.on('requestfinished', async request => {
        if ( request.url().search('https://api-v2.soundcloud.com/search?') > -1 ) {
            const response = await request.response()
            const content = await response.json()
            const songs = content.collection
            for ( let num in songs ) {
                if (titlesArray.length < 100 && typeof songs[num].title !== 'undefined') {
                    console.log ( `[${titlesArray.length + 1}]  ${songs[num].title}` )
                    console.log ( `${songs[num].permalink_url}\n` )
                    titlesArray.push ( songs[num].title )
                } else if (typeof songs[num].title !== 'undefined') {
                    const exit = await browser.close()
                }
            }
        }
    })
    const search = await page.goto(url, {waitUntil: 'networkidle2'})
    const scroll = await page.evaluate ('const autoscroll = setInterval( () => {window.scrollBy(0,100)}, 250)')
})()

我想知道为什么要在 Puppeteer 中使用 cheerio，而不直接使用 DOM？只需把回调改成 .each((index, result) => { … })，一切就能正常运行

$(a.soundTitle__title)

{
 options: {
  withDomLvl1: true,
  normalizeWhitespace: false,
  xml: false,
  decodeEntities: true
},
length: 0,
prevObject: {
  options: {
   withDomLvl1: true,
   normalizeWhitespace: false,
   xml: false,
   decodeEntities: true
  }
 }
}

... req

// Percent-encode the whole search text before placing it in the query string.
// String.prototype.replace with a string pattern only replaces the FIRST
// space, and leaves other reserved characters (&, #, +, ...) unescaped.
const addaptReq = encodeURIComponent(req.text);
const url = `https://soundcloud.com/search?q=${addaptReq}`;

// Holds the launched browser so it can be closed no matter how the chain
// below settles.
let myBrowser;

// Launches a headless browser, loads the SoundCloud search page at `url`,
// and logs the title text and href of every result link found in the
// rendered HTML.
// NOTE(review): `$` is presumably the cheerio module and `url` the search URL
// built earlier in this file — confirm against the surrounding code.
puppeteer
 .launch()
 .then(browser => {
     myBrowser = browser;
     return myBrowser.newPage();
   })
 .then(page => page.goto(url).then(() => page.content()))
 .then(html => {
      // cheerio's .each() calls back with (index, element). Taking only the
      // first argument hands the numeric index to $(), which yields an empty
      // selection and an undefined href.
      $('a.soundTitle__title', html).each((index, link) => {
        // .text() extracts the title string; logging the selection itself
        // only prints the cheerio wrapper object.
        const songName = $(link).find('span').text();
        const songURL = $(link).attr('href');

        console.log(songName, songURL);
      });
   })
 .finally(() => {
     // Close the browser on success AND on failure so no Chromium process
     // is left running after an error.
     if (myBrowser) {
       return myBrowser.close();
     }
   })
 .catch(err => {
     console.log(err);
   });

const puppeteer = require ('puppeteer')

/**
 * Searches SoundCloud for the term given as the first CLI argument
 * (default: 'goo goo dolls') and prints the title and permalink URL of up
 * to `maximumSongs` results.
 *
 * Results are not read from the DOM. The script listens for the page's own
 * finished requests to the search API and prints the items of each JSON
 * response's `collection`. An in-page interval keeps scrolling so the page
 * keeps requesting further result pages.
 *
 * NOTE(review): if the search yields fewer than `maximumSongs` titled
 * results, nothing here closes the browser — same as before this change.
 */
;(async () => {
    const input = process.argv.length < 3 ? 'goo goo dolls' : process.argv[2]
    const maximumSongs = 100
    const searchEndpoint = 'https://api-v2.soundcloud.com/search?'
    const titlesArray = []
    // encodeURIComponent escapes spaces as %20 and also reserved characters
    // such as & and # that a plain split/join would leave in the query.
    const url = `https://soundcloud.com/search?q=${encodeURIComponent(input)}`

    const browser = await puppeteer.launch({
        headless: true,
        devtools: false
    })

    // Guards against closing the browser more than once and lets handlers
    // distinguish expected shutdown errors from real failures.
    let isClosing = false
    const closeBrowser = async () => {
        if (isClosing) return
        isClosing = true
        await browser.close()
    }

    try {
        const [page] = await browser.pages()
        page.setDefaultNavigationTimeout(0)

        // Interception must be enabled before navigation starts, so wait for it.
        await page.setRequestInterception(true)

        // Skip fonts, images and media: they are not needed to obtain the
        // search results.
        page.on('request', request => {
            const isBlocked = ['font', 'image', 'media'].includes(request.resourceType())
            const handled = isBlocked ? request.abort() : request.continue()
            handled.catch(err => {
                // Requests still pending while the browser shuts down are
                // expected to fail; report anything else.
                if (!isClosing) console.error(err)
            })
        })

        page.on('requestfinished', async request => {
            // Literal prefix match; String.prototype.search would treat the
            // endpoint string as a regular expression.
            if (isClosing || !request.url().startsWith(searchEndpoint)) return

            try {
                const response = await request.response()
                const content = await response.json()
                const songs = Array.isArray(content.collection) ? content.collection : []

                for (const song of songs) {
                    if (titlesArray.length >= maximumSongs) break
                    // Items without a title are not printed or counted.
                    if (typeof song.title === 'undefined') continue

                    console.log(`[${titlesArray.length + 1}]  ${song.title}`)
                    console.log(`${song.permalink_url}\n`)
                    titlesArray.push(song.title)
                }

                // Stop as soon as the limit is reached instead of waiting for
                // one more result to arrive.
                if (titlesArray.length >= maximumSongs) {
                    await closeBrowser()
                }
            } catch (err) {
                // Reading a response body fails once the browser is closing;
                // only report errors that happen during normal operation.
                if (!isClosing) console.error(err)
            }
        })

        await page.goto(url, { waitUntil: 'networkidle2' })

        // Runs inside the page: keep scrolling so more results are requested.
        await page.evaluate('const autoscroll = setInterval( () => {window.scrollBy(0,100)}, 250)')
    } catch (err) {
        // Navigation or evaluation interrupted by our own shutdown is expected.
        if (isClosing) return
        await closeBrowser()
        throw err
    }
})().catch(err => {
    console.error(err)
    process.exitCode = 1
})