Node.js 使用Cheerio从抓取页面获取信息
我正在使用 Puppeteer 和 Cheerio 实现一个用于网页抓取的 Node 服务器。我有一个前端 React 应用程序,使用 SoundCloud 小部件来播放音乐,但问题是该 API 只有在给出正确的 SoundCloud URL 时才能工作。因此,我做了一个 UI 输入框,通过抓取器发送 SoundCloud 搜索请求,并从中取回结果。(标签:node.js, puppeteer, cheerio, html-parser)由于我只关心 URL 和歌曲名,从 a.soundTitle__title 元素中获取这些信息就足够了:
该元素在其子 span 标签中保存歌曲名,并以 href 属性保存 URL,这就足够了。问题是我得到的是下面这个 Cheerio 对象——
这是 span/歌曲名的默认输出,而 href URL 则是 "undefined"。即使我尝试使用 .text()、.attr("class") 等方法,得到的也只是 undefined、这个默认输出,或者报错 "text() is not a function"。这是我的代码——
我附上了用 devtools 查看 SoundCloud 的屏幕截图,也许是我在那里做错了什么。提前谢谢!
// Command-line SoundCloud search. Opens the SoundCloud search page in a
// headless browser and, instead of parsing the rendered DOM, reads the JSON
// responses of the search API requests the page itself makes. Prints the
// title and permalink URL of each result, up to `maximumSongs` entries.
const puppeteer = require('puppeteer')

;(async () => {
  // Search term: first CLI argument, or a default when none is given.
  const input = process.argv.length < 3 ? 'goo goo dolls' : process.argv[2]
  const maximumSongs = 100
  const titlesArray = []
  const searchquery = input.split(' ').join('%20')
  const url = `https://soundcloud.com/search?q=${searchquery}`
  const browser = await puppeteer.launch({
    headless: true,
    devtools: false
  })
  const [page] = await browser.pages()
  // 0 disables the navigation timeout.
  page.setDefaultNavigationTimeout(0)
  page.setRequestInterception(true)
  // Abort font, image and media requests; let everything else through.
  page.on('request', async request => {
    if (request.resourceType() === 'font' ||
        request.resourceType() === 'image' ||
        request.resourceType() === 'media') {
      request.abort()
    } else {
      request.continue()
    }
  })
  // Inspect every finished request and pick out the search API responses.
  page.on('requestfinished', async request => {
    if (request.url().search('https://api-v2.soundcloud.com/search?') > -1) {
      const response = await request.response()
      const content = await response.json()
      const songs = content.collection
      for (const song of songs) {
        if (titlesArray.length < maximumSongs && typeof song.title !== 'undefined') {
          console.log(`[${titlesArray.length + 1}] ${song.title}`)
          console.log(`${song.permalink_url}\n`)
          titlesArray.push(song.title)
        } else if (typeof song.title !== 'undefined') {
          // Limit reached: closing the browser ends the run.
          await browser.close()
        }
      }
    }
  })
  await page.goto(url, { waitUntil: 'networkidle2' })
  // Keep scrolling inside the page so it keeps issuing further search requests.
  await page.evaluate('const autoscroll = setInterval(() => { window.scrollBy(0, 100) }, 250)')
})()
我想知道为什么在 Puppeteer 中还要使用 cheerio,而不直接使用 DOM?只需将遍历回调修改为 `.each((index, result) => { … })`(第一个参数是索引,第二个才是元素),所有操作就会顺利运行。
$(a.soundTitle__title)
{
options: {
withDomLvl1: true,
normalizeWhitespace: false,
xml: false,
decodeEntities: true
},
length: 0,
prevObject: {
options: {
withDomLvl1: true,
normalizeWhitespace: false,
xml: false,
decodeEntities: true
}
}
}
... req
// Scrape the SoundCloud search page for `req.text` and log the name and href
// of every result link. `puppeteer`, `req` and `$` come from the surrounding
// code; `$` is assumed to be the cheerio export (TODO confirm).

// encodeURIComponent escapes every space and reserved character, whereas
// String.prototype.replace with a string pattern only replaces the first space.
const addaptReq = encodeURIComponent(req.text);
const url = `https://soundcloud.com/search?q=${addaptReq}`;
let myBrowser;
puppeteer
  .launch()
  .then(browser => {
    myBrowser = browser;
    return myBrowser.newPage();
  })
  .then(page => {
    // The result list is presumably rendered client-side, so wait for the
    // network to settle before taking the HTML snapshot.
    return page.goto(url, { waitUntil: 'networkidle2' }).then(() => {
      return page.content();
    });
  })
  .then(html => {
    // cheerio's .each() passes (index, element). With a single parameter the
    // callback received the numeric index, which produced an empty selection
    // for the name and `undefined` for the href.
    $('a.soundTitle__title', html).each((index, result) => {
      // The song name is the text of the child <span>; the URL is the link's href.
      const songName = $('span', result).text().trim();
      const songURL = $(result).attr('href');
      console.log(songName, songURL);
    });
  })
  .catch(err => {
    console.log(err);
  })
  // Runs after success and after failure, so the browser is always released.
  .then(() => (myBrowser ? myBrowser.close() : undefined))
  .catch(err => {
    console.log(err);
  });
// Command-line SoundCloud search. Opens the SoundCloud search page in a
// headless browser and, instead of parsing the rendered DOM, reads the JSON
// responses of the search API requests the page itself makes. Prints the
// title and permalink URL of each result, up to `maximumSongs` entries.
const puppeteer = require ('puppeteer')

;(async () => {
  // Search term: first CLI argument, or a default when none is given.
  const input = process.argv.length < 3 ? 'goo goo dolls' : process.argv[2]
  const maximumSongs = 100
  const titlesArray = []
  // Request types the scraper never needs; aborting them speeds up the page.
  const blockedTypes = ['font', 'image', 'media']
  // Prefix of the search API URLs. The original passed this URL (with a
  // trailing '?') to String.prototype.search, which treats it as a regular
  // expression, so '.' and 'h?' were pattern syntax and the '?' never had to
  // match. A plain prefix test states the intent without that surprise.
  const searchApiPrefix = 'https://api-v2.soundcloud.com/search'
  // encodeURIComponent escapes all reserved characters, not only spaces.
  const url = `https://soundcloud.com/search?q=${encodeURIComponent(input)}`
  // Set once the limit is reached and the browser is being closed.
  let finished = false

  const browser = await puppeteer.launch({
    headless: true,
    devtools: false
  })

  try {
    const [page] = await browser.pages()
    // 0 disables the navigation timeout.
    page.setDefaultNavigationTimeout(0)
    await page.setRequestInterception(true)

    page.on('request', request => {
      const handled = blockedTypes.includes(request.resourceType())
        ? request.abort()
        : request.continue()
      // Requests still pending when the browser closes reject; only report
      // failures that happen while the run is still active.
      handled.catch(err => {
        if (!finished) console.error(err)
      })
    })

    page.on('requestfinished', async request => {
      if (finished || !request.url().startsWith(searchApiPrefix)) return
      try {
        const response = request.response()
        if (!response) return
        const content = await response.json()
        const songs = Array.isArray(content.collection) ? content.collection : []
        for (const song of songs) {
          // Another handler may have hit the limit while this one was awaiting.
          if (finished) return
          if (typeof song.title === 'undefined') continue
          console.log ( `[${titlesArray.length + 1}] ${song.title}` )
          console.log ( `${song.permalink_url}\n` )
          titlesArray.push ( song.title )
          if (titlesArray.length >= maximumSongs) {
            // Close exactly once; the original called close() again for
            // every remaining song in the batch.
            finished = true
            await browser.close()
            return
          }
        }
      } catch (err) {
        if (!finished) console.error(err)
      }
    })

    await page.goto(url, {waitUntil: 'networkidle2'})
    // Keep scrolling inside the page so it keeps issuing further search requests.
    await page.evaluate(() => {
      setInterval(() => { window.scrollBy(0, 100) }, 250)
    })
  } catch (err) {
    // Errors caused by our own shutdown are expected; anything else releases
    // the browser and is reported by the handler below.
    if (!finished) {
      finished = true
      await browser.close()
      throw err
    }
  }
})().catch(err => {
  console.error(err)
  process.exitCode = 1
})