如何抓取国际足联(FIFA)网站上由 JavaScript 生成的表格
作为一个研究项目,我想从国际足联(FIFA)网站上抓取所有国际足球比赛的结果。我用 R 来做这件事。但是,包含比赛结果的表格似乎是由 JavaScript 生成的。 这是我想要抓取的 url(见下方代码中的 url 变量): 我尝试使用 PhantomJS 渲染页面,希望在 JavaScript 生成表格之后再取得 HTML,但是在生成的 HTML 中,仍然没有包含比赛结果的表格。这是我的代码:如何抓取国际足联(FIFA)网站上由 JavaScript 生成的表格,javascript,r,phantomjs,scrape,Javascript,R,Phantomjs,Scrape,作为一个研究项目,我想从国际足联(FIFA)网站上抓取所有国际足球比赛的结果。我用 R 来做这件事。但是,包含比赛结果的表格似乎是由 JavaScript 生成的。 这是我想要抓取的 url(见下方代码中的 url 变量): 我尝试使用 PhantomJS 渲染页面,希望在 JavaScript 生成表格之后再取得 HTML,但是在生成的 HTML 中,仍然没有包含比赛结果的表格。这是我的代码: url = "http://www.fifa.com/live-scores/international- tournaments/fixtures-results/in
# Render the FIFA fixtures page with PhantomJS and save the resulting HTML to
# 'scrape.html' so it can be parsed afterwards.

# The URL must be a single-line string: a raw line break inside the literal
# would end up inside the single-quoted JavaScript string below and make
# scrape.js invalid. paste0() keeps the source lines short without embedding
# a newline in the value.
url = paste0("http://www.fifa.com/live-scores/international-",
             "tournaments/fixtures-results/index.html#month5-2018")

# Time (in milliseconds) to wait after the page has loaded before capturing
# the DOM. The results table is filled in by JavaScript after the initial
# load, so reading page.content immediately returns the HTML without it.
# NOTE(review): 5000 ms is a guess -- tune it for the site and connection.
wait_ms = 5000L

# Generate the PhantomJS script. '%s' receives the URL and '%d' the wait time.
writeLines(sprintf("
var page = require('webpage').create();
var fs = require('fs');
var path = 'scrape.html';
page.open('%s', function (status) {
  // Abort with a non-zero exit code if the page could not be loaded.
  if (status !== 'success') {
    console.log('Failed to load page: ' + status);
    phantom.exit(1);
    return;
  }
  // Give the page's own scripts time to build the table before saving.
  window.setTimeout(function () {
    fs.write(path, page.content, 'w');
    phantom.exit();
  }, %d);
});", url, wait_ms), con="scrape.js")

# Run PhantomJS and fail loudly instead of silently leaving a stale or
# missing scrape.html behind.
exit_code = system("./phantomjs.exe scrape.js")
if (exit_code != 0) {
  stop("phantomjs failed with exit code ", exit_code)
}
您不需要在表格构建完成之后再去抓取它。这个网站会向一些端点发出请求,例如以下这些。要找到这些端点,请使用浏览器的网络检查器(按 F12)。更简单的做法是直接获取用于构造这些表格的 JSON,而不是等表格构造完成后再去抓取。 编辑:构成表格的所有数据都在这些 JSON 里。要获取数据,首先发送 GET 请求,下载这些端点返回的内容。检查返回的内容时,您会发现它们是 JSON,但被包裹在一个函数调用中(即 JSONP),只需把这层包裹去掉即可。 例如,在第一个链接中,您需要删除包裹 JSON 的开头部分
_matchesByYearAndMonthCallback(
以及末尾的 )
删除后,您将获得一个有效的 JSON,可以在 R 中使用 jsonlite 或 rjson 包(例如 jsonlite::fromJSON)来解析,详情请查看这些包的文档。解析之后,您应该会得到一个列表或数据框(data frame),可以从中选取所需的信息
您将得到的 JSON 响应示例(节选):
{
"competitionslist": {
"0": {
"name": "Friendlies",
"idCup": 506,
"edition": 1872,
"idCupSeason": 2000010101,
"isFifaCompetition": true,
"countryCode": "",
"cupKindID": 105,
"competitionSeoName": "friendly-506",
"hasStanding": false,
"linkMatches": "",
"linkStanding": "",
"link": "",
"hasMatchLive": false,
"isActiveSeason": true,
"matchlist": [{
"idCup": 506,
"idCupSeason": 2000010101,
"edition": 1872,
"isLive": false,
"isActiveSeason": true,
"isFifaCompetition": true,
"isClubCompetition": false,
"competitionName": "Friendlies",
"providerCompetitionID": 0,
"providerEditionID": 0,
"idMatch": 300438343,
"internalMatchID": 0,
"idRound": 281863,
"idHomeTeam": 43818,
"homeCountryCode": "IRQ",
"homeTeamName": "Iraq",
"idAwayTeam": 43989,
"awayCountryCode": "PLE",
"awayTeamName": "Palestine",
"matchDate": "2018-05-08T16:00:00Z",
"matchDateUTC": "2018-05-08T16:00:00Z",
"kickOffTime": "16:00",
"minute": 0,
"status": 0,
"cupKindID": 105,
"cupKindName": "Friendly",
"hasLineup": false,
"scoreHome": 0,
"scoreAway": 0,
"venueName": "Basra ",
"competitionSeoName": "friendly-506",
"matchSeoName": "Iraq-Palestine-300438343",
"homeTeamSeoName": "iraq-43818",
"awayTeamSeoName": "palestine-43989",
"hasStanding": false,
"winTeamName": "",
"winTeamShortName": "",
"isStarted": true,
"isFinished": true,
"isAwarded": false,
"isPostponed": false,
"isSuspended": false,
"isAbandoned": false,
"link": "",
"isNextDay": false
}, {
"idCup": 506,
"idCupSeason": 2000010101,
"edition": 1872,
"isLive": false,
"isActiveSeason": true,
"isFifaCompetition": true,
"isClubCompetition": false,
"competitionName": "Friendlies",
"providerCompetitionID": 0,
"providerEditionID": 0,
"idMatch": 300439349,
"internalMatchID": 0,
"idRound": 281863,
"idHomeTeam": 43843,
"homeCountryCode": "ALG",
"homeTeamName": "Algeria",
"idAwayTeam": 43835,
"awayCountryCode": "KSA",
"awayTeamName": "Saudi Arabia",
"matchDate": "2018-05-09T19:30:00Z",
"minute": 0,
"status": 0,
"cupKindID": 105,
"cupKindName": "Friendly",
"hasLineup": false,
"scoreHome": 0,
"scoreAway": 2,
"venueName": "Cadiz ",
"idWinTeam": 43835,
"competitionSeoName": "friendly-506",
"matchSeoName": "Algeria-Saudi Arabia-300439349",
"homeTeamSeoName": "algeria-43843",
"awayTeamSeoName": "saudi-arabia-43835",
"hasStanding": false,
"winTeamName": "Saudi Arabia",
"winTeamShortName": "Saudi Arabia",
"isStarted": true,
"isFinished": true,
"isAwarded": false,
"isPostponed": false,
"isSuspended": false,
"isAbandoned": false,
"link": "",
"isNextDay": false
},
谢谢你的回复,它澄清了很多问题!不过,我还是不太明白该如何获取用于构造表格的 JSON?