如何抓取国际足联网站上由JavaScript生成的表格

如何抓取国际足联网站的javascripted表格,javascript,r,phantomjs,scrape,Javascript,R,Phantomjs,Scrape,作为一个研究项目,我想从国际足联网站上搜集国际足球比赛的所有结果。我用R来做这个。但是,似乎包含匹配项的表是使用javascript生成的。 这是我想要抓取的url: 我尝试在呈现javascript表之后使用phantomjs来呈现页面,但是在生成的html中,仍然没有给出包含匹配结果的表。这是我的代码: url = "http://www.fifa.com/live-scores/international- tournaments/fixtures-results/in

作为一个研究项目,我想从国际足联网站上搜集国际足球比赛的所有结果。我用R来做这个。但是,包含比赛结果的表格似乎是用JavaScript生成的。 这是我想要抓取的url:

我尝试使用phantomjs来渲染页面,以便在JavaScript生成表格之后再获取内容,但是在生成的html中,仍然没有包含比赛结果的表格。这是我的代码:

    # Keep the URL on a single line: an R string literal that spans several
    # lines embeds the newline and the indentation in its value, which yields
    # an invalid URL.
    url = "http://www.fifa.com/live-scores/international-tournaments/fixtures-results/index.html#month5-2018"

    # Write the PhantomJS script to scrape.js; sprintf() substitutes the URL
    # into the page.open() call below.
    writeLines(sprintf("
    var page = require('webpage').create();
    var fs = require('fs');
    var path = 'scrape.html';

    page.open('%s', function (status) {
      // Stop early with a non-zero exit code if the page could not be loaded.
      if (status !== 'success') {
        console.log('Failed to load page: ' + status);
        phantom.exit(1);
        return;
      }
      // The open callback fires before the page's own scripts have finished,
      // so wait a few seconds before taking the snapshot.
      // NOTE(review): 5000 ms is a guess; tune it for the target page.
      window.setTimeout(function () {
        fs.write(path, page.content, 'w');
        phantom.exit();
      }, 5000);
    });", url), con="scrape.js")

    # Run the generated script; the rendered HTML is saved to scrape.html.
    system("./phantomjs.exe scrape.js")

您不需要等表格构建完成之后再去抓取它,这个网站会向一些端点发出请求,如下所示

要查找它们,请使用浏览器上的网络检查器(按f12)。更简单的方法是直接获取用来构造这些表格的JSON,而不是等表格构造完成之后再去抓取表格

编辑:构成表格的所有数据都在这些JSON中。要获取数据,首先执行get请求,下载包含这些JSON的网页的内容。当你检查网页的内容时,你会看到这些内容是JSON,但它们被包在一个函数调用中,只需去掉这层函数调用即可

例如,在第一个链接中,您可以删除包裹json的开头部分
_matchesByYearAndMonthCallback(
和最后一个右括号 )

删除后,您将获得一个有效的json,您可以在R中使用jsonlite或rjson包来解析该json,请查看文档。使用这些包中的一个之后,您应该会得到一个数据框(data frame),您可以从中选取所需的信息

您将得到的json请求示例

{
"competitionslist": {
    "0": {
        "name": "Friendlies",
        "idCup": 506,
        "edition": 1872,
        "idCupSeason": 2000010101,
        "isFifaCompetition": true,
        "countryCode": "",
        "cupKindID": 105,
        "competitionSeoName": "friendly-506",
        "hasStanding": false,
        "linkMatches": "",
        "linkStanding": "",
        "link": "",
        "hasMatchLive": false,
        "isActiveSeason": true,
        "matchlist": [{
            "idCup": 506,
            "idCupSeason": 2000010101,
            "edition": 1872,
            "isLive": false,
            "isActiveSeason": true,
            "isFifaCompetition": true,
            "isClubCompetition": false,
            "competitionName": "Friendlies",
            "providerCompetitionID": 0,
            "providerEditionID": 0,
            "idMatch": 300438343,
            "internalMatchID": 0,
            "idRound": 281863,
            "idHomeTeam": 43818,
            "homeCountryCode": "IRQ",
            "homeTeamName": "Iraq",
            "idAwayTeam": 43989,
            "awayCountryCode": "PLE",
            "awayTeamName": "Palestine",
            "matchDate": "2018-05-08T16:00:00Z",
            "matchDateUTC": "2018-05-08T16:00:00Z",
            "kickOffTime": "16:00",
            "minute": 0,
            "status": 0,
            "cupKindID": 105,
            "cupKindName": "Friendly",
            "hasLineup": false,
            "scoreHome": 0,
            "scoreAway": 0,
            "venueName": "Basra ",
            "competitionSeoName": "friendly-506",
            "matchSeoName": "Iraq-Palestine-300438343",
            "homeTeamSeoName": "iraq-43818",
            "awayTeamSeoName": "palestine-43989",
            "hasStanding": false,
            "winTeamName": "",
            "winTeamShortName": "",
            "isStarted": true,
            "isFinished": true,
            "isAwarded": false,
            "isPostponed": false,
            "isSuspended": false,
            "isAbandoned": false,
            "link": "",
            "isNextDay": false
        }, {
            "idCup": 506,
            "idCupSeason": 2000010101,
            "edition": 1872,
            "isLive": false,
            "isActiveSeason": true,
            "isFifaCompetition": true,
            "isClubCompetition": false,
            "competitionName": "Friendlies",
            "providerCompetitionID": 0,
            "providerEditionID": 0,
            "idMatch": 300439349,
            "internalMatchID": 0,
            "idRound": 281863,
            "idHomeTeam": 43843,
            "homeCountryCode": "ALG",
            "homeTeamName": "Algeria",
            "idAwayTeam": 43835,
            "awayCountryCode": "KSA",
            "awayTeamName": "Saudi Arabia",
            "matchDate": "2018-05-09T19:30:00Z",
            "minute": 0,
            "status": 0,
            "cupKindID": 105,
            "cupKindName": "Friendly",
            "hasLineup": false,
            "scoreHome": 0,
            "scoreAway": 2,
            "venueName": "Cadiz ",
            "idWinTeam": 43835,
            "competitionSeoName": "friendly-506",
            "matchSeoName": "Algeria-Saudi Arabia-300439349",
            "homeTeamSeoName": "algeria-43843",
            "awayTeamSeoName": "saudi-arabia-43835",
            "hasStanding": false,
            "winTeamName": "Saudi Arabia",
            "winTeamShortName": "Saudi Arabia",
            "isStarted": true,
            "isFinished": true,
            "isAwarded": false,
            "isPostponed": false,
            "isSuspended": false,
            "isAbandoned": false,
            "link": "",
            "isNextDay": false
        },

谢谢你的回复,它澄清了很多!然而,我真的不明白如何选择构造表的JSON?