Twitter数据-查找MongoDB中最受关注的用户_Mongodb_Twitter

Twitter数据-查找MongoDB中最受关注的用户

mongodb twitter

Twitter数据-查找MongoDB中最受关注的用户,mongodb,twitter,Mongodb,Twitter,假设我有来自twitterapi的流数据，并且数据作为文档存储在MongoDB中。我试图找到的是实体下的屏幕名称的计数。用户提到 { "_id" : ObjectId("50657d5844956d06fb5b36c7"), "contributors" : null, "text" : "", "entities" : { "urls" : [ ], "hashtags" : [ {

假设我有来自Twitter API的流数据，并且这些数据作为文档存储在MongoDB中。我想统计的是 entities.user_mentions 下每个 screen_name 出现的次数（即每个用户被提及的次数）。

{
    "_id" : ObjectId("50657d5844956d06fb5b36c7"),
    "contributors" : null,
    "text" : "",
    "entities" : {
        "urls" : [ ],
        "hashtags" : [
            {
                "text" : "",
                "indices" : [
                    26,
                    30
                ]
            },
            {
                "text" : "",
                "indices" : []
            }
        ],
        "user_mentions" : [ 
                {
                    "name":"Twitter API", 
                    "indices":[4,15], 
                    "screen_name":"twitterapi", 
                    "id":6253282, "id_str":"6253282"
                }]
    },
    ...

我已尝试使用map reduce：

/**
 * Map step: emit one (screen_name, 1) pair for every user mentioned in a tweet.
 *
 * Runs with `this` bound to a tweet document. `entities.user_mentions` is an
 * array of mention objects, so `screen_name` has to be read from each element
 * rather than from the array itself.
 */
var map = function() {
    var mentions = this.entities && this.entities.user_mentions;

    // Skip tweets that have no entities or mention nobody.
    if (!mentions || !mentions.length) {
        return;
    }

    // Indexed loop: for...in would also visit any enumerable non-index
    // properties of the array.
    for (var i = 0; i < mentions.length; i++) {
        var screenName = mentions[i] && mentions[i].screen_name;

        if (screenName) {
            emit(screenName, 1);
        }
    }
};

/**
 * Reduce step: add up the numeric counts collected for a single key.
 *
 * @param previous  first argument passed to the reduce function (the key
 *                  being reduced); not used by this implementation
 * @param current   array of numeric counts to sum for that key
 * @returns the sum of all entries in `current` (0 when it is empty)
 */
var reduce = function(previous, current) {
    var count = 0;
    var length = current ? current.length : 0;

    // Indexed loop with a local counter: the previous for...in form leaked
    // `index` as an implicit global and would also visit non-index properties.
    for (var i = 0; i < length; i++) {
        count += current[i];
    }

    return count;
};

// Run the map/reduce job over the "twitter_sample" collection using the
// `map` and `reduce` functions defined above, writing the output to the
// "user_mentions" collection. The command's reply is kept in `result`.
result = db.runCommand({
    mapreduce: "twitter_sample",
    map: map,
    reduce: reduce,
    out: "user_mentions"
});

但是它不太有效…

因为 entities.user_mentions 是一个数组，您需要在 map() 中为其中的每个 screen_name 发出（emit）一个值：
然后在 reduce() 中按唯一的 screen_name 对这些值进行汇总计数：
注意：要调试map/reduce的JavaScript函数，可以使用 print() 和 printjson() 命令，输出将出现在您的mongod日志中。
编辑：为了进行比较，下面是一个使用MongoDB 2.2中新增的聚合框架（Aggregation Framework）的示例：
// Count how many times each screen_name is mentioned, most-mentioned first.
// The stages are passed as one pipeline array (the documented form of
// aggregate()) instead of as separate positional arguments.
db.twitter_sample.aggregate([
    // Project to limit the document fields included
    { $project: {
        _id: 0,
        "entities.user_mentions" : 1
    }},

    // Split user_mentions array into a stream of documents
    { $unwind: "$entities.user_mentions" },

    // Group and count the unique mentions by screen_name
    { $group : {
        _id: "$entities.user_mentions.screen_name",
        count: { $sum : 1 }
    }},

    // Optional: sort by count, descending
    { $sort : {
        "count" : -1
    }}
]);

原始的Map/Reduce方法最适合于大型数据集，正如Twitter数据所暗示的那样。有关Map/Reduce与聚合框架限制的比较，请参阅StackOverflow问题的相关讨论。
向您致敬，先生。我想知道能否使用聚合框架或 group() 函数做同样的事情？
@chutsu：您可以用聚合框架执行相同的操作，但有一些注意事项，例如目前结果只能内联返回，且结果文档最大为16MB。也可以使用 group() 命令，但它同样有一些限制。有关各种方法及其限制的比较，请参见相关讨论。
@chutsu：为您添加了一个聚合框架示例 ;-)
嗯，聚合版本返回的值似乎与map reduce版本的不同。
@chutsu：怎么会这样？我的测试得到了相同的值，只是数据结构略有不同（aggregate() 使用 result 而不是 results，并且 count 位于顶层，而不是嵌套在 value 下）。
/**
 * Reduce step: total the `count` fields of every value gathered for a key.
 *
 * @param key     the key being reduced (unused)
 * @param values  array of objects of the form { count: <number> }
 * @returns an object of the same { count: <number> } form holding the total
 */
var reduce = function(key, values) {
    // Fold the per-value counts into one running total, starting from zero.
    var total = values.reduce(function(sum, entry) {
        return sum + entry.count;
    }, 0);

    // The output has the same shape as the input values.
    return { count: total };
};

// Count how many times each screen_name is mentioned, most-mentioned first.
// The stages are passed as one pipeline array (the documented form of
// aggregate()) instead of as separate positional arguments.
db.twitter_sample.aggregate([
    // Project to limit the document fields included
    { $project: {
        _id: 0,
        "entities.user_mentions" : 1
    }},

    // Split user_mentions array into a stream of documents
    { $unwind: "$entities.user_mentions" },

    // Group and count the unique mentions by screen_name
    { $group : {
        _id: "$entities.user_mentions.screen_name",
        count: { $sum : 1 }
    }},

    // Optional: sort by count, descending
    { $sort : {
        "count" : -1
    }}
]);