Mongodb 按天/次日逻辑汇总分析数据
我为移动应用分析写了一个服务器,其中有一个分片(sharded)集合,事件如下:Mongodb 按天/次日逻辑汇总分析数据,mongodb,aggregation-framework,Mongodb,Aggregation Framework,我为移动应用分析写了一个服务器,其中有一个分片(sharded)集合,事件如下: { "event": "install", "userId": "a", "time": 2014-02-09, "data" : ... }, { "event": "login", "userId": "a", "time": 2014-02-12, "data" : ... }, { "event": "install", "userId": "b", "time": 201
{
"event": "install",
"userId": "a",
"time": 2014-02-09,
"data" : ...
},
{
"event": "login",
"userId": "a",
"time": 2014-02-12,
"data" : ...
},
{
"event": "install",
"userId": "b",
"time": 2014-4-29,
"data" : ...
},
{
"event": "login",
"userId": "b",
"time": 2014-4-30,
"data" : ...
}
...
我需要选择已安装事件但在安装事件后第二天未登录的用户。换句话说,我要选择已安装应用程序但第二天未登录的用户。因此,上述数据的输出应为:
{
"userId": "a",
"data" : ...
}
如何使用聚合框架或 mapreduce 完成这个任务?或者有其他解决方案吗?这个问题有点棘手 :-) 如果 time 只是一个不含时分秒的日期字段,就可以通过聚合来实现。假设集合中有如下文档:
{
"_id" : ObjectId("57694365ef9176ec54960a66"),
"event" : "install",
"userId" : "a",
"time" : ISODate("2014-09-02T00:00:00.000Z")
},{
"_id" : ObjectId("57694365ef9176ec54960a67"),
"event" : "login",
"userId" : "a",
"time" : ISODate("2014-12-02T00:00:00.000Z")
},{
"_id" : ObjectId("57694365ef9176ec54960a68"),
"event" : "install",
"userId" : "b",
"time" : ISODate("2014-04-29T00:00:00.000Z")
},{
"_id" : ObjectId("57694365ef9176ec54960a69"),
"event" : "login",
"userId" : "b",
"time" : ISODate("2014-04-30T00:00:00.000Z")
}
我们可以使用聚合查询:
// Finds install events whose user has no event in the same collection dated
// exactly one day after the install.
// The comparison is an exact equality on `time`, so it only works when `time`
// holds a date without a time-of-day part (e.g. 2014-09-02T00:00:00.000Z).
// NOTE(review): $lookup could not read from a sharded `from` collection in the
// MongoDB versions this was written for - confirm for your deployment.

// Stage 1: keep only the install events.
var match = {
    $match : {
        "event" : "install"
    }
};

// Stage 2: add `nextDay` = install time + 24 hours ($add on a date takes milliseconds).
var projectNextDayDate = {
    $project : {
        _id : 1,
        event : 1,
        userId : 1,
        time : 1,
        nextDay : {
            $add : ["$time", 24 * 60 * 60 * 1000]
        }
    }
};

// Stage 3: self-join - attach every event (of any user) whose `time` equals `nextDay`.
var lookup = {
    $lookup : {
        from : "zella",
        localField : "nextDay",
        foreignField : "time",
        as : "mergedDocs"
    }
};

// Stage 4: keep only the joined events that belong to the installing user.
// `input` must be the field path "$mergedDocs"; the bare string "mergedDocs"
// is a string literal, not the joined array.
var nowMatchUsers = {
    $project : {
        _id : 1,
        event : 1,
        userId : 1,
        time : 1,
        nextDay : 1,
        mergedDocs : {
            $filter : {
                input : "$mergedDocs",
                as : "m",
                cond : {
                    $eq : ["$$m.userId", "$userId"]
                }
            }
        }
    }
};

// Stage 5: an empty array means the user had no event on the day after install.
var findEmptyArrays = {
    $match : {
        mergedDocs : []
    }
};

// nowMatchUsers has to run before findEmptyArrays; without it, any other
// user's event on `nextDay` would make mergedDocs non-empty and hide the user.
db.zella.aggregate([match, projectNextDayDate, lookup, nowMatchUsers, findEmptyArrays]);
使用此输出:
{
"_id" : ObjectId("57694365ef9176ec54960a66"),
"event" : "install",
"userId" : "a",
"time" : ISODate("2014-09-02T00:00:00.000Z"),
"nextDay" : ISODate("2014-09-03T00:00:00.000Z"),
"mergedDocs" : []
}
此处假设 time 字段是不含时分秒的日期,例如 2014-09-02T00:00:00.000。
合并集合的另一种方法是使用 userId 作为 $lookup 的关联字段,但这样需要更多逻辑来筛选结果集,性能可能会下降。另一个回答:您可以尝试运行以下聚合管道:
// Lists users that have an install event with no login event exactly one day
// later. Uses no $lookup, so it does not depend on joining collections.
//
// Every install/login event is placed in a bucket keyed by (userId, checkDay),
// where checkDay is the day on which a login would count for that event:
//   install -> time + 1 day, login -> its own time.
// A bucket that holds an install and no login is a churned install.
//
// Counting per bucket (instead of sorting by event name and taking
// $first/$last) keeps the result independent of how many login events a user
// has and of the order in which they are read.
//
// `time` is compared by exact equality, so it must be a date without a
// time-of-day part. A user with several churned installs on different days is
// returned once per such install.
db.test.aggregate([
    // Filter first so only the relevant events flow through the pipeline.
    { "$match": { "event": { "$in": ["install", "login"] } } },
    {
        "$project": {
            "userId": 1,
            "data": 1,
            "isInstall": { "$cond": [ { "$eq": [ "$event", "install" ] }, 1, 0 ] },
            "isLogin": { "$cond": [ { "$eq": [ "$event", "login" ] }, 1, 0 ] },
            "checkDay": {
                "$cond": [
                    { "$eq": [ "$event", "install" ] },
                    { "$add": [ "$time", 24 * 60 * 60 * 1000 ] },
                    "$time"
                ]
            }
        }
    },
    {
        "$group": {
            "_id": { "userId": "$userId", "checkDay": "$checkDay" },
            "installs": { "$sum": "$isInstall" },
            "logins": { "$sum": "$isLogin" },
            // For buckets that survive the final match this holds install payloads only.
            "data": { "$push": "$data" }
        }
    },
    // Buckets without an install are plain login days and are irrelevant here.
    { "$match": { "installs": { "$gt": 0 } } },
    {
        "$project": {
            "isChurn": { "$eq": [ "$logins", 0 ] },
            "userId": "$_id.userId", "data": 1, "_id": 0
        }
    },
    { "$match" : { "isChurn" : true } }
]);
这里是另一个使用mapreduce和聚合的解决方案:
/**
 * Map step: emits one value per install/login event, keyed by the user and by
 * the day on which a login would count as a "next day" login.
 *
 * - install events are keyed by their time plus 24 hours;
 * - login events are keyed by their own time;
 * - every other event is ignored.
 *
 * Called with `this` bound to an event document ({event, userId, time, data}).
 * Written in ES5 on purpose: the function is shipped to the server as-is.
 */
var mapFunction = function() {
    if (this.event !== 'install' && this.event !== 'login') {
        return;
    }
    var value = { data: this.data, count: 1 };
    if (this.event === 'install') {
        // The documents store the timestamp in `time` (there is no `date` field).
        var nextDay = new Date(this.time.getTime() + 24 * 60 * 60 * 1000);
        emit({ userId: this.userId, nextDayAfterInstall: nextDay }, value);
    } else {
        // Only 'login' can reach this branch because of the guard above.
        emit({ userId: this.userId, nextDayAfterInstall: this.time }, value);
    }
};
/**
 * Reduce step: merges all values emitted for one key.
 *
 * @param {Object} event  - the key the values were emitted under (not used).
 * @param {Array}  values - objects of the form {data, count}.
 * @returns {Object} {data, count} where count is the sum of the input counts
 *                   and data is the data of the last value (null if empty).
 *
 * The sum starts at 0 so the result is the same whether the values arrive in
 * one call or the function is re-applied to its own partial results.
 */
var reduceFunction = function(event, values) {
    var value = { data: null, count: 0 };
    for (var index = 0; index < values.length; ++index) {
        value.count += values[index].count;
        value.data = values[index].data;
    }
    return value;
};
// Runs the map-reduce job over the events collection and writes the result to
// the "case1_mr_out" collection.
// The `query` option hands only install/login events to the map function; the
// map function skips every other event anyway, so the output is unchanged.
db.events.mapReduce(
    mapFunction,
    reduceFunction,
    {
        query: { event: { $in: ["install", "login"] } },
        out: "case1_mr_out"
    }
);
// Collapse the map-reduce output (one row per user and day) to one row per
// user: keep the last `value.data` seen and the largest `value.count`.
var groupByUserId = {
    "$group": {
        "_id": { "userId": "$_id.userId" },
        "data": { "$last": "$value.data" },
        "count": { "$max": "$value.count" }
    }
};

// Keep the users whose largest per-day count is exactly 1.
var filterWhereOnlyOne = { "$match": { "count": 1 } };

db.case1_mr_out.aggregate([
    groupByUserId,
    filterWhereOnlyOne
]);
谢谢,解决方案有效,但 $lookup 不支持分片集合,所以现在我正在尝试理解 chridam 的解决方案,它看起来有点棘手 :) 哈哈 :-) 这个信息应该写在问题里 :-) Christopher 的答案对分片集合有效。抱歉,我提问时还不知道这个限制。完全没问题 :-) 我们都是人,都在学习如何使用 mongo :D