MongoDB:考虑到给定字段中的每个子字段,如何统计具有唯一最大值的文档数?(问题)

Mongodb 考虑到给定字段中的每个子字段,如何计算具有最大唯一值的文档数? 问题,mongodb,nosql-aggregation,Mongodb,Nosql Aggregation,鉴于这种结构: { "_id": ObjectId("56n606c39q49b80adfe6d17b") "data": { "field1": [ { "subfield1.1": [ { "val1.1.1&

鉴于这种结构:

{
    "_id": ObjectId("56n606c39q49b80adfe6d17b")
    "data": {
        "field1": [
            {
                "subfield1.1": [
                    {
                        "val1.1.1": a
                    },
                    {
                        "val1.1.2": b
                    }
                ]
            },
            {
                "subfield1.2": [
                    {
                        "val1.2.1": c
                    },
                    {
                        "val1.2.2": d
                    }
                ]
            }
        ]
    }
}
考虑到给定字段中每个“子字段”中的所有“val”,如何编写一个查询来统计具有唯一最大“val”的文档数?

需要考虑的事实:

  • “val”是数组中的元素
  • “子字段”也是数组中的元素
  • 所有文档的“字段”、“子字段”和“val”字段名称都相同
  • 可能有1个或多个“val”
我对NoSQL有些陌生。在普通的SQL中,我可能会使用自连接来解决这个问题,但在这里,即使可以使用聚合,我也找不到接近实际解决方案的方法

// Shell helper: selects the database that the `db.myCollection` query below runs against.
// NOTE(review): `use` is a mongo shell helper, not JavaScript — confirm this is fed to the shell (not loaded as a .js file).
use myDB;

/**
 * Decides whether a list of candidate values describes a unique maximum.
 *
 * Only the length and the first two entries of the list are inspected.
 *
 * @param {Array} list - Candidate values; entry 0 is compared with entry 1.
 * @returns {boolean} false for an empty list, true for a single entry,
 *   otherwise true only when the first two entries differ.
 */
function validate(list){
    switch (list.length) {
        case 0:
            // Nothing was collected, so there is no maximum at all.
            return false;
        case 1:
            // A single candidate cannot be tied with anything.
            return true;
        default:
            // Loose inequality on purpose: values that coerce to the same
            // thing (e.g. 1 and "1") are treated as a tie.
            return list[0] != list[1];
    }
}

/**
 * Scans every `val` of every `subfield` array inside a `field` array and
 * reports whether the largest value found occurs exactly once.
 *
 * A list of running maxima is kept with the newest one at the front: a value
 * is put at the front whenever it is >= the current front. After the scan,
 * entry 0 is the maximum and entry 1 (if any) is the previous front, which
 * equals entry 0 exactly when the maximum was seen more than once. The final
 * verdict on that list is delegated to `validate()`.
 *
 * @param {Array<Object>} field - Elements that may carry a `subfield` array
 *   of `{ val }` objects.
 * @returns {boolean} The result of `validate()` on the collected maxima.
 */
function verifySubfields(field){
    const candidates = [];
    field.forEach(fieldElement => {
        const subfield = fieldElement.subfield;
        // Skip elements without a subfield array, or whose first entry is
        // falsy (treated as an empty subfield).
        if (!subfield || !subfield[0]){
            return;
        }
        subfield.forEach(subfieldElement => {
            const val = subfieldElement.val;
            // Compare the value just read; the previous code compared an
            // undeclared name here, so the scan failed on the second value.
            if (candidates.length === 0 || val >= candidates[0]){
                candidates.unshift(val);
            }
        });
    });

    return validate(candidates);
}

/**
 * Runs the unique-maximum check on the `data.field` array of one document.
 *
 * @param {Object} doc - Document exposing `data.field`.
 * @returns {*} Whatever `verifySubfields()` returns for that array.
 */
function verifyField(doc){
    const { field } = doc.data;
    return verifySubfields(field);
}

// Walk every document that has a "data.field.subfield" path, tallying how many
// were visited and how many pass verifyField(), then print both totals.
let iterations = 0;
let cont = 0;
const matchingDocs = db.myCollection.find({ "data.field.subfield": { $exists: true } });
matchingDocs.forEach((doc) => {
    iterations += 1;
    if (verifyField(doc)) {
        cont += 1;
    }
});
print(`\nTotal: ${iterations} \nPositives: ${cont} \n`);
示例:假设 a 是最大值……以下文档应计入:

{
    "_id": ObjectId("56n606c39q49b80adfe6d17b")
    "data": {
        "field1": [
            {
                "subfield1.1": [
                    {
                        "val1.1.1": a
                    },
                    {
                        "val1.1.2": b
                    }
                ]
            },
            {
                "subfield1.2": [
                    {
                        "val1.2.1": c
                    },
                    {
                        "val1.2.2": d
                    }
                ]
            }
        ]
    }
}
{
    "_id": ObjectId("56n606c39q49b80adfe6d17b")
    "data": {
        "field1": [
            {
                "subfield1.1": [
                    {
                        "val1.1.1": a
                    },
                    {
                        "val1.1.2": a
                    }
                ]
            },
            {
                "subfield1.2": [
                    {
                        "val1.2.1": b
                    },
                    {
                        "val1.2.2": c
                    }
                ]
            }
        ]
    }
}
以下文档不应计入:

{
    "_id": ObjectId("56n606c39q49b80adfe6d17b")
    "data": {
        "field1": [
            {
                "subfield1.1": [
                    {
                        "val1.1.1": a
                    },
                    {
                        "val1.1.2": b
                    }
                ]
            },
            {
                "subfield1.2": [
                    {
                        "val1.2.1": c
                    },
                    {
                        "val1.2.2": d
                    }
                ]
            }
        ]
    }
}
{
    "_id": ObjectId("56n606c39q49b80adfe6d17b")
    "data": {
        "field1": [
            {
                "subfield1.1": [
                    {
                        "val1.1.1": a
                    },
                    {
                        "val1.1.2": a
                    }
                ]
            },
            {
                "subfield1.2": [
                    {
                        "val1.2.1": b
                    },
                    {
                        "val1.2.2": c
                    }
                ]
            }
        ]
    }
}
以下文档同样不应计入:

{
    "_id": ObjectId("56n606c39q49b80adfe6d17b")
    "data": {
        "field1": [
            {
                "subfield1.1": [
                    {
                        "val1.1.1": a
                    },
                    {
                        "val1.1.2": b
                    }
                ]
            },
            {
                "subfield1.2": [
                    {
                        "val1.2.1": a
                    },
                    {
                        "val1.2.2": c
                    }
                ]
            }
        ]
    }
}
以下文档应计入(即使 b 重复出现):


任何想法都会受到欢迎。谢谢大家!

这里绝对需要聚合。它可能需要一些修改,但像这样的东西可能(希望)适合您:

// Step 1: flatten the two nested arrays so that every value sits in its own
// pipeline document; only then can values be compared across sub-arrays.
const flattenStages = [
    // One pipeline document per element of the outer array.
    { $unwind: "$data.field1" },
    // Lift the inner array to a flat top-level field instead of a nested path.
    { $project: { vals: "$data.field1.subfield1" } },
    // One pipeline document per element of the inner array.
    { $unwind: "$vals" },
    // Same simplification again: keep just the value under a flat name.
    { $project: { val: "$vals.val1" } },
];

// Step 2: count how often each value occurs within the same source document
// by grouping on (source document id, value) and adding 1 per entry.
const countStages = [
    { $group: { _id: { _id: "$_id", val: "$val" }, count: { $sum: 1 } } },
];

// Step 3: pick the maximum value (with its count) for each source document.
const maxStages = [
    // Descending order, so the largest value is the first one encountered.
    { $sort: { "_id.val": -1 } },
    // The first entry per source document is that document's maximum.
    {
        $group: {
            _id: "$_id._id",
            max: { $first: { val: "$_id.val", count: "$count" } },
        },
    },
    // Un-nest the result for readability.
    { $project: { val: "$max.val", count: "$max.count" } },
];

// Step 4: a maximum that was counted exactly once is a unique maximum.
const uniqueMaxStages = [
    { $match: { count: 1 } },
];

db.collection.aggregate([
    ...flattenStages,
    ...countStages,
    ...maxStages,
    ...uniqueMaxStages,
]);

不可否认,这是一个复杂的查询,在不了解实际文档结构的情况下很难保证功能。也就是说,评论中应该有足够的信息来帮助您修改它以满足您的需要。但是,如果您遇到任何问题,请告诉我,我会尽我所能帮助您解决问题。

只是想发布一个不同的解决方案,因为这个解决方案(在我使用的数据库中)比使用聚合框架的解决方案运行得稍快一些;这是一个JavaScript解决方案

// Shell helper: selects the database that the `db.myCollection` query below runs against.
// NOTE(review): `use` is a mongo shell helper, not JavaScript — confirm this is fed to the shell (not loaded as a .js file).
use myDB;

/**
 * Decides whether a list of candidate values describes a unique maximum.
 *
 * Only the length and the first two entries of the list are inspected.
 *
 * @param {Array} list - Candidate values; entry 0 is compared with entry 1.
 * @returns {boolean} false for an empty list, true for a single entry,
 *   otherwise true only when the first two entries differ.
 */
function validate(list){
    switch (list.length) {
        case 0:
            // Nothing was collected, so there is no maximum at all.
            return false;
        case 1:
            // A single candidate cannot be tied with anything.
            return true;
        default:
            // Loose inequality on purpose: values that coerce to the same
            // thing (e.g. 1 and "1") are treated as a tie.
            return list[0] != list[1];
    }
}

/**
 * Scans every `val` of every `subfield` array inside a `field` array and
 * reports whether the largest value found occurs exactly once.
 *
 * A list of running maxima is kept with the newest one at the front: a value
 * is put at the front whenever it is >= the current front. After the scan,
 * entry 0 is the maximum and entry 1 (if any) is the previous front, which
 * equals entry 0 exactly when the maximum was seen more than once. The final
 * verdict on that list is delegated to `validate()`.
 *
 * @param {Array<Object>} field - Elements that may carry a `subfield` array
 *   of `{ val }` objects.
 * @returns {boolean} The result of `validate()` on the collected maxima.
 */
function verifySubfields(field){
    const candidates = [];
    field.forEach(fieldElement => {
        const subfield = fieldElement.subfield;
        // Skip elements without a subfield array, or whose first entry is
        // falsy (treated as an empty subfield).
        if (!subfield || !subfield[0]){
            return;
        }
        subfield.forEach(subfieldElement => {
            const val = subfieldElement.val;
            // Compare the value just read; the previous code compared an
            // undeclared name here, so the scan failed on the second value.
            if (candidates.length === 0 || val >= candidates[0]){
                candidates.unshift(val);
            }
        });
    });

    return validate(candidates);
}

/**
 * Runs the unique-maximum check on the `data.field` array of one document.
 *
 * @param {Object} doc - Document exposing `data.field`.
 * @returns {*} Whatever `verifySubfields()` returns for that array.
 */
function verifyField(doc){
    const { field } = doc.data;
    return verifySubfields(field);
}

// Walk every document that has a "data.field.subfield" path, tallying how many
// were visited and how many pass verifyField(), then print both totals.
let iterations = 0;
let cont = 0;
const matchingDocs = db.myCollection.find({ "data.field.subfield": { $exists: true } });
matchingDocs.forEach((doc) => {
    iterations += 1;
    if (verifyField(doc)) {
        cont += 1;
    }
});
print(`\nTotal: ${iterations} \nPositives: ${cont} \n`);
注意:请使用 mongo shell 运行此脚本。


所选解决方案的评论中提到的问题,可以通过多次调用“verifySubfields()”(对每个字段各调用一次)并在“verifyField()”中校验这些结果来解决;届时可能需要把该函数更名为“verifyFields()”。

非常感谢您的回复,它工作得非常好。对于这个问题,我最初想实现的其实还要更进一步:我不想只根据一个字段及其子字段来统计记录,而是想同时使用四个字段来统计满足这些条件的记录。因此,假设还有另外 3 个字段,分别称为“field2”、“field3”和“field4”,以及它们各自的子字段/值。也许我太贪心了,您已经帮了我很多,我不想再要求更多。不过,您有什么建议吗?

这当然是可以实现的。请查看 $concatArrays 运算符。使用此运算符,可以合并 field1、field2、field3 和 field4 的数组。将其作为聚合管道的第一个阶段,结果将是一个包含所有四个字段全部数组元素的单一字段。这个新字段就是您接下来用答案中列出的其余聚合阶段处理的字段。

再次感谢您的回复。对不起,我想我没有把自己的意思解释清楚。我的意思是,它不应只统计“field1”满足此条件的文档,还应将同样的条件分别应用于更多字段。用编程逻辑来说,我想要的是“if(validate(field1) && validate(field2)){count++;}”,而您帮我解决的只是“if(validate(field1)){count++;}”。