Mongodb 考虑到给定字段中的每个子字段，如何计算具有最大唯一值的文档数？问题_Mongodb_Nosql Aggregation

Mongodb 考虑到给定字段中的每个子字段，如何计算具有最大唯一值的文档数？问题

mongodb

Mongodb 考虑到给定字段中的每个子字段，如何计算具有最大唯一值的文档数？问题,mongodb,nosql-aggregation,Mongodb,Nosql Aggregation,鉴于这种结构： { "_id": ObjectId("56n606c39q49b80adfe6d17b") "data": { "field1": [ { "subfield1.1": [ { "val1.1.1&

鉴于这种结构：

{
    "_id": ObjectId("56n606c39q49b80adfe6d17b")
    "data": {
        "field1": [
            {
                "subfield1.1": [
                    {
                        "val1.1.1": a
                    },
                    {
                        "val1.1.2": b
                    }
                ]
            },
            {
                "subfield1.2": [
                    {
                        "val1.2.1": c
                    },
                    {
                        "val1.2.2": d
                    }
                ]
            }
        ]
    }
}

考虑给定字段下每个“子字段”中的所有“val”，如何编写一个查询，统计“最大 val 值唯一”（即最大值在该字段内只出现一次）的文档数量？

需要考虑的事实：

“val”是数组中的元素
“子字段”也是数组中的元素
所有文档的“字段”、“子字段”和“val”字段名称都相同
可能有1个或多个“val”

我对NoSQL有些陌生。在普通的SQL中，我可能会使用自连接来解决这个问题，但在这里，即使可以使用聚合，我也找不到接近实际解决方案的方法

// Select "myDB" as the current database (a mongo shell helper command, not a JavaScript statement).
use myDB;

/**
 * Determines if a "field" has a unique maximum value.
 *
 * Only the length and the first two entries of `list` are inspected:
 * an empty list fails, a single-entry list passes, and otherwise the
 * list passes when its first two entries differ (loose inequality).
 *
 * @param {Array} list - Candidate maximum values, most recent maximum first.
 * @returns {boolean} true when the first entry is not repeated in the second slot.
 */
function validate(list){
    const size = list.length;

    // Nothing collected: there is no maximum at all.
    if (size == 0){
        return false;
    }

    // A single candidate is trivially unique.
    if (size == 1){
        return true;
    }

    // Two or more candidates: the maximum is unique only if the runner-up differs.
    return list[0] != list[1];
}

/**
 * Checks whether the largest "val" found across every "subfield" array of a
 * "field" occurs exactly once.
 *
 * Iterates over all the values in every subfield of the given field and keeps
 * a list of running maxima: a value is inserted at the front whenever it is
 * greater than or equal to the current front. A repeated maximum therefore
 * occupies the first two slots of the list, which is what `validate()` checks.
 *
 * @param {Array<Object>} field - Elements that may carry a `subfield` array of
 *     objects with a `val` property.
 * @returns {boolean} The result of `validate()` on the collected maxima.
 */
function verifySubfields(field){
    const list = [];
    field.forEach(fieldElement => {
        // Skip elements without a subfield, or whose subfield has no
        // (truthy) first entry.
        if (fieldElement.subfield && fieldElement.subfield[0]){
            fieldElement.subfield.forEach(subfieldElement => {
                const val = subfieldElement.val;

                if (list.length === 0){
                    list.push(val);
                }
                // Compare the current value itself. The previous code compared
                // an undeclared identifier `a` here instead of `val`.
                else if (val >= list[0]){
                    list.unshift(val);
                }
            });
        }
    });

    return validate(list);
}

/**
 * Runs the unique-maximum check on the `data.field` value of a document.
 *
 * @param {Object} doc - Document exposing a `data.field` property.
 * @returns {*} Whatever `verifySubfields()` returns for that field.
 */
function verifyField(doc){
    const { field } = doc.data;
    return verifySubfields(field);
}

// Scan every document that has a "data.field.subfield" path, counting how many
// were visited (iterations) and how many pass verifyField() (cont), then print
// both totals.
let cont = 0;
let iterations = 0;
db.myCollection
    .find({ "data.field.subfield": { $exists: true } })
    .forEach((doc) => {
        iterations += 1;
        if (verifyField(doc)) {
            cont += 1;
        }
    });
print(`\nTotal: ${iterations} \nPositives: ${cont} \n`);

示例（假设 a 是最大值）……此文档应当被计入：

{
    "_id": ObjectId("56n606c39q49b80adfe6d17b")
    "data": {
        "field1": [
            {
                "subfield1.1": [
                    {
                        "val1.1.1": a
                    },
                    {
                        "val1.1.2": b
                    }
                ]
            },
            {
                "subfield1.2": [
                    {
                        "val1.2.1": c
                    },
                    {
                        "val1.2.2": d
                    }
                ]
            }
        ]
    }
}

{
    "_id": ObjectId("56n606c39q49b80adfe6d17b")
    "data": {
        "field1": [
            {
                "subfield1.1": [
                    {
                        "val1.1.1": a
                    },
                    {
                        "val1.1.2": a
                    }
                ]
            },
            {
                "subfield1.2": [
                    {
                        "val1.2.1": b
                    },
                    {
                        "val1.2.2": c
                    }
                ]
            }
        ]
    }
}

此文档不应被计入：

{ "_id": ObjectId("56n606c39q49b80adfe6d17b") "data": { "field1": [ { "subfield1.1": [ { "val1.1.1": a }, { "val1.1.2": b } ] }, { "subfield1.2": [ { "val1.2.1": c }, { "val1.2.2": d } ] } ] } }

{ "_id": ObjectId("56n606c39q49b80adfe6d17b") "data": { "field1": [ { "subfield1.1": [ { "val1.1.1": a }, { "val1.1.2": a } ] }, { "subfield1.2": [ { "val1.2.1": b }, { "val1.2.2": c } ] } ] } }
此文档同样不应被计入：

{ "_id": ObjectId("56n606c39q49b80adfe6d17b") "data": { "field1": [ { "subfield1.1": [ { "val1.1.1": a }, { "val1.1.2": b } ] }, { "subfield1.2": [ { "val1.2.1": a }, { "val1.2.2": c } ] } ] } }
此文档应当被计入（即使 b 重复出现）：

任何想法都会受到欢迎。谢谢大家!
这里绝对需要聚合。它可能需要一些修改，但像这样的东西可能（希望）适合您：

// Aggregation pipeline that returns one result per source document whose
// maximum value (across the unwound nested arrays) occurs exactly once.
// The pipeline was previously collapsed onto a single line, where the first
// `//` comment hid every stage after it; the line breaks are restored here.
db.collection.aggregate([
    /* Step 1: We need to unravel the multi-dimensional array first, otherwise
       we can't efficiently search for globally unique maximums. */

    // Unravel the outer array first.
    { $unwind: "$data.field1" },

    // Simplifies the representation of the unwind result so that we have a
    // flat field path rather than a nested one.
    { $project: {
        vals: "$data.field1.subfield1"
    }},

    // Now unravel the inner array.
    { $unwind: "$vals" },

    // Another simplification step for the same reason as the previous projection.
    { $project: {
        val: "$vals.val1"
    }},

    /* Step 2: We want to create counts for array elements that are the same
       value from the same source document. */

    // Creating the counts is easy--simply group by documents with the same
    // source document ID and the same value, adding 1 to our total for each entry.
    { $group: {
        _id: {
            _id: "$_id",
            val: "$val"
        },
        count: { $sum: 1 }
    }},

    /* Step 3: Once we have our counts, can retrieve the maximum value for
       each source document. */

    // First, sort by descending value so that the maximum value is the first
    // we encounter.
    { $sort: { "_id.val": -1 } },

    // With the entries in descending order, we can grab the first entry for
    // each source document, which will give us all of the maximums.
    { $group: {
        _id: "$_id._id",
        max: {
            $first: {
                val: "$_id.val",
                count: "$count"
            }
        }
    }},

    // Just for kicks, let's simplify once more by unnesting the data.
    { $project: {
        val: "$max.val",
        count: "$max.count"
    }},

    /* Step 4: Now we just need to limit our results. */

    // Any result with a count of 1 is a unique maximum.
    { $match: { count: 1 } }
]);

不可否认，这是一个复杂的查询，在不了解实际文档结构的情况下很难保证功能。也就是说，评论中应该有足够的信息来帮助您修改它以满足您的需要。但是，如果您遇到任何问题，请告诉我，我会尽我所能帮助您解决问题。
只是想发布一个不同的解决方案，因为这个解决方案（在我使用的数据库中）比使用聚合框架的解决方案运行得稍快一些；这是一个JavaScript解决方案

use myDB; // Function that determines if a "field" has a unique maximum value. function validate(list){ let len = list.length; let isGood = false; if(len == 0){ isGood = false; } else if (len == 1){ isGood = true; } else{ isGood = list[0] != list[1]; } return isGood; } // These function iterates over all the "values" in every "subfield" // within a "field" of a document. // They add possible maximum values to a list which is then // validated in "validate()". function verifySubfields(field){ let list = []; field.forEach(fieldElement => { // Check if subfield exists within the element and // check that is not empty if (fieldElement.subfield && fieldElement.subfield[0]){ let subfield = fieldElement.subfield; subfield.forEach(subfieldElement => { let val = subfieldElement.val; if (list.length == 0){ list.push(val); } else{ if (a >= list[0]){ list.unshift(val); } } }); } }); return validate(list); } function verifyField(doc){ return verifySubfields(doc.data.field); }; let cont = 0, iterations = 0; db.myCollection.find({ "data.field.subfield": {$exists: true} }).forEach(doc => { iterations++; if(verifyField(doc)){ cont++; } }); print(`\nTotal: ${iterations} \nPositives: ${cont} \n`);
注意：请使用 `mongo` shell 运行此脚本。

已采纳答案的评论中提到的问题，可以通过多次调用 `verifySubfields()` 并在 `verifyField()` 中校验这些结果来解决；届时可以把该函数改名为 `verifyFields()`。

非常感谢您的回复，它工作得非常好。对于这个问题，我最初想要实现的其实还要更进一步：我不想只根据一个字段及其子字段来统计记录，而是想同时使用四个字段来统计满足这些条件的记录。因此，假设还有另外 3 个字段，分别称为“field2”、“field3”和“field4”，以及它们各自的子字段/值。也许我太贪心了，您已经帮了我很多，我不想再提更多要求。不过，您有什么建议吗？

这当然是可以实现的。请查看 `$concatArrays` 运算符。使用此运算符，可以合并 field1、field2、field3 和 field4 的数组。请在第一个聚合管道阶段中使用它，结果将是一个包含所有四个字段全部数组元素的单个字段。这个新字段就是您接下来用答案中列出的其余聚合阶段来处理的字段。

再次感谢您的回复。对不起，我想我没有把意思表达清楚。我的意思是，它不仅应该统计满足“field1”中此条件的文档，还应该将相同的条件分别应用于更多字段。用编程逻辑来说，我想要的是 `if (validate(field1) && validate(field2)) { count++; }`，而您帮我解决的只是 `if (validate(field1)) { count++; }`。