Mongodb 考虑到给定字段中的每个子字段,如何计算具有最大唯一值的文档数? 问题
鉴于这种结构:Mongodb 考虑到给定字段中的每个子字段,如何计算具有最大唯一值的文档数? 问题,mongodb,nosql-aggregation,Mongodb,Nosql Aggregation,鉴于这种结构: { "_id": ObjectId("56n606c39q49b80adfe6d17b") "data": { "field1": [ { "subfield1.1": [ { "val1.1.1&
{
"_id": ObjectId("56n606c39q49b80adfe6d17b")
"data": {
"field1": [
{
"subfield1.1": [
{
"val1.1.1": a
},
{
"val1.1.2": b
}
]
},
{
"subfield1.2": [
{
"val1.2.1": c
},
{
"val1.2.2": d
}
]
}
]
}
}
考虑到给定字段中每个“子字段”里的所有“val”,如何编写一个查询,统计其最大“val”值唯一(即最大值只出现一次)的文档数?
需要考虑的事实:
- “val”是数组中的元素
- “子字段”也是数组中的元素
- 所有文档的“字段”、“子字段”和“val”字段名称都相同
- 可能有1个或多个“val”
use myDB;
/**
 * Decides whether a candidate list represents a unique maximum.
 *
 * Only the first two entries are inspected: an empty list fails, a
 * single-entry list passes, and otherwise the list passes when its first
 * two entries differ.
 *
 * @param {Array} list - Candidate values, expected newest/largest first.
 * @returns {boolean} true when the head of the list is not repeated.
 */
function validate(list){
  const size = list.length;
  if (size === 0) {
    return false;
  }
  if (size === 1) {
    return true;
  }
  // Loose inequality on purpose: values that coerce to the same thing
  // (e.g. 1 and "1") are treated as duplicates.
  return list[0] != list[1];
}
/**
 * Scans every "val" in every "subfield" of a field and reports, via
 * validate(), whether the largest value occurs exactly once.
 *
 * Candidates are kept largest-first: a value is put at the front of the list
 * whenever it is greater than or equal to the current head. After the scan
 * list[0] is the maximum, and list[1] equals it only if the maximum was seen
 * more than once.
 *
 * @param {Array<Object>} field - Elements of the field array; each may carry
 *   a `subfield` array whose elements carry a `val`.
 * @returns {boolean} Result of validate() on the collected candidates
 *   (validate() receives an empty list when nothing usable was found).
 */
function verifySubfields(field){
  const list = [];
  if (!Array.isArray(field)) {
    return validate(list);
  }
  field.forEach(fieldElement => {
    // Skip elements that have no subfield array at all.
    const subfield = fieldElement ? fieldElement.subfield : undefined;
    if (!Array.isArray(subfield)) {
      return;
    }
    subfield.forEach(subfieldElement => {
      const val = subfieldElement ? subfieldElement.val : undefined;
      // A missing value cannot be compared; letting it become the head would
      // make every later `>=` test false and hide the real maximum.
      if (val == null) {
        return;
      }
      // Compare the current value (not an undefined identifier) to the head.
      if (list.length === 0 || val >= list[0]) {
        list.unshift(val);
      }
    });
  });
  return validate(list);
}
/**
 * Runs the unique-maximum check on a document's `data.field` array.
 *
 * @param {Object} doc - Document whose `data.field` is handed to verifySubfields().
 * @returns {*} Whatever verifySubfields() returns for that field.
 */
function verifyField(doc){
  const { field } = doc.data;
  return verifySubfields(field);
}
// Visit every document that has a "data.field.subfield" path, tally how many
// were visited and how many pass verifyField(), then print both totals.
let iterations = 0;
let cont = 0;
db.myCollection.find({ "data.field.subfield": {$exists: true} }).forEach(doc => {
  iterations += 1;
  cont += verifyField(doc) ? 1 : 0;
});
print(`\nTotal: ${iterations} \nPositives: ${cont} \n`);
案例
假设a是最大值。。。
本文件应计入:
{
"_id": ObjectId("56n606c39q49b80adfe6d17b")
"data": {
"field1": [
{
"subfield1.1": [
{
"val1.1.1": a
},
{
"val1.1.2": b
}
]
},
{
"subfield1.2": [
{
"val1.2.1": c
},
{
"val1.2.2": d
}
]
}
]
}
}
{
"_id": ObjectId("56n606c39q49b80adfe6d17b")
"data": {
"field1": [
{
"subfield1.1": [
{
"val1.1.1": a
},
{
"val1.1.2": a
}
]
},
{
"subfield1.2": [
{
"val1.2.1": b
},
{
"val1.2.2": c
}
]
}
]
}
}
本文件不应计算在内:
{
"_id": ObjectId("56n606c39q49b80adfe6d17b")
"data": {
"field1": [
{
"subfield1.1": [
{
"val1.1.1": a
},
{
"val1.1.2": b
}
]
},
{
"subfield1.2": [
{
"val1.2.1": c
},
{
"val1.2.2": d
}
]
}
]
}
}
{
"_id": ObjectId("56n606c39q49b80adfe6d17b")
"data": {
"field1": [
{
"subfield1.1": [
{
"val1.1.1": a
},
{
"val1.1.2": a
}
]
},
{
"subfield1.2": [
{
"val1.2.1": b
},
{
"val1.2.2": c
}
]
}
]
}
}
本文件也不应计算在内:
{
"_id": ObjectId("56n606c39q49b80adfe6d17b")
"data": {
"field1": [
{
"subfield1.1": [
{
"val1.1.1": a
},
{
"val1.1.2": b
}
]
},
{
"subfield1.2": [
{
"val1.2.1": a
},
{
"val1.2.2": c
}
]
}
]
}
}
本文件应计算在内(即使重复b):
任何想法都会受到欢迎。谢谢大家! 这里绝对需要聚合。它可能需要一些修改,但像这样的东西可能(希望)适合您:
// Aggregation that keeps, per source document, the largest value found in the
// nested arrays together with how many times that value occurs, and then
// passes through only the documents whose largest value occurs once.
// NOTE(review): the field paths used below ("$data.field1.subfield1",
// "$vals.val1") are placeholders taken from the example; confirm them against
// the real document schema before running.
// NOTE(review): the last stage is a $match, so the output is one result
// document per qualifying source document rather than a single number; add a
// counting stage afterwards if only the total is needed.
db.collection.aggregate([
/*
Step 1: Flatten the nested arrays first; otherwise the values cannot be compared across the whole document to find a unique maximum.
*/
// Unwind the outer array first.
{$unwind: "$data.field1"},
// Project the inner array to a flat field name so later stages do not need the nested path.
{$project: {
vals: "$data.field1.subfield1"
}},
// Now unwind the inner array.
{$unwind: "$vals"},
// Project the value itself to a flat field name, for the same reason as the previous projection.
{$project: {
val: "$vals.val1"
}},
/*
Step 2: Count how many times each value occurs within the same source document.
*/
// Group by (source document ID, value) and add 1 per entry to get the occurrence count.
{$group: {
_id: {
_id: "$_id",
val: "$val"
},
count: {$sum: 1}
}},
/*
Step 3: With the counts in place, retrieve the maximum value for each source document.
*/
// Sort by value in descending order so that the maximum is the first entry encountered.
{$sort: {
"_id.val": -1
}},
// With the entries in descending order, take the first entry per source document: its value and its count.
{$group: {
_id: "$_id._id",
max: {
$first: {
val: "$_id.val",
count: "$count"
}
}
}},
// Unnest the result so that val and count are top-level fields.
{$project: {
val: "$max.val",
count: "$max.count"
}},
/*
Step 4: Keep only the documents we are interested in.
*/
// A count of 1 means the maximum value occurs exactly once, i.e. it is unique.
{$match: {
count: 1
}}
])
不可否认,这是一个复杂的查询,在不了解实际文档结构的情况下很难保证功能。也就是说,评论中应该有足够的信息来帮助您修改它以满足您的需要。但是,如果您遇到任何问题,请告诉我,我会尽我所能帮助您解决问题。只是想发布一个不同的解决方案,因为这个解决方案(在我使用的数据库中)比使用聚合框架的解决方案运行得稍快一些;这是一个JavaScript解决方案
use myDB;
/**
 * Decides whether a candidate list represents a unique maximum.
 *
 * Only the first two entries are inspected: an empty list fails, a
 * single-entry list passes, and otherwise the list passes when its first
 * two entries differ.
 *
 * @param {Array} list - Candidate values, expected newest/largest first.
 * @returns {boolean} true when the head of the list is not repeated.
 */
function validate(list){
  const size = list.length;
  if (size === 0) {
    return false;
  }
  if (size === 1) {
    return true;
  }
  // Loose inequality on purpose: values that coerce to the same thing
  // (e.g. 1 and "1") are treated as duplicates.
  return list[0] != list[1];
}
/**
 * Scans every "val" in every "subfield" of a field and reports, via
 * validate(), whether the largest value occurs exactly once.
 *
 * Candidates are kept largest-first: a value is put at the front of the list
 * whenever it is greater than or equal to the current head. After the scan
 * list[0] is the maximum, and list[1] equals it only if the maximum was seen
 * more than once.
 *
 * @param {Array<Object>} field - Elements of the field array; each may carry
 *   a `subfield` array whose elements carry a `val`.
 * @returns {boolean} Result of validate() on the collected candidates
 *   (validate() receives an empty list when nothing usable was found).
 */
function verifySubfields(field){
  const list = [];
  if (!Array.isArray(field)) {
    return validate(list);
  }
  field.forEach(fieldElement => {
    // Skip elements that have no subfield array at all.
    const subfield = fieldElement ? fieldElement.subfield : undefined;
    if (!Array.isArray(subfield)) {
      return;
    }
    subfield.forEach(subfieldElement => {
      const val = subfieldElement ? subfieldElement.val : undefined;
      // A missing value cannot be compared; letting it become the head would
      // make every later `>=` test false and hide the real maximum.
      if (val == null) {
        return;
      }
      // Compare the current value (not an undefined identifier) to the head.
      if (list.length === 0 || val >= list[0]) {
        list.unshift(val);
      }
    });
  });
  return validate(list);
}
/**
 * Runs the unique-maximum check on a document's `data.field` array.
 *
 * @param {Object} doc - Document whose `data.field` is handed to verifySubfields().
 * @returns {*} Whatever verifySubfields() returns for that field.
 */
function verifyField(doc){
  const { field } = doc.data;
  return verifySubfields(field);
}
// Visit every document that has a "data.field.subfield" path, tally how many
// were visited and how many pass verifyField(), then print both totals.
let iterations = 0;
let cont = 0;
db.myCollection.find({ "data.field.subfield": {$exists: true} }).forEach(doc => {
  iterations += 1;
  cont += verifyField(doc) ? 1 : 0;
});
print(`\nTotal: ${iterations} \nPositives: ${cont} \n`);
注意:使用mongo
运行
选中解决方案的评论中提到的问题,可以通过多次调用“verifySubfields()”(每个字段调用一次)并在“verifyField()”中合并验证这些结果来解决;届时可以把该函数改名为“verifyFields()”。 非常感谢您的回复,它工作得非常好。对于这个问题,我一开始想要实现的其实还要更进一步:我不想只根据一个字段及其子字段来统计记录,而是想同时根据四个字段来统计满足这些条件的记录。因此,假设还有另外3个字段,分别称为“field2”、“field3”和“field4”,以及它们各自的子字段/值。也许我太贪心了,我不想要求更多,因为你已经帮了我很多。不过,你有什么建议吗? 这当然是可以实现的。请查看
$concatArrays
操作符。使用此运算符,可以合并field1
、field2
、field3
和field4
的数组。将此作为第一个聚合管道阶段。结果将是一个包含所有四个字段的所有数组元素的单个字段。这个新字段就是您将使用答案中列出的其他聚合阶段来处理的字段。 再次感谢您的回复。对不起,我想我没有把自己的意思解释清楚。我的意思是,它不仅应该统计“field1”满足此条件的文档,还应该将相同的条件分别应用于更多字段。用编程逻辑来说,我想要的是“if(validate(field1) && validate(field2)){count++;}”,而您帮我解决的只是“if(validate(field1)){count++;}”。