elasticsearch ElasticSearch-返回唯一值
如何从记录中获取所有elasticsearch ElasticSearch-返回唯一值,elasticsearch,elasticsearch,如何从记录中获取所有语言的值并使其唯一 记录 PUT items/1 { "language" : 10 } PUT items/2 { "language" : 11 } PUT items/3 { "language" : 10 } 查询 GET items/_search { ... } # => Expected Response [10, 11] 任何帮助都会很好 您可以使用 聚合中的size参数指定要包含在聚合结果中的最大术语数。如果需要所有结果,请将该值设置为大于数
语言的值并使其唯一
记录
PUT items/1
{ "language" : 10 }
PUT items/2
{ "language" : 11 }
PUT items/3
{ "language" : 10 }
查询
GET items/_search
{ ... }
# => Expected Response
[10, 11]
任何帮助都会很好 您可以使用
聚合中的size
参数指定要包含在聚合结果中的最大术语数。如果需要所有结果,请将该值设置为大于数据中唯一项数的值
搜索将返回以下内容:
{
"took" : 16,
"timed_out" : false,
"_shards" : {
"total" : 2,
"successful" : 2,
"failed" : 0
},
"hits" : {
"total" : 1000000,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"langs" : {
"buckets" : [ {
"key" : "10",
"doc_count" : 244812
}, {
"key" : "11",
"doc_count" : 136794
}, {
"key" : "12",
"doc_count" : 32312
} ]
}
}
}
Elasticsearch 1.1+具有,这将为您提供唯一的计数
请注意,它实际上是一个近似值,使用高基数数据集时,精度可能会降低,但在我的测试中,它通常相当准确
您还可以使用precision\u threshold
参数调整精度。取舍或者说当然是内存使用
该文档中的图表显示了更高的精度\u阈值如何导致更精确的结果
我也在为自己寻找这种解决方案。我在一本书里找到了参考书
因此,根据这一点,以下是正确的解决方案
{
"aggs" : {
"langs" : {
"terms" : { "field" : "language",
"size" : 500 }
}
}}
但如果您遇到以下错误:
"error": {
"root_cause": [
{
"type": "illegal_argument_exception",
"reason": "Fielddata is disabled on text fields by default. Set fielddata=true on [fastest_method] in order to load fielddata in memory by uninverting the inverted index. Note that this can however use significant memory. Alternatively use a keyword field instead."
}
]}
在这种情况下,您必须在请求中添加“关键字”,如下所示:
{
"aggs" : {
"langs" : {
"terms" : { "field" : "language.keyword",
"size" : 500 }
}
}}
如果要获取每个语言字段的第一个文档唯一值,可以执行以下操作:
{
"query": {
"match_all": {
}
},
"collapse": {
"field": "language.keyword",
"inner_hits": {
"name": "latest",
"size": 1
}
}
}
如果您希望在不使用任何近似值或设置幻数(大小:500
)的情况下获得所有唯一值,请使用(ES 6.5+)
发件人:
“如果要检索嵌套术语聚合中的所有术语或所有术语组合,应使用复合聚合,该聚合允许对所有可能的术语进行分页,而不是设置大于术语聚合中字段基数的大小。术语聚合旨在返回顶部术语,不允许分页。”
JavaScript中的实现示例:
const ITEMS\u/页=1000;
常数体={
“大小”:0,//仅返回聚合结果:https://www.elastic.co/guide/en/elasticsearch/reference/current/returning-only-agg-results.html
“aggs”:{
“langs”:{
“复合”:{
“尺寸”:每页的项目,
“资料来源”:[
{“语言”:{“术语”:{“字段”:“语言”}}
]
}
}
}
};
const uniqueLanguages=[];
while(true){
常量结果=等待es.搜索(正文);
const currentUniqueLangs=result.aggregations.langs.bucket.map(bucket=>bucket.key);
uniqueLanguages.push(…当前uniquelangs);
const after=result.aggregations.langs.after_键;
如果(之后){
//继续对唯一项进行分页
body.aggs.langs.composite.after=之后;
}否则{
打破
}
}
console.log(uniqueLanguages);
必须通过两个字段(派生id和车辆类型)进行区分,并按最便宜的汽车进行排序。必须嵌套AGG
GET /cars/_search
{
"size": 0,
"aggs": {
"distinct_by_derivative_id": {
"terms": {
"field": "derivative_id"
},
"aggs": {
"vehicle_type": {
"terms": {
"field": "vehicle_type"
},
"aggs": {
"cheapest_vehicle": {
"top_hits": {
"sort": [
{ "rental": { "order": "asc" } }
],
"_source": { "includes": [ "manufacturer_name",
"rental",
"vehicle_type"
]
},
"size": 1
}
}
}
}
}
}
}
}
结果:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 8,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"distinct_by_derivative_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "04",
"doc_count" : 3,
"vehicle_type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "CAR",
"doc_count" : 2,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "8",
"_score" : null,
"_source" : {
"vehicle_type" : "CAR",
"manufacturer_name" : "Renault",
"rental" : 89.99
},
"sort" : [
89.99
]
}
]
}
}
},
{
"key" : "LCV",
"doc_count" : 1,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "7",
"_score" : null,
"_source" : {
"vehicle_type" : "LCV",
"manufacturer_name" : "Ford",
"rental" : 99.99
},
"sort" : [
99.99
]
}
]
}
}
}
]
}
},
{
"key" : "01",
"doc_count" : 2,
"vehicle_type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "CAR",
"doc_count" : 1,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "1",
"_score" : null,
"_source" : {
"vehicle_type" : "CAR",
"manufacturer_name" : "Ford",
"rental" : 599.99
},
"sort" : [
599.99
]
}
]
}
}
},
{
"key" : "LCV",
"doc_count" : 1,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "2",
"_score" : null,
"_source" : {
"vehicle_type" : "LCV",
"manufacturer_name" : "Ford",
"rental" : 599.99
},
"sort" : [
599.99
]
}
]
}
}
}
]
}
},
{
"key" : "02",
"doc_count" : 2,
"vehicle_type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "CAR",
"doc_count" : 2,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "4",
"_score" : null,
"_source" : {
"vehicle_type" : "CAR",
"manufacturer_name" : "Audi",
"rental" : 499.99
},
"sort" : [
499.99
]
}
]
}
}
}
]
}
},
{
"key" : "03",
"doc_count" : 1,
"vehicle_type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "CAR",
"doc_count" : 1,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "5",
"_score" : null,
"_source" : {
"vehicle_type" : "CAR",
"manufacturer_name" : "Audi",
"rental" : 399.99
},
"sort" : [
399.99
]
}
]
}
}
}
]
}
}
]
}
}
}
字段:[语言]
将只给出给定字段的值,但在代码中使其唯一可能更容易。虽然可能有一个方便的聚合可以为您做到这一点。对于研究此主题的人,这里还有有用的讨论:“字段”:[“语言”]
返回相同的结果。您能否展开您的答案,看看聚合框架是否可以只返回语言值?
#=>[10,11,10]
@CharlesJHardy,它没有相同的结果。您正在查找的数据位于“聚合”键下。我用一个示例结果编辑了我的答案。您还可以/应该设置”大小“:0,为了不包含任何文档,只包含所需的聚合结果。请注意,如果您有许多可能的语言
值,则可能需要添加大小=0
和碎片大小=0
,以确保获得所有值。”。看,我认为这个答案没有解决OP。最初的问题需要不同的值,而不是计数。我遗漏了什么吗?@BHBH,答案确实提供了不同的值。它们是“关键”值,即“10”、“11”和“12”。(聚合>语言>桶>键…)是否保证如果一个术语存在,那么它将出现在结果中(计数>=1)?或者它可能会错过一些在大型数据集中只出现一次的术语吗?@标记它取决于您设置的精度阈值。门槛越高,错过的机会就越小。请注意,精度阈值设置的限制为40000。这意味着,如果数据集高于此值,将有一个估计值,因此可能会忽略单个值,我认为这个答案是错误的。基数聚合是一个很好的工具。但是,任务是检索术语本身,而不是估计有多少不同的术语。“名称”:“最新”,是ES语法中的“名称”还是自定义用户字段?这应该是可接受的答案。它解决了size
参数的问题
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 8,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"distinct_by_derivative_id" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "04",
"doc_count" : 3,
"vehicle_type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "CAR",
"doc_count" : 2,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "8",
"_score" : null,
"_source" : {
"vehicle_type" : "CAR",
"manufacturer_name" : "Renault",
"rental" : 89.99
},
"sort" : [
89.99
]
}
]
}
}
},
{
"key" : "LCV",
"doc_count" : 1,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "7",
"_score" : null,
"_source" : {
"vehicle_type" : "LCV",
"manufacturer_name" : "Ford",
"rental" : 99.99
},
"sort" : [
99.99
]
}
]
}
}
}
]
}
},
{
"key" : "01",
"doc_count" : 2,
"vehicle_type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "CAR",
"doc_count" : 1,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "1",
"_score" : null,
"_source" : {
"vehicle_type" : "CAR",
"manufacturer_name" : "Ford",
"rental" : 599.99
},
"sort" : [
599.99
]
}
]
}
}
},
{
"key" : "LCV",
"doc_count" : 1,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "2",
"_score" : null,
"_source" : {
"vehicle_type" : "LCV",
"manufacturer_name" : "Ford",
"rental" : 599.99
},
"sort" : [
599.99
]
}
]
}
}
}
]
}
},
{
"key" : "02",
"doc_count" : 2,
"vehicle_type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "CAR",
"doc_count" : 2,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "4",
"_score" : null,
"_source" : {
"vehicle_type" : "CAR",
"manufacturer_name" : "Audi",
"rental" : 499.99
},
"sort" : [
499.99
]
}
]
}
}
}
]
}
},
{
"key" : "03",
"doc_count" : 1,
"vehicle_type" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "CAR",
"doc_count" : 1,
"cheapest_vehicle" : {
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : null,
"hits" : [
{
"_index" : "cars",
"_type" : "_doc",
"_id" : "5",
"_score" : null,
"_source" : {
"vehicle_type" : "CAR",
"manufacturer_name" : "Audi",
"rental" : 399.99
},
"sort" : [
399.99
]
}
]
}
}
}
]
}
}
]
}
}
}