Sql BigQuery:对象数组中的最低时间戳和特定时间戳之间的平均值
标签:sql,google-cloud-firestore,google-bigquery
场景:用户有一个可以更新产品大小的订阅。这将产生以下数据集:
-- Sample dataset: one row per subscription document.
--   data          : the document serialized as a JSON string
--   document_name : identifier of the document
-- The three rows cover: one size change, two size changes, and no
-- sizeHistory key at all.
WITH subscriptions AS (
    SELECT
        "{\"currentSize\":\"2\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543200}}], \"createdAt\": {\"_seconds\":1587543200}}" AS data,
        "docWithSingleHistory" AS document_name
    UNION ALL
    SELECT
        "{\"currentSize\":\"3\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543200}}, {\"from\":\"2\", \"to\":\"3\", \"timestamp\":{\"_seconds\":1589543200}}], \"createdAt\": {\"_seconds\":1587543200}}" AS data,
        "docWithMoreHistory" AS document_name
    UNION ALL
    SELECT
        "{\"currentSize\":\"3\", \"createdAt\": {\"_seconds\":1587543200}}" AS data,
        "docWithoutHistory" AS document_name
)

SELECT
    s.document_name,
    s.data
FROM subscriptions AS s
这里有一个更直观的例子
订阅具有当前大小和可选大小历史记录,其中包含更新和更新时间
我的目标是:
- 获取每个订阅的初始大小。如果没有历史记录,则取当前大小(currentSize),起始时间为createdAt时间戳;如果有历史记录,则取sizeHistory数组中时间戳最小的成员的“from”值
- 获取从一种尺寸到另一种尺寸所需时间的平均值
-- First attempt at "initial size per subscription".
-- Known limitation (kept on purpose): the minimum timestamp is computed
-- over ALL subscriptions, so only history entries equal to that single
-- global minimum survive the final filter.
WITH subscriptions AS (
    SELECT
        "{\"currentSize\":\"2\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543201}}], \"createdAt\": {\"_seconds\":1587543200}}" AS data,
        "docWithSingleHistory" AS document_name
    UNION ALL
    SELECT
        "{\"currentSize\":\"3\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543202}}, {\"from\":\"2\", \"to\":\"3\", \"timestamp\":{\"_seconds\":1589543200}}], \"createdAt\": {\"_seconds\":1587543200}}" AS data,
        "docWithMoreHistory" AS document_name
    UNION ALL
    SELECT
        "{\"currentSize\":\"3\", \"createdAt\": {\"_seconds\":1587543200}}" AS data,
        "docWithoutHistory" AS document_name
),

-- Single-row CTE holding the smallest history timestamp in the whole dataset.
-- The value is compared as text, exactly as extracted from the JSON.
earliest_change AS (
    SELECT
        MIN(JSON_EXTRACT(size_change, "$.timestamp['_seconds']")) AS minStartDate
    FROM subscriptions
    CROSS JOIN UNNEST(JSON_EXTRACT_ARRAY(data, "$.sizeHistory")) AS size_change
)

SELECT
    document_name,
    JSON_EXTRACT_SCALAR(sizeHistory, "$.from") AS start_size,
    JSON_EXTRACT_SCALAR(sizeHistory, "$.timestamp['_seconds']") AS timestamp
FROM earliest_change
CROSS JOIN subscriptions
-- One row per sizeHistory element; documents without the array yield no rows.
CROSS JOIN UNNEST(JSON_EXTRACT_ARRAY(data, "$.sizeHistory")) AS sizeHistory
WHERE JSON_EXTRACT_SCALAR(sizeHistory, "$.timestamp['_seconds']") = minStartDate
主要思路是:对于每个订阅,获取时间戳最小的那个sizeHistory元素。问题在于where条件作用于整个数据集,所以我只能得到一个订阅(即拥有全局最小时间戳的那个订阅)
下面是我理想结果结构的一个示例(不是基于上面提供的虚拟数据):
我已经被这个问题困扰好几天了,所以任何解决方案/帮助/提示都会非常有用。请参阅下面我的查询。我使用了几个CTE将逻辑分解为易于管理的块;修改查询末尾的select语句即可查看每个CTE的输出
-- Walk-through version of the report: every CTE is one small step, and the
-- ORDER BY clauses inside the CTEs exist only to make each step easier to
-- read when it is selected on its own.
WITH subscriptions AS (
    SELECT
        "{\"currentSize\":\"2\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543200}}], \"createdAt\": {\"_seconds\":1587543200}}" AS data,
        "docWithSingleHistory" AS document_name
    UNION ALL
    SELECT
        "{\"currentSize\":\"3\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543200}}, {\"from\":\"2\", \"to\":\"3\", \"timestamp\":{\"_seconds\":1589543200}}], \"createdAt\": {\"_seconds\":1587543200}}" AS data,
        "docWithMoreHistory" AS document_name
    UNION ALL
    SELECT
        "{\"currentSize\":\"3\", \"createdAt\": {\"_seconds\":1587543200}}" AS data,
        "docWithoutHistory" AS document_name
),

-- Step 1: pull the top-level fields out of the JSON document.
extract_json1 AS (
    SELECT
        document_name,
        JSON_EXTRACT_SCALAR(data, '$.currentSize') AS current_size,
        JSON_EXTRACT_ARRAY(data, '$.sizeHistory') AS size_history,
        JSON_EXTRACT(data, '$.createdAt._seconds') AS created_at_seconds
    FROM subscriptions
    ORDER BY document_name
),

-- Step 2: one row per history entry. The LEFT JOIN keeps documents that have
-- no sizeHistory array, with NULL history columns.
extract_json2 AS (
    SELECT
        document_name,
        current_size,
        created_at_seconds,
        JSON_EXTRACT_SCALAR(s, '$.from') AS size_history_from,
        JSON_EXTRACT_SCALAR(s, '$.to') AS size_history_to,
        JSON_EXTRACT_SCALAR(s, '$.timestamp._seconds') AS size_history_seconds
    FROM extract_json1
    LEFT JOIN UNNEST(size_history) AS s
    ORDER BY document_name
),

-- Step 3: convert the extracted strings to integers.
working AS (
    SELECT
        document_name,
        CAST(current_size AS INT64) AS current_size,
        CAST(created_at_seconds AS INT64) AS created_at_seconds,
        CAST(size_history_from AS INT64) AS size_history_from,
        CAST(size_history_to AS INT64) AS size_history_to,
        CAST(size_history_seconds AS INT64) AS size_history_seconds
    FROM extract_json2
    ORDER BY document_name, size_history_seconds
),

-- Step 4a: number each document's history entries from oldest to newest.
results1_temp AS (
    SELECT
        document_name,
        current_size,
        created_at_seconds,
        size_history_from,
        size_history_to,
        size_history_seconds,
        ROW_NUMBER() OVER (
            PARTITION BY document_name
            ORDER BY size_history_seconds ASC
        ) AS size_history_order
    FROM working
    ORDER BY document_name, size_history_seconds
),

-- Step 4b: starting size per subscription = "from" of the oldest entry, or
-- the current size when there is no history; then count subscriptions per size.
results1 AS (
    SELECT
        COALESCE(size_history_from, current_size) AS start_size,
        COUNT(DISTINCT document_name) AS number_of_subscriptions
    FROM results1_temp
    WHERE size_history_order = 1
    GROUP BY start_size
),

-- Step 5a: pair each change with the time of the previous change; the first
-- change of a document falls back to the document's creation time.
results2_temp AS (
    SELECT
        document_name,
        size_history_from,
        size_history_to,
        size_history_seconds,
        COALESCE(
            LAG(size_history_seconds, 1) OVER (
                PARTITION BY document_name
                ORDER BY size_history_seconds ASC
            ),
            created_at_seconds
        ) AS prev_size_seconds
    FROM working
    ORDER BY document_name, size_history_seconds
),

-- Step 5b: average elapsed days per "from -> to" transition.
results2 AS (
    SELECT
        CONCAT(size_history_from, ' to ', size_history_to) AS change,
        COUNT(DISTINCT document_name) AS number_of_subscriptions,
        AVG(
            (COALESCE(size_history_seconds, 0) - COALESCE(prev_size_seconds, 0))
            / (60 * 60 * 24)  -- seconds per day
        ) AS avg_days_to_change
    FROM results2_temp
    WHERE size_history_from IS NOT NULL
    GROUP BY change
)

SELECT
    start_size,
    number_of_subscriptions
FROM results1
-- select * from results2
*注意:在对任何实际数据运行此查询之前,请删除各CTE内部的order by语句(窗口函数中的order by除外),因为它们会增加额外开销
-- Subscription size report, cleaned up for running against real data.
--
-- Produces (via the final SELECT) the number of subscriptions per starting
-- size; swap the final SELECT to `results2` for the average number of days
-- each "from -> to" size transition took.
--
-- Changes compared with the walk-through version of this query:
--   * no ORDER BY inside the CTEs: they only add overhead, and nothing
--     downstream relies on them (the window functions carry their own
--     ORDER BY);
--   * createdAt._seconds is read with JSON_EXTRACT_SCALAR, so the CAST to
--     INT64 also works if the value is ever stored as a quoted string;
--   * a missing timestamp is no longer replaced by 0 when computing a
--     duration: the difference stays NULL and AVG skips it instead of
--     averaging in a bogus epoch-sized value;
--   * columns are listed explicitly and GROUP BY uses names, not positions.
WITH subscriptions AS (
    SELECT
        "{\"currentSize\":\"2\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543200}}], \"createdAt\": {\"_seconds\":1587543200}}" AS data,
        "docWithSingleHistory" AS document_name
    UNION ALL
    SELECT
        "{\"currentSize\":\"3\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543200}}, {\"from\":\"2\", \"to\":\"3\", \"timestamp\":{\"_seconds\":1589543200}}], \"createdAt\": {\"_seconds\":1587543200}}" AS data,
        "docWithMoreHistory" AS document_name
    UNION ALL
    SELECT
        "{\"currentSize\":\"3\", \"createdAt\": {\"_seconds\":1587543200}}" AS data,
        "docWithoutHistory" AS document_name
),

-- Top-level fields of each JSON document.
extract_json1 AS (
    SELECT
        document_name,
        JSON_EXTRACT_SCALAR(data, '$.currentSize') AS current_size,
        JSON_EXTRACT_ARRAY(data, '$.sizeHistory') AS size_history,
        JSON_EXTRACT_SCALAR(data, '$.createdAt._seconds') AS created_at_seconds
    FROM subscriptions
),

-- One row per size-history entry. LEFT JOIN keeps documents without any
-- history; their size_history_* columns are NULL.
extract_json2 AS (
    SELECT
        document_name,
        current_size,
        created_at_seconds,
        JSON_EXTRACT_SCALAR(s, '$.from') AS size_history_from,
        JSON_EXTRACT_SCALAR(s, '$.to') AS size_history_to,
        JSON_EXTRACT_SCALAR(s, '$.timestamp._seconds') AS size_history_seconds
    FROM extract_json1
    LEFT JOIN UNNEST(size_history) AS s
),

-- Same rows with every value converted from STRING to INT64.
working AS (
    SELECT
        document_name,
        CAST(current_size AS INT64) AS current_size,
        CAST(created_at_seconds AS INT64) AS created_at_seconds,
        CAST(size_history_from AS INT64) AS size_history_from,
        CAST(size_history_to AS INT64) AS size_history_to,
        CAST(size_history_seconds AS INT64) AS size_history_seconds
    FROM extract_json2
),

-- History entries numbered per document, oldest first. A document without
-- history has a single all-NULL entry, which gets number 1.
results1_temp AS (
    SELECT
        document_name,
        current_size,
        created_at_seconds,
        size_history_from,
        size_history_to,
        size_history_seconds,
        ROW_NUMBER() OVER (
            PARTITION BY document_name
            ORDER BY size_history_seconds ASC
        ) AS size_history_order
    FROM working
),

-- Result 1: subscriptions per starting size. The starting size is the "from"
-- of the oldest history entry, or the current size when there is no history.
results1 AS (
    SELECT
        COALESCE(size_history_from, current_size) AS start_size,
        COUNT(DISTINCT document_name) AS number_of_subscriptions
    FROM results1_temp
    WHERE size_history_order = 1
    GROUP BY start_size
),

-- Each size change paired with the time of the previous change; the first
-- change of a document is measured from the document's creation time.
results2_temp AS (
    SELECT
        document_name,
        size_history_from,
        size_history_to,
        size_history_seconds,
        COALESCE(
            LAG(size_history_seconds, 1) OVER (
                PARTITION BY document_name
                ORDER BY size_history_seconds ASC
            ),
            created_at_seconds
        ) AS prev_size_seconds
    FROM working
),

-- Result 2: average days spent before each "from -> to" transition.
results2 AS (
    SELECT
        CONCAT(size_history_from, ' to ', size_history_to) AS change,
        COUNT(DISTINCT document_name) AS number_of_subscriptions,
        -- NULL on either side yields NULL, which AVG ignores.
        AVG(
            (size_history_seconds - prev_size_seconds)
            / (60 * 60 * 24)  -- seconds per day
        ) AS avg_days_to_change
    FROM results2_temp
    WHERE size_history_from IS NOT NULL
    GROUP BY change
)

SELECT
    start_size,
    number_of_subscriptions
FROM results1
-- To get the transition averages instead, select from results2:
-- select change, number_of_subscriptions, avg_days_to_change from results2