Sql BigQuery:对象数组中的最低时间戳和特定时间戳之间的平均值

Sql BigQuery:对象数组中的最低时间戳和特定时间戳之间的平均值,sql,google-cloud-firestore,google-bigquery,Sql,Google Cloud Firestore,Google Bigquery,场景:用户有一个可以更新产品大小的订阅。这将产生以下数据集 With subscriptions as ( SELECT "{\"currentSize\":\"2\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543200}}], \"createdAt\": {\"_seconds\":1587543200}}" as data, "docWithSingleHi

场景:用户有一个可以更新产品大小的订阅。这将产生以下数据集

-- Sample data: three subscription documents stored as raw JSON strings,
-- each paired with a document name.
WITH subscriptions AS (
  -- One size change recorded (from "1" to "2").
  SELECT
    "{\"currentSize\":\"2\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543200}}], \"createdAt\": {\"_seconds\":1587543200}}" AS data,
    "docWithSingleHistory" AS document_name

  UNION ALL

  -- Two size changes recorded (from "1" to "2", then from "2" to "3").
  SELECT
    "{\"currentSize\":\"3\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543200}}, {\"from\":\"2\", \"to\":\"3\", \"timestamp\":{\"_seconds\":1589543200}}], \"createdAt\": {\"_seconds\":1587543200}}" AS data,
    "docWithMoreHistory" AS document_name

  UNION ALL

  -- No sizeHistory key at all; only currentSize and createdAt are present.
  SELECT
    "{\"currentSize\":\"3\", \"createdAt\": {\"_seconds\":1587543200}}" AS data,
    "docWithoutHistory" AS document_name
)
SELECT
  document_name,
  data
FROM subscriptions
这里有一个更直观的例子

订阅具有当前大小和可选大小历史记录,其中包含更新和更新时间

我的目标是:

  • 获取每个订阅的初始大小。如果没有历史记录,则初始大小就是当前大小(currentSize),对应的时间戳为createdAt。如果有历史记录,则初始大小是sizeHistory数组中时间戳最小的那个成员的“from”值
  • 获取从一种尺寸到另一种尺寸所需时间的平均值
我在第一个目标上就卡住了,而且还没有考虑没有历史记录的订阅这种情况。以下是我目前写出的查询:

-- Asker's first attempt: for every subscription, return the sizeHistory element
-- with the lowest timestamp. It does not achieve that (see the notes below);
-- the code is kept as written because it illustrates the problem being asked about.
With subscriptions as (
  SELECT "{\"currentSize\":\"2\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543201}}], \"createdAt\": {\"_seconds\":1587543200}}" as data, 
  "docWithSingleHistory" as document_name UNION ALL
   SELECT "{\"currentSize\":\"3\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543202}}, {\"from\":\"2\", \"to\":\"3\", \"timestamp\":{\"_seconds\":1589543200}}], \"createdAt\": {\"_seconds\":1587543200}}" as data, 
  "docWithMoreHistory" as document_name UNION ALL
   SELECT "{\"currentSize\":\"3\", \"createdAt\": {\"_seconds\":1587543200}}" as data, 
  "docWithoutHistory" as document_name
)
select 
  document_name,
  JSON_EXTRACT_SCALAR(sizeHistory, "$.from") as start_size,
  JSON_EXTRACT_SCALAR(sizeHistory, "$.timestamp['_seconds']") as timestamp,
from (
  -- No GROUP BY here: this yields a single row holding the minimum timestamp
  -- across the sizeHistory elements of ALL subscriptions, not one per document.
  -- The extracted values are strings, so MIN compares them as text.
  select MIN(JSON_EXTRACT(sizeHistoryDoc, "$.timestamp['_seconds']")) as minStartDate
  from subscriptions, UNNEST(JSON_EXTRACT_ARRAY(data, "$.sizeHistory")) as sizeHistoryDoc
-- The comma joins pair that single minimum with every sizeHistory element of
-- every subscription. A document without a sizeHistory array contributes no
-- elements to UNNEST, so it produces no rows at all.
), subscriptions, UNNEST(JSON_EXTRACT_ARRAY(data, "$.sizeHistory")) as sizeHistory
-- Keeps only elements whose timestamp equals the global minimum, which is why
-- only the subscription owning that timestamp is returned.
where JSON_EXTRACT_SCALAR(sizeHistory, "$.timestamp['_seconds']") = minStartDate
主要思路是:对于每个订阅,获取时间戳最小的那个sizeHistory元素。问题在于where条件作用于整个数据集(最小时间戳是在所有订阅上计算的),所以我只得到了一个订阅(即拥有全局最小时间戳的那个订阅)

下面是我理想结果结构的一个示例(不是基于上面提供的虚拟数据):


我已经被这个问题困扰好几天了,所以任何解决方案/帮助/提示都会非常有用。

请参阅下面我的查询。我使用了几个CTE把逻辑拆分成便于管理的小块;修改查询末尾的select语句,即可查看每个CTE的输出结果

-- Analyses subscription size changes stored as JSON documents.
--   results1: number of subscriptions per initial size.
--   results2: per size transition ("from to to"), the number of subscriptions
--             and the average number of days the transition took.
-- The ORDER BY clauses inside the CTEs are only there to make each step easy to
-- inspect while exploring; they do not affect the final results.
with subscriptions as (
  select "{\"currentSize\":\"2\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543200}}], \"createdAt\": {\"_seconds\":1587543200}}" as data,
  "docWithSingleHistory" as document_name union all
  select "{\"currentSize\":\"3\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543200}}, {\"from\":\"2\", \"to\":\"3\", \"timestamp\":{\"_seconds\":1589543200}}], \"createdAt\": {\"_seconds\":1587543200}}" as data,
  "docWithMoreHistory" as document_name union all
  select "{\"currentSize\":\"3\", \"createdAt\": {\"_seconds\":1587543200}}" as data,
  "docWithoutHistory" as document_name
),
-- Step 1: pull the top-level fields out of each JSON document.
extract_json1 as (
  select
    document_name,
    json_extract_scalar(data, '$.currentSize') as current_size,
    json_extract_array(data, '$.sizeHistory') as size_history,
    -- Scalar extraction so the value casts cleanly to INT64 whether the JSON
    -- holds it as a number or as a quoted string.
    json_extract_scalar(data, '$.createdAt._seconds') as created_at_seconds
  from subscriptions
  order by document_name
),
-- Step 2: one row per sizeHistory element. LEFT JOIN keeps subscriptions that
-- have no history; their size_history_* columns are NULL.
extract_json2 as (
  select
    document_name,
    current_size,
    created_at_seconds,
    json_extract_scalar(entry, '$.from') as size_history_from,
    json_extract_scalar(entry, '$.to') as size_history_to,
    json_extract_scalar(entry, '$.timestamp._seconds') as size_history_seconds
  from extract_json1
  left join unnest(size_history) as entry
  order by document_name
),
-- Step 3: convert the extracted strings to integers.
working as (
  select
    document_name,
    cast(current_size as int64) as current_size,
    cast(created_at_seconds as int64) as created_at_seconds,
    cast(size_history_from as int64) as size_history_from,
    cast(size_history_to as int64) as size_history_to,
    cast(size_history_seconds as int64) as size_history_seconds
  from extract_json2
  order by document_name, size_history_seconds
),
-- Number each subscription's history rows from the earliest timestamp onwards.
results1_temp as (
  select
    document_name,
    current_size,
    created_at_seconds,
    size_history_from,
    size_history_to,
    size_history_seconds,
    row_number() over (partition by document_name order by size_history_seconds asc) as size_history_order
  from working
  order by document_name, size_history_seconds
),
-- Initial size per subscription: the "from" of its earliest change, or the
-- current size when there is no history. Counted per initial size.
results1 as (
  select
    coalesce(size_history_from, current_size) as start_size,
    count(distinct document_name) as number_of_subscriptions
  from results1_temp
  where size_history_order = 1
  group by start_size
),
-- For each change, find when the previous size began: the preceding change's
-- timestamp, or created_at_seconds for the first change.
results2_temp as (
  select
    document_name,
    size_history_from,
    size_history_to,
    size_history_seconds,
    coalesce(
      lag(size_history_seconds, 1) over (partition by document_name order by size_history_seconds asc),
      created_at_seconds
    ) as prev_size_seconds
  from working
  order by document_name, size_history_seconds
),
-- Average duration of each size transition, in days.
results2 as (
  select
    concat(size_history_from, ' to ', size_history_to) as change,
    count(distinct document_name) as number_of_subscriptions,
    -- Seconds converted to days. A missing timestamp is left as NULL so that
    -- avg() skips the row; substituting 0 would turn it into a bogus duration
    -- measured from the epoch and distort the average.
    avg( (size_history_seconds - prev_size_seconds) / (60*60*24) ) as avg_days_to_change
  from results2_temp
  where size_history_from is not null
  group by change
)
select
  start_size,
  number_of_subscriptions
from results1
-- To inspect the second result instead, swap the final select for:
-- select change, number_of_subscriptions, avg_days_to_change from results2
*注意:在对任何实际数据运行此查询之前,请删除各个CTE内部的 order by 子句(窗口函数中的 order by 除外),因为它们会带来额外开销

-- Analyses subscription size changes stored as JSON documents.
--   results1: number of subscriptions per initial size.
--   results2: per size transition ("from to to"), the number of subscriptions
--             and the average number of days the transition took.
-- The ORDER BY clauses inside the CTEs are only there to make each step easy to
-- inspect while exploring; they do not affect the final results.
with subscriptions as (
  select "{\"currentSize\":\"2\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543200}}], \"createdAt\": {\"_seconds\":1587543200}}" as data,
  "docWithSingleHistory" as document_name union all
  select "{\"currentSize\":\"3\", \"sizeHistory\":[{\"from\":\"1\", \"to\":\"2\", \"timestamp\":{\"_seconds\":1588543200}}, {\"from\":\"2\", \"to\":\"3\", \"timestamp\":{\"_seconds\":1589543200}}], \"createdAt\": {\"_seconds\":1587543200}}" as data,
  "docWithMoreHistory" as document_name union all
  select "{\"currentSize\":\"3\", \"createdAt\": {\"_seconds\":1587543200}}" as data,
  "docWithoutHistory" as document_name
),
-- Step 1: pull the top-level fields out of each JSON document.
extract_json1 as (
  select
    document_name,
    json_extract_scalar(data, '$.currentSize') as current_size,
    json_extract_array(data, '$.sizeHistory') as size_history,
    -- Scalar extraction so the value casts cleanly to INT64 whether the JSON
    -- holds it as a number or as a quoted string.
    json_extract_scalar(data, '$.createdAt._seconds') as created_at_seconds
  from subscriptions
  order by document_name
),
-- Step 2: one row per sizeHistory element. LEFT JOIN keeps subscriptions that
-- have no history; their size_history_* columns are NULL.
extract_json2 as (
  select
    document_name,
    current_size,
    created_at_seconds,
    json_extract_scalar(entry, '$.from') as size_history_from,
    json_extract_scalar(entry, '$.to') as size_history_to,
    json_extract_scalar(entry, '$.timestamp._seconds') as size_history_seconds
  from extract_json1
  left join unnest(size_history) as entry
  order by document_name
),
-- Step 3: convert the extracted strings to integers.
working as (
  select
    document_name,
    cast(current_size as int64) as current_size,
    cast(created_at_seconds as int64) as created_at_seconds,
    cast(size_history_from as int64) as size_history_from,
    cast(size_history_to as int64) as size_history_to,
    cast(size_history_seconds as int64) as size_history_seconds
  from extract_json2
  order by document_name, size_history_seconds
),
-- Number each subscription's history rows from the earliest timestamp onwards.
results1_temp as (
  select
    document_name,
    current_size,
    created_at_seconds,
    size_history_from,
    size_history_to,
    size_history_seconds,
    row_number() over (partition by document_name order by size_history_seconds asc) as size_history_order
  from working
  order by document_name, size_history_seconds
),
-- Initial size per subscription: the "from" of its earliest change, or the
-- current size when there is no history. Counted per initial size.
results1 as (
  select
    coalesce(size_history_from, current_size) as start_size,
    count(distinct document_name) as number_of_subscriptions
  from results1_temp
  where size_history_order = 1
  group by start_size
),
-- For each change, find when the previous size began: the preceding change's
-- timestamp, or created_at_seconds for the first change.
results2_temp as (
  select
    document_name,
    size_history_from,
    size_history_to,
    size_history_seconds,
    coalesce(
      lag(size_history_seconds, 1) over (partition by document_name order by size_history_seconds asc),
      created_at_seconds
    ) as prev_size_seconds
  from working
  order by document_name, size_history_seconds
),
-- Average duration of each size transition, in days.
results2 as (
  select
    concat(size_history_from, ' to ', size_history_to) as change,
    count(distinct document_name) as number_of_subscriptions,
    -- Seconds converted to days. A missing timestamp is left as NULL so that
    -- avg() skips the row; substituting 0 would turn it into a bogus duration
    -- measured from the epoch and distort the average.
    avg( (size_history_seconds - prev_size_seconds) / (60*60*24) ) as avg_days_to_change
  from results2_temp
  where size_history_from is not null
  group by change
)
select
  start_size,
  number_of_subscriptions
from results1
-- To inspect the second result instead, swap the final select for:
-- select change, number_of_subscriptions, avg_days_to_change from results2