SQL:如何获取 BigQuery 中相对于前一行发生变化的列的列表?
标签:sql, database, google-bigquery, etl, analytics
为了更好地理解,请参考下面的例子和附图:
假设在 ID 相同的情况下,BigQuery 表中的任何列从第 1 天到第 2 天都不应该发生变化,那么如何获得不符合这一预期的列的列表?换句话说,我想列出 ID 相同时从第 1 天到第 2 天发生变化的所有列。在示例的最后一列中,我给出了期望的输出:
| ID | Day | Column1 | Column2 | Column3 | Column4 | Column5 | Column6 | Column7 | Desired outputs
| 1 | 1 | x | x | x | x | x | x | x | Column3
| 1 | 2 | x | x | y | x | x | x | x | Column3
| 2 | 1 | x | x | x | x | x | x | x | Column2
| 2 | 2 | x | y | x | x | x | x | x | Column2
| 3 | 1 | x | x | x | x | x | x | x | Column4,Column6
| 3 | 2 | x | x | x | y | x | y | x | Column4,Column6
| 4 | 1 | x | x | x | x | x | x | y | Column7
| 4 | 2 | x | x | x | x | x | x | x | Column7
问候,
布鲁诺
这里有一个可能的解决方案(忽略WITH语句,这是为了复制您的表):
-- Lists the columns of one table that should be compared between days.
-- Replace `yourdataset` and 'yourtablename' with the real dataset and table.
SELECT COLUMN_NAME
FROM yourdataset.INFORMATION_SCHEMA.COLUMNS
-- TABLE_NAME holds the table name as text, so it must be compared with a
-- string literal; a bare identifier would be resolved as a column name.
WHERE TABLE_NAME = 'yourtablename'
  -- Leave out the key columns that identify the row and the day.
  AND COLUMN_NAME NOT IN ('ID', 'Day');
使用下面的方法
-- For every row, lists the columns whose value differs from the previous
-- day's row of the same id. Yields 'no changes' when nothing differs or when
-- no pair of name:value entries can be matched (e.g. the first day of an id).
with serialized_rows as (
  -- Serialize the current row and the previous row (same id, ordered by day)
  -- to JSON text, then strip braces and double quotes so that only
  -- comma-separated name:value pairs remain.
  select
    *,
    translate(to_json_string(t), '{}"', '') as cur_row,
    translate(to_json_string(lag(t) over (partition by id order by day)), '{}"', '') as prev_row
  from `project.dataset.table` as t
)
select
  * except (prev_row, cur_row),
  (
    select ifnull(string_agg(col_name), 'no changes')
    from (
      select
        split(p, ':')[offset(0)] as col_name,
        -- Keep everything after the FIRST colon so values that themselves
        -- contain colons (e.g. timestamps) are compared in full.
        if(strpos(p, ':') > 0, substr(p, strpos(p, ':') + 1), null) as value_previous
      from unnest(split(prev_row)) as p
    )
    inner join (
      select
        split(c, ':')[offset(0)] as col_name,
        if(strpos(c, ':') > 0, substr(c, strpos(c, ':') + 1), null) as value_current
      from unnest(split(cur_row)) as c
    )
    using (col_name)
    where value_previous != value_current
      -- Key columns are excluded regardless of how the table spells them
      -- (id / ID, day / Day).
      and lower(col_name) not in ('id', 'day')
  ) as changes_vs_previous_day
from serialized_rows;
-- NOTE(review): values that contain a comma are still broken apart by
-- split(); this query assumes column values contain no commas.
如果应用于问题中的样本数据,则输出为
更新:下面是上面的稍微重构版本,只是为了让它不那么冗长
#standardSQL
-- Turns the JSON text of one row into an array of structs, one per
-- comma-separated entry:
--   t        : the raw entry after braces and double quotes are removed
--   col_name : the text before the first colon
--   value    : the text after the first colon (NULL when there is no colon)
-- NOTE(review): entries are produced by split() on commas, so a column value
-- that contains a comma is broken apart; assumes values contain no commas.
create temp function extract_name_value_pairs (row string) as (
  array(
    select as struct
      t,
      split(t, ':')[offset(0)] as col_name,
      -- Keep everything after the FIRST colon so values that themselves
      -- contain colons (e.g. timestamps) are preserved in full.
      if(strpos(t, ':') > 0, substr(t, strpos(t, ':') + 1), null) as value
    from unnest(split(translate(row, '{}"', ''))) as t
  )
);
-- For every row, lists the columns whose value differs from the previous
-- day's row of the same id; 'no changes' when no differing pair is found.
select
  * except (prev_row, cur_row),
  (
    select ifnull(string_agg(col_name), 'no changes')
    from unnest(extract_name_value_pairs(prev_row)) as p
    inner join unnest(extract_name_value_pairs(cur_row)) as c
    using (col_name)
    where p.value != c.value
      -- Key columns are excluded regardless of how the table spells them
      -- (id / ID, day / Day).
      and lower(col_name) not in ('id', 'day')
  ) as changes_vs_previous_day
from (
  -- Attach the JSON text of the current row and of the previous row
  -- (same id, ordered by day) to every row.
  select
    *,
    to_json_string(t) as cur_row,
    to_json_string(lag(t) over (partition by id order by day)) as prev_row
  from `project.dataset.table` as t
);
你有固定的列要检查吗?或者你正在寻找一个更通用的解决方案来允许任意数量的列?
一个更通用的解决方案会更好,但我认为一个只考虑 20 列的更简单的解决方案也能解决我的问题。
已检查,很棒的想法!非常感谢。
没问题。如果答案令人满意,不要忘记接受答案以结束问题。
-- Lists the columns of one table that should be compared between days.
-- Replace `yourdataset` and 'yourtablename' with the real dataset and table.
SELECT COLUMN_NAME
FROM yourdataset.INFORMATION_SCHEMA.COLUMNS
-- TABLE_NAME holds the table name as text, so it must be compared with a
-- string literal; a bare identifier would be resolved as a column name.
WHERE TABLE_NAME = 'yourtablename'
  -- Leave out the key columns that identify the row and the day.
  AND COLUMN_NAME NOT IN ('ID', 'Day');
-- For every row, lists the columns whose value differs from the previous
-- day's row of the same id. Yields 'no changes' when nothing differs or when
-- no pair of name:value entries can be matched (e.g. the first day of an id).
with serialized_rows as (
  -- Serialize the current row and the previous row (same id, ordered by day)
  -- to JSON text, then strip braces and double quotes so that only
  -- comma-separated name:value pairs remain.
  select
    *,
    translate(to_json_string(t), '{}"', '') as cur_row,
    translate(to_json_string(lag(t) over (partition by id order by day)), '{}"', '') as prev_row
  from `project.dataset.table` as t
)
select
  * except (prev_row, cur_row),
  (
    select ifnull(string_agg(col_name), 'no changes')
    from (
      select
        split(p, ':')[offset(0)] as col_name,
        -- Keep everything after the FIRST colon so values that themselves
        -- contain colons (e.g. timestamps) are compared in full.
        if(strpos(p, ':') > 0, substr(p, strpos(p, ':') + 1), null) as value_previous
      from unnest(split(prev_row)) as p
    )
    inner join (
      select
        split(c, ':')[offset(0)] as col_name,
        if(strpos(c, ':') > 0, substr(c, strpos(c, ':') + 1), null) as value_current
      from unnest(split(cur_row)) as c
    )
    using (col_name)
    where value_previous != value_current
      -- Key columns are excluded regardless of how the table spells them
      -- (id / ID, day / Day).
      and lower(col_name) not in ('id', 'day')
  ) as changes_vs_previous_day
from serialized_rows;
-- NOTE(review): values that contain a comma are still broken apart by
-- split(); this query assumes column values contain no commas.
#standardSQL
-- Turns the JSON text of one row into an array of structs, one per
-- comma-separated entry:
--   t        : the raw entry after braces and double quotes are removed
--   col_name : the text before the first colon
--   value    : the text after the first colon (NULL when there is no colon)
-- NOTE(review): entries are produced by split() on commas, so a column value
-- that contains a comma is broken apart; assumes values contain no commas.
create temp function extract_name_value_pairs (row string) as (
  array(
    select as struct
      t,
      split(t, ':')[offset(0)] as col_name,
      -- Keep everything after the FIRST colon so values that themselves
      -- contain colons (e.g. timestamps) are preserved in full.
      if(strpos(t, ':') > 0, substr(t, strpos(t, ':') + 1), null) as value
    from unnest(split(translate(row, '{}"', ''))) as t
  )
);
-- For every row, lists the columns whose value differs from the previous
-- day's row of the same id; 'no changes' when no differing pair is found.
select
  * except (prev_row, cur_row),
  (
    select ifnull(string_agg(col_name), 'no changes')
    from unnest(extract_name_value_pairs(prev_row)) as p
    inner join unnest(extract_name_value_pairs(cur_row)) as c
    using (col_name)
    where p.value != c.value
      -- Key columns are excluded regardless of how the table spells them
      -- (id / ID, day / Day).
      and lower(col_name) not in ('id', 'day')
  ) as changes_vs_previous_day
from (
  -- Attach the JSON text of the current row and of the previous row
  -- (same id, ordered by day) to every row.
  select
    *,
    to_json_string(t) as cur_row,
    to_json_string(lag(t) over (partition by id order by day)) as prev_row
  from `project.dataset.table` as t
);