Sql bigquery:查找以下行匹配条件

Sql bigquery:查找以下行匹配条件,sql,google-bigquery,Sql,Google Bigquery,我正在查看BigQuery中的文本序列,并尝试识别共享ID的多行上的单词补全。数据如下所示: ID, Text 1, t 1, th 1, the 1, the 1, the c 1, the ca 1, the cat 1, the cat 1, the cat s ... 1, the cat sat on the mat 2, r ... 对于每个给定的ID和序列,我试图找到下一个单词边界。因此,理想的输出是: ID, Text, Boundary 1, t, the 1, th, t

我正在查看BigQuery中的文本序列,并尝试识别共享ID的多行上的单词补全。数据如下所示:

ID, Text
1, t
1, th
1, the
1, the
1, the c
1, the ca
1, the cat
1, the cat 
1, the cat s
...
1, the cat sat on the mat
2, r
...
对于每个给定的ID和序列,我试图找到下一个单词边界。因此,理想的输出是:

ID, Text, Boundary
1, t, the
1, th, the
1, the c, the cat
1, the ca, the cat
1, the cat s, the cat sat 

在上面的例子中,具有相同ID且以空格结尾的下一行给出了下一个单词边界;同一个ID可以有多个单词补全边界。

下面是BigQuery标准SQL

注意:这是一种蛮力方法,所以查询并不像理想中那样优雅——但希望这能给你们一个好的开始

#standardSQL
-- For every partially typed text, return the completed text (the row of the
-- same id that ends with a space) that finishes the word being typed.
-- Output columns: id, item (the partial text), boundary (the completed text).
WITH tagged AS (
  -- grp: number of spaces in text, not counting one trailing space. A partial
  --      text such as 'the c' and the row that completes it ('the cat ')
  --      therefore share the same grp.
  -- is_boundary: TRUE when text ends with a space, i.e. a word was completed.
  SELECT
    id,
    text,
    LENGTH(text) - LENGTH(REPLACE(text, ' ', '')) - IF(SUBSTR(text, -1) = ' ', 1, 0) AS grp,
    SUBSTR(text, -1) = ' ' AS is_boundary
  FROM `project.dataset.table`
),
grouped AS (
  SELECT
    id,
    grp,
    -- MAX picks exactly one boundary row per group. STRING_AGG would glue
    -- repeated boundary rows together (e.g. 'the ' twice -> 'the the ').
    -- COALESCE keeps the empty string for groups with no boundary row.
    COALESCE(MAX(IF(is_boundary, text, NULL)), '') AS boundary,
    -- Partial (non-boundary) texts of the group, shortest first.
    ARRAY_AGG(IF(NOT is_boundary, text, NULL) IGNORE NULLS ORDER BY LENGTH(text)) AS items
  FROM tagged
  GROUP BY id, grp
)
SELECT g.id, item, g.boundary
FROM grouped AS g
CROSS JOIN UNNEST(g.items) AS item WITH OFFSET AS pos
-- Skip a text that already equals its boundary apart from the trailing space.
WHERE RTRIM(item) != RTRIM(g.boundary)
-- Deterministic output, same ordering as the sample-data version of this query.
ORDER BY g.id, g.grp, pos
如果要将其应用于问题中的示例数据,可以写成如下形式:

#standardSQL
-- Same word-boundary query, run against inline sample data instead of a table.
-- Output columns: id, item (the partial text), boundary (the completed text).
WITH `project.dataset.table` AS (
  -- Sample data from the question: successive snapshots of the typed text.
  SELECT 1 AS id, 't' AS text UNION ALL
  SELECT 1, 'th' UNION ALL
  SELECT 1, 'the' UNION ALL
  SELECT 1, 'the ' UNION ALL
  SELECT 1, 'the c' UNION ALL
  SELECT 1, 'the ca' UNION ALL
  SELECT 1, 'the cat' UNION ALL
  SELECT 1, 'the cat ' UNION ALL
  SELECT 1, 'the cat s' UNION ALL
  SELECT 1, 'the cat sat '
),
tagged AS (
  -- grp: number of spaces in text, not counting one trailing space. A partial
  --      text such as 'the c' and the row that completes it ('the cat ')
  --      therefore share the same grp.
  -- is_boundary: TRUE when text ends with a space, i.e. a word was completed.
  SELECT
    id,
    text,
    LENGTH(text) - LENGTH(REPLACE(text, ' ', '')) - IF(SUBSTR(text, -1) = ' ', 1, 0) AS grp,
    SUBSTR(text, -1) = ' ' AS is_boundary
  FROM `project.dataset.table`
),
grouped AS (
  SELECT
    id,
    grp,
    -- MAX picks exactly one boundary row per group. STRING_AGG would glue
    -- repeated boundary rows together (e.g. 'the ' twice -> 'the the ').
    -- COALESCE keeps the empty string for groups with no boundary row.
    COALESCE(MAX(IF(is_boundary, text, NULL)), '') AS boundary,
    -- Partial (non-boundary) texts of the group, shortest first.
    ARRAY_AGG(IF(NOT is_boundary, text, NULL) IGNORE NULLS ORDER BY LENGTH(text)) AS items
  FROM tagged
  GROUP BY id, grp
)
SELECT g.id, item, g.boundary
FROM grouped AS g
CROSS JOIN UNNEST(g.items) AS item WITH OFFSET AS pos
-- Skip a text that already equals its boundary apart from the trailing space.
WHERE RTRIM(item) != RTRIM(g.boundary)
ORDER BY g.id, g.grp, pos
结果是

Row     id      item        boundary     
1       1       t           the  
2       1       th          the  
3       1       the c       the cat  
4       1       the ca      the cat  
5       1       the cat s   the cat sat  

BigQuery 的 UDF(用户自定义函数)在这类情况下很有用。以下是一个可行的解决方案:

#standardSQL
/*
  boundaryf(text, sentence)

  Returns the leading words of `sentence`, joined by single spaces, up to and
  including the word that `text` is currently in.

    text     - a partially typed text, e.g. 'the c'
    sentence - the text to take the words from, e.g. 'the cat sat'

  Example: boundaryf('the c', 'the cat sat') returns 'the cat'.
*/
create temp function boundaryf (text string, sentence string) as (
  array_to_string(array(
    select w
    -- with offset exposes each word's 0-based position, so the word order is
    -- explicit instead of relying on an unordered row_number()
    from unnest(split(sentence, ' ')) as w with offset as i
    -- number of words to keep = pieces of text split on ' ', minus the
    -- trailing whitespace characters (each trailing space adds an empty piece)
    where i < array_length(split(text, ' ')) - (length(text) - length(rtrim(text)))
    order by i
  ), ' ')
);

with items as (
  -- sample data: successive snapshots of the text typed for each id
  select 1 as id, 't' as text union all
  select 1, 'th' union all
  select 1, 'the' union all
  select 1, 'the ' union all
  select 1, 'the c' union all
  select 1, 'the ca' union all
  select 1, 'the cat' union all
  select 1, 'the cat ' union all
  select 1, 'the cat s' union all
  select 1, 'the cat sa' union all
  select 1, 'the cat sat' union all
  select 1, 'the cat sat ' union all
  select 1, 'the cat sat o' union all
  select 1, 'the cat sat on' union all
  select 1, 'the cat sat on ' union all
  select 1, 'the cat sat on a' union all
  select 1, 'the cat sat on a ' union all
  select 1, 'the cat sat on a m' union all
  select 1, 'the cat sat on a ma' union all
  select 1, 'the cat sat on a mat' union all
  select 2, 'i' union all
  select 2, 'i a' union all
  select 2, 'i am' union all
  select 2, 'i am f' union all
  select 2, 'i am fr' union all
  select 2, 'i am fre' union all
  select 2, 'i am free'
),
-- final sentence per id. The rows carry no ordering column and array_agg
-- without order by has no guaranteed order, so "last row" is not well defined;
-- the longest snapshot is used instead.
-- NOTE(review): assumes the text of an id only ever grows - confirm against the real data.
sentences as (
  select
    id,
    array_agg(text order by length(text) desc limit 1)[offset(0)] as sentence
  from items
  group by id
),
-- every snapshot paired with the word boundary computed by boundaryf
control as (
  select i.id, i.text, boundaryf(i.text, s.sentence) as boundary
  from items as i
  left join sentences as s on s.id = i.id
)
select id, text, boundary
from control
-- deterministic output: snapshots of each id from shortest to longest
order by id, length(text)