Hadoop Slowly Changing Dimension - SCD1 and SCD2 Implementation in Hive

I am looking for SCD1 and SCD2 implementations in Hive (1.2.1). I am aware of the workaround for loading SCD1 and SCD2 tables prior to Hive 0.14. Here is the link for loading SCD1 and SCD2 with the workaround approach.


Now that Hive supports ACID operations, I just want to know whether there is a better or more direct way to load such tables.
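For reference, Hive 2.2+ can do this more directly with ACID MERGE. A minimal, hedged sketch: the dim_admin / staging_admin tables and their columns are illustrative, the target must be a transactional ORC table, and the current-row flag must be a regular column rather than a partition column, since partition columns cannot be updated:

-- assumes: create table dim_admin (...) stored as orc tblproperties ('transactional'='true');
MERGE INTO dim_admin t
USING staging_admin s
ON t.admin_id = s.admin_id AND t.current_flag = 'current'
-- expire the current version when a tracked attribute changed
WHEN MATCHED AND t.email <> s.email THEN UPDATE SET
    effect_end_date = s.load_date,
    current_flag    = 'expired'
-- brand-new keys are inserted directly
WHEN NOT MATCHED THEN INSERT VALUES
    (s.admin_id, s.email, s.load_date, '9999-12-31', 'current');

-- MERGE sees the pre-statement snapshot, so a second pass inserts
-- the new version of the rows expired above
-- (assumes at most one load per load_date)
INSERT INTO TABLE dim_admin
SELECT s.admin_id, s.email, s.load_date, '9999-12-31', 'current'
FROM staging_admin s
JOIN dim_admin t
  ON t.admin_id = s.admin_id
 AND t.current_flag = 'expired'
 AND t.effect_end_date = s.load_date;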

Well, I solved this problem with two temporary tables:

drop table if exists administrator_tmp1;
drop table if exists administrator_tmp2;

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

--review_administrator
CREATE TABLE if not exists review_administrator(
    admin_id bigint ,
    admin_name string,
    create_time string,
    email string ,
    password string,
    status_description string,
    token string ,
    expire_time string ,
    granter_user_id bigint ,
    admin_time string ,
    effect_start_date string ,
    effect_end_date string 
)
partitioned by (current_row_indicator string comment 'current, expired')
stored as parquet;

--tmp1 is used for saving the original data
CREATE TABLE if not exists administrator_tmp1(
    admin_id bigint ,
    admin_name string,
    create_time string,
    email string ,
    password string ,
    status_description string ,
    token string ,
    expire_time string ,
    granter_user_id bigint ,
    admin_time string ,
    effect_start_date string ,
    effect_end_date string 
)
partitioned by (current_row_indicator string comment 'current, expired')
stored as parquet;

--tmp2 is used for saving the SCD data
CREATE TABLE if not exists administrator_tmp2(
    admin_id bigint ,
    admin_name string,
    create_time string,
    email string ,
    password string ,
    status_description string ,
    token string ,
    expire_time string ,
    granter_user_id bigint ,
    admin_time string ,
    effect_start_date string ,
    effect_end_date string 
)
partitioned by (current_row_indicator string comment 'current, expired')
stored as parquet;

--insert the original data into tmp1
INSERT OVERWRITE TABLE administrator_tmp1 PARTITION(current_row_indicator)
SELECT 
    user_id as admin_id,
    name as admin_name,
    time as create_time,
    email as email,
    password as password,
    status as status_description,
    token as token,
    expire_time as expire_time,
    admin_id as granter_user_id,
    admin_time as admin_time,
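    -- '{{ ds }}' and '{{ yesterday_ds }}' below look like Airflow template macros:
    -- the run date and the day before, rendered at execution time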
    '{{ ds }}' as effect_start_date,
    '9999-12-31' as effect_end_date,
    'current' as current_row_indicator
FROM 
    ks_db_origin.gifshow_administrator_origin
;

--insert scd data into tmp2
--for the unchanged data
INSERT INTO TABLE administrator_tmp2 PARTITION(current_row_indicator)
SELECT
    t2.admin_id,
    t2.admin_name,
    t2.create_time,
    t2.email,
    t2.password,
    t2.status_description,
    t2.token,
    t2.expire_time,
    t2.granter_user_id,
    t2.admin_time,
    t2.effect_start_date,
    t2.effect_end_date as effect_end_date,
    t2.current_row_indicator
FROM
    administrator_tmp1 t1
INNER JOIN 
    (
        SELECT * FROM review_administrator 
        WHERE current_row_indicator = 'current'
    ) t2
ON 
    t1.admin_id = t2.admin_id
AND t1.admin_name = t2.admin_name
AND t1.create_time = t2.create_time
AND t1.email = t2.email
AND t1.password = t2.password
AND t1.status_description = t2.status_description
AND t1.token = t2.token
AND t1.expire_time = t2.expire_time
AND t1.granter_user_id = t2.granter_user_id
AND t1.admin_time = t2.admin_time
;

--for the changed data, update the effect_end_date
INSERT INTO TABLE administrator_tmp2 PARTITION(current_row_indicator)
SELECT
    t2.admin_id,
    t2.admin_name,
    t2.create_time,
    t2.email,
    t2.password,
    t2.status_description,
    t2.token,
    t2.expire_time,
    t2.granter_user_id,
    t2.admin_time,
    t2.effect_start_date as effect_start_date,
    '{{ yesterday_ds }}' as effect_end_date,
    'expired' as current_row_indicator
FROM
    administrator_tmp1 t1
INNER JOIN 
    (
        SELECT * FROM review_administrator 
        WHERE current_row_indicator = 'current'
    ) t2
ON 
    t1.admin_id = t2.admin_id
WHERE NOT 
    (
        t1.admin_name = t2.admin_name
    AND t1.create_time = t2.create_time
    AND t1.email = t2.email
    AND t1.password = t2.password
    AND t1.status_description = t2.status_description
    AND t1.token = t2.token
    AND t1.expire_time = t2.expire_time
    AND t1.granter_user_id = t2.granter_user_id
    AND t1.admin_time = t2.admin_time
    )
;

--for the changed data and the new data
INSERT INTO TABLE administrator_tmp2 PARTITION(current_row_indicator)
SELECT
    t1.admin_id,
    t1.admin_name,
    t1.create_time,
    t1.email,
    t1.password,
    t1.status_description,
    t1.token,
    t1.expire_time,
    t1.granter_user_id,
    t1.admin_time,
    t1.effect_start_date,
    t1.effect_end_date,
    t1.current_row_indicator
FROM
    administrator_tmp1 t1
LEFT OUTER JOIN 
    (
        SELECT * FROM review_administrator 
        WHERE current_row_indicator = 'current'
    ) t2
ON 
    t1.admin_id = t2.admin_id
AND t1.admin_name = t2.admin_name
AND t1.create_time = t2.create_time
AND t1.email = t2.email
AND t1.password = t2.password
AND t1.status_description = t2.status_description
AND t1.token = t2.token
AND t1.expire_time = t2.expire_time
AND t1.granter_user_id = t2.granter_user_id
AND t1.admin_time = t2.admin_time
WHERE t2.admin_id IS NULL
;

--for the data already marked as 'expired'
INSERT INTO TABLE administrator_tmp2 PARTITION(current_row_indicator)
SELECT
    t1.admin_id,
    t1.admin_name,
    t1.create_time,
    t1.email,
    t1.password,
    t1.status_description,
    t1.token,
    t1.expire_time,
    t1.granter_user_id,
    t1.admin_time,
    t1.effect_start_date,
    t1.effect_end_date,
    t1.current_row_indicator
FROM
    review_administrator t1
WHERE t1.current_row_indicator = 'expired'
;

--populate the dim table
INSERT OVERWRITE TABLE review_administrator PARTITION(current_row_indicator)
SELECT
    t1.admin_id,
    t1.admin_name,
    t1.create_time,
    t1.email,
    t1.password,
    t1.status_description,
    t1.token,
    t1.expire_time,
    t1.granter_user_id,
    t1.admin_time,
    t1.effect_start_date,
    t1.effect_end_date,
    t1.current_row_indicator
FROM
    administrator_tmp2 t1
;

--drop the two temp tables
drop table administrator_tmp1;
drop table administrator_tmp2;


-- --example data
-- --2017-01-01
-- insert into table review_administrator PARTITION(current_row_indicator)
-- SELECT '1','a','2016-12-31','a@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-01','9999-12-31','current' 
-- FROM default.sample_07 limit 1;

-- --2017-01-02
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '1','a','2016-12-31','a01@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-02','9999-12-31','current' 
-- FROM default.sample_07 limit 1;

-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '2','b','2016-12-31','a@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-02','9999-12-31','current' 
-- FROM default.sample_07 limit 1;

-- --2017-01-03
-- --id 1 is changed
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '1','a','2016-12-31','a03@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-03','9999-12-31','current' 
-- FROM default.sample_07 limit 1;
-- --id 2 is not changed at all
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '2','b','2016-12-31','a@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-03','9999-12-31','current' 
-- FROM default.sample_07 limit 1;
-- --id 3 is a new record
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '3','c','2016-12-31','c@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-03','9999-12-31','current' 
-- FROM default.sample_07 limit 1;

-- --now the dim table will show you the right SCD.

Since HDFS is immutable storage, one could argue that versioning data and keeping history (SCD2) should be the default behaviour for loading dimensions. You can create a view in your Hadoop SQL query engine (Hive, Impala, Drill, etc.) that retrieves the current state / latest value using windowing functions. You can find more information in my blog post, e.g. on how to handle large dimension and fact tables.
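A minimal sketch of such a view, reusing the review_administrator table from the answer above; treating effect_start_date as the version order is an assumption:

CREATE VIEW review_administrator_latest AS
SELECT admin_id, admin_name, email, effect_start_date
FROM (
    SELECT r.*,
           ROW_NUMBER() OVER (PARTITION BY admin_id
                              ORDER BY effect_start_date DESC) AS rn
    FROM review_administrator r
) v
WHERE rn = 1;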

I did use another approach when it comes to managing data with SCDs:

  • Never update data that already exists in your historical file or table.

  • Make sure new rows are compared to the most recent generation. For instance, the load logic adds control columns: loaded_on, checksum, and, if needed, a sequence column used when multiple loads occur on the same day. Comparing new data to the most recent generation then uses both the control columns and the key columns present in the data, such as a customer or product key.

  • Now the magic happens: compute a checksum of all the relevant columns except the control columns, creating a unique fingerprint for each row. The fingerprint (checksum) column is then used to determine whether any column has changed compared to the most recent generation (the most recent generation being the latest state of the data, based on the key, loaded_on and sequence columns).

    You then know whether a row coming from the daily update is new (there is no previous generation), or whether it requires creating a new row (a new generation) in the historical file or table; and if a row from the daily update carries no change at all, no row needs to be created, because there is no difference from the previous generation.

    The required logic can be built with Apache Spark: in a single statement you can ask Spark to concatenate any number of columns of any data type, then compute a hash value that serves as the fingerprint; a sketch follows this list.

    Altogether, you could develop a Spark-based utility that accepts any data source and outputs a well organized, clean, slowly-changing-dimension-aware historical file or table. And finally: never update, append only!
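    A hedged sketch of that fingerprint computation, expressed in SQL that runs on Spark SQL and on Hive 1.3+ (sha2 is not available in earlier Hive); the table and column names are illustrative:

    SELECT
        customer_key,
        current_date AS loaded_on,
        -- concat_ws skips NULLs, so coalesce() each column first
        -- if NULL positions must be distinguished
        sha2(concat_ws('||',
                       cast(name   AS string),
                       cast(email  AS string),
                       cast(status AS string)), 256) AS checksum
    FROM daily_update;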

    Here goes a detailed implementation of slowly changing dimension type 2 in Hive, using an exclusive-join approach.

    It assumes that the source sends a complete data file, i.e. old, updated and new records.

    Steps -

  • Load the recent file data into the STG (staging) table.

  • Select all expired records from the HIST table:

    select * from HIST_TAB where exp_dt != '2099-12-31'

  • Select all records that have not changed, from STG and HIST, using an inner join and a filter on HIST.column = STG.column, as below:

    select hist.*
    from HIST_TAB hist
    inner join STG_TAB stg
    on hist.key = stg.key
    where hist.column = stg.column

  • Select all new and updated records from STG_TAB, using an exclusive left join with HIST_TAB, and set the effective and expiry dates, as below:

    select stg.*,
           current_date as eff_dt,   -- load date (yyyy-MM-dd)
           '2099-12-31' as exp_dt
    from STG_TAB stg
    left join
        (select * from HIST_TAB where exp_dt = '2099-12-31') hist
    on hist.key = stg.key
    where hist.key is null
       or hist.column != stg.column

  • Select all updated old records from the HIST table, using an exclusive left join with the STG table, and set their expiry date, as below:

    select hist.*,
           current_date as exp_dt    -- expiry date (yyyy-MM-dd)
    from (select * from HIST_TAB where exp_dt = '2099-12-31') hist
    left join STG_TAB stg
    on hist.key = stg.key
    where stg.key is null
       or hist.column != stg.column

  • union all the queries from steps 2-5 and insert overwrite the result into the HIST table; a sketch follows this list.
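    A hedged sketch of that final step, with the key column named id and a single tracked attribute named attr (illustrative renames, since column clashes with a Hive keyword):

    insert overwrite table HIST_TAB
    select id, attr, eff_dt, exp_dt
    from (
        -- step 2: history rows that are already expired
        select id, attr, eff_dt, exp_dt
        from HIST_TAB
        where exp_dt != '2099-12-31'
        union all
        -- step 3: current rows that did not change
        select hist.id, hist.attr, hist.eff_dt, hist.exp_dt
        from HIST_TAB hist
        inner join STG_TAB stg on hist.id = stg.id
        where hist.exp_dt = '2099-12-31'
          and hist.attr = stg.attr
        union all
        -- step 4: new and changed rows become the current generation
        select stg.id, stg.attr,
               cast(current_date as string) as eff_dt,
               '2099-12-31' as exp_dt
        from STG_TAB stg
        left join (select * from HIST_TAB where exp_dt = '2099-12-31') hist
          on hist.id = stg.id
        where hist.id is null or hist.attr != stg.attr
        union all
        -- step 5: expire the previous generation of changed or deleted keys
        select hist.id, hist.attr, hist.eff_dt,
               cast(current_date as string) as exp_dt
        from (select * from HIST_TAB where exp_dt = '2099-12-31') hist
        left join STG_TAB stg on hist.id = stg.id
        where stg.id is null or hist.attr != stg.attr
    ) merged;
    -- Hive materializes the select before the overwrite, so reading
    -- HIST_TAB inside the same statement is safe.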

    Here is a more detailed implementation of SCD Type 2:

    drop table if exists harsha.emp;
    
    drop table if exists harsha.emp_tmp1;
    
    drop table if exists harsha.emp_tmp2;
    
    drop table if exists harsha.init_load;
    
    show databases;
    use harsha;
    show tables;
    
    create table harsha.emp (eid int,ename string,sal int,loc string,dept int,start_date timestamp,end_date timestamp,current_status string)
    comment "emp scd implementation"
    row format delimited
    fields terminated by ','
    lines terminated by '\n'
    ;
    
    create table harsha.emp_tmp1 (eid int,ename string,sal int,loc string,dept int,start_date timestamp,end_date timestamp,current_status string)
    comment "emp scd implementation"
    row format delimited
    fields terminated by ','
    lines terminated by '\n'
    ;
    
    create table harsha.emp_tmp2 (eid int,ename string,sal int,loc string,dept int,start_date timestamp,end_date timestamp,current_status string)
    comment "emp scd implementation"
    row format delimited
    fields terminated by ','
    lines terminated by '\n'
    ;
    
    create table harsha.init_load (eid int,ename string,sal int,loc string,dept int) 
    row format delimited
    fields terminated by ','
    lines terminated by '\n'
    ;
    
    show tables;
    
    insert into table harsha.emp select 101 as eid,'aaaa' as ename,3400 as sal,'chicago' as loc,10 as did,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status from (select '123')x;
    
    insert into table harsha.emp select 102 as eid,'abaa' as ename,6400 as sal,'ny' as loc,10 as did,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status from (select '123')x;
    
    insert into table harsha.emp select 103 as eid,'abca' as ename,2300 as sal,'sfo' as loc,20 as did,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status from (select '123')x;
    
    insert into table harsha.emp select 104 as eid,'afga' as ename,3000 as sal,'seattle' as loc,10 as did,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status from (select '123')x;
    
    insert into table harsha.emp select 105 as eid,'ikaa' as ename,1400 as sal,'LA' as loc,30 as did,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status from (select '123')x;
    
    insert into table harsha.emp select 106 as eid,'cccc' as ename,3499 as sal,'spokane' as loc,20 as did,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status from (select '123')x;
    
    insert into table harsha.emp select 107 as eid,'toiz' as ename,4000 as sal,'WA.DC' as loc,40 as did,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status from (select '123')x;
    
    load data local inpath 'Documents/hadoop_scripts/t3.txt' into table harsha.emp;
    
    load data local inpath 'Documents/hadoop_scripts/t4.txt' into table harsha.init_load;
    
    insert into table harsha.emp_tmp1 select eid,ename,sal,loc,dept,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status 
    from harsha.init_load;
    
    insert into table harsha.emp_tmp2
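    -- 1) rows from the new load with no exact column-for-column match in emp: new or changed -> 'updated'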
    select a.eid,a.ename,a.sal,a.loc,a.dept,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'updated' as current_status from emp_tmp1 a
    left outer join emp b on
    a.eid=b.eid and 
    a.ename=b.ename and
    a.sal=b.sal and 
    a.loc = b.loc and 
    a.dept = b.dept
    where b.eid is null
    union all
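    -- 2) rows from the new load that match an emp row exactly: unchanged -> 'current'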
    select a.eid,a.ename,a.sal,a.loc,a.dept,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status from emp_tmp1 a
    left outer join emp b on
    a.eid = b.eid and
    a.ename=b.ename and
    a.sal=b.sal and 
    a.loc=b.loc and 
    a.dept=b.dept
    where b.eid is not null
    union all
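    -- 3) previous emp versions of changed rows, end-dated now -> 'expired'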
    select b.eid,b.ename,b.sal,b.loc,b.dept,b.start_date as start_date,from_unixtime(unix_timestamp()) as end_date,'expired' as current_status from emp b
    inner join emp_tmp1 a on
    a.eid=b.eid  
    where
    a.ename <> b.ename or
    a.sal <> b.sal or 
    a.loc <> b.loc or 
    a.dept <> b.dept 
    ;
    
    insert into table harsha.emp select eid,ename,sal,loc,dept,start_date,end_date,current_status from emp_tmp2;
    
    -- records including expired:
    
    select * from harsha.emp order by eid;
    
    -- latest records:
    
    select a.* from emp a inner join (select eid ,max(start_date) as start_date from emp where current_status <> 'expired' group by eid) b on a.eid=b.eid and a.start_date=b.start_date;