Hadoop Slowly Changing Dimension - SCD1 and SCD2 Implementation in Hive

I am looking for SCD1 and SCD2 implementations in Hive (1.2.1). I am aware of the workaround for loading SCD1 and SCD2 tables prior to Hive 0.14. Here is the link for loading SCD1 and SCD2 with the workaround approach.


Now that Hive supports ACID operations, I just want to know whether there is a better or more direct way to load such tables.
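For reference, Hive 2.2+ can do this more directly with ACID MERGE. A minimal, hedged sketch: the dim_admin / staging_admin tables and their columns are illustrative, the target must be a transactional ORC table, and the current-row flag must be a regular column rather than a partition column, since partition columns cannot be updated:

-- assumes: create table dim_admin (...) stored as orc tblproperties ('transactional'='true');
MERGE INTO dim_admin t
USING staging_admin s
ON t.admin_id = s.admin_id AND t.current_flag = 'current'
-- expire the current version when a tracked attribute changed
WHEN MATCHED AND t.email <> s.email THEN UPDATE SET
    effect_end_date = s.load_date,
    current_flag    = 'expired'
-- brand-new keys are inserted directly
WHEN NOT MATCHED THEN INSERT VALUES
    (s.admin_id, s.email, s.load_date, '9999-12-31', 'current');

-- MERGE sees the pre-statement snapshot, so a second pass inserts
-- the new version of the rows expired above
-- (assumes at most one load per load_date)
INSERT INTO TABLE dim_admin
SELECT s.admin_id, s.email, s.load_date, '9999-12-31', 'current'
FROM staging_admin s
JOIN dim_admin t
  ON t.admin_id = s.admin_id
 AND t.current_flag = 'expired'
 AND t.effect_end_date = s.load_date;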

Well, I solved this problem with two temporary tables:

drop table if exists administrator_tmp1;
drop table if exists administrator_tmp2;

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

--review_administrator
CREATE TABLE if not exists review_administrator(
    admin_id bigint ,
    admin_name string,
    create_time string,
    email string ,
    password string,
    status_description string,
    token string ,
    expire_time string ,
    granter_user_id bigint ,
    admin_time string ,
    effect_start_date string ,
    effect_end_date string 
)
partitioned by (current_row_indicator string comment 'current, expired')
stored as parquet;

--tmp1 is used for saving the original data
CREATE TABLE if not exists administrator_tmp1(
    admin_id bigint ,
    admin_name string,
    create_time string,
    email string ,
    password string ,
    status_description string ,
    token string ,
    expire_time string ,
    granter_user_id bigint ,
    admin_time string ,
    effect_start_date string ,
    effect_end_date string 
)
partitioned by (current_row_indicator string comment 'current, expired')
stored as parquet;

--tmp2 is used for saving the SCD data
CREATE TABLE if not exists administrator_tmp2(
    admin_id bigint ,
    admin_name string,
    create_time string,
    email string ,
    password string ,
    status_description string ,
    token string ,
    expire_time string ,
    granter_user_id bigint ,
    admin_time string ,
    effect_start_date string ,
    effect_end_date string 
)
partitioned by (current_row_indicator string comment 'current, expired')
stored as parquet;

--insert the original data into tmp1
INSERT OVERWRITE TABLE administrator_tmp1 PARTITION(current_row_indicator)
SELECT 
    user_id as admin_id,
    name as admin_name,
    time as create_time,
    email as email,
    password as password,
    status as status_description,
    token as token,
    expire_time as expire_time,
    admin_id as granter_user_id,
    admin_time as admin_time,
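    -- '{{ ds }}' and '{{ yesterday_ds }}' below look like Airflow template macros:
    -- the run date and the day before, rendered at execution time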
    '{{ ds }}' as effect_start_date,
    '9999-12-31' as effect_end_date,
    'current' as current_row_indicator
FROM 
    ks_db_origin.gifshow_administrator_origin
;

--insert scd data into tmp2
--for the unchanged data
INSERT INTO TABLE administrator_tmp2 PARTITION(current_row_indicator)
SELECT
    t2.admin_id,
    t2.admin_name,
    t2.create_time,
    t2.email,
    t2.password,
    t2.status_description,
    t2.token,
    t2.expire_time,
    t2.granter_user_id,
    t2.admin_time,
    t2.effect_start_date,
    t2.effect_end_date as effect_end_date,
    t2.current_row_indicator
FROM
    administrator_tmp1 t1
INNER JOIN 
    (
        SELECT * FROM review_administrator 
        WHERE current_row_indicator = 'current'
    ) t2
ON 
    t1.admin_id = t2.admin_id
AND t1.admin_name = t2.admin_name
AND t1.create_time = t2.create_time
AND t1.email = t2.email
AND t1.password = t2.password
AND t1.status_description = t2.status_description
AND t1.token = t2.token
AND t1.expire_time = t2.expire_time
AND t1.granter_user_id = t2.granter_user_id
AND t1.admin_time = t2.admin_time
;

--for the changed data, update the effect_end_date
INSERT INTO TABLE administrator_tmp2 PARTITION(current_row_indicator)
SELECT
    t2.admin_id,
    t2.admin_name,
    t2.create_time,
    t2.email,
    t2.password,
    t2.status_description,
    t2.token,
    t2.expire_time,
    t2.granter_user_id,
    t2.admin_time,
    t2.effect_start_date as effect_start_date,
    '{{ yesterday_ds }}' as effect_end_date,
    'expired' as current_row_indicator
FROM
    administrator_tmp1 t1
INNER JOIN 
    (
        SELECT * FROM review_administrator 
        WHERE current_row_indicator = 'current'
    ) t2
ON 
    t1.admin_id = t2.admin_id
WHERE NOT 
    (
        t1.admin_name = t2.admin_name
    AND t1.create_time = t2.create_time
    AND t1.email = t2.email
    AND t1.password = t2.password
    AND t1.status_description = t2.status_description
    AND t1.token = t2.token
    AND t1.expire_time = t2.expire_time
    AND t1.granter_user_id = t2.granter_user_id
    AND t1.admin_time = t2.admin_time
    )
;

--for the changed data and the new data
INSERT INTO TABLE administrator_tmp2 PARTITION(current_row_indicator)
SELECT
    t1.admin_id,
    t1.admin_name,
    t1.create_time,
    t1.email,
    t1.password,
    t1.status_description,
    t1.token,
    t1.expire_time,
    t1.granter_user_id,
    t1.admin_time,
    t1.effect_start_date,
    t1.effect_end_date,
    t1.current_row_indicator
FROM
    administrator_tmp1 t1
LEFT OUTER JOIN 
    (
        SELECT * FROM review_administrator 
        WHERE current_row_indicator = 'current'
    ) t2
ON 
    t1.admin_id = t2.admin_id
AND t1.admin_name = t2.admin_name
AND t1.create_time = t2.create_time
AND t1.email = t2.email
AND t1.password = t2.password
AND t1.status_description = t2.status_description
AND t1.token = t2.token
AND t1.expire_time = t2.expire_time
AND t1.granter_user_id = t2.granter_user_id
AND t1.admin_time = t2.admin_time
WHERE t2.admin_id IS NULL
;

--for the data already marked as 'expired'
INSERT INTO TABLE administrator_tmp2 PARTITION(current_row_indicator)
SELECT
    t1.admin_id,
    t1.admin_name,
    t1.create_time,
    t1.email,
    t1.password,
    t1.status_description,
    t1.token,
    t1.expire_time,
    t1.granter_user_id,
    t1.admin_time,
    t1.effect_start_date,
    t1.effect_end_date,
    t1.current_row_indicator
FROM
    review_administrator t1
WHERE t1.current_row_indicator = 'expired'
;

--populate the dim table
INSERT OVERWRITE TABLE review_administrator PARTITION(current_row_indicator)
SELECT
    t1.admin_id,
    t1.admin_name,
    t1.create_time,
    t1.email,
    t1.password,
    t1.status_description,
    t1.token,
    t1.expire_time,
    t1.granter_user_id,
    t1.admin_time,
    t1.effect_start_date,
    t1.effect_end_date,
    t1.current_row_indicator
FROM
    administrator_tmp2 t1
;

--drop the two temp tables
drop table administrator_tmp1;
drop table administrator_tmp2;


-- --example data
-- --2017-01-01
-- insert into table review_administrator PARTITION(current_row_indicator)
-- SELECT '1','a','2016-12-31','a@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-01','9999-12-31','current' 
-- FROM default.sample_07 limit 1;

-- --2017-01-02
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '1','a','2016-12-31','a01@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-02','9999-12-31','current' 
-- FROM default.sample_07 limit 1;

-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '2','b','2016-12-31','a@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-02','9999-12-31','current' 
-- FROM default.sample_07 limit 1;

-- --2017-01-03
-- --id 1 is changed
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '1','a','2016-12-31','a03@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-03','9999-12-31','current' 
-- FROM default.sample_07 limit 1;
-- --id 2 is not changed at all
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '2','b','2016-12-31','a@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-03','9999-12-31','current' 
-- FROM default.sample_07 limit 1;
-- --id 3 is a new record
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '3','c','2016-12-31','c@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-03','9999-12-31','current' 
-- FROM default.sample_07 limit 1;

-- --now the dim table will show you the right SCD.

Since HDFS is immutable storage, one could argue that versioning data and keeping history (SCD2) should be the default behaviour for loading dimensions. You can create a view in your Hadoop SQL query engine (Hive, Impala, Drill, etc.) that retrieves the current state / latest value using windowing functions. You can find more information in my blog post, e.g. on how to handle large dimension and fact tables.
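A minimal sketch of such a view, reusing the review_administrator table from the answer above; treating effect_start_date as the version order is an assumption:

CREATE VIEW review_administrator_latest AS
SELECT admin_id, admin_name, email, effect_start_date
FROM (
    SELECT r.*,
           ROW_NUMBER() OVER (PARTITION BY admin_id
                              ORDER BY effect_start_date DESC) AS rn
    FROM review_administrator r
) v
WHERE rn = 1;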

I did use another approach when it comes to managing data with SCDs:

  • Never update data that already exists in your historical file or table.

  • Make sure new rows are compared to the most recent generation. For instance, the load logic adds control columns: loaded_on, checksum, and, if needed, a sequence column used when multiple loads occur on the same day. Comparing new data to the most recent generation then uses both the control columns and the key columns present in the data, such as a customer or product key.

  • Now the magic happens: compute a checksum of all the relevant columns except the control columns, creating a unique fingerprint for each row. The fingerprint (checksum) column is then used to determine whether any column has changed compared to the most recent generation (the most recent generation being the latest state of the data, based on the key, loaded_on and sequence columns).

    You then know whether a row coming from the daily update is new (there is no previous generation), or whether it requires creating a new row (a new generation) in the historical file or table; and if a row from the daily update carries no change at all, no row needs to be created, because there is no difference from the previous generation.

    The required logic can be built with Apache Spark: in a single statement you can ask Spark to concatenate any number of columns of any data type, then compute a hash value that serves as the fingerprint; a sketch follows this list.

    Altogether, you could develop a Spark-based utility that accepts any data source and outputs a well organized, clean, slowly-changing-dimension-aware historical file or table. And finally: never update, append only!
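    A hedged sketch of that fingerprint computation, expressed in SQL that runs on Spark SQL and on Hive 1.3+ (sha2 is not available in earlier Hive); the table and column names are illustrative:

    SELECT
        customer_key,
        current_date AS loaded_on,
        -- concat_ws skips NULLs, so coalesce() each column first
        -- if NULL positions must be distinguished
        sha2(concat_ws('||',
                       cast(name   AS string),
                       cast(email  AS string),
                       cast(status AS string)), 256) AS checksum
    FROM daily_update;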

    Here goes a detailed implementation of slowly changing dimension type 2 in Hive, using an exclusive-join approach.

    It assumes that the source sends a complete data file, i.e. old, updated and new records.

    Steps -

  • Load the recent file data into the STG (staging) table.

  • Select all expired records from the HIST table:

    select * from HIST_TAB where exp_dt != '2099-12-31'

  • Select all records that have not changed, from STG and HIST, using an inner join and a filter on HIST.column = STG.column, as below:

    select hist.*
    from HIST_TAB hist
    inner join STG_TAB stg
    on hist.key = stg.key
    where hist.column = stg.column

  • Select all new and updated records from STG_TAB, using an exclusive left join with HIST_TAB, and set the effective and expiry dates, as below:

    select stg.*,
           current_date as eff_dt,   -- load date (yyyy-MM-dd)
           '2099-12-31' as exp_dt
    from STG_TAB stg
    left join
        (select * from HIST_TAB where exp_dt = '2099-12-31') hist
    on hist.key = stg.key
    where hist.key is null
       or hist.column != stg.column

  • Select all updated old records from the HIST table, using an exclusive left join with the STG table, and set their expiry date, as below:

    select hist.*,
           current_date as exp_dt    -- expiry date (yyyy-MM-dd)
    from (select * from HIST_TAB where exp_dt = '2099-12-31') hist
    left join STG_TAB stg
    on hist.key = stg.key
    where stg.key is null
       or hist.column != stg.column

  • union all the queries from steps 2-5 and insert overwrite the result into the HIST table; a sketch follows this list.
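    A hedged sketch of that final step, with the key column named id and a single tracked attribute named attr (illustrative renames, since column clashes with a Hive keyword):

    insert overwrite table HIST_TAB
    select id, attr, eff_dt, exp_dt
    from (
        -- step 2: history rows that are already expired
        select id, attr, eff_dt, exp_dt
        from HIST_TAB
        where exp_dt != '2099-12-31'
        union all
        -- step 3: current rows that did not change
        select hist.id, hist.attr, hist.eff_dt, hist.exp_dt
        from HIST_TAB hist
        inner join STG_TAB stg on hist.id = stg.id
        where hist.exp_dt = '2099-12-31'
          and hist.attr = stg.attr
        union all
        -- step 4: new and changed rows become the current generation
        select stg.id, stg.attr,
               cast(current_date as string) as eff_dt,
               '2099-12-31' as exp_dt
        from STG_TAB stg
        left join (select * from HIST_TAB where exp_dt = '2099-12-31') hist
          on hist.id = stg.id
        where hist.id is null or hist.attr != stg.attr
        union all
        -- step 5: expire the previous generation of changed or deleted keys
        select hist.id, hist.attr, hist.eff_dt,
               cast(current_date as string) as exp_dt
        from (select * from HIST_TAB where exp_dt = '2099-12-31') hist
        left join STG_TAB stg on hist.id = stg.id
        where stg.id is null or hist.attr != stg.attr
    ) merged;
    -- Hive materializes the select before the overwrite, so reading
    -- HIST_TAB inside the same statement is safe.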

    Here is a more detailed implementation of SCD Type 2:

    drop table if exists harsha.emp;
    
    drop table if exists harsha.emp_tmp1;
    
    drop table if exists harsha.emp_tmp2;
    
    drop table if exists harsha.init_load;
    
    show databases;
    use harsha;
    show tables;
    
    create table harsha.emp (eid int,ename string,sal int,loc string,dept int,start_date timestamp,end_date timestamp,current_status string)
    comment "emp scd implementation"
    row format delimited
    fields terminated by ','
    lines terminated by '\n'
    ;
    
    create table harsha.emp_tmp1 (eid int,ename string,sal int,loc string,dept int,start_date timestamp,end_date timestamp,current_status string)
    comment "emp scd implementation"
    row format delimited
    fields terminated by ','
    lines terminated by '\n'
    ;
    
    create table harsha.emp_tmp2 (eid int,ename string,sal int,loc string,dept int,start_date timestamp,end_date timestamp,current_status string)
    comment "emp scd implementation"
    row format delimited
    fields terminated by ','
    lines terminated by '\n'
    ;
    
    create table harsha.init_load (eid int,ename string,sal int,loc string,dept int) 
    row format delimited
    fields terminated by ','
    lines terminated by '\n'
    ;
    
    show tables;
    
    insert into table harsha.emp select 101 as eid,'aaaa' as ename,3400 as sal,'chicago' as loc,10 as did,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status from (select '123')x;
    
    insert into table harsha.emp select 102 as eid,'abaa' as ename,6400 as sal,'ny' as loc,10 as did,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status from (select '123')x;
    
    insert into table harsha.emp select 103 as eid,'abca' as ename,2300 as sal,'sfo' as loc,20 as did,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status from (select '123')x;
    
    insert into table harsha.emp select 104 as eid,'afga' as ename,3000 as sal,'seattle' as loc,10 as did,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status from (select '123')x;
    
    insert into table harsha.emp select 105 as eid,'ikaa' as ename,1400 as sal,'LA' as loc,30 as did,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status from (select '123')x;
    
    insert into table harsha.emp select 106 as eid,'cccc' as ename,3499 as sal,'spokane' as loc,20 as did,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status from (select '123')x;
    
    insert into table harsha.emp select 107 as eid,'toiz' as ename,4000 as sal,'WA.DC' as loc,40 as did,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status from (select '123')x;
    
    load data local inpath 'Documents/hadoop_scripts/t3.txt' into table harsha.emp;
    
    load data local inpath 'Documents/hadoop_scripts/t4.txt' into table harsha.init_load;
    
    insert into table harsha.emp_tmp1 select eid,ename,sal,loc,dept,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status 
    from harsha.init_load;
    
    insert into table harsha.emp_tmp2
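    -- 1) rows from the new load with no exact column-for-column match in emp: new or changed -> 'updated'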
    select a.eid,a.ename,a.sal,a.loc,a.dept,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'updated' as current_status from emp_tmp1 a
    left outer join emp b on
    a.eid=b.eid and 
    a.ename=b.ename and
    a.sal=b.sal and 
    a.loc = b.loc and 
    a.dept = b.dept
    where b.eid is null
    union all
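    -- 2) rows from the new load that match an emp row exactly: unchanged -> 'current'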
    select a.eid,a.ename,a.sal,a.loc,a.dept,from_unixtime(unix_timestamp()) as start_date,from_unixtime(unix_timestamp('9999-12-31 23:59:59','yyyy-MM-dd HH:mm:ss')) as end_date,'current' as current_status from emp_tmp1 a
    left outer join emp b on
    a.eid = b.eid and
    a.ename=b.ename and
    a.sal=b.sal and 
    a.loc=b.loc and 
    a.dept=b.dept
    where b.eid is not null
    union all
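    -- 3) previous emp versions of changed rows, end-dated now -> 'expired'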
    select b.eid,b.ename,b.sal,b.loc,b.dept,b.start_date as start_date,from_unixtime(unix_timestamp()) as end_date,'expired' as current_status from emp b
    inner join emp_tmp1 a on
    a.eid=b.eid  
    where
    a.ename <> b.ename or
    a.sal <> b.sal or 
    a.loc <> b.loc or 
    a.dept <> b.dept 
    ;
    
    insert into table harsha.emp select eid,ename,sal,loc,dept,start_date,end_date,current_status from emp_tmp2;
    
    -- records including expired:
    
    select * from harsha.emp order by eid;
    
    -- latest records:
    
    select a.* from emp a inner join (select eid ,max(start_date) as start_date from emp where current_status <> 'expired' group by eid) b on a.eid=b.eid and a.start_date=b.start_date;