Hadoop 缓慢变化维度:Hive 中的 SCD1 和 SCD2 实现
我正在寻找 Hive(1.2.1)中的 SCD1 和 SCD2 实现。我了解在 Hive 0.14 之前加载 SCD1 和 SCD2 表的变通方法,以下是使用该变通方法加载 SCD1 和 SCD2 的链接。(标签:hadoop、hive、data-warehouse)
既然 Hive 已经支持 ACID 操作,我只想知道是否有更好或更直接的加载方式。好吧,我使用两个临时表来解决这个问题:
-- Start from a clean slate: discard staging tables left over from an earlier run.
DROP TABLE IF EXISTS administrator_tmp1;
DROP TABLE IF EXISTS administrator_tmp2;

-- The INSERT statements of this script name the partition column without a value
-- (PARTITION(current_row_indicator)), which requires dynamic partitioning in
-- nonstrict mode.
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
-- review_administrator: the dimension table maintained by this script.
-- Each row is one version of an administrator record. The version is valid from
-- effect_start_date to effect_end_date, and the partition column tells whether
-- it is the 'current' or an 'expired' version.
CREATE TABLE IF NOT EXISTS review_administrator (
    admin_id            BIGINT,
    admin_name          STRING,
    create_time         STRING,
    email               STRING,
    password            STRING,
    status_description  STRING,
    token               STRING,
    expire_time         STRING,
    granter_user_id     BIGINT,
    admin_time          STRING,
    effect_start_date   STRING,
    effect_end_date     STRING
)
PARTITIONED BY (current_row_indicator STRING COMMENT 'current, expired')
STORED AS PARQUET;
-- administrator_tmp1: staging table that receives the untouched source snapshot.
-- Same layout as review_administrator. The partition comment now reads
-- 'current, expired' exactly like the two sibling tables (a stray trailing colon
-- was removed).
CREATE TABLE IF NOT EXISTS administrator_tmp1 (
    admin_id            BIGINT,
    admin_name          STRING,
    create_time         STRING,
    email               STRING,
    password            STRING,
    status_description  STRING,
    token               STRING,
    expire_time         STRING,
    granter_user_id     BIGINT,
    admin_time          STRING,
    effect_start_date   STRING,
    effect_end_date     STRING
)
PARTITIONED BY (current_row_indicator STRING COMMENT 'current, expired')
STORED AS PARQUET;
-- administrator_tmp2: staging table in which the new state of the dimension
-- (the SCD result) is assembled before it replaces review_administrator.
CREATE TABLE IF NOT EXISTS administrator_tmp2 (
    admin_id            BIGINT,
    admin_name          STRING,
    create_time         STRING,
    email               STRING,
    password            STRING,
    status_description  STRING,
    token               STRING,
    expire_time         STRING,
    granter_user_id     BIGINT,
    admin_time          STRING,
    effect_start_date   STRING,
    effect_end_date     STRING
)
PARTITIONED BY (current_row_indicator STRING COMMENT 'current, expired')
STORED AS PARQUET;
-- Stage the source snapshot in administrator_tmp1, renaming the source columns to
-- the dimension layout. Every staged row is stamped as an open-ended 'current'
-- version: it starts on the templated run date and ends on the 9999-12-31 sentinel.
INSERT OVERWRITE TABLE administrator_tmp1 PARTITION (current_row_indicator)
SELECT
    src.user_id      AS admin_id,
    src.name         AS admin_name,
    src.time         AS create_time,
    src.email        AS email,
    src.password     AS password,
    src.status       AS status_description,
    src.token        AS token,
    src.expire_time  AS expire_time,
    src.admin_id     AS granter_user_id,
    src.admin_time   AS admin_time,
    '{{ ds }}'       AS effect_start_date,
    '9999-12-31'     AS effect_end_date,
    'current'        AS current_row_indicator
FROM ks_db_origin.gifshow_administrator_origin src;
-- Build the next state of the dimension in administrator_tmp2 from the staged
-- snapshot (administrator_tmp1) and the current versions in review_administrator.
--
-- Attribute comparisons use the NULL-safe equality operator <=>. With plain "=" a
-- row holding NULL in any compared column (for example a missing token) never
-- equals its own copy: it was not kept as unchanged, NOT(...) evaluated to NULL so
-- it was not expired either, and the old version silently disappeared while the
-- row was re-inserted with a fresh effect_start_date.
--
-- NOTE(review): current rows whose admin_id is absent from administrator_tmp1 are
-- matched by none of the three statements below, so they are not carried into
-- administrator_tmp2. Confirm that dropping them is intended.

-- 1) Unchanged rows: the staged row equals the current version in every
--    attribute, so the existing version is kept exactly as it is.
INSERT INTO TABLE administrator_tmp2 PARTITION (current_row_indicator)
SELECT
    cur.admin_id,
    cur.admin_name,
    cur.create_time,
    cur.email,
    cur.password,
    cur.status_description,
    cur.token,
    cur.expire_time,
    cur.granter_user_id,
    cur.admin_time,
    cur.effect_start_date,
    cur.effect_end_date AS effect_end_date,
    cur.current_row_indicator
FROM administrator_tmp1 src
INNER JOIN (
    SELECT * FROM review_administrator
    WHERE current_row_indicator = 'current'
) cur
    ON  src.admin_id = cur.admin_id
    AND src.admin_name <=> cur.admin_name
    AND src.create_time <=> cur.create_time
    AND src.email <=> cur.email
    AND src.password <=> cur.password
    AND src.status_description <=> cur.status_description
    AND src.token <=> cur.token
    AND src.expire_time <=> cur.expire_time
    AND src.granter_user_id <=> cur.granter_user_id
    AND src.admin_time <=> cur.admin_time
;

-- 2) Changed rows, old side: same admin_id but at least one attribute differs.
--    The existing version is closed on the templated previous day and moved to
--    the 'expired' partition.
INSERT INTO TABLE administrator_tmp2 PARTITION (current_row_indicator)
SELECT
    cur.admin_id,
    cur.admin_name,
    cur.create_time,
    cur.email,
    cur.password,
    cur.status_description,
    cur.token,
    cur.expire_time,
    cur.granter_user_id,
    cur.admin_time,
    cur.effect_start_date AS effect_start_date,
    '{{ yesterday_ds }}' AS effect_end_date,
    'expired' AS current_row_indicator
FROM administrator_tmp1 src
INNER JOIN (
    SELECT * FROM review_administrator
    WHERE current_row_indicator = 'current'
) cur
    ON src.admin_id = cur.admin_id
WHERE NOT (
        src.admin_name <=> cur.admin_name
    AND src.create_time <=> cur.create_time
    AND src.email <=> cur.email
    AND src.password <=> cur.password
    AND src.status_description <=> cur.status_description
    AND src.token <=> cur.token
    AND src.expire_time <=> cur.expire_time
    AND src.granter_user_id <=> cur.granter_user_id
    AND src.admin_time <=> cur.admin_time
)
;

-- 3) Changed rows, new side, plus brand-new rows: staged rows that have no
--    identical current version (anti-join) enter as the new 'current' version.
INSERT INTO TABLE administrator_tmp2 PARTITION (current_row_indicator)
SELECT
    src.admin_id,
    src.admin_name,
    src.create_time,
    src.email,
    src.password,
    src.status_description,
    src.token,
    src.expire_time,
    src.granter_user_id,
    src.admin_time,
    src.effect_start_date,
    src.effect_end_date,
    src.current_row_indicator
FROM administrator_tmp1 src
LEFT OUTER JOIN (
    SELECT * FROM review_administrator
    WHERE current_row_indicator = 'current'
) cur
    ON  src.admin_id = cur.admin_id
    AND src.admin_name <=> cur.admin_name
    AND src.create_time <=> cur.create_time
    AND src.email <=> cur.email
    AND src.password <=> cur.password
    AND src.status_description <=> cur.status_description
    AND src.token <=> cur.token
    AND src.expire_time <=> cur.expire_time
    AND src.granter_user_id <=> cur.granter_user_id
    AND src.admin_time <=> cur.admin_time
WHERE cur.admin_id IS NULL
;
-- Carry over the history: versions that are already in the 'expired' partition
-- of the dimension are copied to administrator_tmp2 unchanged.
INSERT INTO TABLE administrator_tmp2 PARTITION (current_row_indicator)
SELECT
    hist.admin_id,
    hist.admin_name,
    hist.create_time,
    hist.email,
    hist.password,
    hist.status_description,
    hist.token,
    hist.expire_time,
    hist.granter_user_id,
    hist.admin_time,
    hist.effect_start_date,
    hist.effect_end_date,
    hist.current_row_indicator
FROM review_administrator hist
WHERE hist.current_row_indicator = 'expired';
-- Publish: overwrite the dimension table with the state assembled in
-- administrator_tmp2. The partition of each row is taken from its
-- current_row_indicator value.
INSERT OVERWRITE TABLE review_administrator PARTITION (current_row_indicator)
SELECT
    staged.admin_id,
    staged.admin_name,
    staged.create_time,
    staged.email,
    staged.password,
    staged.status_description,
    staged.token,
    staged.expire_time,
    staged.granter_user_id,
    staged.admin_time,
    staged.effect_start_date,
    staged.effect_end_date,
    staged.current_row_indicator
FROM administrator_tmp2 staged;
-- Teardown: remove the two staging tables. IF EXISTS matches the guarded drops at
-- the top of the script and keeps the teardown from failing when it is re-run
-- after a table has already been removed.
DROP TABLE IF EXISTS administrator_tmp1;
DROP TABLE IF EXISTS administrator_tmp2;
-- --example data
-- --2017-01-01
-- insert into table review_administrator PARTITION(current_row_indicator)
-- SELECT '1','a','2016-12-31','a@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-01','9999-12-31','current'
-- FROM default.sample_07 limit 1;
-- --2017-01-02
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '1','a','2016-12-31','a01@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-02','9999-12-31','current'
-- FROM default.sample_07 limit 1;
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '2','b','2016-12-31','a@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-02','9999-12-31','current'
-- FROM default.sample_07 limit 1;
-- --2017-01-03
-- --id 1 is changed
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '1','a','2016-12-31','a03@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-03','9999-12-31','current'
-- FROM default.sample_07 limit 1;
-- --id 2 is not changed at all
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '2','b','2016-12-31','a@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-03','9999-12-31','current'
-- FROM default.sample_07 limit 1;
-- --id 3 is a new record
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '3','c','2016-12-31','c@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-03','9999-12-31','current'
-- FROM default.sample_07 limit 1;
-- --now dim table will show you the right SCD.
由于HDFS是不可变存储,因此可以认为数据版本控制和历史记录(SCD2)应该是加载维度的默认行为。您可以在Hadoop SQL查询引擎(Hive、Impala、Drill等)中创建一个视图,使用窗口功能检索当前状态/最新值。您可以在我的博文中找到更多信息,例如如何处理大维度和事实表。在使用SCD管理数据时,我确实使用了另一种方法:
使用控制列 `loaded_on` 和 `checksum`;如果需要,还可以增加一个序列列,用于同一天发生多次加载的情况。然后将新数据与最新一代的数据进行比较,比较时同时使用控制列和数据中已有的键列(如客户键或产品键)。
`checksum` 为每一行创建一个唯一的指纹。随后,指纹(`checksum`)列用于判断与最新一代相比是否有任何列发生了更改(最新一代是指基于键、`loaded_on` 和序列列得到的数据最新状态)。
现在,您知道来自每日更新的行是否是新的,因为没有上一代,或者来自每日更新的行是否需要创建新行(新一代)在历史文件或表中,如果来自每日更新的行没有任何更改,则无需创建行,因为与上一代相比没有差异
可以使用 Apache Spark 构建所需的这类逻辑:只需一条语句,就可以让 Spark 将任意数量、任意数据类型的列拼接起来,然后计算出一个用作指纹的哈希值。
总而言之,现在您可以基于 Spark 开发一个实用程序,它可以接受任何数据源,并输出组织良好、干净且支持缓慢变化维度的历史文件、表等。最后,永远不要更新,只做追加。
下面是使用排他连接(exclusive join)方法在 Hive 中实现缓慢变化维度类型 2 的详细做法。
假设源正在发送完整的数据文件,即旧记录、更新记录和新记录
Steps-
将最近的文件数据加载到STG表
从 HIST 表中选择所有已过期的记录,即满足以下条件的记录:
exp_dt != '2099-12-31'
使用HIST.column=STG.column上的内部联接和过滤器,选择STG和HIST中未更改的所有记录,如下所示
select hist.* from hist_tab hist
inner join stg_tab stg
on hist.key = stg.key
where hist.column = stg.column
选择 STG_TAB 中所有新增和已更新的记录:使用与 HIST_TAB 的排他左连接,并按如下方式设置失效日期和生效日期
select stg.*, eff_dt (yyyy-MM-dd), exp_dt (2099-12-31)
from stg_tab stg
left join
(select * from hist_tab where exp_dt = '2099-12-31') hist
on hist.key = stg.key
where hist.key is null
or hist.column != stg.column
使用与 STG 表的排他左连接,从 HIST 表中选择所有被更新的旧记录,并按如下方式设置其失效日期:
select hist.*, exp_dt (yyyy-MM-dd) from
(select * from hist_tab where exp_dt = '2099-12-31') hist
left join stg_tab stg
on hist.key = stg.key
where hist.key is null
or hist.column != stg.column
对第 2–5 步的查询结果执行 UNION ALL,并将结果以覆盖方式(INSERT OVERWRITE)写入 HIST 表
这里可以找到SCD类型2的更详细的实现-
如果存在harsha.emp,则删除表格;
如果存在harsha.emp_tmp1,则删除表格;
如果存在,则删除表harsha.emp_tmp2;
如果存在harsha.init_load,则删除表格;
显示数据库;
使用哈沙;
展示表格;
创建表harsha.emp(eid int、ename string、sal int、loc string、dept int、开始日期时间戳、结束日期时间戳、当前状态字符串)
评论“emp scd实施”
行格式分隔
以“,”结尾的字段
以“\n”结尾的行
;
创建表harsha.emp\u tmp1(eid int、ename字符串、sal int、loc字符串、dept int、开始日期时间戳、结束日期时间戳、当前状态字符串)
评论“emp scd实施”
行格式分隔
以“,”结尾的字段
以“\n”结尾的行
;
创建表harsha.emp\u tmp2(eid int、ename字符串、sal int、loc字符串、dept int、开始日期时间戳、结束日期时间戳、当前状态字符串)
评论“emp scd实施”
行格式分隔
以“,”结尾的字段
以“\n”结尾的行
;
创建表harsha.init_load(eid int、ename string、sal int、loc string、dept int)
行格式分隔
以“,”结尾的字段
以“\n”结尾的行
;
展示表格;
插入到表harsha.emp中,选择101作为eid,'aaaa'作为ename,3400作为sal,'chicago'作为loc,10作为did,从unixtime(unix_timestamp())作为开始日期,从unixtime(unix_timestamp('9999-12-31 23:59:59','yyyyy-mm-dd hh:mm:ss')作为结束日期,'current'作为当前状态从(选择'123')x;
在表harsha.emp中选择102作为eid,“abaa”作为ename,6400作为sal,“ny”作为loc,10作为did,从\u unixtime(unix\u timestamp())作为开始日期,从\u unixtime(un
-- Reset the demo: remove the dimension table, both staging tables and the
-- landing table if a previous run left them behind.
DROP TABLE IF EXISTS harsha.emp;
DROP TABLE IF EXISTS harsha.emp_tmp1;
DROP TABLE IF EXISTS harsha.emp_tmp2;
DROP TABLE IF EXISTS harsha.init_load;

-- Switch to the harsha database so that unqualified table names used later in
-- the script resolve there. The SHOW statements only list what exists.
SHOW DATABASES;
USE harsha;
SHOW TABLES;
-- harsha.emp: the SCD2 dimension. One row per employee version, valid from
-- start_date to end_date, labelled by current_status.
CREATE TABLE harsha.emp (
    eid             INT,
    ename           STRING,
    sal             INT,
    loc             STRING,
    dept            INT,
    start_date      TIMESTAMP,
    end_date        TIMESTAMP,
    current_status  STRING
)
COMMENT "emp scd implementation"
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
;

-- harsha.emp_tmp1: the incoming load, widened to the dimension layout.
CREATE TABLE harsha.emp_tmp1 (
    eid             INT,
    ename           STRING,
    sal             INT,
    loc             STRING,
    dept            INT,
    start_date      TIMESTAMP,
    end_date        TIMESTAMP,
    current_status  STRING
)
COMMENT "emp scd implementation"
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
;

-- harsha.emp_tmp2: the rows computed for this load before they are appended
-- to harsha.emp.
CREATE TABLE harsha.emp_tmp2 (
    eid             INT,
    ename           STRING,
    sal             INT,
    loc             STRING,
    dept            INT,
    start_date      TIMESTAMP,
    end_date        TIMESTAMP,
    current_status  STRING
)
COMMENT "emp scd implementation"
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
;

-- harsha.init_load: landing table for the raw comma-delimited source file.
-- It holds business columns only.
CREATE TABLE harsha.init_load (
    eid    INT,
    ename  STRING,
    sal    INT,
    loc    STRING,
    dept   INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
;

SHOW TABLES;
-- Seed harsha.emp with seven open-ended 'current' employee versions.
--
-- The end_date sentinel is now written as a timestamp literal. The former
-- expression parsed '9999-12-31 23:59:59' with the pattern 'yyyy-mm-dd hh:mm:ss',
-- in which mm means minutes (MM is the month) and hh is the 12-hour clock (HH is
-- the 24-hour clock), so the month was never read from the literal and the stored
-- end_date was not 9999-12-31 23:59:59.
-- The fifth column alias is now dept, matching the table column (it was did).
-- Values are inserted by position, so this is a readability fix only.
INSERT INTO TABLE harsha.emp
SELECT 101 AS eid, 'aaaa' AS ename, 3400 AS sal, 'chicago' AS loc, 10 AS dept,
       from_unixtime(unix_timestamp()) AS start_date,
       CAST('9999-12-31 23:59:59' AS TIMESTAMP) AS end_date,
       'current' AS current_status
FROM (SELECT '123') x;

INSERT INTO TABLE harsha.emp
SELECT 102 AS eid, 'abaa' AS ename, 6400 AS sal, 'ny' AS loc, 10 AS dept,
       from_unixtime(unix_timestamp()) AS start_date,
       CAST('9999-12-31 23:59:59' AS TIMESTAMP) AS end_date,
       'current' AS current_status
FROM (SELECT '123') x;

INSERT INTO TABLE harsha.emp
SELECT 103 AS eid, 'abca' AS ename, 2300 AS sal, 'sfo' AS loc, 20 AS dept,
       from_unixtime(unix_timestamp()) AS start_date,
       CAST('9999-12-31 23:59:59' AS TIMESTAMP) AS end_date,
       'current' AS current_status
FROM (SELECT '123') x;

INSERT INTO TABLE harsha.emp
SELECT 104 AS eid, 'afga' AS ename, 3000 AS sal, 'seattle' AS loc, 10 AS dept,
       from_unixtime(unix_timestamp()) AS start_date,
       CAST('9999-12-31 23:59:59' AS TIMESTAMP) AS end_date,
       'current' AS current_status
FROM (SELECT '123') x;

INSERT INTO TABLE harsha.emp
SELECT 105 AS eid, 'ikaa' AS ename, 1400 AS sal, 'LA' AS loc, 30 AS dept,
       from_unixtime(unix_timestamp()) AS start_date,
       CAST('9999-12-31 23:59:59' AS TIMESTAMP) AS end_date,
       'current' AS current_status
FROM (SELECT '123') x;

INSERT INTO TABLE harsha.emp
SELECT 106 AS eid, 'cccc' AS ename, 3499 AS sal, 'spokane' AS loc, 20 AS dept,
       from_unixtime(unix_timestamp()) AS start_date,
       CAST('9999-12-31 23:59:59' AS TIMESTAMP) AS end_date,
       'current' AS current_status
FROM (SELECT '123') x;

INSERT INTO TABLE harsha.emp
SELECT 107 AS eid, 'toiz' AS ename, 4000 AS sal, 'WA.DC' AS loc, 40 AS dept,
       from_unixtime(unix_timestamp()) AS start_date,
       CAST('9999-12-31 23:59:59' AS TIMESTAMP) AS end_date,
       'current' AS current_status
FROM (SELECT '123') x;
-- Append two local files to the demo tables: t3.txt goes into the dimension
-- table harsha.emp and t4.txt into the landing table harsha.init_load.
-- The paths are relative.
-- NOTE(review): presumably t3.txt has the eight-column emp layout and t4.txt the
-- five-column init_load layout - verify against the files.
load data local inpath 'Documents/hadoop_scripts/t3.txt' into table harsha.emp;
load data local inpath 'Documents/hadoop_scripts/t4.txt' into table harsha.init_load;
-- Widen the landed rows to the dimension layout: every incoming row becomes an
-- open-ended 'current' version that starts now.
--
-- The end_date sentinel is written as a timestamp literal. The former expression
-- parsed it with the pattern 'yyyy-mm-dd hh:mm:ss', in which mm means minutes
-- (MM is the month) and hh is the 12-hour clock, so the stored value was not
-- 9999-12-31 23:59:59.
INSERT INTO TABLE harsha.emp_tmp1
SELECT
    eid,
    ename,
    sal,
    loc,
    dept,
    from_unixtime(unix_timestamp()) AS start_date,
    CAST('9999-12-31 23:59:59' AS TIMESTAMP) AS end_date,
    'current' AS current_status
FROM harsha.init_load;
-- Compute the rows that this load adds to the dimension and stage them in
-- harsha.emp_tmp2. Three disjoint sets are concatenated:
--   (1) 'updated' - incoming rows with no identical row in emp (new or changed)
--   (2) 'current' - incoming rows that have an identical row in emp
--   (3) 'expired' - emp rows whose eid arrives with at least one different
--                   attribute. They keep their start_date and are closed now.
--
-- Fixes compared with the earlier version:
--   * The 9999-12-31 sentinel was parsed with 'yyyy-mm-dd hh:mm:ss' (mm is
--     minutes, hh is the 12-hour clock) and therefore stored a wrong date. It is
--     now a timestamp literal.
--   * Attributes are compared with the NULL-safe operator <=>. With "=" and "<>"
--     a NULL attribute made an otherwise identical row look new in (1) while it
--     was never expired in (3).
--   * start_date and end_date are cast to TIMESTAMP in every branch, so that the
--     UNION ALL no longer mixes string and timestamp expressions.
--   * Branch (2) was a LEFT OUTER JOIN filtered on b.eid IS NOT NULL, which is an
--     inner join, and is now written as one.
--   * Tables are qualified with the harsha database like the insert target.
--
-- NOTE(review): emp is joined without any filter on current_status, so rows that
-- are already expired and superseded versions take part in the comparison, and
-- the result is later appended to emp. Repeated runs therefore emit duplicate
-- rows. Confirm whether the comparison should be limited to the latest
-- non-expired row per eid.
INSERT INTO TABLE harsha.emp_tmp2
SELECT
    a.eid,
    a.ename,
    a.sal,
    a.loc,
    a.dept,
    CAST(from_unixtime(unix_timestamp()) AS TIMESTAMP) AS start_date,
    CAST('9999-12-31 23:59:59' AS TIMESTAMP) AS end_date,
    'updated' AS current_status
FROM harsha.emp_tmp1 a
LEFT OUTER JOIN harsha.emp b
    ON  a.eid = b.eid
    AND a.ename <=> b.ename
    AND a.sal <=> b.sal
    AND a.loc <=> b.loc
    AND a.dept <=> b.dept
WHERE b.eid IS NULL
UNION ALL
SELECT
    a.eid,
    a.ename,
    a.sal,
    a.loc,
    a.dept,
    CAST(from_unixtime(unix_timestamp()) AS TIMESTAMP) AS start_date,
    CAST('9999-12-31 23:59:59' AS TIMESTAMP) AS end_date,
    'current' AS current_status
FROM harsha.emp_tmp1 a
INNER JOIN harsha.emp b
    ON  a.eid = b.eid
    AND a.ename <=> b.ename
    AND a.sal <=> b.sal
    AND a.loc <=> b.loc
    AND a.dept <=> b.dept
UNION ALL
SELECT
    b.eid,
    b.ename,
    b.sal,
    b.loc,
    b.dept,
    b.start_date AS start_date,
    CAST(from_unixtime(unix_timestamp()) AS TIMESTAMP) AS end_date,
    'expired' AS current_status
FROM harsha.emp b
INNER JOIN harsha.emp_tmp1 a
    ON a.eid = b.eid
WHERE NOT (
        a.ename <=> b.ename
    AND a.sal <=> b.sal
    AND a.loc <=> b.loc
    AND a.dept <=> b.dept
)
;
-- Append the rows staged in emp_tmp2 to the dimension table. INSERT INTO adds
-- to harsha.emp and leaves the rows already stored there in place.
INSERT INTO TABLE harsha.emp
SELECT
    eid,
    ename,
    sal,
    loc,
    dept,
    start_date,
    end_date,
    current_status
FROM emp_tmp2;
-- The two labels below used to be bare text lines, which are not SQL and made the
-- script fail at this point. They are comments now.

-- All records, including the expired ones.
SELECT * FROM harsha.emp ORDER BY eid;

-- Latest records: for every eid, the row whose start_date is the greatest
-- start_date among the rows of that eid that are not marked 'expired'.
SELECT a.*
FROM harsha.emp a
INNER JOIN (
    SELECT
        eid,
        MAX(start_date) AS start_date
    FROM harsha.emp
    WHERE current_status <> 'expired'
    GROUP BY eid
) b
    ON  a.eid = b.eid
    AND a.start_date = b.start_date;