Base SAS:为交叉制表重新构造数据集
我需要重新构造数据集,以生成交叉表输出/数据集。我想回答的问题是:客户在首次注册购买之后又购买了哪些其他产品,以及产品组合是什么——例如,如果客户在注册时购买了 prod3,他们之后是否还会购买 prod2 和 prod4。我的初始数据集如下所示。请注意,有些产品的购买日期与注册日期相同。(标签:sas, crosstab, transpose)
ID Signup_dt Prod_type Purchase_Date
2232 4-Jun-14 prod1 4-Jun-14
2232 4-Jun-14 prod2 4-Jun-14
2232 4-Jun-14 prod3 4-Jun-14
2232 4-Jun-14 prod4
2232 4-Jun-14 prod5 4-Aug-14
4545 12-Jun-14 prod1
4545 12-Jun-14 prod2 13-Sep-14
4545 12-Jun-14 prod3 12-Jun-14
4545 12-Jun-14 prod4 12-Nov-14
4545 12-Jun-14 prod5 12-Jun-14
我需要重新构造数据集,使其看起来像这样:
ID startup_month Products Purchase_at_Start_Up prod1 prod2 prod3 prod4 prod5
2232 June prod1 1 1
2232 June prod2 1 1
2232 June prod3 1 1
2232 June prod4
2232 June prod5
4545 June prod1
4545 June prod2
4545 June prod3 1 1 1
4545 June prod4
4545 June prod5 1 1 1
然后是一个汇总数据集,它给出了产品组合的总数/计数
Startup_month Products Purchase_Start_up_count prod1_dt prod2_dt prod3_dt prod4_dt prod5_dt
June prod1 1 1
June prod2 1 1
June prod3 2 1 2
June prod4
June prod5 1 1 1
我很难想象如何为此编程。我一直在尝试 proc transpose 和 proc freq 的不同组合,但始终无法得到期望的输出。非常感谢您的帮助。

回答:您的问题没有说清楚您希望从第一个汇总表中得到什么,例如,为什么 prod5 只在前 3 行而不是前 5 行有值。尽管如此,您可以结合使用 proc transpose 和数据步骤(data step)构建类似的汇总:
/* Build the sample input: one row per customer (ID) and product, with */
/* the sign-up date and the purchase date (missing when not purchased). */
data have;
    /* DSD keeps an empty trailing field as a missing value; TRUNCOVER  */
    /* stops INPUT from moving to the next record when a field is short. */
    infile datalines dlm = "," dsd truncover;
    /* The colon modifier makes each informat stop at the delimiter, so */
    /* 8-character dates such as 4-Jun-14 do not pull in the comma.     */
    input
        ID
        Signup_dt     :date9.
        Prod_type     $
        Purchase_Date :date9.;
    format Signup_dt Purchase_Date yymmdd10.;
    datalines;
2232,4-Jun-14,prod1,4-Jun-14
2232,4-Jun-14,prod2,4-Jun-14
2232,4-Jun-14,prod3,4-Jun-14
2232,4-Jun-14,prod4,
2232,4-Jun-14,prod5,4-Aug-14
4545,12-Jun-14,prod1,
4545,12-Jun-14,prod2,13-Sep-14
4545,12-Jun-14,prod3,12-Jun-14
4545,12-Jun-14,prod4,12-Nov-14
4545,12-Jun-14,prod5,12-Jun-14
;
run;
/* Derive the sign-up month name and mark products bought on the sign-up date. */
data want1;
    set have;
    /* Month name of the sign-up date as text (MONNAME9., then STRIP). */
    startup_month = strip(put(Signup_dt, monname9.));
    /* 1 when the purchase date equals the sign-up date, otherwise missing. */
    Purchase_at_Start_Up = ifn(Signup_dt = Purchase_Date, 1, .);
    drop Signup_dt Purchase_Date;
run;
/* Flag every row that has a purchase date, giving PROC TRANSPOSE a value */
/* to spread across the product columns.                                  */
data have1;
    set have;
    /* Test for missing explicitly: a bare "if Purchase_Date" treats the  */
    /* SAS date value 0 (01JAN1960) as false and would leave it unflagged. */
    if not missing(Purchase_Date) then flag = 1;
run;
/* Pivot the purchase flags: one row per ID, one column per Prod_type value. */
/* The BY statement requires have1 to be ordered by ID already.              */
proc transpose
    data = have1
    out  = want2 (drop = _NAME_)
    ;
    var flag;
    id  Prod_type;
    by  ID;
run;
/* Attach the per-ID product columns from want2 to each product row of want1. */
data want;
    merge
        want1
        want2
        ;
    by ID;
run;
有趣的问题。您的汇总表不正确:id 4545 在注册时购买了 prod5,但该产品没有额外购买。不过,您想要达到的目标是相当清楚的。上述解决方案也不正确,它把额外购买关联到了没有任何注册时购买的产品上。

下面是第一个汇总表,按 id、启动月份和产品列出:
/* First summary: one row per id, start-up month and product.           */
/* Additional-purchase counts are joined only onto rows whose product   */
/* was purchased on the sign-up date.                                   */
proc sql;
    create table want_sql1 as
    select su.id
          ,strip(put(su.Signup_dt, monname9.)) as startup_month
          ,su.prod_type                        as Products
          ,sum(case when su.Signup_dt = su.Purchase_Date then 1 else . end)
                                               as Purchase_at_Start_Up
          ,sum(ap.prod1) as prod1
          ,sum(ap.prod2) as prod2
          ,sum(ap.prod3) as prod3
          ,sum(ap.prod4) as prod4
          ,sum(ap.prod5) as prod5
    from have as su
    /* Per-id counts of purchases made on a date other than sign-up. */
    left join
         ( select id
                 ,sum(case when prod_type = 'prod1' then 1 else . end) as prod1
                 ,sum(case when prod_type = 'prod2' then 1 else . end) as prod2
                 ,sum(case when prod_type = 'prod3' then 1 else . end) as prod3
                 ,sum(case when prod_type = 'prod4' then 1 else . end) as prod4
                 ,sum(case when prod_type = 'prod5' then 1 else . end) as prod5
           from have
           where Purchase_Date is not null
             and Signup_dt ne Purchase_Date
           group by id
         ) as ap
      /* Match on id, but only for rows purchased at sign-up. */
      on  su.id = ap.id
      and su.Signup_dt = su.Purchase_Date
    group by 1, 2, 3
    ;
quit;
/* Compare the SQL result (want_sql1) against the dataset named want. */
proc compare base = want_sql1 compare = want;
run;
您的第二个汇总表(按启动月份和产品):
/* Second summary: one row per start-up month and product.              */
/* Additional-purchase counts are joined only onto rows whose product   */
/* was purchased on the sign-up date, then summed across ids.           */
proc sql;
create table want_sql2 as
select strip(put(sign_up_purchases.Signup_dt, monname9.)) as startup_month
,sign_up_purchases.prod_type as Products
,sum (case when sign_up_purchases.Signup_dt eq sign_up_purchases.Purchase_Date then 1 end)
as Purchase_at_Start_Up
,sum (additional_purchases.prod1_dt) as prod1_dt
,sum (additional_purchases.prod2_dt) as prod2_dt
,sum (additional_purchases.prod3_dt) as prod3_dt
,sum (additional_purchases.prod4_dt) as prod4_dt
,sum (additional_purchases.prod5_dt) as prod5_dt
from have as sign_up_purchases
/* Per-id counts of purchases made on a date other than sign-up. */
left join
( select id
,sum (case when prod_type eq 'prod1' then 1 end)
as prod1_dt
,sum (case when prod_type eq 'prod2' then 1 end)
as prod2_dt
,sum (case when prod_type eq 'prod3' then 1 end)
as prod3_dt
,sum (case when prod_type eq 'prod4' then 1 end)
as prod4_dt
,sum (case when prod_type eq 'prod5' then 1 end)
as prod5_dt
from have
where Signup_dt ne Purchase_Date
and Purchase_Date is not null
group by id
) as additional_purchases
/* Match on id, but only for rows purchased at sign-up. */
on sign_up_purchases.id eq additional_purchases.id
and sign_up_purchases.Signup_dt eq sign_up_purchases.Purchase_Date
group by 1,2
;
quit;
下面是一个使用数据步骤(data step)、proc transpose 和 proc summary 的版本,它生成 2 个输出,请参见 want1 和 want2。对于大多数应用场景来说,这种做法的工作量比 SQL 版本大得多;SQL 更直观、更强大、更灵活。
/* Mark additional purchases: a purchase date exists and differs from  */
/* the sign-up date.                                                   */
data additional_Purchases1;
    set have;
    /* Every input row is kept, flagged or not, so that the later      */
    /* PROC TRANSPOSE creates columns for all products (prod1-prod5)   */
    /* and not only for those that had additional purchases.           */
    additional_Purchases = ifn(Purchase_Date ne . and Signup_dt ne Purchase_Date, 1, .);
run;
/* Total the additional-purchase flag for each id / Prod_type combination. */
proc summary data = additional_Purchases1 nway;
    class id Prod_type;
    var additional_Purchases;
    output
        out = additional_Purchases2
        sum(additional_Purchases) = additional_Purchases
        ;
run;
/* Pivot the summed additional purchases: one row per ID, one column per */
/* Prod_type value.                                                      */
proc transpose data = additional_Purchases2
               out  = additional_Purchases (drop = _NAME_);
    var additional_Purchases;
    id  Prod_type;
    by  ID;
run;
/* Attach each id's additional-purchase counts to its sign-up rows,     */
/* producing one output row per id and product.                         */
data want1 (keep = id startup_month Products Purchase_at_Start_Up prod1-prod5);
    /* FORMAT appears before MERGE, so these variables are defined first. */
    format id                   10.
           startup_month
           Products             $10.
           Purchase_at_Start_Up 10.
           ;
    merge have                 (in     = sign_up_purchases
                                rename = (prod_type = Products))
          additional_Purchases (in     = additional_purchases
                                rename = (prod1-prod5 = oprod1-oprod5))
          ;
    by id;
    /* Keep only observations that come from the sign-up data. */
    if not sign_up_purchases then delete;

    startup_month = strip(put(Signup_dt, monname9.));

    /* The merged counts arrive as oprod1-oprod5 and are copied into new */
    /* prod1-prod5 variables. Author's note: setting the merged          */
    /* variables to missing in place was tried, but the re-assignment    */
    /* was retained on the next matching observation.                    */
    array merged_cnt {5} oprod1-oprod5;
    array out_cnt    {5} prod1-prod5;

    /* Only products purchased at sign-up receive the additional counts. */
    if Signup_dt = Purchase_Date then do;
        Purchase_at_Start_Up = 1;
        do k = 1 to dim(out_cnt);
            out_cnt{k} = merged_cnt{k};
        end;
    end;
run;
/* Summarise sign-up and additional purchases by start-up month and product. */
proc summary
    nway
    data = want1
    ;
    class startup_month products;
    var Purchase_at_Start_Up prod1-prod5;
    /* The sums are named positionally, in the order of the VAR list. */
    output out = want2 (drop = _type_ _freq_)
           sum = Purchase_at_Start_up_count prod1_dt prod2_dt prod3_dt prod4_dt prod5_dt
           ;
/* PROC SUMMARY is not a run-group procedure, so the step ends with RUN. */
run;