Base SAS:为交叉制表重新构造数据集

Base SAS:为交叉制表重新构造数据集,sas,crosstab,transpose,Sas,Crosstab,Transpose,我需要重新构造数据集,以生成交叉表输出/数据集。我想回答的问题是,在首次注册购买后,购买了哪些其他产品,以及产品组合是什么——因此,如果客户在注册时购买了prod3,他们最终也会购买prod2和prod4 我开始使用的数据集如下所示。请注意,有些产品购买日期与启动日期相同 ID Signup_dt Prod_type Purchase_Date 2232 4-Jun-14 prod1 4-Jun-14 2232 4-Jun-14 prod2 4-Jun-

我需要重新构造数据集,以生成交叉表输出/数据集。我想回答的问题是,在首次注册购买后,购买了哪些其他产品,以及产品组合是什么——因此,如果客户在注册时购买了prod3,他们最终也会购买prod2和prod4

我手头的原始数据集如下所示。请注意,有些产品的购买日期(Purchase_Date)与注册日期(Signup_dt)相同

ID  Signup_dt   Prod_type   Purchase_Date
2232    4-Jun-14    prod1   4-Jun-14
2232    4-Jun-14    prod2   4-Jun-14
2232    4-Jun-14    prod3   4-Jun-14
2232    4-Jun-14    prod4
2232    4-Jun-14    prod5   4-Aug-14
4545    12-Jun-14   prod1
4545    12-Jun-14   prod2   13-Sep-14
4545    12-Jun-14   prod3   12-Jun-14
4545    12-Jun-14   prod4   12-Nov-14
4545    12-Jun-14   prod5   12-Jun-14
我需要重新构造数据集,使其看起来像这样:

ID  startup_month   Products Purchase_at_Start_Up    prod1   prod2   prod3   prod4   prod5
2232    June         prod1            1                                               1
2232    June         prod2            1                                               1
2232    June         prod3            1                                               1
2232    June         prod4                      
2232    June         prod5                      
4545    June         prod1                      
4545    June         prod2                      
4545    June         prod3            1                      1                        1
4545    June         prod4                      
4545    June         prod5            1                      1                        1
然后是一个汇总数据集,它给出了产品组合的总数/计数

Startup_month   Products Purchase_Start_up_count prod1_dt    prod2_dt    prod3_dt    prod4_dt prod5_dt
June              prod1           1                                                             1
June              prod2           1                                                             1
June              prod3           2                1                                            2
June              prod4                     
June              prod5           1                1                                            1

我很难想象如何为此编程。我一直在尝试 PROC TRANSPOSE 和 PROC FREQ 的不同组合,但始终无法接近期望的输出。非常感谢您的帮助。

您的问题没有说明您希望第一个汇总表具体包含什么,例如,为什么prod5列只在前3行有值,而不是在前5行都有值。尽管如此,您可以结合使用 PROC TRANSPOSE 和 DATA 步来构建类似的汇总:

/* Build the sample data set HAVE from comma-delimited in-stream rows:       */
/* one row per customer ID and product type. Purchase_Date is left empty on  */
/* rows where no purchase date was recorded.                                 */
data have;
    /* DSD keeps an empty trailing field as a missing value; TRUNCOVER stops */
    /* INPUT from flowing onto the next record when a row ends early.        */
    infile datalines dlm = "," dsd truncover;
    /* The colon modifier makes each informat stop at the delimiter instead  */
    /* of reading a fixed 9 columns (the dates are 8 or 9 characters wide).  */
    input 
        ID
        Signup_dt     :date9.
        Prod_type     $
        Purchase_Date :date9.;
    format Signup_dt Purchase_Date yymmdd10.;
datalines;
2232,4-Jun-14,prod1,4-Jun-14
2232,4-Jun-14,prod2,4-Jun-14
2232,4-Jun-14,prod3,4-Jun-14
2232,4-Jun-14,prod4,
2232,4-Jun-14,prod5,4-Aug-14
4545,12-Jun-14,prod1,
4545,12-Jun-14,prod2,13-Sep-14
4545,12-Jun-14,prod3,12-Jun-14
4545,12-Jun-14,prod4,12-Nov-14
4545,12-Jun-14,prod5,12-Jun-14
;
run;
/* Derive the signup month name and a 1/missing flag that marks products */
/* whose purchase date equals the signup date; the raw dates are dropped. */
data want1 (drop = Signup_dt Purchase_Date);
    set have;
    startup_month = strip(put(Signup_dt, monname9.));
    Purchase_at_Start_Up = ifn(Purchase_Date = Signup_dt, 1, .);
run;
/* Add a 1/missing flag for rows that have a purchase date; the flag is the */
/* value that PROC TRANSPOSE later spreads into one column per product.     */
data have1;
    set have;
    /* Test for missing explicitly: a bare numeric condition treats the value */
    /* 0 (SAS date 01JAN1960) as false and would leave that purchase unflagged. */
    if not missing(Purchase_Date) then flag = 1;
run;
/* Pivot the purchase flags: one output row per ID, one column per Prod_type value. */
proc transpose
    data = have1
    out  = want2 (drop = _NAME_)
    ;
    var flag;
    id  Prod_type;
    by  ID;
run;
/* Match-merge by ID: every product row in WANT1 receives the product columns of its ID from WANT2. */
data want;
    merge
        want1
        want2
        ;
    by ID;
run;

有趣的问题

您的汇总表不正确。id 4545已注册购买prod5,但没有该产品的额外购买。你想要达到的目标是相当清楚的。上述解决方案也不正确。它将附加销售加入到没有任何签约购买的产品中

按id、启动月份和产品列出的第一个表格汇总表

/* WANT_SQL1: one row per id, signup month and product.                      */
/* Purchase_at_Start_Up counts rows whose purchase date equals the signup    */
/* date. prod1-prod5 hold that id's purchases made on a different date, and  */
/* are joined only onto rows whose product was bought at signup.             */
proc sql;
    create table want_sql1 as
    select
        su.id,
        strip(put(su.Signup_dt, monname9.)) as startup_month,
        su.prod_type as Products,
        sum(case when su.Signup_dt eq su.Purchase_Date then 1 end) as Purchase_at_Start_Up,
        sum(addl.prod1) as prod1,
        sum(addl.prod2) as prod2,
        sum(addl.prod3) as prod3,
        sum(addl.prod4) as prod4,
        sum(addl.prod5) as prod5
    from have as su
    left join
        (
            /* Per id: count, by product, the purchases dated other than the signup date. */
            select
                id,
                sum(case when prod_type eq 'prod1' then 1 end) as prod1,
                sum(case when prod_type eq 'prod2' then 1 end) as prod2,
                sum(case when prod_type eq 'prod3' then 1 end) as prod3,
                sum(case when prod_type eq 'prod4' then 1 end) as prod4,
                sum(case when prod_type eq 'prod5' then 1 end) as prod5
            from have
            where Purchase_Date is not null
              and Signup_dt ne Purchase_Date
            group by id
        ) as addl
        /* Match on id, and only for rows whose product was purchased at signup. */
        on  su.id eq addl.id
        and su.Signup_dt eq su.Purchase_Date
    group by 1, 2, 3
    ;
quit;


/* Compare the SQL result (base) with the data set WANT (compare). */
proc compare base = want_sql1 compare = want;
run;
您的第二个汇总表(按启动月份和产品):

/* WANT_SQL2: one row per signup month and product.                          */
/* Purchase_at_Start_Up counts rows whose purchase date equals the signup    */
/* date. prodN_dt totals the additional purchases (purchase date differs     */
/* from the signup date) of the ids that bought the row's product at signup. */
proc sql;
    create table want_sql2 as

    select strip(put(sign_up_purchases.Signup_dt, monname9.))   as startup_month

        ,sign_up_purchases.prod_type                as Products

        ,sum (case when sign_up_purchases.Signup_dt eq sign_up_purchases.Purchase_Date then 1 end) 
                                        as Purchase_at_Start_Up

        ,sum (additional_purchases.prod1_dt)            as prod1_dt    
        ,sum (additional_purchases.prod2_dt)            as prod2_dt    
        ,sum (additional_purchases.prod3_dt)            as prod3_dt    
        ,sum (additional_purchases.prod4_dt)            as prod4_dt    
        ,sum (additional_purchases.prod5_dt)            as prod5_dt    

    from    have    as  sign_up_purchases

/*      Per id: count, by product, the purchases dated other than the signup date. */
    left    join
        (   select  id
                ,sum (case when prod_type eq 'prod1' then 1 end)
                                        as prod1_dt    

                ,sum (case when prod_type eq 'prod2' then 1 end)
                                        as prod2_dt    

                ,sum (case when prod_type eq 'prod3' then 1 end)
                                        as prod3_dt    

                ,sum (case when prod_type eq 'prod4' then 1 end)
                                        as prod4_dt    

                ,sum (case when prod_type eq 'prod5' then 1 end)
                                        as prod5_dt    

            from    have    

            where   Signup_dt   ne  Purchase_Date 
            and Purchase_Date is    not null

            group   by  id
        )                   as  additional_purchases
/*      Join by id, and only for rows whose product was purchased at signup. */
    on  sign_up_purchases.id        eq  additional_purchases.id
    and sign_up_purchases.Signup_dt eq  sign_up_purchases.Purchase_Date         

    group   by  1,2
    ;
quit;   

这是一个使用 DATA 步、PROC TRANSPOSE 和 PROC SUMMARY 的版本,它生成2个输出数据集,请参见want1和want2

对于大多数应用程序来说,这方面的工作比SQL版本多得多。SQL更直观、更强大、更灵活

/*  Flag purchases made on a date other than the signup date.             */
/*  Every input row is kept, flagged or not, so that PROC TRANSPOSE later */
/*  creates a column for every product (prod1-prod5) and not only for the */
/*  products that had additional purchases.                               */
data additional_Purchases1;
    set have;
    additional_Purchases = ifn(Purchase_Date ne . and Purchase_Date ne Signup_dt, 1, .);
run;



/*  Total the additional-purchase flags for each id / product combination. */
proc summary data = additional_Purchases1 nway;
    class id Prod_type;
    var additional_Purchases;
    output out = additional_Purchases2
        sum(additional_Purchases) = additional_Purchases;
run;



/* Pivot to one row per ID with one column per product holding its additional-purchase total. */
proc transpose data = additional_Purchases2
               out  = additional_Purchases (drop = _NAME_);
    var additional_Purchases;
    id  Prod_type;
    by  ID;
run;



/*  Join sign-up purchases to additional purchases for each id and product.  */
/*  Output: one row per input row of HAVE with id, startup_month, Products,  */
/*  Purchase_at_Start_Up and prod1-prod5. The prod1-prod5 columns are filled */
/*  only on rows whose product was purchased on the signup date.             */
data want1 
    (keep = id startup_month Products Purchase_at_Start_Up prod1-prod5) 
    ;

    /*  Placed ahead of MERGE so these variables lead the output column order. */
    format id           10.
        startup_month
        Products        $10.
        Purchase_at_Start_Up 10.
        ;

    merge   have    (   in=sign_up_purchases 
                rename = (prod_type=Products)
            )
        additional_Purchases    
            (   in=additional_purchases 
                rename =(prod1-prod5 = oprod1-oprod5)
            )
        ;
    by id;

    /*  Keep only rows that come from the sign-up table. */
    if  sign_up_purchases;

    startup_month = strip (put(Signup_dt, monname9.));

/*  The transposed columns are renamed to oprod1-oprod5 and copied into     */
/*  fresh prod1-prod5 variables. Overwriting the merged prod1-prod5 in      */
/*  place did not work: the re-assigned value was retained on the next      */
/*  matching observation regardless of the values on the input table.       */
    array oprod {5} oprod1-oprod5;
    array prod  {5} prod1-prod5;

/*  Copy additional purchases only for products purchased at signup.        */
/*  An explicit indexed loop replaces the obsolete DO OVER syntax; the loop */
/*  index is excluded from the output by the KEEP= list above.              */
    if  Signup_dt = Purchase_Date 
        then    do;
                Purchase_at_Start_Up = 1;
                do _idx = 1 to dim(prod);
                    prod{_idx} = oprod{_idx};
                end;
            end;        
run;


/*  Summarise sign-up and additional purchases by startup month and product. */
/*  NWAY keeps only the full startup_month * products combinations; the      */
/*  automatic _TYPE_ and _FREQ_ columns are dropped from the output.         */
proc summary 
    nway
    data=want1
    ;
    class startup_month products;
    var Purchase_at_Start_Up prod1-prod5;
    output  out = want2 (drop=_type_ _freq_)
        sum = Purchase_at_Start_up_count prod1_dt prod2_dt prod3_dt prod4_dt prod5_dt
        ;
/*  PROC SUMMARY is not a RUN-group procedure, so the step ends with RUN. */
run;