SAS或PROC SQL中按组划分的累积最大值

SAS或PROC SQL中按组划分的累积最大值,sas,Sas,我想计算另一列分组的累计最大值 假设我有这些数据: data have; input grp $ number; datalines; a 3 b 4 a 5 b 2 a 1 b 8 ; 我期望的结果是: data want; input grp $ cummax; a 3 b 4 a 5 b 4 a 5 b 8 ; 我的实际案例将涉及多个分组列+过滤器,理想情况下,此累积最大值将同时在多个列上计算 我主要关心的是计算效率,因为我将在一千万到一亿行的表上运行它。Proc SQL或本机SAS都

我想计算另一列分组的累计最大值

假设我有这些数据:

data have;
    /* Sample input: one character group key and one numeric value per row. */
    /* Row order matters: the cumulative max is taken in this order.        */
    infile datalines;
    input grp $ number;
    datalines;
a 3
b 4
a 5
b 2
a 1
b 8
;
我期望的结果是(每行给出该组到当前行为止的累计最大值):

/* Expected result: for every input row, the running maximum of NUMBER   */
/* within its GRP, in original row order.                                */
/* The DATALINES statement is required before in-stream data; without it */
/* the data rows are parsed as SAS statements and the step fails.        */
data want;
input grp $ cummax;
datalines;
a 3
b 4
a 5
b 4
a 5
b 8
;
我的实际案例将涉及多个分组列+过滤器,理想情况下,此累积最大值将同时在多个列上计算

我主要关心的是计算效率,因为我将在一千万到一亿行的表上运行它。Proc SQL或本机SAS都是受欢迎的

如有必要,可以打乱行的顺序(即不要求保留原始行序)

系统信息

/* Write the installed SAS products and their versions to the log. */
proc product_status;
run;

类似下面这样的代码应该可行。如果要保留原始顺序,请先添加一个行计数器,最后再按该计数器排序:

/* Order the rows by group so BY-group processing can be used. */
/* HAVE is sorted in place.                                    */
proc sort data=have;
    by grp;
run;

/* Overwrite NUMBER with the running maximum inside each GRP. */
data new;
    set have;
    by grp;
    /* NEWNUM carries the maximum seen so far across rows. */
    retain newnum;
    /* Restart at the first row of a group; otherwise raise the max when beaten. */
    if first.grp or number > newnum then newnum = number;
    number = newnum;
    drop newnum;
run;
使用哈希而不进行排序

/* Cumulative maximum of NUMBER per GRP in a single pass, without sorting. */
/* The hash H maps each GRP to the largest NUMBER seen so far (VALUE).     */
/* Output keeps GRP and MAX; rows stay in their original order.            */
data want;
  if _n_=1 then do;
    declare hash h();
    h.definekey('grp');
    h.definedata('value');
    h.definedone();
  end;
  set have;
  if h.find()^=0 then do;
    /* First row of this group: it is the maximum by definition. */
    max=number;
    h.add(key:grp,data:number);
  end;
  else do;
    /* FIND loaded the previous maximum into VALUE. */
    max=max(number,value);
    /* Store the running MAX, not the current NUMBER; otherwise a smaller */
    /* value would overwrite the maximum and later rows would be wrong.   */
    h.replace(key:grp,data:max);
  end;
  drop value number;
run;

使用
散列
对象存储每个变量和组组合的最大值。这将允许您单次传递数据集,并编写一些可以根据组和变量数量进行缩放的代码

这不需要在大型数据集上花费高昂的排序

测试数据

/* Build a 1,000,000-row test table with five one-character group keys    */
/* (grp1-grp5, letters A-E) and five numeric values (val1-val5).          */
/* Only the grp: and val: columns are kept; helper variables are dropped. */
data example;
format grp1-grp5 $1.;
array grp[5];
array val[5];
do row_id = 1 to 1000000;
    /* Pick one letter of "ABCDE" per group key. */
    do g = 1 to dim(grp);
        pick = ceil(5 * ranuni(1));
        grp[g] = substr("ABCDE", pick, 1);
    end;
    /* Values come from RANNOR scaled by 10. */
    do v = 1 to dim(val);
        val[v] = rannor(1) * 10;
    end;
    output;
end;
keep grp: val:;
run;
计算累积最大值的数据步骤

/* Running maximum of val1-val5 for every grp1-grp5 combination, one pass. */
/* The hash MX holds max1-max5 per key combination.                        */
data want;
set example;
array val[5];
array max[5];
if _n_ = 1 then do;
    declare hash mx();
    status = mx.defineKey('grp1','grp2','grp3','grp4','grp5');
    status = mx.definedata('max1','max2','max3','max4','max5');
    status = mx.definedone();
end;

/* A zero return code means max1-max5 now hold the stored maxima. */
status = mx.find();

/* New combination: seed with the current values. Known combination: */
/* raise each maximum that the current value beats.                  */
do k = 1 to dim(val);
    if status ne 0 or val[k] > max[k] then max[k] = val[k];
end;

/* REPLACE adds the key when it is absent and overwrites it otherwise. */
status = mx.replace();

n = _n_; /* original row number, kept for testing */
drop status k;
run;
使用该测试变量
n
,我们可以在组内保持原始顺序的前提下按组排序,检查结果是否正确。(提示:结果确实正确)


我围绕@DomPazz的解决方案构建了一个宏函数,可以选择根据哪些列进行分组,计算哪些列,最后删除或保留哪些列

我认为包括的例子很简单

我在底部加入了我在
cummax
中使用的快捷宏函数

*------------------------------------------------------------;
* CUMMAX                                                     ;
* Compute a cumulative max on 1 or several variables grouped ;
* by one or several variables;                               ;
*------------------------------------------------------------;
/* EXAMPLE:
data have;
format grp1-grp2 $1.;
array grp[2];
array val[3];
do rows=1 to 20;
    do i=1 to 2;
        r = ceil(ranuni(1)*2);
        grp[i] = substr("AB",r,1);
    end;
    do j=1 to 3;
        val[j] = 10*rannor(1);
    end;
    output;
end;
keep grp: val:;
run;

%cummax(have,grp=grp1 grp2,val=val1 val2,out= want1)
%cummax(have,grp=grp1,val=val1,drop=grp2 val3,out= want2)
%cummax(have,grp=grp1,val=val1,keep= val2,out= want3)
*/

%macro cummax
(data  /* source table */
,grp=  /* variables to group on */
,val=  /* variables to compute on */
,keep= /* variables to keep additionally to grp and computed columns, don't use with drop */
,drop= /* variables to drop, don't use with keep */
,out=  /* output table */
);
/* Compute, in one pass and without sorting, the cumulative maximum of   */
/* every variable in VAL= within each combination of the GRP= variables. */
/* Results are written to new columns named <variable>_cmax.             */
/* When OUT= is omitted the source table is overwritten.                 */
/* The data step uses RC and I as work variables and drops them.         */

/* Fail early with a clear message instead of an obscure downstream error. */
%if %length(&data) = 0 or %length(&grp) = 0 or %length(&val) = 0 %then %do;
    %put ERROR: CUMMAX requires a source table and non-empty GRP= and VAL= lists.;
    %return;
%end;
/* KEEP= and DROP= would generate two consecutive option lists, which is invalid. */
%if %length(&keep) > 0 and %length(&drop) > 0 %then %do;
    %put ERROR: CUMMAX accepts either KEEP= or DROP= but not both.;
    %return;
%end;

/* default output */
%if not %length(&out) %then %let out = &data;

/* rework keep and drop into data set options */
%local n_val max_val;
%let n_val   = %list_length(&val);
%let max_val = %list_fix(&val,suffix=_cmax);
%if %length(&keep) %then %let keep = (keep= &keep &grp &max_val );
%if %length(&drop) %then %let drop = (drop= &drop);

/* data step */
data &out&keep&drop;
set &data;
array val[&n_val] &val;
array max[&n_val] &max_val;

if _n_ = 1 then do;
    declare hash mx();
    rc = mx.defineKey(%list_quote_comma(&grp));
    rc = mx.definedata(%list_quote_comma(&max_val));
    rc = mx.definedone();
end;

/* A zero return code loads the stored maxima into the _cmax variables. */
rc = mx.find();
/*No Max for this combination -- add it*/
if rc then do;
    do i=1 to &n_val;
        max[i] = val[i];
    end;
end;

/*Update Max Values*/
do i=1 to &n_val;
    if val[i] > max[i] then
        max[i] = val[i];
end;

/*Update Hash: REPLACE adds the key when it is not present yet*/
rc = mx.replace();

drop rc i;
run;

%mend;


*---------------------------------------------------------------;
* LIST_LENGTH                                                   ;
* Length of space separated list                                ;
*---------------------------------------------------------------;
/* EXAMPLES :                                                    
   %put %list_length(item1 item2 item3);                                            
*/
%macro list_length
(data
);
/* Return the number of items in a space separated list.                */
/* An empty list returns 0; COUNTW called through %SYSFUNC would fail   */
/* with a "too few arguments" error if it were given an empty argument. */
%if %length(&data) %then %sysfunc(countw(&data,%str( )));
%else 0;
%mend;

*---------------------------------------------------------------;
* LIST_QUOTE_COMMA                                              ;
* create comma separated list with quoted items, from           ;
* unquoted space separated list.                                ;
*---------------------------------------------------------------;
/* EXAMPLE
%put %list_quote_comma(a b c);
*/
%macro list_quote_comma
(data /* space separated list to quote */
);
/* Turn  a b c  into  'a','b','c'.                                         */
/* COMPBL first collapses runs of blanks into a single blank; otherwise    */
/* each extra blank would produce an empty quoted item ('') in the result. */
%unquote(%str(%')%qsysfunc(tranwrd(%qsysfunc(compbl(&data)),%str( ),%str(%',%')))%str(%'))
%mend;

*---------------------------------------------------------------;
* LIST_FIX                                                      ;
* Add prefix and/or suffix to items of space separated list     ;
*---------------------------------------------------------------;
/* EXAMPLES :                                                    
   %put %list_fix(item1 item2 item3,pref_,_suf);                 
   %put %list_fix(item1 item2 item3,pref_);                 
   %put %list_fix(item1 item2 item3,suffix=_suf);                                           
*/

%macro list_fix
(data
,prefix
,suffix
);
/* Return the space separated list DATA with PREFIX and/or SUFFIX added  */
/* to every item. An empty list returns nothing.                         */
/* The loop index is declared %LOCAL so that a macro variable I in the   */
/* calling scope is not overwritten.                                     */
%local i output;
%if %length(&data) %then %do;
  %do i=1 %to %sysfunc(countw(&data,%str( )));
    %let output= &output &prefix.%scan(&data,&i,%str( ))&suffix;
  %end;
%end;
&output
%mend;

您有SAS ETS许可证吗?你试过什么,即使是简单的案例?您应该扩展示例数据,以更好地反映您在几个分组变量中的实际情况。我很可能有许可证,但我不确定如何检查 :)。我提供了简短的数据来说明我的观点,但你是对的,我将用一个更复杂的案例进行编辑。运行 proc setinit; run; 可查看已获许可的内容,
运行 proc product_status; run; 可查看已安装的内容。我在上面添加了我的系统信息,我需要更多的时间来制作一个复杂的示例表,同时解决简单的示例将是一个很大的帮助。下面的解决方案很有效。在这里我不推荐SQL,除非您可以使用直通(pass-through)并调用某些DBMS函数。您没有SAS/ETS,否则它可以让您使用PROC EXPAND。谢谢,它有效。我会看看是否有人提供效率更高的方案,然后再采纳 :)。也许 if number > newnum then newnum=number; 可以改为 else if number > newnum then newnum=number;。此解决方案简单且效果良好。但是,它需要排序,因此不保留原始数据的顺序。我将行数增加到100000000(4.5G输入大小)。我的笔记本电脑用了1:55.89算出最大值。我测试了它,它工作正常。代码看起来也很优雅。但可能是我不太了解hash对象及其方法。让我困惑的是,如果在mx散列中找不到组合键,那么就将val分配给max变量。但是您并没有真正将它们添加到哈希中,更新max的下一步也是如此。这些值是如何添加到mx中的?如果你能进一步解释的话,那将非常有帮助。谢谢。如果键不存在,replace()方法会添加这些值。您可以在该块中加上mx.add(),但没有必要。现在我明白了,它与find()配合工作。当组合已经在散列中时,find()会从散列中取出之前的最大值,并把它们赋给当前的max变量。然后将当前val变量与这些之前的最大值进行比较。如果val较大,则赋为新的最大值,并使用replace()把新值更新到哈希中。非常清晰优雅,谢谢!我喜欢使用哈希的解决方案,它足够简单,并且保留了原始顺序
/* Cumulative maximum of NUMBER per GRP in a single pass, without sorting. */
/* The hash H maps each GRP to the largest NUMBER seen so far (VALUE).     */
/* Output keeps GRP and MAX; rows stay in their original order.            */
data want;
  if _n_=1 then do;
    declare hash h();
    h.definekey('grp');
    h.definedata('value');
    h.definedone();
  end;
  set have;
  if h.find()^=0 then do;
    /* First row of this group: it is the maximum by definition. */
    max=number;
    h.add(key:grp,data:number);
  end;
  else do;
    /* FIND loaded the previous maximum into VALUE. */
    max=max(number,value);
    /* Store the running MAX, not the current NUMBER; otherwise a smaller */
    /* value would overwrite the maximum and later rows would be wrong.   */
    h.replace(key:grp,data:max);
  end;
  drop value number;
run;
/* Build a 1,000,000-row test table with five one-character group keys    */
/* (grp1-grp5, letters A-E) and five numeric values (val1-val5).          */
/* Only the grp: and val: columns are kept; helper variables are dropped. */
data example;
format grp1-grp5 $1.;
array grp[5];
array val[5];
do row_id = 1 to 1000000;
    /* Pick one letter of "ABCDE" per group key. */
    do g = 1 to dim(grp);
        pick = ceil(5 * ranuni(1));
        grp[g] = substr("ABCDE", pick, 1);
    end;
    /* Values come from RANNOR scaled by 10. */
    do v = 1 to dim(val);
        val[v] = rannor(1) * 10;
    end;
    output;
end;
keep grp: val:;
run;
/* Running maximum of val1-val5 for every grp1-grp5 combination, one pass. */
/* The hash MX holds max1-max5 per key combination.                        */
data want;
set example;
array val[5];
array max[5];
if _n_ = 1 then do;
    declare hash mx();
    status = mx.defineKey('grp1','grp2','grp3','grp4','grp5');
    status = mx.definedata('max1','max2','max3','max4','max5');
    status = mx.definedone();
end;

/* A zero return code means max1-max5 now hold the stored maxima. */
status = mx.find();

/* New combination: seed with the current values. Known combination: */
/* raise each maximum that the current value beats.                  */
do k = 1 to dim(val);
    if status ne 0 or val[k] > max[k] then max[k] = val[k];
end;

/* REPLACE adds the key when it is absent and overwrites it otherwise. */
status = mx.replace();

n = _n_; /* original row number, kept for testing */
drop status k;
run;

/* Group the rows by key combination, keeping original row order (N) */
/* inside each group, so the running maxima can be inspected.        */
proc sort data=want;
    by grp: n;
run;
*------------------------------------------------------------;
* CUMMAX                                                     ;
* Compute a cumulative max on 1 or several variables grouped ;
* by one or several variables;                               ;
*------------------------------------------------------------;
/* EXAMPLE:
data have;
format grp1-grp2 $1.;
array grp[2];
array val[3];
do rows=1 to 20;
    do i=1 to 2;
        r = ceil(ranuni(1)*2);
        grp[i] = substr("AB",r,1);
    end;
    do j=1 to 3;
        val[j] = 10*rannor(1);
    end;
    output;
end;
keep grp: val:;
run;

%cummax(have,grp=grp1 grp2,val=val1 val2,out= want1)
%cummax(have,grp=grp1,val=val1,drop=grp2 val3,out= want2)
%cummax(have,grp=grp1,val=val1,keep= val2,out= want3)
*/

%macro cummax
(data  /* source table */
,grp=  /* variables to group on */
,val=  /* variables to compute on */
,keep= /* variables to keep additionally to grp and computed columns, don't use with drop */
,drop= /* variables to drop, don't use with keep */
,out=  /* output table */
);
/* Compute, in one pass and without sorting, the cumulative maximum of   */
/* every variable in VAL= within each combination of the GRP= variables. */
/* Results are written to new columns named <variable>_cmax.             */
/* When OUT= is omitted the source table is overwritten.                 */
/* The data step uses RC and I as work variables and drops them.         */

/* Fail early with a clear message instead of an obscure downstream error. */
%if %length(&data) = 0 or %length(&grp) = 0 or %length(&val) = 0 %then %do;
    %put ERROR: CUMMAX requires a source table and non-empty GRP= and VAL= lists.;
    %return;
%end;
/* KEEP= and DROP= would generate two consecutive option lists, which is invalid. */
%if %length(&keep) > 0 and %length(&drop) > 0 %then %do;
    %put ERROR: CUMMAX accepts either KEEP= or DROP= but not both.;
    %return;
%end;

/* default output */
%if not %length(&out) %then %let out = &data;

/* rework keep and drop into data set options */
%local n_val max_val;
%let n_val   = %list_length(&val);
%let max_val = %list_fix(&val,suffix=_cmax);
%if %length(&keep) %then %let keep = (keep= &keep &grp &max_val );
%if %length(&drop) %then %let drop = (drop= &drop);

/* data step */
data &out&keep&drop;
set &data;
array val[&n_val] &val;
array max[&n_val] &max_val;

if _n_ = 1 then do;
    declare hash mx();
    rc = mx.defineKey(%list_quote_comma(&grp));
    rc = mx.definedata(%list_quote_comma(&max_val));
    rc = mx.definedone();
end;

/* A zero return code loads the stored maxima into the _cmax variables. */
rc = mx.find();
/*No Max for this combination -- add it*/
if rc then do;
    do i=1 to &n_val;
        max[i] = val[i];
    end;
end;

/*Update Max Values*/
do i=1 to &n_val;
    if val[i] > max[i] then
        max[i] = val[i];
end;

/*Update Hash: REPLACE adds the key when it is not present yet*/
rc = mx.replace();

drop rc i;
run;

%mend;


*---------------------------------------------------------------;
* LIST_LENGTH                                                   ;
* Length of space separated list                                ;
*---------------------------------------------------------------;
/* EXAMPLES :                                                    
   %put %list_length(item1 item2 item3);                                            
*/
%macro list_length
(data
);
/* Return the number of items in a space separated list.                */
/* An empty list returns 0; COUNTW called through %SYSFUNC would fail   */
/* with a "too few arguments" error if it were given an empty argument. */
%if %length(&data) %then %sysfunc(countw(&data,%str( )));
%else 0;
%mend;

*---------------------------------------------------------------;
* LIST_QUOTE_COMMA                                              ;
* create comma separated list with quoted items, from           ;
* unquoted space separated list.                                ;
*---------------------------------------------------------------;
/* EXAMPLE
%put %list_quote_comma(a b c);
*/
%macro list_quote_comma
(data /* space separated list to quote */
);
/* Turn  a b c  into  'a','b','c'.                                         */
/* COMPBL first collapses runs of blanks into a single blank; otherwise    */
/* each extra blank would produce an empty quoted item ('') in the result. */
%unquote(%str(%')%qsysfunc(tranwrd(%qsysfunc(compbl(&data)),%str( ),%str(%',%')))%str(%'))
%mend;

*---------------------------------------------------------------;
* LIST_FIX                                                      ;
* Add prefix and/or suffix to items of space separated list     ;
*---------------------------------------------------------------;
/* EXAMPLES :                                                    
   %put %list_fix(item1 item2 item3,pref_,_suf);                 
   %put %list_fix(item1 item2 item3,pref_);                 
   %put %list_fix(item1 item2 item3,suffix=_suf);                                           
*/

%macro list_fix
(data
,prefix
,suffix
);
/* Return the space separated list DATA with PREFIX and/or SUFFIX added  */
/* to every item. An empty list returns nothing.                         */
/* The loop index is declared %LOCAL so that a macro variable I in the   */
/* calling scope is not overwritten.                                     */
%local i output;
%if %length(&data) %then %do;
  %do i=1 %to %sysfunc(countw(&data,%str( )));
    %let output= &output &prefix.%scan(&data,&i,%str( ))&suffix;
  %end;
%end;
&output
%mend;