按唯一ID和日期列出的SAS累积计数
我有一个如下的数据集（见下表），想在 SAS 中按日期统计累积的唯一客户数：每位客户只在其首次到访的日期被计入一次。
Customer_ID Vistited_Date
1234 7-Feb-20
4567 7-Feb-20
9870 7-Feb-20
1234 14-Feb-20
7654 14-Feb-20
3421 14-Feb-20
假设我的输出如下所示,我正在尝试按日期查找累积的唯一客户数
Cust_count Vistited_Date
3 7-Feb-20
2 14-Feb-20
2020年2月7日有 3 个新的唯一客户；而 2020年2月14日只有 2 个，因为客户 1234 在 2月7日 已经访问过，不应被重复计入。
有人知道我如何在这种情况下开发数据集吗
对不起,如果我的问题不够清楚,如果有必要,我可以提供更多细节
谢谢。 注：@draycut 的答案逻辑相同，但速度更快，下面解释原因。@draycut 的代码只调用哈希方法 add()，并用其返回码作为是否递增计数的判断条件；我的代码先用 check() 判断是否递增，再执行一次必定成功的 add() 来登记该 ID。取决于组的数量、组的大小以及 ID 的复用率，前一种方法的性能可提高 15% 到 40%。 思路是：跟踪所有在之前的组中出现过的 ID，并把它们从当前组的计数中排除。跟踪可以用哈希对象完成，条件计数则在每个组的 DOW 循环中执行（DOW 循环即把 SET 语句放进显式的 DO 循环中）。例如：
/* Sample input: one row per customer visit.                          */
/* ID   - customer identifier (numeric).                              */
/* Date - visit date; the DATE informat accepts forms like 7-Feb-20,  */
/*        and the DATE11. format prints it as 07-FEB-2020.            */
data have;
input ID Date: date9.; format date date11.;
datalines;
1234 7-Feb-20
4567 7-Feb-20
9870 7-Feb-20
1234 14-Feb-20
7654 14-Feb-20
3421 14-Feb-20
;
/* One output row per DATE: the number of customers whose first-ever  */
/* visit falls on that date. This is a "DOW" step: the SET statement  */
/* sits inside an explicit DO-UNTIL loop over each BY group.          */
data counts(keep=date count);
/* Instantiate the ID-tracking hash once, on the first iteration only. */
if _n_ = 1 then do;
declare hash tracker();
tracker.defineKey('id');
tracker.defineDone();
end;
/* COUNT is not retained, so it resets to missing for each BY group;  */
/* sum(missing, 1) = 1 on the group's first new customer.             */
do until (last.date);
set have;
by date;
/* check() ne 0: this ID was never seen in this or any prior group.   */
if tracker.check() ne 0 then do;
count = sum(count, 1);
tracker.add();
end;
end;
/* Implicit OUTPUT here writes one (date, count) row per BY group.    */
run;
原始性能基准说明——不涉及磁盘 IO；在进行哈希运算之前需要用 CPU 填充数组，因此这些性能成分是合并在一起测量的。
核心指标是新条目添加进哈希对象的速度。
模拟 3,000,000 条“记录”：1,000 个日期组、每组 3,000 个 ID，其中约 10% 的 ID 跨组复用，因此不同的 ID 约为 270 万个。
/* Generate DATA-step code that fills a _TEMPORARY_ array IDS with    */
/* &top simulated ids: &n_group consecutive blocks of &group_size,    */
/* each block overlapping the previous one by &overlap_factor so      */
/* roughly that share of ids repeat across groups.                    */
/* NOTE(review): INDEX, P and Q are %LOCAL-ized, but P/Q are emitted  */
/* as DATA-step variables and INDEX is unused in the macro itself.    */
%macro array_fill (top=3000000, n_group = 1000, overlap_factor=0.10);
%local group_size n_overlap index P Q;
%let group_size = %eval (&top / &n_group);
%if (&group_size < 1) %then %let group_size = 1;
%let n_overlap = %sysevalf (&group_size * &overlap_factor, floor);
%if &n_overlap < 0 %then %let n_overlap = 0;
/* Recompute &top so it is an exact multiple of the group size.       */
%let top = %sysevalf (&group_size * &n_group);
P = 1;
Q = &group_size;
array ids(&top) _temporary_;
_n_ = 0;
do i = 1 to &n_group;
do j = P to Q;
_n_+1;
ids(_n_) = j;
end;
/* The next block starts &n_overlap ids before the end of this one.   */
P = Q - &n_overlap;
Q = P + &group_size - 1;
end;
%mend;
options nomprint;
/* Benchmark variant 1: CHECK first, ADD only for unseen ids.         */
data _null_ (label='check then add');
length id 8;
declare hash h();
h.defineKey('id');
h.defineDone();
/* Fill the IDS temporary array with simulated data (CPU-bound).      */
%array_fill;
do index = 1 to dim(ids);
id = ids(index);
/* Two hash calls per new id: a check() probe plus a never-failing add(). */
if h.check() ne 0 then do;
count = sum(count,1);
h.add();
end;
end;
/* num_items should equal the number of distinct ids (~2.7 million).  */
_n_ = h.num_items;
put 'num_items=' _n_ comma12.;
put index= comma12.;
stop;
run;
/* Benchmark variant 2 (@draycut's approach): a single ADD whose      */
/* return code (0 = newly inserted) drives the conditional count.     */
data _null_ (label='just add');
length id 8;
declare hash h();
h.defineKey('id');
h.defineDone();
/* Fill the IDS temporary array with simulated data (CPU-bound).      */
%array_fill;
do index = 1 to dim(ids);
id = ids(index);
if h.add() = 0 then
count = sum(count,1);
end;
/* num_items should equal the number of distinct ids (~2.7 million).  */
_n_ = h.num_items;
put 'num_items=' _n_ comma12.;
put index= comma12.;
stop;
run;
注：@draycut 的答案逻辑相同，但速度更快，下面解释原因。@draycut 的代码只调用哈希方法 add()，并用其返回码作为是否递增计数的判断条件；我的代码先用 check() 判断是否递增，再执行一次必定成功的 add() 来登记该 ID。取决于组的数量、组的大小以及 ID 的复用率，前一种方法的性能可提高 15% 到 40%。 思路是：跟踪所有在之前的组中出现过的 ID，并把它们从当前组的计数中排除。跟踪可以用哈希对象完成，条件计数则在每个组的 DOW 循环中执行（DOW 循环即把 SET 语句放进显式的 DO 循环中）。例如：
/* Sample input: one row per customer visit.                          */
/* ID   - customer identifier (numeric).                              */
/* Date - visit date; the DATE informat accepts forms like 7-Feb-20,  */
/*        and the DATE11. format prints it as 07-FEB-2020.            */
data have;
input ID Date: date9.; format date date11.;
datalines;
1234 7-Feb-20
4567 7-Feb-20
9870 7-Feb-20
1234 14-Feb-20
7654 14-Feb-20
3421 14-Feb-20
;
/* One output row per DATE: the number of customers whose first-ever  */
/* visit falls on that date. This is a "DOW" step: the SET statement  */
/* sits inside an explicit DO-UNTIL loop over each BY group.          */
data counts(keep=date count);
/* Instantiate the ID-tracking hash once, on the first iteration only. */
if _n_ = 1 then do;
declare hash tracker();
tracker.defineKey('id');
tracker.defineDone();
end;
/* COUNT is not retained, so it resets to missing for each BY group;  */
/* sum(missing, 1) = 1 on the group's first new customer.             */
do until (last.date);
set have;
by date;
/* check() ne 0: this ID was never seen in this or any prior group.   */
if tracker.check() ne 0 then do;
count = sum(count, 1);
tracker.add();
end;
end;
/* Implicit OUTPUT here writes one (date, count) row per BY group.    */
run;
原始性能基准说明——不涉及磁盘 IO；在进行哈希运算之前需要用 CPU 填充数组，因此这些性能成分是合并在一起测量的。
核心指标是新条目添加进哈希对象的速度。
模拟 3,000,000 条“记录”：1,000 个日期组、每组 3,000 个 ID，其中约 10% 的 ID 跨组复用，因此不同的 ID 约为 270 万个。
/* Generate DATA-step code that fills a _TEMPORARY_ array IDS with    */
/* &top simulated ids: &n_group consecutive blocks of &group_size,    */
/* each block overlapping the previous one by &overlap_factor so      */
/* roughly that share of ids repeat across groups.                    */
/* NOTE(review): INDEX, P and Q are %LOCAL-ized, but P/Q are emitted  */
/* as DATA-step variables and INDEX is unused in the macro itself.    */
%macro array_fill (top=3000000, n_group = 1000, overlap_factor=0.10);
%local group_size n_overlap index P Q;
%let group_size = %eval (&top / &n_group);
%if (&group_size < 1) %then %let group_size = 1;
%let n_overlap = %sysevalf (&group_size * &overlap_factor, floor);
%if &n_overlap < 0 %then %let n_overlap = 0;
/* Recompute &top so it is an exact multiple of the group size.       */
%let top = %sysevalf (&group_size * &n_group);
P = 1;
Q = &group_size;
array ids(&top) _temporary_;
_n_ = 0;
do i = 1 to &n_group;
do j = P to Q;
_n_+1;
ids(_n_) = j;
end;
/* The next block starts &n_overlap ids before the end of this one.   */
P = Q - &n_overlap;
Q = P + &group_size - 1;
end;
%mend;
options nomprint;
/* Benchmark variant 1: CHECK first, ADD only for unseen ids.         */
data _null_ (label='check then add');
length id 8;
declare hash h();
h.defineKey('id');
h.defineDone();
/* Fill the IDS temporary array with simulated data (CPU-bound).      */
%array_fill;
do index = 1 to dim(ids);
id = ids(index);
/* Two hash calls per new id: a check() probe plus a never-failing add(). */
if h.check() ne 0 then do;
count = sum(count,1);
h.add();
end;
end;
/* num_items should equal the number of distinct ids (~2.7 million).  */
_n_ = h.num_items;
put 'num_items=' _n_ comma12.;
put index= comma12.;
stop;
run;
/* Benchmark variant 2 (@draycut's approach): a single ADD whose      */
/* return code (0 = newly inserted) drives the conditional count.     */
data _null_ (label='just add');
length id 8;
declare hash h();
h.defineKey('id');
h.defineDone();
/* Fill the IDS temporary array with simulated data (CPU-bound).      */
%array_fill;
do index = 1 to dim(ids);
id = ids(index);
if h.add() = 0 then
count = sum(count,1);
end;
/* num_items should equal the number of distinct ids (~2.7 million).  */
_n_ = h.num_items;
put 'num_items=' _n_ comma12.;
put index= comma12.;
stop;
run;
另一种方法（我对哈希对象不太熟悉）。如果您不太关心处理速度，并且想要一些简单的东西：
/* Order visits so each customer's earliest date comes first. */
proc sort data=have;
    by id date;
run;

/* Keep exactly one row per customer: the first (earliest) visit. */
proc sort data=have out=first_visit nodupkey;
    by id;
run;

/* Count how many customers had their first visit on each date. */
proc freq data=first_visit noprint;
    tables date /out=want (keep=date count);
run;
如果您不太关心处理速度,并且想要一些简单的东西:
/* Order visits so each customer's earliest date comes first. */
proc sort data=have;
    by id date;
run;

/* Keep exactly one row per customer: the first (earliest) visit. */
proc sort data=have out=first_visit nodupkey;
    by id;
run;

/* Count how many customers had their first visit on each date. */
proc freq data=first_visit noprint;
    tables date /out=want (keep=date count);
run;
如果您的数据未排序,并且您喜欢SQL,则此解决方案可能同样适用于您,而且非常简单:
/* your example 3 rows */
/* Extended sample input: one row per customer visit, now with a      */
/* third date (15-Feb-20) where two of three visitors are repeats.    */
data have;
input ID Date: date9.; format date date11.;
datalines;
1234 7-Feb-20
4567 7-Feb-20
9870 7-Feb-20
1234 14-Feb-20
7654 14-Feb-20
3421 14-Feb-20
1234 15-Feb-20
7654 15-Feb-20
1111 15-Feb-20
;
run;
/* simple set theory. Final dataset contains your final data like results
below*/
/* Step 1: attach each customer's earliest visit date to every row.   */
/* SAS SQL "remerges" the MIN(date) summary back onto the detail      */
/* rows; the WHERE= dataset option then keeps only each customer's    */
/* first-visit row. NOTE(review): this remerge behavior is            */
/* SAS-specific - the same query is not valid ANSI SQL.               */
proc sql;
create table temp(where =(mindate=date)) as select
ID, date,min(date) as mindate from have
group by id;
/* Step 2: count first-time customers per date. */
create table final as select count(*) as customer_count,date from temp
group by date;
quit;
/* results:
customer_count Date
3 07.febr.20
2 14.febr.20
1 15.febr.20
*/
如果您的数据未排序,并且您喜欢SQL,则此解决方案可能同样适用于您,而且非常简单:
/* your example 3 rows */
/* Extended sample input: one row per customer visit, now with a      */
/* third date (15-Feb-20) where two of three visitors are repeats.    */
data have;
input ID Date: date9.; format date date11.;
datalines;
1234 7-Feb-20
4567 7-Feb-20
9870 7-Feb-20
1234 14-Feb-20
7654 14-Feb-20
3421 14-Feb-20
1234 15-Feb-20
7654 15-Feb-20
1111 15-Feb-20
;
run;
/* simple set theory. Final dataset contains your final data like results
below*/
/* Step 1: attach each customer's earliest visit date to every row.   */
/* SAS SQL "remerges" the MIN(date) summary back onto the detail      */
/* rows; the WHERE= dataset option then keeps only each customer's    */
/* first-visit row. NOTE(review): this remerge behavior is            */
/* SAS-specific - the same query is not valid ANSI SQL.               */
proc sql;
create table temp(where =(mindate=date)) as select
ID, date,min(date) as mindate from have
group by id;
/* Step 2: count first-time customers per date. */
create table final as select count(*) as customer_count,date from temp
group by date;
quit;
/* results:
customer_count Date
3 07.febr.20
2 14.febr.20
1 15.febr.20
*/