Sas宏半高效地操作数据

Sas宏半高效地操作数据,sas,sas-macro,Sas,Sas Macro,目标:从“拥有表+帮助表”转到“想要表”。下面的当前实现很慢。我相信这是一个如何不使用SAS宏的好例子,但我很好奇是否。。。 1.宏观方法可以被挽救/制造得足够快,以便可行 e、 g.proc append被认为可以加快堆叠数据集的速度,但我看不到任何性能提升。 2.所有的选择都是什么样子的 我已经写了一个非宏解决方案,为了比较,我将在下面发布 Data: data have ; input name $ term $; cards; Joe 2000 Joe 2002 Joe

目标:从 have 表和 help 表得到 want 表。下面的当前实现很慢。我相信这是一个“不应该这样使用 SAS 宏”的好例子,但我很好奇: 1. 宏方法能否被挽救/变得足够快,从而可行?例如,据说 proc append 可以加快数据集的堆叠速度,但我没有看到任何性能提升。 2. 所有的替代方案是什么样子的?

我已经写了一个非宏解决方案,为了比较,我将在下面发布

Data: 
/* HAVE: one row per (name, term) in which the person is present. */
/* TERM is read as character on purpose.                           */
data have;
    input name $ term $;
    datalines;
Joe   2000
Joe   2002
Joe   2008
Sally 2001
Sally 2003
;
run;

proc print data=have;
run;

/* HELP: the lookup list of terms, also character. */
data help;
    input terms $;
    datalines;
2000
2001
2002
2003
2004
2005
2006
2007
2008
;
run;

proc print data=help;
run;

/* WANT: the desired result. For every name, each HELP term between */
/* that name's first and last term, flagged here or gone.           */
data want;
    input name $ term $ status $;
    datalines;
Joe   2000  here
Joe   2001  gone
Joe   2002  here
Joe   2003  gone
Joe   2004  gone
Joe   2005  gone
Joe   2006  gone
Joe   2007  gone
Joe   2008  here
Sally 2001  here
Sally 2002  gone
Sally 2003  here
;
run;

proc print data=have;
run;
我可以为每个人编写一个小宏:

%MACRO RET(NAME);
    /*
       Build HEREGONE_<NAME>: one row per HELP term that lies between the
       first and last term recorded for NAME in HAVE, flagged 'Here' when
       HAVE holds that term for NAME and 'Gone' otherwise.

       Parameter
         NAME  value of HAVE.NAME to process. It is also used as the suffix
               of the output table, so it has to be a valid SAS name part.

       Work tables STUDTERMLIST and TERMLIST are (re)created on every call.
    */

    /* Keep the range bounds private so a same-named macro variable in an
       outer scope is never overwritten. */
    %local MINTERM MAXTERM;

    proc sql noprint;
        /* The only pass over HAVE: the distinct terms of this person. */
        create table studtermlist as
            select distinct term
            from have
            where name = "&NAME"
        ;

        /* Both bounds in one query, read from the already filtered list
           instead of scanning HAVE two more times. */
        select min(term), max(term)
            into :MINTERM, :MAXTERM
            from studtermlist
        ;

        /* Every HELP term inside the person's first..last range. */
        create table termlist as
            select terms
            from help
            where terms between "&MINTERM." and "&MAXTERM."
            order by terms
        ;

        /* An unmatched term on the right side of the left join means the
           person was absent in that term. */
        create table HEREGONE_&NAME as
            select a.terms,
                   "&NAME" as Name,
                   case
                       when a.terms eq b.term then 'Here'
                       when b.term is null    then 'Gone'
                   end as status
            from termlist as a
            left join studtermlist as b
                on a.terms eq b.term
        ;
    quit;
%MEND RET;


/* Build the per-person tables for the two sample names, then list them. */
%RET(Joe)
%RET(Sally)

proc print data=HEREGONE_Joe;
run;

proc print data=HEREGONE_Sally;
run;
但这还不完整。如果我对可能相当多的名字进行循环:

/* Collect every distinct name found in HAVE into the macro variable  */
/* NAMELIST as a blank-separated list, and keep the row count in N.   */
proc sql noprint;
    select distinct name
        into :namelist separated by ' '
        from have
    ;
quit;

%let n = &sqlobs;


%MACRO RETYA;
    /*
       Run RET once for every name in the global list NAMELIST (N entries)
       and stack the resulting HEREGONE_<name> tables into BASE.

       Reads the macro variables N and NAMELIST, which must exist before
       the call. Notes are switched off while looping and switched back
       on at the end.
    */

    /* Loop state stays inside this macro. */
    %local i currentvalue;

    options nonotes;

    %do i = 1 %to &n;
        /* NAMELIST is blank separated, so split on blanks only. The
           default delimiter set of SCAN would also break names at
           characters such as - . / and so on. */
        %let currentvalue = %scan(&namelist, &i, %str( ));
        %put &currentvalue;
        %put &i;

        %RET(&currentvalue);

        %if &i = 1 %then %do;
            /* The first person starts the stacked table. */
            data base;
                set HEREGONE_&currentvalue;
            run;
        %end;
        %else %do;
            /* Every person table carries its own Name value, so the two
               inputs cannot share rows. UNION ALL skips the de-duplication
               pass that plain UNION repeats over the whole growing table.
               NOTE(review): this assumes HELP has no repeated terms. */
            proc sql;
                create table base as
                    select * from base
                    union all
                    select * from HEREGONE_&currentvalue
                ;
                drop table HEREGONE_&currentvalue;
            quit;
        %end;
    %end;

    options notes;
%MEND;

/* Build BASE for all names, then order and list it. */
%RETYA

proc sort data=base;
    by name terms;
run;

proc print data=base;
run;

这样我就得到了 want 表,但是对于 6000 个名字,需要 20 分钟。

让我们试试另一种解决方案。对于每个名称,通过proc SQL数据步骤查找最小/最大术语。然后使用数据步骤创建时间段表,并将其与原始表合并

* Sample data - TERM is read as a numeric variable in this version;
data have;
    input name $ term;
    datalines;
Joe   2000
Joe   2002
Joe   2008
Sally 2001
Sally 2003
;
run;

* First and last term of each name;
proc sql;
    create table terms as
        select name,
               min(term) as term_min,
               max(term) as term_max
        from have
        group by name
        order by name
    ;
quit;

* One row per name and term, stepping by 1 from term_min to term_max;
data empty;
    set terms;
    drop term_min term_max;
    do term = term_min to term_max;
        output;
    end;
run;

* Flag each generated row by whether HAVE holds the same name and term;
proc sql;
    create table want as
        select a.name,
               a.term,
               case
                   when missing(b.term) then 'Gone'
                   else 'Here'
               end as status
        from empty as a
        left join have as b
            on  a.name = b.name
            and a.term = b.term
        order by a.name, a.term
    ;
quit;
编辑:现在看看您的宏解决方案,部分问题是您扫描表的次数太多

- 第一个表 studtermlist 不是必需的,可以改为在最后一个联接中直接过滤。
- 两个宏变量(最小/最大学期)可以在一次查询中算出。
- 不必建立较小的临时学期列表,用 where 子句筛选结果即可。
- 使用 Call Execute 调用宏,而不是再写一个宏循环。
- 不要通过循环来追加数据,而是利用命名约定,用一个数据步骤追加所有输出。

%MACRO RET(NAME);
    /*
       Build _HG_<NAME>: the HELP terms between the first and last term
       of NAME in HAVE, each flagged 'Here' or 'Gone'.
       The bounds are stored in the macro variables MINTERM and MAXTERM.
    */
    proc sql noprint;
        /* First and last term of this person, in one pass over HAVE. */
        select min(term), max(term)
            into :MINTERM, :MAXTERM
            from have
            where name = "&NAME"
        ;

        /* The name filter sits in the ON clause so that unmatched HELP
           terms are kept, with a null TERM, by the left join. */
        create table _HG_&Name as
            select a.terms,
                   "&Name" as Name,
                   case
                       when a.terms eq b.term then 'Here'
                       when b.term is null    then 'Gone'
                   end as status
            from help as a
            left join have as b
                on  a.terms eq b.term
                and b.name = "&name"
            where a.terms between "&minterm" and "&maxterm"
        ;
    quit;
%MEND RET;


*call macro once per distinct name;
proc sort data=have;
    by name term;
run;

data _null_;
    set have;
    by name;
    if first.name then do;
        /* NRSTR delays the macro call until the generated code is read
           back after this step ends. Without it the macro runs while
           this step is still executing, and its minterm and maxterm
           references are resolved before the generated PROC SQL has
           assigned them. */
        str = catt('%nrstr(%ret(', name, '));');
        call execute(str);
    end;
run;


*append results;
data all;
    /* Name is a quoted literal in every _HG_ table, so its length differs
       from table to table, and SET takes the length from the first table
       it reads. Longer names would be cut off. This never-executed SET
       only seeds the variable attributes: TERMS from HELP and NAME from
       HAVE, in the same column order as the _HG_ tables. */
    if 0 then set help(keep=terms) have(keep=name);
    /* The prefix list reads every WORK data set whose name starts
       with _HG. */
    set _hg:;
run;

让我们试试另一种解决办法。对于每个名称,通过proc SQL数据步骤查找最小/最大术语。然后使用数据步骤创建时间段表,并将其与原始表合并

* Sample data - TERM is read as a numeric variable in this version;
data have;
    input name $ term;
    datalines;
Joe   2000
Joe   2002
Joe   2008
Sally 2001
Sally 2003
;
run;

* First and last term of each name;
proc sql;
    create table terms as
        select name,
               min(term) as term_min,
               max(term) as term_max
        from have
        group by name
        order by name
    ;
quit;

* One row per name and term, stepping by 1 from term_min to term_max;
data empty;
    set terms;
    drop term_min term_max;
    do term = term_min to term_max;
        output;
    end;
run;

* Flag each generated row by whether HAVE holds the same name and term;
proc sql;
    create table want as
        select a.name,
               a.term,
               case
                   when missing(b.term) then 'Gone'
                   else 'Here'
               end as status
        from empty as a
        left join have as b
            on  a.name = b.name
            and a.term = b.term
        order by a.name, a.term
    ;
quit;
编辑:现在看看您的宏解决方案,部分问题是您扫描表的次数太多

- 第一个表 studtermlist 不是必需的,可以改为在最后一个联接中直接过滤。
- 两个宏变量(最小/最大学期)可以在一次查询中算出。
- 不必建立较小的临时学期列表,用 where 子句筛选结果即可。
- 使用 Call Execute 调用宏,而不是再写一个宏循环。
- 不要通过循环来追加数据,而是利用命名约定,用一个数据步骤追加所有输出。

%MACRO RET(NAME);
    /*
       Build _HG_<NAME>: the HELP terms between the first and last term
       of NAME in HAVE, each flagged 'Here' or 'Gone'.
       The bounds are stored in the macro variables MINTERM and MAXTERM.
    */
    proc sql noprint;
        /* First and last term of this person, in one pass over HAVE. */
        select min(term), max(term)
            into :MINTERM, :MAXTERM
            from have
            where name = "&NAME"
        ;

        /* The name filter sits in the ON clause so that unmatched HELP
           terms are kept, with a null TERM, by the left join. */
        create table _HG_&Name as
            select a.terms,
                   "&Name" as Name,
                   case
                       when a.terms eq b.term then 'Here'
                       when b.term is null    then 'Gone'
                   end as status
            from help as a
            left join have as b
                on  a.terms eq b.term
                and b.name = "&name"
            where a.terms between "&minterm" and "&maxterm"
        ;
    quit;
%MEND RET;


*call macro once per distinct name;
proc sort data=have;
    by name term;
run;

data _null_;
    set have;
    by name;
    if first.name then do;
        /* NRSTR delays the macro call until the generated code is read
           back after this step ends. Without it the macro runs while
           this step is still executing, and its minterm and maxterm
           references are resolved before the generated PROC SQL has
           assigned them. */
        str = catt('%nrstr(%ret(', name, '));');
        call execute(str);
    end;
run;


*append results;
data all;
    /* Name is a quoted literal in every _HG_ table, so its length differs
       from table to table, and SET takes the length from the first table
       it reads. Longer names would be cut off. This never-executed SET
       only seeds the variable attributes: TERMS from HELP and NAME from
       HAVE, in the same column order as the _HG_ tables. */
    if 0 then set help(keep=terms) have(keep=name);
    /* The prefix list reads every WORK data set whose name starts
       with _HG. */
    set _hg:;
run;

实际上,您可以在单个嵌套SQL查询中执行此操作。这将是混乱和难以阅读

我要把它分成三个部分

首先,获得不同的名称

/* NAMES: one row per distinct name in HAVE. */
proc sql noprint;
    create table names as
        select distinct name
        from have
    ;
quit;
第二,对名称和学期做笛卡尔积,以获得所有组合

/* TEMP: every name paired with every HELP term. There is no join      */
/* condition, so terms outside a name's own first-to-last range are    */
/* included as well.                                                   */
proc sql noprint;
    create table temp as
        select n.name,
               h.terms as term
        from names as n
        cross join help as h
    ;
quit;
第三,左连接以查找匹配项

/* WANT: each (name, term) pair of TEMP, flagged by whether HAVE holds */
/* the same pair.                                                      */
proc sql noprint;
    create table want as
        select t.name,
               t.term,
               case
                   when missing(h.term) then "gone"
                   else "here"
               end as Status
        from temp as t
        left join have as h
            on  t.name = h.name
            and t.term = h.term
    ;
quit;
最后,删除临时表以节省空间

/* Remove the intermediate TEMP table from WORK. */
proc datasets library=work nolist;
    delete temp;
run;
quit;

如Reeza所示,还有其他方法可以做到这一点。正如我上面所说的,您可以将所有这些合并到一个SQL联接中,并获得所需的结果。根据计算机内存和数据大小,它应该是正常的,并且可能更快,因为所有内容都在内存中。

实际上,您可以在单个嵌套SQL查询中执行此操作。这将是混乱和难以阅读

我要把它分成三个部分

首先,获得不同的名称

/* NAMES: one row per distinct name in HAVE. */
proc sql noprint;
    create table names as
        select distinct name
        from have
    ;
quit;
第二,对名称和学期做笛卡尔积,以获得所有组合

/* TEMP: every name paired with every HELP term. There is no join      */
/* condition, so terms outside a name's own first-to-last range are    */
/* included as well.                                                   */
proc sql noprint;
    create table temp as
        select n.name,
               h.terms as term
        from names as n
        cross join help as h
    ;
quit;
第三,左连接以查找匹配项

/* WANT: each (name, term) pair of TEMP, flagged by whether HAVE holds */
/* the same pair.                                                      */
proc sql noprint;
    create table want as
        select t.name,
               t.term,
               case
                   when missing(h.term) then "gone"
                   else "here"
               end as Status
        from temp as t
        left join have as h
            on  t.name = h.name
            and t.term = h.term
    ;
quit;
最后,删除临时表以节省空间

/* Remove the intermediate TEMP table from WORK. */
proc datasets library=work nolist;
    delete temp;
run;
quit;

如Reeza所示,还有其他方法可以做到这一点。正如我上面所说的,您可以将所有这些合并到一个SQL联接中,并获得所需的结果。取决于计算机内存和数据大小,它应该是正常的,可能会更快,因为所有的东西都在内存中。

我将给出类似的答案,以便以后比较它们

proc sql;
    /* Distinct (term, name) pairs present in HAVE. */
    create table studtermlist as
        select distinct term, name
        from have
    ;

    /* First and last term of each name. */
    create table MAXMINTERM as
        select max(term) as MAXTERM,
               min(term) as MINTERM,
               name
        from have
        group by name
    ;

    /* For each name, every HELP term inside that name's range. */
    create table TERMLIST as
        select h.terms, r.name
        from help as h
        inner join MAXMINTERM as r
            on h.terms between r.MINTERM and r.MAXTERM
        order by name, terms
    ;

    /* Flag each (name, term) by whether HAVE holds it. */
    create table HEREGONE as
        select a.terms,
               a.Name,
               case
                   when a.terms eq b.term then 'Here'
                   when b.term is null    then 'Gone'
               end as status
        from termlist as a
        left join studtermlist as b
            on  a.terms eq b.term
            and a.name  eq b.name
        order by name, terms
    ;
quit;

我将给出类似的答案,以便稍后比较它们

proc sql;
    /* Distinct (term, name) pairs present in HAVE. */
    create table studtermlist as
        select distinct term, name
        from have
    ;

    /* First and last term of each name. */
    create table MAXMINTERM as
        select max(term) as MAXTERM,
               min(term) as MINTERM,
               name
        from have
        group by name
    ;

    /* For each name, every HELP term inside that name's range. */
    create table TERMLIST as
        select h.terms, r.name
        from help as h
        inner join MAXMINTERM as r
            on h.terms between r.MINTERM and r.MAXTERM
        order by name, terms
    ;

    /* Flag each (name, term) by whether HAVE holds it. */
    create table HEREGONE as
        select a.terms,
               a.Name,
               case
                   when a.terms eq b.term then 'Here'
                   when b.term is null    then 'Gone'
               end as status
        from termlist as a
        left join studtermlist as b
            on  a.terms eq b.term
            and a.name  eq b.name
        order by name, terms
    ;
quit;

帮助数据集中的术语实际上是数字的吗?它们是否像示例中那样连续?是否完全不使用宏?术语是有意使用的字符。实际术语可以包括多个零201000,并存储为字符。嗯……我的第一个解决方案假设相反。稍后我会再看一看,但这取决于术语的存储方式。术语是字符。实际术语可以包括多个零201000,并存储为字符。它们是连续的…尽管规模不寻常:2000002000152001520020060。。。但这是通过使用帮助表来处理的。感谢您的回复,我期待有时间仔细检查它们。帮助数据集中的术语实际上是数字的吗?它们是否像示例中那样连续?完全不使用宏如何?术语是字符。实际术语可以包括多个零201000,并存储为字符。嗯……我的第一个解决方案假设相反。稍后我会再看一看,但这取决于术语的存储方式。术语是字符。实际术语可以包括多个零201000,并存储为字符。它们是连续的……尽管如此 它的规模非同寻常:20000020010001520020020060。。。但这是通过使用帮助表来处理的。谢谢你的回复,我期待有时间仔细检查一下。