Sas MSGLEVEL=I 信息消息将告诉您是否正在使用索引。大表的索引是否唯一?(我想不是)。如果不唯一,是否有第三个变量可以添加到复合索引中以使其唯一?如果有,该第三个变量的取值范围是否可以先验地知道,从而可以在查找层面上对 5 万个选择器中的每一个遍历这些取值?哈希查询有一个明显的区别,
Sas MSGLEVEL=I 信息消息将告诉您是否正在使用索引。大表的索引是否唯一?(我想不是)。如果不唯一,是否有第三个变量可以添加到复合索引中以使其唯一?如果有,该第三个变量的取值范围是否可以先验地知道,从而可以在查找层面上对 5 万个选择器中的每一个遍历这些取值?哈希查询有一个明显的区别,,sas,proc-sql,Sas,Proc Sql,MSGLEVEL=I 信息消息将告诉您是否正在使用索引。大表的索引是否唯一?(我想不是)。如果不唯一,是否有第三个变量可以添加到复合索引中以使其唯一?如果有,该第三个变量的取值范围是否可以先验地知道,从而可以在查找层面上对 5 万个选择器中的每一个遍历这些取值?哈希查询有一个明显的区别,它不包含 where a.dos between '01Jan2017'd and '01May2018'd 这一条件。感谢您精心编写的示例代码和测试用例,我将在周一尝试实施这些方法,并报告结果(服务器正在刷新)。您提出了一个重要的提示。Claim 表有 8 个单独的索引列,我只使用
MSGLEVEL=I 信息消息将告诉您是否正在使用索引。大表的索引是否唯一?(我想不是)。如果不唯一,是否有第三个变量可以添加到复合索引中以使其唯一?如果有,该第三个变量的取值范围是否可以先验地知道,从而可以在查找层面上对 5 万个选择器中的每一个遍历这些取值?哈希查询有一个明显的区别,它不包含 where a.dos between '01Jan2017'd and '01May2018'd 这一条件。感谢您精心编写的示例代码和测试用例,我将在周一尝试实施这些方法,并报告结果(服务器正在刷新)。您提出了一个重要的提示。Claim 表有 8 个单独的索引列,我只使用了 2 个,即 memno 和 dos(其余的索引列在某种程度上是特定于 claims 的)。
/* Join the claims table to the input member list and keep the claims
   whose date of service (dos) lies in the 01Jan2017 - 01May2018 window. */
PROC SQL;
    CREATE TABLE myClaims AS
    SELECT
        clm.claimno,
        clm.dos,
        clm.memno
    FROM s.claims AS clm
    INNER JOIN myInputList AS sel
        ON clm.memno = sel.memno
    WHERE clm.dos BETWEEN '01Jan2017'd AND '01May2018'd
    ;
QUIT;
/*
 * Hash-lookup counterpart of the PROC SQL join that builds myClaims:
 * load the member list into a hash keyed on memno, scan s.claims once,
 * and output the claims whose memno is present in the hash.
 *
 * Fixes relative to the earlier version of this step:
 *   - memno is now read from s.claims (it was missing from KEEP=, so the
 *     lookup key was never populated from the claims table);
 *   - the same date-of-service window as the SQL query is applied;
 *   - rc is actually assigned, so DROP=RC refers to a real variable;
 *   - CHECK() is used instead of FIND() because only key membership is
 *     tested and no hash data needs to be copied into the PDV.
 */
data myClaimsTest (drop=rc);
    /* Never executed; only makes the compiler define the myInputList
       variables (including memno) so the hash object can bind to them. */
    if 0 then set myInputList;

    declare hash vs(hashexp:7, dataset:'myInputList');
    vs.definekey('memno');
    vs.definedata();
    vs.definedone();

    /* Explicit read loop: the whole claims table is consumed in a single
       DATA step iteration, hence the STOP after the loop. */
    do until (eof);
        set s.claims (keep=claimno dos memno
                      where=(dos between '01Jan2017'd and '01May2018'd))
            end=eof;
        rc = vs.check();
        if rc = 0 then output;
    end;
    stop;
run;
/* Clear the Output window and route results to the plain listing destination. */
dm "clear output"; ods listing; ods noresults; options nocenter; title;

/* VARNUM lists the variables in creation order (the option was misspelled as "varum"). */
proc contents varnum data=all_claims;
run;

/* Bring the Output window forward and re-enable results tracking. */
dm "output" output; ods results;
Variables in Creation Order
# Variable Type Len Format
1 claim_id Num 8
2 member_id Num 8
3 claim_date Num 8 YYMMDD10.
Alphabetic List of Indexes and Attributes
# of
Unique Unique
# Index Option Values Variables
1 PICK YES 333338 member_id claim_date
/* Simulation parameters for the synthetic claims table. */
%let MEMBER_N         = 1e5;      /* generation stops once this many members have a claim */
%let CLAIM_RATE       = 0.00125;  /* chance that a member files a claim on any given day  */
%let MEMBER_SAMPLE_N  = 1e2;      /* not referenced by the steps visible here             */
%let STUDY_PROPORTION = 0.001;    /* share of members drawn into STUDY_MEMBERS            */

/*
 * Build ALL_CLAIMS ("BIG"): at most one claim per member per day between
 * 01jan2012 and 31dec2018, which is what lets the composite index PICK on
 * (member_id claim_date) be declared UNIQUE.
 */
data ALL_CLAIMS (label="BIG" index=(PICK=(member_id claim_date) / unique));

    /* RETAIN comes first so the stored variable order is
       claim_id, member_id, claim_date. */
    retain claim_id 0 member_id 0 claim_date 0 member_n 0;
    format claim_date yymmdd10.;
    drop member_n claim_n;

    /* Walk candidate member ids until MEMBER_N of them have filed a claim. */
    do member_id = 1e7 by 1 until (member_n = &MEMBER_N);
        claim_n = 1;

        do claim_date = '01jan2012'd to '31dec2018'd;
            /* One random draw per member-day decides whether a claim exists. */
            if ranuni(123) <= &CLAIM_RATE then do;
                claim_id + 1;
                /* The member is counted on the first claim only. */
                if claim_n = 1 then member_n + 1;
                output;
                claim_n + 1;
            end;
        end;
    end;

    stop;
run;
/* Write two notes to the log: the ratio 5e4/200e6 expressed as a percentage,
   and the same ratio scaled by 1e6.
   NOTE(review): presumably 5e4 is the number of selectors and 200e6 the size
   of the real claims table - confirm against the original problem statement. */
%put note: sample population is %sysevalf(5e4/200e6*100)% of all claims;
%put note: or ~%sysevalf(5e4/200e6*1e6) rows in this example;
/*
 * Build STUDY_MEMBERS ("SMALL"): a random sample of k member ids drawn
 * from the n distinct members in ALL_CLAIMS, using the k / n selection
 * method. PROC SURVEYSELECT is better but not always available.
 *
 * An early sighting on SAS-L would be
 *   https://listserv.uga.edu/cgi-bin/wa?A2=ind9909c&L=SAS-L&P=173979
 *   Re: Random Selection (Sep 20, 1999)
 *
 * This text lives in a block comment on purpose: asterisk-style comments
 * are still scanned for macro triggers, and the ampersands in the URL
 * would be reported as unresolved macro variable references.
 */
data STUDY_MEMBERS(keep=member_id label="SMALL");
    retain
        k %sysevalf(&MEMBER_N*&STUDY_PROPORTION, FLOOR)  /* members still to select  */
        n &MEMBER_N                                      /* members still to consider */
    ;

    set ALL_CLAIMS;
    by member_id;

    /* Consider each member once, on the first of its claims. */
    if first.member_id;

    /* Select with probability (remaining quota) / (remaining members). */
    if ranuni(123) < k/n then do;
        output;
        k + (-1);
    end;
    n + (-1);

    /* Nothing more can be selected once the quota is filled or the
       members are exhausted, so the rest of ALL_CLAIMS need not be read. */
    if k = 0 or n = 0 then stop;
run;
/* MSGLEVEL=I adds INFO messages to the log, including whether an index
   was selected to optimize the query. */
OPTIONS MSGLEVEL=I;

/* Baseline: join on member_id only and filter the claim date in WHERE. */
PROC SQL;
    CREATE TABLE ALL_STUDY_SUBSET AS
    SELECT
        ALL.claim_id,
        ALL.claim_date,
        ALL.member_id
    FROM ALL_CLAIMS AS ALL
    INNER JOIN STUDY_MEMBERS AS STUDY
        ON ALL.member_id = STUDY.member_id
    WHERE ALL.claim_date BETWEEN '01Jan2017'd AND '01May2018'd
    ;
QUIT;
/* Extend the study data with a date variable that matches the date
   variable in the ALL_CLAIMS index: one output row per study member per
   calendar day from 01Jan2017 through 01May2018. */
data STUDY_MEMBERS_WITH_ITERATED_DATE (drop=_day_offset);
    set STUDY_MEMBERS;
    do _day_offset = 0 to ('01May2018'd - '01Jan2017'd);
        claim_date = '01Jan2017'd + _day_offset;
        output;
    end;
run;
/* Join on both variables of the ALL_CLAIMS composite key; the date
   window is already encoded in the rows of the iterated-date table. */
PROC SQL;
    CREATE TABLE ALL_STUDY_SUBSET2 AS
    SELECT
        ALL.claim_id,
        ALL.claim_date,
        ALL.member_id
    FROM ALL_CLAIMS AS ALL
    INNER JOIN STUDY_MEMBERS_WITH_ITERATED_DATE AS STUDY
        ON  ALL.member_id  = STUDY.member_id
        AND ALL.claim_date = STUDY.claim_date
    ;
QUIT;
/* Full scan of ALL_CLAIMS with a hash-based match: the study member ids
   are loaded into a hash once, then every claim is tested against the
   date window and against the hash. */
data ALL_STUDY_SUBSET3;
    set ALL_CLAIMS;

    /* Build the lookup on the first iteration only. */
    if _n_ = 1 then do;
        declare hash study (dataset:'STUDY_MEMBERS');
        study.defineKey('member_id');
        study.defineDone();
    end;

    /* Discard claims outside the study window before touching the hash. */
    if claim_date < '01jan2017'd or claim_date > '01may2018'd then delete;

    /* Discard claims whose member is not in the study. */
    if study.find() ne 0 then delete;
run;
/* Scan SMALL and iterate the dates, so that every lookup supplies both
   parts of the BIG composite index (key) PICK and the index can be used. */
data ALL_STUDY_SUBSET4;
    set STUDY_MEMBERS;

    do claim_date = '01jan2017'd to '01may2018'd;
        /* Keyed read on (member_id, claim_date); _IORC_ is 0 on a hit. */
        set ALL_CLAIMS key=pick;
        if _iorc_ ne 0 then continue;
        output;
    end;

    /* Clear the error flag after the date loop, before the iteration ends. */
    _error_ = 0;
run;
/* Simulation parameters for the multi-claim variant of the claims table. */
%let MEMBER_N = 1e5;
%let CLAIM_RATE = 0.00125;
%let MULTI_CLAIM_RATE = 0.05; %* iterative rate at which another claim is made on same day a claim is made;
%let STUDY_PROPORTION = 0.001;

/*
 * Build ALL_CLAIMS ("BIG") where a member may file several claims on the
 * same day, so (member_id claim_date) is no longer unique and only the
 * simple indexes member_id and claim_id are created.
 *
 * Fixes relative to the earlier version of this step:
 *   - multi_n is a loop counter and is now dropped instead of leaking
 *     into the output data set;
 *   - multi_n now starts at 1, so after the loop it equals the number of
 *     claims written for the day. It used to start at 0, so the logged
 *     "claims" figure was one short. The threshold moves from > 1 to > 2
 *     so the same member-days (three or more claims) are logged.
 */
data ALL_CLAIMS
( label = "BIG"
index=
(
/* PICK = (member_id claim_date) / unique (not happening) */
member_id
claim_id
)
);
retain claim_id 0 member_id 0 claim_date 0 member_n 0;
format claim_date yymmdd10.;
do member_id = 1e7 by 1;
  claim_n = 1;
  do claim_date = '01jan2012'd to '31dec2018'd;
    /* One draw per member-day decides whether any claim is filed. */
    if ranuni(123) > &CLAIM_RATE then continue;
    if claim_n = 1 then member_n + 1;
    /* Write one claim, then keep adding same-day claims while the
       draw stays at or below MULTI_CLAIM_RATE. */
    do multi_n = 1 by 1 until (ranuni(123) > &MULTI_CLAIM_RATE);
      claim_id + 1;
      output;
    end;
    if multi_n > 2 then put 'NOTE: ' member_id= claim_date= multi_n 'claims';
    claim_n + 1;
  end;
  if member_n = &MEMBER_N then leave;
end;
stop;
drop member_n claim_n multi_n;
run;
/* Same join as before, run with _METHOD and _TREE so the log shows the
   execution methods chosen and the query tree as planned. */
PROC SQL _METHOD _TREE;
    CREATE TABLE ALL_STUDY_SUBSET AS
    SELECT
        ALL.claim_id,
        ALL.claim_date,
        ALL.member_id
    FROM ALL_CLAIMS AS ALL
    INNER JOIN STUDY_MEMBERS AS STUDY
        ON ALL.member_id = STUDY.member_id
    WHERE ALL.claim_date BETWEEN '01Jan2017'd AND '01May2018'd
    ;
QUIT;
INFO: Index member_id of SQL table WORK.ALL_CLAIMS (alias = ALL) selected for SQL WHERE clause
(join) optimization.
NOTE: SQL execution methods chosen are:
sqxcrta
sqxjndx
sqxsrc( WORK.STUDY_MEMBERS(alias = STUDY) )
sqxsrc( WORK.ALL_CLAIMS(alias = ALL) )
Tree as planned.
/-SYM-V-(ALL.claim_id:1 flag=0001)
/-OBJ----|
| |--SYM-V-(ALL.claim_date:3 flag=0001)
| \-SYM-V-(ALL.member_id:2 flag=0001)
/-JOIN---|
| | /-SYM-V-(STUDY.member_id:1 flag=0001)
| | /-OBJ----|
| | /-SRC----|
| | | \-TABL[WORK].STUDY_MEMBERS opt=''
| |--FROM---|
| | | /-SYM-V-(ALL.claim_id:1 flag=0001)
| | | /-OBJ----|
| | | | |--SYM-V-(ALL.claim_date:3 flag=0001)
| | | | \-SYM-V-(ALL.member_id:2 flag=0001)
| | \-SRC----|
| | |--TABL[WORK].ALL_CLAIMS opt=''
| | | /-NAME--(claim_date:3)
| | \-IN-----|
| | | /-LITN(20820) DATE.
| | | /-RANB---|
| | | | \-LITN(21305) DATE.
| | \-SET----|
| |--empty-
| | /-SYM-V-(STUDY.member_id:1)
| \-CEQ----|
| \-SYM-V-(ALL.member_id:2)
--SSEL---|
/* For each study member, read that member's claims through the simple
   member_id index and keep those inside the study window.
   NOTE(review): relies on STUDY_MEMBERS holding each member_id once, so
   consecutive lookups never repeat a key value - confirm. */
data ALL_STUDY_SUBSET5(label="Presuming a preponderance of members file few claims over their all_claims lifetime");
    set STUDY_MEMBERS;

    do while (1);
        /* Each keyed read fetches the next claim for the current member_id;
           a non-zero _IORC_ ends the loop. */
        set ALL_CLAIMS key=member_id;
        if _iorc_ ne 0 then leave;
        if '01jan2017'd <= claim_date <= '01may2018'd then output;
    end;

    /* Clear the error flag after the read loop, before the iteration ends. */
    _error_ = 0;
run;