Sas 如何尽快拆分大型数据集_Sas

Sas 如何尽快拆分大型数据集

sas

Sas 如何尽快拆分大型数据集,sas,Sas,我有一个非常大的数据集，大小为1T，我需要将其快速拆分为几个子数据集以下是分割数据集的传统方法： Data d1 d2...dn; Set raw_dataset; if condition1 then output d1; else if condition2 then output d2; ... else if conditionN then output dn; run; 但是对我来说还是太慢了！！有什么方法可以加快这个过程吗您可以使用以下宏，只需输入两个参数 1.要拆分的输入数

我有一个非常大的数据集（大小约 1 TB），需要将其快速拆分为若干个子数据集。

以下是分割数据集的传统方法：

/* Traditional split: one DATA step that names every output data set
   (d1 ... dn are placeholders) and reads raw_dataset in a SET statement. */
Data d1 d2...dn;
Set raw_dataset;
/* IF / ELSE IF chain: each observation is written to the data set of the
   first condition that is true (condition1 ... conditionN are placeholders). */
if condition1 then output d1;
else if condition2 then output d2;
...
else if conditionN then output dn;
run;

但是对我来说还是太慢了！！

有什么方法可以加快这个过程吗？

您可以使用以下宏，只需提供两个参数：1. 要拆分的输入数据集；2. 每个拆分后数据集允许的最大观测数。

/* Setup for the row-count based split.
   Creates WORK.INP_DS plus the global macro variables TOTAL_OBS, MAX_OBS,
   SPLIT_DS_NUM_TEMP, REMAINDER and NO_OF_SPLITS. */

/* Macro debugging options: echo generated code and macro resolution to the log. */
options merror mprint symbolgen mlogic;

/****CHANGE PATH for input DS location****/
libname inp "Y:\InputDS";
libname outp "Y:\OutputDS";

/* Work copy of the input data set.
   NOTE(review): this duplicates the whole input in WORK; for a very large
   input, consider whether the copy is needed at all. */
data inp_ds;
 set inp.input_sample; /****CHANGE Input DS****/
run;

/* Count the observations to split. */
proc sql noprint;
select count(*) into: total_obs from inp_ds;
quit;
/* INTO: stores the count with leading blanks; re-assigning strips them. */
%let total_obs=&total_obs;

%let max_obs=20000; /****CHANGE max number of OBS in a split DS****/
/* Number of completely filled splits, and the rows left over after them. */
%let split_ds_num_temp=%sysfunc(int(&total_obs/&max_obs));
%let remainder=%sysfunc(mod(&total_obs,&max_obs));

/* A non-zero remainder needs one extra, partially filled split. */
data find_num;
 if &remainder>0 then split_ds_num=&split_ds_num_temp+1;
 else split_ds_num=&split_ds_num_temp;
 /* SYMPUTX accepts a numeric argument and stores it without padding;
    SYMPUT would convert it implicitly and left-pad the value with blanks. */
 call symputx('no_of_splits',split_ds_num);
run;

%macro split(i,inds);
/* Writes split number &i of &inds to OUTP.OUT&i.
   Parameters:
     i    - 1-based index of the split to create.
     inds - name of the input data set. It must be a plain data set name
            without data set options, because FIRSTOBS=/OBS= are appended.
   Reads the global macro variable MAX_OBS (rows per split).
   Split &i holds observations max_obs*(i-1)+1 through max_obs*i of &inds.
   FIRSTOBS=/OBS= restrict the read to that range, so each call no longer
   scans the whole input and filters on _N_. */
%local first_row last_row;
%let first_row=%eval(&max_obs*(&i-1)+1);
%let last_row=%eval(&max_obs*&i);
data outp.out&i;
 set &inds(firstobs=&first_row obs=&last_row);
run;
%mend split;

/* Driver step: queues one %split call per split, for i = 1 to &no_of_splits. */
data initiate_macro;
do i = 1 to &no_of_splits;
    /* CATS converts i to text without padding or a conversion NOTE.
       %NRSTR defers macro execution until after this DATA step ends. */
    call execute(cats('%nrstr(%split(', i, ', inp_ds))'));
end;
run;

这将在程序中指定的输出目录下创建多个输出数据集（out1、out2……outn），数据集的个数取决于总观测数和每个数据集的最大观测数。

您可以使用以下宏，只需提供两个参数：1. 要拆分的输入数据集；2. 每个拆分后数据集允许的最大观测数。

/* Setup for the row-count based split.
   Creates WORK.INP_DS plus the global macro variables TOTAL_OBS, MAX_OBS,
   SPLIT_DS_NUM_TEMP, REMAINDER and NO_OF_SPLITS. */

/* Macro debugging options: echo generated code and macro resolution to the log. */
options merror mprint symbolgen mlogic;

/****CHANGE PATH for input DS location****/
libname inp "Y:\InputDS";
libname outp "Y:\OutputDS";

/* Work copy of the input data set.
   NOTE(review): this duplicates the whole input in WORK; for a very large
   input, consider whether the copy is needed at all. */
data inp_ds;
 set inp.input_sample; /****CHANGE Input DS****/
run;

/* Count the observations to split. */
proc sql noprint;
select count(*) into: total_obs from inp_ds;
quit;
/* INTO: stores the count with leading blanks; re-assigning strips them. */
%let total_obs=&total_obs;

%let max_obs=20000; /****CHANGE max number of OBS in a split DS****/
/* Number of completely filled splits, and the rows left over after them. */
%let split_ds_num_temp=%sysfunc(int(&total_obs/&max_obs));
%let remainder=%sysfunc(mod(&total_obs,&max_obs));

/* A non-zero remainder needs one extra, partially filled split. */
data find_num;
 if &remainder>0 then split_ds_num=&split_ds_num_temp+1;
 else split_ds_num=&split_ds_num_temp;
 /* SYMPUTX accepts a numeric argument and stores it without padding;
    SYMPUT would convert it implicitly and left-pad the value with blanks. */
 call symputx('no_of_splits',split_ds_num);
run;

%macro split(i,inds);
/* Writes split number &i of &inds to OUTP.OUT&i.
   Parameters:
     i    - 1-based index of the split to create.
     inds - name of the input data set. It must be a plain data set name
            without data set options, because FIRSTOBS=/OBS= are appended.
   Reads the global macro variable MAX_OBS (rows per split).
   Split &i holds observations max_obs*(i-1)+1 through max_obs*i of &inds.
   FIRSTOBS=/OBS= restrict the read to that range, so each call no longer
   scans the whole input and filters on _N_. */
%local first_row last_row;
%let first_row=%eval(&max_obs*(&i-1)+1);
%let last_row=%eval(&max_obs*&i);
data outp.out&i;
 set &inds(firstobs=&first_row obs=&last_row);
run;
%mend split;

/* Driver step: queues one %split call per split, for i = 1 to &no_of_splits. */
data initiate_macro;
do i = 1 to &no_of_splits;
    /* CATS converts i to text without padding or a conversion NOTE.
       %NRSTR defers macro execution until after this DATA step ends. */
    call execute(cats('%nrstr(%split(', i, ', inp_ds))'));
end;
run;

这将在您的程序中指定的输出目录下创建多个输出数据集（out1、out2……outn），数据集的个数取决于总观测数和每个数据集的最大观测数。

如果您不想使用条件，我可以与您分享我3年来使用的这个宏：

%macro partitionner(Library=, Table=, nb_part=, nblig=, tabIntr=);
/* Splits &Library..&Table. into &nb_part data sets by observation number,
   reading the input once.
   Parameters:
     Library= libref of the input table; the outputs are written there too.
     Table=   name of the input table.
     nb_part= number of output data sets to create.
     nblig=   observations per output data set; the last one also receives
              every remaining observation.
     tabIntr= prefix of the output names (&tabIntr.1 ... &tabIntr.&nb_part). */
%local i last_start;
/* First observation number that belongs to the last partition. */
%let last_start=%eval((&nb_part-1)*&nblig+1);
data
    %do i=1 %to &nb_part;
        &Library..&tabIntr.&i.
    %end;
;
      set &Library..&Table.;

      /* Partitions 1 .. nb_part-1 each take exactly &nblig observations. */
      %do i=1 %to %eval(&nb_part-1);
         if _n_ >= %eval(1+(&i.-1)*&nblig.) and _n_ <= %eval(&i.*&nblig.)
         then output &Library..&tabIntr.&i.;
      %end;
      /* The last partition takes the rest. The original referenced the
         undefined &lib here instead of &Library. */
      if _n_>=&last_start then output &Library..&tabIntr.&nb_part.;
   run;
%mend partitionner;

结果是：

smalTable1 有 33 条观测。smalTable2 有 33 条观测。smalTable3 有 33 条观测。smalTable4 有 1 条观测。

如果您不想使用条件，我可以与您分享我3年来使用的宏：

%macro partitionner(Library=, Table=, nb_part=, nblig=, tabIntr=);
/* Splits &Library..&Table. into &nb_part data sets by observation number,
   reading the input once.
   Parameters:
     Library= libref of the input table; the outputs are written there too.
     Table=   name of the input table.
     nb_part= number of output data sets to create.
     nblig=   observations per output data set; the last one also receives
              every remaining observation.
     tabIntr= prefix of the output names (&tabIntr.1 ... &tabIntr.&nb_part). */
%local i last_start;
/* First observation number that belongs to the last partition. */
%let last_start=%eval((&nb_part-1)*&nblig+1);
data
    %do i=1 %to &nb_part;
        &Library..&tabIntr.&i.
    %end;
;
      set &Library..&Table.;

      /* Partitions 1 .. nb_part-1 each take exactly &nblig observations. */
      %do i=1 %to %eval(&nb_part-1);
         if _n_ >= %eval(1+(&i.-1)*&nblig.) and _n_ <= %eval(&i.*&nblig.)
         then output &Library..&tabIntr.&i.;
      %end;
      /* The last partition takes the rest. The original referenced the
         undefined &lib here instead of &Library. */
      if _n_>=&last_start then output &Library..&tabIntr.&nb_part.;
   run;
%mend partitionner;

结果是：

smalTable1 有 33 条观测。smalTable2 有 33 条观测。smalTable3 有 33 条观测。smalTable4 有 1 条观测。

根据具体的拆分条件，您可以为原始数据集添加索引。“太慢”具体是多慢？您让拆分运行了多长时间？当您中断（break）提交时，拆分已经运行了多久？数据集中有多少行和多少个变量？您要拆分成多少份？能否举一个拆分条件的例子？您使用的是什么样的硬件？原始数据集是否在本地磁盘上？您是否在向网络或云端目标写入？您为什么觉得需要拆分数据……能否为数据集建立合适的索引，以便快速检索相关子集？是否已在系统事件查看器中检查过磁盘 I/O 错误？您是否可以使用 SPDE libname 引擎或 SAS/CONNECT？这两者中的任何一个都可以用来获得一定程度的并行性。您是否尝试过对输出数据集使用 COMPRESS= 选项？过滤条件有多复杂？有时，使用数据集选项或其他方法会更高效。根据拆分条件的不同，您可以为原始数据集添加索引。“太慢”具体是多慢？您让拆分运行了多长时间？当您中断（break）提交时，拆分已经运行了多久？数据集中有多少行和多少个变量？您要拆分成多少份？能否举一个拆分条件的例子？您使用的是什么样的硬件？原始数据集是否在本地磁盘上？您是否在向网络或云端目标写入？您为什么觉得需要拆分数据……能否为数据集建立合适的索引，以便快速检索相关子集？是否已在系统事件查看器中检查过磁盘 I/O 错误？您是否可以使用 SPDE libname 引擎或 SAS/CONNECT？这两者中的任何一个都可以用来获得一定程度的并行性。您是否尝试过对输出数据集使用 COMPRESS= 选项？过滤条件有多复杂？有时，使用数据集选项或其他方法会更高效。