SAS分组对

SAS分组对,sas,Sas,我有两个变量ID1和ID2。它们都是相同类型的标识符。当它们出现在同一行数据中时,表示它们在同一组中。我想为每个ID创建一个组标识符 ID1 ID2 1 4 1 5 2 5 2 6 3 7 4 1 5 1 5 2 6 2 7 3 那我就要 ID Group 1 1 2 1 3 2 4 1 5 1 6 1 7 2 因为1,2,4,5,6在原始数据中由某种组合配对,所以它们共享一个组。3和7只是成对出现,因此它们是一个新的组。

我有两个变量ID1和ID2。它们都是相同类型的标识符。当它们出现在同一行数据中时,表示它们在同一组中。我想为每个ID创建一个组标识符

ID1   ID2
1     4
1     5
2     5
2     6
3     7
4     1
5     1
5     2
6     2
7     3
那我就要

ID   Group
1     1
2     1
3     2
4     1
5     1
6     1
7     2
因为1,2,4,5,6在原始数据中由某种组合配对,所以它们共享一个组。3和7只是成对出现,因此它们是一个新的组。我想为大约20000行这样做。ID1中的每个ID也在ID2中(更具体地说,如果观察的ID1=1和ID2=2,那么还有另一个观察,即ID1=2和ID2=1)


我试着前后合并它们,但不起作用。我还尝试调用symput,尝试为每个ID的组创建一个宏变量,然后在移动行时更新它,但我也无法让它工作

请尝试下面的代码

/* Build the example input: each row pairs two IDs that belong to the
   same group. Per the question, every pair also appears in the
   reversed orientation (e.g. both 1-4 and 4-1 are present). */
data have;
input ID1 ID2;
datalines;
1     4
1     5
2     5
2     6
3     7
4     1
5     1
5     2
6     2
7     3
;
run;

/* NOTE(review): this approach can only distinguish TWO groups -- IDs
   whose value repeats across rows (Group=1) versus IDs appearing in
   exactly one pair (Group=2). As the author confirms in the comment
   thread below, it does not generalize to data with more than two
   groups. */

* Flag ID1 values that occur on more than one row;

proc sort data=have;by id1;run;

data want_1;

    set have;
    by id1;

    attrib flagrepeat length=8.;

    /* first.id1 and last.id1 both true => this id1 value occurs once */
    if not (first.id1 and last.id1) then flagrepeat=1;
    else flagrepeat=0;
run;

* Also flag ID2 values that occur on more than one row;

proc sort data=want_1;by id2;run;

data want_2;
    set want_1;
    by id2;

    /* no ELSE branch: a flag set in the ID1 pass is kept */
    if not (first.id2 and last.id2) then flagrepeat=1;

run;

* Keep one row per ID1 value;
proc sort data=want_2 nodupkey;by id1 ;run;

data want(drop= ID2 flagrepeat rename=(ID1=ID));
    set want_2;
    attrib Group length=8.;

    /* repeated IDs -> Group 1, single-pair IDs -> Group 2 */
    if(flagrepeat eq 1) then Group=1;
    else Group=2;
run;

希望这个答案有帮助。

请尝试下面的代码

/* Build the example input: each row pairs two IDs that belong to the
   same group. Per the question, every pair also appears in the
   reversed orientation (e.g. both 1-4 and 4-1 are present). */
data have;
input ID1 ID2;
datalines;
1     4
1     5
2     5
2     6
3     7
4     1
5     1
5     2
6     2
7     3
;
run;

/* NOTE(review): this approach can only distinguish TWO groups -- IDs
   whose value repeats across rows (Group=1) versus IDs appearing in
   exactly one pair (Group=2). As the author confirms in the comment
   thread below, it does not generalize to data with more than two
   groups. */

* Flag ID1 values that occur on more than one row;

proc sort data=have;by id1;run;

data want_1;

    set have;
    by id1;

    attrib flagrepeat length=8.;

    /* first.id1 and last.id1 both true => this id1 value occurs once */
    if not (first.id1 and last.id1) then flagrepeat=1;
    else flagrepeat=0;
run;

* Also flag ID2 values that occur on more than one row;

proc sort data=want_1;by id2;run;

data want_2;
    set want_1;
    by id2;

    /* no ELSE branch: a flag set in the ID1 pass is kept */
    if not (first.id2 and last.id2) then flagrepeat=1;

run;

* Keep one row per ID1 value;
proc sort data=want_2 nodupkey;by id1 ;run;

data want(drop= ID2 flagrepeat rename=(ID1=ID));
    set want_2;
    attrib Group length=8.;

    /* repeated IDs -> Group 1, single-pair IDs -> Group 2 */
    if(flagrepeat eq 1) then Group=1;
    else Group=2;
run;

希望这个答案有帮助。

正如一位评论员所提到的,散列似乎是一种可行的方法。在下面的代码中,哈希表中维护“id”和“group”,只有在整行找不到“id”匹配项时,才会添加新的“group”。请注意,“do over”是一个未记录的功能,可以轻松地用多一点编码来替换

/* Example input: every pair of grouped IDs, in both orientations. */
data have;
    input ID1   ID2;
    cards;
1     4
1     5
2     5
2     6
3     7
4     1
5     1
5     2
6     2
7     3
;

/* Greedy single pass with a hash of id -> group: a row joins an
   existing group when either of its ids is already in the hash,
   otherwise it opens a new group.
   NOTE(review): as discussed in the comments below, this is input-order
   dependent -- a row that links two groups created earlier does NOT
   merge them (e.g. feeding the rows in descending id1 id2 order splits
   the sample data into 3 groups), so an extra merge pass is needed in
   general. */
data _null_;
    if _n_=1 then
        do;
            /* ordered hash so the output dataset comes out sorted by id */
            declare hash h(ordered: 'a');
            h.definekey('id');
            h.definedata('id','group');
            h.definedone();
            call missing(id,group);
        end;

    set have end=last;
    /* DO OVER is undocumented shorthand for looping over an implicit array */
    array ids id1 id2;
    do over ids;
        /* rc counts how many of this row's ids already have a group;
           a successful FIND also fills GROUP with that id's group */
        rc=sum(rc,h.find(key:ids)=0);

        /*you can choose to 'leave' the loop here when first h.find(key:ids)=0 is met, for the sake of better efficiency*/
    end;

    /* neither id seen before: open a new group */
    if not rc > 0 then
        group+1;

    /* (re)register both ids under the group found/created above */
    do over ids;
        id=ids;
        h.replace();
    end;
if last then rc=h.output(dataset:'want');
run;

正如一位评论员所提到的,哈希似乎确实是一种可行的方法。在下面的代码中,哈希表中维护“id”和“group”,只有在整行找不到“id”匹配项时,才会添加新的“group”。请注意,“do over”是一个未记录的功能,可以轻松地用多一点编码来替换

/* Example input: every pair of grouped IDs, in both orientations. */
data have;
    input ID1   ID2;
    cards;
1     4
1     5
2     5
2     6
3     7
4     1
5     1
5     2
6     2
7     3
;

/* Greedy single pass with a hash of id -> group: a row joins an
   existing group when either of its ids is already in the hash,
   otherwise it opens a new group.
   NOTE(review): as discussed in the comments below, this is input-order
   dependent -- a row that links two groups created earlier does NOT
   merge them (e.g. feeding the rows in descending id1 id2 order splits
   the sample data into 3 groups), so an extra merge pass is needed in
   general. */
data _null_;
    if _n_=1 then
        do;
            /* ordered hash so the output dataset comes out sorted by id */
            declare hash h(ordered: 'a');
            h.definekey('id');
            h.definedata('id','group');
            h.definedone();
            call missing(id,group);
        end;

    set have end=last;
    /* DO OVER is undocumented shorthand for looping over an implicit array */
    array ids id1 id2;
    do over ids;
        /* rc counts how many of this row's ids already have a group;
           a successful FIND also fills GROUP with that id's group */
        rc=sum(rc,h.find(key:ids)=0);

        /*you can choose to 'leave' the loop here when first h.find(key:ids)=0 is met, for the sake of better efficiency*/
    end;

    /* neither id seen before: open a new group */
    if not rc > 0 then
        group+1;

    /* (re)register both ids under the group found/created above */
    do over ids;
        id=ids;
        h.replace();
    end;
if last then rc=h.output(dataset:'want');
run;

我以陈海阔的答案为出发点,开发了一个稍微复杂一点的算法,它似乎适用于我迄今为止尝试过的所有测试用例。它可能还可以进一步优化,但在我的电脑上,它在不到一秒钟的时间内就可以处理20000行,而只需要几MB的内存。输入数据集不需要按任何特定顺序排序,但按现在的写法,它假定每个配对至少有一次是以 id1 < id2 的顺序出现的。测试用例:

/* Original test case from the question: every pair appears in both
   orientations */
data have;
input id1 id2;
cards;
1     4
1     5
2     5
2     6
3     7
4     1
5     1
5     2
6     2
7     3
;
run;

/* Revised test case - everything ends up in one group, with the row
   that connects the components right at the end of the input */
data have; 
input ID1 ID2; 
/*Make sure each row has id1 < id2 (normalise orientation so the
  where=(id1 < id2) filter in the algorithm keeps every pair)*/
if id1 > id2 then do;
t_id2 = id2;
id2   = id1;
id1   = t_id2;
end;
drop t_id2;
cards; 
2 5 
4 8 
2 4 
2 6 
3 7 
4 1 
9 1 
3 2 
6 2 
7 3
;
run;

/*Full scale test case: ~20000 random pairs; each pair with two
  distinct ids is output once in each orientation (rows where
  id1 = id2 are dropped), matching the question's property that
  every pair also appears reversed*/
data have;
    /* Seed once, before the loop: CALL STREAMINIT only honors its
       first call in a step, so re-seeding on every iteration was a
       no-op -- hoisting it out is behaviorally identical */
    call streaminit(1);
    do _N_ = 1 to 20000;
        id1 = int(rand('uniform')*100000);
        id2 = int(rand('uniform')*100000);
        if id1 < id2 then output;
        /* swap and emit the reversed orientation as well */
        t_id2 = id2;
        id2   = id1;
        id1   = t_id2;
        if id1 < id2 then output;
    end;
    drop t_id2; 
run;
option fullstimer;

/* Connected-components grouping in two phases:
   1) a greedy pass assigns a provisional group number to every id;
   2) repeated sweeps over all pairs move the id in the higher-numbered
      group down to the lower-numbered one, until a full sweep makes no
      change -- this merges groups the greedy pass wrongly kept apart.
   Assumes each pair exists at least once with id1 < id2. */
data _null_;
    length id group 8;
    /* hash h maps id -> its current group number */
    declare hash h();
    rc = h.definekey('id');
    rc = h.definedata('id');        
    rc = h.definedata('group');
    rc = h.definedone();

    array ids(2) id1 id2;
    array groups(2) group1 group2;

    /*Initial group guesses (greedy algorithm)*/
    do until (eof);
        set have(where = (id1 < id2)) end = eof;
        match = 0;
        call missing(min_group);
        do i = 1 to 2;
            /* a successful FIND fills GROUP with that id's group */
            rc = h.find(key:ids[i]);
            match + (rc=0);
            /* MIN() ignores the missing initial value of min_group */
            if rc = 0 then min_group = min(group,min_group);
        end;
        /*If neither id was in a previously matched group, create a new one*/
        if not(match) then do;
            max_group + 1;
            group = max_group;
        end;
        /*Otherwise, assign both to the matched group with the lowest number*/
        else group = min_group;
        do i = 1 to 2;
            id = ids[i];
            rc = h.replace();
        end;
    end;

    /*We now need to work through the whole dataset multiple times
      to deal with ids that were wrongly assigned to a separate group
      at the end of the initial pass, so load the table into a 
      hash object + iterator*/
    declare hash h2(dataset:'have(where = (id1 < id2))');
    rc = h2.definekey('id1','id2');
    rc = h2.definedata('id1','id2');
    rc = h2.definedone();
    declare hiter hi2('h2');

    /* sweep until a whole pass makes no change (fixed point) */
    change_count = 1;
    do while(change_count > 0);
        change_count = 0;
        rc = hi2.first();
        do while(rc = 0);
            /*Get the current group of each id from 
              the hash we made earlier*/
            do i = 1 to 2;
                rc = h.find(key:ids[i]);
                groups[i] = group;
            end;
            /*If we find a row where the two ids have different groups, 
              move the id in the higher group to the lower group*/
            if groups[1] < groups[2] then do;
                id = ids[2];
                group = groups[1];
                rc = h.replace();
                change_count + 1;           
            end;
            else if groups[2] < groups[1] then do;
                id = ids[1];
                group = groups[2];
                rc = h.replace();       
                change_count + 1;           
            end;
            rc = hi2.next();
        end;
        pass + 1;
        put pass= change_count=; /*For information only :)*/
    end;    

    /* final id -> group table */
    rc = h.output(dataset:'want');

run;

/*Renumber the (possibly sparse) group ids as a dense 1..k sequence,
  in ascending order of the original group numbers*/
proc sort data=want;
    by group id;
run;

data want;
    set want;
    by group;
    /* sum statement: FIRST.group is 1 at the start of each BY group,
       0 otherwise, so this increments once per original group */
    new_group + first.group;
    rename new_group=group;
    drop group;
run;

/*Summarise: number of ids in each group, biggest groups first*/
proc sql;
    select group,
           count(id) as FREQ
    from want
    group by group
    order by FREQ desc;
quit;   
日志输出:

 pass=0 change_count=4637
 pass=1 change_count=182
 pass=2 change_count=23
 pass=3 change_count=9
 pass=4 change_count=2
 pass=5 change_count=1
 pass=6 change_count=0

 NOTE: DATA statement used (Total process time):
       real time           0.18 seconds
       user cpu time       0.16 seconds
       system cpu time     0.04 seconds

我以陈海阔的答案为出发点,开发了一个稍微复杂一点的算法,它似乎适用于我迄今为止尝试过的所有测试用例。它可能还可以进一步优化,但在我的电脑上,它在不到一秒钟的时间内就可以处理20000行,而只需要几MB的内存。输入数据集不需要按任何特定顺序排序,但按现在的写法,它假定每个配对至少有一次是以 id1 < id2 的顺序出现的。测试用例:

/* Original test case from the question: every pair appears in both
   orientations */
data have;
input id1 id2;
cards;
1     4
1     5
2     5
2     6
3     7
4     1
5     1
5     2
6     2
7     3
;
run;

/* Revised test case - everything ends up in one group, with the row
   that connects the components right at the end of the input */
data have; 
input ID1 ID2; 
/*Make sure each row has id1 < id2 (normalise orientation so the
  where=(id1 < id2) filter in the algorithm keeps every pair)*/
if id1 > id2 then do;
t_id2 = id2;
id2   = id1;
id1   = t_id2;
end;
drop t_id2;
cards; 
2 5 
4 8 
2 4 
2 6 
3 7 
4 1 
9 1 
3 2 
6 2 
7 3
;
run;

/*Full scale test case: ~20000 random pairs; each pair with two
  distinct ids is output once in each orientation (rows where
  id1 = id2 are dropped), matching the question's property that
  every pair also appears reversed*/
data have;
    /* Seed once, before the loop: CALL STREAMINIT only honors its
       first call in a step, so re-seeding on every iteration was a
       no-op -- hoisting it out is behaviorally identical */
    call streaminit(1);
    do _N_ = 1 to 20000;
        id1 = int(rand('uniform')*100000);
        id2 = int(rand('uniform')*100000);
        if id1 < id2 then output;
        /* swap and emit the reversed orientation as well */
        t_id2 = id2;
        id2   = id1;
        id1   = t_id2;
        if id1 < id2 then output;
    end;
    drop t_id2; 
run;
option fullstimer;

/* Connected-components grouping in two phases:
   1) a greedy pass assigns a provisional group number to every id;
   2) repeated sweeps over all pairs move the id in the higher-numbered
      group down to the lower-numbered one, until a full sweep makes no
      change -- this merges groups the greedy pass wrongly kept apart.
   Assumes each pair exists at least once with id1 < id2. */
data _null_;
    length id group 8;
    /* hash h maps id -> its current group number */
    declare hash h();
    rc = h.definekey('id');
    rc = h.definedata('id');        
    rc = h.definedata('group');
    rc = h.definedone();

    array ids(2) id1 id2;
    array groups(2) group1 group2;

    /*Initial group guesses (greedy algorithm)*/
    do until (eof);
        set have(where = (id1 < id2)) end = eof;
        match = 0;
        call missing(min_group);
        do i = 1 to 2;
            /* a successful FIND fills GROUP with that id's group */
            rc = h.find(key:ids[i]);
            match + (rc=0);
            /* MIN() ignores the missing initial value of min_group */
            if rc = 0 then min_group = min(group,min_group);
        end;
        /*If neither id was in a previously matched group, create a new one*/
        if not(match) then do;
            max_group + 1;
            group = max_group;
        end;
        /*Otherwise, assign both to the matched group with the lowest number*/
        else group = min_group;
        do i = 1 to 2;
            id = ids[i];
            rc = h.replace();
        end;
    end;

    /*We now need to work through the whole dataset multiple times
      to deal with ids that were wrongly assigned to a separate group
      at the end of the initial pass, so load the table into a 
      hash object + iterator*/
    declare hash h2(dataset:'have(where = (id1 < id2))');
    rc = h2.definekey('id1','id2');
    rc = h2.definedata('id1','id2');
    rc = h2.definedone();
    declare hiter hi2('h2');

    /* sweep until a whole pass makes no change (fixed point) */
    change_count = 1;
    do while(change_count > 0);
        change_count = 0;
        rc = hi2.first();
        do while(rc = 0);
            /*Get the current group of each id from 
              the hash we made earlier*/
            do i = 1 to 2;
                rc = h.find(key:ids[i]);
                groups[i] = group;
            end;
            /*If we find a row where the two ids have different groups, 
              move the id in the higher group to the lower group*/
            if groups[1] < groups[2] then do;
                id = ids[2];
                group = groups[1];
                rc = h.replace();
                change_count + 1;           
            end;
            else if groups[2] < groups[1] then do;
                id = ids[1];
                group = groups[2];
                rc = h.replace();       
                change_count + 1;           
            end;
            rc = hi2.next();
        end;
        pass + 1;
        put pass= change_count=; /*For information only :)*/
    end;    

    /* final id -> group table */
    rc = h.output(dataset:'want');

run;

/*Renumber the (possibly sparse) group ids as a dense 1..k sequence,
  in ascending order of the original group numbers*/
proc sort data=want;
    by group id;
run;

data want;
    set want;
    by group;
    /* sum statement: FIRST.group is 1 at the start of each BY group,
       0 otherwise, so this increments once per original group */
    new_group + first.group;
    rename new_group=group;
    drop group;
run;

/*Summarise: number of ids in each group, biggest groups first*/
proc sql;
    select group,
           count(id) as FREQ
    from want
    group by group
    order by FREQ desc;
quit;   
日志输出:

 pass=0 change_count=4637
 pass=1 change_count=182
 pass=2 change_count=23
 pass=3 change_count=9
 pass=4 change_count=2
 pass=5 change_count=1
 pass=6 change_count=0

 NOTE: DATA statement used (Total process time):
       real time           0.18 seconds
       user cpu time       0.16 seconds
       system cpu time     0.04 seconds


你能运行Proc-BOM吗,我现在忘记了包名。我想,对于20k行,基于哈希的方法也应该是可行的。我很难理解Proc-BOM是如何工作的,你能举个例子说明它对我有什么帮助吗?@Reeza它在SAS/OR中。我发布了一个基于哈希的答案-请确认这是否符合你的预期。你能运行Proc-BOM吗,我现在忘记了包名。我想,对于20k行,基于哈希的方法也应该是可行的。我很难理解Proc-BOM是如何工作的,你能举个例子说明它对我有什么帮助吗?@Reeza它在SAS/OR中。我发布了一个基于哈希的答案-请确认这是否符合你的预期。谢谢你的回复。不幸的是,我不认为它会起作用,因为在最后一盘中会有超过两个小组。我认为您的解决方案适用于两个组,对吗?@DVL,是的,此代码只适用于两个组。请提供更多组的场景。可以有任意数量的组,例如,可能是。1 2 / 2 1 / 3 4 / 4 3 / 5 6 / 6 5. 然后第1组中的1,2,第2组中的3,4,第3组中的5,6感谢您的回答。不幸的是,我不认为它会起作用,因为在最后一盘中会有超过两个小组。我认为您的解决方案适用于两个组,对吗?@DVL,是的,此代码只适用于两个组。请提供更多组的场景。可以有任意数量的组,例如,可能是。1 2 / 2 1 / 3 4 / 4 3 / 5 6 / 6 5. 第1组中的1,2,第2组中的3,4,第3组中的5,6这或多或少是我会发布的答案。但是,请注意,这取决于输入数据集的排序顺序-例如,交换obs 1和obs 3会导致不正确的输出。此外,由于询问者表示每对数据都存在两次,因此您可以通过仅读取具有
id1 < id2 的 obs 来进一步优化。另一个示例是:把输入数据集按 id1 id2 降序排序后再运行,得到的 want 会把结果分成 3 组。我认为散列概念是可行的,但在找到链接观察之前,需要进一步的过程来合并最初看起来是分离的组。我基本上同意你的第一个评论,即这种数据操作总是混乱的,因此,它必须有某种业务规则,或者至少在一个ID上进行预分类,然后按照它进行操作。对于你的第二条评论,我同意如果它只有两个ID,但是,注意到asker发布的后续评论,它可能有"远远超过2个",所以我认为这种优化不值得额外的编码工作。谢谢!所以我需要按ID1排序,然后按ID2排序,这样它才能工作?当我将数据更改为:data have; input ID1 ID2; cards; 2 5 4 8 2 4 2 6 3 7 4 1 9 1 3 2 6 2 7 3 ; 它不再起作用了。即 5 属于第 1 组,尽管它们都属于同一组,它永远不会更新。我猜我的例子并没有遵循"数据中每个配对都以相反顺序再次出现"的规律。这会有什么不同吗?这或多或少是我会发布的答案。但是,请注意,这取决于输入数据集的排序顺序——例如,交换 obs 1 和 obs 3 会导致不正确的输出。此外,您还可以通过仅读取具有 id1 < id2 的 obs 来进一步优化。