proc SQL-连接复制值_Sql_Sas_Proc Sql

proc SQL-连接复制值

sql sas

proc SQL-连接复制值,sql,sas,proc-sql,Sql,Sas,Proc Sql,我有两个数据集： data snippet1; input ID callDate :ddmmyy. start_date :ddmmyy. end_date :ddmmyy. cured ; format Date start_date end_date ddmmyy10.; datalines4; 001 30/11/2020 28/11/2020 01/12/2020 Cured 001 01/12/2020 28/11/2020 01/12/2020 Cured 0

我有两个数据集：

/* Asker's sample call data for ID 001, kept exactly as posted: a call date,
   a start/end date pair and a cure status per row. */
data snippet1;
  /* NOTE(review): ID and cured carry no `$`, so INPUT reads them as numeric,
     although the cured values in the data lines are text ("Cured",
     "Not Cured"). The three dates use the ddmmyy. informat via `:`. */
  input ID callDate :ddmmyy. start_date :ddmmyy. end_date :ddmmyy. cured ;
  /* NOTE(review): FORMAT names `Date`, but INPUT creates callDate, so
     callDate is not covered by this statement. */
  format Date start_date end_date  ddmmyy10.;
datalines4;
001 30/11/2020  28/11/2020  01/12/2020  Cured
001 01/12/2020  28/11/2020  01/12/2020  Cured
001 30/12/2020  28/12/2020  04/01/2021  Not Cured
001 31/12/2020  28/12/2020  04/01/2021  Not Cured
001 01/02/2021  28/01/2021  01/02/2021  Cured
    ;;;;

/* Asker's sample event data for ID 001, kept exactly as posted: an event
   date and an optional free-text description per row. */
data have1;
  /* NOTE(review): event_date is read with the ddmmyy. informat, but the data
     lines hold values such as 28Oct2020. ID and description carry no `$`, so
     INPUT reads them as numeric although the descriptions are text and many
     rows have none. */
  input ID  event_date :ddmmyy. description ;
  format  event_date ddmmyy10.;
datalines4;
001 28Oct2020   
001 29Nov2020   
001 29Nov2020   New Plan
001 30Nov2020   
001 01Dec2020   
001 01Dec2020   New Plan
001 01Dec2020   Stop Category
001 01Dec2020   Review Date
001 02Dec2020   
001 02Dec2020   OLd Contact Strategy Level
001 02Dec2020   
001 04Dec2020   Stop Category
001 04Dec2020   Review Date
001 29Dec2020   
001 29Dec2020   New Plan
001 30Dec2020   
001 31Dec2020   
001 01Jan2021   
001 01Jan2021   
001 02Jan2021   
001 04Jan2021   
001 05Jan2021   OLd Contact Strategy Level
001 05Jan2021   
001 29Jan2021   
001 29Jan2021   New Plan
001 30Jan2021   
001 31Jan2021   
001 01Feb2021   
001 01Feb2021   
001 02Feb2021   
001 02Feb2021   OLd Contact Strategy Level
001 02Feb2021
    ;;;;

我试图通过名为

Description1

和

Description2

的两列分别获取

Calldate 与 Calldate+2

之间每个时段的第一个和最后一个描述。因此，对于

Calldate=02Dec2020 时，Description1="OLd Contact Strategy Level"，Description2="Review Date"

我当然有更多的ID，但我认为只要有一个就足以看出我的问题

这是我目前掌握的代码：

/* Asker's attempt: join each call to the events of the same ID dated from
   the call date through call date + 2, and take MIN/MAX of the descriptions.
   NOTE(review): MIN/MAX compare the description values themselves, not the
   row order, and GROUP BY 1 names only the first selected column while
   calls.* selects several. */
proc sql;
  create table want as
    select
      calls.*,
      min(events.description) as description1,
      max(events.description) as description2
    from snippet1 as calls
    inner join have1 as events
      on  calls.id = events.id
      and events.event_date >= calls.calldate
      and events.event_date <= calls.calldate + 2
    group by 1;
quit;

正如你所看到的，这些日期重复了好几次，我甚至不确定所有的通话日期是否都在那里

有人知道吗？

纯SQL无法满足您的需求，或者至少不需要对数据进行一些修改。SQL不尊重“顺序”，特别是SAS的SQL实现（不允许中间查询顺序）。因此，获取“最后一行”的请求不会被执行：就SQL而言，calldate+2的所有行都是等效的，如果您成功地请求，您可以随机有效地获取其中任何一行。（它实际上不是随机的，但你应该把它当作是为了编写代码——只有当你真的不在乎你得到了什么时才这样做。）

要在SQL中执行此操作，必须添加一个排序字段。这在技术上是可能的，但不推荐（使用

monotonic（）

），因为它是一个未记录的函数。更好的方法是将其添加到数据步骤视图中

第一:

/* DATA step view over have1 that adds id_row, a running row counter that
   restarts at 1 for every ID, so the stored row order can be used in SQL. */
data have1_v / view=have1_v;
    set have1;
    by id;
    if first.id then id_row = 1;   /* first row of an ID starts the count */
    else id_row + 1;               /* sum statement: id_row is retained across rows */
run;

这就建立了秩序。然后：

/* For every snippet1 row, fetch two descriptions from have1_v: the one with
   the lowest id_row and the one with the highest id_row among rows of the
   same ID that have a non-null description and an event_date between
   calldate and calldate + 2. The result is only printed; no table is made. */
proc sql;
  select
    calls.*,

    /* description on the earliest qualifying row */
    ( select ev.description
        from have1_v as ev
        where ev.id = calls.id
          and ev.event_date between calls.calldate and calls.calldate + 2
          and ev.description is not null
        having ev.id_row = min(ev.id_row)
    ) as min_descript,

    /* description on the latest qualifying row */
    ( select ev.description
        from have1_v as ev
        where ev.id = calls.id
          and ev.event_date between calls.calldate and calls.calldate + 2
          and ev.description is not null
        having ev.id_row = max(ev.id_row)
    ) as max_descript

  from snippet1 as calls;
quit;

这抓住了你的“最小”和“最大”描述。我认为这会返回您要求的内容，尽管

snippet1

中没有与您在问题中提到的12/2日期匹配的行

综上所述，SAS 有更好的工具来处理这类顺序问题。SAS 数据步（DATA step）就是一种解决方案：只要您自己没有打乱数据，它就能保证按顺序处理。例如，请看如何创建汇总数据集：

/* Collapse have1 to one row per ID and event date, carrying the first and
   the last non-missing description seen for that date. */
data have1_summarized;
  set have1(where=(not missing(description)));   /* rows without a description are never read */
  by id event_date;
  retain min_description max_description;

  /* first row of the date fixes the "first" description ... */
  if first.event_date then min_description = description;
  /* ... and every row overwrites the "last" one */
  max_description = description;

  if last.event_date;   /* subsetting IF: only the last row of each date is written */
run;

现在，您可以使用SQL或其他工具将其与

snippet1

数据集结合起来，因为您不再有重复的事件日期，所以排序不再重要。

这就是所谓的

表查找（table lookup）问题。我向您推荐"双 SET 语句"技巧，它很容易学。

顺便说一下，我已经修复了数据输入步骤中的几个错误
/* Corrected sample call data: ID and cured are read as character, and the
   FORMAT statement names callDate. */
data snippet1;
  /* ID$ reads ID as character; the dates use list input with the ddmmyy.
     informat; cured is read with the $13. informat, so "Not Cured" with its
     embedded blank ends up in one value. */
  input ID$ callDate :ddmmyy. start_date :ddmmyy. end_date :ddmmyy. cured$13. ;
  format callDate start_date end_date  ddmmyy10.;
datalines4;
001 30/11/2020  28/11/2020  01/12/2020  Cured
001 01/12/2020  28/11/2020  01/12/2020  Cured
001 30/12/2020  28/12/2020  04/01/2021  Not Cured
001 31/12/2020  28/12/2020  04/01/2021  Not Cured
001 01/02/2021  28/01/2021  01/02/2021  Cured
;;;;
run;

/* Corrected sample event data: ID is character, event_date is read with the
   date9. informat that matches values such as 28Oct2020, and description is
   character. */
data have1;
  /* description uses the $42. informat, so descriptions containing blanks
     are read as a single value; rows without a description yield a blank. */
  input ID$  event_date :date9. description $42. ;
  format  event_date ddmmyy10.;
datalines4;
001 28Oct2020   
001 29Nov2020   
001 29Nov2020   New Plan
001 30Nov2020   
001 01Dec2020   
001 01Dec2020   New Plan
001 01Dec2020   Stop Category
001 01Dec2020   Review Date
001 02Dec2020   
001 02Dec2020   OLd Contact Strategy Level
001 02Dec2020   
001 04Dec2020   Stop Category
001 04Dec2020   Review Date
001 29Dec2020   
001 29Dec2020   New Plan
001 30Dec2020   
001 31Dec2020   
001 01Jan2021   
001 01Jan2021   
001 02Jan2021   
001 04Jan2021   
001 05Jan2021   OLd Contact Strategy Level
001 05Jan2021   
001 29Jan2021   
001 29Jan2021   New Plan
001 30Jan2021   
001 31Jan2021   
001 01Feb2021   
001 01Feb2021   
001 02Feb2021   
001 02Feb2021   OLd Contact Strategy Level
001 02Feb2021
;;;;
run;

/* Table look-up with two SET statements: for every snippet1 row, walk through
   all of have1 by observation number and remember the first description
   (description1) and the last non-blank description (description2) among
   events of the same ID dated callDate through callDate + 2. */
data want1;
  length description1 description2 $42;
  set snippet1;

  do obs_no = 1 to n_have;
    /* direct access by observation number; ID is renamed so it does not
       overwrite the ID just read from snippet1 */
    set have1(rename=(ID=TmpID)) nobs=n_have point=obs_no;

    if ID ne TmpID then continue;                                          /* other ID */
    if event_date < callDate or event_date > callDate + 2 then continue;   /* outside the window */

    if missing(description1) then description1 = description;       /* stays blank until a non-blank one is seen */
    if not missing(description) then description2 = description;    /* keeps being overwritten */
  end;

  drop Tmp:;
run;

数据片段1；
输入ID$callDate:ddmmyy。开始日期：ddmmyy。结束日期：ddmmyy。治愈13美元；
格式化callDate开始日期结束日期ddmmyy10。；
数据线4；
001 2020年11月30日2020年11月28日2020年1月12日治愈
01/12/2020 28/11/2020 01/12/2020固化
001 2020年12月30日2020年12月28日2021年1月4日未固化
001 2020年12月31日2020年12月28日2021年1月4日未固化
001 01/02/2021 28/01/2021 01/02/2021已固化
;;;;
跑
数据表1；
输入ID$event\u日期：date9。说明：42美元；
格式化事件日期ddmmyy10。；
数据线4；
001 2020年10月28日
001 2020年11月29日
001 2020年11月29日新计划
001 2020年11月30日
001 2020年12月1日
001 2020年12月1日新计划
001 2020年12月1日停止类别
001 2020年12月1日审核日期
001 2020年12月2日
001 2020年12月2日旧联系人策略级别
001 2020年12月2日
001 2020年12月4日停止类别
001 2020年12月4日审核日期
001 2020年12月29日
001 2020年12月29日新计划
001 2020年12月30日
001 2020年12月31日
001 2021年1月1日
001 2021年1月1日
001 2021年1月2日
001 2021年1月4日
001 2021年1月5日旧联系人策略级别
001 2021年1月5日
001 2021年1月29日
001 2021年1月29日新计划
001 2021年1月30日
001 2021年1月31日
001 02年2月1日
001 02年2月1日
001 02二月2021日
001 02二月2021旧联系人策略级别
001 02二月2021日
;;;;
跑
数据want1；
长度描述1描述2$42。；
设置代码段1；
Doi=1至rec；
设置have1（rename=ID=TmpID）nobs=rec point=i；
如果ID=TmpID和callDate……（以上为前面代码的重复片段，原文在此处被截断）
我采用了 @whymath 的解决方案，并让它运行得更快一些；不过，这可能只有在您拥有数 GB 的数据或需要非常高的性能时才有必要。
首先，我们构建一个数据集，用于存储 have1 中每个事件日期的第一行的行号。在第二个数据步中，从 have1 检索行时，我们用它来确定起点，从而避免不必要的迭代。我们还在这里创建一个索引，以便在下一步中使用带 key= 选项的 SET 语句（键控 SET）。
其次，我们使用 key= 选项检索起始行号，然后在 point= 循环中从该行开始，而不是从第 1 行开始。我们还在这里添加了 leave，以便在越过目标范围后停止迭代。
所有这些都假设数据集已按您想要的顺序排列——但我认为我们必须这样假设，否则您的整个思路都无法成立。请务必确认顺序正确，否则会出问题。
/* Look-up table with one row per ID and event date. _row is the observation
   number in have1 of the first row for that date. event_date leaves the step
   under the name calldate, and the composite index id_calldate on
   (id, calldate) allows keyed SET access. */
data have1_ids(index=(id_calldate=(id calldate)));
  set have1;
  by id event_date;
  rename event_date = calldate;
  keep id event_date _row;     /* KEEP still uses the pre-rename name */

  _row + 1;                    /* counts every have1 row, including those filtered out below */
  if first.event_date;         /* only the first row of each ID/date group is written */
run;


/* For every snippet1 row, find the first description (description1) and the
   last non-missing description (description2) among have1 rows of the same
   ID with calldate <= event_date <= calldate + 2.
   A keyed SET on have1_ids supplies _row, the have1 observation number of the
   first event dated exactly calldate, so the scan can start there.
   Assumes have1 is sorted by id and event_date, as the BY statement that
   builds have1_ids requires -- TODO confirm for your data. */
data want1;
  length description1 description2 $42;
  set snippet1;

  /* /UNIQUE restarts the index search for every look-up, so consecutive
     snippet1 rows with the same (id, calldate) each find their row. */
  set have1_ids key=id_calldate / unique;

  if _iorc_ ne 0 then do;
    /* No event is dated exactly calldate. _row would still hold the value of
       an earlier look-up (or be missing on the first row), so scan from the
       top instead. Also clear the error flag raised by the failed read;
       otherwise the step dumps the PDV to the log. */
    _error_ = 0;
    _row = 1;
  end;

  do i = _row to rec;
    set have1(rename=(ID=_ID)) nobs=rec point=i;

    /* before the window (only possible after the fallback above): skip */
    if _ID < ID or (_ID = ID and event_date < calldate) then continue;
    /* past the window or into the next ID: nothing further can match */
    if _ID > ID or event_date > calldate + 2 then leave;

    if missing(description1) then description1 = description;     /* stays missing until a description is seen */
    if not missing(description) then description2 = description;  /* keeps being overwritten until the end */
  end;

  drop _:;
run;

注意：我没有像另一个键控 SET 答案那样在这里检查 `_IORC_ eq 0`——因为我不太在乎查找是否失败；如果未找到匹配行，就会沿用 `_row`（行号）的先前值。这不是最优的，但已经很接近——而且没有好的方法来获取下一行。
这里还有一种方法，在 have1 相对较大的情况下，这可能是最有效的方法。不过在某些方面，它的灵活性稍差一些。
这种方法完全依靠键控 SET 完成所有工作。它读取 have1，并为每一行生成三份副本，每份副本对应一个应当匹配它的 calldate。然后，键控 SET 只需按对应日期取出这些行。键控 SET 利用 SET 数据集上的索引来查找匹配行。
/* Write every have1 row that has a description three times, once for each
   calldate whose window (calldate through calldate + 2) contains the event:
   event_date, event_date - 1 and event_date - 2. The composite index
   id_calldate on (id, calldate) allows keyed SET access. */
data have1_expanded(index=(id_calldate=(id calldate)));
  set have1(where=(not missing(description)));
  format calldate date9.;

  do calldate = event_date to event_date - 2 by -1;
    output;
  end;
run;

/* For every snippet1 row, read all have1_expanded rows with the same
   (id, calldate) through the id_calldate index. The first row read supplies
   description1 and the last one description2. */
data want1;
  set snippet1;

  /* _n_ doubles as the counter of rows read for the current snippet1 row.
     The UNTIL condition mirrors the LEAVE below and is kept as a safety net. */
  do _n_ = 1 by 1 until (_iorc_ ne 0);
    set have1_expanded key=id_calldate;

    /* _IORC_ is zero while the keyed read finds a match and non-zero once
       there are no more matches for this key. */
    if _iorc_ ne 0 then do;
      /* Every group ends on a failed read. Clear the error flag it raises,
         otherwise the step dumps the PDV to the log for each snippet1 row. */
      _error_ = 0;
      leave;
    end;

    if _n_ eq 1 then description1 = description;   /* first match only */
    description2 = description;                    /* overwritten on every match */
  end;
run;

分两步做。首先找到属于日期范围的描述，并按事件日期对其进行排序
/* Step 1: pair every snippet1 row with the have1 events of the same ID that
   carry a description and are dated calldate through calldate + 2. The LEFT
   JOIN keeps calls without any such event. Rows are sorted so that the events
   of one call sit together in event-date order. */
proc sql;
  create table list as
    select
      calls.id,
      calls.calldate,
      calls.start_date,
      calls.end_date,
      calls.cured,
      events.event_date,
      events.description
    from snippet1 as calls
    left join have1 as events
      on  calls.id = events.id
      and events.event_date >= calls.calldate
      and events.event_date <= calls.calldate + 2
      and events.description is not null
    order by
      calls.id,
      calls.calldate,
      calls.start_date,
      calls.end_date,
      calls.cured,
      events.event_date
  ;
quit;

结果:
description 不是数值变量，在对其使用聚合函数之前，您可能需要先为其创建数值索引。
@whymath 我尝试过使用 first. 和 last.，但它不起作用……如果……
/* Step 1: pair every snippet1 row with the have1 events of the same ID that
   carry a description and are dated calldate through calldate + 2. The LEFT
   JOIN keeps calls without any such event. Rows are sorted so that the events
   of one call sit together in event-date order. */
proc sql;
  create table list as
    select
      calls.id,
      calls.calldate,
      calls.start_date,
      calls.end_date,
      calls.cured,
      events.event_date,
      events.description
    from snippet1 as calls
    left join have1 as events
      on  calls.id = events.id
      and events.event_date >= calls.calldate
      and events.event_date <= calls.calldate + 2
      and events.description is not null
    order by
      calls.id,
      calls.calldate,
      calls.start_date,
      calls.end_date,
      calls.cured,
      events.event_date
  ;
quit;

/* Step 2: collapse list to one row per call (BY group ending at cured).
   event_date1/description1 come from the first row of the group.
   event_date2/description2 come from the last row, and stay missing when the
   group has a single row. */
data want;
  set list;
  by id calldate start_date end_date cured;

  length description1 description2 $42;
  retain event_date1 description1;            /* carried from the first row to the last */
  format event_date1 event_date2 yymmdd10.;
  drop description event_date;

  if first.cured then do;
    event_date1  = event_date;
    description1 = description;
  end;

  if last.cured;                              /* subsetting IF: one output row per group */

  if not first.cured then do;                 /* group has more than one row */
    description2 = description;
    event_date2  = event_date;
  end;
run;