如何使用SAS、SQL或伪代码创建更准确的组ID_Sql_Sas_Data Science_Data Analysis_Pseudocode

如何使用SAS、SQL或伪代码创建更准确的组ID

sql sas

如何使用SAS、SQL或伪代码创建更准确的组ID,sql,sas,data-science,data-analysis,pseudocode,Sql,Sas,Data Science,Data Analysis,Pseudocode,我有一个很大的数据集，其中一部分看起来像下面的。每组相同的KEY1（或KEY2）值应表示相同的组： YEAR KEY1 KEY2 1998 218 883 1999 218 883 2000 218 883 2001 218 883 1995 300 633 1996 300 633 1997 300 633 2003 822 119 2004 822 119 2005 8

我有一个很大的数据集，其中一部分看起来像下面的。每组相同的KEY1（或KEY2）值应表示相同的组：

YEAR    KEY1   KEY2
1998    218    883
1999    218    883
2000    218    883
2001    218    883
1995    300    633
1996    300    633
1997    300    633
2003    822    119
2004    822    119
2005    822    119
2005    992    476

我使用KEY1表示每个组，但有些组的KEY1不一致：

YEAR    KEY1    KEY2
2002    712     683
2003    344     683
2004    712     683

YEAR    KEY1    KEY2
2002    473     588
2003    473     588
2004    473     957

键2也可能不一致：

YEAR    KEY1    KEY2
2002    712     683
2003    344     683
2004    712     683

YEAR    KEY1    KEY2
2002    473     588
2003    473     588
2004    473     957

我将“344”和“957”视为数据错误，以及KEY1和KEY2中的所有其他不一致

我的目标是：创建一个名为GROUPID的变量，它在分组时比只使用KEY1更准确

其他需要注意的事项：

年份并不总是连续的

键1比键2更精确

同一组的键1和键2可能不一致

数据集很大，所以我无法手动搜索

可能的解决办法：

一个程序，搜索整个数据集并返回KEY1的问题行，以便我可以将KEY1复制到GROUPID，然后手动编辑它们

一种程序，可根据创建较大组的键1和键2进行切换。然后创建一个可以利用此功能的GROUPID

现在我只使用KEY1。上述任一解决方案的SAS、SQL或伪代码可能是什么样子？我也乐于接受其他解决方案。

这里有一个SQL Fiddle，给出了使用MS SQL Server的解决方案。对于大型数据集，使用带索引的临时表的性能将优于公用表表达式（CTE）。不过，这只是在时间允许时可以改进的地方

MS SQL Server 2017架构设置：

-- Sample data for the group-ID question: one row per (Year, Key1, Key2) observation.
-- The table has no column that records insertion order.
CREATE TABLE MyData( Year INT, Key1 INT, Key2 INT );

-- The explicit column list keeps this INSERT valid if columns are added or reordered later.
INSERT INTO MyData (Year, Key1, Key2) VALUES
-- rows where each Key1 is always paired with the same Key2
(1998,    218,    883),
(1999,    218,    883),
(2000,    218,    883),
(2001,    218,    883),
(1995,    300,    633),
(1996,    300,    633),
(1997,    300,    633),
(2003,    822,    119),
(2004,    822,    119),
(2005,    822,    119),
(2005,    992,    476),
-- rows containing a stray Key1 (344) and a stray Key2 (957)
(2002,    712,    683),
(2003,    344,    683),
(2004,    712,    683),
(2002,    473,    588),
(2003,    473,    588),
(2004,    473,    957),
-- the same Key1 values again, paired with new Key2 values (222, 111)
(2002,    712,    222),
(2003,    344,    222),
(2004,    712,    222),
(2002,    473,    111),
(2003,    473,    111),
(2004,    473,    111);

-- Lists rows whose ordinal among the rows sharing their Key1 differs from their
-- ordinal among the rows sharing their Key2 (candidates for an inconsistent key).
-- NOTE(review): ORDER BY (SELECT 1) does not define an order, so RowNumber is
-- whatever order the engine happens to produce - confirm it matches load order.
;WITH NumberedRows AS
(
    SELECT
        Year,
        Key1,
        Key2,
        ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
    FROM MyData
),
KeyOccurrences AS
(
    -- Ordinal of each row within its Key1 partition and within its Key2 partition.
    SELECT
        Year,
        Key1,
        Key2,
        RowNumber,
        RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
        RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
    FROM NumberedRows
)
SELECT
    cur.Year,
    cur.Key1,
    cur.Key2,
    cur.RowNumber,
    -- 2 when no first-occurrence row was joined (or its Year is NULL), otherwise 1
    CASE WHEN firstOfKey1.Year IS NULL THEN 2 ELSE 1 END AS DuplicateKeyNumber
FROM KeyOccurrences AS cur
LEFT JOIN KeyOccurrences AS firstOfKey1
    -- firstOfKey1 is the first row seen for this Key1, joined only if it has the same Key2
    ON  firstOfKey1.Key1 = cur.Key1
    AND firstOfKey1.Key2 = cur.Key2
    AND firstOfKey1.Key1InstanceNumber = 1
WHERE cur.Key1InstanceNumber <> cur.Key2InstanceNumber
  AND (
        firstOfKey1.Key1InstanceNumber IS NULL
        OR firstOfKey1.Key1InstanceNumber <> firstOfKey1.Key2InstanceNumber
      )
ORDER BY cur.RowNumber

| Year | Key1 | Key2 | RowNumber | DuplicateKeyNumber |
|------|------|------|-----------|--------------------|
| 2003 |  344 |  683 |        13 |                  1 |
| 2004 |  473 |  957 |        17 |                  2 |
| 2002 |  712 |  222 |        18 |                  2 |
| 2004 |  712 |  222 |        20 |                  2 |
| 2002 |  473 |  111 |        21 |                  2 |
| 2003 |  473 |  111 |        22 |                  2 |
| 2004 |  473 |  111 |        23 |                  2 |

-- Same report as the CTE version, written with derived tables only (no WITH).
-- NOTE(review): ORDER BY (SELECT 1) does not define an order, and the numbering
-- is computed separately on each side of the join - confirm both sides number
-- the rows identically on your data.
SELECT
    cur.Year,
    cur.Key1,
    cur.Key2,
    cur.RowNumber,
    -- 2 when no first-occurrence row was joined (or its Year is NULL), otherwise 1
    CASE WHEN firstOfKey1.Year IS NULL THEN 2 ELSE 1 END AS DuplicateKeyNumber
FROM
    (
        -- Ordinal of each row within its Key1 partition and within its Key2 partition.
        SELECT
            Year,
            Key1,
            Key2,
            RowNumber,
            RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
            RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
        FROM
            (
                SELECT
                    Year,
                    Key1,
                    Key2,
                    ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
                FROM MyData
            ) AS numbered
    ) AS cur
    LEFT JOIN
    (
        SELECT
            Year,
            Key1,
            Key2,
            RowNumber,
            RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
            RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
        FROM
            (
                SELECT
                    Year,
                    Key1,
                    Key2,
                    ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
                FROM MyData
            ) AS numbered
    ) AS firstOfKey1
        -- firstOfKey1 is the first row seen for this Key1, joined only if it has the same Key2
        ON  firstOfKey1.Key1 = cur.Key1
        AND firstOfKey1.Key2 = cur.Key2
        AND firstOfKey1.Key1InstanceNumber = 1
WHERE cur.Key1InstanceNumber <> cur.Key2InstanceNumber
  AND (
        firstOfKey1.Key1InstanceNumber IS NULL
        OR firstOfKey1.Key1InstanceNumber <> firstOfKey1.Key2InstanceNumber
      )
ORDER BY cur.RowNumber

| Year | Key1 | Key2 | RowNumber | DuplicateKeyNumber |
|------|------|------|-----------|--------------------|
| 2003 |  344 |  683 |        13 |                  1 |
| 2004 |  473 |  957 |        17 |                  2 |
| 2002 |  712 |  222 |        18 |                  2 |
| 2004 |  712 |  222 |        20 |                  2 |
| 2002 |  473 |  111 |        21 |                  2 |
| 2003 |  473 |  111 |        22 |                  2 |
| 2004 |  473 |  111 |        23 |                  2 |

查询1：

-- Sample data for the group-ID question: one row per (Year, Key1, Key2) observation.
-- The table has no column that records insertion order.
CREATE TABLE MyData( Year INT, Key1 INT, Key2 INT );

-- The explicit column list keeps this INSERT valid if columns are added or reordered later.
INSERT INTO MyData (Year, Key1, Key2) VALUES
-- rows where each Key1 is always paired with the same Key2
(1998,    218,    883),
(1999,    218,    883),
(2000,    218,    883),
(2001,    218,    883),
(1995,    300,    633),
(1996,    300,    633),
(1997,    300,    633),
(2003,    822,    119),
(2004,    822,    119),
(2005,    822,    119),
(2005,    992,    476),
-- rows containing a stray Key1 (344) and a stray Key2 (957)
(2002,    712,    683),
(2003,    344,    683),
(2004,    712,    683),
(2002,    473,    588),
(2003,    473,    588),
(2004,    473,    957),
-- the same Key1 values again, paired with new Key2 values (222, 111)
(2002,    712,    222),
(2003,    344,    222),
(2004,    712,    222),
(2002,    473,    111),
(2003,    473,    111),
(2004,    473,    111);

-- Lists rows whose ordinal among the rows sharing their Key1 differs from their
-- ordinal among the rows sharing their Key2 (candidates for an inconsistent key).
-- NOTE(review): ORDER BY (SELECT 1) does not define an order, so RowNumber is
-- whatever order the engine happens to produce - confirm it matches load order.
;WITH NumberedRows AS
(
    SELECT
        Year,
        Key1,
        Key2,
        ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
    FROM MyData
),
KeyOccurrences AS
(
    -- Ordinal of each row within its Key1 partition and within its Key2 partition.
    SELECT
        Year,
        Key1,
        Key2,
        RowNumber,
        RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
        RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
    FROM NumberedRows
)
SELECT
    cur.Year,
    cur.Key1,
    cur.Key2,
    cur.RowNumber,
    -- 2 when no first-occurrence row was joined (or its Year is NULL), otherwise 1
    CASE WHEN firstOfKey1.Year IS NULL THEN 2 ELSE 1 END AS DuplicateKeyNumber
FROM KeyOccurrences AS cur
LEFT JOIN KeyOccurrences AS firstOfKey1
    -- firstOfKey1 is the first row seen for this Key1, joined only if it has the same Key2
    ON  firstOfKey1.Key1 = cur.Key1
    AND firstOfKey1.Key2 = cur.Key2
    AND firstOfKey1.Key1InstanceNumber = 1
WHERE cur.Key1InstanceNumber <> cur.Key2InstanceNumber
  AND (
        firstOfKey1.Key1InstanceNumber IS NULL
        OR firstOfKey1.Key1InstanceNumber <> firstOfKey1.Key2InstanceNumber
      )
ORDER BY cur.RowNumber

| Year | Key1 | Key2 | RowNumber | DuplicateKeyNumber |
|------|------|------|-----------|--------------------|
| 2003 |  344 |  683 |        13 |                  1 |
| 2004 |  473 |  957 |        17 |                  2 |
| 2002 |  712 |  222 |        18 |                  2 |
| 2004 |  712 |  222 |        20 |                  2 |
| 2002 |  473 |  111 |        21 |                  2 |
| 2003 |  473 |  111 |        22 |                  2 |
| 2004 |  473 |  111 |        23 |                  2 |

-- Same report as the CTE version, written with derived tables only (no WITH).
-- NOTE(review): ORDER BY (SELECT 1) does not define an order, and the numbering
-- is computed separately on each side of the join - confirm both sides number
-- the rows identically on your data.
SELECT
    cur.Year,
    cur.Key1,
    cur.Key2,
    cur.RowNumber,
    -- 2 when no first-occurrence row was joined (or its Year is NULL), otherwise 1
    CASE WHEN firstOfKey1.Year IS NULL THEN 2 ELSE 1 END AS DuplicateKeyNumber
FROM
    (
        -- Ordinal of each row within its Key1 partition and within its Key2 partition.
        SELECT
            Year,
            Key1,
            Key2,
            RowNumber,
            RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
            RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
        FROM
            (
                SELECT
                    Year,
                    Key1,
                    Key2,
                    ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
                FROM MyData
            ) AS numbered
    ) AS cur
    LEFT JOIN
    (
        SELECT
            Year,
            Key1,
            Key2,
            RowNumber,
            RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
            RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
        FROM
            (
                SELECT
                    Year,
                    Key1,
                    Key2,
                    ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
                FROM MyData
            ) AS numbered
    ) AS firstOfKey1
        -- firstOfKey1 is the first row seen for this Key1, joined only if it has the same Key2
        ON  firstOfKey1.Key1 = cur.Key1
        AND firstOfKey1.Key2 = cur.Key2
        AND firstOfKey1.Key1InstanceNumber = 1
WHERE cur.Key1InstanceNumber <> cur.Key2InstanceNumber
  AND (
        firstOfKey1.Key1InstanceNumber IS NULL
        OR firstOfKey1.Key1InstanceNumber <> firstOfKey1.Key2InstanceNumber
      )
ORDER BY cur.RowNumber

| Year | Key1 | Key2 | RowNumber | DuplicateKeyNumber |
|------|------|------|-----------|--------------------|
| 2003 |  344 |  683 |        13 |                  1 |
| 2004 |  473 |  957 |        17 |                  2 |
| 2002 |  712 |  222 |        18 |                  2 |
| 2004 |  712 |  222 |        20 |                  2 |
| 2002 |  473 |  111 |        21 |                  2 |
| 2003 |  473 |  111 |        22 |                  2 |
| 2004 |  473 |  111 |        23 |                  2 |

不使用CTE的

查询1：

-- Sample data for the group-ID question: one row per (Year, Key1, Key2) observation.
-- The table has no column that records insertion order.
CREATE TABLE MyData( Year INT, Key1 INT, Key2 INT );

-- The explicit column list keeps this INSERT valid if columns are added or reordered later.
INSERT INTO MyData (Year, Key1, Key2) VALUES
-- rows where each Key1 is always paired with the same Key2
(1998,    218,    883),
(1999,    218,    883),
(2000,    218,    883),
(2001,    218,    883),
(1995,    300,    633),
(1996,    300,    633),
(1997,    300,    633),
(2003,    822,    119),
(2004,    822,    119),
(2005,    822,    119),
(2005,    992,    476),
-- rows containing a stray Key1 (344) and a stray Key2 (957)
(2002,    712,    683),
(2003,    344,    683),
(2004,    712,    683),
(2002,    473,    588),
(2003,    473,    588),
(2004,    473,    957),
-- the same Key1 values again, paired with new Key2 values (222, 111)
(2002,    712,    222),
(2003,    344,    222),
(2004,    712,    222),
(2002,    473,    111),
(2003,    473,    111),
(2004,    473,    111);

-- Lists rows whose ordinal among the rows sharing their Key1 differs from their
-- ordinal among the rows sharing their Key2 (candidates for an inconsistent key).
-- NOTE(review): ORDER BY (SELECT 1) does not define an order, so RowNumber is
-- whatever order the engine happens to produce - confirm it matches load order.
;WITH NumberedRows AS
(
    SELECT
        Year,
        Key1,
        Key2,
        ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
    FROM MyData
),
KeyOccurrences AS
(
    -- Ordinal of each row within its Key1 partition and within its Key2 partition.
    SELECT
        Year,
        Key1,
        Key2,
        RowNumber,
        RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
        RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
    FROM NumberedRows
)
SELECT
    cur.Year,
    cur.Key1,
    cur.Key2,
    cur.RowNumber,
    -- 2 when no first-occurrence row was joined (or its Year is NULL), otherwise 1
    CASE WHEN firstOfKey1.Year IS NULL THEN 2 ELSE 1 END AS DuplicateKeyNumber
FROM KeyOccurrences AS cur
LEFT JOIN KeyOccurrences AS firstOfKey1
    -- firstOfKey1 is the first row seen for this Key1, joined only if it has the same Key2
    ON  firstOfKey1.Key1 = cur.Key1
    AND firstOfKey1.Key2 = cur.Key2
    AND firstOfKey1.Key1InstanceNumber = 1
WHERE cur.Key1InstanceNumber <> cur.Key2InstanceNumber
  AND (
        firstOfKey1.Key1InstanceNumber IS NULL
        OR firstOfKey1.Key1InstanceNumber <> firstOfKey1.Key2InstanceNumber
      )
ORDER BY cur.RowNumber

| Year | Key1 | Key2 | RowNumber | DuplicateKeyNumber |
|------|------|------|-----------|--------------------|
| 2003 |  344 |  683 |        13 |                  1 |
| 2004 |  473 |  957 |        17 |                  2 |
| 2002 |  712 |  222 |        18 |                  2 |
| 2004 |  712 |  222 |        20 |                  2 |
| 2002 |  473 |  111 |        21 |                  2 |
| 2003 |  473 |  111 |        22 |                  2 |
| 2004 |  473 |  111 |        23 |                  2 |

-- Same report as the CTE version, written with derived tables only (no WITH).
-- NOTE(review): ORDER BY (SELECT 1) does not define an order, and the numbering
-- is computed separately on each side of the join - confirm both sides number
-- the rows identically on your data.
SELECT
    cur.Year,
    cur.Key1,
    cur.Key2,
    cur.RowNumber,
    -- 2 when no first-occurrence row was joined (or its Year is NULL), otherwise 1
    CASE WHEN firstOfKey1.Year IS NULL THEN 2 ELSE 1 END AS DuplicateKeyNumber
FROM
    (
        -- Ordinal of each row within its Key1 partition and within its Key2 partition.
        SELECT
            Year,
            Key1,
            Key2,
            RowNumber,
            RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
            RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
        FROM
            (
                SELECT
                    Year,
                    Key1,
                    Key2,
                    ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
                FROM MyData
            ) AS numbered
    ) AS cur
    LEFT JOIN
    (
        SELECT
            Year,
            Key1,
            Key2,
            RowNumber,
            RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
            RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
        FROM
            (
                SELECT
                    Year,
                    Key1,
                    Key2,
                    ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
                FROM MyData
            ) AS numbered
    ) AS firstOfKey1
        -- firstOfKey1 is the first row seen for this Key1, joined only if it has the same Key2
        ON  firstOfKey1.Key1 = cur.Key1
        AND firstOfKey1.Key2 = cur.Key2
        AND firstOfKey1.Key1InstanceNumber = 1
WHERE cur.Key1InstanceNumber <> cur.Key2InstanceNumber
  AND (
        firstOfKey1.Key1InstanceNumber IS NULL
        OR firstOfKey1.Key1InstanceNumber <> firstOfKey1.Key2InstanceNumber
      )
ORDER BY cur.RowNumber

| Year | Key1 | Key2 | RowNumber | DuplicateKeyNumber |
|------|------|------|-----------|--------------------|
| 2003 |  344 |  683 |        13 |                  1 |
| 2004 |  473 |  957 |        17 |                  2 |
| 2002 |  712 |  222 |        18 |                  2 |
| 2004 |  712 |  222 |        20 |                  2 |
| 2002 |  473 |  111 |        21 |                  2 |
| 2003 |  473 |  111 |        22 |                  2 |
| 2004 |  473 |  111 |        23 |                  2 |

下面是一个使用MS SQL Server的SQL Fiddle解决方案。对于大型数据集，使用带索引的临时表的性能将优于公用表表达式（CTE）。不过，这只是在时间允许时可以改进的地方

MS SQL Server 2017架构设置：

-- Sample data for the group-ID question: one row per (Year, Key1, Key2) observation.
-- The table has no column that records insertion order.
CREATE TABLE MyData( Year INT, Key1 INT, Key2 INT );

-- The explicit column list keeps this INSERT valid if columns are added or reordered later.
INSERT INTO MyData (Year, Key1, Key2) VALUES
-- rows where each Key1 is always paired with the same Key2
(1998,    218,    883),
(1999,    218,    883),
(2000,    218,    883),
(2001,    218,    883),
(1995,    300,    633),
(1996,    300,    633),
(1997,    300,    633),
(2003,    822,    119),
(2004,    822,    119),
(2005,    822,    119),
(2005,    992,    476),
-- rows containing a stray Key1 (344) and a stray Key2 (957)
(2002,    712,    683),
(2003,    344,    683),
(2004,    712,    683),
(2002,    473,    588),
(2003,    473,    588),
(2004,    473,    957),
-- the same Key1 values again, paired with new Key2 values (222, 111)
(2002,    712,    222),
(2003,    344,    222),
(2004,    712,    222),
(2002,    473,    111),
(2003,    473,    111),
(2004,    473,    111);

-- Lists rows whose ordinal among the rows sharing their Key1 differs from their
-- ordinal among the rows sharing their Key2 (candidates for an inconsistent key).
-- NOTE(review): ORDER BY (SELECT 1) does not define an order, so RowNumber is
-- whatever order the engine happens to produce - confirm it matches load order.
;WITH NumberedRows AS
(
    SELECT
        Year,
        Key1,
        Key2,
        ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
    FROM MyData
),
KeyOccurrences AS
(
    -- Ordinal of each row within its Key1 partition and within its Key2 partition.
    SELECT
        Year,
        Key1,
        Key2,
        RowNumber,
        RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
        RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
    FROM NumberedRows
)
SELECT
    cur.Year,
    cur.Key1,
    cur.Key2,
    cur.RowNumber,
    -- 2 when no first-occurrence row was joined (or its Year is NULL), otherwise 1
    CASE WHEN firstOfKey1.Year IS NULL THEN 2 ELSE 1 END AS DuplicateKeyNumber
FROM KeyOccurrences AS cur
LEFT JOIN KeyOccurrences AS firstOfKey1
    -- firstOfKey1 is the first row seen for this Key1, joined only if it has the same Key2
    ON  firstOfKey1.Key1 = cur.Key1
    AND firstOfKey1.Key2 = cur.Key2
    AND firstOfKey1.Key1InstanceNumber = 1
WHERE cur.Key1InstanceNumber <> cur.Key2InstanceNumber
  AND (
        firstOfKey1.Key1InstanceNumber IS NULL
        OR firstOfKey1.Key1InstanceNumber <> firstOfKey1.Key2InstanceNumber
      )
ORDER BY cur.RowNumber

| Year | Key1 | Key2 | RowNumber | DuplicateKeyNumber |
|------|------|------|-----------|--------------------|
| 2003 |  344 |  683 |        13 |                  1 |
| 2004 |  473 |  957 |        17 |                  2 |
| 2002 |  712 |  222 |        18 |                  2 |
| 2004 |  712 |  222 |        20 |                  2 |
| 2002 |  473 |  111 |        21 |                  2 |
| 2003 |  473 |  111 |        22 |                  2 |
| 2004 |  473 |  111 |        23 |                  2 |

-- Same report as the CTE version, written with derived tables only (no WITH).
-- NOTE(review): ORDER BY (SELECT 1) does not define an order, and the numbering
-- is computed separately on each side of the join - confirm both sides number
-- the rows identically on your data.
SELECT
    cur.Year,
    cur.Key1,
    cur.Key2,
    cur.RowNumber,
    -- 2 when no first-occurrence row was joined (or its Year is NULL), otherwise 1
    CASE WHEN firstOfKey1.Year IS NULL THEN 2 ELSE 1 END AS DuplicateKeyNumber
FROM
    (
        -- Ordinal of each row within its Key1 partition and within its Key2 partition.
        SELECT
            Year,
            Key1,
            Key2,
            RowNumber,
            RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
            RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
        FROM
            (
                SELECT
                    Year,
                    Key1,
                    Key2,
                    ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
                FROM MyData
            ) AS numbered
    ) AS cur
    LEFT JOIN
    (
        SELECT
            Year,
            Key1,
            Key2,
            RowNumber,
            RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
            RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
        FROM
            (
                SELECT
                    Year,
                    Key1,
                    Key2,
                    ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
                FROM MyData
            ) AS numbered
    ) AS firstOfKey1
        -- firstOfKey1 is the first row seen for this Key1, joined only if it has the same Key2
        ON  firstOfKey1.Key1 = cur.Key1
        AND firstOfKey1.Key2 = cur.Key2
        AND firstOfKey1.Key1InstanceNumber = 1
WHERE cur.Key1InstanceNumber <> cur.Key2InstanceNumber
  AND (
        firstOfKey1.Key1InstanceNumber IS NULL
        OR firstOfKey1.Key1InstanceNumber <> firstOfKey1.Key2InstanceNumber
      )
ORDER BY cur.RowNumber

| Year | Key1 | Key2 | RowNumber | DuplicateKeyNumber |
|------|------|------|-----------|--------------------|
| 2003 |  344 |  683 |        13 |                  1 |
| 2004 |  473 |  957 |        17 |                  2 |
| 2002 |  712 |  222 |        18 |                  2 |
| 2004 |  712 |  222 |        20 |                  2 |
| 2002 |  473 |  111 |        21 |                  2 |
| 2003 |  473 |  111 |        22 |                  2 |
| 2004 |  473 |  111 |        23 |                  2 |

查询1：

-- Sample data for the group-ID question: one row per (Year, Key1, Key2) observation.
-- The table has no column that records insertion order.
CREATE TABLE MyData( Year INT, Key1 INT, Key2 INT );

-- The explicit column list keeps this INSERT valid if columns are added or reordered later.
INSERT INTO MyData (Year, Key1, Key2) VALUES
-- rows where each Key1 is always paired with the same Key2
(1998,    218,    883),
(1999,    218,    883),
(2000,    218,    883),
(2001,    218,    883),
(1995,    300,    633),
(1996,    300,    633),
(1997,    300,    633),
(2003,    822,    119),
(2004,    822,    119),
(2005,    822,    119),
(2005,    992,    476),
-- rows containing a stray Key1 (344) and a stray Key2 (957)
(2002,    712,    683),
(2003,    344,    683),
(2004,    712,    683),
(2002,    473,    588),
(2003,    473,    588),
(2004,    473,    957),
-- the same Key1 values again, paired with new Key2 values (222, 111)
(2002,    712,    222),
(2003,    344,    222),
(2004,    712,    222),
(2002,    473,    111),
(2003,    473,    111),
(2004,    473,    111);

-- Lists rows whose ordinal among the rows sharing their Key1 differs from their
-- ordinal among the rows sharing their Key2 (candidates for an inconsistent key).
-- NOTE(review): ORDER BY (SELECT 1) does not define an order, so RowNumber is
-- whatever order the engine happens to produce - confirm it matches load order.
;WITH NumberedRows AS
(
    SELECT
        Year,
        Key1,
        Key2,
        ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
    FROM MyData
),
KeyOccurrences AS
(
    -- Ordinal of each row within its Key1 partition and within its Key2 partition.
    SELECT
        Year,
        Key1,
        Key2,
        RowNumber,
        RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
        RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
    FROM NumberedRows
)
SELECT
    cur.Year,
    cur.Key1,
    cur.Key2,
    cur.RowNumber,
    -- 2 when no first-occurrence row was joined (or its Year is NULL), otherwise 1
    CASE WHEN firstOfKey1.Year IS NULL THEN 2 ELSE 1 END AS DuplicateKeyNumber
FROM KeyOccurrences AS cur
LEFT JOIN KeyOccurrences AS firstOfKey1
    -- firstOfKey1 is the first row seen for this Key1, joined only if it has the same Key2
    ON  firstOfKey1.Key1 = cur.Key1
    AND firstOfKey1.Key2 = cur.Key2
    AND firstOfKey1.Key1InstanceNumber = 1
WHERE cur.Key1InstanceNumber <> cur.Key2InstanceNumber
  AND (
        firstOfKey1.Key1InstanceNumber IS NULL
        OR firstOfKey1.Key1InstanceNumber <> firstOfKey1.Key2InstanceNumber
      )
ORDER BY cur.RowNumber

| Year | Key1 | Key2 | RowNumber | DuplicateKeyNumber |
|------|------|------|-----------|--------------------|
| 2003 |  344 |  683 |        13 |                  1 |
| 2004 |  473 |  957 |        17 |                  2 |
| 2002 |  712 |  222 |        18 |                  2 |
| 2004 |  712 |  222 |        20 |                  2 |
| 2002 |  473 |  111 |        21 |                  2 |
| 2003 |  473 |  111 |        22 |                  2 |
| 2004 |  473 |  111 |        23 |                  2 |

-- Same report as the CTE version, written with derived tables only (no WITH).
-- NOTE(review): ORDER BY (SELECT 1) does not define an order, and the numbering
-- is computed separately on each side of the join - confirm both sides number
-- the rows identically on your data.
SELECT
    cur.Year,
    cur.Key1,
    cur.Key2,
    cur.RowNumber,
    -- 2 when no first-occurrence row was joined (or its Year is NULL), otherwise 1
    CASE WHEN firstOfKey1.Year IS NULL THEN 2 ELSE 1 END AS DuplicateKeyNumber
FROM
    (
        -- Ordinal of each row within its Key1 partition and within its Key2 partition.
        SELECT
            Year,
            Key1,
            Key2,
            RowNumber,
            RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
            RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
        FROM
            (
                SELECT
                    Year,
                    Key1,
                    Key2,
                    ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
                FROM MyData
            ) AS numbered
    ) AS cur
    LEFT JOIN
    (
        SELECT
            Year,
            Key1,
            Key2,
            RowNumber,
            RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
            RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
        FROM
            (
                SELECT
                    Year,
                    Key1,
                    Key2,
                    ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
                FROM MyData
            ) AS numbered
    ) AS firstOfKey1
        -- firstOfKey1 is the first row seen for this Key1, joined only if it has the same Key2
        ON  firstOfKey1.Key1 = cur.Key1
        AND firstOfKey1.Key2 = cur.Key2
        AND firstOfKey1.Key1InstanceNumber = 1
WHERE cur.Key1InstanceNumber <> cur.Key2InstanceNumber
  AND (
        firstOfKey1.Key1InstanceNumber IS NULL
        OR firstOfKey1.Key1InstanceNumber <> firstOfKey1.Key2InstanceNumber
      )
ORDER BY cur.RowNumber

| Year | Key1 | Key2 | RowNumber | DuplicateKeyNumber |
|------|------|------|-----------|--------------------|
| 2003 |  344 |  683 |        13 |                  1 |
| 2004 |  473 |  957 |        17 |                  2 |
| 2002 |  712 |  222 |        18 |                  2 |
| 2004 |  712 |  222 |        20 |                  2 |
| 2002 |  473 |  111 |        21 |                  2 |
| 2003 |  473 |  111 |        22 |                  2 |
| 2004 |  473 |  111 |        23 |                  2 |

不使用CTE的

查询1：

-- Sample data for the group-ID question: one row per (Year, Key1, Key2) observation.
-- The table has no column that records insertion order.
CREATE TABLE MyData( Year INT, Key1 INT, Key2 INT );

-- The explicit column list keeps this INSERT valid if columns are added or reordered later.
INSERT INTO MyData (Year, Key1, Key2) VALUES
-- rows where each Key1 is always paired with the same Key2
(1998,    218,    883),
(1999,    218,    883),
(2000,    218,    883),
(2001,    218,    883),
(1995,    300,    633),
(1996,    300,    633),
(1997,    300,    633),
(2003,    822,    119),
(2004,    822,    119),
(2005,    822,    119),
(2005,    992,    476),
-- rows containing a stray Key1 (344) and a stray Key2 (957)
(2002,    712,    683),
(2003,    344,    683),
(2004,    712,    683),
(2002,    473,    588),
(2003,    473,    588),
(2004,    473,    957),
-- the same Key1 values again, paired with new Key2 values (222, 111)
(2002,    712,    222),
(2003,    344,    222),
(2004,    712,    222),
(2002,    473,    111),
(2003,    473,    111),
(2004,    473,    111);

-- Lists rows whose ordinal among the rows sharing their Key1 differs from their
-- ordinal among the rows sharing their Key2 (candidates for an inconsistent key).
-- NOTE(review): ORDER BY (SELECT 1) does not define an order, so RowNumber is
-- whatever order the engine happens to produce - confirm it matches load order.
;WITH NumberedRows AS
(
    SELECT
        Year,
        Key1,
        Key2,
        ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
    FROM MyData
),
KeyOccurrences AS
(
    -- Ordinal of each row within its Key1 partition and within its Key2 partition.
    SELECT
        Year,
        Key1,
        Key2,
        RowNumber,
        RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
        RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
    FROM NumberedRows
)
SELECT
    cur.Year,
    cur.Key1,
    cur.Key2,
    cur.RowNumber,
    -- 2 when no first-occurrence row was joined (or its Year is NULL), otherwise 1
    CASE WHEN firstOfKey1.Year IS NULL THEN 2 ELSE 1 END AS DuplicateKeyNumber
FROM KeyOccurrences AS cur
LEFT JOIN KeyOccurrences AS firstOfKey1
    -- firstOfKey1 is the first row seen for this Key1, joined only if it has the same Key2
    ON  firstOfKey1.Key1 = cur.Key1
    AND firstOfKey1.Key2 = cur.Key2
    AND firstOfKey1.Key1InstanceNumber = 1
WHERE cur.Key1InstanceNumber <> cur.Key2InstanceNumber
  AND (
        firstOfKey1.Key1InstanceNumber IS NULL
        OR firstOfKey1.Key1InstanceNumber <> firstOfKey1.Key2InstanceNumber
      )
ORDER BY cur.RowNumber

| Year | Key1 | Key2 | RowNumber | DuplicateKeyNumber |
|------|------|------|-----------|--------------------|
| 2003 |  344 |  683 |        13 |                  1 |
| 2004 |  473 |  957 |        17 |                  2 |
| 2002 |  712 |  222 |        18 |                  2 |
| 2004 |  712 |  222 |        20 |                  2 |
| 2002 |  473 |  111 |        21 |                  2 |
| 2003 |  473 |  111 |        22 |                  2 |
| 2004 |  473 |  111 |        23 |                  2 |

-- Same report as the CTE version, written with derived tables only (no WITH).
-- NOTE(review): ORDER BY (SELECT 1) does not define an order, and the numbering
-- is computed separately on each side of the join - confirm both sides number
-- the rows identically on your data.
SELECT
    cur.Year,
    cur.Key1,
    cur.Key2,
    cur.RowNumber,
    -- 2 when no first-occurrence row was joined (or its Year is NULL), otherwise 1
    CASE WHEN firstOfKey1.Year IS NULL THEN 2 ELSE 1 END AS DuplicateKeyNumber
FROM
    (
        -- Ordinal of each row within its Key1 partition and within its Key2 partition.
        SELECT
            Year,
            Key1,
            Key2,
            RowNumber,
            RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
            RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
        FROM
            (
                SELECT
                    Year,
                    Key1,
                    Key2,
                    ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
                FROM MyData
            ) AS numbered
    ) AS cur
    LEFT JOIN
    (
        SELECT
            Year,
            Key1,
            Key2,
            RowNumber,
            RANK() OVER (PARTITION BY Key1 ORDER BY RowNumber) AS Key1InstanceNumber,
            RANK() OVER (PARTITION BY Key2 ORDER BY RowNumber) AS Key2InstanceNumber
        FROM
            (
                SELECT
                    Year,
                    Key1,
                    Key2,
                    ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY (SELECT 1)) AS RowNumber
                FROM MyData
            ) AS numbered
    ) AS firstOfKey1
        -- firstOfKey1 is the first row seen for this Key1, joined only if it has the same Key2
        ON  firstOfKey1.Key1 = cur.Key1
        AND firstOfKey1.Key2 = cur.Key2
        AND firstOfKey1.Key1InstanceNumber = 1
WHERE cur.Key1InstanceNumber <> cur.Key2InstanceNumber
  AND (
        firstOfKey1.Key1InstanceNumber IS NULL
        OR firstOfKey1.Key1InstanceNumber <> firstOfKey1.Key2InstanceNumber
      )
ORDER BY cur.RowNumber

| Year | Key1 | Key2 | RowNumber | DuplicateKeyNumber |
|------|------|------|-----------|--------------------|
| 2003 |  344 |  683 |        13 |                  1 |
| 2004 |  473 |  957 |        17 |                  2 |
| 2002 |  712 |  222 |        18 |                  2 |
| 2004 |  712 |  222 |        20 |                  2 |
| 2002 |  473 |  111 |        21 |                  2 |
| 2003 |  473 |  111 |        22 |                  2 |
| 2004 |  473 |  111 |        23 |                  2 |

您可以使用way back机器拨打2004年的“如何按名字或姓氏分组”并查看

给定一组对（键1、键2），找出配对，使组中的每对都具有以下属性：

key1与组中任何其他对的key1匹配。
或
key2匹配组中任何其他对的key2

使用SAS 9哈希对象的迭代方法。两个哈希保存分配给每个键值的groupId，另外两个哈希用于维护组之间的映射路径。如果能够完整遍历一遍数据而不产生任何新的映射，则各组已完全确定。然后再执行最后一遍，此时将groupId分配给每一对。

示例代码

/* Build the macro variable seed from the text of constant(e): compress removes
   the decimal point and the result is reduced with mod by 2**31. */
%let seed =
%sysfunc(mod(%sysfunc(compress(%sysfunc(constant(e)),.)),2**31));

* random pair data;
/* Test data: 1e4 rows with a sequential id and two keys, each computed as
   int(1e4*ranuni(seed)). The order of the two ranuni calls is kept as written. */
data pairs;
  do id = 1 to 1e4;
    key1 = int (1e4*ranuni(&seed));
    key2 = int (1e4*ranuni(&seed));
    output;
  end;
run;

/*
data pairs;
  id + 1;
  input key1 $ key2 $;
  cards;
John Smith
George Smith
Bill Clinton
George Bush
;

data pairs;
  id + 1;
  input key1 key2 ;
  format _numeric_ 4.;
  cards;
1 2
3 2
4 5
3 6
4 2
;
*/

/* Debug switch: with the value * every dbg-prefixed put line below turns into a
   statement-style comment, so no trace is printed. */
%let dbg = *;

/* Assigns a groupId to every row of pairs so that rows linked through a shared
   key1 or a shared key2 end up in the same group. The data is re-read in
   repeated passes until a pass records no remapping, after which one further
   pass writes the output rows. */
data pairsWithGroupAssignments ;

  /* Lookup: key1 value -> groupId currently assigned to that key1. */
  declare hash one();
  one.definekey ('key1');
  one.definedata ('key1', 'groupid');
  one.definedone();

  /* Lookup: key2 value -> groupId currently assigned to that key2. */
  declare hash two();
  two.definekey ('key2');
  two.definedata ('key2', 'groupid');
  two.definedone();

  /* Group remappings (from group -> to group) recorded when a key1 is moved. */
  declare hash map1();
  map1.definekey ('from');
  map1.definedata ('from', 'to');
  map1.definedone();

  /* Group remappings (from group -> to group) recorded when a key2 is moved. */
  declare hash map2();
  map2.definekey ('from');
  map2.definedata ('from', 'to');
  map2.definedone();

  /* _groupId counts the groups created so far. */
  _groupId = 0;
  noMappings = 0;

  /* Repeat full passes over pairs until the output pass has been completed. */
  do until (noMappings and outputDone);

    /* A pass is the output pass only when the previous pass remapped nothing. */
    doOutput = noMappings;
    noMappings = 1;

    do _n_ = 1 to numberOfPairs;

      /* point= reads observation _n_ directly, so the data can be re-read on every pass. */
      set pairs nobs=numberOfPairs point=_n_;

      /* find returns 0 and loads groupId when the key is stored; nonzero means not stored. */
      rc1 = one.find(); g1 = groupId;
      rc2 = two.find(); g2 = groupId;

      /* Output pass: write the row with the groupId left by the lookups, then next row. */
      if doOutput then do;
        output;
        continue;
      end;

      &dbg.
      put id= '(' key1 +(-1) ', ' key2 +(-1) ') ' @;

      if rc1 ne 0 and rc2 ne 0 then do;
        /* Neither key is known yet: open a new group and register both keys in it. */
        /** /
        addboth:
        /**/
        _groupId + 1;
        groupId = _groupId;
        one.add ();
        two.add ();
        &dbg. put 'add ' key1= 'and ' key2= 'to ' groupId=;
      end;
      else
      if rc1 ne 0 and rc2 = 0 then do;
        /* Only key2 is known: key1 joins the group of key2. */
        /** /
        add1:
        /**/
        groupId = g2;
        one.add();
        &dbg. put 'add ' key1= 'to ' groupId=;
      end;
      else
      if rc1 = 0 and rc2 ne 0 then do;
        /* Only key1 is known: key2 joins the group of key1. */
        /** /
        add2:
        /**/
        groupId = g1;
        two.add();
        &dbg. put 'add ' key2= 'to ' groupId=;
      end;
      else
      if g1 > g2 then do;
        /* Both keys are known and key1 is in the higher-numbered group:
           move key1 to the lower group and record the remapping. */
        /** /
        g1g2:
        /**/

        from = g1;
        to = g2;

        * determine groupid by following map1;
        _to = to;
        /* Each successful find overwrites from and to with the stored link, so the
           loop walks the chain until to names a group with no further mapping. */
        do while (map1.find(key:_to) = 0);
          _to = to;
        end;

        /* find may have overwritten from, so it is reset before storing g1 -> to. */
        from = g1;
        map1.replace();

        groupId = to;
        one.replace();

        &dbg. put 'add ' key1= 'to ' groupId= 'mapped from key1 group ' from;

        /* A remapping happened, so another pass is required. */
        noMappings = 0;
      end;
      else
      if g2 > g1 then do;
        /* Mirror case: key2 is in the higher-numbered group and is moved down. */
        /** /
        g2g1:
        /**/

        from = g2;
        to = g1;

        * determine groupid by following map2;
        to_ = to;
        /* Walk the map2 chain in the same way as the map1 chain above. */
        do while (map2.find(key:to_) = 0);
          to_ = to;
        end;

        /* find may have overwritten from, so it is reset before storing g2 -> to. */
        from = g2;
        map2.replace();

        groupId = to;
        two.replace();

        &dbg. put 'add ' key2= 'to ' groupId= 'mapped from key2 group ' from;

        /* A remapping happened, so another pass is required. */
        noMappings = 0;
      end;
      else do;
        /* Both keys already carry the same group: nothing to change. */
        /** /
        same:
        /**/
        &dbg. put rc1= rc2= g1= g2=;
      end;
    end;

    /* Count completed passes for the log note below. */
    nPass + 1;

    outputDone = doOutput;
  end;

  put 'NOTE: Data iterated ' npass 'times.';

/*
  two.output(dataset:'g2');
  one.output(dataset:'g1');
  map1.output(dataset:'map1');
  map2.output(dataset:'map2');
*/
  /* stop ends the data step explicitly (the set statement above uses point=). */
  stop;

  /* Only the input columns and the assigned group are written to the output. */
  keep id key1 key2 groupId;
  format _numeric_ 8.;
run;

/* Count the distinct groupId values in the most recently created data set and
   store the count in the macro variable ngroups. */
proc sql noprint;
  /* trimmed removes the leading blanks that INTO otherwise keeps on the value */
  select count(distinct groupId) into :ngroups trimmed from &syslast;
quit;  /* ends PROC SQL so the procedure does not stay active */

%put &=ngroups;

在中讨论了通过网络跟踪关系或路径

几年前的几个listserv线程（您可以使用way back机器拨打2004年的“如何按名字或姓氏分组”并查看

给定一组对（键1、键2），找出配对，使组中的每对都具有以下属性：

key1与组中任何其他对的key1匹配。
或
key2匹配组中任何其他对的key2

示例代码

/* Build the macro variable seed from the text of constant(e): compress removes
   the decimal point and the result is reduced with mod by 2**31. */
%let seed =
%sysfunc(mod(%sysfunc(compress(%sysfunc(constant(e)),.)),2**31));

* random pair data;
/* Test data: 1e4 rows with a sequential id and two keys, each computed as
   int(1e4*ranuni(seed)). The order of the two ranuni calls is kept as written. */
data pairs;
  do id = 1 to 1e4;
    key1 = int (1e4*ranuni(&seed));
    key2 = int (1e4*ranuni(&seed));
    output;
  end;
run;

/*
data pairs;
  id + 1;
  input key1 $ key2 $;
  cards;
John Smith
George Smith
Bill Clinton
George Bush
;

data pairs;
  id + 1;
  input key1 key2 ;
  format _numeric_ 4.;
  cards;
1 2
3 2
4 5
3 6
4 2
;
*/

/* Debug switch: with the value * every dbg-prefixed put line below turns into a
   statement-style comment, so no trace is printed. */
%let dbg = *;

/* Assigns a groupId to every row of pairs so that rows linked through a shared
   key1 or a shared key2 end up in the same group. The data is re-read in
   repeated passes until a pass records no remapping, after which one further
   pass writes the output rows. */
data pairsWithGroupAssignments ;

  /* Lookup: key1 value -> groupId currently assigned to that key1. */
  declare hash one();
  one.definekey ('key1');
  one.definedata ('key1', 'groupid');
  one.definedone();

  /* Lookup: key2 value -> groupId currently assigned to that key2. */
  declare hash two();
  two.definekey ('key2');
  two.definedata ('key2', 'groupid');
  two.definedone();

  /* Group remappings (from group -> to group) recorded when a key1 is moved. */
  declare hash map1();
  map1.definekey ('from');
  map1.definedata ('from', 'to');
  map1.definedone();

  /* Group remappings (from group -> to group) recorded when a key2 is moved. */
  declare hash map2();
  map2.definekey ('from');
  map2.definedata ('from', 'to');
  map2.definedone();

  /* _groupId counts the groups created so far. */
  _groupId = 0;
  noMappings = 0;

  /* Repeat full passes over pairs until the output pass has been completed. */
  do until (noMappings and outputDone);

    /* A pass is the output pass only when the previous pass remapped nothing. */
    doOutput = noMappings;
    noMappings = 1;

    do _n_ = 1 to numberOfPairs;

      /* point= reads observation _n_ directly, so the data can be re-read on every pass. */
      set pairs nobs=numberOfPairs point=_n_;

      /* find returns 0 and loads groupId when the key is stored; nonzero means not stored. */
      rc1 = one.find(); g1 = groupId;
      rc2 = two.find(); g2 = groupId;

      /* Output pass: write the row with the groupId left by the lookups, then next row. */
      if doOutput then do;
        output;
        continue;
      end;

      &dbg.
      put id= '(' key1 +(-1) ', ' key2 +(-1) ') ' @;

      if rc1 ne 0 and rc2 ne 0 then do;
        /* Neither key is known yet: open a new group and register both keys in it. */
        /** /
        addboth:
        /**/
        _groupId + 1;
        groupId = _groupId;
        one.add ();
        two.add ();
        &dbg. put 'add ' key1= 'and ' key2= 'to ' groupId=;
      end;
      else
      if rc1 ne 0 and rc2 = 0 then do;
        /* Only key2 is known: key1 joins the group of key2. */
        /** /
        add1:
        /**/
        groupId = g2;
        one.add();
        &dbg. put 'add ' key1= 'to ' groupId=;
      end;
      else
      if rc1 = 0 and rc2 ne 0 then do;
        /* Only key1 is known: key2 joins the group of key1. */
        /** /
        add2:
        /**/
        groupId = g1;
        two.add();
        &dbg. put 'add ' key2= 'to ' groupId=;
      end;
      else
      if g1 > g2 then do;
        /* Both keys are known and key1 is in the higher-numbered group:
           move key1 to the lower group and record the remapping. */
        /** /
        g1g2:
        /**/

        from = g1;
        to = g2;

        * determine groupid by following map1;
        _to = to;
        /* Each successful find overwrites from and to with the stored link, so the
           loop walks the chain until to names a group with no further mapping. */
        do while (map1.find(key:_to) = 0);
          _to = to;
        end;

        /* find may have overwritten from, so it is reset before storing g1 -> to. */
        from = g1;
        map1.replace();

        groupId = to;
        one.replace();

        &dbg. put 'add ' key1= 'to ' groupId= 'mapped from key1 group ' from;

        /* A remapping happened, so another pass is required. */
        noMappings = 0;
      end;
      else
      if g2 > g1 then do;
        /* Mirror case: key2 is in the higher-numbered group and is moved down. */
        /** /
        g2g1:
        /**/

        from = g2;
        to = g1;

        * determine groupid by following map2;
        to_ = to;
        /* Walk the map2 chain in the same way as the map1 chain above. */
        do while (map2.find(key:to_) = 0);
          to_ = to;
        end;

        /* find may have overwritten from, so it is reset before storing g2 -> to. */
        from = g2;
        map2.replace();

        groupId = to;
        two.replace();

        &dbg. put 'add ' key2= 'to ' groupId= 'mapped from key2 group ' from;

        /* A remapping happened, so another pass is required. */
        noMappings = 0;
      end;
      else do;
        /* Both keys already carry the same group: nothing to change. */
        /** /
        same:
        /**/
        &dbg. put rc1= rc2= g1= g2=;
      end;
    end;

    /* Count completed passes for the log note below. */
    nPass + 1;

    outputDone = doOutput;
  end;

  put 'NOTE: Data iterated ' npass 'times.';

/*
  two.output(dataset:'g2');
  one.output(dataset:'g1');
  map1.output(dataset:'map1');
  map2.output(dataset:'map2');
*/
  /* stop ends the data step explicitly (the set statement above uses point=). */
  stop;

  /* Only the input columns and the assigned group are written to the output. */
  keep id key1 key2 groupId;
  format _numeric_ 8.;
run;

/* Count the distinct groupId values in the most recently created data set and
   store the count in the macro variable ngroups. */
proc sql noprint;
  /* trimmed removes the leading blanks that INTO otherwise keeps on the value */
  select count(distinct groupId) into :ngroups trimmed from &syslast;
quit;  /* ends PROC SQL so the procedure does not stay active */

%put &=ngroups;

在中讨论了通过网络跟踪关系或路径

几年前的几个listserv线程（嘿，迈克，看看这个问题。它看起来做了一些类似的事情：我的第一个想法是使用一个窗口函数来获得KEY1的每个值的不同值的计数。当该计数大于1时，KEY2可能是更好的键。我只会连接这两个字段以生成一个唯一的复合键。嘿，迈克，ch在这里勾选这个问题。它看起来做了一些类似的事情：我的第一个想法是使用一个窗口函数来获得KEY1的每个值的不同值的计数。当该计数大于1时，KEY2可能是更好的键。我只会连接这两个字段以生成一个唯一的复合键。这在SAS中不起作用，我认为除非OP有MS SQL并且可以使用SQL传递，否则t不支持WITH（）。这破坏了乐趣：）在不使用CTE的数据库和分区的情况下添加了更详细的fiddle:（这在SAS中不起作用，除非OP有MS SQL并且可以使用SQL传递，否则它不支持WITH（）。这破坏了乐趣：）在不使用CTE的数据库和分区的情况下添加了更详细的fiddle：(