R 多个变量的唯一性（）_R_Unique

R：对多个变量使用 unique()

R 多个变量的唯一性（）,r,unique,R,Unique,我在R中有以下数据帧： > str(df) 'data.frame': 545227 obs. of 15 variables: $ ykod : int 93 93 93 93 93 93 93 93 93 93 ... $ yad : Factor w/ 42 levels "BAKUGAN","BARBIE",..: 30 30 30 30 30 30 30 30 30 30 ... $ per : Factor w/ 3 levels "2 AYLIK","3 A

我在R中有以下数据帧：

> str(df)
'data.frame':   545227 obs. of  15 variables:
 $ ykod : int  93 93 93 93 93 93 93 93 93 93 ...
 $ yad  : Factor w/ 42 levels "BAKUGAN","BARBIE",..: 30 30 30 30 30 30 30 30 30 30 ...
 $ per  : Factor w/ 3 levels "2 AYLIK","3 AYLIK",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ donem: int  201101 201101 201101 201101 201101 201101 201101 201101 201101 201101 ...
 $ sayi : int  201101 201101 201101 201101 201101 201101 201101 201101 201101 201101 ...
 $ mkod : int  4 5 9 11 12 18 20 22 25 26 ...
 $ mad  : Factor w/ 10464 levels "   Defne Market          ",..: 405 8075 9710 10145 9297 7973 2542 3892 2759 5769 ...
 $ mtip : Factor w/ 29 levels "Abone Bürosu                                      ",..: 2 20 20 2 2 2 2 2 2 2 ...
 $ kanal: Factor w/ 2 levels "OB","SS": 2 2 2 2 2 2 2 2 2 2 ...
 $ bkod : int  110565 110565 110565 110565 110565 110565 110565 110565 110565 110565 ...
 $ bad  : Factor w/ 212 levels "4. Levent","500 Evler",..: 167 167 167 167 167 167 167 167 167 167 ...
 $ bolge: Factor w/ 12 levels "Adana Şehiriçi",..: 7 7 7 7 7 7 7 7 7 7 ...
 $ sevk : int  2 3 3 3 2 2 2 6 2 2 ...
 $ iade : int  2 1 0 2 0 2 1 0 0 2 ...
 $ satis: int  0 2 3 1 2 0 1 6 2 0 ...

我想列出所选多个变量的唯一值（如SQL的DISTINCT）。例如，

unique(yad)

会返回全部 42 个不同取值的名称，但我需要同时针对两列（

yad

和

per

）提取它们的所有唯一组合：

如何实现这一点？

有几种方法可以获得一组因子（factor）的所有唯一组合：

with(df, interaction(yad, per, drop=TRUE))   # gives labels
with(df, yad:per)                            # ditto

aggregate(numeric(nrow(df)), df[c("yad", "per")], length)    # gives a data frame

直接使用

unique()

本身怎么样？

df <- data.frame(yad = c("BARBIE", "BARBIE", "BAKUGAN", "BAKUGAN"),
                 per = c("AYLIK",  "AYLIK",  "2 AYLIK", "2 AYLIK"),
                 hmm = 1:4)

df
#       yad     per hmm
# 1  BARBIE   AYLIK   1
# 2  BARBIE   AYLIK   2
# 3 BAKUGAN 2 AYLIK   3
# 4 BAKUGAN 2 AYLIK   4

unique(df[c("yad", "per")])
#       yad     per
# 1  BARBIE   AYLIK
# 3 BAKUGAN 2 AYLIK

这是对 Josh 答案的补充。
您还可以在保留其他变量值的同时，过滤掉 data.table 中的重复行。
例如：
library(data.table)

#create data table
dt <- data.table(
  V1=LETTERS[c(1,1,1,1,2,3,3,5,7,1)],
  V2=LETTERS[c(2,3,4,2,1,4,4,6,7,2)],
  V3=c(1),
  V4=c(2) )

> dt
# V1 V2 V3 V4
# A  B  1  2
# A  C  1  2
# A  D  1  2
# A  B  1  2
# B  A  1  2
# C  D  1  2
# C  D  1  2
# E  F  1  2
# G  G  1  2
# A  B  1  2

# set the key to all columns
setkey(dt)

# Get Unique lines in the data table
unique( dt[list(V1, V2), nomatch = 0] ) 

# V1 V2 V3 V4
# A  B  1  2
# A  C  1  2
# A  D  1  2
# B  A  1  2
# C  D  1  2
# E  F  1  2
# G  G  1  2

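library(data.table)
# 创建数据表
dt <- data.table(
  V1=LETTERS[c(1,1,1,1,2,3,3,5,7,1)],
  V2=LETTERS[c(2,3,4,2,1,4,4,6,7,2)],
  V3=c(1),
  V4=c(2) )
dt
# V1 V2 V3 V4
# A  B  1  2
# A  C  1  2
# A  D  1  2
# A  B  1  2
# B  A  1  2
# C  D  1  2
# C  D  1  2
# E  F  1  2
# G  G  1  2
# A  B  1  2
# 将键设置为所有列
setkey(dt)
# 获取数据表中的唯一行
unique( dt[list(V1, V2), nomatch = 0] )
# V1 V2 V3 V4
# A  B  1  2
# A  C  1  2
# A  D  1  2
# B  A  1  2
# C  D  1  2
# E  F  1  2
# G  G  1  2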

警告：如果其他变量中存在不同的值组合，则结果将为 V1 和 V2 的唯一组合。

这种 dplyr 方法在使用管道（%>%）时非常有效。
对于所选列：
library(dplyr)
iris %>% 
  select(Sepal.Width, Species) %>% 
  t %>% c %>% unique

 [1] "3.5"        "setosa"     "3.0"        "3.2"        "3.1"       
 [6] "3.6"        "3.9"        "3.4"        "2.9"        "3.7"       
[11] "4.0"        "4.4"        "3.8"        "3.3"        "4.1"       
[16] "4.2"        "2.3"        "versicolor" "2.8"        "2.4"       
[21] "2.7"        "2.0"        "2.2"        "2.5"        "2.6"       
[26] "virginica" 

iris %>% t %>% c %>% unique 

 [1] "5.1"        "3.5"        "1.4"        "0.2"        "setosa"     "4.9"       
 [7] "3.0"        "4.7"        "3.2"        "1.3"        "4.6"        "3.1"       
[13] "1.5"        "5.0"        "3.6"        "5.4"        "3.9"        "1.7"       
[19] "0.4"        "3.4"        "0.3"        "4.4"        "2.9"        "0.1"       
[25] "3.7"        "4.8"        "1.6"        "4.3"        "1.1"        "5.8"       
[31] "4.0"        "1.2"        "5.7"        "3.8"        "1.0"        "3.3"       
[37] "0.5"        "1.9"        "5.2"        "4.1"        "5.5"        "4.2"       
[43] "4.5"        "2.3"        "0.6"        "5.3"        "7.0"        "versicolor"
[49] "6.4"        "6.9"        "6.5"        "2.8"        "6.3"        "2.4"       
[55] "6.6"        "2.7"        "2.0"        "5.9"        "6.0"        "2.2"       
[61] "6.1"        "5.6"        "6.7"        "6.2"        "2.5"        "1.8"       
[67] "6.8"        "2.6"        "virginica"  "7.1"        "2.1"        "7.6"       
[73] "7.3"        "7.2"        "7.7"        "7.4"        "7.9" 

或对于整个数据帧：
library(dplyr)
iris %>% 
  select(Sepal.Width, Species) %>% 
  t %>% c %>% unique

 [1] "3.5"        "setosa"     "3.0"        "3.2"        "3.1"       
 [6] "3.6"        "3.9"        "3.4"        "2.9"        "3.7"       
[11] "4.0"        "4.4"        "3.8"        "3.3"        "4.1"       
[16] "4.2"        "2.3"        "versicolor" "2.8"        "2.4"       
[21] "2.7"        "2.0"        "2.2"        "2.5"        "2.6"       
[26] "virginica" 

iris %>% t %>% c %>% unique 

 [1] "5.1"        "3.5"        "1.4"        "0.2"        "setosa"     "4.9"       
 [7] "3.0"        "4.7"        "3.2"        "1.3"        "4.6"        "3.1"       
[13] "1.5"        "5.0"        "3.6"        "5.4"        "3.9"        "1.7"       
[19] "0.4"        "3.4"        "0.3"        "4.4"        "2.9"        "0.1"       
[25] "3.7"        "4.8"        "1.6"        "4.3"        "1.1"        "5.8"       
[31] "4.0"        "1.2"        "5.7"        "3.8"        "1.0"        "3.3"       
[37] "0.5"        "1.9"        "5.2"        "4.1"        "5.5"        "4.2"       
[43] "4.5"        "2.3"        "0.6"        "5.3"        "7.0"        "versicolor"
[49] "6.4"        "6.9"        "6.5"        "2.8"        "6.3"        "2.4"       
[55] "6.6"        "2.7"        "2.0"        "5.9"        "6.0"        "2.2"       
[61] "6.1"        "5.6"        "6.7"        "6.2"        "2.5"        "1.8"       
[67] "6.8"        "2.6"        "virginica"  "7.1"        "2.1"        "7.6"       
[73] "7.3"        "7.2"        "7.7"        "7.4"        "7.9" 

基于任意若干列去重，并保留所有其他列：
df %>% distinct(col1, col2, .keep_all = TRUE)
+1 还建议规范化字符串（tolower、用 gsub 去除特殊字符等）。
如果 df 是一个矩阵，该怎么做？我应该把它转换成 data.frame，还是有现成的函数可用？
事实上，我已经找到了可以完成这项工作的 unique.matrix()，谢谢。
如果想保留所有其他变量（以便知道选中的是哪一行，或者直接使用这一行，比如第一行），该怎么办？也就是说，dplyr::distinct(.data, ..., .keep_all = TRUE) 是否有 base R 的等价写法？
我不了解 dplyr::distinct()。在这里，您可以这样做：df[!duplicated(df[1:2]), ]。
奇怪的是，unique 操作可以运行，但结果 dt 把所有其他列都设为了 NA。你知道为什么吗？
谢谢你发现了这个问题。此操作会进行合并，因此可能产生一些 NA 值。解决方案是将 allow.cartesian=TRUE 替换为 nomatch=0，从而忽略结果中的 NA 值。我已经更新了答案。谢谢。