Dplyr创建一个唯一的夫妇ID_R_Dplyr

Dplyr创建一个唯一的夫妇ID

Dplyr创建一个唯一的夫妇ID,r,dplyr,R,Dplyr,我正在尝试为两列创建唯一的ID。数据帧有两个列Sp1和Sp2，它们是字符串。 Sp1和Sp2可以是相同的字符串。我想要一个唯一的（Sp1；Sp2）耦合标识符，它不考虑（Sp1；Sp2）的顺序。例如，（A；B）应具有与（B；A）相同的标识符我尝试使用dplyr，但它不起作用，因为sort（）函数为每一行呈现相同的标识符所以我使用了一个经典的循环，但我有一个大的数据集，需要花费太多的时间： for (k in 1:nrow(data)){ data$Couple[k] <- past

我正在尝试为两列创建唯一的ID。数据帧有两个列Sp1和Sp2，它们是字符串。 Sp1和Sp2可以是相同的字符串。我想要一个唯一的（Sp1；Sp2）耦合标识符，它不考虑（Sp1；Sp2）的顺序。例如，（A；B）应具有与（B；A）相同的标识符

我尝试使用dplyr，但它不起作用，因为sort（）函数为每一行呈现相同的标识符

所以我使用了一个经典的循环，但我有一个大的数据集，需要花费太多的时间：

# Build an order-independent pair key for every row: the two names are sorted
# and joined with "_", so (A, B) and (B, A) both produce "A_B".
# seq_len() makes the loop a no-op on an empty data frame, whereas
# 1:nrow(data) would iterate over c(1, 0).
for (k in seq_len(nrow(data))) {
  # Sort the pair once and reuse it for both halves of the key.
  pair <- sort(c(as.character(data$Sp_1[k]), as.character(data$Sp_2[k])))
  data$Couple[k] <- paste0(pair[1], "_", pair[2])
}

这里有一个解决方案：先用 rowwise() 得到与顺序无关的 Sp_1 + Sp_2 表示，然后用 purrr::map_int() 将其转换为唯一的 id：
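# Make this reproducible
set.seed(1)
# Load packages
library(dplyr)
library(purrr)
# Define and inspect a test dataset
data <- tibble(
  Sp_1 = sample(letters[1:5], 10, replace = TRUE),
  Sp_2 = sample(letters[1:5], 10, replace = TRUE)
)
data
#> # A tibble: 10 x 2
#>    Sp_1  Sp_2
#>    <chr> <chr>
#>  1 a     e
#>  2 d     e
#>  3 a     b
#>  4 b     b
#>  5 e     a
#>  6 c     e
#>  7 b     e
#>  8 c     a
#>  9 c     a
#> 10 a     e
data %>%
  # Add a unique representation of `Sp_1` and `Sp_2` where order doesn't matter
  rowwise() %>%
  mutate(string = paste(sort(c(Sp_1, Sp_2)), collapse = "")) %>%
  ungroup() %>%
  # Use `map_int()` to get an integer `id` representation of `string`
  mutate(id = map_int(string, ~which(unique(string) == .)))
#> # A tibble: 10 x 4
#>    Sp_1  Sp_2  string    id
#>    <chr> <chr> <chr>  <int>
#>  1 a     e     ae         1
#>  2 d     e     de         2
#>  3 a     b     ab         3
#>  4 b     b     bb         4
#>  5 e     a     ae         1
#>  6 c     e     ce         5
#>  7 b     e     be         6
#>  8 c     a     ac         7
#>  9 c     a     ac         7
#> 10 a     e     ae         1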
这里有一个解决方案：先用 rowwise() 得到与顺序无关的 Sp_1 + Sp_2 表示，然后用 purrr::map_int() 将其转换为唯一的 id：
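# Make this reproducible
set.seed(1)
# Load packages
library(dplyr)
library(purrr)
# Define and inspect a test dataset
data <- tibble(
  Sp_1 = sample(letters[1:5], 10, replace = TRUE),
  Sp_2 = sample(letters[1:5], 10, replace = TRUE)
)
data
#> # A tibble: 10 x 2
#>    Sp_1  Sp_2
#>    <chr> <chr>
#>  1 a     e
#>  2 d     e
#>  3 a     b
#>  4 b     b
#>  5 e     a
#>  6 c     e
#>  7 b     e
#>  8 c     a
#>  9 c     a
#> 10 a     e
data %>%
  # Add a unique representation of `Sp_1` and `Sp_2` where order doesn't matter
  rowwise() %>%
  mutate(string = paste(sort(c(Sp_1, Sp_2)), collapse = "")) %>%
  ungroup() %>%
  # Use `map_int()` to get an integer `id` representation of `string`
  mutate(id = map_int(string, ~which(unique(string) == .)))
#> # A tibble: 10 x 4
#>    Sp_1  Sp_2  string    id
#>    <chr> <chr> <chr>  <int>
#>  1 a     e     ae         1
#>  2 d     e     de         2
#>  3 a     b     ab         3
#>  4 b     b     bb         4
#>  5 e     a     ae         1
#>  6 c     e     ce         5
#>  7 b     e     be         6
#>  8 c     a     ac         7
#>  9 c     a     ac         7
#> 10 a     e     ae         1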
使用 pmin 和 pmax：
data %>%
  mutate(id1 = paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)),
         id2 = as.integer(as.factor(id1)))
# # A tibble: 10 x 4
#    Sp_1  Sp_2  id1     id2
#    <chr> <chr> <chr> <int>
#  1 a     e     ae        3
#  2 d     e     de        7
#  3 a     b     ab        1
#  4 b     b     bb        4
#  5 e     a     ae        3
#  6 c     e     ce        6
#  7 b     e     be        5
#  8 c     a     ac        2
#  9 c     a     ac        2
# 10 a     e     ae        3


编辑：如果追求效率，请使用 base::transform，参见下面的基准测试。使用示例数据时快约 5 倍，使用较大数据时快约 1.5 倍：
# Bigger data: reseed the RNG, then rebuild `data` with 10000 rows, each
# column sampled with replacement from the letters a-e.
set.seed(1); data <- tibble(
  Sp_1 = sample(letters[1:5], 10000, replace = TRUE),
  Sp_2 = sample(letters[1:5], 10000, replace = TRUE)
)

# Time the two pmin/pmax variants against each other. Both build the same
# key (smaller name followed by larger name) and convert it to an integer
# factor code; they differ only in the function that adds the column.
microbenchmark::microbenchmark(
  # x1: dplyr pipeline using mutate()
  x1 = {
    data %>% 
      mutate(id = as.integer(as.factor(
        paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
  },
  # x2: base R transform()
  x2 = {
    transform(data,
              id = as.integer(as.factor(
                paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
    
  }, unit = "relative")  # timings are reported as ratios rather than absolute times

# Unit: relative
# expr      min       lq     mean   median       uq      max neval
#   x1 1.476691 1.457313 1.414833 1.429563 1.303684 2.209446   100
#   x2 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100

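# bigger data
set.seed(1); data <- tibble(
  Sp_1 = sample(letters[1:5], 10000, replace = TRUE),
  Sp_2 = sample(letters[1:5], 10000, replace = TRUE)
)
microbenchmark::microbenchmark(
  x1 = {
    data %>%
      mutate(id = as.integer(as.factor(
        paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
  },
  x2 = {
    transform(data,
              id = as.integer(as.factor(
                paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
  }, unit = "relative")
# Unit: relative
# expr      min       lq     mean   median       uq      max neval
#   x1 1.476691 1.457313 1.414833 1.429563 1.303684 2.209446   100
#   x2 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100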
使用 pmin 和 pmax：
data %>%
  mutate(id1 = paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)),
         id2 = as.integer(as.factor(id1)))
# # A tibble: 10 x 4
#    Sp_1  Sp_2  id1     id2
#    <chr> <chr> <chr> <int>
#  1 a     e     ae        3
#  2 d     e     de        7
#  3 a     b     ab        1
#  4 b     b     bb        4
#  5 e     a     ae        3
#  6 c     e     ce        6
#  7 b     e     be        5
#  8 c     a     ac        2
#  9 c     a     ac        2
# 10 a     e     ae        3


编辑：如果追求效率，请使用 base::transform，参见下面的基准测试。使用示例数据时快约 5 倍，使用较大数据时快约 1.5 倍：
# Bigger data: reseed the RNG, then rebuild `data` with 10000 rows, each
# column sampled with replacement from the letters a-e.
set.seed(1); data <- tibble(
  Sp_1 = sample(letters[1:5], 10000, replace = TRUE),
  Sp_2 = sample(letters[1:5], 10000, replace = TRUE)
)

# Time the two pmin/pmax variants against each other. Both build the same
# key (smaller name followed by larger name) and convert it to an integer
# factor code; they differ only in the function that adds the column.
microbenchmark::microbenchmark(
  # x1: dplyr pipeline using mutate()
  x1 = {
    data %>% 
      mutate(id = as.integer(as.factor(
        paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
  },
  # x2: base R transform()
  x2 = {
    transform(data,
              id = as.integer(as.factor(
                paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
    
  }, unit = "relative")  # timings are reported as ratios rather than absolute times

# Unit: relative
# expr      min       lq     mean   median       uq      max neval
#   x1 1.476691 1.457313 1.414833 1.429563 1.303684 2.209446   100
#   x2 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100

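# bigger data
set.seed(1); data <- tibble(
  Sp_1 = sample(letters[1:5], 10000, replace = TRUE),
  Sp_2 = sample(letters[1:5], 10000, replace = TRUE)
)
microbenchmark::microbenchmark(
  x1 = {
    data %>%
      mutate(id = as.integer(as.factor(
        paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
  },
  x2 = {
    transform(data,
              id = as.integer(as.factor(
                paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
  }, unit = "relative")
# Unit: relative
# expr      min       lq     mean   median       uq      max neval
#   x1 1.476691 1.457313 1.414833 1.429563 1.303684 2.209446   100
#   x2 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100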
基准测试
可重复的示例由@BluVoxe给出。我们比较了@BluVoxe和@zx8754解决方案的输出和运行时间：
library(dplyr)
library(purrr)
library(microbenchmark)

# Pin the RNG state so the two sample() draws below repeat on every run
set.seed(1)

# Toy dataset: two character columns, each holding 10 draws (with
# replacement) from the letters "a" to "e"
data <- tibble(
  Sp_1 = sample(letters[seq_len(5)], size = 10, replace = TRUE),
  Sp_2 = sample(letters[seq_len(5)], size = 10, replace = TRUE)
)

# Show the generated rows
data

# # A tibble: 10 x 2
#    Sp_1  Sp_2 
#    <chr> <chr>
#  1 a     e    
#  2 d     e    
#  3 a     b    
#  4 b     b    
#  5 e     a    
#  6 c     e    
#  7 b     e    
#  8 c     a    
#  9 c     a    
# 10 a     e

# First solution: row-wise sorted key, then an integer id per distinct key

data1 <- data %>%

  # Order-independent key for each row: sort the pair, then join with "_".
  # The separator keeps multi-character names from colliding; without it
  # ("a", "bc") and ("ab", "c") would both collapse to "abc".
  dplyr::rowwise() %>%
  dplyr::mutate(string = paste(sort(c(Sp_1, Sp_2)), collapse = "_")) %>%
  # Drop the row-wise grouping so the next mutate() sees the whole column
  dplyr::ungroup() %>%

  # Integer id = position of the row's key among the distinct keys, numbered
  # in order of first appearance
  dplyr::mutate(id = purrr::map_int(string, ~which(unique(string) == .)))

data1

# # A tibble: 10 x 4
#   Sp_1  Sp_2  string id
#   <chr> <chr> <chr>  <int>
# 1 a     e     a_e    1
# 2 d     e     d_e    2
# 3 a     b     a_b    3
# 4 b     b     b_b    4
# 5 e     a     a_e    1
# 6 c     e     c_e    5
# 7 b     e     b_e    6
# 8 c     a     a_c    7
# 9 c     a     a_c    7
# 10 a    e     a_e    1


# Second solution: vectorised key built with pmin()/pmax()

data2 <- data %>%
  dplyr::mutate(
    # pmin()/pmax() pick the smaller and larger name of each row, so the key
    # is the same for (A, B) and (B, A). The "_" separator keeps
    # multi-character names from colliding; without it ("a", "bc") and
    # ("ab", "c") would both give "abc".
    id1 = paste(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2), sep = "_"),
    # Integer id = factor level code of the key (levels are sorted, so ids
    # follow the sort order of the keys, not their order of appearance)
    id2 = as.integer(as.factor(id1))
  )

data2

# # A tibble: 10 x 4
#   Sp_1  Sp_2  id1   id2
#   <chr> <chr> <chr> <int>
# 1 a     e     a_e   3
# 2 d     e     d_e   7
# 3 a     b     a_b   1
# 4 b     b     b_b   4
# 5 e     a     a_e   3
# 6 c     e     c_e   6
# 7 b     e     b_e   5
# 8 c     a     a_c   2
# 9 c     a     a_c   2
# 10 a    e     a_e   3

# Third solution: base R only. transform() evaluates the `id` expression with
# the columns of `data` in scope and appends the result as a new column.
data3 <- transform(
  data,
  id = as.integer(
    as.factor(
      # Smaller name, "_", larger name: the key is identical for (A, B) and
      # (B, A). The separator keeps multi-character names from colliding;
      # without it ("a", "bc") and ("ab", "c") would both give "abc".
      paste(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2), sep = "_")
    )
  )
)

data3

#       Sp_1 Sp_2 id
# 1     a    e    3
# 2     d    e    7
# 3     a    b    1
# 4     b    b    4
# 5     e    a    3
# 6     c    e    6
# 7     b    e    5
# 8     c    a    2
# 9     c    a    2
# 10    a    e    3

 #Compare efficiency :

# Time the three approaches on the current `data`.
microbenchmark::microbenchmark(
  # x1: row-wise sorted key, then map_int() lookup of each key's position
  # among the distinct keys
  x1 = {
    data%>% 
      dplyr::rowwise() %>% 
      dplyr::mutate(string = paste(sort(c(Sp_1, Sp_2)), collapse = "")) %>% 
      dplyr::ungroup() %>%
      dplyr::mutate(id = purrr::map_int(string, ~which(unique(string) == .)))
  },
  # x2: vectorised pmin()/pmax() key inside dplyr::mutate()
  x2 = {
    data %>% 
      dplyr::mutate(id = as.integer(as.factor(
        paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
  },
  # x3: the same vectorised key, added with base transform()
  x3 = {
    transform(data,
              id = as.integer(as.factor(
                paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
    
  },
  # report timings as ratios rather than absolute times
  unit = "relative")

#Unit: relative
 #expr       min        lq      mean    median        uq      max neval
  # x1 23.329340 24.151001 23.951911 23.710270 22.996736 28.23673   100
   #x2  8.064332  7.785381  8.214726  7.796895  7.741803 19.18936   100
   #x3  1.000000  1.000000  1.000000  1.000000  1.000000  1.00000   100

# With bigger data: reseed the RNG, rebuild `data` with 10000 rows, then
# rerun the same three-way comparison.

set.seed(1)
data <- tibble(
  Sp_1 = sample(letters[1:5], 10000, replace = TRUE),
  Sp_2 = sample(letters[1:5], 10000, replace = TRUE)
)

microbenchmark::microbenchmark(
  # x1: row-wise sorted key, then map_int() lookup of each key's position
  # among the distinct keys
  x1 = {
    data%>% 
      dplyr::rowwise() %>% 
      dplyr::mutate(string = paste(sort(c(Sp_1, Sp_2)), collapse = "")) %>% 
      dplyr::ungroup() %>% 
      dplyr::mutate(id = purrr::map_int(string, ~which(unique(string) == .)))
  },
  # x2: vectorised pmin()/pmax() key inside dplyr::mutate()
  x2 = {
    data %>% 
      dplyr::mutate(id = as.integer(as.factor(
        paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
  },
  # x3: the same vectorised key, added with base transform()
  x3 = {
    transform(data,
              id = as.integer(as.factor(
                paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
    
  }, unit = "relative")  # timings are reported as ratios rather than absolute times

#Unit: relative
 #expr        min         lq       mean     median         uq        max neval
   #x1 524.626924 512.590748 506.051098 515.687843 521.642359 418.635195   100
   #x2   1.503782   1.514021   1.577941   1.559449   1.620967   1.648478   100
   #x3   1.000000   1.000000   1.000000   1.000000   1.000000   1.000000   100


库（dplyr）
图书馆（purrr）
图书馆（微基准）
#使其可复制
种子（1）
#定义和检查测试数据集
数据基准测试
可重复的示例由@BluVoxe给出。我们比较