R 匹配两个向量的子字符串并创建一个新的组合向量

R 匹配两个向量的子字符串并创建一个新的组合向量,r,vector,R,Vector,考虑两个向量 a <- c(123, 234, 432, 223) b <- c(234, 238, 342, 325, 326) 为简单起见,考虑所有元素的长度总是为3。 我试过: sub_a <- substr(a, 2, 3) #get last two digits of a sub_b <- substr(b, 1, 2) #get first two digits of b common <- intersect(sub_a, sub_b)

考虑两个向量

a <- c(123, 234, 432, 223)
b <- c(234, 238, 342, 325, 326)
为简单起见,考虑所有元素的长度总是为3。

我试过:

sub_a <- substr(a, 2, 3)   #get last two digits of a
sub_b <- substr(b, 1, 2)   #get first two digits of b
common <- intersect(sub_a, sub_b) 
然后我一起使用
match
paste0
,得到不完整的输出

paste0(a[match(common, sub_a)], substr(b[match(common, sub_b)], 3, 3))
#[1] "1234" "2342" "4325"
as
match
仅与第一次匹配


如何实现预期输出?

这里有一种方法,第一个列表
a
for
循环中完成。在每次循环迭代中,列表
a
中元素的最后两位与列表
b
的前两位匹配。结果被合并到vector
结果中

a <- c(123, 234, 432, 223)
b <- c(234, 238, 342, 325, 326)

sub_a <- substr(a, 2, 3)   # get last two digits of a
sub_b <- substr(b, 1, 2)   # get first two digits of b

result <- c()
for (ai in a) {
    sub_ai <- substr(ai, 2, 3)
    if (sub_ai %in% sub_b) {
        b_match <- (sub_b == sub_ai)
        result <- c(result, paste0(ai, substr(b[b_match], 3, 4)))
    }
}
result
如果
a
b
不包含唯一值,您可以使用命令指定唯一结果

unique(result)

可能有点复杂,但有效:

unlist( sapply( a, function(x) {
  regex <- paste0( substr(x, 2, 3), '(\\d)')
  z <- sub(regex, paste0(x, "\\1"), b)
  z[!b %in% z] 
} ))
一种可能的解决办法:

a <- setNames(a, substr(a, 2, 3))
b <- setNames(b, substr(b, 1, 2))

df <- merge(stack(a), stack(b), by = 'ind')
paste0(substr(df$values.x, 1, 1), df$values.y)

第二种选择:

a <- setNames(a, substr(a, 2, 3))
b <- setNames(b, substr(b, 1, 2))

l <- lapply(names(a), function(x) b[x == names(b)])
paste0(substr(rep(a, lengths(l)), 1, 1), unlist(l))
a在中间部分使用dplyr::inner_连接:

library(dplyr)

a <- c(123, 234, 432, 223)
b <- c(234, 238, 342, 325, 326)

a1 <- data.frame(a)
b1 <- data.frame(b)

a1 <- a1 %>% mutate(first_a = substr(a, 1, 1), last_a = substr(a, 2, 3))
b1 <- b1 %>% mutate(first_b = substr(b, 1, 2), last_b = substr(b, 3, 3))

c1 <- inner_join(a1, b1, by = c("last_a" = "first_b"))

results <- paste0(c1$a, c1$last_b)
库(dplyr)

另一种方法是使用
expand.grid
,因此在
sub_a
sub_b
中拾取

d1 <- expand.grid(a, b, stringsAsFactors = FALSE)
d2 <- expand.grid(sub_a, sub_b, stringsAsFactors = FALSE)
i1 <- d2$Var1 == d2$Var2
d1 <- d1[i1,] 
d1$Var1 <- substr(d1$Var1, 1, 1)

do.call(paste0, d1)
#[1] "1234" "2234" "1238" "2238" "2342" "4325" "4326"

d1这里是base R中的另一个选项:

foo <- function(a, b) {
  split_a <- split(a,  substr(a, 2, 3))
  split_b <- split(substr(b, 3, 3), substr(b, 1, 2))
  idx <- intersect(names(split_a), names(split_b))
  stopifnot(length(idx) > 0)
  unlist(Map(function(x,y) outer(x, y, paste0), split_a[idx], split_b[idx]), 
                      use.names = FALSE)
}

foo(a, b)
# [1] "1234" "2234" "1238" "2238" "4325" "4326" "2342"
fooA基准(将sub_A和sub_b创建添加到Sotos和Heikki答案中,以便每个人都从相同的初始向量开始
A
800个观察值和
b
1000个观察值)

使用以下工具运行基准测试:

library(dplyr)
library(data.table)
library(microbenchmark)

a <- sample(100:999, 8e3, TRUE)
b <- sample(100:999, 1e4, TRUE)

microbenchmark(Jaap1(a,b), Jaap2(a,b), Tensi(a,b), Heikki(a,b), Sotos(a,b),
               Matt_base(a,b), Matt_dplyr(a,b), Docendo(a,b),
               zx8754(a,b), zx8754for(a,b), Frank(a,b),
               times = 50, unit = 'relative')

使用的功能包括:

Jaap1 <- function(a,b) {
  a <- setNames(a, substr(a,2,3))
  b <- setNames(b, substr(b,1,2))

  df <- merge(stack(a), stack(b), by = 'ind')
  paste0(substr(df$values.x,1,1), df$values.y)
}

Jaap2 <- function(a,b) {
  a <- setNames(a, substr(a,2,3))
  b <- setNames(b, substr(b,1,2))

  l <- lapply(names(a), function(x) b[x == names(b)])
  paste0(substr(rep(a, lengths(l)),1,1), unlist(l))
}

Tensi <- function(a,b) {
  unlist(sapply(a,function(x) {regex <- paste0(substr(x,2,3),'(\\d)'); z <- sub(regex,paste0(x,"\\1"),b); z[!b %in% z] } ))
}

Heikki <- function(a,b) {
  sub_a <- substr(a, 2, 3)
  sub_b <- substr(b, 1, 2)
  result <- c()
  for (ai in a) {
    sub_ai <- substr(ai,2,3)
    if (sub_ai %in% sub_a) {
      b_match <- (sub_b == sub_ai)
      result <- c(result,paste0(ai,substr(b[b_match],3,4)))
    }
  }
  result
}

Sotos <- function(a,b) {
  sub_a <- substr(a, 2, 3)
  sub_b <- substr(b, 1, 2)
  d1 <- expand.grid(a, b, stringsAsFactors = FALSE)
  d2 <- expand.grid(sub_a, sub_b, stringsAsFactors = FALSE)
  i1 <- d2$Var1 == d2$Var2
  d1 <- d1[i1,] 
  d1$Var1 <- substr(d1$Var1, 1, 1)

  do.call(paste0, d1)
}

Matt_base <- function(a,b) {
  a1 <- data.frame(a)
  b1 <- data.frame(b)

  a1$first_a = substr(a1$a, 1, 1)
  a1$last_a = substr(a1$a, 2, 3)
  b1$first_b = substr(b1$b, 1, 2)
  b1$last_b = substr(b1$b, 3, 3)

  c1 <- merge(a1, b1, by.x = "last_a", by.y = "first_b")

  results <- paste0(c1$a, c1$last_b)
}

Matt_dplyr <- function(a,b) {
  a1 <- data.frame(a)
  b1 <- data.frame(b)

  a1 <- a1 %>% mutate(first_a = substr(a, 1, 1), last_a = substr(a, 2, 3))
  b1 <- b1 %>% mutate(first_b = substr(b, 1, 2), last_b = substr(b, 3, 3))

  c1 <- inner_join(a1, b1, by = c("last_a" = "first_b"))

  results <- paste0(c1$a, c1$last_b)
}

Docendo <- function(a, b) {
  split_a <- split(a,  substr(a, 2, 3))
  split_b <- split(substr(b, 3, 3), substr(b, 1, 2))
  idx <- intersect(names(split_a), names(split_b))
  stopifnot(length(idx) > 0)
  unlist(Map(function(x,y) outer(x, y, paste0), split_a[idx], split_b[idx]), 
         use.names = FALSE)
}

zx8754 <- function(a, b) {
  unlist(sapply(a, function(i) i * 10 + (b %% 10)[i %% 100 == b %/% 10]))
}

zx8754for <- function(a, b) {
  res <- integer()
  for(i in a)  res <- c(res, i * 10 + (b %% 10)[i %% 100 == b %/% 10])
  res
}

Frank <- function(a, b) {
  aDT <- as.data.table(tstrsplit(a, ""))
  bDT <- setnames(as.data.table(tstrsplit(b, "")), c("V2", "V3", "V4"))
  merge(aDT, bDT, allow.cartesian = TRUE)[, paste0(V1, V2, V3, V4)]
}

Jaap1来点数学怎么样*:

unlist(sapply(a, function(i)
  i * 10 + (b %% 10)[i %% 100 == b %/% 10]))

*假设:所有数字都是3位数,但这当然可以在Sappy内调整

检查输出,输出的顺序将不同于其他答案,并且输出是数字,而不是字符

identical(sort(as.numeric(docendo(a, b))), sort(zx8754(a, b)))
# [1] TRUE
identical(sort(as.numeric(jaap(a, b))), sort(zx8754(a, b)))
# [1] TRUE

Edit:forloop版本似乎快了3倍(例如,小数据集越大,速度实际上慢了3倍,请参阅benchmarks wiki文章)


zx8754另一个选项是将其放入列并连接:

library(data.table)
Frank <- function(a, b) {
  aDT <- setDT(tstrsplit(a, ""))
  bDT <- setnames(setDT(tstrsplit(b, "")), c("V2", "V3", "V4"))
  merge(aDT, bDT, allow.cartesian = TRUE)[, paste0(V1, V2, V3, V4)]
}
库(data.table)

弗兰克,嗯。。。一个不是很有效的方法是
d1,我认为
match
不适合你,因为它在第一场比赛后就停止了。可能更像是%sub
@Sotos中的
sub\u%,除了在最终输出中重复两次
common
部分之外,它是有效的。我们能提高效率吗?我已经在使用这么多额外的变量(
sub_a
sub_b
common
),再添加两个会很昂贵吗?看看这个:@ΦXocę웃Пepeúpaツ 是的,我知道这一点。循环本身并不坏,只要你不在循环中增加一个向量。非常好@Jaap。我从来没有想到可以在命名向量上使用
堆栈
;我很好奇:这有没有记录在案?我在
?stack
@MauritsEvers中找不到任何东西如果它不在
?stack
中,那么我就不知道另一个记录此内容的来源:-\@MauritsEvers我必须回到我以前的评论:它在
?stack
的文档中,但仅在细节部分的最后一行中隐含:这些函数是通用的:提供的方法处理数据帧和对象,这些数据帧和对象可通过
as.list
(强调我的方法)强制到列表中。一个命名向量可以强制到一个列表中。这是一个很好而且最快的!请参见单独cw答案中的基准测试。
foo <- function(a, b) {
  split_a <- split(a,  substr(a, 2, 3))
  split_b <- split(substr(b, 3, 3), substr(b, 1, 2))
  idx <- intersect(names(split_a), names(split_b))
  stopifnot(length(idx) > 0)
  unlist(Map(function(x,y) outer(x, y, paste0), split_a[idx], split_b[idx]), 
                      use.names = FALSE)
}

foo(a, b)
# [1] "1234" "2234" "1238" "2238" "4325" "4326" "2342"
set.seed(123)
a <- sample(100:999, 1e4, TRUE)
b <- sample(100:999, 1e3, TRUE)

library(microbenchmark)
library(dplyr)
res <- microbenchmark(docendo(a, b), 
               Jaap1(a, b), 
               Jaap2(a, b), 
               Sotos(a, b), 
               Tensi(a, b), 
               Heikki(a, b), 
               Matt_base(a, b),
               Matt_dplyr(a, b), 
               zx8754(a, b),
               times = 10, unit = "relative")

Unit: relative
             expr        min         lq       mean     median         uq        max neval
    docendo(a, b)   1.000000   1.000000   1.000000   1.000000   1.000000   1.000000    10
      Jaap1(a, b)  14.002977  13.724432  13.347755  13.433175  12.788948  13.301811    10
      Jaap2(a, b)   4.364993   4.936248   5.201879   5.125639   5.060425   7.520069    10
      Sotos(a, b)  22.215750  23.850280  25.743047  25.177676  28.274083  28.288089    10
      Tensi(a, b) 231.230360 234.830000 246.587532 242.345573 260.784725 273.184452    10
     Heikki(a, b) 135.615708 136.900943 144.775845 146.314048 150.546406 156.873954    10
  Matt_base(a, b)  13.274675  12.995334  13.402940  12.723798  12.432802  18.881093    10
 Matt_dplyr(a, b)   1.299223   1.314568   1.420479   1.345850   1.380378   1.807671    10
     zx8754(a, b)   9.607226  10.175381  10.486580  10.136439  10.096818  13.410858    10
Frank <- function(a, b) {
  aDT <- as.data.table(tstrsplit(a, ""))
  bDT <- setnames(as.data.table(tstrsplit(b, "")), c("V2", "V3", "V4"))
  merge(aDT, bDT, allow.cartesian = TRUE)[, paste0(V1, V2, V3, V4)]
}

set.seed(1)  # same input size as in the cw benchmark answer
a <- sample(100:999, 8e3, TRUE)
b <- sample(100:999, 1e4, TRUE)

microbenchmark(Frank(a, b), docendo(a, b), unit = "relative", times = 10)

Unit: relative
          expr     min       lq     mean   median       uq      max neval
   Frank(a, b) 1.37435 1.390417 1.500996 1.470548 1.644079 1.616446    10
 docendo(a, b) 1.00000 1.000000 1.000000 1.000000 1.000000 1.000000    10
all.equal(sort(docendo(a, b)), sort(Frank(a, b)))
#[1] TRUE
library(dplyr)
library(data.table)
library(microbenchmark)

a <- sample(100:999, 8e3, TRUE)
b <- sample(100:999, 1e4, TRUE)

microbenchmark(Jaap1(a,b), Jaap2(a,b), Tensi(a,b), Heikki(a,b), Sotos(a,b),
               Matt_base(a,b), Matt_dplyr(a,b), Docendo(a,b),
               zx8754(a,b), zx8754for(a,b), Frank(a,b),
               times = 50, unit = 'relative')
Unit: relative
             expr        min         lq        mean     median         uq        max neval      cld
      Jaap1(a, b)  19.668483  19.316194  17.2373827  18.921573  18.829932  7.8792713    50    d    
      Jaap2(a, b)   4.253151   4.365420   4.0557281   4.309247   4.398149  2.2149125    50  b      
      Tensi(a, b) 241.682216 238.197815 212.2844582 233.473689 233.367619 93.3562331    50        h
     Heikki(a, b) 114.895836 113.754054 101.2781709 111.637570 110.541708 44.9437229    50       g 
      Sotos(a, b)  27.598767  28.725937  25.7469518  28.534011  28.638413 11.6995642    50     e   
  Matt_base(a, b)  19.159883  18.834180  16.8853660  18.513498  18.416194  7.8329323    50    d    
 Matt_dplyr(a, b)   1.108230   1.106051   1.0203776   1.102078   1.098476  1.0131898    50 a       
    Docendo(a, b)   1.000000   1.000000   1.0000000   1.000000   1.000000  1.0000000    50 a       
     zx8754(a, b)  11.601730  12.986763  11.7859245  13.054720  13.234842  5.6944437    50   c     
  zx8754for(a, b)  90.448168  92.906445  82.4905438  91.092609  90.160010 36.1277145    50      f  
      Frank(a, b)   1.070775   1.070202   0.9621499   1.063978   1.055540  0.4459918    50 a
Jaap1 <- function(a,b) {
  a <- setNames(a, substr(a,2,3))
  b <- setNames(b, substr(b,1,2))

  df <- merge(stack(a), stack(b), by = 'ind')
  paste0(substr(df$values.x,1,1), df$values.y)
}

Jaap2 <- function(a,b) {
  a <- setNames(a, substr(a,2,3))
  b <- setNames(b, substr(b,1,2))

  l <- lapply(names(a), function(x) b[x == names(b)])
  paste0(substr(rep(a, lengths(l)),1,1), unlist(l))
}

Tensi <- function(a,b) {
  unlist(sapply(a,function(x) {regex <- paste0(substr(x,2,3),'(\\d)'); z <- sub(regex,paste0(x,"\\1"),b); z[!b %in% z] } ))
}

Heikki <- function(a,b) {
  sub_a <- substr(a, 2, 3)
  sub_b <- substr(b, 1, 2)
  result <- c()
  for (ai in a) {
    sub_ai <- substr(ai,2,3)
    if (sub_ai %in% sub_a) {
      b_match <- (sub_b == sub_ai)
      result <- c(result,paste0(ai,substr(b[b_match],3,4)))
    }
  }
  result
}

Sotos <- function(a,b) {
  sub_a <- substr(a, 2, 3)
  sub_b <- substr(b, 1, 2)
  d1 <- expand.grid(a, b, stringsAsFactors = FALSE)
  d2 <- expand.grid(sub_a, sub_b, stringsAsFactors = FALSE)
  i1 <- d2$Var1 == d2$Var2
  d1 <- d1[i1,] 
  d1$Var1 <- substr(d1$Var1, 1, 1)

  do.call(paste0, d1)
}

Matt_base <- function(a,b) {
  a1 <- data.frame(a)
  b1 <- data.frame(b)

  a1$first_a = substr(a1$a, 1, 1)
  a1$last_a = substr(a1$a, 2, 3)
  b1$first_b = substr(b1$b, 1, 2)
  b1$last_b = substr(b1$b, 3, 3)

  c1 <- merge(a1, b1, by.x = "last_a", by.y = "first_b")

  results <- paste0(c1$a, c1$last_b)
}

Matt_dplyr <- function(a,b) {
  a1 <- data.frame(a)
  b1 <- data.frame(b)

  a1 <- a1 %>% mutate(first_a = substr(a, 1, 1), last_a = substr(a, 2, 3))
  b1 <- b1 %>% mutate(first_b = substr(b, 1, 2), last_b = substr(b, 3, 3))

  c1 <- inner_join(a1, b1, by = c("last_a" = "first_b"))

  results <- paste0(c1$a, c1$last_b)
}

Docendo <- function(a, b) {
  split_a <- split(a,  substr(a, 2, 3))
  split_b <- split(substr(b, 3, 3), substr(b, 1, 2))
  idx <- intersect(names(split_a), names(split_b))
  stopifnot(length(idx) > 0)
  unlist(Map(function(x,y) outer(x, y, paste0), split_a[idx], split_b[idx]), 
         use.names = FALSE)
}

zx8754 <- function(a, b) {
  unlist(sapply(a, function(i) i * 10 + (b %% 10)[i %% 100 == b %/% 10]))
}

zx8754for <- function(a, b) {
  res <- integer()
  for(i in a)  res <- c(res, i * 10 + (b %% 10)[i %% 100 == b %/% 10])
  res
}

Frank <- function(a, b) {
  aDT <- as.data.table(tstrsplit(a, ""))
  bDT <- setnames(as.data.table(tstrsplit(b, "")), c("V2", "V3", "V4"))
  merge(aDT, bDT, allow.cartesian = TRUE)[, paste0(V1, V2, V3, V4)]
}
unlist(sapply(a, function(i)
  i * 10 + (b %% 10)[i %% 100 == b %/% 10]))
identical(sort(as.numeric(docendo(a, b))), sort(zx8754(a, b)))
# [1] TRUE
identical(sort(as.numeric(jaap(a, b))), sort(zx8754(a, b)))
# [1] TRUE
zx8754 <- function(a, b) {
  unlist(sapply(a, function(i) i * 10 + (b %% 10)[i %% 100 == b %/% 10]))
}

zx8754_forloop <- function(a, b) {
  res <- integer()
  for(i in a)  res <- c(res, i * 10 + (b %% 10)[i %% 100 == b %/% 10])
  res
}

microbenchmark::microbenchmark(
  zx8754 = zx8754(a, b),
  zx8754_forloop = zx8754_forloop(a, b)
)

# Unit: microseconds
#           expr    min      lq     mean median     uq      max neval
# zx8754         16.535 17.3910 55.05348 17.676 18.246 3672.223   100
# zx8754_forloop  4.562  5.4165 46.74887  5.987  6.272 4080.469   100

#check output
identical(zx8754(a, b), zx8754_forloop(a, b))
# [1] TRUE
library(data.table)
Frank <- function(a, b) {
  aDT <- setDT(tstrsplit(a, ""))
  bDT <- setnames(setDT(tstrsplit(b, "")), c("V2", "V3", "V4"))
  merge(aDT, bDT, allow.cartesian = TRUE)[, paste0(V1, V2, V3, V4)]
}
MattDT <- function(a,b){
  aDT2 <- data.table(V1 = substring(a,1,1), V23 = substring(a,2,3))
  bDT2 <- data.table(V23 = substring(b,1,2), V4 = substring(b,3,3))
  merge(aDT2, bDT2, allow.cartesian = TRUE)[, paste0(V1, V23, V4)]
}