R 匹配两个向量的子字符串并创建一个新的组合向量
考虑两个向量R 匹配两个向量的子字符串并创建一个新的组合向量,r,vector,R,Vector,考虑两个向量 a <- c(123, 234, 432, 223) b <- c(234, 238, 342, 325, 326) 为简单起见,考虑所有元素的长度总是为3。 我试过: sub_a <- substr(a, 2, 3) #get last two digits of a sub_b <- substr(b, 1, 2) #get first two digits of b common <- intersect(sub_a, sub_b)
a <- c(123, 234, 432, 223)
b <- c(234, 238, 342, 325, 326)
为简单起见,考虑所有元素的长度总是为3。
我试过:
sub_a <- substr(a, 2, 3) #get last two digits of a
sub_b <- substr(b, 1, 2) #get first two digits of b
common <- intersect(sub_a, sub_b)
然后我一起使用match
和paste0
,得到不完整的输出
paste0(a[match(common, sub_a)], substr(b[match(common, sub_b)], 3, 3))
#[1] "1234" "2342" "4325"
asmatch
仅与第一次匹配
如何实现预期输出?这里有一种方法,第一个列表
a
在for
循环中完成。在每次循环迭代中,列表a
中元素的最后两位与列表b
的前两位匹配。结果被合并到vector结果中
a <- c(123, 234, 432, 223)
b <- c(234, 238, 342, 325, 326)
sub_a <- substr(a, 2, 3) # get last two digits of a
sub_b <- substr(b, 1, 2) # get first two digits of b
result <- c()
for (ai in a) {
sub_ai <- substr(ai, 2, 3)
if (sub_ai %in% sub_b) {
b_match <- (sub_b == sub_ai)
result <- c(result, paste0(ai, substr(b[b_match], 3, 4)))
}
}
result
如果a
或b
不包含唯一值,您可以使用命令指定唯一结果
unique(result)
可能有点复杂,但有效:
unlist( sapply( a, function(x) {
regex <- paste0( substr(x, 2, 3), '(\\d)')
z <- sub(regex, paste0(x, "\\1"), b)
z[!b %in% z]
} ))
一种可能的解决办法:
a <- setNames(a, substr(a, 2, 3))
b <- setNames(b, substr(b, 1, 2))
df <- merge(stack(a), stack(b), by = 'ind')
paste0(substr(df$values.x, 1, 1), df$values.y)
第二种选择:
a <- setNames(a, substr(a, 2, 3))
b <- setNames(b, substr(b, 1, 2))
l <- lapply(names(a), function(x) b[x == names(b)])
paste0(substr(rep(a, lengths(l)), 1, 1), unlist(l))
a在中间部分使用dplyr::inner_连接:
library(dplyr)
a <- c(123, 234, 432, 223)
b <- c(234, 238, 342, 325, 326)
a1 <- data.frame(a)
b1 <- data.frame(b)
a1 <- a1 %>% mutate(first_a = substr(a, 1, 1), last_a = substr(a, 2, 3))
b1 <- b1 %>% mutate(first_b = substr(b, 1, 2), last_b = substr(b, 3, 3))
c1 <- inner_join(a1, b1, by = c("last_a" = "first_b"))
results <- paste0(c1$a, c1$last_b)
库(dplyr)
另一种方法是使用expand.grid
,因此在sub_a
和sub_b
中拾取
d1 <- expand.grid(a, b, stringsAsFactors = FALSE)
d2 <- expand.grid(sub_a, sub_b, stringsAsFactors = FALSE)
i1 <- d2$Var1 == d2$Var2
d1 <- d1[i1,]
d1$Var1 <- substr(d1$Var1, 1, 1)
do.call(paste0, d1)
#[1] "1234" "2234" "1238" "2238" "2342" "4325" "4326"
d1这里是base R中的另一个选项:
foo <- function(a, b) {
split_a <- split(a, substr(a, 2, 3))
split_b <- split(substr(b, 3, 3), substr(b, 1, 2))
idx <- intersect(names(split_a), names(split_b))
stopifnot(length(idx) > 0)
unlist(Map(function(x,y) outer(x, y, paste0), split_a[idx], split_b[idx]),
use.names = FALSE)
}
foo(a, b)
# [1] "1234" "2234" "1238" "2238" "4325" "4326" "2342"
fooA基准(将sub_A和sub_b创建添加到Sotos和Heikki答案中,以便每个人都从相同的初始向量开始A
800个观察值和b
1000个观察值)
使用以下工具运行基准测试:
library(dplyr)
library(data.table)
library(microbenchmark)
a <- sample(100:999, 8e3, TRUE)
b <- sample(100:999, 1e4, TRUE)
microbenchmark(Jaap1(a,b), Jaap2(a,b), Tensi(a,b), Heikki(a,b), Sotos(a,b),
Matt_base(a,b), Matt_dplyr(a,b), Docendo(a,b),
zx8754(a,b), zx8754for(a,b), Frank(a,b),
times = 50, unit = 'relative')
使用的功能包括:
Jaap1 <- function(a,b) {
a <- setNames(a, substr(a,2,3))
b <- setNames(b, substr(b,1,2))
df <- merge(stack(a), stack(b), by = 'ind')
paste0(substr(df$values.x,1,1), df$values.y)
}
Jaap2 <- function(a,b) {
a <- setNames(a, substr(a,2,3))
b <- setNames(b, substr(b,1,2))
l <- lapply(names(a), function(x) b[x == names(b)])
paste0(substr(rep(a, lengths(l)),1,1), unlist(l))
}
Tensi <- function(a,b) {
unlist(sapply(a,function(x) {regex <- paste0(substr(x,2,3),'(\\d)'); z <- sub(regex,paste0(x,"\\1"),b); z[!b %in% z] } ))
}
Heikki <- function(a,b) {
sub_a <- substr(a, 2, 3)
sub_b <- substr(b, 1, 2)
result <- c()
for (ai in a) {
sub_ai <- substr(ai,2,3)
if (sub_ai %in% sub_a) {
b_match <- (sub_b == sub_ai)
result <- c(result,paste0(ai,substr(b[b_match],3,4)))
}
}
result
}
Sotos <- function(a,b) {
sub_a <- substr(a, 2, 3)
sub_b <- substr(b, 1, 2)
d1 <- expand.grid(a, b, stringsAsFactors = FALSE)
d2 <- expand.grid(sub_a, sub_b, stringsAsFactors = FALSE)
i1 <- d2$Var1 == d2$Var2
d1 <- d1[i1,]
d1$Var1 <- substr(d1$Var1, 1, 1)
do.call(paste0, d1)
}
Matt_base <- function(a,b) {
a1 <- data.frame(a)
b1 <- data.frame(b)
a1$first_a = substr(a1$a, 1, 1)
a1$last_a = substr(a1$a, 2, 3)
b1$first_b = substr(b1$b, 1, 2)
b1$last_b = substr(b1$b, 3, 3)
c1 <- merge(a1, b1, by.x = "last_a", by.y = "first_b")
results <- paste0(c1$a, c1$last_b)
}
Matt_dplyr <- function(a,b) {
a1 <- data.frame(a)
b1 <- data.frame(b)
a1 <- a1 %>% mutate(first_a = substr(a, 1, 1), last_a = substr(a, 2, 3))
b1 <- b1 %>% mutate(first_b = substr(b, 1, 2), last_b = substr(b, 3, 3))
c1 <- inner_join(a1, b1, by = c("last_a" = "first_b"))
results <- paste0(c1$a, c1$last_b)
}
Docendo <- function(a, b) {
split_a <- split(a, substr(a, 2, 3))
split_b <- split(substr(b, 3, 3), substr(b, 1, 2))
idx <- intersect(names(split_a), names(split_b))
stopifnot(length(idx) > 0)
unlist(Map(function(x,y) outer(x, y, paste0), split_a[idx], split_b[idx]),
use.names = FALSE)
}
zx8754 <- function(a, b) {
unlist(sapply(a, function(i) i * 10 + (b %% 10)[i %% 100 == b %/% 10]))
}
zx8754for <- function(a, b) {
res <- integer()
for(i in a) res <- c(res, i * 10 + (b %% 10)[i %% 100 == b %/% 10])
res
}
Frank <- function(a, b) {
aDT <- as.data.table(tstrsplit(a, ""))
bDT <- setnames(as.data.table(tstrsplit(b, "")), c("V2", "V3", "V4"))
merge(aDT, bDT, allow.cartesian = TRUE)[, paste0(V1, V2, V3, V4)]
}
Jaap1来点数学怎么样*:
unlist(sapply(a, function(i)
i * 10 + (b %% 10)[i %% 100 == b %/% 10]))
*假设:所有数字都是3位数,但这当然可以在Sappy内调整
检查输出,输出的顺序将不同于其他答案,并且输出是数字,而不是字符
identical(sort(as.numeric(docendo(a, b))), sort(zx8754(a, b)))
# [1] TRUE
identical(sort(as.numeric(jaap(a, b))), sort(zx8754(a, b)))
# [1] TRUE
Edit:forloop版本似乎快了3倍(例如,小数据集越大,速度实际上慢了3倍,请参阅benchmarks wiki文章)
zx8754另一个选项是将其放入列并连接:
library(data.table)
Frank <- function(a, b) {
aDT <- setDT(tstrsplit(a, ""))
bDT <- setnames(setDT(tstrsplit(b, "")), c("V2", "V3", "V4"))
merge(aDT, bDT, allow.cartesian = TRUE)[, paste0(V1, V2, V3, V4)]
}
库(data.table)
弗兰克,嗯。。。一个不是很有效的方法是d1,我认为match
不适合你,因为它在第一场比赛后就停止了。可能更像是%sub
@Sotos中的sub\u%,除了在最终输出中重复两次common
部分之外,它是有效的。我们能提高效率吗?我已经在使用这么多额外的变量(sub_a
,sub_b
,common
),再添加两个会很昂贵吗?看看这个:@ΦXocę웃Пepeúpaツ 是的,我知道这一点。循环本身并不坏,只要你不在循环中增加一个向量。非常好@Jaap。我从来没有想到可以在命名向量上使用堆栈
;我很好奇:这有没有记录在案?我在?stack
@MauritsEvers中找不到任何东西如果它不在?stack
中,那么我就不知道另一个记录此内容的来源:-\@MauritsEvers我必须回到我以前的评论:它在?stack
的文档中,但仅在细节部分的最后一行中隐含:这些函数是通用的:提供的方法处理数据帧和对象,这些数据帧和对象可通过as.list
(强调我的方法)强制到列表中。一个命名向量可以强制到一个列表中。这是一个很好而且最快的!请参见单独cw答案中的基准测试。
foo <- function(a, b) {
split_a <- split(a, substr(a, 2, 3))
split_b <- split(substr(b, 3, 3), substr(b, 1, 2))
idx <- intersect(names(split_a), names(split_b))
stopifnot(length(idx) > 0)
unlist(Map(function(x,y) outer(x, y, paste0), split_a[idx], split_b[idx]),
use.names = FALSE)
}
foo(a, b)
# [1] "1234" "2234" "1238" "2238" "4325" "4326" "2342"
set.seed(123)
a <- sample(100:999, 1e4, TRUE)
b <- sample(100:999, 1e3, TRUE)
library(microbenchmark)
library(dplyr)
res <- microbenchmark(docendo(a, b),
Jaap1(a, b),
Jaap2(a, b),
Sotos(a, b),
Tensi(a, b),
Heikki(a, b),
Matt_base(a, b),
Matt_dplyr(a, b),
zx8754(a, b),
times = 10, unit = "relative")
Unit: relative
expr min lq mean median uq max neval
docendo(a, b) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10
Jaap1(a, b) 14.002977 13.724432 13.347755 13.433175 12.788948 13.301811 10
Jaap2(a, b) 4.364993 4.936248 5.201879 5.125639 5.060425 7.520069 10
Sotos(a, b) 22.215750 23.850280 25.743047 25.177676 28.274083 28.288089 10
Tensi(a, b) 231.230360 234.830000 246.587532 242.345573 260.784725 273.184452 10
Heikki(a, b) 135.615708 136.900943 144.775845 146.314048 150.546406 156.873954 10
Matt_base(a, b) 13.274675 12.995334 13.402940 12.723798 12.432802 18.881093 10
Matt_dplyr(a, b) 1.299223 1.314568 1.420479 1.345850 1.380378 1.807671 10
zx8754(a, b) 9.607226 10.175381 10.486580 10.136439 10.096818 13.410858 10
Frank <- function(a, b) {
aDT <- as.data.table(tstrsplit(a, ""))
bDT <- setnames(as.data.table(tstrsplit(b, "")), c("V2", "V3", "V4"))
merge(aDT, bDT, allow.cartesian = TRUE)[, paste0(V1, V2, V3, V4)]
}
set.seed(1) # same input size as in the cw benchmark answer
a <- sample(100:999, 8e3, TRUE)
b <- sample(100:999, 1e4, TRUE)
microbenchmark(Frank(a, b), docendo(a, b), unit = "relative", times = 10)
Unit: relative
expr min lq mean median uq max neval
Frank(a, b) 1.37435 1.390417 1.500996 1.470548 1.644079 1.616446 10
docendo(a, b) 1.00000 1.000000 1.000000 1.000000 1.000000 1.000000 10
all.equal(sort(docendo(a, b)), sort(Frank(a, b)))
#[1] TRUE
library(dplyr)
library(data.table)
library(microbenchmark)
a <- sample(100:999, 8e3, TRUE)
b <- sample(100:999, 1e4, TRUE)
microbenchmark(Jaap1(a,b), Jaap2(a,b), Tensi(a,b), Heikki(a,b), Sotos(a,b),
Matt_base(a,b), Matt_dplyr(a,b), Docendo(a,b),
zx8754(a,b), zx8754for(a,b), Frank(a,b),
times = 50, unit = 'relative')
Unit: relative
expr min lq mean median uq max neval cld
Jaap1(a, b) 19.668483 19.316194 17.2373827 18.921573 18.829932 7.8792713 50 d
Jaap2(a, b) 4.253151 4.365420 4.0557281 4.309247 4.398149 2.2149125 50 b
Tensi(a, b) 241.682216 238.197815 212.2844582 233.473689 233.367619 93.3562331 50 h
Heikki(a, b) 114.895836 113.754054 101.2781709 111.637570 110.541708 44.9437229 50 g
Sotos(a, b) 27.598767 28.725937 25.7469518 28.534011 28.638413 11.6995642 50 e
Matt_base(a, b) 19.159883 18.834180 16.8853660 18.513498 18.416194 7.8329323 50 d
Matt_dplyr(a, b) 1.108230 1.106051 1.0203776 1.102078 1.098476 1.0131898 50 a
Docendo(a, b) 1.000000 1.000000 1.0000000 1.000000 1.000000 1.0000000 50 a
zx8754(a, b) 11.601730 12.986763 11.7859245 13.054720 13.234842 5.6944437 50 c
zx8754for(a, b) 90.448168 92.906445 82.4905438 91.092609 90.160010 36.1277145 50 f
Frank(a, b) 1.070775 1.070202 0.9621499 1.063978 1.055540 0.4459918 50 a
Jaap1 <- function(a,b) {
a <- setNames(a, substr(a,2,3))
b <- setNames(b, substr(b,1,2))
df <- merge(stack(a), stack(b), by = 'ind')
paste0(substr(df$values.x,1,1), df$values.y)
}
Jaap2 <- function(a,b) {
a <- setNames(a, substr(a,2,3))
b <- setNames(b, substr(b,1,2))
l <- lapply(names(a), function(x) b[x == names(b)])
paste0(substr(rep(a, lengths(l)),1,1), unlist(l))
}
Tensi <- function(a,b) {
unlist(sapply(a,function(x) {regex <- paste0(substr(x,2,3),'(\\d)'); z <- sub(regex,paste0(x,"\\1"),b); z[!b %in% z] } ))
}
Heikki <- function(a,b) {
sub_a <- substr(a, 2, 3)
sub_b <- substr(b, 1, 2)
result <- c()
for (ai in a) {
sub_ai <- substr(ai,2,3)
if (sub_ai %in% sub_a) {
b_match <- (sub_b == sub_ai)
result <- c(result,paste0(ai,substr(b[b_match],3,4)))
}
}
result
}
Sotos <- function(a,b) {
sub_a <- substr(a, 2, 3)
sub_b <- substr(b, 1, 2)
d1 <- expand.grid(a, b, stringsAsFactors = FALSE)
d2 <- expand.grid(sub_a, sub_b, stringsAsFactors = FALSE)
i1 <- d2$Var1 == d2$Var2
d1 <- d1[i1,]
d1$Var1 <- substr(d1$Var1, 1, 1)
do.call(paste0, d1)
}
Matt_base <- function(a,b) {
a1 <- data.frame(a)
b1 <- data.frame(b)
a1$first_a = substr(a1$a, 1, 1)
a1$last_a = substr(a1$a, 2, 3)
b1$first_b = substr(b1$b, 1, 2)
b1$last_b = substr(b1$b, 3, 3)
c1 <- merge(a1, b1, by.x = "last_a", by.y = "first_b")
results <- paste0(c1$a, c1$last_b)
}
Matt_dplyr <- function(a,b) {
a1 <- data.frame(a)
b1 <- data.frame(b)
a1 <- a1 %>% mutate(first_a = substr(a, 1, 1), last_a = substr(a, 2, 3))
b1 <- b1 %>% mutate(first_b = substr(b, 1, 2), last_b = substr(b, 3, 3))
c1 <- inner_join(a1, b1, by = c("last_a" = "first_b"))
results <- paste0(c1$a, c1$last_b)
}
Docendo <- function(a, b) {
split_a <- split(a, substr(a, 2, 3))
split_b <- split(substr(b, 3, 3), substr(b, 1, 2))
idx <- intersect(names(split_a), names(split_b))
stopifnot(length(idx) > 0)
unlist(Map(function(x,y) outer(x, y, paste0), split_a[idx], split_b[idx]),
use.names = FALSE)
}
zx8754 <- function(a, b) {
unlist(sapply(a, function(i) i * 10 + (b %% 10)[i %% 100 == b %/% 10]))
}
zx8754for <- function(a, b) {
res <- integer()
for(i in a) res <- c(res, i * 10 + (b %% 10)[i %% 100 == b %/% 10])
res
}
Frank <- function(a, b) {
aDT <- as.data.table(tstrsplit(a, ""))
bDT <- setnames(as.data.table(tstrsplit(b, "")), c("V2", "V3", "V4"))
merge(aDT, bDT, allow.cartesian = TRUE)[, paste0(V1, V2, V3, V4)]
}
unlist(sapply(a, function(i)
i * 10 + (b %% 10)[i %% 100 == b %/% 10]))
identical(sort(as.numeric(docendo(a, b))), sort(zx8754(a, b)))
# [1] TRUE
identical(sort(as.numeric(jaap(a, b))), sort(zx8754(a, b)))
# [1] TRUE
zx8754 <- function(a, b) {
unlist(sapply(a, function(i) i * 10 + (b %% 10)[i %% 100 == b %/% 10]))
}
zx8754_forloop <- function(a, b) {
res <- integer()
for(i in a) res <- c(res, i * 10 + (b %% 10)[i %% 100 == b %/% 10])
res
}
microbenchmark::microbenchmark(
zx8754 = zx8754(a, b),
zx8754_forloop = zx8754_forloop(a, b)
)
# Unit: microseconds
# expr min lq mean median uq max neval
# zx8754 16.535 17.3910 55.05348 17.676 18.246 3672.223 100
# zx8754_forloop 4.562 5.4165 46.74887 5.987 6.272 4080.469 100
#check output
identical(zx8754(a, b), zx8754_forloop(a, b))
# [1] TRUE
library(data.table)
Frank <- function(a, b) {
aDT <- setDT(tstrsplit(a, ""))
bDT <- setnames(setDT(tstrsplit(b, "")), c("V2", "V3", "V4"))
merge(aDT, bDT, allow.cartesian = TRUE)[, paste0(V1, V2, V3, V4)]
}
MattDT <- function(a,b){
aDT2 <- data.table(V1 = substring(a,1,1), V23 = substring(a,2,3))
bDT2 <- data.table(V23 = substring(b,1,2), V4 = substring(b,3,3))
merge(aDT2, bDT2, allow.cartesian = TRUE)[, paste0(V1, V23, V4)]
}