Regex 在 R 中比 gsub 更快的方法_Regex_R

Regex 在 R 中比 gsub 更快的方法

regex r

Regex 在r中比gsub更快的接近,regex,r,Regex,R,我试图找出，在R中是否有比gsub矢量化函数更快的方法。我有下面的数据框，其中有一些“句子”（sent$words），然后我有从这些句子中删除的单词（存储在words forremoving变量中） sent正如Jason所说，stringi是您的好选择下面是stringi的性能 system.time(res <- gsub(pattern, "", sent$words)) user system elapsed 66.229 0.000 66.199 libr

我想知道，在 R 中是否有比矢量化函数 gsub 更快的方法。我有下面的数据框，其中包含一些“句子”（sent$words），另外还有需要从这些句子中删除的单词（存储在 wordsForRemoving 变量中）。

sent正如Jason所说，stringi是您的好选择
下面是stringi的性能
system.time(res <- gsub(pattern, "", sent$words))
   user  system elapsed 
 66.229   0.000  66.199 

library(stringi)
system.time(stri_replace_all_regex(sent$words, pattern, ""))
   user  system elapsed 
 21.246   0.320  21.552 

system.time(res <- gsub(pattern, "", sent$words, perl = TRUE))

这不是一个真正的答案，因为我没有找到任何总是更快的方法。显然，这取决于文本/向量的长度。对于短文本，gsub 执行速度最快；对于长文本或向量，有时使用 perl=TRUE 的 gsub 执行速度最快，有时 stri_replace_all_regex 执行速度最快。
下面是一些要尝试的测试代码：
# Compare four ways of removing parentheses from a string and benchmark them:
# base gsub(), gsub() with perl = TRUE, and two stringi entry points.
library(stringi)
library(microbenchmark)

# Sample input. Uncomment one of the two lines below to try a longer string
# or a longer character vector; the ranking of the approaches may differ.
text <- "(a1,\"something (f fdd71)\");(b2,\"something else (a fa171)\");(b4,\"something else (a fa171)\")"
# text <- paste(rep(text, 5), collapse = ",")
# text <- rep(text, 100)
nchar(text)

# Run every approach once so the results can be compared with each other.
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
a <- gsub(pattern = "[()]", replacement = "", x = text)
b <- gsub(pattern = "[()]", replacement = "", x = text, perl = TRUE)
c <- stri_replace_all_regex(str = text, pattern = "[()]", replacement = "")
d <- stri_replace(str = text, regex = "[()]", replacement = "", mode = "all")

# Each call prints TRUE when the approach agrees with plain gsub().
identical(a, b); identical(a, c); identical(a, d)

# Time the same four expressions; printing `mc` shows the summary table.
mc <- microbenchmark(
  gsub = gsub(pattern = "[()]", replacement = "", x = text),
  gsub_perl = gsub(pattern = "[()]", replacement = "", x = text, perl = TRUE),
  stringi_all = stri_replace_all_regex(
    str = text, pattern = "[()]", replacement = ""
  ),
  stringi = stri_replace(
    str = text, regex = "[()]", replacement = "", mode = "all"
  )
)
mc

通过使用 stringi::stri_replace_all_regex(sent$words, pattern, "")
尝试使用 base R 的 gsub
并设置 perl=TRUE，您将获得一些改进（在我尝试的示例中，运行时间减少了60%）。
# Build one regex that matches any of the words to remove as a whole word
# (\\b ... \\b), together with a single optional trailing space.
pattern <- paste0(
  "\\b(?:",
  paste(wordsForRemoving, collapse = "|"),
  ")\\b ?"
)
# Delete every match from each sentence in one vectorised call.
res <- gsub(pattern = pattern, replacement = "", x = sent$words)

#  user  system elapsed 
# 72.87    0.05   73.79

system.time(res <- gsub(pattern, "", sent$words))
   user  system elapsed 
 66.229   0.000  66.199 

library(stringi)
system.time(stri_replace_all_regex(sent$words, pattern, ""))
   user  system elapsed 
 21.246   0.320  21.552 

system.time(res <- gsub(pattern, "", sent$words, perl = TRUE))
   user  system elapsed 
 12.290   0.000  12.281 

# Compare four ways of removing parentheses from a string and benchmark them:
# base gsub(), gsub() with perl = TRUE, and two stringi entry points.
library(stringi)
library(microbenchmark)

# Sample input. Uncomment one of the two lines below to try a longer string
# or a longer character vector; the ranking of the approaches may differ.
text <- "(a1,\"something (f fdd71)\");(b2,\"something else (a fa171)\");(b4,\"something else (a fa171)\")"
# text <- paste(rep(text, 5), collapse = ",")
# text <- rep(text, 100)
nchar(text)

# Run every approach once so the results can be compared with each other.
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
a <- gsub(pattern = "[()]", replacement = "", x = text)
b <- gsub(pattern = "[()]", replacement = "", x = text, perl = TRUE)
c <- stri_replace_all_regex(str = text, pattern = "[()]", replacement = "")
d <- stri_replace(str = text, regex = "[()]", replacement = "", mode = "all")

# Each call prints TRUE when the approach agrees with plain gsub().
identical(a, b); identical(a, c); identical(a, d)

# Time the same four expressions; printing `mc` shows the summary table.
mc <- microbenchmark(
  gsub = gsub(pattern = "[()]", replacement = "", x = text),
  gsub_perl = gsub(pattern = "[()]", replacement = "", x = text, perl = TRUE),
  stringi_all = stri_replace_all_regex(
    str = text, pattern = "[()]", replacement = ""
  ),
  stringi = stri_replace(
    str = text, regex = "[()]", replacement = "", mode = "all"
  )
)
mc

Unit: microseconds
        expr    min      lq     mean  median     uq     max neval  cld
        gsub 10.868 11.7740 13.47869 13.5840 14.490  31.394   100 a   
   gsub_perl 79.690 80.2945 82.58225 82.4070 83.312 137.043   100    d
 stringi_all 14.188 14.7920 15.58558 15.5460 16.301  17.509   100  b  
     stringi 36.828 38.0350 39.90904 38.7895 39.543 129.194   100   c