在另一个字段上使用正则表达式在R data.table中创建新字段

在另一个字段上使用正则表达式在R data.table中创建新字段,r,regex,data.table,R,Regex,Data.table,给定此数据。表: library(data.table) dt <- data.table(f1 = c( "stuffstuff-0000097125", "stuffstuff.abc.0006496679", "stuffstuff0007517235", "stuffstuff_xyz.0007280719", "stuffstuff0005995303", "stuffstuff_a1b_0000143856", "stuffstuff000936

给定此
数据。表

library(data.table)

dt <- data.table(f1 =  c(
  "stuffstuff-0000097125",
  "stuffstuff.abc.0006496679",
  "stuffstuff0007517235",
  "stuffstuff_xyz.0007280719",
  "stuffstuff0005995303",
  "stuffstuff_a1b_0000143856",
  "stuffstuff0009362407",
  "stuffstuff.c44_0009735298"
))
以下是我尝试过的:

rex_pattern <- "(?<=(\\.|\\_|\\-))[A-Za-z0-9]{3}(?=(\\.|\\_|\\-)[0-9]{3,})"

dt[, `:=`(parsed_val = regmatches(f1, regexpr(pattern = rex_pattern, f1, perl = TRUE)))]  
我试图在函数中使用
ifelse
返回空字符串:

getMmFromFilename <- function(my_file_name){
rex_pattern <- "(?<=(\\.|\\_|\\-))[A-Za-z0-9]{3}(?=(\\.|\\_|\\-)[0-9]{3,})"
nothing_found <- character(length = 0)

mm <- regmatches(my_file_name, regexpr(pattern = rex_pattern, my_file_name, perl = TRUE))
ifelse(identical(mm, nothing_found), "missing_Mm", mm)
}

dt[, .(parsed_val = getMmFromFilename(f1))]
getMmFromFilename

如果要使用
regmatches
可以使用
模式=”(?使用
sub
的想法很好。我仍然对如何在上面的示例中为不匹配的行返回空字符串感兴趣。如果没有提供其他行,我将接受此答案。@Stan您需要使用
extract\u string
from
stringr
包,该包不会删除空字符串matches@Stan能够捕获空字符串只捕获其结尾或开头。因此,只需在正则表达式模式的结尾添加
| ^
|$
。这将解决您的问题
                          f1 parsed_val
1:     stuffstuff-0000097125        abc
2: stuffstuff.abc.0006496679        xyz
3:      stuffstuff0007517235        a1b
4: stuffstuff_xyz.0007280719        c44
5:      stuffstuff0005995303        abc
6: stuffstuff_a1b_0000143856        xyz
7:      stuffstuff0009362407        a1b
8: stuffstuff.c44_0009735298        c44
getMmFromFilename <- function(my_file_name){
rex_pattern <- "(?<=(\\.|\\_|\\-))[A-Za-z0-9]{3}(?=(\\.|\\_|\\-)[0-9]{3,})"
nothing_found <- character(length = 0)

mm <- regmatches(my_file_name, regexpr(pattern = rex_pattern, my_file_name, perl = TRUE))
ifelse(identical(mm, nothing_found), "missing_Mm", mm)
}

dt[, .(parsed_val = getMmFromFilename(f1))]
dt[,parser_val:=sub(".*?[._](.*)[._].*|.*","\\1",f1)]
dt
                          f1 parser_val
1:     stuffstuff-0000097125           
2: stuffstuff.abc.0006496679        abc
3:      stuffstuff0007517235           
4: stuffstuff_xyz.0007280719        xyz
5:      stuffstuff0005995303           
6: stuffstuff_a1b_0000143856        a1b
7:      stuffstuff0009362407           
8: stuffstuff.c44_0009735298        c44
dt[,parser_val:=regmatches(dt$f1,regexpr("(?<=[._]).*(?=[._])|$",dt$f1,perl = T))]
> dt
                          f1 parser_val
1:     stuffstuff-0000097125           
2: stuffstuff.abc.0006496679        abc
3:      stuffstuff0007517235           
4: stuffstuff_xyz.0007280719        xyz
5:      stuffstuff0005995303           
6: stuffstuff_a1b_0000143856        a1b
7:      stuffstuff0009362407           
8: stuffstuff.c44_0009735298        c44