R 从文本中提取多个正则表达式_R_Regex_Gsub_Stringr_Alphanumeric

R 从文本中提取多个正则表达式

r regex

R 从文本中提取多个正则表达式,r,regex,gsub,stringr,alphanumeric,R,Regex,Gsub,Stringr,Alphanumeric,我有以下几点 df = data.frame(id = c(1,2,3), text = c('Label issues as ISS101 and ISS 201 on label 23 with x203 17','issue as ISS5051 with label 01 as l018','there is nothing here') 我想从df中提取并创建以下数据帧 id iss label ext1 ext2 1 ISS101 23 x203 1

我有以下几点

# Sample input: one free-text record per id. The third record contains none
# of the fields to be extracted.
df <- data.frame(
  id = c(1, 2, 3),
  text = c(
    'Label issues as ISS101  and ISS 201 on label 23 with x203 17',
    'issue as ISS5051 with label 01 as l018',
    'there is nothing here'
  )
)

我想从df中提取并创建以下数据帧

id  iss     label  ext1 ext2
 1  ISS101  23     x203  17
 1  ISS201  23     x203  17
 2  ISS5051 01     l018  NA
 3    NA    NA      NA   NA

iss的长度可能会有所不同，如示例中所示。在“ISS”和后面的数字之间可能有空格，也可能没有空格，如示例所示。label、ext1和ext2的长度是固定的。

我尝试了使用stringr和dplyr配合正则表达式的各种可能性。但这些都不接近解决方案，因此不值得在这里提及。期待帮助，如果您需要更多详细信息，请告诉我。

您可以像这样使用 dplyr 和 stringr：

library(dplyr)
library(stringr)
library(tidyr) # provides unnest(); it is not exported by dplyr or stringr

# Build one row per ISS code found in `text`, together with the label and the
# two ext fields extracted from the same string. `df` must already exist with
# a `text` column.
df2 <- df %>%
  mutate(
    # Collapse "ISS 201" into "ISS201", then pull out every ISSnnn code. The
    # result is a list-column with zero or more codes per input row.
    iss = str_extract_all(
      str_replace_all(text, "ISS\\s+(\\d+)", "ISS\\1"),
      "ISS\\d+"
    ),
    # Digits that follow the word "label".
    label = str_match(text, "label\\s+(\\d+)")[, 2],
    # First lower-case-letter + digits token after the label number.
    ext1 = str_match(text, "label\\s+\\d+.*?([a-z]\\d+)")[, 2],
    # Digits at the very end of the string, preceded by whitespace.
    ext2 = str_match(text, "\\s(\\d+)$")[, 2]
  ) %>%
  # One output row per extracted ISS code. In the printed result for the
  # sample data, the record without any ISS code does not appear.
  unnest(iss) %>%
  # Keep only the extracted fields.
  select(iss, label, ext1, ext2)

df2

      iss label ext1 ext2
1  ISS101    23 x203   17
2  ISS201    23 x203   17
3 ISS5051    01 l018 <NA>

库（dplyr）
图书馆（stringr）
df2%突变（iss=str\u extract\u all（str\u replace\u all）（文本，“iss\\s+（\\d+），“iss\\1”），
“ISS\\d+”，#删除空格，然后提取ISSnnn
label=str_匹配（文本，“label\\s+（\\d+）”[，2]，#提取label+nn
ext1=str#u match（文本，“label\\s+\\d+.*（[a-z]\\d+））[，2]，#在label之后提取Xnnn
ext2=str#u match（text，“\\s（\\d+）$”[，2]）%>%#提取字符串末尾的数字
unnest（iss）%>%#unnest iss（为每个iss创建一行）
选择（iss、标签、ext1、ext2）#选择所需变量
df2
iss标签ext1 ext2
1 ISS101 23 x203 17
2 ISS201 23 x203 17
3 ISS5051 01 l018

您可以像这样使用 dplyr 和 stringr：

library(dplyr)
library(stringr)
library(tidyr) # provides unnest(); it is not exported by dplyr or stringr

# Build one row per ISS code found in `text`, together with the label and the
# two ext fields extracted from the same string. `df` must already exist with
# a `text` column.
df2 <- df %>%
  mutate(
    # Collapse "ISS 201" into "ISS201", then pull out every ISSnnn code. The
    # result is a list-column with zero or more codes per input row.
    iss = str_extract_all(
      str_replace_all(text, "ISS\\s+(\\d+)", "ISS\\1"),
      "ISS\\d+"
    ),
    # Digits that follow the word "label".
    label = str_match(text, "label\\s+(\\d+)")[, 2],
    # First lower-case-letter + digits token after the label number.
    ext1 = str_match(text, "label\\s+\\d+.*?([a-z]\\d+)")[, 2],
    # Digits at the very end of the string, preceded by whitespace.
    ext2 = str_match(text, "\\s(\\d+)$")[, 2]
  ) %>%
  # One output row per extracted ISS code. In the printed result for the
  # sample data, the record without any ISS code does not appear.
  unnest(iss) %>%
  # Keep only the extracted fields.
  select(iss, label, ext1, ext2)

df2

      iss label ext1 ext2
1  ISS101    23 x203   17
2  ISS201    23 x203   17
3 ISS5051    01 l018 <NA>

库（dplyr）
图书馆（stringr）
df2%突变（iss=str\u extract\u all（str\u replace\u all）（文本，“iss\\s+（\\d+），“iss\\1”），
“ISS\\d+”，#删除空格，然后提取ISSnnn
label=str_匹配（文本，“label\\s+（\\d+）”[，2]，#提取label+nn
ext1=str#u match（文本，“label\\s+\\d+.*（[a-z]\\d+））[，2]，#在label之后提取Xnnn
ext2=str#u match（text，“\\s（\\d+）$”[，2]）%>%#提取字符串末尾的数字
unnest（iss）%>%#unnest iss（为每个iss创建一行）
选择（iss、标签、ext1、ext2）#选择所需变量
df2
iss标签ext1 ext2
1 ISS101 23 x203 17
2 ISS201 23 x203 17
3 ISS5051 01 l018

这可能是一个开始：

# For each text, extract the four fields into a small data frame, then stack
# the per-text pieces with plyr::rbind.fill.
# NOTE(review): the sample output printed with this snippet shows
# ext2 = "203" and iss = "ISS 201" (space kept), so the ext2 pattern and the
# iss clean-up still need work before this matches the requested result.
do.call(plyr::rbind.fill, 
  lapply(df$text, function(x) {
    as.data.frame(cbind(
      # "ISS", an optional whitespace character, then 3-4 digits
      iss = unlist(stringr::str_extract_all(x, "(ISS\\s?\\d{3,4})")),
      # one or two digits after "label"; the optional whitespace is part of
      # the match
      label = unlist(stringr::str_extract_all(x, "(?<=label)\\s?(\\d{1,2})")),
      # "x" or "l" followed by three digits
      ext1 = unlist(stringr::str_extract_all(x, "((x|l)\\d{3})")),
      # NOTE(review): the lookbehind alternation reads as `x` OR `l\d{3}`,
      # which would explain ext2 picking up "203" right after "x" -- confirm
      ext2 = unlist(stringr::str_extract_all(x, "(?<=x|l\\d{3})\\s?\\d{1,3}"))
    ))}
    ))

      iss label ext1 ext2
1  ISS101    23 x203  203
2 ISS 201    23 x203  203
3 ISS5051    01 l018 <NA>

do.call（plyr:：rbind.fill，
lappy（df$文本，函数（x）{
as.data.frame（cbind(
iss=unlist（stringr:：str_extract_all（x，“（iss\\s？\\d{3,4}）”），
label=unlist（stringr:：str_extract_all（x），（？这可能是一个开始：
# For each text, extract the four fields into a small data frame, then stack
# the per-text pieces with plyr::rbind.fill.
# NOTE(review): the sample output printed with this snippet shows
# ext2 = "203" and iss = "ISS 201" (space kept), so the ext2 pattern and the
# iss clean-up still need work before this matches the requested result.
do.call(plyr::rbind.fill, 
  lapply(df$text, function(x) {
    as.data.frame(cbind(
      # "ISS", an optional whitespace character, then 3-4 digits
      iss = unlist(stringr::str_extract_all(x, "(ISS\\s?\\d{3,4})")),
      # one or two digits after "label"; the optional whitespace is part of
      # the match
      label = unlist(stringr::str_extract_all(x, "(?<=label)\\s?(\\d{1,2})")),
      # "x" or "l" followed by three digits
      ext1 = unlist(stringr::str_extract_all(x, "((x|l)\\d{3})")),
      # NOTE(review): the lookbehind alternation reads as `x` OR `l\d{3}`,
      # which would explain ext2 picking up "203" right after "x" -- confirm
      ext2 = unlist(stringr::str_extract_all(x, "(?<=x|l\\d{3})\\s?\\d{1,3}"))
    ))}
    ))

      iss label ext1 ext2
1  ISS101    23 x203  203
2 ISS 201    23 x203  203
3 ISS5051    01 l018 <NA>

do.call（plyr:：rbind.fill，
lappy（df$文本，函数（x）{
as.data.frame（cbind(
iss=unlist（stringr:：str_extract_all（x，“（iss\\s？\\d{3,4}）”），
label=unlist（stringr:：str_extract_all（x），（？根据您的描述，我已经尽了最大的努力。在没有看到更多数据的情况下，我不能保证这将是通用的，但它为您提供的df生成了所需的输出，因此这应该是一个良好的开端
# Packages used below: %>%, mutate, select, distinct, filter, arrange (dplyr),
# gather (tidyr), str_extract and str_replace_all (stringr).
library(dplyr)
library(tidyr)
library(stringr)

# create data frame
df <- data.frame(
  id = c(1, 2, 3),
  text = c(
    'Label issues as ISS101  and ISS 201 on label 23 with x203 17',
    'issue as ISS5051 with label 01 as l018',
    'there is nothing here'
  )
)

# parse text into fields
df <- df %>%
  mutate(
    # "ISS" directly followed by digits (plus one trailing non-digit)
    iss = str_extract(text, "ISS\\d+\\D"),
    # "ISS", one whitespace character, digits (plus one trailing non-digit)
    iss_space = str_extract(text, "ISS\\s\\d+\\D"),
    # two-step label: take the text starting at "label", then the first run
    # of digits (plus one trailing non-digit) inside it
    label = str_extract(text, "label.+\\D"),
    label = str_extract(label, "\\d+\\D"),
    # whitespace, one non-digit character, three digits
    ext1 = str_extract(text, "\\s\\D\\d{3}"),
    # two-step ext2: find ext1 followed by whitespace and two digits, then
    # keep only the trailing whitespace + two digits
    ext2 = str_extract(text, "\\s\\D\\d{3}\\s\\d{2}"),
    ext2 = str_extract(ext2, "\\s\\d{2}")
  )

# clean up into correct format
df <- df %>%
  # Stack the two ISS variants into a single `iss` column, one row per
  # (id, variant). gather() still works, but tidyr documents it as superseded
  # by pivot_longer().
  gather(iss, iss_space, key = "type", value = "iss") %>%
  select(-text, -type) %>%
  distinct() %>%
  # Drop an all-NA ISS row when the same id already appeared earlier.
  filter(!(duplicated(id) & is.na(iss))) %>%
  arrange(id) %>%
  select(id, iss, label, ext1, ext2) %>%
  # Remove the spaces captured inside and after the ISS code.
  mutate(iss = str_replace_all(iss, " ", ""))

df

  id     iss label  ext1 ext2
1  1  ISS101   23   x203   17
2  1  ISS201   23   x203   17
3  2 ISS5051   01   l018 <NA>
4  3    <NA>  <NA>  <NA> <NA>

#创建数据帧
df=data.frame（id=c（1,2,3），text=c（‘标签号为ISS101，标签号为ISS201，标签号为x203 17’，‘标签号为ISS5051，标签号为l018’，‘此处无任何内容’）
#将文本解析为字段
df%变异(
iss=str_extract（文本，“iss\\d+\\d”），
iss_space=str_extract（文本，“iss\\s\\d+\\d”），
label=str_extract（文本，“label.+\\D”），
label=str_extract（标签“\\d+\\d”），
ext1=str_extract（文本“\\s\\D\\D{3}”），
ext2=str_extract（文本“\\s\\D\\D{3}\\s\\D{2}”），
ext2=str_extract（ext2，“\\s\\d{2}”））
#整理成正确的格式
df%
聚集（iss，iss_空格，key=“type”，value=“iss”）%>%
选择（-text，-type）%%>%
不同的（）%>%
过滤器（！（重复的（id）=T&is.na（iss）==T））%>%
排列（id）%>%
选择（id、iss、标签、ext1、ext2）%>%
变异（iss=str\u替换所有（iss，“，”））
df
id iss标签ext1 ext2
1 ISS101 23 x203 17
2 1 ISS201 23 x203 17
3 2 ISS5051 01 l018
4  3         
根据您的描述，我已经尽了最大努力。在没有看到更多数据的情况下，我无法保证这是可以推广的，但它为您提供的df生成了所需的输出，因此这应该是一个良好的开端
# Packages used below: %>%, mutate, select, distinct, filter, arrange (dplyr),
# gather (tidyr), str_extract and str_replace_all (stringr).
library(dplyr)
library(tidyr)
library(stringr)

# create data frame
df <- data.frame(
  id = c(1, 2, 3),
  text = c(
    'Label issues as ISS101  and ISS 201 on label 23 with x203 17',
    'issue as ISS5051 with label 01 as l018',
    'there is nothing here'
  )
)

# parse text into fields
df <- df %>%
  mutate(
    # "ISS" directly followed by digits (plus one trailing non-digit)
    iss = str_extract(text, "ISS\\d+\\D"),
    # "ISS", one whitespace character, digits (plus one trailing non-digit)
    iss_space = str_extract(text, "ISS\\s\\d+\\D"),
    # two-step label: take the text starting at "label", then the first run
    # of digits (plus one trailing non-digit) inside it
    label = str_extract(text, "label.+\\D"),
    label = str_extract(label, "\\d+\\D"),
    # whitespace, one non-digit character, three digits
    ext1 = str_extract(text, "\\s\\D\\d{3}"),
    # two-step ext2: find ext1 followed by whitespace and two digits, then
    # keep only the trailing whitespace + two digits
    ext2 = str_extract(text, "\\s\\D\\d{3}\\s\\d{2}"),
    ext2 = str_extract(ext2, "\\s\\d{2}")
  )

# clean up into correct format
df <- df %>%
  # Stack the two ISS variants into a single `iss` column, one row per
  # (id, variant). gather() still works, but tidyr documents it as superseded
  # by pivot_longer().
  gather(iss, iss_space, key = "type", value = "iss") %>%
  select(-text, -type) %>%
  distinct() %>%
  # Drop an all-NA ISS row when the same id already appeared earlier.
  filter(!(duplicated(id) & is.na(iss))) %>%
  arrange(id) %>%
  select(id, iss, label, ext1, ext2) %>%
  # Remove the spaces captured inside and after the ISS code.
  mutate(iss = str_replace_all(iss, " ", ""))

df

  id     iss label  ext1 ext2
1  1  ISS101   23   x203   17
2  1  ISS201   23   x203   17
3  2 ISS5051   01   l018 <NA>
4  3    <NA>  <NA>  <NA> <NA>

#创建数据帧
df=data.frame（id=c（1,2,3），text=c（‘标签号为ISS101，标签号为ISS201，标签号为x203 17’，‘标签号为ISS5051，标签号为l018’，‘此处无任何内容’）
#将文本解析为字段
df%变异(
iss=str_extract（文本，“iss\\d+\\d”），
iss_space=str_extract（文本，“iss\\s\\d+\\d”），
label=str_extract（文本，“label.+\\D”），
label=str_extract（标签“\\d+\\d”），
ext1=str_extract（文本“\\s\\D\\D{3}”），
ext2=str_extract（文本“\\s\\D\\D{3}\\s\\D{2}”），
ext2=str_extract（ext2，“\\s\\d{2}”））
#整理成正确的格式
df%
聚集（iss，iss_空格，key=“type”，value=“iss”）%>%
选择（-text，-type）%%>%
不同的（）%>%
过滤器（！（重复的（id）=T&is.na（iss）==T））%>%
排列（id）%>%
选择（id、iss、标签、ext1、ext2）%>%
变异（iss=str\u替换所有（iss，“，”））
df
id iss标签ext1 ext2
1 ISS101 23 x203 17
2 1 ISS201 23 x203 17
3 2 ISS5051 01 l018
4  3         
works！！不带走任何东西，你能推荐一份正则表达式备忘单吗？你一直在使用Rstudio@Param当我陷入困境时，我倾向于使用这一个…在我看来，这很好，你可以直接+交互式地测试你的表达式tooworks！！不带走任何东西，你能推荐一个正则表达式备忘单吗？一直在使用Rstudio@Param我当我陷入困境时，倾向于使用这个…在我看来，这很好，你也可以直接+交互地测试你的表达式