R 如何在指定单词旁边隔离一个单词_R_String_Stringr

R 如何在指定单词旁边隔离一个单词

r string

R 如何在指定单词旁边隔离一个单词,r,string,stringr,R,String,Stringr,我的数据帧有多种字符串。参见示例df： strings <- c("Average complications and higher payment", "Average complications and average payment", "Average complications and lower payment", "Average mortality and higher payment", "Better mo

我的数据帧有多种字符串。参见示例df：

# Sample data: each string has the form
# "<quality> <measure> and <payment level> payment".
strings <- c(
  "Average complications and higher payment",
  "Average complications and average payment",
  "Average complications and lower payment",
  "Average mortality and higher payment",
  "Better mortality and average payment"
)

# Keep the column as character. Spell out FALSE: `F` is an ordinary variable
# that can be reassigned, whereas FALSE is a reserved word.
df <- data.frame(strings, stringsAsFactors = FALSE)

使用基础R的strsplit、head和tail函数：
# Build a one-row data frame per input string, then stack them with rbind.
# The source data frame is `df` (R is case-sensitive, so `DF` is a different,
# undefined object).
outDF <- do.call(rbind, lapply(df$strings, function(x) {
  # Split the string into its space-separated words.
  strObj <- unlist(strsplit(x, split = " "))

  # First word -> QualityWord; second-to-last word -> PaymentWord.
  data.frame(
    strings = x,
    QualityWord = head(strObj, 1),
    PaymentWord = head(tail(strObj, 2), 1),
    stringsAsFactors = FALSE
  )
}))

outDF
#                                    strings QualityWord PaymentWord
#1  Average complications and higher payment     Average      higher
#2 Average complications and average payment     Average     average
#3   Average complications and lower payment     Average       lower
#4      Average mortality and higher payment     Average      higher
#5      Better mortality and average payment      Better     average

或者，使用dplyr和自定义函数：
# Extract the first word and the second-to-last word of each string.
#
# x: character vector of space-separated phrases.
#
# Returns a data frame with one row per element of `x` and the columns
# `strings` (the input), `QualityWord` (first word) and `PaymentWord`
# (second-to-last word; for a one-word string this is that word). Elements
# that contain no words yield NA in both word columns.
customFn <- function(x) {
  # One character vector of words per input element.
  words <- strsplit(x, split = " ")

  first_word <- vapply(
    words,
    function(w) if (length(w) > 0L) w[[1L]] else NA_character_,
    character(1),
    USE.NAMES = FALSE
  )
  second_last_word <- vapply(
    words,
    function(w) if (length(w) > 0L) head(tail(w, 2L), 1L) else NA_character_,
    character(1),
    USE.NAMES = FALSE
  )

  # Return the data frame itself (visibly) rather than the value of an
  # assignment to an otherwise unused local.
  data.frame(
    strings = x,
    QualityWord = first_word,
    PaymentWord = second_last_word,
    stringsAsFactors = FALSE
  )
}

# `%>%` is not available until a package providing it is attached; dplyr
# re-exports it.
library(dplyr)

# Run customFn on each row's string and combine the one-row results.
# The data frame is `df` (R is case-sensitive; `DF` is never defined).
df %>%
  dplyr::rowwise() %>%
  dplyr::do(customFn(.$strings))

# Keep only the first word: capture it and drop the rest of the string.
df$QualityWord <- sub("(\\w+).*?$", "\\1", df$strings)
# Keep only the word immediately before the trailing " payment".
df$PaymentWord <- sub(".*?(\\w+) payment$", "\\1", df$strings)
df
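#>                                     strings QualityWord PaymentWord
#> 1  Average complications and higher payment     Average      higher
#> 2 Average complications and average payment     Average     average
#> 3   Average complications and lower payment     Average       lower
#> 4      Average mortality and higher payment     Average      higher
#> 5      Better mortality and average payment      Better     average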

正则表达式各部分的含义如下：

(\\w+)
=匹配一个或多个单词字符，并将其捕获为一个分组
.*?
=非贪婪地匹配任意字符
" payment"
=先匹配一个空格，再匹配字符payment
$
=匹配字符串的结尾
\\1
=用第一个捕获分组的内容替换整个匹配
我们可以使用tidyr中的extract函数：

library(tidyverse)

# Capture the first word of each string and the word just before the final
# word. remove = FALSE keeps the original `strings` column in the result.
df %>%
  extract(strings, into = c("QualityWord", "PaymentWord"),
          "^(\\w+).*\\b(\\w+)\\s+\\w+$", remove = FALSE)
#                                    strings QualityWord PaymentWord
#1  Average complications and higher payment     Average      higher
#2 Average complications and average payment     Average     average
#3   Average complications and lower payment     Average       lower
#4      Average mortality and higher payment     Average      higher
#5      Better mortality and average payment      Better     average

library(tidyverse)

# Capture the first word of each string and the word just before the final
# word. remove = FALSE keeps the original `strings` column in the result.
df %>%
  extract(strings, into = c("QualityWord", "PaymentWord"),
          "^(\\w+).*\\b(\\w+)\\s+\\w+$", remove = FALSE)
#                                    strings QualityWord PaymentWord
#1  Average complications and higher payment     Average      higher
#2 Average complications and average payment     Average     average
#3   Average complications and lower payment     Average       lower
#4      Average mortality and higher payment     Average      higher
#5      Better mortality and average payment      Better     average