从R中的字符串中提取单词

从R中的字符串中提取单词,r,regex,pattern-matching,stringr,R,Regex,Pattern Matching,Stringr,我试图提取字符串片段,并用这些匹配到的模式创建新的变量。我已经尝试了“stringr”包中的许多函数,但似乎都没有得到想要的结果。下面的例子使用的是虚构的数据。我想获取一个字符串,提取其中的片段,并将它们存储到新数据帧的新列中。 例子 这是一种可能性: temp <- strsplit(ex, "\\(|\\)") df <- setNames(as.data.frame(lapply(1:4,function(i) sapply(temp,"[",i)), stringsAsFactors = FALS…

我试图提取字符串片段,并用这些匹配到的模式创建新的变量。我已经尝试了“stringr”包中的许多函数,但似乎都没有得到想要的结果。下面的例子使用的是虚构的数据。我想获取一个字符串,提取其中的片段,并将它们存储到新数据帧的新列中。

例子(示例数据 `ex` 和 `genres` 的定义见下文)。这是一种可能的解法(基础R):

# Split every entry at each "(" or ")". For the sample strings this yields
# four pieces per entry: title, year, raw genre text and domain.
temp <- strsplit(ex, "\\(|\\)")

# One column per piece. vapply() (rather than sapply()) guarantees that each
# column is a character vector; entries with fewer than four pieces get NA.
df <- setNames(
  as.data.frame(
    lapply(1:4, function(i) vapply(temp, `[`, character(1), i)),
    stringsAsFactors = FALSE
  ),
  c("title", "year", "genre", "domain")
)
df <- df[, c("title", "year", "domain", "genre")]

# For each row, keep the known genres whose name occurs in the raw genre text
# (this drops prefixes such as "HD" or "1080p Bluray"). lapply() always returns
# a list, so rows that all match the same number of genres are not collapsed
# into a matrix the way sapply() would collapse them.
correct <- lapply(
  df$genre,
  function(raw) genres[vapply(genres, grepl, logical(1), x = raw)]
)

# Join the matched genre names of each row with a single space.
correct <- lapply(correct, paste0, collapse = " ")
df$genre <- unlist(correct)

df
#                                         title year                       domain            genre
# 1                              The Accountant  2016 vodmovies112.blogspot.com.es            Crime
# 2 Miss Peregrine's Home for Peculiar Children  2016 vodmovies112.blogspot.com.es   Fantasy Sci-Fi
# 3     Fantastic Beasts And Where To Find Them  2016                  openload.co        Adventure
# 4                                     Ben-Hur  2016 vodmovies112.blogspot.com.es Action Adventure
# 5                                 The Remains  2016                  openload.co           Horror
# 6                               Suicide Squad  2016                  openload.co           Action

使用tidyverse的另一种可能性是:

library(tidyverse)

# tibble() replaces the deprecated data_frame() constructor.
tibble(x = ex) %>%
  extract(
    x,
    # Names follow the order of the capture groups in the regex below:
    # title, four-digit year, the text between the year and the last
    # parenthesis (genre, still carrying prefixes such as "HD"), and the
    # parenthesised domain.
    c("title", "year", "genre", "domain"),
    "(^[^(]+)\\s+\\((\\d{4})\\)\\s*([^(]+)\\s+\\(([^)]+)"
  )

##                                         title  year              genre                       domain
## *                                       <chr> <chr>              <chr>                        <chr>
## 1                              The Accountant  2016              Crime vodmovies112.blogspot.com.es
## 2 Miss Peregrine's Home for Peculiar Children  2016      FantasySci-Fi vodmovies112.blogspot.com.es
## 3     Fantastic Beasts And Where To Find Them  2016        TSAdventure                  openload.co
## 4                                     Ben-Hur  2016  HDActionAdventure vodmovies112.blogspot.com.es
## 5                                 The Remains  2016 1080p BlurayHorror                  openload.co
## 6                               Suicide Squad  2016           HDAction                  openload.co
library(tidyverse)

tibble(x = ex) %>%
  extract(
    x,
    c("title", "year", "genre", "domain"),
    "(^[^(]+)\\s+\\((\\d{4})\\)\\s*([^(]+)\\s+\\(([^)]+)"
  )

##                                         title  year              genre                       domain
## *                                       <chr> <chr>              <chr>                        <chr>
## 1                              The Accountant  2016              Crime vodmovies112.blogspot.com.es
## 2 Miss Peregrine's Home for Peculiar Children  2016      FantasySci-Fi vodmovies112.blogspot.com.es
## 3     Fantastic Beasts And Where To Find Them  2016        TSAdventure                  openload.co
## 4                                     Ben-Hur  2016  HDActionAdventure vodmovies112.blogspot.com.es
## 5                                 The Remains  2016 1080p BlurayHorror                  openload.co
## 6                               Suicide Squad  2016           HDAction                  openload.co
# Split every entry at each "(" or ")". For the sample strings this yields
# four pieces per entry: title, year, raw genre text and domain.
temp <- strsplit(ex, "\\(|\\)")

# One column per piece. vapply() (rather than sapply()) guarantees that each
# column is a character vector; entries with fewer than four pieces get NA.
df <- setNames(
  as.data.frame(
    lapply(1:4, function(i) vapply(temp, `[`, character(1), i)),
    stringsAsFactors = FALSE
  ),
  c("title", "year", "genre", "domain")
)
df <- df[, c("title", "year", "domain", "genre")]

# For each row, keep the known genres whose name occurs in the raw genre text
# (this drops prefixes such as "HD" or "1080p Bluray"). lapply() always returns
# a list, so rows that all match the same number of genres are not collapsed
# into a matrix the way sapply() would collapse them.
correct <- lapply(
  df$genre,
  function(raw) genres[vapply(genres, grepl, logical(1), x = raw)]
)

# Join the matched genre names of each row with a single space.
correct <- lapply(correct, paste0, collapse = " ")
df$genre <- unlist(correct)

df
#                                         title year                       domain            genre
# 1                              The Accountant  2016 vodmovies112.blogspot.com.es            Crime
# 2 Miss Peregrine's Home for Peculiar Children  2016 vodmovies112.blogspot.com.es   Fantasy Sci-Fi
# 3     Fantastic Beasts And Where To Find Them  2016                  openload.co        Adventure
# 4                                     Ben-Hur  2016 vodmovies112.blogspot.com.es Action Adventure
# 5                                 The Remains  2016                  openload.co           Horror
# 6                               Suicide Squad  2016                  openload.co           Action
# Sample input: one string per film. In these examples each string has a
# title, a parenthesised year, free text ending in one or more genre names,
# and a parenthesised domain.
ex <- c(
  "The Accountant (2016)Crime (vodmovies112.blogspot.com.es)",
  "Miss Peregrine's Home for Peculiar Children (2016)FantasySci-Fi (vodmovies112.blogspot.com.es)",
  "Fantastic Beasts And Where To Find Them (2016) TSAdventure (openload.co)",
  "Ben-Hur (2016) HDActionAdventure (vodmovies112.blogspot.com.es)",
  "The Remains (2016) 1080p BlurayHorror (openload.co)",
  "Suicide Squad (2016) HDAction (openload.co)"
)

# Genre names that are searched for inside the raw genre text.
genres <- c(
  "Action", "Adventure", "Animation", "Biography", "Comedy", "Crime",
  "Documentary", "Drama", "Family", "Fantasy", "Film-Noir", "History",
  "Horror", "Music", "Musical", "Mystery", "Romance", "Sci-Fi", "Sport",
  "Thriller", "War", "Western"
)
library(tidyverse)

# tibble() replaces the deprecated data_frame() constructor.
tibble(x = ex) %>%
  extract(
    x,
    # Names follow the order of the capture groups in the regex below:
    # title, four-digit year, the text between the year and the last
    # parenthesis (genre, still carrying prefixes such as "HD"), and the
    # parenthesised domain.
    c("title", "year", "genre", "domain"),
    "(^[^(]+)\\s+\\((\\d{4})\\)\\s*([^(]+)\\s+\\(([^)]+)"
  )

##                                         title  year              genre                       domain
## *                                       <chr> <chr>              <chr>                        <chr>
## 1                              The Accountant  2016              Crime vodmovies112.blogspot.com.es
## 2 Miss Peregrine's Home for Peculiar Children  2016      FantasySci-Fi vodmovies112.blogspot.com.es
## 3     Fantastic Beasts And Where To Find Them  2016        TSAdventure                  openload.co
## 4                                     Ben-Hur  2016  HDActionAdventure vodmovies112.blogspot.com.es
## 5                                 The Remains  2016 1080p BlurayHorror                  openload.co
## 6                               Suicide Squad  2016           HDAction                  openload.co