Regex 从两个文件中匹配单词并提取匹配的单词
标签:regex, r, pattern-matching
我有以下数据框:
# Example data: each row is one phrase (`word`) together with the id of the
# sentence it came from (`sent`) and a numeric value (`val`).
dataFrame <- data.frame(
  sent = c(1, 1, 2, 2, 3, 3, 3, 4, 5),
  word = c(
    "good printer",
    "wireless easy",
    "just right size",
    "size perfect weight",
    "worth price",
    "website great tablet",
    "pan nice tablet",
    "great price",
    "product easy install"
  ),
  val = c(1, 2, 3, 4, 5, 6, 7, 8, 9)
)
然后我会说:
# Nouns to look for inside each phrase of the data frame.
nouns <- c(
  "printer",
  "wireless",
  "weight",
  "price",
  "tablet"
)
这里有一个使用 stringi 包的简单解决方案(注意:size 不在您的 nouns 列表中):
library(stringi)
transform(dataFrame,
          extract = stri_extract_all(word,
                                     regex = paste(nouns, collapse = "|"),
                                     simplify = TRUE))
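#   sent                 word val  extract
# 1    1         good printer   1  printer
# 2    1        wireless easy   2 wireless
# 3    2      just right size   3     <NA>
# 4    2  size perfect weight   4   weight
# 5    3          worth price   5    price
# 6    3 website great tablet   6   tablet
# 7    3      pan nice tablet   7   tablet
# 8    4          great price   8    price
# 9    5 product easy install   9     <NA>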
这是另一种解决方案。它稍微复杂一些,但还会删除 dataFrame$word 中不包含 nouns 里任何名词的行。
require(stringr)
# Example data: each row is one phrase (`word`) together with the id of the
# sentence it came from (`sent`) and a numeric value (`val`).
dataFrame <- data.frame(
  "sent" = c(1, 1, 2, 2, 3, 3, 3, 4, 5),
  "word" = c(
    "good printer", "wireless easy", "just right size",
    "size perfect weight", "worth price", "website great tablet",
    "pan nice tablet", "great price", "product easy install"
  ),
  val = c(1, 2, 3, 4, 5, 6, 7, 8, 9)
)
nouns <- c("printer", "wireless", "weight", "price", "tablet")

# Keep the rows of `df` whose `word` phrase contains at least one of `nouns`
# as a whole space-separated token, and store that noun in an `extract` column.
#
# df:    data frame with a `word` column of phrases.
# nouns: character vector of nouns to look for.
#
# Returns `df` restricted to the matching rows (original row names are kept)
# with an added character column `extract`. When a phrase contains several
# nouns, the one that comes first in `nouns` is used.
extract_nouns <- function(df, nouns) {
  tokens <- strsplit(as.character(df$word), " ", fixed = TRUE)
  matched <- lapply(tokens, function(tok) intersect(nouns, tok))
  has_match <- lengths(matched) > 0L

  # Select with a logical mask rather than negative indices: negating an
  # empty index vector selects zero rows, which would discard every row
  # exactly when all of them match.
  kept <- df[has_match, , drop = FALSE]
  kept$extract <- vapply(
    matched[has_match],
    function(found) found[[1L]],
    character(1),
    USE.NAMES = FALSE
  )
  kept
}

dataFrame <- extract_nouns(dataFrame, nouns)
下面是另一个使用循环函数和if语句的解决方案
# Label every phrase in dataFrame$word with the noun it contains, or with
# "remove" when none of the nouns occurs in it.
word <- dataFrame$word

# Start from the "no match" marker and overwrite it wherever a noun is found.
extract <- rep("remove", length(word))

# grepl() is vectorised over the phrases, so one pass per noun is enough.
# Each noun is used as a regular expression and matches as a substring.
# Nouns are applied in order, so when a phrase contains several of them the
# last one in `nouns` wins. Iterating over `nouns` directly (instead of
# 1:length(...)) also keeps empty inputs from failing.
for (noun in nouns) {
  extract[grepl(noun, as.character(word))] <- noun
}

dataFrame$extract <- extract
# sent word val extract
#1 1 good printer 1 printer
#2 1 wireless easy 2 wireless
#3 2 just right size 3 remove
#4 2 size perfect weight 4 weight
#5 3 worth price 5 price
#6 3 website great tablet 6 tablet
#7 3 pan nice tablet 7 tablet
#8 4 great price 8 price
#9 5 product easy install 9 remove
你不是在一小时前问过一个非常类似的问题吗?你对那里的评论有何回应?我已经关闭了另一个问题,请不要把同一个问题发布两次。对此表示抱歉,我本想修改任务,但不小心把问题重复发布了。有趣的问题!我想知道您是否需要确保 word 列的每个单元格中只能出现一个名词。实际上,我发现 @David Arenburg 的解决方案在有两个单词与名词列表匹配时会自动给出第二列,例如,如果 dataFrame[5,2] 的值是“tablet price”而不是“worth price”。很好的解决方案!非常感谢,David,运行得很好。这正是我要找的。
library(stringi)
# Add an `extract` column holding the nouns found in each phrase; phrases
# without any noun get NA. The nouns are joined with "|" so that a single
# regular expression matches any one of them.
transform(
  dataFrame,
  extract = stri_extract_all_regex(
    word,
    paste0(nouns, collapse = "|"),
    simplify = TRUE
  )
)
# sent word val extract
# 1 1 good printer 1 printer
# 2 1 wireless easy 2 wireless
# 3 2 just right size 3 <NA>
# 4 2 size perfect weight 4 weight
# 5 3 worth price 5 price
# 6 3 website great tablet 6 tablet
# 7 3 pan nice tablet 7 tablet
# 8 4 great price 8 price
# 9 5 product easy install 9 <NA>
require(stringr)
# Example data: each row is one phrase (`word`) together with the id of the
# sentence it came from (`sent`) and a numeric value (`val`).
dataFrame <- data.frame(
  "sent" = c(1, 1, 2, 2, 3, 3, 3, 4, 5),
  "word" = c(
    "good printer", "wireless easy", "just right size",
    "size perfect weight", "worth price", "website great tablet",
    "pan nice tablet", "great price", "product easy install"
  ),
  val = c(1, 2, 3, 4, 5, 6, 7, 8, 9)
)
nouns <- c("printer", "wireless", "weight", "price", "tablet")

# Keep the rows of `df` whose `word` phrase contains at least one of `nouns`
# as a whole space-separated token, and store that noun in an `extract` column.
#
# df:    data frame with a `word` column of phrases.
# nouns: character vector of nouns to look for.
#
# Returns `df` restricted to the matching rows (original row names are kept)
# with an added character column `extract`. When a phrase contains several
# nouns, the one that comes first in `nouns` is used.
extract_nouns <- function(df, nouns) {
  tokens <- strsplit(as.character(df$word), " ", fixed = TRUE)
  matched <- lapply(tokens, function(tok) intersect(nouns, tok))
  has_match <- lengths(matched) > 0L

  # Select with a logical mask rather than negative indices: negating an
  # empty index vector selects zero rows, which would discard every row
  # exactly when all of them match.
  kept <- df[has_match, , drop = FALSE]
  kept$extract <- vapply(
    matched[has_match],
    function(found) found[[1L]],
    character(1),
    USE.NAMES = FALSE
  )
  kept
}

dataFrame <- extract_nouns(dataFrame, nouns)
#   sent                 word val  extract
# 1    1         good printer   1  printer
# 2    1        wireless easy   2 wireless
# 4    2  size perfect weight   4   weight
# 5    3          worth price   5    price
# 6    3 website great tablet   6   tablet
# 7    3      pan nice tablet   7   tablet
# 8    4          great price   8    price
# Label every phrase in dataFrame$word with the noun it contains, or with
# "remove" when none of the nouns occurs in it.
word <- dataFrame$word

# Start from the "no match" marker and overwrite it wherever a noun is found.
extract <- rep("remove", length(word))

# grepl() is vectorised over the phrases, so one pass per noun is enough.
# Each noun is used as a regular expression and matches as a substring.
# Nouns are applied in order, so when a phrase contains several of them the
# last one in `nouns` wins. Iterating over `nouns` directly (instead of
# 1:length(...)) also keeps empty inputs from failing.
for (noun in nouns) {
  extract[grepl(noun, as.character(word))] <- noun
}

dataFrame$extract <- extract
# sent word val extract
#1 1 good printer 1 printer
#2 1 wireless easy 2 wireless
#3 2 just right size 3 remove
#4 2 size perfect weight 4 weight
#5 3 worth price 5 price
#6 3 website great tablet 6 tablet
#7 3 pan nice tablet 7 tablet
#8 4 great price 8 price
#9 5 product easy install 9 remove