R 在数据框中将分隔字符串拆分为不同的列
我需要一种快速而简洁的方法将数据框架中的字符串文本拆分为一组列。假设我有这个数据框R 在数据框中将分隔字符串拆分为不同的列,r,substring,tokenize,tm,R,Substring,Tokenize,Tm,我需要一种快速而简洁的方法将数据框架中的字符串文本拆分为一组列。假设我有这个数据框 data <- data.frame(id=c(1,2,3), tok1=c("a, b, c", "a, a, d", "b, d, e"), tok2=c("alpha|bravo", "alpha|charlie", "tango|tango|delta") ) 我尝试使用以下语法: tok1.f = factor(data$tok1) dummies <- model.matrix(~tok
# Example input: a numeric id plus two columns of delimited tokens
# (tok1 uses ", " as separator, tok2 uses "|").
data <- data.frame(
  id = c(1, 2, 3),
  tok1 = c("a, b, c", "a, a, d", "b, d, e"),
  tok2 = c("alpha|bravo", "alpha|charlie", "tango|tango|delta")
)
我尝试使用以下语法:
# First attempt: turn the tok1 column into a factor and expand it into a
# design matrix. Each level is a complete tok1 string (e.g. "a, b, c"), so
# the dummy columns describe whole strings rather than individual tokens.
tok1.f <- factor(data$tok1)
dummies <- model.matrix(~ tok1.f)
tok1.f = factor(data$tok1)
dummies <- model.matrix(~tok1.f)
如果您不介意(暂时)使用 data.table,这可能适合您:
library(data.table)
# Sample data: an id column and two token columns with different delimiters.
data <- data.frame(
  id   = c(1, 2, 3),
  tok1 = c("a, b, c", "a, a, d", "b, d, e"),
  tok2 = c("alpha|bravo", "alpha|charlie", "tango|tango|delta")
)
#' Count the tokens of one delimited string column, one output column per token.
#'
#' Every run of non-alphanumeric characters is treated as a delimiter.
#'
#' @param col_name Name (string) of the column in `data` to split.
#' @param data A data.frame-like object with an `id` column and `col_name`.
#' @return A data.table with the `id` column followed by one numeric count
#'   column per distinct token. Columns are ordered by first appearance,
#'   row by row, and alphabetically within a row. There is exactly one
#'   output row per input row.
splitCols <- function(col_name, data) {
  # strsplit needs strings; `[[` extracts the column as a vector for plain
  # data.frames, tibbles and data.tables alike
  values <- as.character(data[[col_name]])

  # split every row into tokens; drop NA and the empty token that a leading
  # delimiter produces, so they never become columns
  token_rows <- lapply(strsplit(values, "[^[:alnum:]]+"), function(x) {
    x[!is.na(x) & nzchar(x)]
  })

  # distinct tokens in the order in which binding the per-row tabulations
  # would have introduced them
  all_tokens <- unique(unlist(lapply(token_rows, function(x) sort(unique(x)))))

  # no tokens anywhere: only the id column is left
  if (length(all_tokens) == 0L) {
    return(data.table(id = data$id))
  }

  # count every token in every row; rows without a given token (including
  # rows without any token at all) get 0, so rows stay aligned with `id`
  counts <- lapply(all_tokens, function(tok) {
    vapply(token_rows, function(x) as.numeric(sum(x == tok)), numeric(1))
  })
  names(counts) <- all_tokens
  rbl <- as.data.table(counts)

  # add the "id" column
  cbind(id = data$id, rbl)
}
# Apply splitCols() to every column except the first one (the id column);
# the result is a list with one count table per token column.
lapply(names(data)[-1], function(col_name) splitCols(col_name, data))
## [[1]]
## id a b c d e
## 1: 1 1 1 1 0 0
## 2: 2 2 0 0 1 0
## 3: 3 0 1 0 1 1
##
## [[2]]
## id alpha bravo charlie delta tango
## 1: 1 1 1 0 0 0
## 2: 2 1 0 1 0 0
## 3: 3 0 0 0 1 2
library(data.table)
您可以使用 stringr 包,如下所示:
require(stringr)
# Sample data: tok1 is ", "-separated, tok2 is "|"-separated.
test_data <- data.frame(
  id = c(1, 2, 3),
  tok1 = c("a, b, c", "a, a, d", "b, d, e"),
  tok2 = c("alpha|bravo", "alpha|charlie", "tango|tango|delta")
)

# Convert both token columns to character and normalise them to a plain ","
# delimiter: strip the blanks from tok1, replace "|" by "," in tok2.
test_data$tok1 <- gsub(" ", "", as.character(test_data$tok1), fixed = TRUE)
test_data$tok2 <- gsub("|", ",", as.character(test_data$tok2), fixed = TRUE)

# Sorted vector of the distinct tokens found in each column.
tok1.uniq <- sort(unique(unlist(strsplit(test_data$tok1, ",", fixed = TRUE))))
tok2.uniq <- sort(unique(unlist(strsplit(test_data$tok2, ",", fixed = TRUE))))
# Token count for each column.
# For every row, split the normalised string on "," and count the tokens that
# are exactly equal to each unique token. Exact comparison is used instead of
# str_count(), which matches patterns inside the string and would therefore
# also count a token such as "a" inside "alpha".
# Second column:
tok2.occurances <- local({
  # one character vector of tokens per row of test_data
  token_rows <- str_split(test_data$tok2, fixed(","))

  # one integer count vector (one entry per row) for every unique token
  counts <- lapply(tok2.uniq, function(tok) {
    vapply(token_rows, function(tokens) sum(tokens == tok), integer(1))
  })
  names(counts) <- tok2.uniq

  # take the ids from the data itself rather than from the row names, so the
  # result stays correct when the ids are not simply 1..n
  data.frame(id = test_data$id, counts, stringsAsFactors = FALSE)
})
# > tok2.occurances
# id alpha bravo charlie delta tango
# 1 1 1 0 0 0
# 2 1 0 1 0 0
# 3 0 0 0 1 2
我能想到的最简单的方法是将 cSplit 与 dcast.data.table 结合使用,如下所示:
library(splitstackshape)
# tok1: cSplit() with direction "long" puts every ", "-separated token on its
# own row; dcast.data.table() then spreads the tokens into columns and uses
# length() to count how often each token occurs per id.
dcast.data.table(
  data = cSplit(data, splitCols = "tok1", sep = ", ", direction = "long"),
  formula = id ~ tok1,
  fun.aggregate = length,
  value.var = "tok1"
)
#    id a b c d e
# 1:  1 1 1 1 0 0
# 2:  2 2 0 0 1 0
# 3:  3 0 1 0 1 1

# tok2: same reshaping, with "|" as the separator.
dcast.data.table(
  data = cSplit(data, splitCols = "tok2", sep = "|", direction = "long"),
  formula = id ~ tok2,
  fun.aggregate = length,
  value.var = "tok2"
)
#    id alpha bravo charlie delta tango
# 1:  1     1     1       0     0     0
# 2:  2     1     0       1     0     0
# 3:  3     0     0       0     1     2
编辑:已更新为使用 library(splitstackshape),因为 cSplit 现在是该软件包的一部分。
另请参阅(我认为这是一个恰当的重复问题,可以作为关闭本问题的理由)。
实际上,我已经投票重新打开这个问题。虽然它们非常相似,但并非完全相同。不过,我建议你在问题中说明你已经尝试过的方法——即使没有别的作用,也能赢得一些善意。目前,你并没有编码错误或具体问题,只是有一个任务需要别人替你解决。
我没有编码错误,因为我不知道该为这个任务编写什么样的代码。不过,我确实用 tm 软件包做过一些测试:基本上,我用这个软件包根据由不同术语(alpha、bravo、charlie、a、b……)组成的词典来构建文档-术语矩阵。
已添加第一次(不成功的)尝试。
data.table 的优势在哪里?似乎大部分操作仍然停留在 data.frame 操作层面……
rbindlist 完成了填充部分。我正尝试不再使用 plyr,而 rbindlist 填补了这一空白。
这非常有效,简直像魔法一样,非常感谢!
# Count, for every row of test_data, how often each unique tok2 token occurs.
# Tokens are compared for exact equality after splitting on ","; str_count()
# matches patterns inside the string and would also count a token such as
# "a" inside "alpha".
tok2.occurances <- local({
  # one character vector of tokens per row of test_data
  token_rows <- str_split(test_data$tok2, fixed(","))

  # one integer count vector (one entry per row) for every unique token
  counts <- lapply(tok2.uniq, function(tok) {
    vapply(token_rows, function(tokens) sum(tokens == tok), integer(1))
  })
  names(counts) <- tok2.uniq

  # take the ids from the data itself rather than from the row names, so the
  # result stays correct when the ids are not simply 1..n
  data.frame(id = test_data$id, counts, stringsAsFactors = FALSE)
})
# > tok2.occurances
# id alpha bravo charlie delta tango
# 1 1 1 0 0 0
# 2 1 0 1 0 0
# 3 0 0 0 1 2
library(splitstackshape)

# tok1: stack the ", "-separated tokens into long form with cSplit(), then
# cast back to wide form, counting the occurrences of each token per id.
dcast.data.table(
  data = cSplit(data, splitCols = "tok1", sep = ", ", direction = "long"),
  formula = id ~ tok1,
  fun.aggregate = length,
  value.var = "tok1"
)
#    id a b c d e
# 1:  1 1 1 1 0 0
# 2:  2 2 0 0 1 0
# 3:  3 0 1 0 1 1

# tok2: identical procedure with "|" as the separator.
dcast.data.table(
  data = cSplit(data, splitCols = "tok2", sep = "|", direction = "long"),
  formula = id ~ tok2,
  fun.aggregate = length,
  value.var = "tok2"
)
#    id alpha bravo charlie delta tango
# 1:  1     1     1       0     0     0
# 2:  2     1     0       1     0     0
# 3:  3     0     0       0     1     2