R-将子串分成三元组并执行计算
我之前问过一个类似的问题,但问题中的细节现在已经改变,需要不同的解决方案,因此重新发布。我附上了一张图片,说明了我的起始数据集和我想要得到的最终结果。我需要一个只使用 base R 的解决方案,因为我将要使用的平台不能使用其他软件包。原始数据集有多个列。对于其中一些列,即L1、L2、L3,我想 1) 根据该列中所有字符串的最大长度动态生成相应数量的新列,例如L1的最大长度为6,因此生成6个新列,分别标记为“L1_1”到“L1_6”; 2) 将原始字符串拆分为子字符串:从左侧起每个位置各取3个字符,因此倒数第二列只包含2个字符,最后一列只包含1个字符(与原问题不同),例如“AABBCC”拆分为“AAB”、“ABB”、“BBC”、“BCC”、“CC”、“C”; 3) 对这些子字符串执行计算,即:('A'的数量*1)+('B'的数量*3)+('C'的数量*7),并在新列中返回此计算的值。有人知道怎么做吗?提前谢谢。 R-将子串分成三元组并执行计算,r,R,我在上问了一个类似的问题,但我问题中的细节现在已经改变,因此我重新发布,因为我需要一个不同的解决方案 我附上了一张图片,说明了我的开始数据集和我试图实现的终点。我需要一个R解决方案,它使用base R作为我将使用的平台,但不能使用其他软件包 原始数据集有多个列。对于一些列,即L1、L2、L3,我想 1) 根据列中任何字符串的最大长度生成动态列数,例如L1 max length=6,因此有6个新列,每个列都标记为“L1_1”到“L1_6” 2) 将原始字符串分隔为子字符串,每个子字符串包含从左侧开
dput(original_data):
structure(list(ID = 1:5, L1 = structure(c(3L, 2L, 4L, 1L, 5L), .Label = c("", "AAAAAA", "AABBCC", "BBACB", "BCBDAB"), class = "factor"), L2 = structure(c(3L,
4L, 3L, 1L, 2L), .Label = c("", "ACAA", "BACA", "BACBA"), class = "factor"), L3 = structure(c(1L, 3L, 2L, 1L, 4L), .Label = c("", "CABAC", "CACCC", "CBABA"), class = "factor")), .Names = c("ID", "L1",
"L2", "L3"), class = "data.frame", row.names = c(NA, -5L))
dput(interim_data):
structure(list(ID = 1:5, L1 = structure(c(3L, 2L, 4L, 1L, 5L), .Label = c("",
"AAAAAA", "AABBCC", "BBACB", "BCBDAB"), class = "factor"), L2 = structure(c(3L,
4L, 3L, 1L, 2L), .Label = c("", "ACAA", "BACA", "BACBA"), class = "factor"),
L3 = structure(c(1L, 3L, 2L, 1L, 4L), .Label = c("", "CABAC",
"CACCC", "CBABA"), class = "factor"), L1_1 = structure(c(3L,
2L, 4L, 1L, 5L), .Label = c("", "AAA", "AAB", "BBA", "BCB"
), class = "factor"), L1_2 = structure(c(3L, 2L, 4L, 1L,
5L), .Label = c("", "AAA", "ABB", "BAC", "CBD"), class = "factor"),
L1_3 = structure(c(4L, 2L, 3L, 1L, 5L), .Label = c("", "AAA",
"ACB", "BBC", "BDA"), class = "factor"), L1_4 = structure(c(3L,
2L, 4L, 1L, 5L), .Label = c("", "AAA", "BCC", "CB", "DAB"
), class = "factor"), L1_5 = structure(c(5L, 2L, 4L, 1L,
3L), .Label = c("", "AA", "AB", "B", "CC"), class = "factor"),
L1_6 = structure(c(4L, 2L, 1L, 1L, 3L), .Label = c("", "A",
"B", "C"), class = "factor"), L2_1 = structure(c(3L, 3L,
3L, 1L, 2L), .Label = c("", "ACA", "BAC"), class = "factor"),
L2_2 = structure(c(2L, 3L, 2L, 1L, 4L), .Label = c("", "ACA",
"ACB", "CAA"), class = "factor"), L2_3 = structure(c(3L,
4L, 3L, 1L, 2L), .Label = c("", "AA", "AC", "CBA"), class = "factor"),
L2_4 = structure(c(2L, 3L, 2L, 1L, 2L), .Label = c("", "A",
"BA"), class = "factor"), L2_5 = structure(c(1L, 2L, 1L,
1L, 1L), .Label = c("", "A"), class = "factor"), L3_1 = structure(c(1L,
3L, 2L, 1L, 4L), .Label = c("", "CAB", "CAC", "CBA"), class = "factor"),
L3_2 = structure(c(1L, 3L, 2L, 1L, 4L), .Label = c("", "ABA",
"ACC", "BAB"), class = "factor"), L3_3 = structure(c(1L,
4L, 3L, 1L, 2L), .Label = c("", "ABA", "BAC", "CCC"), class = "factor"),
L3_4 = structure(c(1L, 4L, 2L, 1L, 3L), .Label = c("", "AC",
"BA", "CC"), class = "factor"), L3_5 = structure(c(1L, 3L,
3L, 1L, 2L), .Label = c("", "A", "C"), class = "factor")), .Names = c("ID",
"L1", "L2", "L3", "L1_1", "L1_2", "L1_3", "L1_4", "L1_5", "L1_6",
"L2_1", "L2_2", "L2_3", "L2_4", "L2_5", "L3_1", "L3_2", "L3_3",
"L3_4", "L3_5"), class = "data.frame", row.names = c(NA, -5L))
编辑:
@onyanbu提供的代码
# Convert every column of df to character. Assuming df is a data frame,
# sapply() simplifies the result to a character matrix.
interim <- sapply(df, as.character)
# The closing parenthesis was missing here, which made the whole script
# unparseable (the parser only complained on the following line).
# NOTE(review): if interim is a character matrix, the numeric values assigned
# below are coerced straight back to character; build a data frame first if
# the first column really has to be numeric.
interim[, 1] <- as.numeric(interim[, 1])
# Score one value by its letters:
#   (count of "A") * 1 + (count of "B") * 3 + (count of "C") * 7
# Any other character contributes nothing.
#
# u: a single string (a factor is converted to character first). Numeric
#    input is returned unchanged so ID-like values pass straight through.
# Returns the numeric score, or NA when u contains no characters at all.
funfun <- function(u) {
  if (is.numeric(u)) {
    return(u)
  }
  if (is.factor(u)) {
    u <- as.character(u)
  }
  chars <- unlist(strsplit(u, ""))
  if (length(chars) == 0L) {
    return(NA)
  }
  # Count by literal comparison instead of handing each character to
  # gregexpr() as a pattern: a regex search raises an error for characters
  # such as "(" or "[".
  # na.rm = TRUE so a missing (NA) string scores 0 instead of propagating NA.
  weights <- c(A = 1, B = 3, C = 7)
  counts <- vapply(
    names(weights),
    function(letter) sum(chars == letter, na.rm = TRUE),
    numeric(1)
  )
  sum(counts * weights)
}
# Apply funfun() to each element of x and collect the results.
# mapply() simplifies the result to a vector where possible and names it
# after x when x is a character vector or carries names.
ADD_char <- function(x) {
  mapply(FUN = funfun, x)
}
# Apply ADD_char over interim; the result is only printed, not stored.
sapply(interim, ADD_char)
# Keep the first four columns as they are and append the scores computed
# from all remaining columns.
dat1 <- cbind(
  interim[, 1:4],
  sapply(interim[, -(1:4)], ADD_char)
)
interim=sapply(df,as.character)
interim[,1]=as.numeric(interim[,1])
funfun = function(u){
if(is.numeric(u)) return(u)
s = unique(unlist(strsplit(u,"")))
w = sapply(s,function(x)length(unlist(gregexpr(x,u))))
ifelse(length(s)>0,sum(w["A"]*1,w["B"]* 3,w["C"]*7,na.rm = T),NA)
}
ADD_char=function(x) mapply(funfun,x)
sapply(interim,ADD_char)
dat1 <- cbind(interim[,1:4],sapply(interim[,-(1:4)],ADD_char))
首先,在尝试此代码之前,请确保检查各列的类:sapply(interim, class)。
如果上面的代码给出的是 "factor" 而不是 "character",那么您必须先把数据框中的因子(factor)列转换为字符(character)列。
只需通过 interim = data.frame(t(t(interim)), stringsAsFactors = FALSE)
或 interim = sapply(interim, as.character)
就可以完成这项转换。然后将 ID 列改为数值型(numeric),
即 interim[,1] = as.numeric(interim[,1])。
确保数据以字符形式存储之后,可以运行以下代码:
# Score one value by its letters:
#   (count of "A") * 1 + (count of "B") * 3 + (count of "C") * 7
# Any other character contributes nothing.
#
# u: a single string (a factor is converted to character first). Numeric
#    input is returned unchanged so ID-like values pass straight through.
# Returns the numeric score, or NA when u contains no characters at all.
funfun <- function(u) {
  if (is.numeric(u)) {
    return(u)
  }
  if (is.factor(u)) {
    u <- as.character(u)
  }
  chars <- unlist(strsplit(u, ""))
  if (length(chars) == 0L) {
    return(NA)
  }
  # Count by literal comparison instead of handing each character to
  # gregexpr() as a pattern: a regex search raises an error for characters
  # such as "(" or "[".
  # na.rm = TRUE so a missing (NA) string scores 0 instead of propagating NA.
  weights <- c(A = 1, B = 3, C = 7)
  counts <- vapply(
    names(weights),
    function(letter) sum(chars == letter, na.rm = TRUE),
    numeric(1)
  )
  sum(counts * weights)
}
# Apply funfun() to each element of x and collect the results.
# mapply() simplifies the result to a vector where possible and names it
# after x when x is a character vector or carries names.
ADD_char <- function(x) {
  mapply(FUN = funfun, x)
}
# R names are case-sensitive; the rest of this example refers to the data
# as `interim` (lower case), so `Interim` would not be found.
sapply(interim, ADD_char)
# Score one value by its letters:
#   (count of "A") * 1 + (count of "B") * 3 + (count of "C") * 7
# Any other character contributes nothing.
#
# u: a single string (a factor is converted to character first). Numeric
#    input is returned unchanged so ID-like values pass straight through.
# Returns the numeric score, or NA when u contains no characters at all.
funfun <- function(u) {
  if (is.numeric(u)) {
    return(u)
  }
  if (is.factor(u)) {
    u <- as.character(u)
  }
  chars <- unlist(strsplit(u, ""))
  if (length(chars) == 0L) {
    return(NA)
  }
  # Count by literal comparison instead of handing each character to
  # gregexpr() as a pattern: a regex search raises an error for characters
  # such as "(" or "[".
  # na.rm = TRUE so a missing (NA) string scores 0 instead of propagating NA.
  weights <- c(A = 1, B = 3, C = 7)
  counts <- vapply(
    names(weights),
    function(letter) sum(chars == letter, na.rm = TRUE),
    numeric(1)
  )
  sum(counts * weights)
}
# Apply funfun() to each element of x and collect the results.
# mapply() simplifies the result to a vector where possible and names it
# after x when x is a character vector or carries names.
ADD_char <- function(x) {
  mapply(FUN = funfun, x)
}
# Split one string into overlapping windows that start at every character
# position and are cut off at the end of the string, e.g. with width 3
# "AABBCC" -> "AAB" "ABB" "BBC" "BCC" "CC" "C".
#
# u:     a single value; factors are converted to character first.
# width: maximum number of characters per window (default 3).
# Returns a character vector with one window per character of u. Values that
# are NA or shorter than two characters are returned unchanged.
funfun1 <- function(u, width = 3L) {
  stopifnot(length(u) == 1L, width >= 1L)
  if (is.factor(u)) {
    u <- as.character(u)
  }
  # is.na() is tested first so a missing value never reaches the comparison
  # (an NA condition inside if() is an error).
  if (is.na(u) || nchar(u) < 2L) {
    return(u)
  }
  text <- as.character(u)
  n <- nchar(text)
  starts <- seq_len(n)
  # Each window ends width - 1 characters after its start, capped at the
  # last character of the string.
  substring(text, starts, pmin(starts + width - 1L, n))
}
# Expand every column of `data` into its funfun1() substrings, one output
# column per substring position.
#
# data: a data frame or matrix; every column is treated as text.
# Returns a data frame of character columns. A source column that yields at
# most one substring keeps its name; otherwise its output columns are named
# <column><position> (e.g. L11, L12, ...). Missing positions hold "".
#
# Each source column gets its own fixed-width group of output columns, sized
# by the longest string in that column. Padding is done per column rather
# than once at the end of the row, so a short string in one column can no
# longer shift the substrings of the following columns to the left.
funfun2 <- function(data) {
  data <- as.data.frame(data, stringsAsFactors = FALSE)
  n_rows <- nrow(data)

  blocks <- lapply(names(data), function(col) {
    values <- as.character(data[[col]])
    values[is.na(values)] <- ""
    parts <- lapply(values, function(v) as.character(funfun1(v)))
    # max() with a leading 1L also covers a data frame with zero rows.
    width <- max(1L, lengths(parts))
    cells <- vapply(
      parts,
      function(p) c(p, rep("", width - length(p))),
      character(width)
    )
    # vapply() yields the substrings of one row per matrix column (or a plain
    # vector when width is 1); refill row by row into n_rows x width.
    block <- matrix(cells, nrow = n_rows, ncol = width, byrow = TRUE)
    colnames(block) <- if (width == 1L) col else paste0(col, seq_len(width))
    block
  })

  as.data.frame(do.call(cbind, blocks), stringsAsFactors = FALSE)
}
# Split the first four columns of interim into their substring columns.
k <- funfun2(interim[, 1:4])
# Show the class of every resulting column.
mapply(class, k)
# Turn the first column into numbers.
k[, 1] <- as.numeric(k[, 1])
# Score every column.
sapply(k, ADD_char)
ID L11 L12 L13 L14 L15 L16 L21 L22 L23 L24 L25 L31 L32 L33 L34 L35
[1,] 1 5 7 13 17 14 7 11 9 8 1 NA NA NA NA NA NA
[2,] 2 3 3 3 3 2 1 11 11 11 4 1 15 15 21 14 7
[3,] 3 7 11 11 10 3 11 9 8 1 11 5 11 8 7 NA NA
[4,] 4 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[5,] 5 13 10 4 4 4 3 9 9 2 1 11 7 5 4 1 NA
首先,在尝试此代码之前,请确保检查各列的类:sapply(interim, class)。
如果上面的代码给出的是 "factor" 而不是 "character",那么您必须先把数据框中的因子(factor)列转换为字符(character)列。
只需通过 interim = data.frame(t(t(interim)), stringsAsFactors = FALSE)
或 interim = sapply(interim, as.character)
就可以完成这项转换。然后将 ID 列改为数值型(numeric),
即 interim[,1] = as.numeric(interim[,1])。
确保数据以字符形式存储之后,可以运行以下代码:
# Score one value by its letters:
#   (count of "A") * 1 + (count of "B") * 3 + (count of "C") * 7
# Any other character contributes nothing.
#
# u: a single string (a factor is converted to character first). Numeric
#    input is returned unchanged so ID-like values pass straight through.
# Returns the numeric score, or NA when u contains no characters at all.
funfun <- function(u) {
  if (is.numeric(u)) {
    return(u)
  }
  if (is.factor(u)) {
    u <- as.character(u)
  }
  chars <- unlist(strsplit(u, ""))
  if (length(chars) == 0L) {
    return(NA)
  }
  # Count by literal comparison instead of handing each character to
  # gregexpr() as a pattern: a regex search raises an error for characters
  # such as "(" or "[".
  # na.rm = TRUE so a missing (NA) string scores 0 instead of propagating NA.
  weights <- c(A = 1, B = 3, C = 7)
  counts <- vapply(
    names(weights),
    function(letter) sum(chars == letter, na.rm = TRUE),
    numeric(1)
  )
  sum(counts * weights)
}
# Apply funfun() to each element of x and collect the results.
# mapply() simplifies the result to a vector where possible and names it
# after x when x is a character vector or carries names.
ADD_char <- function(x) {
  mapply(FUN = funfun, x)
}
# R names are case-sensitive; the rest of this example refers to the data
# as `interim` (lower case), so `Interim` would not be found.
sapply(interim, ADD_char)
# Score one value by its letters:
#   (count of "A") * 1 + (count of "B") * 3 + (count of "C") * 7
# Any other character contributes nothing.
#
# u: a single string (a factor is converted to character first). Numeric
#    input is returned unchanged so ID-like values pass straight through.
# Returns the numeric score, or NA when u contains no characters at all.
funfun <- function(u) {
  if (is.numeric(u)) {
    return(u)
  }
  if (is.factor(u)) {
    u <- as.character(u)
  }
  chars <- unlist(strsplit(u, ""))
  if (length(chars) == 0L) {
    return(NA)
  }
  # Count by literal comparison instead of handing each character to
  # gregexpr() as a pattern: a regex search raises an error for characters
  # such as "(" or "[".
  # na.rm = TRUE so a missing (NA) string scores 0 instead of propagating NA.
  weights <- c(A = 1, B = 3, C = 7)
  counts <- vapply(
    names(weights),
    function(letter) sum(chars == letter, na.rm = TRUE),
    numeric(1)
  )
  sum(counts * weights)
}
# Apply funfun() to each element of x and collect the results.
# mapply() simplifies the result to a vector where possible and names it
# after x when x is a character vector or carries names.
ADD_char <- function(x) {
  mapply(FUN = funfun, x)
}
# Split one string into overlapping windows that start at every character
# position and are cut off at the end of the string, e.g. with width 3
# "AABBCC" -> "AAB" "ABB" "BBC" "BCC" "CC" "C".
#
# u:     a single value; factors are converted to character first.
# width: maximum number of characters per window (default 3).
# Returns a character vector with one window per character of u. Values that
# are NA or shorter than two characters are returned unchanged.
funfun1 <- function(u, width = 3L) {
  stopifnot(length(u) == 1L, width >= 1L)
  if (is.factor(u)) {
    u <- as.character(u)
  }
  # is.na() is tested first so a missing value never reaches the comparison
  # (an NA condition inside if() is an error).
  if (is.na(u) || nchar(u) < 2L) {
    return(u)
  }
  text <- as.character(u)
  n <- nchar(text)
  starts <- seq_len(n)
  # Each window ends width - 1 characters after its start, capped at the
  # last character of the string.
  substring(text, starts, pmin(starts + width - 1L, n))
}
# Expand every column of `data` into its funfun1() substrings, one output
# column per substring position.
#
# data: a data frame or matrix; every column is treated as text.
# Returns a data frame of character columns. A source column that yields at
# most one substring keeps its name; otherwise its output columns are named
# <column><position> (e.g. L11, L12, ...). Missing positions hold "".
#
# Each source column gets its own fixed-width group of output columns, sized
# by the longest string in that column. Padding is done per column rather
# than once at the end of the row, so a short string in one column can no
# longer shift the substrings of the following columns to the left.
funfun2 <- function(data) {
  data <- as.data.frame(data, stringsAsFactors = FALSE)
  n_rows <- nrow(data)

  blocks <- lapply(names(data), function(col) {
    values <- as.character(data[[col]])
    values[is.na(values)] <- ""
    parts <- lapply(values, function(v) as.character(funfun1(v)))
    # max() with a leading 1L also covers a data frame with zero rows.
    width <- max(1L, lengths(parts))
    cells <- vapply(
      parts,
      function(p) c(p, rep("", width - length(p))),
      character(width)
    )
    # vapply() yields the substrings of one row per matrix column (or a plain
    # vector when width is 1); refill row by row into n_rows x width.
    block <- matrix(cells, nrow = n_rows, ncol = width, byrow = TRUE)
    colnames(block) <- if (width == 1L) col else paste0(col, seq_len(width))
    block
  })

  as.data.frame(do.call(cbind, blocks), stringsAsFactors = FALSE)
}
# Split the first four columns of interim into their substring columns.
k <- funfun2(interim[, 1:4])
# Show the class of every resulting column.
mapply(class, k)
# Turn the first column into numbers.
k[, 1] <- as.numeric(k[, 1])
# Score every column.
sapply(k, ADD_char)
ID L11 L12 L13 L14 L15 L16 L21 L22 L23 L24 L25 L31 L32 L33 L34 L35
[1,] 1 5 7 13 17 14 7 11 9 8 1 NA NA NA NA NA NA
[2,] 2 3 3 3 3 2 1 11 11 11 4 1 15 15 21 14 7
[3,] 3 7 11 11 10 3 11 9 8 1 11 5 11 8 7 NA NA
[4,] 4 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[5,] 5 13 10 4 4 4 3 9 9 2 1 11 7 5 4 1 NA
funfun = function(u){
if(is.numeric(u)) return(u)
s = unique(unlist(strsplit(u,"")))
w = sapply(s,function(x)length(unlist(gregexpr(x,u))))
ifelse(length(s)>0,sum(w["A"]*1,w["B"]* 3,w["C"]*7,na.rm = T),NA)
}
ADD_char=function(x) mapply(funfun,x)
funfun1=function(u){
if(nchar(u)<2) return(u)
a = unlist(strsplit(u,""))
m = length(a)
if(m>0)
sapply(1:m, function(i)paste0(a[i:(ifelse(i+2<m,i+2,m))],collapse = ""))
}
只发布数据的图片真的没有什么帮助,请参阅如何分享可复现的示例。 抱歉,我现在已经附上了 dput(数据)。 你真的需要中间(interim)数据,还是只需要最后的结果列? 谢谢,只需要最后的结果列,interim 数据表只是为了展示字符串应该如何拆分。谢谢! 我尝试了这个建议并得到了这个错误:“解析错误(parse(text = script)):文本参数中的解析错误:funfun 处的解析错误(第3行,字符1至6)”。我已将代码放入问题的编辑部分,并贴出了我实际得到的结果。 使用上述函数时,我真的不知道为什么会出现错误,但似乎在某个地方存在问题。interim 数据是您的全部数据吗?让我检查一下,看看是否可以提供帮助。谢谢。 谢谢,原始数据是我的全部数据,interim 数据只是展示它们应该如何被拆分成子字符串,我已经在图片中展示了这一点。 您可以尝试运行 ADD_char(interim[,2])
你得到了什么?哦,好吧,那么你的数据还没有被拆分成子字符串?根据你上面给出的 interim
数据,没有必要再对它进行拆分,因为它已经拆分好了。我原以为你已经完成了拆分。让我再试试,并在函数中包含拆分这一步。