R-将子串分成三元组并执行计算

R-将子串分成三元组并执行计算,r,R,我在上问了一个类似的问题,但我问题中的细节现在已经改变,因此我重新发布,因为我需要一个不同的解决方案 我附上了一张图片,说明了我的开始数据集和我试图实现的终点。我需要一个R解决方案,它使用base R作为我将使用的平台,但不能使用其他软件包 原始数据集有多个列。对于一些列,即L1、L2、L3,我想 1) 根据列中任何字符串的最大长度生成动态列数,例如L1 max length=6,因此有6个新列,每个列都标记为“L1_1”到“L1_6” 2) 将原始字符串分隔为子字符串,每个子字符串包含从左侧开

我之前问过一个类似的问题,但问题中的细节现在已经改变,需要一个不同的解决方案,因此重新发布

我附上了一张图片,说明了我的开始数据集和我试图实现的终点。我需要一个R解决方案,它使用base R作为我将使用的平台,但不能使用其他软件包

原始数据集有多个列。对于一些列,即L1、L2、L3,我想

1) 根据列中任何字符串的最大长度生成动态列数,例如L1 max length=6,因此有6个新列,每个列都标记为“L1_1”到“L1_6”

2) 将原始字符串拆分为子字符串:第 i 个子字符串由原字符串中从第 i 个字符开始的 3 个字符组成(例如 "AABBCC" 拆分为 "AAB"、"ABB"、"BBC"、"BCC"、"CC"、"C")。因此倒数第二列只包含 2 个字符,最后一列只包含 1 个字符。(与原问题不同)

3) 对这些子字符串执行计算,即:('A' 的数量 × 1)+('B' 的数量 × 3)+('C' 的数量 × 7),并在新列中返回此计算的值

有人知道怎么做吗

提前谢谢

dput(original_data):
    structure(list(ID = 1:5, L1 = structure(c(3L, 2L, 4L, 1L, 5L), .Label = c("",                                                                          "AAAAAA", "AABBCC", "BBACB", "BCBDAB"), class = "factor"), L2 = structure(c(3L,                                                                        
4L, 3L, 1L, 2L), .Label = c("", "ACAA", "BACA", "BACBA"), class = "factor"),                                                                           L3 = structure(c(1L, 3L, 2L, 1L, 4L), .Label = c("", "CABAC",                                                                                     "CACCC", "CBABA"), class = "factor")), .Names = c("ID", "L1",                                                                                      
"L2", "L3"), class = "data.frame", row.names = c(NA, -5L))   

dput(interim_data):
structure(list(ID = 1:5, L1 = structure(c(3L, 2L, 4L, 1L, 5L), .Label = c("",                                                                          
"AAAAAA", "AABBCC", "BBACB", "BCBDAB"), class = "factor"), L2 = structure(c(3L,                                                                        
4L, 3L, 1L, 2L), .Label = c("", "ACAA", "BACA", "BACBA"), class = "factor"),                                                                           
    L3 = structure(c(1L, 3L, 2L, 1L, 4L), .Label = c("", "CABAC",                                                                                      
    "CACCC", "CBABA"), class = "factor"), L1_1 = structure(c(3L,                                                                                       
    2L, 4L, 1L, 5L), .Label = c("", "AAA", "AAB", "BBA", "BCB"                                                                                         
    ), class = "factor"), L1_2 = structure(c(3L, 2L, 4L, 1L,                                                                                           
    5L), .Label = c("", "AAA", "ABB", "BAC", "CBD"), class = "factor"),                                                                                
    L1_3 = structure(c(4L, 2L, 3L, 1L, 5L), .Label = c("", "AAA",                                                                                      
    "ACB", "BBC", "BDA"), class = "factor"), L1_4 = structure(c(3L,                                                                                    
    2L, 4L, 1L, 5L), .Label = c("", "AAA", "BCC", "CB", "DAB"                                                                                          
    ), class = "factor"), L1_5 = structure(c(5L, 2L, 4L, 1L,                                                                                           
    3L), .Label = c("", "AA", "AB", "B", "CC"), class = "factor"),                                                                                     
    L1_6 = structure(c(4L, 2L, 1L, 1L, 3L), .Label = c("", "A",                                                                                        
    "B", "C"), class = "factor"), L2_1 = structure(c(3L, 3L,                                                                                           
    3L, 1L, 2L), .Label = c("", "ACA", "BAC"), class = "factor"),                                                                                      
    L2_2 = structure(c(2L, 3L, 2L, 1L, 4L), .Label = c("", "ACA",                                                                                      
    "ACB", "CAA"), class = "factor"), L2_3 = structure(c(3L,                                                                                           
    4L, 3L, 1L, 2L), .Label = c("", "AA", "AC", "CBA"), class = "factor"),                                                                             
    L2_4 = structure(c(2L, 3L, 2L, 1L, 2L), .Label = c("", "A",                                                                                        
    "BA"), class = "factor"), L2_5 = structure(c(1L, 2L, 1L,                                                                                           
    1L, 1L), .Label = c("", "A"), class = "factor"), L3_1 = structure(c(1L,                                                                            
    3L, 2L, 1L, 4L), .Label = c("", "CAB", "CAC", "CBA"), class = "factor"),                                                                           
    L3_2 = structure(c(1L, 3L, 2L, 1L, 4L), .Label = c("", "ABA",                                                                                      
    "ACC", "BAB"), class = "factor"), L3_3 = structure(c(1L,                                                                                           
    4L, 3L, 1L, 2L), .Label = c("", "ABA", "BAC", "CCC"), class = "factor"),                                                                           
    L3_4 = structure(c(1L, 4L, 2L, 1L, 3L), .Label = c("", "AC",                                                                                       
    "BA", "CC"), class = "factor"), L3_5 = structure(c(1L, 3L,                                                                                         
    3L, 1L, 2L), .Label = c("", "A", "C"), class = "factor")), .Names = c("ID",                                                                        
"L1", "L2", "L3", "L1_1", "L1_2", "L1_3", "L1_4", "L1_5", "L1_6",                                                                                      
"L2_1", "L2_2", "L2_3", "L2_4", "L2_5", "L3_1", "L3_2", "L3_3",                                                                                        
"L3_4", "L3_5"), class = "data.frame", row.names = c(NA, -5L))  

编辑: @onyanbu提供的代码

# Convert every column of `df` to character.
# NOTE(review): assumes `df` is the data frame from dput(original_data); in that
# case sapply() returns a character matrix -- confirm.
interim <- sapply(df, as.character)
# The call must be closed with ")"; an unbalanced parenthesis here makes the
# parser fail on the next statement instead of on this line.
# NOTE(review): if `interim` is a character matrix, the numeric values are
# coerced back to character on assignment -- confirm this is intended.
interim[, 1] <- as.numeric(interim[, 1])
# Score one cell: (number of "A") * 1 + (number of "B") * 3 + (number of "C") * 7.
#
# @param u A single value. Numeric values are returned unchanged; anything
#   else is coerced to character (so factors are accepted as well).
# @return The numeric score, the untouched input when `u` is numeric, or NA
#   when `u` is an empty string or a missing value. Strings containing none of
#   A/B/C score 0.
funfun <- function(u) {
  if (is.numeric(u)) {
    return(u)
  }
  chars <- unlist(strsplit(as.character(u), ""))
  if (length(chars) == 0 || anyNA(chars)) {
    return(NA)
  }
  # Compare characters literally instead of using them as regex patterns, so
  # metacharacters such as "(" or "[" in the input cannot raise an error.
  sum(chars == "A") * 1 + sum(chars == "B") * 3 + sum(chars == "C") * 7
}

# Apply funfun() to every element of `x`; mapply() simplifies the result and
# names it after the input when `x` is a character vector.
ADD_char <- function(x) mapply(funfun, x)
# Score every element of `interim` (printed at top level).
sapply(X = interim, FUN = ADD_char)
# Keep the first four columns as they are and append the scores of the rest.
# NOTE(review): assumes `interim` has more than four columns -- confirm.
dat1 <- cbind(
  interim[, 1:4],
  sapply(X = interim[, -(1:4)], FUN = ADD_char)
)
interim=sapply(df, as.character)
interim[,1]=as.numeric(interim[,1])
funfun = function(u){
  if(is.numeric(u)) return(u)
  s = unique(unlist(strsplit(u,"")))
  w = sapply(s,function(x)length(unlist(gregexpr(x,u))))
  ifelse(length(s)>0,sum(w["A"]*1,w["B"]* 3,w["C"]*7,na.rm = T),NA)
}
ADD_char=function(x) mapply(funfun,x)
sapply(interim,ADD_char)

dat1 <- cbind(interim[,1:4],sapply(interim[,-(1:4)],ADD_char))

首先,在尝试此代码之前,请确保检查列的类:
sapply(interim,class)
如果上面的代码给出的是
"factor"
而不是
"character"
,那么您必须将数据框中的列改为
characters
而不是
factors
。只需通过
interim=data.frame(t(t(interim)),stringsAsFactors=F)
interim=sapply(interim,as.character)
就可以完成这项工作。然后将
ID
列更改为
numeric
,即
interim[,1]=as.numeric(interim[,1])
。确保数据为字符类型后,可以运行以下代码:

# Score one cell: (number of "A") * 1 + (number of "B") * 3 + (number of "C") * 7.
#
# @param u A single value. Numeric values are returned unchanged; anything
#   else is coerced to character (so factors are accepted as well).
# @return The numeric score, the untouched input when `u` is numeric, or NA
#   when `u` is an empty string or a missing value. Strings containing none of
#   A/B/C score 0.
funfun <- function(u) {
  if (is.numeric(u)) {
    return(u)
  }
  chars <- unlist(strsplit(as.character(u), ""))
  if (length(chars) == 0 || anyNA(chars)) {
    return(NA)
  }
  # Compare characters literally instead of using them as regex patterns, so
  # metacharacters such as "(" or "[" in the input cannot raise an error.
  sum(chars == "A") * 1 + sum(chars == "B") * 3 + sum(chars == "C") * 7
}

# Apply funfun() to every element of `x`; mapply() simplifies the result and
# names it after the input when `x` is a character vector.
ADD_char <- function(x) mapply(funfun, x)

 # Score every column of the data with ADD_char.
 # NOTE(review): R is case-sensitive and the other snippets name this object
 # `interim`; confirm that an object called `Interim` actually exists.
 sapply(Interim,ADD_char)
# Score one cell: (number of "A") * 1 + (number of "B") * 3 + (number of "C") * 7.
#
# @param u A single value. Numeric values are returned unchanged; anything
#   else is coerced to character (so factors are accepted as well).
# @return The numeric score, the untouched input when `u` is numeric, or NA
#   when `u` is an empty string or a missing value. Strings containing none of
#   A/B/C score 0.
funfun <- function(u) {
  if (is.numeric(u)) {
    return(u)
  }
  chars <- unlist(strsplit(as.character(u), ""))
  if (length(chars) == 0 || anyNA(chars)) {
    return(NA)
  }
  # Compare characters literally instead of using them as regex patterns, so
  # metacharacters such as "(" or "[" in the input cannot raise an error.
  sum(chars == "A") * 1 + sum(chars == "B") * 3 + sum(chars == "C") * 7
}

# Apply funfun() to every element of `x`; mapply() simplifies the result and
# names it after the input when `x` is a character vector.
ADD_char <- function(x) mapply(funfun, x)


 # Split one string into sliding windows: piece i starts at character i and
 # holds up to `width` characters, so the trailing pieces get shorter
 # (e.g. "AABBCC" -> "AAB" "ABB" "BBC" "BCC" "CC" "C").
 #
 # @param u A single string. Numeric values, NA and strings shorter than two
 #   characters are returned unchanged.
 # @param width Maximum number of characters per piece (default 3).
 # @return A character vector with one piece per character of `u`.
 funfun1 <- function(u, width = 3L) {
   if (is.numeric(u) || is.na(u) || nchar(u) < 2) {
     return(u)
   }
   n_chars <- nchar(u)
   starts <- seq_len(n_chars)
   # Clamp the window end so the last pieces stop at the end of the string.
   substring(u, starts, pmin(starts + width - 1L, n_chars))
 }

 # Expand every column of `data` into one column per sliding-window piece.
 #
 # Each source column is padded to its own maximum number of pieces, so the
 # pieces of one column can never spill into the columns of the next one when
 # a row holds a shorter string.
 #
 # @param data A data frame or matrix; numeric columns are kept as a single
 #   column, all other columns are split with funfun1().
 # @return A data frame of character columns. A source column that yields more
 #   than one piece is named <column><index> (e.g. L11, L12, ...); otherwise
 #   the original column name is kept. Missing pieces are "".
 funfun2 <- function(data) {
   data <- as.data.frame(data, stringsAsFactors = FALSE)
   split_column <- function(col_name) {
     column <- data[[col_name]]
     if (is.numeric(column)) {
       out <- matrix(as.character(column), ncol = 1L)
       colnames(out) <- col_name
       return(out)
     }
     values <- as.character(column)
     values[is.na(values)] <- ""
     pieces <- lapply(values, funfun1)
     n_pieces <- max(lengths(pieces), 1L)
     padded <- vapply(
       pieces,
       function(p) c(p, rep("", n_pieces - length(p))),
       character(n_pieces)
     )
     # `padded` holds one row's pieces per column, so fill the result by row.
     out <- matrix(padded, nrow = length(values), ncol = n_pieces, byrow = TRUE)
     colnames(out) <- if (n_pieces > 1L) {
       paste0(col_name, seq_len(n_pieces))
     } else {
       col_name
     }
     out
   }
   blocks <- lapply(names(data), split_column)
   as.data.frame(do.call(cbind, blocks), stringsAsFactors = FALSE)
 }

# Expand the first four columns of `interim` into sliding-window pieces.
k <- funfun2(interim[, 1:4])
# Show the class of every resulting column.
mapply(FUN = class, k)
# Turn the first column into numbers before scoring.
k[, 1] <- as.numeric(k[, 1])
# Score every column (printed at top level).
sapply(X = k, FUN = ADD_char)

     ID L11 L12 L13 L14 L15 L16 L21 L22 L23 L24 L25 L31 L32 L33 L34 L35
[1,]  1   5   7  13  17  14   7  11   9   8   1  NA  NA  NA  NA  NA  NA
[2,]  2   3   3   3   3   2   1  11  11  11   4   1  15  15  21  14   7
[3,]  3   7  11  11  10   3  11   9   8   1  11   5  11   8   7  NA  NA
[4,]  4  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
[5,]  5  13  10   4   4   4   3   9   9   2   1  11   7   5   4   1  NA

首先,在尝试此代码之前,请确保检查列的类:
sapply(interim,class)
如果上面的代码给出的是
"factor"
而不是
"character"
,那么您必须将数据框中的列改为
characters
而不是
factors
。只需通过
interim=data.frame(t(t(interim)),stringsAsFactors=F)
interim=sapply(interim,as.character)
就可以完成这项工作。然后将
ID
列更改为
numeric
,即
interim[,1]=as.numeric(interim[,1])
。确保数据为字符类型后,可以运行以下代码:

# Score one cell: (number of "A") * 1 + (number of "B") * 3 + (number of "C") * 7.
#
# @param u A single value. Numeric values are returned unchanged; anything
#   else is coerced to character (so factors are accepted as well).
# @return The numeric score, the untouched input when `u` is numeric, or NA
#   when `u` is an empty string or a missing value. Strings containing none of
#   A/B/C score 0.
funfun <- function(u) {
  if (is.numeric(u)) {
    return(u)
  }
  chars <- unlist(strsplit(as.character(u), ""))
  if (length(chars) == 0 || anyNA(chars)) {
    return(NA)
  }
  # Compare characters literally instead of using them as regex patterns, so
  # metacharacters such as "(" or "[" in the input cannot raise an error.
  sum(chars == "A") * 1 + sum(chars == "B") * 3 + sum(chars == "C") * 7
}

# Apply funfun() to every element of `x`; mapply() simplifies the result and
# names it after the input when `x` is a character vector.
ADD_char <- function(x) mapply(funfun, x)

 # Score every column of the data with ADD_char.
 # NOTE(review): R is case-sensitive and the other snippets name this object
 # `interim`; confirm that an object called `Interim` actually exists.
 sapply(Interim,ADD_char)
# Score one cell: (number of "A") * 1 + (number of "B") * 3 + (number of "C") * 7.
#
# @param u A single value. Numeric values are returned unchanged; anything
#   else is coerced to character (so factors are accepted as well).
# @return The numeric score, the untouched input when `u` is numeric, or NA
#   when `u` is an empty string or a missing value. Strings containing none of
#   A/B/C score 0.
funfun <- function(u) {
  if (is.numeric(u)) {
    return(u)
  }
  chars <- unlist(strsplit(as.character(u), ""))
  if (length(chars) == 0 || anyNA(chars)) {
    return(NA)
  }
  # Compare characters literally instead of using them as regex patterns, so
  # metacharacters such as "(" or "[" in the input cannot raise an error.
  sum(chars == "A") * 1 + sum(chars == "B") * 3 + sum(chars == "C") * 7
}

# Apply funfun() to every element of `x`; mapply() simplifies the result and
# names it after the input when `x` is a character vector.
ADD_char <- function(x) mapply(funfun, x)


 # Split one string into sliding windows: piece i starts at character i and
 # holds up to `width` characters, so the trailing pieces get shorter
 # (e.g. "AABBCC" -> "AAB" "ABB" "BBC" "BCC" "CC" "C").
 #
 # @param u A single string. Numeric values, NA and strings shorter than two
 #   characters are returned unchanged.
 # @param width Maximum number of characters per piece (default 3).
 # @return A character vector with one piece per character of `u`.
 funfun1 <- function(u, width = 3L) {
   if (is.numeric(u) || is.na(u) || nchar(u) < 2) {
     return(u)
   }
   n_chars <- nchar(u)
   starts <- seq_len(n_chars)
   # Clamp the window end so the last pieces stop at the end of the string.
   substring(u, starts, pmin(starts + width - 1L, n_chars))
 }

 # Expand every column of `data` into one column per sliding-window piece.
 #
 # Each source column is padded to its own maximum number of pieces, so the
 # pieces of one column can never spill into the columns of the next one when
 # a row holds a shorter string.
 #
 # @param data A data frame or matrix; numeric columns are kept as a single
 #   column, all other columns are split with funfun1().
 # @return A data frame of character columns. A source column that yields more
 #   than one piece is named <column><index> (e.g. L11, L12, ...); otherwise
 #   the original column name is kept. Missing pieces are "".
 funfun2 <- function(data) {
   data <- as.data.frame(data, stringsAsFactors = FALSE)
   split_column <- function(col_name) {
     column <- data[[col_name]]
     if (is.numeric(column)) {
       out <- matrix(as.character(column), ncol = 1L)
       colnames(out) <- col_name
       return(out)
     }
     values <- as.character(column)
     values[is.na(values)] <- ""
     pieces <- lapply(values, funfun1)
     n_pieces <- max(lengths(pieces), 1L)
     padded <- vapply(
       pieces,
       function(p) c(p, rep("", n_pieces - length(p))),
       character(n_pieces)
     )
     # `padded` holds one row's pieces per column, so fill the result by row.
     out <- matrix(padded, nrow = length(values), ncol = n_pieces, byrow = TRUE)
     colnames(out) <- if (n_pieces > 1L) {
       paste0(col_name, seq_len(n_pieces))
     } else {
       col_name
     }
     out
   }
   blocks <- lapply(names(data), split_column)
   as.data.frame(do.call(cbind, blocks), stringsAsFactors = FALSE)
 }

# Expand the first four columns of `interim` into sliding-window pieces.
k <- funfun2(interim[, 1:4])
# Show the class of every resulting column.
mapply(FUN = class, k)
# Turn the first column into numbers before scoring.
k[, 1] <- as.numeric(k[, 1])
# Score every column (printed at top level).
sapply(X = k, FUN = ADD_char)

     ID L11 L12 L13 L14 L15 L16 L21 L22 L23 L24 L25 L31 L32 L33 L34 L35
[1,]  1   5   7  13  17  14   7  11   9   8   1  NA  NA  NA  NA  NA  NA
[2,]  2   3   3   3   3   2   1  11  11  11   4   1  15  15  21  14   7
[3,]  3   7  11  11  10   3  11   9   8   1  11   5  11   8   7  NA  NA
[4,]  4  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
[5,]  5  13  10   4   4   4   3   9   9   2   1  11   7   5   4   1  NA
funfun = function(u){
  if(is.numeric(u)) return(u)
  s = unique(unlist(strsplit(u,"")))
  w = sapply(s,function(x)length(unlist(gregexpr(x,u))))
  ifelse(length(s)>0,sum(w["A"]*1,w["B"]* 3,w["C"]*7,na.rm = T),NA)
}
ADD_char=function(x) mapply(funfun,x)
funfun1=function(u){
  if(nchar(u)<2) return(u)
  a = unlist(strsplit(u,""))
  m = length(a)
  if(m>0)
  sapply(1:m, function(i)paste0(a[i:(ifelse(i+2<m,i+2,m))],collapse = ""))
}

贴出数据的图片真的没有什么帮助。请参阅如何分享可复现的示例。
抱歉,我现在已经包含了 dput(数据)。
那么,您真的需要 interim 数据,还是只需要最后一列?
谢谢,只需要最后一列,interim 数据表只是为了显示字符串应该如何拆分。谢谢!
我尝试了这个建议并得到了这个错误:"parse(text=script) 出错:文本参数中的解析错误:funfun 处的解析错误(第3行,字符1至6)"。我已将代码放入问题的编辑中,并公布了我实际获得的结果。
在使用上述函数时,我真的不知道为什么会出现错误,但似乎某个地方存在问题。interim 数据是您的全部数据吗?让我检查一下,看看是否可以提供帮助。谢谢您。
谢谢,原始数据是我的全部数据,interim 数据只是显示它们应该如何拆分成子字符串。我已经在图中显示了这一点。
您可以尝试运行
ADD_char(interim[,2])
你得到了什么?哦,好吧,那么你的数据还没有被拆分成子集?根据你上面给出的
interim
数据,没有必要再对它取子集,因为它已经是拆分好的了。我原以为你已经完成了拆分。让我再试试,并在函数中包含拆分步骤。