R 如何根据给定的值将文本字符串转换为向量，并用数值替换每个字母_R

R 如何根据给定的值将文本字符串转换为向量，并用数值替换每个字母

R 如何根据给定的值将文本字符串转换为向量，并用数值替换每个字母,r,R,我有一个字符串列表（类：“factor”），在下面的代码中称为ptuples： aminoacid <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y") aminoacid1 <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R"

我有一个字符串列表（类：“factor”），在下面的代码中称为ptuples：

# The 20 single-letter codes used to build the tuples.
aminoacid <- c(
  "A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
  "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"
)
# The second and third positions draw from the same alphabet.
aminoacid1 <- aminoacid
aminoacid2 <- aminoacid

# One row per combination of the three positions (columns Var1, Var2, Var3),
# plus a `newname` column holding the three letters glued together.
df <- expand.grid(aminoacid, aminoacid1, aminoacid2)
df <- transform(df, newname = paste0(Var1, Var2, Var3))

# The combined strings are the fourth column of `df`.
ptuples <- df[["newname"]]

如何将这些字符串各自转换为长度为 15 的数值向量（每个字母对应 5 个数值，3 个字母共 15 个）？下一步是使用 k-means 聚类把这些向量分成 100 个组，因此，如果有某种特定的输出格式能让这一步更容易，我将不胜感激。

谢谢

这将输出一个矩阵，每个 ptuple 对应一行，共 15 列，其中 `key` 是您的 20x5 数据框或数值矩阵。您应该能够在 `dist` 函数或类似函数中直接使用该矩阵进行进一步聚类。
# Encode every three-letter tuple as 15 numbers (5 values per letter), both as
# a space-separated string (`ptuples`) and as a numeric matrix (`output`).

# All combinations of three letters drawn from the same 20-letter alphabet.
aminoacid <- c(
  "A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
  "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"
)
aminoacid1 <- aminoacid
aminoacid2 <- aminoacid
df <- expand.grid(aminoacid, aminoacid1, aminoacid2)
df <- transform(df, newname = paste0(Var1, Var2, Var3))
ptuples <- df[, 4]

# Lookup table: one row per letter (row names), five numeric columns.
key <- read.table(text="          pah         pss         ms         cc          ec
A -0.59145974 -1.30209266 -0.7330651  1.5703918 -0.14550842
                  C -1.34267179  0.46542300 -0.8620345 -1.0200786 -0.25516894
                  D  1.05015062  0.30242411 -3.6559147 -0.2590236 -3.24176791
                  E  1.35733226 -1.45275578  1.4766610  0.1129444 -0.83715681
                  F -1.00610084 -0.59046634  1.8909687 -0.3966186  0.41194139
                  G -0.38387987  1.65201497  1.3301017  1.0449765  2.06385566
                  H  0.33616543 -0.41662780 -1.6733690 -1.4738898 -0.07772917
                  I -1.23936304 -0.54652238  2.1314349  0.3931618  0.81630366
                  K  1.83146558 -0.56109831  0.5332237 -0.2771101  1.64762794
                  L -1.01895162 -0.98693471 -1.5046185  1.2658296 -0.91181195
                  M -0.66312569 -1.52353917  2.2194787 -1.0047207  1.21181214
                  N  0.94535614  0.82846219  1.2991286 -0.1688162  0.93339498
                  P  0.18862522  2.08084151 -1.6283286  0.4207004 -1.39177378
                  Q  0.93056541 -0.17926549 -3.0048731 -0.5025910 -1.85303476
                  R  1.53754853 -0.05472897  1.5021086  0.4403185  2.89744417
                  S -0.22788299  1.39869991 -4.7596375  0.6701745 -2.64747356
                  T -0.03181782  0.32571153  2.2134612  0.9078985  1.31337035
                  V -1.33661279 -0.27854634 -0.5440132  1.2419935 -1.26225362
                  W -0.59533918  0.00907760  0.6719274 -2.1275244 -0.18358096
                  Y  0.25999617  0.82992312  3.0973596 -0.8380164  1.51150958")

key$letter <- row.names(key)

# `ptuples` is overwritten with its encoded text below, so keep the original
# letter strings for the numeric lookup that follows.
tuple_letters <- as.character(ptuples)
tuple_width <- 3L  # letters per tuple

# Text encoding: each letter becomes its five values joined by single spaces.
# Looking letters up by position (instead of running gsub() once per letter)
# never re-scans text that has already been substituted.
encoded_by_letter <- vapply(
  key$letter,
  function(l) paste(key[key$letter == l, 1:5], collapse = " "),
  character(1)
)
ptuples <- do.call(
  paste,
  c(
    lapply(
      seq_len(tuple_width),
      function(i) unname(encoded_by_letter[substr(tuple_letters, i, i)])
    ),
    sep = " "
  )
)

ptuples[1]

# Numeric encoding: one row per tuple, five columns per letter position.
# Only the five value columns are used, so the `letter` column added above
# cannot leak into the result, and the matrix is numeric rather than list-mode.
key_values <- as.matrix(key[, 1:5])
output <- do.call(
  cbind,
  lapply(
    seq_len(tuple_width),
    function(i) {
      key_values[match(substr(tuple_letters, i, i), key$letter), , drop = FALSE]
    }
  )
)
dimnames(output) <- list(tuple_letters, NULL)

head(output)
    [,1]       [,2]       [,3]       [,4]       [,5]       [,6]       [,7]      [,8]       [,9]     [,10]      [,11]      [,12]     [,13]      [,14]    [,15]     
AAA -0.5914597 -1.302093  -0.7330651 1.570392   -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
CAA -1.342672  0.465423   -0.8620345 -1.020079  -0.2551689 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
DAA 1.050151   0.3024241  -3.655915  -0.2590236 -3.241768  -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
EAA 1.357332   -1.452756  1.476661   0.1129444  -0.8371568 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
FAA -1.006101  -0.5904663 1.890969   -0.3966186 0.4119414  -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
GAA -0.3838799 1.652015   1.330102   1.044976   2.063856   -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084

输出

您所说的对象 `key` 在哪里？您是否从我的答案中复制了一些代码，却忘了包含它？

这只是我从问题中复制的表格（使用 `overflow::soread`），不过我认为使用与您的答案相同的名称是有意义的。我以为提问者（OP）已经有了它，所以没有再复制一遍。

@AndrewGustar 如果我的字符串长度为 5，我该如何修改？只需将其改为 `sapply(1:5, …)` 吗？

@AndrewGustar 这样做会在输出中给我几个 NA 值，即使输入字符串中的所有字符都在 `key` 中。

@JackArnestad 有趣。我刚刚用 ptuples 的一部分（5）试了一下，也遇到了这个问题，虽然我不太清楚原因。不过，我在所用的子集中删除了未使用的因子水平之后，问题就解决了，因此您可以试一试。

输出
# Break each space-separated string in `ptuples` into separate columns with
# cSplit() from the splitstackshape package, then preview the first rows.
library(splitstackshape)
df <- cSplit(
  as.data.frame(ptuples),
  splitCols = "ptuples",
  sep = " ",
  # NOTE(review): with type.convert = FALSE the new columns are presumably
  # left as text; convert them to numeric before clustering -- TODO confirm.
  type.convert = FALSE
)
head(df)

    ptuples_01  ptuples_02 ptuples_03 ptuples_04  ptuples_05  ptuples_06  ptuples_07 ptuples_08 ptuples_09  ptuples_10  ptuples_11  ptuples_12 ptuples_13 ptuples_14
1: -0.59145974 -1.30209266 -0.7330651  1.5703918 -0.14550842 -0.59145974 -1.30209266 -0.7330651  1.5703918 -0.14550842 -0.59145974 -1.30209266 -0.7330651  1.5703918
2: -1.34267179    0.465423 -0.8620345 -1.0200786 -0.25516894 -0.59145974 -1.30209266 -0.7330651  1.5703918 -0.14550842 -0.59145974 -1.30209266 -0.7330651  1.5703918
3:  1.05015062  0.30242411 -3.6559147 -0.2590236 -3.24176791 -0.59145974 -1.30209266 -0.7330651  1.5703918 -0.14550842 -0.59145974 -1.30209266 -0.7330651  1.5703918
4:  1.35733226 -1.45275578   1.476661  0.1129444 -0.83715681 -0.59145974 -1.30209266 -0.7330651  1.5703918 -0.14550842 -0.59145974 -1.30209266 -0.7330651  1.5703918
5: -1.00610084 -0.59046634  1.8909687 -0.3966186  0.41194139 -0.59145974 -1.30209266 -0.7330651  1.5703918 -0.14550842 -0.59145974 -1.30209266 -0.7330651  1.5703918
6: -0.38387987  1.65201497  1.3301017  1.0449765  2.06385566 -0.59145974 -1.30209266 -0.7330651  1.5703918 -0.14550842 -0.59145974 -1.30209266 -0.7330651  1.5703918
    ptuples_15
1: -0.14550842
2: -0.14550842
3: -0.14550842
4: -0.14550842
5: -0.14550842
6: -0.14550842

# Encode each string in `ptuples` as one numeric row: the values that `key`
# holds for the first letter, then the second, then the third.
#
# `ptuples` must still contain the letter strings at this point, and the row
# names of `key` must be the letters.
tuple_strings <- as.character(ptuples)
tuple_width <- 3L  # letters read from each string; raise for longer strings

# Use only numeric columns, so a label column in `key` cannot turn the result
# into a list/character matrix. A plain numeric matrix is accepted as well.
key_values <- if (is.data.frame(key)) {
  as.matrix(key[vapply(key, is.numeric, logical(1))])
} else {
  as.matrix(key)
}

# Row of `key` for the letter at each position; NA when a letter has no row.
row_index <- lapply(
  seq_len(tuple_width),
  function(i) match(substr(tuple_strings, i, i), rownames(key))
)
if (anyNA(unlist(row_index))) {
  warning(
    "some characters in `ptuples` have no matching row in `key`; ",
    "their values are NA",
    call. = FALSE
  )
}

# Whole-column lookups bound side by side give a numeric matrix directly,
# instead of the list-mode matrix produced by nested sapply() over data-frame
# rows.
output <- do.call(
  cbind,
  lapply(row_index, function(idx) key_values[idx, , drop = FALSE])
)
dimnames(output) <- list(tuple_strings, NULL)

head(output)
    [,1]       [,2]       [,3]       [,4]       [,5]       [,6]       [,7]      [,8]       [,9]     [,10]      [,11]      [,12]     [,13]      [,14]    [,15]     
AAA -0.5914597 -1.302093  -0.7330651 1.570392   -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
CAA -1.342672  0.465423   -0.8620345 -1.020079  -0.2551689 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
DAA 1.050151   0.3024241  -3.655915  -0.2590236 -3.241768  -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
EAA 1.357332   -1.452756  1.476661   0.1129444  -0.8371568 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
FAA -1.006101  -0.5904663 1.890969   -0.3966186 0.4119414  -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
GAA -0.3838799 1.652015   1.330102   1.044976   2.063856   -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084