R 如何根据给定的值将文本字符串转换为向量,并用数值替换每个字母
我有一个字符串列表(类:“factor”),在下面的代码中称为`ptuples`:
# Twenty one-letter codes; the same alphabet is used for all three positions.
aminoacid <- c(
  "A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
  "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"
)
aminoacid1 <- aminoacid
aminoacid2 <- aminoacid

# All ordered 3-letter combinations (20^3 rows); expand.grid varies the first
# position fastest, so the rows start AAA, CAA, DAA, ...
df <- expand.grid(aminoacid, aminoacid1, aminoacid2)

# Glue the three letters of each row into one string stored in `newname`.
df <- transform(df, newname = paste0(Var1, Var2, Var3))

# `newname` is the fourth column of `df`.
ptuples <- df[["newname"]]
如何将这些字符串转换为长度为15的数值向量?下一步是使用k-means聚类将这些向量分为100个组,因此,如果有某种特定的格式可以让这一步更容易,我将不胜感激。
谢谢
这将输出一个矩阵,每个p元组对应一行、共15列,其中`key`是您的20x5数据框或数值矩阵。您应该能够在`dist`函数或类似函数中直接使用此输出进行进一步聚类。
# One-letter alphabet shared by the three tuple positions.
aminoacid <- c(
  "A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
  "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"
)
aminoacid1 <- aminoacid
aminoacid2 <- aminoacid

# Cartesian product of the three positions: one row per 3-letter tuple.
df <- expand.grid(aminoacid, aminoacid1, aminoacid2)

# Add the concatenated tuple as a fourth column named `newname`.
df <- transform(df, newname = paste0(Var1, Var2, Var3))

# Pull the tuple column out by name (it is column 4).
ptuples <- df[["newname"]]
# Lookup table: one row per letter (the letters become the row names) and
# five numeric columns named pah, pss, ms, cc and ec.
# The header line has one field fewer than the data lines, so read.table
# takes the first field of every data line as the row name.
key <- read.table(
  header = TRUE,
  text = " pah pss ms cc ec
A -0.59145974 -1.30209266 -0.7330651 1.5703918 -0.14550842
C -1.34267179 0.46542300 -0.8620345 -1.0200786 -0.25516894
D 1.05015062 0.30242411 -3.6559147 -0.2590236 -3.24176791
E 1.35733226 -1.45275578 1.4766610 0.1129444 -0.83715681
F -1.00610084 -0.59046634 1.8909687 -0.3966186 0.41194139
G -0.38387987 1.65201497 1.3301017 1.0449765 2.06385566
H 0.33616543 -0.41662780 -1.6733690 -1.4738898 -0.07772917
I -1.23936304 -0.54652238 2.1314349 0.3931618 0.81630366
K 1.83146558 -0.56109831 0.5332237 -0.2771101 1.64762794
L -1.01895162 -0.98693471 -1.5046185 1.2658296 -0.91181195
M -0.66312569 -1.52353917 2.2194787 -1.0047207 1.21181214
N 0.94535614 0.82846219 1.2991286 -0.1688162 0.93339498
P 0.18862522 2.08084151 -1.6283286 0.4207004 -1.39177378
Q 0.93056541 -0.17926549 -3.0048731 -0.5025910 -1.85303476
R 1.53754853 -0.05472897 1.5021086 0.4403185 2.89744417
S -0.22788299 1.39869991 -4.7596375 0.6701745 -2.64747356
T -0.03181782 0.32571153 2.2134612 0.9078985 1.31337035
V -1.33661279 -0.27854634 -0.5440132 1.2419935 -1.26225362
W -0.59533918 0.00907760 0.6719274 -2.1275244 -0.18358096
Y 0.25999617 0.82992312 3.0973596 -0.8380164 1.51150958"
)
# Keep each row's letter in an ordinary column so rows can be selected by it.
key$letter <- row.names(key)

# Fold over the letters: every occurrence of a letter in `ptuples` is replaced
# by that letter's five key values pasted into one space-separated string.
# Each pass feeds its result into the next, so after the last letter every
# tuple has become a string of numbers.
ptuples <- Reduce(
  function(tuples, aa) {
    gsub(aa, paste(key[key$letter == aa, 1:5], " ", collapse = " "), tuples)
  },
  key$letter,
  ptuples
)

# Show the first converted tuple.
ptuples[1]
# Build a numeric matrix with one row per tuple: the key values of the first
# letter, then of the second, then of the third (3 x 5 = 15 columns when `key`
# has five numeric columns). Row names are the tuple strings.
#
# The earlier nested sapply() over data-frame rows produced a list-mode matrix
# (each cell a length-one list), which dist()/kmeans() cannot use, and it also
# picked up any non-numeric column of `key` (such as `letter`). The lookup here
# is vectorised and always numeric.
#
# NOTE(review): expects `ptuples` to hold the letter tuples themselves, i.e.
# not yet rewritten into number strings by a gsub() pass.
tuple_strings <- as.character(ptuples)

# Accept `key` as a data frame (numeric columns only) or as a numeric matrix.
key_matrix <- if (is.data.frame(key)) {
  as.matrix(key[, vapply(key, is.numeric, logical(1)), drop = FALSE])
} else {
  as.matrix(key)
}

output <- do.call(
  cbind,
  lapply(1:3, function(i) {
    # match() gives NA for a letter absent from `key`, so such a tuple gets
    # NA values rather than raising a subscript error.
    rows <- match(substr(tuple_strings, i, i), rownames(key_matrix))
    key_matrix[rows, , drop = FALSE]
  })
)
dimnames(output) <- list(tuple_strings, NULL)
head(output)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15]
AAA -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
CAA -1.342672 0.465423 -0.8620345 -1.020079 -0.2551689 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
DAA 1.050151 0.3024241 -3.655915 -0.2590236 -3.241768 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
EAA 1.357332 -1.452756 1.476661 0.1129444 -0.8371568 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
FAA -1.006101 -0.5904663 1.890969 -0.3966186 0.4119414 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
GAA -0.3838799 1.652015 1.330102 1.044976 2.063856 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
输出
您所说的对象`key`在哪里?你是否从我的答案中复制了一些代码,却忘了包含它?
这只是我从问题中复制的表格(使用`overflow::soread`),尽管我认为使用与您的答案相同的名称是有意义的。我以为OP已经有了它,所以没有再复制它。
@AndrewGustar 如果我有长度为5的字符串,我将如何更改它?只需将其更改为`sapply(1:5, …)`吗?
@AndrewGustar 这样做会在输出中给我几个NA值,即使输入字符串中的所有字符都在`key`中。
@JackArnestad 有趣。我刚刚尝试了一部分`ptuples`(5),也遇到了这个问题,虽然我不太清楚为什么。不过,我确实解决了这个问题:在我使用的子集中删除了未使用的因子水平,因此您可以尝试这样做。
输出
library(splitstackshape)

# Split each space-separated `ptuples` string into one column per piece
# (columns are named ptuples_01, ptuples_02, ...).
# NOTE(review): with type.convert = FALSE the pieces presumably stay as text;
# confirm and convert to numeric before passing them to a clustering routine.
df <- cSplit(
  indt = as.data.frame(ptuples),
  splitCols = "ptuples",
  sep = " ",
  type.convert = FALSE
)
head(df)
ptuples_01 ptuples_02 ptuples_03 ptuples_04 ptuples_05 ptuples_06 ptuples_07 ptuples_08 ptuples_09 ptuples_10 ptuples_11 ptuples_12 ptuples_13 ptuples_14
1: -0.59145974 -1.30209266 -0.7330651 1.5703918 -0.14550842 -0.59145974 -1.30209266 -0.7330651 1.5703918 -0.14550842 -0.59145974 -1.30209266 -0.7330651 1.5703918
2: -1.34267179 0.465423 -0.8620345 -1.0200786 -0.25516894 -0.59145974 -1.30209266 -0.7330651 1.5703918 -0.14550842 -0.59145974 -1.30209266 -0.7330651 1.5703918
3: 1.05015062 0.30242411 -3.6559147 -0.2590236 -3.24176791 -0.59145974 -1.30209266 -0.7330651 1.5703918 -0.14550842 -0.59145974 -1.30209266 -0.7330651 1.5703918
4: 1.35733226 -1.45275578 1.476661 0.1129444 -0.83715681 -0.59145974 -1.30209266 -0.7330651 1.5703918 -0.14550842 -0.59145974 -1.30209266 -0.7330651 1.5703918
5: -1.00610084 -0.59046634 1.8909687 -0.3966186 0.41194139 -0.59145974 -1.30209266 -0.7330651 1.5703918 -0.14550842 -0.59145974 -1.30209266 -0.7330651 1.5703918
6: -0.38387987 1.65201497 1.3301017 1.0449765 2.06385566 -0.59145974 -1.30209266 -0.7330651 1.5703918 -0.14550842 -0.59145974 -1.30209266 -0.7330651 1.5703918
ptuples_15
1: -0.14550842
2: -0.14550842
3: -0.14550842
4: -0.14550842
5: -0.14550842
6: -0.14550842
# Build a numeric matrix with one row per tuple: the key values of the first
# letter, then of the second, then of the third (3 x 5 = 15 columns when `key`
# has five numeric columns). Row names are the tuple strings.
#
# The earlier nested sapply() over data-frame rows produced a list-mode matrix
# (each cell a length-one list), which dist()/kmeans() cannot use, and it also
# picked up any non-numeric column of `key` (such as `letter`). The lookup here
# is vectorised and always numeric.
#
# NOTE(review): expects `ptuples` to hold the letter tuples themselves, i.e.
# not yet rewritten into number strings by a gsub() pass.
tuple_strings <- as.character(ptuples)

# Accept `key` as a data frame (numeric columns only) or as a numeric matrix.
key_matrix <- if (is.data.frame(key)) {
  as.matrix(key[, vapply(key, is.numeric, logical(1)), drop = FALSE])
} else {
  as.matrix(key)
}

output <- do.call(
  cbind,
  lapply(1:3, function(i) {
    # match() gives NA for a letter absent from `key`, so such a tuple gets
    # NA values rather than raising a subscript error.
    rows <- match(substr(tuple_strings, i, i), rownames(key_matrix))
    key_matrix[rows, , drop = FALSE]
  })
)
dimnames(output) <- list(tuple_strings, NULL)
head(output)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15]
AAA -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
CAA -1.342672 0.465423 -0.8620345 -1.020079 -0.2551689 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
DAA 1.050151 0.3024241 -3.655915 -0.2590236 -3.241768 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
EAA 1.357332 -1.452756 1.476661 0.1129444 -0.8371568 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
FAA -1.006101 -0.5904663 1.890969 -0.3966186 0.4119414 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084
GAA -0.3838799 1.652015 1.330102 1.044976 2.063856 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084 -0.5914597 -1.302093 -0.7330651 1.570392 -0.1455084