在r中重新编码时将列拆分为两列
我有以下资料:在r中重新编码时将列拆分为两列,r,split,R,Split,我有以下资料: set.seed(123) M1 <- c(sample(c("AA", "AB", "BB"), 5, replace = T)) M2k <- c(sample (c("AG", "GG", "AA"), 5, replace = T)) M3l <- c(sample (c("AT", "TT", "AA"), 5, replace = T)) M4 <- c(sample (c("CT", "TT", "CC"), 5, replace = T)
# Build a small reproducible example: four marker columns, each holding five
# two-letter codes drawn with replacement from three possible values.
set.seed(123)
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
M1 <- sample(c("AA", "AB", "BB"), 5, replace = TRUE)
M2k <- sample(c("AG", "GG", "AA"), 5, replace = TRUE)
M3l <- sample(c("AT", "TT", "AA"), 5, replace = TRUE)
M4 <- sample(c("CT", "TT", "CC"), 5, replace = TRUE)
# In the real data the markers run M1 ... M1000.
myd <- data.frame(M1, M2k, M3l, M4)
set.seed(123)
M1
编辑:reshape::colsplit 会按 ''(空字符串)进行拆分。
使用 reshape::colsplit:
library(reshape)

# Split the column named `.col` of `data` into two columns with colsplit(),
# naming them <col>a and <col>b. The result is returned invisibly, exactly as
# the value of a trailing assignment would be.
split_col <- function(.col, data) {
  half_names <- paste0(.col, letters[1:2])
  invisible(colsplit(data[[.col]], names = half_names))
}

# Split every column of myd, then bind the pieces side by side.
split_parts <- lapply(names(myd), split_col, data = myd)
new_data <- do.call(cbind, split_parts)

# Recode each new column as a factor over A/B/C/G/T labelled 1:5, as requested.
letter_levels <- c("A", "B", "C", "G", "T")
recoded <- lapply(new_data, factor, levels = letter_levels, labels = 1:5)
new_data_2 <- do.call(data.frame, recoded)
M1a M1b M2ka M2kb M3la M3lb M4a M4b
1 1 1 1 4 1 1 3 3
2 2 2 4 4 5 5 3 5
3 1 2 1 1 1 1 3 5
4 2 2 4 4 5 5 3 5
5 2 2 4 4 1 5 3 3
library(reshape)
split_col
编辑:reshape::colsplit 会按 ''(空字符串)进行拆分。
使用 reshape::colsplit:
library(reshape)

# Split the column named `.col` of `data` into two columns with colsplit(),
# naming them <col>a and <col>b. The result is returned invisibly, exactly as
# the value of a trailing assignment would be.
split_col <- function(.col, data) {
  half_names <- paste0(.col, letters[1:2])
  invisible(colsplit(data[[.col]], names = half_names))
}

# Split every column of myd, then bind the pieces side by side.
split_parts <- lapply(names(myd), split_col, data = myd)
new_data <- do.call(cbind, split_parts)

# Recode each new column as a factor over A/B/C/G/T labelled 1:5, as requested.
letter_levels <- c("A", "B", "C", "G", "T")
recoded <- lapply(new_data, factor, levels = letter_levels, labels = 1:5)
new_data_2 <- do.call(data.frame, recoded)
M1a M1b M2ka M2kb M3la M3lb M4a M4b
1 1 1 1 4 1 1 3 3
2 2 2 4 4 5 5 3 5
3 1 2 1 1 1 1 3 5
4 2 2 4 4 5 5 3 5
5 2 2 4 4 1 5 3 3
library(reshape)
split_col
这是另一种可能的解决方案;它没有什么特别的优势,只是我觉得它比较容易理解:
# Add another test column; it contains letters (Q, W, K) that are absent from
# the recode table below and therefore come out as NA.
myd$M5 <- c("AB", "GT", "GA", "QW", "CK")
# Convert to a matrix for speed and uniform indexing.
mat <- as.matrix(myd)

# Each source column yields two output columns: <name>a and <name>b.
new_names <- paste0(rep(colnames(mat), each = 2L), c("a", "b"))

# Pre-allocate the result with the final column names.
n_out <- 2L * ncol(mat)
newmat <- matrix(NA_character_, nrow = nrow(mat), ncol = n_out,
                 dimnames = list(NULL, new_names))

# Split all columns at once: odd output columns take the first character,
# even output columns take the second. substr() keeps the matrix shape, so
# the values line up column by column.
first_cols <- seq(1L, n_out, by = 2L)
newmat[, first_cols] <- substr(mat, 1L, 1L)
newmat[, first_cols + 1L] <- substr(mat, 2L, 2L)

# Use a named vector to recode the letters.
recode <- c(A = 1, B = 2, C = 3, G = 4, T = 5)
# Assigning through `[]` keeps the dimensions and column names; the cells stay
# character (e.g. "1"), as the printed output shows.
newmat[] <- recode[newmat]
newmat
# M1a M1b M2ka M2kb M3la M3lb M4a M4b M5a M5b
# [1,] "1" "1" "1" "4" "1" "1" "3" "3" "1" "2"
# [2,] "2" "2" "4" "4" "5" "5" "3" "5" "4" "5"
# [3,] "1" "2" "1" "1" "1" "1" "3" "5" "4" "1"
# [4,] "2" "2" "4" "4" "5" "5" "3" "5" NA NA
# [5,] "2" "2" "4" "4" "1" "5" "3" "3" "3" NA
下面是另一种可能的解决方案;它没有什么特别的优势,只是我觉得它比较容易理解:
# Add another test column; it contains letters (Q, W, K) that are absent from
# the recode table below and therefore come out as NA.
myd$M5 <- c("AB", "GT", "GA", "QW", "CK")
# Convert to a matrix for speed and uniform indexing.
mat <- as.matrix(myd)

# Each source column yields two output columns: <name>a and <name>b.
new_names <- paste0(rep(colnames(mat), each = 2L), c("a", "b"))

# Pre-allocate the result with the final column names.
n_out <- 2L * ncol(mat)
newmat <- matrix(NA_character_, nrow = nrow(mat), ncol = n_out,
                 dimnames = list(NULL, new_names))

# Split all columns at once: odd output columns take the first character,
# even output columns take the second. substr() keeps the matrix shape, so
# the values line up column by column.
first_cols <- seq(1L, n_out, by = 2L)
newmat[, first_cols] <- substr(mat, 1L, 1L)
newmat[, first_cols + 1L] <- substr(mat, 2L, 2L)

# Use a named vector to recode the letters.
recode <- c(A = 1, B = 2, C = 3, G = 4, T = 5)
# Assigning through `[]` keeps the dimensions and column names; the cells stay
# character (e.g. "1"), as the printed output shows.
newmat[] <- recode[newmat]
newmat
# M1a M1b M2ka M2kb M3la M3lb M4a M4b M5a M5b
# [1,] "1" "1" "1" "4" "1" "1" "3" "3" "1" "2"
# [2,] "2" "2" "4" "4" "5" "5" "3" "5" "4" "5"
# [3,] "1" "2" "1" "1" "1" "1" "3" "5" "4" "1"
# [4,] "2" "2" "4" "4" "5" "5" "3" "5" NA NA
# [5,] "2" "2" "4" "4" "1" "5" "3" "3" "3" NA
mnel 已经给出了一个非常直接的回答。下面是我用自己正在 GitHub 上开发的包(qdap)做的一个尝试,不过它还没有发布到 CRAN:
安装 qdap:
# install.packages("devtools")
library(devtools)
# install_github() takes the repository as a single "user/repo" string; the
# older two-argument (repo, username) form is not supported by current devtools.
install_github("trinker/qdap")
解决问题:
# For each original column index i, overwrite the global myd (via <<-) with the
# result of colsplit2df() applied to column 2*i - 1 with sep = "". The new names
# are that column's current name with "a" / "b" appended.
# NOTE(review): the 2*i - 1 index presumably compensates for every earlier
# split having replaced one column with two -- confirm against colsplit2df().
lapply(seq_along(myd), function(i){
myd <<- colsplit2df(myd, (i+i-1), paste0(names(myd)[i+i-1],
letters[1:2]), sep="")
})
# Recode column by column, convert to numeric, and wrap in a data frame.
# NOTE(review): text2color() presumably maps A/B/C/G/T to 1:5 and anything else
# to the trailing NA -- confirm against the qdap documentation.
data.frame(apply(myd, 2, function(x) as.numeric(text2color(x,
c("A", "B", "C", "G", "T"), c(1:5, NA)))))
lapply(seq_along(myd), function(i){
myd
mnel 已经给出了一个非常直接的回答。下面是我用自己正在 GitHub 上开发的包(qdap)做的一个尝试,不过它还没有发布到 CRAN:
安装 qdap:
# install.packages("devtools")
library(devtools)
# install_github() takes the repository as a single "user/repo" string; the
# older two-argument (repo, username) form is not supported by current devtools.
install_github("trinker/qdap")
解决问题:
# For each original column index i, overwrite the global myd (via <<-) with the
# result of colsplit2df() applied to column 2*i - 1 with sep = "". The new names
# are that column's current name with "a" / "b" appended.
# NOTE(review): the 2*i - 1 index presumably compensates for every earlier
# split having replaced one column with two -- confirm against colsplit2df().
lapply(seq_along(myd), function(i){
myd <<- colsplit2df(myd, (i+i-1), paste0(names(myd)[i+i-1],
letters[1:2]), sep="")
})
# Recode column by column, convert to numeric, and wrap in a data frame.
# NOTE(review): text2color() presumably maps A/B/C/G/T to 1:5 and anything else
# to the trailing NA -- confirm against the qdap documentation.
data.frame(apply(myd, 2, function(x) as.numeric(text2color(x,
c("A", "B", "C", "G", "T"), c(1:5, NA)))))
lapply(seq_along(myd), function(i){
myd
使用 qdap 的一个更稳定的解决方案:
# Split every column of myd in one call; seq_len() avoids the 1:0 trap that
# 1:ncol(myd) falls into when there are no columns.
x <- colsplit2df(myd, seq_len(ncol(myd)), sep = "")
# Name the halves <col>a / <col>b with no separator, matching the column
# headers shown in the output below (M1a, M1b, ...).
colnames(x) <- paste0(rep(colnames(myd), each = 2), letters[1:2])
## M1a M1b M2ka M2kb M3la M3lb M4a M4b
## 1 1 1 1 4 1 1 3 3
## 2 2 2 4 4 5 5 3 5
## 3 1 2 1 1 1 1 3 5
## 4 2 2 4 4 5 5 3 5
## 5 2 2 4 4 1 5 3 3
x
使用 qdap 的一个更稳定的解决方案:
# Split every column of myd in one call; seq_len() avoids the 1:0 trap that
# 1:ncol(myd) falls into when there are no columns.
x <- colsplit2df(myd, seq_len(ncol(myd)), sep = "")
# Name the halves <col>a / <col>b with no separator, matching the column
# headers shown in the output below (M1a, M1b, ...).
colnames(x) <- paste0(rep(colnames(myd), each = 2), letters[1:2])
## M1a M1b M2ka M2kb M3la M3lb M4a M4b
## 1 1 1 1 4 1 1 3 3
## 2 2 2 4 4 5 5 3 5
## 3 1 2 1 1 1 1 3 5
## 4 2 2 4 4 5 5 3 5
## 5 2 2 4 4 1 5 3 3
x
感谢您的回答。它适用于这个小示例,但当我在更大的示例上尝试时,它没有显示其他选项,只显示它把变量拆分并命名到某一个变量为止,然后就停止了!错误信息是:“'names' attribute [2] must be the same length as the vector [1]”。我发现问题的原因出在数据集中。我需要对此进行改进,以便 NA 或……
感谢您的回答。它适用于这个小示例,但当我在更大的示例上尝试时,它没有显示其他选项,只显示它把变量拆分并命名到某一个变量为止,然后就停止了!错误信息是:“'names' attribute [2] must be the same length as the vector [1]”。我发现问题的原因出在数据集中。我需要对此进行改进,以便 NA 或……