在r中重新编码时将列拆分为两列

在r中重新编码时将列拆分为两列,r,split,R,Split,我有以下资料: set.seed(123) M1 <- c(sample(c("AA", "AB", "BB"), 5, replace = T)) M2k <- c(sample (c("AG", "GG", "AA"), 5, replace = T)) M3l <- c(sample (c("AT", "TT", "AA"), 5, replace = T)) M4 <- c(sample (c("CT", "TT", "CC"), 5, replace = T)

我有以下资料:

# Build a small reproducible example: four marker columns, five rows each,
# where every value is a two-letter genotype code.
set.seed(123)
M1 <- sample(c("AA", "AB", "BB"), 5, replace = TRUE)
M2k <- sample(c("AG", "GG", "AA"), 5, replace = TRUE)
M3l <- sample(c("AT", "TT", "AA"), 5, replace = TRUE)
M4 <- sample(c("CT", "TT", "CC"), 5, replace = TRUE)

# In the real data there are many more marker columns (M1 ... M1000).

myd <- data.frame(M1, M2k, M3l, M4)
set.seed(123)

M1

编辑:`reshape::colsplit` 会按 `''`(空字符串)进行拆分。

使用 `reshape::colsplit`:

library(reshape)

#' Split a two-character column into two single-character columns.
#'
#' @param .col Name (character scalar) of the column in `data` to split.
#' @param data A data frame containing the column named by `.col`.
#' @return A data frame with one row per row of `data` and two character
#'   columns named `<.col>a` and `<.col>b`, holding the first and the second
#'   character of each value. `NA` inputs give `NA` in both columns.
split_col <- function(.col, data) {
  # as.character() so that factor columns are split on their labels.
  values <- as.character(data[[.col]])

  # substr() is used rather than reshape::colsplit() because the latter was
  # reported to stop with "'names' attribute [2] must be the same length as
  # the vector [1]" on data containing NA; substr() simply propagates NA.
  parts <- data.frame(
    substr(values, 1, 1),
    substr(values, 2, 2),
    stringsAsFactors = FALSE
  )
  names(parts) <- paste0(.col, letters[1:2])
  parts
}

# Split every column of myd with split_col() and bind the pieces side by side.
new_data <- do.call(
  cbind,
  lapply(names(myd), function(col_name) split_col(col_name, data = myd))
)
# Turn each split column into a factor over the letters A, B, C, G, T,
# labelled 1 to 5 in that order.
new_data_2 <- do.call(
  data.frame,
  lapply(new_data, function(column) {
    factor(column, levels = c('A','B','C','G','T'), labels = 1:5)
  })
)

  M1a M1b M2ka M2kb M3la M3lb M4a M4b
1   1   1    1    4    1    1   3   3
2   2   2    4    4    5    5   3   5
3   1   2    1    1    1    1   3   5
4   2   2    4    4    5    5   3   5
5   2   2    4    4    1    5   3   3
library(reshape)

split_col

编辑:`reshape::colsplit` 会按 `''`(空字符串)进行拆分。

使用 `reshape::colsplit`:

library(reshape)

#' Split a two-character column into two single-character columns.
#'
#' @param .col Name (character scalar) of the column in `data` to split.
#' @param data A data frame containing the column named by `.col`.
#' @return A data frame with one row per row of `data` and two character
#'   columns named `<.col>a` and `<.col>b`, holding the first and the second
#'   character of each value. `NA` inputs give `NA` in both columns.
split_col <- function(.col, data) {
  # as.character() so that factor columns are split on their labels.
  values <- as.character(data[[.col]])

  # substr() is used rather than reshape::colsplit() because the latter was
  # reported to stop with "'names' attribute [2] must be the same length as
  # the vector [1]" on data containing NA; substr() simply propagates NA.
  parts <- data.frame(
    substr(values, 1, 1),
    substr(values, 2, 2),
    stringsAsFactors = FALSE
  )
  names(parts) <- paste0(.col, letters[1:2])
  parts
}

# Split every column of myd with split_col() and bind the pieces side by side.
new_data <- do.call(
  cbind,
  lapply(names(myd), function(col_name) split_col(col_name, data = myd))
)
# Turn each split column into a factor over the letters A, B, C, G, T,
# labelled 1 to 5 in that order.
new_data_2 <- do.call(
  data.frame,
  lapply(new_data, function(column) {
    factor(column, levels = c('A','B','C','G','T'), labels = 1:5)
  })
)

  M1a M1b M2ka M2kb M3la M3lb M4a M4b
1   1   1    1    4    1    1   3   3
2   2   2    4    4    5    5   3   5
3   1   2    1    1    1    1   3   5
4   2   2    4    4    5    5   3   5
5   2   2    4    4    1    5   3   3
library(reshape)

split_col

这里是另一个可能的解决方案。除了我觉得它很容易理解之外,它没有什么特别的优势:

# Add another test column; it contains letters (Q, W, K) that are not in the
# recode table below, so they end up as NA.
myd$M5 <- c("AB", "GT", "GA", "QW", "CK")

# Convert to a matrix for speed and simpler indexing.
mat <- as.matrix(myd)

# Interleave "<name>a" / "<name>b" to build the output column names.
new_names <- as.vector(
  rbind(paste0(colnames(mat), "a"), paste0(colnames(mat), "b"))
)

# Pre-allocate the character result matrix with the final column names.
newmat <- matrix(
  NA_character_,
  nrow = nrow(mat),
  ncol = 2 * ncol(mat),
  dimnames = list(NULL, new_names)
)

# Source column i fills output columns 2i - 1 (first letter) and 2i (second).
for (i in seq_len(ncol(mat))) {
  newmat[, 2 * i - 1] <- substr(mat[, i], 1, 1)
  newmat[, 2 * i] <- substr(mat[, i], 2, 2)
}

# Recode letters through a named lookup vector; unknown letters become NA.
# Assigning through newmat[] keeps the matrix shape and its character type.
recode <- c(A = 1, B = 2, C = 3, G = 4, T = 5)
newmat[] <- recode[newmat]

newmat
#      M1a M1b M2ka M2kb M3la M3lb M4a M4b M5a M5b
# [1,] "1" "1" "1"  "4"  "1"  "1"  "3" "3" "1" "2"
# [2,] "2" "2" "4"  "4"  "5"  "5"  "3" "5" "4" "5"
# [3,] "1" "2" "1"  "1"  "1"  "1"  "3" "5" "4" "1"
# [4,] "2" "2" "4"  "4"  "5"  "5"  "3" "5" NA  NA 
# [5,] "2" "2" "4"  "4"  "1"  "5"  "3" "3" "3" NA 

下面是另一个可能的解决方案,除了我发现它很容易遵循之外,没有什么特别的优势:

# Add another test column; it contains letters (Q, W, K) that are not in the
# recode table below, so they end up as NA.
myd$M5 <- c("AB", "GT", "GA", "QW", "CK")

# Convert to a matrix for speed and simpler indexing.
mat <- as.matrix(myd)

# Interleave "<name>a" / "<name>b" to build the output column names.
new_names <- as.vector(
  rbind(paste0(colnames(mat), "a"), paste0(colnames(mat), "b"))
)

# Pre-allocate the character result matrix with the final column names.
newmat <- matrix(
  NA_character_,
  nrow = nrow(mat),
  ncol = 2 * ncol(mat),
  dimnames = list(NULL, new_names)
)

# Source column i fills output columns 2i - 1 (first letter) and 2i (second).
for (i in seq_len(ncol(mat))) {
  newmat[, 2 * i - 1] <- substr(mat[, i], 1, 1)
  newmat[, 2 * i] <- substr(mat[, i], 2, 2)
}

# Recode letters through a named lookup vector; unknown letters become NA.
# Assigning through newmat[] keeps the matrix shape and its character type.
recode <- c(A = 1, B = 2, C = 3, G = 4, T = 5)
newmat[] <- recode[newmat]

newmat
#      M1a M1b M2ka M2kb M3la M3lb M4a M4b M5a M5b
# [1,] "1" "1" "1"  "4"  "1"  "1"  "3" "3" "1" "2"
# [2,] "2" "2" "4"  "4"  "5"  "5"  "3" "5" "4" "5"
# [3,] "1" "2" "1"  "1"  "1"  "1"  "3" "5" "4" "1"
# [4,] "2" "2" "4"  "4"  "5"  "5"  "3" "5" NA  NA 
# [5,] "2" "2" "4"  "4"  "1"  "5"  "3" "3" "3" NA 

mnel 已经给出了一个非常直截了当的回答。下面我用的是我在 GitHub 上正在开发的包(qdap),它目前还没有发布到 CRAN:

安装qdap

# install.packages("devtools")
library(devtools)
# Install qdap from GitHub. The repository is given as "owner/repo"; the older
# two-argument form install_github("qdap", "trinker") no longer works because
# the second positional argument is now interpreted as a git ref.
install_github("trinker/qdap")
解决问题:

# Split each original column of myd into two columns, rewriting the global myd
# on every iteration via <<-. seq_along(myd) is evaluated once, up front, so i
# runs over the original column count; the index i+i-1 is used both to pick the
# column to split and to look up its name.
# NOTE(review): this presumably relies on qdap's colsplit2df() replacing the
# chosen column in place with the two new columns, so that original column i
# sits at position 2i-1 -- confirm against the qdap documentation.
# At top level the list returned by lapply() is also auto-printed.
lapply(seq_along(myd),  function(i){
    myd <<- colsplit2df(myd, (i+i-1), paste0(names(myd)[i+i-1], 
        letters[1:2]), sep="")
})

# Recode the letters A, B, C, G, T as the numbers 1 to 5, column by column.
# NOTE(review): presumably text2color() maps each element matching the second
# argument to the corresponding entry of the third, with the trailing NA used
# for anything unmatched -- confirm against the qdap documentation.
# apply() works on a matrix copy of myd; data.frame() turns the result back
# into a data frame.
data.frame(apply(myd, 2, function(x) as.numeric(text2color(x, 
    c("A", "B", "C", "G", "T"), c(1:5, NA)))))
lapply(seq_along(myd), function(i){

myd

mnel 已经给出了一个非常直截了当的答案。下面我用的是我在 GitHub 上正在开发的包(qdap),它目前还没有发布到 CRAN:

安装qdap

# install.packages("devtools")
library(devtools)
# Install qdap from GitHub. The repository is given as "owner/repo"; the older
# two-argument form install_github("qdap", "trinker") no longer works because
# the second positional argument is now interpreted as a git ref.
install_github("trinker/qdap")
解决问题:

# Split each original column of myd into two columns, rewriting the global myd
# on every iteration via <<-. seq_along(myd) is evaluated once, up front, so i
# runs over the original column count; the index i+i-1 is used both to pick the
# column to split and to look up its name.
# NOTE(review): this presumably relies on qdap's colsplit2df() replacing the
# chosen column in place with the two new columns, so that original column i
# sits at position 2i-1 -- confirm against the qdap documentation.
# At top level the list returned by lapply() is also auto-printed.
lapply(seq_along(myd),  function(i){
    myd <<- colsplit2df(myd, (i+i-1), paste0(names(myd)[i+i-1], 
        letters[1:2]), sep="")
})

# Recode the letters A, B, C, G, T as the numbers 1 to 5, column by column.
# NOTE(review): presumably text2color() maps each element matching the second
# argument to the corresponding entry of the third, with the trailing NA used
# for anything unmatched -- confirm against the qdap documentation.
# apply() works on a matrix copy of myd; data.frame() turns the result back
# into a data frame.
data.frame(apply(myd, 2, function(x) as.numeric(text2color(x, 
    c("A", "B", "C", "G", "T"), c(1:5, NA)))))
lapply(seq_along(myd), function(i){

myd

使用 qdap 的更稳定的解决方案:

# Split all columns of myd in a single colsplit2df() call, using the empty
# string as separator. seq_len() is used so a zero-column myd yields an empty
# selection instead of the c(1, 0) that 1:ncol(myd) would produce.
x <- colsplit2df(myd, seq_len(ncol(myd)), sep = "")
# Name the result "<column>.a", "<column>.b" for each original column.
# NOTE(review): the sample output shown below this snippet has names without
# the dot (M1a, M1b, ...) and already-recoded values -- confirm which naming
# is intended.
colnames(x) <- paste0(rep(colnames(myd), each = 2), ".", letters[1:2])

##   M1a M1b M2ka M2kb M3la M3lb M4a M4b
## 1   1   1    1    4    1    1   3   3
## 2   2   2    4    4    5    5   3   5
## 3   1   2    1    1    1    1   3   5
## 4   2   2    4    4    5    5   3   5
## 5   2   2    4    4    1    5   3   3

x

使用 qdap 的更稳定的解决方案:

# Split all columns of myd in a single colsplit2df() call, using the empty
# string as separator. seq_len() is used so a zero-column myd yields an empty
# selection instead of the c(1, 0) that 1:ncol(myd) would produce.
x <- colsplit2df(myd, seq_len(ncol(myd)), sep = "")
# Name the result "<column>.a", "<column>.b" for each original column.
# NOTE(review): the sample output shown below this snippet has names without
# the dot (M1a, M1b, ...) and already-recoded values -- confirm which naming
# is intended.
colnames(x) <- paste0(rep(colnames(myd), each = 2), ".", letters[1:2])

##   M1a M1b M2ka M2kb M3la M3lb M4a M4b
## 1   1   1    1    4    1    1   3   3
## 2   2   2    4    4    5    5   3   5
## 3   1   2    1    1    1    1   3   5
## 4   2   2    4    4    5    5   3   5
## 5   2   2    4    4    1    5   3   3

x

感谢您的回答。它适用于这个小示例,但当我在更大的示例中尝试时,它只把变量拆分并命名到某个位置(例如某个变量),然后就停止了!错误信息是:“'names' 属性 [2] 必须与向量 [1] 长度相同”。我发现问题的原因在数据集中。我需要对此进行优化,以便 NA 或……

感谢您的回答。它适用于这个小示例,但当我在更大的示例中尝试时,它只把变量拆分并命名到某个位置(例如某个变量),然后就停止了!错误信息是:“'names' 属性 [2] 必须与向量 [1] 的长度相同”。我发现问题的原因在数据集中。我需要对此进行优化,以便 NA 或……