R 如何将观察值的存在或不存在转换为具有此格式的二进制事件计数的矩阵?
标签:r, matrix, binary
我正在尝试将遗传输入程序的输入转换为不同的格式,以便在下游分析中使用它。输入的一个玩具示例如下:
# Toy genotype table: two allele calls (A1, A2) per individual; the
# individual identifiers are stored as row names.
input <- data.frame(
  A1 = c("a", "a", "b"),
  A2 = c("b", "a", "b"),
  stringsAsFactors = FALSE
)
rownames(input) <- paste0("ind", seq_len(nrow(input)))
A1 A2
ind1 a b
ind2 a a
ind3 b b
在玩具示例中,个体ID(行名称)位于ID列中,A1为等位基因1(allele1)列,A2为等位基因2(allele2)列。预期输出如下:
FAM_308 FAM_308 FAM_2235 FAM_2235 882_cas326 882_cas326 851_cas295 851_cas295
01:01 0 0 0 0 1 0 0 0
02:01 0 0 0 0 1 0 1 0
03:01 0 0 0 1 0 0 1 0
26:01 1 0 0 0 0 0 0 0
29:02 1 0 0 0 0 0 0 0
678_cas122 678_cas122
01:01 0 0
02:01 0 1
03:01 0 0
26:01 0 0
29:02 0 0
非常感谢您的贡献。
这是一个使用虚拟数据的解决方案,应该很容易改写以适用于真实数据。
library(dplyr)

# Toy genotypes: one row per individual (In) with two allele calls (A1, A2).
A1 <- c("a", "a", "b")
A2 <- c("b", "a", "b")
In <- c("ind1", "ind2", "ind3")
alleles <- data.frame(In, A1, A2)

# Stack the table twice so every individual gets two indicator slots:
# Index "1" is set when exactly one of the two calls equals the allele and
# Index "2" when both calls do. Counting matches (instead of listing each
# A1/A2 combination) also handles a reversed pair such as A1 == "b",
# A2 == "a".
result <-
  bind_rows(alleles, alleles, .id = "Index") %>%
  arrange(In) %>%
  mutate(
    a = as.numeric((A1 == "a") + (A2 == "a") == as.integer(Index)),
    b = as.numeric((A1 == "b") + (A2 == "b") == as.integer(Index))
  )

# Long-format table with the future column label next to the indicators.
reshaped <- result %>%
  mutate(new_name = paste(In, Index, sep = "_")) %>%
  select(new_name, a, b)

# Transpose only the numeric indicator columns: transposing the label column
# together with them would coerce the whole matrix to character ("1"/"0").
final <- t(as.matrix(reshaped[, c("a", "b")]))
colnames(final) <- reshaped$new_name
final
#   ind1_1 ind1_2 ind2_1 ind2_2 ind3_1 ind3_2
# a      1      0      0      1      0      0
# b      1      0      0      0      0      1
使用 base R,我们可以先得到所有唯一的观测值。对于每行中的每个观察,我们根据条件返回输出;然后将所有结果绑定在一起,并指定列名和行名。首先在 input 上执行该操作,然后在共享的数据上执行。
# Distinct allele labels found anywhere in the table, in order of appearance.
unique_vals <- unique(unlist(input))

# Two column labels per individual, interleaved: "<id>_1", "<id>_2", ...
cols <- as.vector(
  vapply(rownames(input), paste0, character(2), c("_1", "_2"),
         USE.NAMES = FALSE)
)

# One output row per allele. Each individual contributes a pair of values:
# (0, 1) when both calls equal the allele, (1, 0) when only one does, and
# (0, 0) when neither does.
output <- do.call(
  rbind.data.frame,
  lapply(unique_vals, function(allele) {
    pairs <- apply(input, 1, function(genotype) {
      is_match <- genotype == allele
      if (all(is_match)) {
        c(0, 1)
      } else if (any(is_match)) {
        c(1, 0)
      } else {
        c(0, 0)
      }
    })
    # Flatten the 2 x n matrix column by column into one row of values.
    c(pairs)
  })
)
names(output) <- cols
rownames(output) <- unique_vals
output
# ind1_1 ind1_2 ind2_1 ind2_2 ind3_1 ind3_2
#a 1 0 0 1 0 0
#b 1 0 0 0 0 1
使用相同名称的列是不好的做法,因此在列名中添加“_1”和“_2”。
其中 df 为:
# Shared example data: one row per individual with two allele calls at one
# locus, plus the prob and matching values that came with each call.
df <- data.frame(
  ID = c("FAM_308", "FAM_2235", "882_cas326", "851_cas295", "678_cas122"),
  locus = rep("HLAA", 5),
  allele1 = c("26:01", "03:01", "01:01", "02:01", "02:01"),
  allele2 = c("29:02", "03:01", "02:01", "03:01", "02:01"),
  prob = c(0.9805655, 0.9917792, 0.8891524, 0.9468442, 0.9643058),
  matching = c(0.0006153191, 0.0043972647, 0.0001758429, 0.0002267387,
               0.0004104801),
  row.names = c("397", "677", "274", "246", "95"),
  stringsAsFactors = FALSE
)
df
难道你不能创建一个新的虚拟列来检查 input$A1 == input$A2 吗?这里你实际上也有两种类型的观察(A1、A2)?
我无法理解你的玩具示例 input 和原始输入之间的关系。实际数据中的 A1、A2 列是什么?您还可以显示共享数据的预期输出吗?
@heck1 否。我需要为另一个程序创建一个输出文件,该程序需要我已经解释过的特定格式。
@tom A1 和 A2 只是列的名称,它们表示观察值 1 和 2。在玩具示例中,两种类型的观察值是“a”和“b”。
它运行得非常好,而且只使用了 base R,非常感谢您的帮助!
library(dplyr)

# Toy genotypes: one row per individual (In) with two allele calls (A1, A2).
A1 <- c("a", "a", "b")
A2 <- c("b", "a", "b")
In <- c("ind1", "ind2", "ind3")
alleles <- data.frame(In, A1, A2)

# Stack the table twice so every individual gets two indicator slots:
# Index "1" is set when exactly one of the two calls equals the allele and
# Index "2" when both calls do. Counting matches (instead of listing each
# A1/A2 combination) also handles a reversed pair such as A1 == "b",
# A2 == "a".
result <-
  bind_rows(alleles, alleles, .id = "Index") %>%
  arrange(In) %>%
  mutate(
    a = as.numeric((A1 == "a") + (A2 == "a") == as.integer(Index)),
    b = as.numeric((A1 == "b") + (A2 == "b") == as.integer(Index))
  )

# Long-format table with the future column label next to the indicators.
reshaped <- result %>%
  mutate(new_name = paste(In, Index, sep = "_")) %>%
  select(new_name, a, b)

# Transpose only the numeric indicator columns: transposing the label column
# together with them would coerce the whole matrix to character ("1"/"0").
final <- t(as.matrix(reshaped[, c("a", "b")]))
colnames(final) <- reshaped$new_name
final
#   ind1_1 ind1_2 ind2_1 ind2_2 ind3_1 ind3_2
# a      1      0      0      1      0      0
# b      1      0      0      0      0      1
library(dplyr)
library(tidyr)

# Example genotypes: two allele calls per individual.
ID <- c("FAM_308", "FAM_2235", "882_cas326", "851_cas295", "678_cas122")
allele1 <- c("26:01", "03:01", "01:01", "02:01", "02:01")
allele2 <- c("29:02", "03:01", "02:01", "03:01", "02:01")

# Wide table, sorted by individual and then by allele calls.
DD <- arrange(
  data.frame(ID, allele1, allele2, stringsAsFactors = FALSE),
  ID, allele1, allele2
)

# Long table: one row per (individual, allele column) with the call in Value.
# NOTE: gather() is superseded by tidyr::pivot_longer(); it is kept here
# because the two return their rows in a different order.
DD_long <- DD %>% gather(Allele, Value, -ID)

# Row labels (distinct alleles) and individuals for the indicator matrix.
all_rows <- unique(DD_long[["Value"]])
all_cols <- unique(DD_long[["ID"]])

# All-zero matrix: one row per allele, two columns per individual, laid out
# as every "<ID>_1" column followed by every "<ID>_2" column.
mm <- matrix(
  0,
  nrow = length(all_rows),
  ncol = 2 * length(all_cols),
  dimnames = list(
    all_rows,
    c(paste0(all_cols, "_1"), paste0(all_cols, "_2"))
  )
)
# Set to 1 every cell of `mat` whose individual carries allele `row`.
#
# Arguments:
#   row:  allele label; must be one of rownames(mat).
#   mat:  indicator matrix with columns named "<ID>_<n>".
#   data: long-format table with columns ID, Allele ("allele<n>") and Value.
#         Defaults to the script-level DD_long, so existing two-argument
#         calls keep working; pass it explicitly to avoid the hidden global.
#
# Returns `mat` with the matching cells set to 1. The slot that is filled
# only reflects which input column the call came from; whether both calls of
# an individual match is not tracked here.
fill_row <- function(row, mat, data = DD_long) {
  # which() drops NA comparisons, so missing calls never select a row.
  hits <- which(data[["Value"]] == row)
  # "allele1" -> "<ID>_1", "allele2" -> "<ID>_2"
  x <- paste(
    data[["ID"]][hits],
    gsub("allele", "", data[["Allele"]][hits]),
    sep = "_"
  )
  cat("found allele ", row, "in individual ", x, "\n\n")
  mat[row, x] <- 1
  mat
}
# Mark the carriers of each allele in turn; mm accumulates the result.
for (i in seq_len(length(all_rows))) {
  mm <- fill_row(row = all_rows[[i]], mat = mm)
}
# Re-encode one allele/individual cell pair by how many calls matched.
#
# Arguments:
#   row: allele label; must be one of rownames(mat).
#   col: individual ID; `mat` must have columns "<col>_1" and "<col>_2".
#   mat: indicator matrix as filled by fill_row().
#
# Returns `mat` where the pair ("<col>_1", "<col>_2") in `row` is set to
#   (1, 0) if exactly one of the two cells was 1,
#   (0, 1) if both cells were 1,
# and is left untouched otherwise.
#
# The two columns are addressed by their exact names. Matching the ID as a
# regular expression would also pick up longer IDs that merely contain it,
# and the lookup must use `mat` itself rather than a matrix from the calling
# environment.
reorganize_row <- function(row, col, mat) {
  pair <- paste0(col, c("_1", "_2"))
  absent <- setdiff(pair, colnames(mat))
  if (length(absent) > 0) {
    stop(
      "column(s) not found in matrix: ", paste(absent, collapse = ", "),
      call. = FALSE
    )
  }
  copies <- sum(mat[row, pair])
  if (copies == 1) {
    mat[row, pair] <- c(1, 0)
  } else if (copies == 2) {
    mat[row, pair] <- c(0, 1)
  }
  mat
}
# Re-encode every allele (row) for every individual (column pair) of mm.
for (i in seq_len(length(all_rows))) {
  for (j in seq_len(length(all_cols))) {
    mm <- reorganize_row(row = all_rows[[i]], col = all_cols[[j]], mat = mm)
  }
}
# Arrange the matrix as in the example: rows sorted by allele label, columns
# in the order of ID with each individual's two slots next to each other.
nn <- mm[
  sort(rownames(mm)),
  as.vector(rbind(paste0(ID, "_1"), paste0(ID, "_2"))),
  drop = FALSE
]

# Strip only the trailing slot suffix. An unanchored pattern such as "_1|_2"
# also removes those characters inside an ID and turns "FAM_2235" into
# "FAM235".
colnames(nn) <- sub("_[12]$", "", colnames(nn))
nn
#       FAM_308 FAM_308 FAM_2235 FAM_2235 882_cas326 882_cas326 851_cas295 851_cas295 678_cas122 678_cas122
# 01:01       0       0        0        0          1          0          0          0          0          0
# 02:01       0       0        0        0          1          0          1          0          0          1
# 03:01       0       0        0        1          0          0          1          0          0          0
# 26:01       1       0        0        0          0          0          0          0          0          0
# 29:02       1       0        0        0          0          0          0          0          0          0
# Distinct allele labels found anywhere in the table, in order of appearance.
unique_vals <- unique(unlist(input))

# Two column labels per individual, interleaved: "<id>_1", "<id>_2", ...
cols <- as.vector(
  vapply(rownames(input), paste0, character(2), c("_1", "_2"),
         USE.NAMES = FALSE)
)

# One output row per allele. Each individual contributes a pair of values:
# (0, 1) when both calls equal the allele, (1, 0) when only one does, and
# (0, 0) when neither does.
output <- do.call(
  rbind.data.frame,
  lapply(unique_vals, function(allele) {
    pairs <- apply(input, 1, function(genotype) {
      is_match <- genotype == allele
      if (all(is_match)) {
        c(0, 1)
      } else if (any(is_match)) {
        c(1, 0)
      } else {
        c(0, 0)
      }
    })
    # Flatten the 2 x n matrix column by column into one row of values.
    c(pairs)
  })
)
names(output) <- cols
rownames(output) <- unique_vals
output
# ind1_1 ind1_2 ind2_1 ind2_2 ind3_1 ind3_2
#a 1 0 0 1 0 0
#b 1 0 0 0 0 1
# Columns of df that hold the two allele calls of each individual.
vals <- c("allele1", "allele2")

# Sorted distinct allele labels; one output row is built per label.
unique_vals <- sort(unique(unlist(df[vals])))

# Two column labels per individual, interleaved: "<ID>_1", "<ID>_2", ...
cols <- c(t(outer(df$ID, c("_1", "_2"), paste0)))

# Each individual contributes a pair of values per allele: (0, 1) when both
# calls equal the allele, (1, 0) when only one does, (0, 0) when neither.
output <- do.call(
  rbind.data.frame,
  lapply(unique_vals, function(x) {
    c(apply(df[vals], 1, function(y) {
      if (all(y == x)) {
        c(0, 1)
      } else if (any(y == x)) {
        c(1, 0)
      } else {
        c(0, 0)
      }
    }))
  })
)
names(output) <- cols
# The list passed to rbind.data.frame() is unnamed, so the allele labels have
# to be attached explicitly for the rows to be labelled by allele.
rownames(output) <- unique_vals
output
# FAM_308_1 FAM_308_2 FAM_2235_1 FAM_2235_2 882_cas326_1 882_cas326_2
#01:01 0 0 0 0 1 0
#02:01 0 0 0 0 1 0
#03:01 0 0 0 1 0 0
#26:01 1 0 0 0 0 0
#29:02 1 0 0 0 0 0
# 851_cas295_1 851_cas295_2 678_cas122_1 678_cas122_2
#01:01 0 0 0 0
#02:01 1 0 0 1
#03:01 1 0 0 0
#26:01 0 0 0 0
#29:02 0 0 0 0
# Shared example data: one row per individual with two allele calls at one
# locus, plus the prob and matching values that came with each call.
df <- data.frame(
  ID = c("FAM_308", "FAM_2235", "882_cas326", "851_cas295", "678_cas122"),
  locus = rep("HLAA", 5),
  allele1 = c("26:01", "03:01", "01:01", "02:01", "02:01"),
  allele2 = c("29:02", "03:01", "02:01", "03:01", "02:01"),
  prob = c(0.9805655, 0.9917792, 0.8891524, 0.9468442, 0.9643058),
  matching = c(0.0006153191, 0.0043972647, 0.0001758429, 0.0002267387,
               0.0004104801),
  row.names = c("397", "677", "274", "246", "95"),
  stringsAsFactors = FALSE
)