如何匹配2个dataframe列并提取列值和列名?
我有一个叫做mymat的矩阵。我有一个向量叫做如何匹配2个dataframe列并提取列值和列名?,r,data.table,R,Data.table,我有一个叫做mymat的矩阵。我有一个向量叫做geno,我对dplyr函数不太熟悉。您可以尝试基本R合并功能: mm <- merge(key.table,mymat,by="key",all.x=T) mm mm不完全确定您想要什么,但可能接近于此: library(reshape2) mymat <- structure( c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118",
geno,我对dplyr函数不太熟悉。您可以尝试基本R合并功能:
# Left join of the genotype matrix onto the key table by the shared "key"
# column: all.x = TRUE keeps every row of 'key.table', and keys that have no
# match in 'mymat' get NA in the genotype columns.
# TRUE is spelled out because the shorthand T is an ordinary variable that
# can be reassigned.
mm <- merge(key.table, mymat, by = "key", all.x = TRUE)
mm
不完全确定您想要什么,但可能接近于此:
library(reshape2)

# 5 x 4 character matrix: a "key" column followed by three genotype columns.
mymat <- structure(
  c(
    "chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118",
    "0N", "0N", "1N", "0N", "0N",
    "00", "00", "00", "11", "10",
    "00", "00", "1N", "0N", "00"
  ),
  .Dim = c(5L, 4L),
  .Dimnames = list(
    c("34", "35", "36", "37", "38"),
    c("key", "AMLM12001KP", "AMAS-11.3-Diagnostic", "AMLM12014N-R")
  )
)

# 11 x 2 character matrix mapping each "key" to a "variantId".
key.table <- structure(
  c(
    "chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118",
    "chr5:12122", "chr5:12123", "chr5:12123", "chr5:12125", "chr5:12127",
    "chr5:12129",
    "9920068", "9920069", "9920070", "9920071", "9920072", "9920073",
    "9920074", "9920075", "9920076", "9920077", "9920078"
  ),
  .Dim = c(11L, 2L),
  .Dimnames = list(
    c("34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44"),
    c("key", "variantId")
  )
)

# Work with data frames. check.names = FALSE keeps the column names exactly
# as given; the default would rewrite "AMAS-11.3-Diagnostic" to
# "AMAS.11.3.Diagnostic" and "AMLM12014N-R" to "AMLM12014N.R".
mmdf <- data.frame(mymat, check.names = FALSE)
ktdf <- data.frame(key.table, check.names = FALSE)

# Inner join on "key", then reshape to long format: one row per
# (key, variantId, genotype column) combination.
tdf <- merge(mmdf, ktdf, by = "key")
mltdf <- melt(tdf, id.vars = c("key", "variantId"))

# Drop the "0N" and "00" calls. %in% never returns NA, so a missing value
# cannot turn into an all-NA row the way chained `!=` comparisons would.
mltdf1 <- mltdf[!(mltdf$value %in% c("0N", "00")), ]
mltdf1
使用 data.table,我会这样处理:
library(data.table)

# Convert the 'key.table' matrix to a data.table; keep.rownames = TRUE stores
# the matrix row names in a column called "rn".
kt <- as.data.table(key.table, keep.rownames = TRUE)

# Convert the 'mymat' matrix to a data.table and melt it into long format,
# then, step by step along the chain:
#   1. label the wanted genotype calls as "<column name> (<call>)" in 'val'
#      (rows that are not selected keep NA in 'val');
#   2. paste the non-NA labels together per (rn, key);
#   3. turn the empty string produced for groups without any label into NA.
# 'id.vars' is written in full instead of relying on partial matching of
# 'id', and NA_character_ matches the type of the character column 'val'.
mm <- melt(
  as.data.table(mymat, keep.rownames = TRUE),
  id.vars = c("rn", "key")
)[value %in% c("1N", "11", "10"), val := paste0(variable, " (", value, ")")
][, .(val = paste(val[!is.na(val)], collapse = ",")), by = .(rn, key)
][val == "", val := NA_character_]

# Update join: for the rows of 'kt' that match 'mm' on both "rn" and "key",
# copy 'val' into a new column 'matched'.
kt[mm, matched := val, on = c("rn", "key")]
说明:
所以我们只对“1N”、“11”和“10”这几个值感兴趣?这一点您没有说清楚。@MikeWise 是的,我们只想在单元格取这些值之一时才提取。那么期望的结果是什么样子的呢?@MikeWise 谢谢,但是您可以看到第 36 行 "chr5:12113" "9920070" AMLM12001KP (1N),AMLM12014N-R (1N)
有多个列名。只要匹配键所在的行在某一列中有 geno
值,我们就想提取所有这些列名。呃,不确定该注释的含义。很好地使用了 merge,
+1。谢谢,这正是我需要的。请帮我理解您是如何在这里使用 melt 函数的。看起来很有趣,我真的很想了解它。请您解释一下在这段代码中 val
、value
和 paste0(variable," (",value,")")
分别起什么作用。@MAPK 添加了一个解释并改进了答案。谢谢,非常感谢。
# Label a vector of genotype calls: every entry of `x` that is neither NA nor
# one of "00"/"0N" becomes "<y> (<x>)", using the matching entry of `y`;
# all other positions become NA.
get.geno <- function(x, y) {
  is_reported <- !x %in% c("00", "0N") & !is.na(x)
  labelled <- paste0(y, " (", x, ")")
  ifelse(is_reported, labelled, NA)
}

# Label columns 3 to 5 of 'mm' one row at a time, passing the names of those
# columns as `y`. apply() returns one column per input row, so the result is
# transposed back to one row per input row.
a <- t(apply(mm[, 3:5], 1, get.geno, colnames(mm)[3:5]))

# Collapse the non-NA labels of each row into one comma-separated string.
mm[["result"]] <- apply(
  a,
  1,
  function(row_labels) paste(row_labels[!is.na(row_labels)], collapse = ",")
)

# Show the table without columns 3 to 5.
mm[, -(3:5)]
key variantId result
1 chr5:12111 9920068
2 chr5:12111 9920068
3 chr5:12111 9920069
4 chr5:12111 9920069
5 chr5:12113 9920070 AMLM12001KP (1N),AMLM12014N-R (1N)
6 chr5:12114 9920071 AMAS-11.3-Diagnostic (11)
7 chr5:12118 9920072 AMAS-11.3-Diagnostic (10)
8 chr5:12122 9920073
9 chr5:12123 9920074
10 chr5:12123 9920075
11 chr5:12125 9920076
12 chr5:12127 9920077
13 chr5:12129 9920078
library(reshape2)

# 5 x 4 character matrix: a "key" column followed by three genotype columns.
mymat <- structure(
  c(
    "chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118",
    "0N", "0N", "1N", "0N", "0N",
    "00", "00", "00", "11", "10",
    "00", "00", "1N", "0N", "00"
  ),
  .Dim = c(5L, 4L),
  .Dimnames = list(
    c("34", "35", "36", "37", "38"),
    c("key", "AMLM12001KP", "AMAS-11.3-Diagnostic", "AMLM12014N-R")
  )
)

# 11 x 2 character matrix mapping each "key" to a "variantId".
key.table <- structure(
  c(
    "chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118",
    "chr5:12122", "chr5:12123", "chr5:12123", "chr5:12125", "chr5:12127",
    "chr5:12129",
    "9920068", "9920069", "9920070", "9920071", "9920072", "9920073",
    "9920074", "9920075", "9920076", "9920077", "9920078"
  ),
  .Dim = c(11L, 2L),
  .Dimnames = list(
    c("34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44"),
    c("key", "variantId")
  )
)

# Work with data frames. check.names = FALSE keeps the column names exactly
# as given; the default would rewrite "AMAS-11.3-Diagnostic" to
# "AMAS.11.3.Diagnostic" and "AMLM12014N-R" to "AMLM12014N.R".
mmdf <- data.frame(mymat, check.names = FALSE)
ktdf <- data.frame(key.table, check.names = FALSE)

# Inner join on "key", then reshape to long format: one row per
# (key, variantId, genotype column) combination.
tdf <- merge(mmdf, ktdf, by = "key")
mltdf <- melt(tdf, id.vars = c("key", "variantId"))

# Drop the "0N" and "00" calls. %in% never returns NA, so a missing value
# cannot turn into an all-NA row the way chained `!=` comparisons would.
mltdf1 <- mltdf[!(mltdf$value %in% c("0N", "00")), ]
mltdf1
# Printed result (the variable column now shows the original sample names):
#    key variantId variable value
# 5 chr5:12113 9920070 AMLM12001KP 1N
# 13 chr5:12114 9920071 AMAS-11.3-Diagnostic 11
# 14 chr5:12118 9920072 AMAS-11.3-Diagnostic 10
# 19 chr5:12113 9920070 AMLM12014N-R 1N
library(data.table)

# Convert the 'key.table' matrix to a data.table; keep.rownames = TRUE stores
# the matrix row names in a column called "rn".
kt <- as.data.table(key.table, keep.rownames = TRUE)

# Convert the 'mymat' matrix to a data.table and melt it into long format,
# then, step by step along the chain:
#   1. label the wanted genotype calls as "<column name> (<call>)" in 'val'
#      (rows that are not selected keep NA in 'val');
#   2. paste the non-NA labels together per (rn, key);
#   3. turn the empty string produced for groups without any label into NA.
# 'id.vars' is written in full instead of relying on partial matching of
# 'id', and NA_character_ matches the type of the character column 'val'.
mm <- melt(
  as.data.table(mymat, keep.rownames = TRUE),
  id.vars = c("rn", "key")
)[value %in% c("1N", "11", "10"), val := paste0(variable, " (", value, ")")
][, .(val = paste(val[!is.na(val)], collapse = ",")), by = .(rn, key)
][val == "", val := NA_character_]

# Update join: for the rows of 'kt' that match 'mm' on both "rn" and "key",
# copy 'val' into a new column 'matched'.
kt[mm, matched := val, on = c("rn", "key")]
> kt
rn key variantId matched
1: 34 chr5:12111 9920068 NA
2: 35 chr5:12111 9920069 NA
3: 36 chr5:12113 9920070 AMLM12001KP (1N),AMLM12014N-R (1N)
4: 37 chr5:12114 9920071 AMAS-11.3-Diagnostic (11)
5: 38 chr5:12118 9920072 AMAS-11.3-Diagnostic (10)
6: 39 chr5:12122 9920073 NA
7: 40 chr5:12123 9920074 NA
8: 41 chr5:12123 9920075 NA
9: 42 chr5:12125 9920076 NA
10: 43 chr5:12127 9920077 NA
11: 44 chr5:12129 9920078 NA