如何匹配2个dataframe列并提取列值和列名?

如何匹配2个dataframe列并提取列值和列名?,r,data.table,R,Data.table,我有一个叫做mymat的矩阵。我有一个向量叫做geno,我对dplyr函数不太熟悉。您可以尝试基本R合并功能: mm <- merge(key.table,mymat,by="key",all.x=T) mm mm不完全确定您想要什么,但可能接近于此: library(reshape2) mymat <- structure( c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118",

我有一个叫做mymat的矩阵。我有一个向量叫做
geno,我对dplyr函数不太熟悉。您可以尝试基本R合并功能:

# Left join on "key": all.x = TRUE keeps every row of key.table, including
# keys that have no matching row in mymat.
# TRUE is spelled out because the shorthand T is an ordinary variable that
# can be reassigned.
mm <- merge(key.table, mymat, by = "key", all.x = TRUE)
mm

不完全确定您想要什么,但可能接近于此:

library(reshape2)

# 5 x 4 character matrix, written column by column: the join key followed by
# three columns of two-character codes. Row names are "34".."38".
mymat <- cbind(
  key = c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114",
          "chr5:12118"),
  AMLM12001KP = c("0N", "0N", "1N", "0N", "0N"),
  "AMAS-11.3-Diagnostic" = c("00", "00", "00", "11", "10"),
  "AMLM12014N-R" = c("00", "00", "1N", "0N", "00"))
rownames(mymat) <- as.character(34:38)

# 11 x 2 character matrix mapping each key to a variantId. Row names are
# "34".."44".
key.table <- cbind(
  key = c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114",
          "chr5:12118", "chr5:12122", "chr5:12123", "chr5:12123",
          "chr5:12125", "chr5:12127", "chr5:12129"),
  variantId = c("9920068", "9920069", "9920070", "9920071", "9920072",
                "9920073", "9920074", "9920075", "9920076", "9920077",
                "9920078"))
rownames(key.table) <- as.character(34:44)

# Work with data frames from here on.
# Note: data.frame() rewrites the non-syntactic column names
# ("AMAS-11.3-Diagnostic" becomes "AMAS.11.3.Diagnostic").
mmdf <- data.frame(mymat)
ktdf <- data.frame(key.table)

# Inner join on "key", then reshape to long format: one row per
# (key, variantId, original column) combination.
tdf <- merge(mmdf, ktdf, by = "key")
mltdf <- melt(tdf, id.vars = c("key", "variantId"))

# Keep the rows whose value is neither "0N" nor "00".
mltdf1 <- mltdf[!(mltdf$value == "0N" | mltdf$value == "00"), ]

mltdf1
使用 data.table,我会这样处理:

library(data.table)

# Genotype codes that should be reported.
wanted <- c("1N", "11", "10")

# Convert the 'key.table' matrix to a data.table; keep.rownames = TRUE stores
# the matrix row names in a column called 'rn'.
kt <- as.data.table(key.table, keep.rownames = TRUE)

# Convert the 'mymat' matrix the same way and melt it into long format:
# one row per (rn, key, original column). 'id.vars' is spelled out in full
# instead of relying on partial matching of 'id'.
long <- melt(as.data.table(mymat, keep.rownames = TRUE),
             id.vars = c("rn", "key"))

# Build "<column> (<value>)" labels for the wanted codes only; every other
# row keeps val = NA.
long[value %in% wanted, val := paste0(variable, " (", value, ")")]

# Collapse the labels of each original row into one comma-separated string.
mm <- long[, .(val = paste(val[!is.na(val)], collapse = ",")),
           by = .(rn, key)]

# Rows without any wanted code collapse to ""; turn those into a character NA
# (typed explicitly so the column is not coerced from a logical NA).
mm[val == "", val := NA_character_]

# Update join: add 'matched' to kt by reference, taking 'val' from the mm row
# that agrees on both 'rn' and 'key'.
kt[mm, matched := val, on = c("rn", "key")]

说明:


  • kt所以我们只对“1N”、“11”和“10”的值感兴趣?您还不清楚。@MikeWise是的,我们只想在单元格具有这些值中的任何一个的情况下提取。那么它是什么样子的呢?@MikeWise谢谢,但是您可以看到这一行
    36 "chr5:12113" "9920070" AMLM12001KP (1N),AMLM12014N-R (1N)
    有多个colname。我们只对提取所有列名感兴趣,前提是匹配键的行在列中有一个
    geno
    值。呃,不确定该注释的含义。很好地使用了
    merge
    +1谢谢,这正是我需要的。请帮我理解您是如何在这里使用melt函数的。看起来很有趣,我真的很想了解它。请您解释一下如何在上下文中放置
    val
    、和
    paste0(variable," (",value,")")
    。@MAPK添加了一个解释并改进了答案。谢谢,非常感谢。
    # Label genotype codes with the name of the column they came from.
    #
    # x:       character vector of genotype codes.
    # y:       labels paired with x by position (typically the column names).
    # exclude: codes that get no label; defaults to "00" and "0N".
    #
    # Returns, for non-empty x, a character vector of the same length:
    # "<y> (<x>)" where x is neither NA nor in 'exclude', NA elsewhere.
    get.geno <- function(x, y, exclude = c("00", "0N")) {
      keep <- !(x %in% exclude) & !is.na(x)
      # NA_character_ rather than a bare (logical) NA keeps the result a
      # character vector even when no element is kept.
      ifelse(keep, paste0(y, " (", x, ")"), NA_character_)
    }
    # Columns 3 to 5 of the merged table 'mm' are passed, row by row, to
    # get.geno together with their column names. apply() returns one column
    # per input row, so the result is transposed back to one row per row.
    geno.idx <- 3:5
    a <- t(apply(mm[, geno.idx], 1, get.geno, colnames(mm)[geno.idx]))

    # Join the non-NA labels of each row into a single comma-separated string
    # (an empty string when a row has none).
    mm$result <- apply(a, 1, function(labels) {
      kept <- labels[!is.na(labels)]
      paste(kept, collapse = ",")
    })

    # Show everything except columns 3 to 5.
    mm[, -geno.idx]
              key   variantId                           result
    1  chr5:12111   9920068                                   
    2  chr5:12111   9920068                                   
    3  chr5:12111   9920069                                   
    4  chr5:12111   9920069                                   
    5  chr5:12113   9920070 AMLM12001KP (1N),AMLM12014N-R (1N)
    6  chr5:12114   9920071          AMAS-11.3-Diagnostic (11)
    7  chr5:12118   9920072          AMAS-11.3-Diagnostic (10)
    8  chr5:12122   9920073                                   
    9  chr5:12123   9920074                                   
    10 chr5:12123   9920075                                   
    11 chr5:12125   9920076                                   
    12 chr5:12127   9920077                                   
    13 chr5:12129   9920078    
    
    library(reshape2)

    # 5 x 4 character matrix (filled column-wise): the join key followed by
    # three columns of two-character codes.
    mymat <- matrix(
      c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114", "chr5:12118",
        "0N", "0N", "1N", "0N", "0N",
        "00", "00", "00", "11", "10",
        "00", "00", "1N", "0N", "00"),
      nrow = 5L, ncol = 4L,
      dimnames = list(
        c("34", "35", "36", "37", "38"),
        c("key", "AMLM12001KP", "AMAS-11.3-Diagnostic", "AMLM12014N-R")))

    # 11 x 2 character matrix (filled column-wise): key and its variantId.
    key.table <- matrix(
      c("chr5:12111", "chr5:12111", "chr5:12113", "chr5:12114",
        "chr5:12118", "chr5:12122", "chr5:12123", "chr5:12123",
        "chr5:12125", "chr5:12127", "chr5:12129",
        "9920068", "9920069", "9920070", "9920071", "9920072", "9920073",
        "9920074", "9920075", "9920076", "9920077", "9920078"),
      nrow = 11L, ncol = 2L,
      dimnames = list(
        c("34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44"),
        c("key", "variantId")))

    # Switch to data frames. data.frame() makes the column names syntactic,
    # which is why the output shows "AMAS.11.3.Diagnostic".
    mmdf <- data.frame(mymat)
    ktdf <- data.frame(key.table)

    # Join on "key" and reshape to long format, keeping key and variantId as
    # identifier columns.
    tdf <- merge(mmdf, ktdf, by = "key")
    mltdf <- melt(tdf, id.vars = c("key", "variantId"))

    # Drop the rows whose value is "00" or "0N".
    is.hit <- mltdf$value != "00" & mltdf$value != "0N"
    mltdf1 <- mltdf[is.hit, ]

    mltdf1
    
              key variantId             variable value
    5  chr5:12113   9920070          AMLM12001KP    1N
    13 chr5:12114   9920071 AMAS.11.3.Diagnostic    11
    14 chr5:12118   9920072 AMAS.11.3.Diagnostic    10
    19 chr5:12113   9920070         AMLM12014N.R    1N
    
    library(data.table)

    # Convert the 'key.table' matrix to a data.table; the matrix row names
    # end up in the column 'rn'.
    kt <- as.data.table(key.table, keep.rownames = TRUE)

    # Convert the 'mymat' matrix to a data.table and melt it into long format.
    # 'id.vars' is written out in full rather than partially matched as 'id'.
    mm <- melt(as.data.table(mymat, keep.rownames = TRUE),
               id.vars = c("rn", "key"))

    # Filter on the needed geno-types and label them as "<column> (<value>)";
    # rows with any other value keep val = NA.
    mm[value %in% c("1N", "11", "10"),
       val := paste0(variable, " (", value, ")")]

    # Paste the labels of each original row together into the requested
    # format; rows without a label yield "".
    mm <- mm[, .(val = paste(val[!is.na(val)], collapse = ",")),
             by = .(rn, key)]

    # Replace the empty strings with an explicitly typed character NA.
    mm[val == "", val := NA_character_]

    # Join the 'mm' and 'kt' data.tables: 'matched' is added to kt by
    # reference for rows that agree on 'rn' and 'key'.
    kt[mm, matched := val, on = c("rn", "key")]
    
    > kt
        rn        key variantId                            matched
     1: 34 chr5:12111   9920068                                 NA
     2: 35 chr5:12111   9920069                                 NA
     3: 36 chr5:12113   9920070 AMLM12001KP (1N),AMLM12014N-R (1N)
     4: 37 chr5:12114   9920071          AMAS-11.3-Diagnostic (11)
     5: 38 chr5:12118   9920072          AMAS-11.3-Diagnostic (10)
     6: 39 chr5:12122   9920073                                 NA
     7: 40 chr5:12123   9920074                                 NA
     8: 41 chr5:12123   9920075                                 NA
     9: 42 chr5:12125   9920076                                 NA
    10: 43 chr5:12127   9920077                                 NA
    11: 44 chr5:12129   9920078                                 NA