如何在 R 中使用 if 条件创建满足不同要求的新列?
标签:r、if-statement
我有一大组数据,如下所示:
Name SNP.x ILMN.Strand.x Customer.Strand.x SNP.y ILMN.Strand.y Customer.Strand.y
exm-rs10128711 [T/C] BOT BOT [T/C] BOT BOT
exm-rs10134944 [A/G] TOP BOT NA NA NA
exm-rs10218696 NA NA NA [T/C] BOT TOP
exm-rs10223421 [A/C] TOP BOT NA NA NA
如何创建新列“SNP”、“ILMN.Strand”、“Customer.Strand”,其中:
非常感谢!:)
我假设:如果 SNP.x 和 SNP.y 都是 NA,则该行将从数据框中删除;如果 SNP.x != SNP.y(假如会出现这种情况),该行同样会被删除。
下面的代码并不优雅,效率也不高,但应该可以完成这项工作:
# Reduce every row of `df` to a single (Name, SNP, ILMN.Strand, Customer.Strand)
# record, or to NULL when the row has to be dropped.
tmp <- apply(df, 1, function(row) {
  snp_x <- row["SNP.x"]
  snp_y <- row["SNP.y"]

  # Build the one-row record from the columns carrying the given suffix.
  as_record <- function(suffix) {
    data.frame(
      Name = row["Name"],
      SNP = row[paste0("SNP.", suffix)],
      ILMN.Strand = row[paste0("ILMN.Strand.", suffix)],
      Customer.Strand = row[paste0("Customer.Strand.", suffix)]
    )
  }

  agree <- snp_x == snp_y
  if (!is.na(agree)) {
    # Both SNPs are present: keep the x side when they match, drop otherwise.
    if (agree) {
      return(as_record("x"))
    }
    return(NULL)
  }
  if (is.na(snp_x)) {
    # x is missing: fall back to y unless y is missing as well.
    if (is.na(snp_y)) {
      return(NULL)
    }
    return(as_record("y"))
  }
  if (is.na(snp_y)) {
    # y is missing and x is present: keep x.
    return(as_record("x"))
  }
  NULL
})
# Stack the surviving per-row records into one data frame.
result <- do.call(rbind, Filter(Negate(is.null), tmp))
编辑:这可能是更好的解决方案(在 R 3.2.4 和 dplyr 0.5.0 下测试),因为 apply() 会把数据框强制转换为矩阵等。如果 SNP.x != SNP.y 且两者都不是 NA,下面的解决方案会返回 ".."。希望这能有所帮助,不过在没有更多数据信息的情况下,很难预测您可能会遇到哪些问题。
在这个解决方案中,因子会被强制转换为字符,请记住这一点。
# Helper implementing the merge rule for a single row (scalar inputs).
#
# Arguments:
#   a, b       - the two values that are compared (stand-ins for SNP.x / SNP.y)
#   retA, retB - the values handed back when the "a" side / "b" side wins
#
# Returns:
#   NA    when both a and b are NA
#   retB  when only a is NA
#   retA  when only b is NA, or when a == b
#   ".."  when a and b are both present but differ
#
# Factors are coerced to character first. This matters for a and b as well as
# for retA and retB: `==` on two factors with different level sets raises
# "level sets of factors are different".
logicalTest <- function(a, b, retA, retB) {
  if (is.factor(a)) a <- as.character(a)
  if (is.factor(b)) b <- as.character(b)
  if (is.factor(retA)) retA <- as.character(retA)
  if (is.factor(retB)) retB <- as.character(retB)

  same <- a == b
  if (is.na(same)) {
    # An NA comparison means at least one of a / b is NA.
    if (is.na(a) && is.na(b)) {
      return(NA)
    }
    if (is.na(a)) {
      return(retB)
    }
    return(retA)
  }
  if (same) retA else ".."
}
# load dplyr for the bit below
library(dplyr)
# Within each Name group, collapse the paired .x / .y columns with logicalTest().
# The SNP.x / SNP.y comparison decides which side every output column takes.
result <- transmute(
  group_by(df, Name),
  SNP = logicalTest(SNP.x, SNP.y, SNP.x, SNP.y),
  ILMN.Strand = logicalTest(SNP.x, SNP.y, ILMN.Strand.x, ILMN.Strand.y),
  Customer.Strand = logicalTest(SNP.x, SNP.y, Customer.Strand.x, Customer.Strand.y)
)
# Cleaned views of the result
result[rowSums(is.na(result)) == 0, ]  # keep only rows without any NA
result[rowSums(is.na(result)) == 0 & !(result$SNP == ".."), ]  # additionally drop the ".." rows
#这是逻辑的辅助函数
#a和b将被测试;将返回retA、retB、NA或“…”(见下文)
logicalTest关于数据表
我不确定某些情况下的逻辑应该如何处理(例如 SNP.x != SNP.y,或两者都是 NA),但您可以自行修改。
编辑:对几种方法进行了基准测试。
准备数据:
6144行:
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# d1 <- f1() 11.977032 12.128610 13.869310 12.52532 12.585317 20.130271 5 b
# d2 <- f2() 17.200552 17.627260 21.616209 20.76224 22.830254 29.660738 5 c
# d3 <- f3() 2.945114 3.009456 3.317191 3.04064 3.071429 4.519314 5 a
#单位:毫秒
#expr最小lq平均uq最大neval cld
这些列是 factor 类还是 character 类?
这些列是 factor 类。
SNP.x 和 SNP.y 可以同时为 NA 吗?还是说数据获取过程决定了这种情况不会出现?
SNP.x 和 SNP.y 不会同时为 NA。:)
如果 SNP.x != SNP.y 呢?
您好,impz,我尝试运行您的代码,但收到了错误消息:Error in apply(df, 1, function(x) { : dim(X) must have a positive length。我对此还不熟悉,能请您再给我一些建议吗?谢谢!
这可能是强制类型转换的问题。我补充了另一个解决方案,应该会更好用。
m-dz,很遗憾,我无法加载或安装 microbenchmark 库,因为找不到它。我是不是遗漏了什么?
有报错吗?它在 CRAN 上。不过它与这个问题本身完全无关:实际上,您能从中得到的一切(计时结果)都已经贴在上面了。这是您想要的吗?
require(data.table)
require(microbenchmark)
# Six-row sample: the four rows from the question plus two edge cases
# ("both_NAs": both SNPs missing; "no_NAs_just_diff": both present but different).
dat1 <- data.table(
  Name = c("exm-rs10128711", "exm-rs10134944", "exm-rs10218696",
           "exm-rs10223421", "both_NAs", "no_NAs_just_diff"),
  SNP.x = c("[T/C]", "[A/G]", NA, "[A/C]", NA, "new_x"),
  ILMN.Strand.x = c("BOT", "TOP", NA, "TOP", "new_x", "new_x"),
  Customer.Strand.x = c("BOT", "BOT", NA, "BOT", "new_x", "new_x"),
  SNP.y = c("[T/C]", NA, "[T/C]", NA, NA, "new_y"),
  ILMN.Strand.y = c("BOT", NA, "BOT", NA, "new_y", "new_y"),
  Customer.Strand.y = c("BOT", NA, "TOP", NA, "new_y", "new_y")
)
# Make it a bit bigger: doubling 15 times gives 196608 rows (about 15 MB)
for (i in seq_len(15)) {
  dat1 <- rbindlist(list(dat1, dat1), use.names = TRUE)
}
# If the columns are factors, cast them to character first (avoids the
# "level sets of factors are different" error):
# dat1 <- dat1[, lapply(.SD, as.character)]
# f1: scalar if / else returning a list, evaluated once per row.
# Adds the columns SNP, ILMN.Strand and Customer.Strand to the global
# data.table `dat1` with `:=` and returns the table.
# Takes no arguments; it reads and updates `dat1` from the enclosing environment.
f1 <- function() {
dat1[, c("SNP", "ILMN.Strand", "Customer.Strand") :=
# Use the .x triplet when SNP.x is present, otherwise the .y triplet when
# SNP.y is present, otherwise three character NAs.
if ( !is.na(SNP.x) ) { list(SNP.x, ILMN.Strand.x, Customer.Strand.x)
} else if ( !is.na(SNP.y) ) { list(SNP.y, ILMN.Strand.y, Customer.Strand.y)
} else { list(NA_character_, NA_character_, NA_character_) },
# One group per row, so the scalar `if` above only ever sees a single value.
by = seq_len(nrow(dat1))
][]
}
# f2: vectorised ifelse(), one nested ifelse() per output column.
# Adds the columns SNP, ILMN.Strand and Customer.Strand to the global
# data.table `dat1` with `:=` and returns the table.
# Takes no arguments; it reads and updates `dat1` from the enclosing environment.
# Every column uses the same rule: take the .x value when SNP.x is present,
# otherwise the .y value when SNP.y is present, otherwise NA_character_.
f2 <- function() {
dat1[, ":="(SNP = ifelse(!is.na(SNP.x), SNP.x,
ifelse(!is.na(SNP.y), SNP.y, NA_character_)),
# The strand columns are also selected on SNP.x / SNP.y, not on their own NAs.
ILMN.Strand = ifelse(!is.na(SNP.x), ILMN.Strand.x,
ifelse(!is.na(SNP.y), ILMN.Strand.y, NA_character_)),
Customer.Strand = ifelse(!is.na(SNP.x), Customer.Strand.x,
ifelse(!is.na(SNP.y), Customer.Strand.y, NA_character_)))
][]
}
# f3: whole-column (vectorised) merge of the .x / .y triplets.
# Adds the columns SNP, ILMN.Strand and Customer.Strand to the global
# data.table `dat1` with `:=` and returns the table.
# Takes no arguments; it reads and updates `dat1` from the enclosing environment.
#
# Rule per row: take the .x values when SNP.x is present, otherwise the .y
# values when SNP.y is present, otherwise NA.
#
# The earlier form, ifelse(<test>, list(list(...)), ...)[[1]], kept only the
# FIRST element of the list built by ifelse(). The branch chosen for row 1 was
# therefore applied to every row (with the sample data: a plain copy of the
# .x columns). The row masks below are applied to every row instead.
# Timings recorded for the earlier form do not apply to this implementation.
f3 <- function() {
  dat1[, c("SNP", "ILMN.Strand", "Customer.Strand") := {
    take_x <- !is.na(SNP.x)
    take_y <- !take_x & !is.na(SNP.y)
    # Merge one x/y column pair; rows matching neither mask stay NA.
    # as.character() keeps factor columns from leaking their integer codes.
    merge_pair <- function(x_col, y_col) {
      merged <- rep(NA_character_, length(take_x))
      merged[take_x] <- as.character(x_col)[take_x]
      merged[take_y] <- as.character(y_col)[take_y]
      merged
    }
    list(
      merge_pair(SNP.x, SNP.y),
      merge_pair(ILMN.Strand.x, ILMN.Strand.y),
      merge_pair(Customer.Strand.x, Customer.Strand.y)
    )
  }][]
}
# Time the three implementations (5 runs each). The assignments inside the
# call leave d1, d2 and d3 behind for the checks that follow.
microbenchmark(
d1 <- f1(),
d2 <- f2(),
d3 <- f3(),
times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# d1 <- f1() 303.03681 316.91054 354.9147 330.91177 403.3858 420.3286 5 b
# d2 <- f2() 658.27527 660.19131 723.9005 664.31352 737.0994 899.6230 5 c
# d3 <- f3() 78.20754 84.91487 110.3533 86.73539 104.9149 196.9938 5 a
# First six rows (one per distinct input case)
d1[1:6, ]
# Name SNP.x ILMN.Strand.x Customer.Strand.x SNP.y ILMN.Strand.y Customer.Strand.y SNP ILMN.Strand Customer.Strand
# 1: exm-rs10128711 [T/C] BOT BOT [T/C] BOT BOT [T/C] BOT BOT
# 2: exm-rs10134944 [A/G] TOP BOT NA NA NA [A/G] TOP BOT
# 3: exm-rs10218696 NA NA NA [T/C] BOT TOP [T/C] BOT TOP
# 4: exm-rs10223421 [A/C] TOP BOT NA NA NA [A/C] TOP BOT
# 5: both_NAs NA new_x new_x NA new_y new_y NA NA NA
# 6: no_NAs_just_diff new_x new_x new_x new_y new_y new_y new_x new_x new_x
# f1(), f2() and f3() all update dat1 by reference and return that same table,
# so d1, d2 and d3 are aliases of one object and comparing them is always TRUE.
# Take an independent copy() after each call and compare those snapshots.
snapshots <- list(copy(f1()), copy(f2()), copy(f3()))
vapply(snapshots, FUN = identical, FUN.VALUE = logical(1), snapshots[[1]])
# [1] TRUE TRUE TRUE   (expected when the three implementations agree)
# Unit: microseconds
# expr min lq mean median uq max neval cld
# d1 <- f1() 1964.988 1968.936 2238.697 2273.276 2404.722 2581.564 5 b
# d2 <- f2() 976.574 998.284 1147.020 1033.021 1038.942 1688.280 5 a
# d3 <- f3() 684.471 845.916 1026.389 1141.573 1209.466 1250.519 5 a
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# d1 <- f1() 11.977032 12.128610 13.869310 12.52532 12.585317 20.130271 5 b
# d2 <- f2() 17.200552 17.627260 21.616209 20.76224 22.830254 29.660738 5 c
# d3 <- f3() 2.945114 3.009456 3.317191 3.04064 3.071429 4.519314 5 a