如何使用R从IUPAC符号中获取所有可能的序列
我有一个用IUPAC符号表示的DNA序列向量(标签:r、data.table、dna-sequence)。例如,给定如下序列和符号表:
# Example query sequence; R and V are IUPAC ambiguity codes.
seq <- "AATCRVTAA"

# Lookup table: each IUPAC nucleotide code and the bases it may stand for.
iuapc <- data.table(
  code = c(
    "A", "C", "G", "T",            # unambiguous bases
    "R", "Y", "S", "W", "K", "M",  # two-base codes
    "B", "D", "H", "V",            # three-base codes
    "N"                            # any base
  ),
  base = c(
    "A", "C", "G", "T",
    "AG", "CT", "GC", "AT", "GT", "AC",
    "CGT", "AGT", "ACT", "ACG",
    "ACGT"
  )
)
序列向量非常大,因此性能非常重要。任何帮助都将不胜感激
针对Python的同一问题此前已经有人问过。
根据您今天早些时候的问题,这里有一个比较粗糙的tidyverse/base方法:
library(tidyverse)
# Expand `seq` into a data frame with one row per candidate sequence and one
# column per sequence position.
# NOTE(review): expects `iuapc$base` to be a plain character column - verify
# it has not been converted to a list column before this runs.
tibble(seq) %>%
# Zero-width regex (lookbehind + lookahead): split at every boundary between
# two adjacent characters, so each symbol ends up in its own row.
separate_rows(seq, sep = '(?<=.)(?=.)') %>%
# Attach the `base` entry of `iuapc` whose `code` equals the symbol.
left_join(iuapc, by = c("seq" = "code")) %>%
pull(base) %>%
# Break every `base` string into its single characters.
str_split("") %>%
# Cartesian product of the per-position candidates, kept as strings.
expand.grid(stringsAsFactors = FALSE)
# Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1 A A T C A A T A A
#2 A A T C G A T A A
#3 A A T C A C T A A
#4 A A T C G C T A A
#5 A A T C A G T A A
#6 A A T C G G T A A
library(tidyverse)
tibble(seq) %>%
separate_rows(seq, sep = '(?<=.)(?=.)') %>% ...
这里有一些非常原始的东西:
library(data.table)
library(magrittr)
# Convert iuapc$base from strings ("AG") to a list column of single bases
# (c("A", "G")). `:=` modifies `iuapc` by reference, so only convert while the
# column is still character; otherwise a second run would hand a list to
# strsplit() and fail.
if (is.character(iuapc$base)) {
  iuapc[, base := list(strsplit(base, ''))]
}
# Key on `code` so that iuapc[x, ...] below looks rows up by code.
setkey(iuapc, code)
# Split the sequence into one symbol per list element, look up each symbol's
# candidate bases, cross-join the candidates, then paste the columns together.
tstrsplit(seq, '') %>%
  lapply(function(x) iuapc[x, base[[1]]]) %>%
  do.call(CJ, .) %>%
  # paste0() is vectorised over the columns, so all rows are collapsed in one
  # call instead of forming one group per row.
  do.call(paste0, .)
# [1] "AATCAATAA" "AATCACTAA" "AATCAGTAA" "AATCGATAA" "AATCGCTAA" "AATCGGTAA"
library(stringr)
我只是好奇,“相当大”对你来说意味着什么?
library(data.table)
library(magrittr)
# Convert iuapc$base from strings ("AG") to a list column of single bases
# (c("A", "G")). `:=` modifies `iuapc` by reference, so only convert while the
# column is still character; otherwise a second run would hand a list to
# strsplit() and fail.
if (is.character(iuapc$base)) {
  iuapc[, base := list(strsplit(base, ''))]
}
# Key on `code` so that iuapc[x, ...] below looks rows up by code.
setkey(iuapc, code)
# Split the sequence into one symbol per list element, look up each symbol's
# candidate bases, cross-join the candidates, then paste the columns together.
tstrsplit(seq, '') %>%
  lapply(function(x) iuapc[x, base[[1]]]) %>%
  do.call(CJ, .) %>%
  # paste0() is vectorised over the columns, so all rows are collapsed in one
  # call instead of forming one group per row.
  do.call(paste0, .)
# [1] "AATCAATAA" "AATCACTAA" "AATCAGTAA" "AATCGATAA" "AATCGCTAA" "AATCGGTAA"
library(stringr)
#' Expand an IUPAC-coded sequence into every concrete sequence it encodes
#'
#' @param seq A character string of nucleotide codes (case-insensitive).
#'   Only the first element of `seq` is used.
#' @param dictio_replace Named character vector. Each name is a
#'   single-character code and each value is the string of bases that code
#'   may stand for (e.g. `c(R = "AG")`).
#' @return A character vector with one element per possible sequence. The
#'   leftmost ambiguous position varies fastest. Symbols that have no entry
#'   in `dictio_replace` are kept unchanged.
all.seq.iuapc <- function(seq, dictio_replace) {
  symbols <- strsplit(toupper(seq), "")[[1]]

  # Look every symbol up by exact name. Regex-based replacement would treat
  # dictionary names as patterns (a "." key would match every symbol) and
  # would apply the entries one after another, making the result depend on
  # the order of the dictionary.
  expansions <- unname(dictio_replace[symbols])
  unknown <- is.na(expansions)
  expansions[unknown] <- symbols[unknown]

  grid <- expand.grid(strsplit(expansions, ""), stringsAsFactors = FALSE)

  # paste0() is vectorised across the columns, so no row-wise apply() (and
  # its data.frame -> matrix coercion) is needed.
  do.call(paste0, unname(as.list(grid)))
}
# IUPAC nucleotide code -> string of the bases that code may stand for.
dictio_replace <- c(
  # unambiguous bases map to themselves
  A = "A", C = "C", G = "G", T = "T",
  # two-base ambiguity codes
  R = "AG", Y = "CT", S = "GC", W = "AT", K = "GT", M = "AC",
  # three-base ambiguity codes
  B = "CGT", D = "AGT", H = "ACT", V = "ACG",
  # any base
  N = "ACGT"
)