如何使用R从IUPAC符号中获取所有可能的序列

如何使用R从IUPAC符号中获取所有可能的序列,r,data.table,dna-sequence,R,Data.table,Dna Sequence,我有一个带有IUPAC符号()的DNA序列载体。例如,给定序列和符号: seq <- "AATCRVTAA" iuapc <- data.table(code = c("A", "C", "G", "T", "R", "Y", "S", "W", "K", "M",

我有一个包含 IUPAC 符号的 DNA 序列向量。例如,给定如下序列和符号对照表:

# Example input: a single DNA sequence containing the ambiguity codes R and V.
seq <- "AATCRVTAA"

# Lookup table: each IUPAC code paired with the string of bases it stands for.
iuapc <- data.table(
  code = c("A", "C", "G", "T",                  # unambiguous bases
           "R", "Y", "S", "W", "K", "M",        # two-base codes
           "B", "D", "H", "V",                  # three-base codes
           "N"),                                # any base
  base = c("A", "C", "G", "T",
           "AG", "CT", "GC", "AT", "GT", "AC",
           "CGT", "AGT", "ACT", "ACG",
           "ACGT")
)
序列向量非常大,因此性能非常重要。任何帮助都将不胜感激


针对 Python,此前已经有人问过同样的问题:

基于您今天早些时候提出的问题,这里有一个略显取巧的 tidyverse/base 方法:

library(tidyverse)

# `seq` and `iuapc` are the example sequence and lookup table from the question.
tibble(seq) %>%
  # Zero-width split between adjacent characters: one row per sequence symbol.
  separate_rows(seq, sep = "(?<=.)(?=.)") %>%
  # Attach the string of candidate bases for every symbol.
  left_join(iuapc, by = c(seq = "code")) %>%
  pull(base) %>%
  # Break each candidate string into its single-letter alternatives.
  str_split("") %>%
  # All combinations; the result has one column per sequence position.
  expand.grid(stringsAsFactors = FALSE)

#  Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1    A    A    T    C    A    A    T    A    A
#2    A    A    T    C    G    A    T    A    A
#3    A    A    T    C    A    C    T    A    A
#4    A    A    T    C    G    C    T    A    A
#5    A    A    T    C    A    G    T    A    A
#6    A    A    T    C    G    G    T    A    A
这里有一个非常粗糙的做法:

library(data.table)
library(magrittr)

# Convert iuapc$base into a list column: every IUPAC code now maps to a
# character vector of single bases (e.g. "R" -> c("A", "G")).
# `:=` updates iuapc by reference; re-running this line on the already
# converted table fails because strsplit() needs character input.
iuapc[, base := list(strsplit(base, ""))]
# Key on `code` so that iuapc[x, ...] below looks rows up by IUPAC code.
setkey(iuapc, code)

# Expand `seq` (a single string) into every concrete sequence it can encode.
tstrsplit(seq, "") %>%
  # Candidate bases for each position of the sequence.
  lapply(function(x) iuapc[x, base[[1]]]) %>%
  # Cross join of the candidates of all positions, one column per position.
  do.call(CJ, .) %>%
  # Paste the columns together in one vectorised call; grouping by row number
  # would evaluate paste() once per combination and is far slower.
  do.call(paste0, .)

# [1] "AATCAATAA" "AATCACTAA" "AATCAGTAA" "AATCGATAA" "AATCGCTAA" "AATCGGTAA"
library(stringr)
我只是好奇,“相当大”对你来说意味着什么?
library(data.table)
library(magrittr)

# Convert iuapc$base into a list column: every IUPAC code now maps to a
# character vector of single bases (e.g. "R" -> c("A", "G")).
# `:=` updates iuapc by reference; re-running this line on the already
# converted table fails because strsplit() needs character input.
iuapc[, base := list(strsplit(base, ""))]
# Key on `code` so that iuapc[x, ...] below looks rows up by IUPAC code.
setkey(iuapc, code)

# Expand `seq` (a single string) into every concrete sequence it can encode.
tstrsplit(seq, "") %>%
  # Candidate bases for each position of the sequence.
  lapply(function(x) iuapc[x, base[[1]]]) %>%
  # Cross join of the candidates of all positions, one column per position.
  do.call(CJ, .) %>%
  # Paste the columns together in one vectorised call; grouping by row number
  # would evaluate paste() once per combination and is far slower.
  do.call(paste0, .)

# [1] "AATCAATAA" "AATCACTAA" "AATCAGTAA" "AATCGATAA" "AATCGCTAA" "AATCGGTAA"
library(stringr)

#' Expand an IUPAC-coded DNA sequence into all concrete sequences
#'
#' @param seq A character string holding the sequence. It is upper-cased
#'   before lookup; only the first element of `seq` is used.
#' @param dictio_replace Named character vector whose names are IUPAC codes
#'   and whose values are the bases each code stands for, written as one
#'   string (e.g. `c(R = "AG")`).
#' @return An unnamed character vector with one element per combination.
#'   The leftmost ambiguous position varies fastest. Symbols that have no
#'   entry in `dictio_replace` are kept unchanged.
all.seq.iuapc <- function(seq, dictio_replace) {
  symbols <- strsplit(toupper(seq), "")[[1]]

  # Look every symbol up by name exactly once. Sequential regex replacement
  # over the whole dictionary would make the result depend on the order of
  # the entries and would re-scan the text once per dictionary entry.
  expanded <- unname(dictio_replace[symbols])
  unknown <- is.na(expanded)
  expanded[unknown] <- symbols[unknown]

  grid <- expand.grid(
    strsplit(expanded, ""),
    stringsAsFactors = FALSE,
    KEEP.OUT.ATTRS = FALSE
  )

  # Vectorised column-wise paste; a row-wise apply() would first coerce the
  # data frame to a matrix and then call paste0() once per row.
  do.call(paste0, unname(as.list(grid)))
}

# IUPAC nucleotide codes mapped to the bases each code can stand for.
dictio_replace <- c(
  # unambiguous bases
  A = "A", C = "C", G = "G", T = "T",
  # two-base codes
  R = "AG", Y = "CT", S = "GC", W = "AT", K = "GT", M = "AC",
  # three-base codes
  B = "CGT", D = "AGT", H = "ACT", V = "ACG",
  # any base
  N = "ACGT"
)