Fasta到数据帧-R_R_Dataframe_Bioinformatics

Fasta到数据帧-R

r dataframe

Fasta到数据帧-R,r,dataframe,bioinformatics,R,Dataframe,Bioinformatics,在我在网上找到的这个脚本中，我生成了一个棉花列表 require(ape) # make vector of accession numbers, for ITS 1 and 2 region for Gossypium (cotton) species cotton_acc <- c("U56806", "U12712", "U56810", "U12732", "U12725", "U56786", "U12715",

我在网上找到了下面这个脚本，并用它生成了一个棉花（Gossypium）序列的列表：

# library() stops with an error when ape is missing; require() would only
# return FALSE and let the script fail later at read.GenBank().
library(ape)
# Accession numbers for the ITS 1 and 2 region of Gossypium (cotton) species.
cotton_acc <- c("U56806", "U12712", "U56810",
                "U12732", "U12725", "U56786", "U12715",
                "AF057758", "U56790", "U12716", "U12729",
                "U56798", "U12727", "U12713", "U12719",
                "U56811", "U12728", "U12730", "U12731",
                "U12722", "U56796", "U12714", "U56789",
                "U56797", "U56801", "U56802", "U12718",
                "U12710", "U56804", "U12734", "U56809",
                "U56812", "AF057753", "U12711", "U12717",
                "U12723", "U12726")
# Get the data from GenBank. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
cotton <- read.GenBank(cotton_acc, species.names = TRUE)
# Keep the species/accession pairing in a data frame before the sequence
# names are overwritten on the next line.
names_accs <- data.frame(species = attr(cotton, "species"), accs = names(cotton))
# Name the sequences with species names instead of accession numbers.
names(cotton) <- attr(cotton, "species")
# Write the renamed sequences to a FASTA file.
write.dna(cotton, "C:/Users/Comp12/Desktop/cotton.fas", format = "fasta")

我如何把它整理成一个数据框，使各列依次为 cotton_acc、species.Name、sequence、碱基组成（Base composition）总计？结果应该有37行。

谢谢

以下内容将为您提供至少包含物种名称和DNA序列的数据框。由于我不熟悉DNA，我不知道acc和碱基组成是什么。在我看来，你需要自己做一些计算才能得到碱基组成。我希望你所在领域的专家能指导你完成更多的工作

library(dplyr)
library(tidyr)

# http://svitsrv25.epfl.ch/R-doc/library/ape/html/as.alignment.html
# Convert every sequence in `cotton` (class 'DNAbin') to a character vector,
# giving one list element per sequence.
# seq_along(x) replaces 1:length(x): for an empty sequence 1:0 is c(1, 0),
# which would index a non-existent first element.

foo <- lapply(cotton, function(x) as.character(x[seq_along(x)]))

# A tiny function to create a data.frame with vectors in lists, which I have.

# Build a data frame from a list of vectors, one column per list element.
#
# Args:
#   l: A list of vectors, possibly of different lengths.
#
# Returns:
#   A data.frame with as many rows as the longest vector and one column per
#   element of `l`, named after names(l) when present. Shorter vectors are
#   padded at the end. An empty list gives an empty data.frame.
listvec2df <- function(l){

    # max() of nothing is -Inf, which seq_len() rejects; return early.
    if (length(l) == 0L) {
        return(data.frame())
    }

    n.obs <- vapply(l, length, integer(1))
    seq.max <- seq_len(max(n.obs))
    # Indexing past the end of a vector pads it up to the common length.
    padded <- lapply(l, function(v) v[seq.max])
    # cbind() always returns a matrix, so a list of length-1 vectors still
    # becomes one row with a column per element (sapply() would collapse it
    # to a plain vector, i.e. a single column).
    mat <- do.call(cbind, unname(padded))
    colnames(mat) <- names(l)
    # Last expression is the value itself, so the result is returned visibly.
    data.frame(mat, stringsAsFactors = FALSE)

}

# Combine the sequence names (first column, called `names.foo.`) with the
# transposed output of listvec2df(foo), so that each row holds one sequence.

foo2 <- data.frame(names.foo. = names(foo), t(listvec2df(foo)),
                   stringsAsFactors = FALSE)
# Split the name column at "_" into the two columns "cotton" and "species".
foo2 <- separate(foo2, names.foo., c("cotton", "species"), sep = "_")

#      cotton       species X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
#1  Gossypium      anomalum  t  c  g  a  a  a  c  c  t   c   c   c   t   a   a
#2  Gossypium      arboreum  t  c  g  a  a  a  c  c  t   g   c   c   t   a   g
#3  Gossypium    areysianum  t  c  g  a  a  a  c  c  t   g   c   c   t   a   g
#4  Gossypium        aridum  t  c  g  a  a  a  c  c  t   g   c   c   t   a   g
#5  Gossypium   armourianum  t  c  g  a  a  a  c  c  t   g   c   c   t   a   g

谢谢。install.packages 给出警告：包“tidyr”不适用于 R 版本 3.0.3。还有其他解决方案吗？
@user2916044 您能把 R 更新到最新版本吗？
@user2916044 否则，您可以这样做：创建 foo2 后，运行 foo2$names.foo。同时使用 row.names 和 names.foo 获取 cotton_acc 值。
@user2916044 这部分我就留给您了。您是这个领域的人，知道该怎么做。

library(dplyr)
library(tidyr)

# http://svitsrv25.epfl.ch/R-doc/library/ape/html/as.alignment.html
# Convert every sequence in `cotton` (class 'DNAbin') to a character vector,
# giving one list element per sequence.
# seq_along(x) replaces 1:length(x): for an empty sequence 1:0 is c(1, 0),
# which would index a non-existent first element.

foo <- lapply(cotton, function(x) as.character(x[seq_along(x)]))

# A tiny function to create a data.frame with vectors in lists, which I have.

# Build a data frame from a list of vectors, one column per list element.
#
# Args:
#   l: A list of vectors, possibly of different lengths.
#
# Returns:
#   A data.frame with as many rows as the longest vector and one column per
#   element of `l`, named after names(l) when present. Shorter vectors are
#   padded at the end. An empty list gives an empty data.frame.
listvec2df <- function(l){

    # max() of nothing is -Inf, which seq_len() rejects; return early.
    if (length(l) == 0L) {
        return(data.frame())
    }

    n.obs <- vapply(l, length, integer(1))
    seq.max <- seq_len(max(n.obs))
    # Indexing past the end of a vector pads it up to the common length.
    padded <- lapply(l, function(v) v[seq.max])
    # cbind() always returns a matrix, so a list of length-1 vectors still
    # becomes one row with a column per element (sapply() would collapse it
    # to a plain vector, i.e. a single column).
    mat <- do.call(cbind, unname(padded))
    colnames(mat) <- names(l)
    # Last expression is the value itself, so the result is returned visibly.
    data.frame(mat, stringsAsFactors = FALSE)

}

# Combine the sequence names (first column, called `names.foo.`) with the
# transposed output of listvec2df(foo), so that each row holds one sequence.

foo2 <- data.frame(names.foo. = names(foo), t(listvec2df(foo)),
                   stringsAsFactors = FALSE)
# Split the name column at "_" into the two columns "cotton" and "species".
foo2 <- separate(foo2, names.foo., c("cotton", "species"), sep = "_")

# First rows and columns of the result, as shown by the original author:
#      cotton       species X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
#1  Gossypium      anomalum  t  c  g  a  a  a  c  c  t   c   c   c   t   a   a
#2  Gossypium      arboreum  t  c  g  a  a  a  c  c  t   g   c   c   t   a   g
#3  Gossypium    areysianum  t  c  g  a  a  a  c  c  t   g   c   c   t   a   g
#4  Gossypium        aridum  t  c  g  a  a  a  c  c  t   g   c   c   t   a   g
#5  Gossypium   armourianum  t  c  g  a  a  a  c  c  t   g   c   c   t   a   g

# Report the number of rows and columns of the result.
dim(foo2)
#[1]  37 689