R 将不同基因列表之间的基因重叠计算为%
我已经生成了一个表格来显示不同基因列表之间的重叠。因为我有八个不同的基因列表,所以共有 64 个结果(8 × 8)。我希望把这些重叠数量换算成百分比。我目前拥有的代码如下(标签:r, rstudio):
#-------------------------------------------------------------------------------
# Set the working directory and load the data files
#-------------------------------------------------------------------------------
# Point the session at the folder that holds the gene-list text files.
# NOTE: the rest of the script resolves file names relative to this directory.
setwd("~/Desktop/R_Project/Gene_overlap")
getwd()

# `pattern` is a regular expression, not a shell glob: anchor on the literal
# ".txt" suffix so that only files ending in ".txt" are picked up.
files <- list.files(pattern = "\\.txt$", full.names = TRUE)
files

# Read each file into a character vector of whitespace-separated tokens.
data.list <- lapply(files, function(fil) {
  scan(file = fil, what = character())
})

# Name every list element after its file, minus the ".txt" extension.
# Base sub() is used so this step does not depend on the magrittr pipe
# (`%>%`) having been attached by an earlier library() call.
names(data.list) <- sub("\\.txt$", "", basename(files))
str(data.list)
# List of 8
# $ GSE108363_BCGdown_D: chr [1:350] "IL1B" "IL6" "IL1A" "CCL20" ...
# $ GSE108363_BCGdown_V: chr [1:267] "IL6" "CCL20" "IL1A" "CXCL5" ...
# $ GSE108363_BCGup_D : chr [1:250] "FABP4" "CMTM2" "FUCA1" "CD36" ...
# $ GSE108363_BCGup_V : chr [1:429] "FCN1" "FCGR3B" "MNDA" "CPVL" ...
# $ GSE108363_MTBdown_D: chr [1:86] "CCL20" "IL1B" "IL1A" "IL6" ...
# $ GSE108363_MTBdown_V: chr [1:244] "IL1B" "IL1A" "CCL20" "IL6" ...
# $ GSE108363_MTBup_D : chr [1:128] "FUCA1" "FGL2" "TGFBI" "CPVL" ...
# $ GSE108363_MTBup_V : chr [1:286] "FABP4" "RNASE1" "MNDA" "CPVL" ...
# Number of entries shared by the two "BCGdown" lists. `[[ ]]` is used instead
# of `$` to avoid partial name matching, and the call is nested rather than
# piped so it does not require magrittr's `%>%` to be attached.
length(intersect(
  data.list[["GSE108363_BCGdown_D"]],
  data.list[["GSE108363_BCGdown_V"]]
))

# Number of entries in every list; vapply() guarantees an integer vector.
vapply(data.list, length, integer(1))
#-------------------------------------------------------------------------------
# Using the intersect function to see the overlaps
#-------------------------------------------------------------------------------
# File names of the eight gene lists, relative to the working directory.
data.file1 <- "GSE108363_BCGdown_V.txt"
data.file2 <- "GSE108363_BCGdown_D.txt"
data.file3 <- "GSE108363_BCGup_V.txt"
data.file4 <- "GSE108363_BCGup_D.txt"
data.file5 <- "GSE108363_MTBdown_V.txt"
data.file6 <- "GSE108363_MTBdown_D.txt"
data.file7 <- "GSE108363_MTBup_V.txt"
data.file8 <- "GSE108363_MTBup_D.txt"

filelist <- list(
  data.file1, data.file2, data.file3, data.file4,
  data.file5, data.file6, data.file7, data.file8
)

# Verify that every file is present BEFORE anything is read, and stop with the
# offending names instead of merely printing TRUE/FALSE after the fact.
files.found <- vapply(filelist, file.exists, logical(1))
if (!all(files.found)) {
  stop(
    "Gene list file(s) not found: ",
    paste(unlist(filelist[!files.found]), collapse = ", "),
    call. = FALSE
  )
}

# Individual vectors, split on newlines only (one element per line of a file).
genevect1 <- scan(data.file1, what = character(), sep = "\n")
genevect2 <- scan(data.file2, what = character(), sep = "\n")
genevect3 <- scan(data.file3, what = character(), sep = "\n")
genevect4 <- scan(data.file4, what = character(), sep = "\n")
genevect5 <- scan(data.file5, what = character(), sep = "\n")
genevect6 <- scan(data.file6, what = character(), sep = "\n")
genevect7 <- scan(data.file7, what = character(), sep = "\n")
genevect8 <- scan(data.file8, what = character(), sep = "\n")

# The same files as one list, in `filelist` order. scan() is called without
# `sep` here, so entries are split on any whitespace rather than on newlines.
gene.lists <- lapply(filelist, function(f) {
  scan(file = f, what = character())
})
# set up empty matrix
# Build an empty square matrix with one row and one column per gene list.
x <- (length(gene.lists))^2
x
y <- rep(NA, x)
mx <- matrix(y, ncol = length(gene.lists))
mx

# Label rows and columns with the file names minus the ".txt" extension.
# The dot is escaped so only a literal ".txt" suffix is stripped, and base
# sub() is used so the step does not require magrittr's `%>%` to be attached.
list.names <- sub("\\.txt$", "", vapply(filelist, basename, character(1)))
row.names(mx) <- list.names
colnames(mx) <- list.names
mx

# Fill the matrix with pairwise overlap counts: cell [j, i] holds the number
# of distinct entries that list i and list j have in common.
mx.overlap.count <- mx
for (i in seq_along(gene.lists)) {
  g1 <- gene.lists[[i]]
  for (j in seq_along(gene.lists)) {
    g2 <- gene.lists[[j]]
    mx.overlap.count[j, i] <- length(intersect(g1, g2))
  }
}
mx.overlap.count

# View() opens a data viewer and is only meaningful in an interactive session;
# skip it when the script is run non-interactively (e.g. via Rscript).
if (interactive()) {
  View(mx.overlap.count)
}
#-------------------------------------------------------------------------------
#设置工作目录并加载数据文件
#-------------------------------------------------------------------------------
setwd("~/Desktop/R_Project/Gene_overlap")
getwd()
(问题中的代码摘录在此处被截断。)

回答:由于您没有提供基因列表,下面用随机抽取的字母样本作为示例:
# Reproducible toy data: 11 stand-in "gene lists", each a random draw of
# distinct capital letters, with list sizes being a shuffle of 10..20.
set.seed(1)
list.sizes <- sample(10:20)
data.list <- lapply(list.sizes, function(size) sample(LETTERS, size))

# Share of `reference` entries that also occur in `other`, as a percentage
# rounded to a whole number.
percent.shared <- function(other, reference) {
  n.common <- length(intersect(other, reference))
  round(n.common / length(reference) * 100)
}

# Pairwise percentage matrix: overlaps[i, j] is the percentage of list i's
# entries that are also present in list j (so the diagonal is always 100).
overlaps <- vapply(
  data.list,
  function(col.list) {
    vapply(
      data.list,
      function(row.list) percent.shared(col.list, row.list),
      numeric(1)
    )
  },
  numeric(length(data.list))
)
overlaps
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
[1,] 100 50 67 75 42 92 58 92 67 33 92
[2,] 46 100 62 69 54 77 62 69 69 54 62
[3,] 53 53 100 60 33 73 60 73 80 33 60
[4,] 53 53 53 100 47 71 53 76 53 29 82
[5,] 45 64 45 73 100 91 64 82 36 45 73
[6,] 61 56 61 67 56 100 56 89 56 33 72
[7,] 50 57 64 64 50 71 100 86 71 50 64
[8,] 55 45 55 65 45 80 60 100 60 40 80
[9,] 50 56 75 56 25 62 62 75 100 38 69
[10,] 40 70 50 50 50 60 70 80 60 100 70
[11,] 58 42 47 74 42 68 47 84 58 37 100
set.seed(1)
data.list <- …(代码同上,摘录在此处被截断)

评论:

- 请定义"基因重叠"。——指两个列表中共有的基因(每次比较两个列表)。
- @user2974951:这一点很好,但恕我直言,这行不通,因为 mapply 是并行地(按位置成对)迭代各个列表的,而这里我们需要逐个迭代列表(即每个列表都要与所有列表逐一比较)。
- @Martijnvanatekum 非常感谢您的回复。这正是我想要的。我可以问一下为什么样本是 (10:20) 吗?
- Hi @HibaAl khaffaji:`10:20` 随机确定生成的字母样本的长度(一个样本的长度为 10,一个样本的长度为 11,等等)。没有特别的理由选择这些数字,只是想在长度上有一些变化。