R将列中的值合并到行中
在每个基因组(OG1,OG2)中,我都有相同的有机体R将列中的值合并到行中,r,R,在每个基因组(OG1,OG2)中,我都有相同的有机体 library(purr) df1 %>% split(.$organism) %>% imap(~setNames(.x[c(1,3)],c(.y,"group"))) %>% reduce(inner_join) # P. berghei group P. fragile P. inui # 1 PB_1 OG1 PF_1 PI_1 # 2 P
# Cross-combine, within each group, every gene of one organism with every
# gene of the other organisms.
library(purrr) # imap(), reduce() -- the package is "purrr", not "purr"
library(dplyr) # inner_join() and the %>% pipe

df1 %>%
  # One sub-table per organism.
  split(.$organism) %>%
  # Keep columns 1 and 3 (gene_ID, group) and rename the gene column after
  # the organism (.y is the list name produced by split()).
  imap(~ setNames(.x[c(1, 3)], c(.y, "group"))) %>%
  # Joining on "group" alone pairs each gene with all genes of the other
  # organisms in the same group; the key is spelled out instead of guessed.
  reduce(~ inner_join(.x, .y, by = "group"))
# P. berghei group P. fragile P. inui
# 1 PB_1 OG1 PF_1 PI_1
# 2 PB_1 OG1 PF_1 PI_2
# 3 PB_1 OG1 PF_1 PI_3
# 4 PB_1 OG1 PF_2 PI_1
# 5 PB_1 OG1 PF_2 PI_2
# 6 PB_1 OG1 PF_2 PI_3
# 7 PB_1 OG1 PF_3 PI_1
# 8 PB_1 OG1 PF_3 PI_2
# 9 PB_1 OG1 PF_3 PI_3
# 10 PB_2 OG1 PF_1 PI_1
# 11 PB_2 OG1 PF_1 PI_2
# 12 PB_2 OG1 PF_1 PI_3
# 13 PB_2 OG1 PF_2 PI_1
# 14 PB_2 OG1 PF_2 PI_2
# 15 PB_2 OG1 PF_2 PI_3
# 16 PB_2 OG1 PF_3 PI_1
# 17 PB_2 OG1 PF_3 PI_2
# 18 PB_2 OG1 PF_3 PI_3
# 19 PBa_1 OG2 PFa_1 PIa_1
# 20 PBa_1 OG2 PFa_1 PIa_2
# 21 PBa_1 OG2 PFa_2 PIa_1
# 22 PBa_1 OG2 PFa_2 PIa_2
# 23 PBa_2 OG2 PFa_1 PIa_1
# 24 PBa_2 OG2 PFa_1 PIa_2
# 25 PBa_2 OG2 PFa_2 PIa_1
# 26 PBa_2 OG2 PFa_2 PIa_2
# 27 PBa_3 OG2 PFa_1 PIa_1
# 28 PBa_3 OG2 PFa_1 PIa_2
# 29 PBa_3 OG2 PFa_2 PIa_1
# 30 PBa_3 OG2 PFa_2 PIa_2
每个有机体在一个特定的群体中都有一个或多个基因。然而,每个有机体的基因数量因群体而异。在下面的例子中,P.fragile在OG1中有3个基因,但在OG2中只有2个基因
为了将所有基因与所有基因进行比较,我需要重新排列表格:在一个组中,一个生物体的每个基因都应该与其他生物体的所有基因组合列在一行中。我提供了输出的外观
输出中可以省略生物体名称,因为基因_ID包含生物体名称的一部分。
我使用dplyr包对数据进行分组,使用:
# Group the rows of `data` by its `group` column.
dplyr::group_by(data, group)
但由于每个生物在每个基因组中都有不同数量的基因,我被卡住了
输入表:
# Example input: one row per gene with its organism and its group.
# OG1 holds the first 8 genes, OG2 the remaining 7.
df <- data.frame(
  gene_ID = c(
    "PF_1", "PF_2", "PF_3", "PI_1", "PI_2", "PI_3", "PB_1", "PB_2",
    "PFa_1", "PFa_2", "PIa_1", "PIa_2", "PBa_1", "PBa_2", "PBa_3"
  ),
  organism = rep(
    c(
      "P. fragile", "P. inui", "P. berghei",
      "P. fragile", "P. inui", "P. berghei"
    ),
    times = c(3L, 3L, 2L, 2L, 2L, 3L)
  ),
  group = rep(c("OG1", "OG2"), times = c(8L, 7L)),
  stringsAsFactors = FALSE
)
group
OG1 PF_1 PI_1 PB_1
OG1 PF_1 PI_1 PB_2
OG1 PF_1 PI_2 PB_1
OG1 PF_1 PI_2 PB_2
OG1 PF_1 PI_3 PB_1
OG1 PF_1 PI_3 PB_2
OG1 PF_2 PI_1 PB_1
OG1 PF_2 PI_1 PB_2
OG1 PF_2 PI_2 PB_1
OG1 PF_2 PI_2 PB_2
OG1 PF_2 PI_3 PB_1
OG1 PF_2 PI_3 PB_2
OG1 PF_3 PI_1 PB_1
OG1 PF_3 PI_1 PB_2
OG1 PF_3 PI_2 PB_1
OG1 PF_3 PI_2 PB_2
OG1 PF_3 PI_3 PB_1
OG1 PF_3 PI_3 PB_2
OG2 PFa_1 PIa_1 PBa_1
OG2 PFa_1 PIa_1 PBa_2
OG2 PFa_1 PIa_1 PBa_3
OG2 PFa_1 PIa_2 PBa_1
OG2 PFa_1 PIa_2 PBa_2
OG2 PFa_1 PIa_2 PBa_3
OG2 PFa_2 PIa_1 PBa_1
OG2 PFa_2 PIa_1 PBa_2
OG2 PFa_2 PIa_1 PBa_3
OG2 PFa_2 PIa_2 PBa_1
OG2 PFa_2 PIa_2 PBa_2
OG2 PFa_2 PIa_2 PBa_3
你可以这样做:先按生物体拆分表格,再把各生物体的子表按group连接(join)起来
# Cross-combine, within each group, every gene of one organism with every
# gene of the other organisms.
library(purrr) # imap(), reduce() -- the package is "purrr", not "purr"
library(dplyr) # inner_join() and the %>% pipe

df1 %>%
  # One sub-table per organism.
  split(.$organism) %>%
  # Keep columns 1 and 3 (gene_ID, group) and rename the gene column after
  # the organism (.y is the list name produced by split()).
  imap(~ setNames(.x[c(1, 3)], c(.y, "group"))) %>%
  # Joining on "group" alone pairs each gene with all genes of the other
  # organisms in the same group; the key is spelled out instead of guessed.
  reduce(~ inner_join(.x, .y, by = "group"))
# P. berghei group P. fragile P. inui
# 1 PB_1 OG1 PF_1 PI_1
# 2 PB_1 OG1 PF_1 PI_2
# 3 PB_1 OG1 PF_1 PI_3
# 4 PB_1 OG1 PF_2 PI_1
# 5 PB_1 OG1 PF_2 PI_2
# 6 PB_1 OG1 PF_2 PI_3
# 7 PB_1 OG1 PF_3 PI_1
# 8 PB_1 OG1 PF_3 PI_2
# 9 PB_1 OG1 PF_3 PI_3
# 10 PB_2 OG1 PF_1 PI_1
# 11 PB_2 OG1 PF_1 PI_2
# 12 PB_2 OG1 PF_1 PI_3
# 13 PB_2 OG1 PF_2 PI_1
# 14 PB_2 OG1 PF_2 PI_2
# 15 PB_2 OG1 PF_2 PI_3
# 16 PB_2 OG1 PF_3 PI_1
# 17 PB_2 OG1 PF_3 PI_2
# 18 PB_2 OG1 PF_3 PI_3
# 19 PBa_1 OG2 PFa_1 PIa_1
# 20 PBa_1 OG2 PFa_1 PIa_2
# 21 PBa_1 OG2 PFa_2 PIa_1
# 22 PBa_1 OG2 PFa_2 PIa_2
# 23 PBa_2 OG2 PFa_1 PIa_1
# 24 PBa_2 OG2 PFa_1 PIa_2
# 25 PBa_2 OG2 PFa_2 PIa_1
# 26 PBa_2 OG2 PFa_2 PIa_2
# 27 PBa_3 OG2 PFa_1 PIa_1
# 28 PBa_3 OG2 PFa_1 PIa_2
# 29 PBa_3 OG2 PFa_2 PIa_1
# 30 PBa_3 OG2 PFa_2 PIa_2
数据
# Example data parsed from a whitespace-separated text table with a header
# row; organism names are quoted because they contain a space.
# NOTE(review): the 'P. inui ' fields carry a trailing space inside the
# quotes, which presumably ends up in the parsed value -- trim it if exact
# organism names matter.
df1 <- read.table(
  text = "gene_ID organism group
PF_1 'P. fragile' OG1
PF_2 'P. fragile' OG1
PF_3 'P. fragile' OG1
PI_1 'P. inui ' OG1
PI_2 'P. inui ' OG1
PI_3 'P. inui ' OG1
PB_1 'P. berghei' OG1
PB_2 'P. berghei' OG1
PFa_1 'P. fragile' OG2
PFa_2 'P. fragile' OG2
PIa_1 'P. inui ' OG2
PIa_2 'P. inui ' OG2
PBa_1 'P. berghei' OG2
PBa_2 'P. berghei' OG2
PBa_3 'P. berghei' OG2",
  header = TRUE, # spelled out: `T` is an ordinary variable and can be reassigned
  stringsAsFactors = FALSE
)
这个方法比较冗长,但不需要任何其他包就能得到结果
# Base-R solution (no extra packages): for every group, list each combination
# of one gene per organism, then stack the per-group tables into
# `combined.genes`.
groups <- unique(df$group)
# Fix the organism (column) order once, by first appearance in `df`, so every
# group produces the same columns and the tables can be stacked.
organisms <- unique(df$organism)

# Returns all cross-organism gene combinations for group `g`: the group label
# in the first column, then one column per organism, rows sorted by the
# organism columns from left to right.
combine_group <- function(g) {
  in_group <- df$group %in% g
  genes <- split(
    df$gene_ID[in_group],
    factor(df$organism[in_group], levels = organisms)
  )
  # Cartesian product of the per-organism gene vectors. This works for any
  # number of organisms; an organism without genes in this group contributes
  # an empty vector, which yields zero rows.
  combos <- expand.grid(
    genes,
    KEEP.OUT.ATTRS = FALSE,
    stringsAsFactors = FALSE
  )
  # unname() so organism names are never mistaken for arguments of order().
  combos <- combos[do.call(order, unname(as.list(combos))), , drop = FALSE]
  data.frame(
    group = rep(g, nrow(combos)),
    combos,
    check.names = FALSE, # keep organism names such as "P. fragile" verbatim
    stringsAsFactors = FALSE
  )
}

per_group <- lapply(groups, combine_group)
# Bind once at the end instead of growing the result inside a loop.
combined.genes <- if (length(per_group) > 0) {
  do.call(rbind, per_group)
} else {
  data.frame()
}
rownames(combined.genes) <- NULL
combined.genes
当我使用你的代码 df %>% split(.$organism) %>% imap(~setNames(.x[c(1,3)], c(.y, "group"))) %>% reduce(inner_join) 时,出现了错误
。我理解您对split()所做的操作,并且这是有效的,因此我假设错误是由imap()部分引起的。我查看了purrr包,但无法找出问题所在。我编辑了源数据,请检查它是否仍然不适用于您?如果它适用于我的数据,请提供问题中您的数据的dput
,我们将从中进行排序,但我仍然会得到相同的错误。我按照您的建议添加了dput
。这一定是版本问题或是污染了您的工作区,您能尝试将所有内容更新到最新版本并在干净的会话中进行测试吗?与此同时,我在我的帖子底部添加了另一个选项。是的,你使用base R和magrittr
的版本成功了!太好了,谢谢!我已经安装了R v.3.2.5和在该版本下构建的purrr
,并清理了工作区,重新启动了所有程序,但都不起作用。让我们在答案中留下两种解决方案,以便人们可以同时尝试这两种方法。