R 将一个数据帧的每一列与另一个数据帧列进行比较,并将每个结果重叠打印到单独的文件中
我想将一个数据帧的每一列与另一个数据帧的列进行比较,并将每次比较得到的重叠结果分别写入单独的文件中。我从两个测试数据集开始:
# Toy inputs: df1 holds the values to be searched (column x), df2 holds
# the search terms, one set per column.
df1 <- data.frame(
  x = c("a_b", "c_d", "e_f/c_f", "g_h"),
  y = c(9, 2, 1, 4),
  z = c(7, 5, 8, 5)
)
df2 <- data.frame(
  m = c("c_f", "x_y"),
  n = c("a_b", "x_y")
)
问题似乎出在 df2 中的空单元格(""),它们应该是 NA:
# Recode every empty-string cell of df2 as a missing value (NA).
df2[df2 == ""] <- NA
(invisible 只是为了避免不必要的、无聊的控制台输出,您也可以不使用它。)
看看您的正则表达式:拼接出的模式中含有空的备选分支(例如以 "|" 结尾),因此与任何内容都匹配。这里有一个 MCVE 供您参考:grep("a|", letters)
谢谢您,但是这里缺少使用“/”的替代匹配标准:df2 中的第三列应该有一个命中。
@Bob 您说得对,这在 match 中不容易实现,grep 更好。我想我发现了问题,请参阅更新。
我明白了。您说得对,问题在于那些空单元格。谢谢大家!
@Bob:是的,大多数情况下,最好将这样的东西编码为 NA。
# Example table to be searched, given as dput() output: a 4-row data.frame.
# BGC_Start and BGC_End are integer columns; all other columns are factors.
# Species_name is the column the matching code searches; note that one of its
# labels contains two names joined by "/" ("Acidovorax_avenae/Acidovorax_avene").
df1<- structure(list(BGC_Accession = structure(c(1L, 1L, 1L, 2L), .Label = c("BGC0000647",
"BGC0000984"), class = "factor"), Genbank_ID = structure(c(1L,
3L, 2L, 4L), .Label = c("GCA_000202835", "GCA_000219295", "GCA_000964345",
"GCA_003029685"), class = "factor"), BGC_Class = structure(c(2L,
2L, 2L, 1L), .Label = c("NRP/Polyketide", "Terpene"), class = "factor"),
BGC_Start = c(2093957L, 1L, 1L, 2656134L), BGC_End = c(2115021L,
4440L, 4186L, 2721658L), Product = structure(c(1L, 1L, 1L,
2L), .Label = c("Carotenoid", "Delftibactin"), class = "factor"),
Similarity = structure(c(1L, 1L, 1L, 1L), .Label = "100%", class = "factor"),
Species_name = structure(c(1L, 4L, 2L, 3L), .Label = c("Acidiphilium_multivorum",
"Acidiphilium_sp_PM", "Acidovorax_avenae/Acidovorax_avene",
"Acinetobacter_baumannii"), class = "factor"), Kingdom = structure(c(1L,
1L, 1L, 1L), .Label = "k__Bacteria", class = "factor"), Phylum = structure(c(1L,
1L, 1L, 1L), .Label = "p__Proteobacteria", class = "factor"),
Class = structure(c(1L, 1L, 1L, 2L), .Label = c("c__Alphaproteobacteria",
"c__Betaproteobacteria"), class = "factor"), Order = structure(c(2L,
2L, 2L, 1L), .Label = c("o__Burkholderiales", "o__Rhodospirillales"
), class = "factor"), Family = structure(c(1L, 1L, 1L, 2L
), .Label = c("f__Acetobacteraceae", "f__Comamonadaceae"), class = "factor"),
Genus = structure(c(1L, 1L, 1L, 2L), .Label = c("g__Acidiphilium",
"g__Acidovorax"), class = "factor"), Species = structure(c(1L,
1L, 2L, 3L), .Label = c("s__Acidiphilium_multivorum", "s__Acidiphilium_sp_PM",
"s__Acidovorax_avenae"), class = "factor")), class = "data.frame", row.names = c(NA,
-4L))
# Table of search terms, given as dput() output: a 3-row data.frame with three
# factor columns, each holding species names.
# Gut_SRS011111 and Gut_SRS011355 contain empty-string cells ("").
df2<- structure(list(Gut_SRS011111 = structure(c(2L, 1L, 1L), .Label = c("",
"Actinobaculum_unclassified"), class = "factor"), Gut_SRS011269 = structure(c(3L,
1L, 2L), .Label = c("Acidiphilium_multivorum", "Acinetobacter_baumannii",
"Clostridium_citroniae"), class = "factor"), Gut_SRS011355 = structure(c(2L,
3L, 1L), .Label = c("", "Acidovorax_avene", "Streptococcus_gordonii"
), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
# For every column of df2, write the rows of df1 whose Species_name matches
# any of that column's entries to "<column name>.csv" in the working directory.
for (i in colnames(df2)) {
  # Convert to character so that factor columns yield their labels.
  terms <- as.character(df2[[i]])
  # Drop missing and empty entries: an empty alternative in the pattern
  # (e.g. "a||") matches every string and would select all rows of df1.
  terms <- terms[!is.na(terms) & nzchar(terms)]
  if (length(terms) > 0) {
    # Entries are used as regular expressions joined by "|", so a row matches
    # when Species_name contains any one of the entries.
    hits <- grep(paste(terms, collapse = "|"), df1$Species_name)
  } else {
    # No usable search terms: select no rows instead of all of them.
    hits <- integer(0)
  }
  overlap_data <- df1[hits, ]
  write.csv(overlap_data, file = paste0(i, ".csv"))
}
# Recode empty cells as missing so they can be excluded from the search terms.
df2[df2 == ""] <- NA
# For each column of df2, write the rows of df1 whose Species_name matches any
# of the column's entries to "<column name>.csv" in the working directory.
# invisible() suppresses printing of the list that lapply() returns.
invisible(lapply(names(df2), function(x) {
  # Convert to character so that factor columns yield their labels.
  terms <- as.character(df2[[x]])
  # paste0() would turn NA into the literal string "NA", which then matches
  # any species name containing "NA"; drop missing entries first.
  terms <- terms[!is.na(terms)]
  rr <- if (length(terms) > 0) {
    df1[grep(paste0(terms, collapse = "|"), df1$Species_name), ]
  } else {
    # Nothing to search for: an empty pattern would match every row.
    df1[0, ]
  }
  write.csv(rr, file = paste0(x, ".csv"))
}))