R:将一个数据框的每一列与另一个数据框的某一列进行比较,并将每次比较得到的重叠结果写入单独的文件

R 将一个数据框的每一列与另一个数据框的某一列进行比较,并将每次比较得到的重叠结果写入单独的文件,r,R,我想将一个数据框的每一列与另一个数据框的某一列进行比较,并将每次比较得到的重叠结果写入单独的文件。我从两个测试数据集开始: df1 <- data.frame("x" = c("a_b", "c_d", "e_f/c_f", "g_h"), "y" = c(9,2,1,4), "z" = c(7,5,8,5)) df2 <- data.frame("m" = c("c_f", "x_y"), ……

我想将一个数据框的每一列与另一个数据框的某一列进行比较,并将每次比较得到的重叠结果写入单独的文件。

我从两个测试数据集开始:

# Toy inputs for the comparison: df1$x holds the names to be searched
# (one entry carries two "/"-separated alternatives), while every column
# of df2 holds one set of names to look for.
df1 <- data.frame(
  x = c("a_b", "c_d", "e_f/c_f", "g_h"),
  y = c(9, 2, 1, 4),
  z = c(7, 5, 8, 5)
)
df2 <- data.frame(
  m = c("c_f", "x_y"),
  n = c("a_b", "x_y")
)

问题似乎出在 `df2` 的空单元格上,它们应该编码为 `NA`:

# Recode blank cells as missing values so they can be told apart from
# real entries.
blank_cells <- df2 == ""
df2[blank_cells] <- NA

(`invisible` 可以避免不必要的、冗长的控制台输出,您也可以不使用它。)

查看您的正则表达式。由于空单元格的存在,拼接出的模式形如 `"Actinobaculum_unclassified||"`,其中的空分支与任何内容都匹配。这里有一个 MCVE 供您参考:
`grep("a|", letters)`
谢谢您,但是这里缺少对以“/”分隔的备选名称的匹配条件:df2 中的第三列应该有一个命中。
@Bob 你是对的,这用 `match` 不容易实现,`grep` 更合适。我想我发现了问题,请参阅更新。
我明白了。你说得对,问题在于那些空单元格。谢谢大家!
@Bob:是的,大多数情况下,最好将这样的空值编码为 `NA`。
# Example data in dput() form: a 4-row data frame with 15 columns.
# BGC_Start and BGC_End are integer columns; every other column is a factor
# stored as integer codes plus a .Label vector of levels.
# Species_name is the column searched by the code further down; one of its
# levels holds two "/"-separated names ("Acidovorax_avenae/Acidovorax_avene").
df1<- structure(list(BGC_Accession = structure(c(1L, 1L, 1L, 2L), .Label = c("BGC0000647", 
"BGC0000984"), class = "factor"), Genbank_ID = structure(c(1L, 
3L, 2L, 4L), .Label = c("GCA_000202835", "GCA_000219295", "GCA_000964345", 
"GCA_003029685"), class = "factor"), BGC_Class = structure(c(2L, 
2L, 2L, 1L), .Label = c("NRP/Polyketide", "Terpene"), class = "factor"), 
    BGC_Start = c(2093957L, 1L, 1L, 2656134L), BGC_End = c(2115021L, 
    4440L, 4186L, 2721658L), Product = structure(c(1L, 1L, 1L, 
    2L), .Label = c("Carotenoid", "Delftibactin"), class = "factor"), 
    Similarity = structure(c(1L, 1L, 1L, 1L), .Label = "100%", class = "factor"), 
    Species_name = structure(c(1L, 4L, 2L, 3L), .Label = c("Acidiphilium_multivorum", 
    "Acidiphilium_sp_PM", "Acidovorax_avenae/Acidovorax_avene", 
    "Acinetobacter_baumannii"), class = "factor"), Kingdom = structure(c(1L, 
    1L, 1L, 1L), .Label = "k__Bacteria", class = "factor"), Phylum = structure(c(1L, 
    1L, 1L, 1L), .Label = "p__Proteobacteria", class = "factor"), 
    Class = structure(c(1L, 1L, 1L, 2L), .Label = c("c__Alphaproteobacteria", 
    "c__Betaproteobacteria"), class = "factor"), Order = structure(c(2L, 
    2L, 2L, 1L), .Label = c("o__Burkholderiales", "o__Rhodospirillales"
    ), class = "factor"), Family = structure(c(1L, 1L, 1L, 2L
    ), .Label = c("f__Acetobacteraceae", "f__Comamonadaceae"), class = "factor"), 
    Genus = structure(c(1L, 1L, 1L, 2L), .Label = c("g__Acidiphilium", 
    "g__Acidovorax"), class = "factor"), Species = structure(c(1L, 
    1L, 2L, 3L), .Label = c("s__Acidiphilium_multivorum", "s__Acidiphilium_sp_PM", 
    "s__Acidovorax_avenae"), class = "factor")), class = "data.frame", row.names = c(NA, 
-4L))
# Lookup table: three factor columns (Gut_SRS011111, Gut_SRS011269,
# Gut_SRS011355) with three rows each. Cells hold the names that are searched
# for in df1$Species_name; "" marks a blank cell. Levels are given explicitly
# so that the factor codes do not depend on the locale's sort order.
df2 <- data.frame(
  Gut_SRS011111 = factor(
    c("Actinobaculum_unclassified", "", ""),
    levels = c("", "Actinobaculum_unclassified")
  ),
  Gut_SRS011269 = factor(
    c(
      "Clostridium_citroniae", "Acidiphilium_multivorum",
      "Acinetobacter_baumannii"
    ),
    levels = c(
      "Acidiphilium_multivorum", "Acinetobacter_baumannii",
      "Clostridium_citroniae"
    )
  ),
  Gut_SRS011355 = factor(
    c("Acidovorax_avene", "Streptococcus_gordonii", ""),
    levels = c("", "Acidovorax_avene", "Streptococcus_gordonii")
  )
)
# For every column of df2, write the rows of df1 whose Species_name contains
# any of that column's entries to "<column name>.csv" in the working directory.
#
# Blank ("") and NA entries are removed before the search pattern is built:
# an empty alternative in a regular expression (e.g. "Foo||") matches every
# string, which made each output file contain all rows of df1.
for (i in colnames(df2)) {
  terms <- as.character(df2[[i]])
  terms <- terms[!is.na(terms) & nzchar(terms)]
  if (length(terms) > 0) {
    # Entries are used as regular expressions and matched as substrings, so
    # "Acidovorax_avene" also finds "Acidovorax_avenae/Acidovorax_avene".
    hits <- grepl(paste(terms, collapse = "|"), df1$Species_name)
  } else {
    # No usable search term in this column: select no rows instead of
    # searching with an empty pattern, which would select all of them.
    hits <- rep(FALSE, nrow(df1))
  }
  overlap_data <- df1[hits, ]
  write.csv(overlap_data, file = paste0(i, ".csv"))
}
# Recode blank cells as NA so that they can be recognised and skipped below.
df2[df2 == ""] <- NA

# For every column of df2, write the rows of df1 whose Species_name contains
# any of that column's non-missing entries to "<column name>.csv".
# invisible() only suppresses printing of lapply()'s list of NULL results.
invisible(lapply(names(df2), function(x) {
  # Drop NA entries: paste0() would otherwise turn them into the literal text
  # "NA", adding an "NA" alternative that matches any name containing "NA".
  terms <- as.character(df2[[x]])
  terms <- terms[!is.na(terms)]
  if (length(terms) > 0) {
    hits <- grepl(paste0(terms, collapse = "|"), df1$Species_name)
  } else {
    # Column holds no search terms at all: select no rows.
    hits <- rep(FALSE, nrow(df1))
  }
  rr <- df1[hits, ]
  write.csv(rr, file = paste0(x, ".csv"))
}))