R:在尽可能多的列上依次连接两个数据框,直到 df1 中的每一行都得到匹配

R 在尽可能多的列上串行连接两个数据帧,直到df1中的每一行都匹配为止,r,dataframe,join,R,Dataframe,Join,我有两个数据帧需要连接。目标是连接尽可能多的列,并从最多到最少的精确列匹配顺序进行。我需要解决K列的问题。对于匹配j列(j% 不同的(B,C,.keep_all=T)%>% 不同的(A,C,.keep_all=T)%>% 行() 变异(id2=样本(字母,1,替换=T)) #在3列上匹配 列匹配3=df1%>% 内部联接(df2,by=c(“A”、“B”、“c”))%>% 不同(id1、A、B、C、.keep_all=T) #两列上的序列匹配 列匹配=df1%>% 反联接(列匹配3,由=c(“A

我有两个数据框需要连接。目标是在尽可能多的列上进行连接,并按精确匹配列数从多到少的顺序依次进行。我需要针对 K 列的一般情况解决这个问题。对于匹配 j 列(j……(原文在此处被截断)
# 3-column toy example ----
# Builds the two random data frames used by the serial-join demo:
#   df1: the rows to be matched, each tagged with a random letter `id1`.
#   df2: the lookup table, de-duplicated so that every (A, B, C) triple and
#        every pair (A, B), (B, C), (A, C) occurs at most once; each
#        remaining row is tagged with a random letter `id2`.
library(dplyr)

df1_rows <- 50
df2_rows <- 100
set.seed(1) # fixed seed so the example is reproducible

# Keep the sample() calls in this order so the seeded draws stay the same.
df1 <- data.frame(
  id1 = sample(LETTERS, df1_rows, replace = TRUE),
  A = sample(1:5, df1_rows, replace = TRUE),
  B = sample(1:5, df1_rows, replace = TRUE),
  C = sample(1:5, df1_rows, replace = TRUE)
)
df1 # print for inspection

df2 <- data.frame(
  A = sample(1:5, df2_rows, replace = TRUE),
  B = sample(1:5, df2_rows, replace = TRUE),
  C = sample(1:5, df2_rows, replace = TRUE)
) %>%
  distinct(A, B, C, .keep_all = TRUE) %>%
  distinct(A, B, .keep_all = TRUE) %>%
  distinct(B, C, .keep_all = TRUE) %>%
  distinct(A, C, .keep_all = TRUE) %>%
  # rowwise() makes sample() draw one letter per row rather than one letter
  # recycled over the whole column.
  rowwise() %>%
  mutate(id2 = sample(LETTERS, 1, replace = TRUE)) %>%
  # Drop the rowwise grouping so later verbs on df2 are not silently
  # evaluated one row at a time.
  ungroup()

# Full match ----
# df1 rows whose (A, B, C) triple also appears in df2, with df2's `id2`
# attached. Rows that repeat the same (id1, A, B, C) are kept only once.
col_match_3 <- distinct(
  inner_join(df1, df2, by = c("A", "B", "C")),
  id1, A, B, C,
  .keep_all = TRUE
)

# Serial two-column matches ----
# Each stage starts from df1, removes the rows already claimed by earlier
# stages, and then joins the remainder to df2 on one column pair. Pairs are
# tried in the fixed order (A, B) -> (B, C) -> (A, C). The column that is not
# part of the key is dropped from df2 so the result keeps df1's value for it.

# Stage (A, B): rows without a full three-column match.
col_match_AB <- anti_join(df1, col_match_3, by = c("A", "B", "C"))
col_match_AB <- inner_join(col_match_AB, select(df2, -C), by = c("A", "B"))
col_match_AB <- distinct(col_match_AB, id1, A, B, C, .keep_all = TRUE)

# Stage (B, C): additionally skip rows already matched on (A, B).
col_match_BC <- anti_join(df1, col_match_3, by = c("A", "B", "C"))
col_match_BC <- anti_join(col_match_BC, col_match_AB, by = c("A", "B"))
col_match_BC <- inner_join(col_match_BC, select(df2, -A), by = c("B", "C"))
col_match_BC <- distinct(col_match_BC, id1, A, B, C, .keep_all = TRUE)

# Stage (A, C): additionally skip rows already matched on (B, C).
col_match_AC <- anti_join(df1, col_match_3, by = c("A", "B", "C"))
col_match_AC <- anti_join(col_match_AC, col_match_AB, by = c("A", "B"))
col_match_AC <- anti_join(col_match_AC, col_match_BC, by = c("B", "C"))
col_match_AC <- inner_join(col_match_AC, select(df2, -B), by = c("A", "C"))
col_match_AC <- distinct(col_match_AC, id1, A, B, C, .keep_all = TRUE)

# Join on one column ----
# df1 rows still unmatched after the three-column and two-column stages.
# Computed once and shared by every single-column stage below.
unmatched_after_pairs <- df1 %>%
  anti_join(col_match_3, by = c("A", "B", "C")) %>%
  anti_join(col_match_AB, by = c("A", "B")) %>%
  anti_join(col_match_BC, by = c("B", "C")) %>%
  anti_join(col_match_AC, by = c("A", "C"))

# df2 is not unique on a single column, so each join below can return several
# df2 rows per df1 row. distinct() on df1's own columns keeps only the first
# of them, giving one `id2` per df1 row.
col_match_A <- unmatched_after_pairs %>%
  inner_join(df2 %>% select(id2, A), by = "A") %>%
  distinct(id1, A, B, C, .keep_all = TRUE)

# Previously de-duplicated on all columns (including id2), which left the
# join fan-out in place; now keyed on df1's columns like the other stages.
col_match_B <- unmatched_after_pairs %>%
  anti_join(col_match_A, by = "A") %>%
  inner_join(df2 %>% select(id2, B), by = "B") %>%
  distinct(id1, A, B, C, .keep_all = TRUE)

# Last single-column stage, so rows that only agree with df2 on C are not
# dropped from the result.
col_match_C <- unmatched_after_pairs %>%
  anti_join(col_match_A, by = "A") %>%
  anti_join(col_match_B, by = "B") %>%
  inner_join(df2 %>% select(id2, C), by = "C") %>%
  distinct(id1, A, B, C, .keep_all = TRUE)

# Final result ----
# Stack the stages from most to fewest matched columns. The stages are
# mutually exclusive by construction of the anti_join chains above.
res <- bind_rows(
  col_match_3,
  col_match_AB,
  col_match_BC,
  col_match_AC,
  col_match_A,
  col_match_B,
  col_match_C
)

res
#3列玩具示例
图书馆(dplyr)
df1_行=50
df2_行=100
种子(1)
df1%
不同的(A,B,.keep_all=T)%>%
不同的(B,C,.keep_all=T)%>%
不同的(A,C,.keep_all=T)%>%
行()
变异(id2=样本(字母,1,替换=T))
#在3列上匹配
列匹配3=df1%>%
内部联接(df2,by=c(“A”、“B”、“c”))%>%
不同(id1、A、B、C、.keep_all=T)
#两列上的序列匹配
列匹配=df1%>%
反联接(列匹配3,由=c(“A”、“B”、“c”))%>%
内部联接(df2%>%select(-C),by=C(“A”,“B”))%>%
不同(id1、A、B、C、.keep_all=T)
col_match_BC=df1%>%
反联接(列匹配3,由=c(“A”、“B”、“c”))%>%
反连接(col\u match\u AB,by=c(“A”,“B”))%>%
内部联接(df2%>%select(-A),by=c(“B”,“c”))%>%
不同(id1、A、B、C、.keep_all=T)
列匹配=df1%>%
反联接(列匹配3,由=c(“A”、“B”、“c”))%>%
反连接(col\u match\u AB,by=c(“A”,“B”))%>%
反连接(col\u match\u BC,by=c(“B”,“c”))%>%
内部联接(df2%>%select(-B),by=c(“A”,“c”))%>%
不同(id1、A、B、C、.keep_all=T)
#连成一列
列匹配=df1%>%
反联接(列匹配3,由=c(“A”、“B”、“c”))%>%
反连接(col\u match\u AB,by=c(“A”,“B”))%>%
反连接(col\u match\u BC,by=c(“B”,“c”))%>%
反连接(col\u match\u AC,by=c(“A”,“c”))%>%
内部联接(df2%%>%select(id2,A),by=“A”)%%>%
不同(id1、A、B、C、.keep_all=T)
列匹配=df1%>%
反联接(列匹配3,由=c(“A”、“B”、“c”))%>%
反连接(col\u match\u AB,by=c(“A”,“B”))%>%
反连接(col\u match\u BC,by=c(“B”,“c”))%>%
反连接(col\u match\u AC,by=c(“A”,“c”))%>%
反联接(列匹配A,by=“A”)%>%
内部联接(df2%%>%select(id2,B),by=“B”)%%>%
不同(.keep_all=T)
res=
列匹配\u 3%>%
绑定行(列匹配对象)%>%
绑定行(列匹配)%>%
绑定行(列匹配)%>%
绑定行(列匹配)
物件