R:使用 data.table 进行“模糊”与非模糊的多对一合并

R:使用 data.table 进行“模糊”与非模糊的多对一合并,r,merge,data.table,fuzzyjoin,R,Merge,Data.table,Fuzzyjoin,假设我有两个数据库dfA和dfB。其中一个数据库有单独的观察结果,另一个数据库有国家级数据(适用于同一年和同一国家的多个观察结果),我创建了一个名为matchcode的键。此匹配代码是国家代码和年份的组合 dfA <- read.table( text = "A B C D E F G iso year matchcode 1 0 1 1 1 0 1 0 NLD 2010 NLD2010 2

假设我有两个数据库 `dfA` 和 `dfB`。其中一个数据库包含单独的观察结果,另一个数据库包含国家级数据(适用于同一年、同一国家的多个观察结果)。我创建了一个名为 `matchcode` 的键,该匹配代码由国家代码和年份组合而成。

   # Observation-level sample table `dfA`.
   # The header names 10 columns while every data row has 11 fields, so
   # read.table() takes the leading field (1..20) as row names rather than data.
   # Rows 11-20 repeat the values of rows 1-10.
   dfA <- read.table(
  text = "A   B   C   D   E   F   G   iso   year   matchcode
  1   0   1   1   1   0   1   0   NLD   2010   NLD2010
  2   1   0   0   0   1   0   1   NLD   2014   NLD2014
  3   0   0   0   1   1   0   0   AUS   2010   AUS2010
  4   1   0   1   0   0   1   0   AUS   2006   AUS2006
  5   0   1   0   1   0   1   1   USA   2008   USA2008
  6   0   0   1   0   0   0   1   USA   2010   USA2010
  7   0   1   0   1   0   0   0   USA   2012   USA2012
  8   1   0   1   0   0   1   0   BLG   2008   BLG2008
  9   0   1   0   1   1   0   1   BEL   2008   BEL2008
  10  1   0   1   0   0   1   0   BEL   2010   BEL2010
  11  0   1   1   1   0   1   0   NLD   2010   NLD2010
  12  1   0   0   0   1   0   1   NLD   2014   NLD2014
  13  0   0   0   1   1   0   0   AUS   2010   AUS2010
  14  1   0   1   0   0   1   0   AUS   2006   AUS2006
  15  0   1   0   1   0   1   1   USA   2008   USA2008
  16  0   0   1   0   0   0   1   USA   2010   USA2010
  17  0   1   0   1   0   0   0   USA   2012   USA2012
  18  1   0   1   0   0   1   0   BLG   2008   BLG2008
  19  0   1   0   1   1   0   1   BEL   2008   BEL2008
  20  1   0   1   0   0   1   0   BEL   2010   BEL2010",
  header = TRUE
)

   # Country-level sample table `dfB` (one row per iso/year combination).
   # As with a header that is one field shorter than the data rows,
   # read.table() takes the leading field (1..10) as row names.
   # NOTE(review): the USA 2011 row carries matchcode "USA2010", which does not
   # equal iso + year -- confirm whether this is intentional.
   dfB <- read.table(
  text = "A   B   C   D   H   I   J   iso   year   matchcode
  1   0   1   1   1   0   1   0   NLD   2009   NLD2009
  2   1   0   0   0   1   0   1   NLD   2014   NLD2014
  3   0   0   0   1   1   0   0   AUS   2011   AUS2011
  4   1   0   1   0   0   1   0   AUS   2007   AUS2007
  5   0   1   0   1   0   1   1   USA   2007   USA2007
  6   0   0   1   0   0   0   1   USA   2011   USA2010
  7   0   1   0   1   0   0   0   USA   2013   USA2013
  8   1   0   1   0   0   1   0   BLG   2007   BLG2007
  9   0   1   0   1   1   0   1   BEL   2009   BEL2009
  10   1   0   1   0   0   1   0  BEL   2012   BEL2012",
  header = TRUE
)

# Load data.table and convert both data frames to data.tables in place
# (setDT() changes the existing objects, so no reassignment is needed).
library(data.table)
setDT(dfA)
setDT(dfB)
其他来源:

以下是我(默认)的连接方法,使用 `data.table`:

代码

library(data.table)

# Give the key columns table-specific names so that, after the join, the
# values from dfA and from dfB both survive side by side.
setnames(
  dfA,
  old = c("matchcode", "iso", "year"),
  new = c("matchcodeA", "isoA", "yearA")
)
setnames(
  dfB,
  old = c("matchcode", "iso", "year"),
  new = c("matchcodeB", "isoB", "yearB")
)

# Record the desired final column layout now, before the helper join columns
# are added: all of dfA's columns first, then the columns only dfB has.
namesA <- as.character(names(dfA))
namesB <- as.character(setdiff(names(dfB), names(dfA)))
colorder <- c(namesA, namesB)

# Temporary join keys; joining on copies leaves isoA/yearA and isoB/yearB
# untouched in the result.
dfA[, c("iso.join", "year.join") := list(isoA, yearA)]
dfB[, c("iso.join", "year.join") := list(isoB, yearB)]

# Left join that keeps every row of dfA: iso.join is matched exactly and
# year.join (the last `on` column) is rolled to the nearest year in dfB.
result <- dfB[dfA, on = c("iso.join", "year.join"), roll = "nearest"]

# Remove the `i.`-prefixed duplicates (dfA's copies of the columns that exist
# in both tables, i.e. A-D), so the shared columns keep dfB's values ...
result[, grep("^i\\.", names(result)) := NULL]
# ... and remove the temporary join keys.
result[, grep("join$", names(result)) := NULL]

# Restore the column layout recorded above.
setcolorder(result, neworder = colorder)
样本数据

# Observation-level sample table `dfA`, read straight from a literal string
# (fread() treats input that contains newlines as the data itself).
# Rows 11-20 repeat the values of rows 1-10.
dfA <- fread(
  "A   B   C   D   E   F   G   iso   year   matchcode
  0   1   1   1   0   1   0   NLD   2010   NLD2010
     1   0   0   0   1   0   1   NLD   2014   NLD2014
     0   0   0   1   1   0   0   AUS   2010   AUS2010
     1   0   1   0   0   1   0   AUS   2006   AUS2006
     0   1   0   1   0   1   1   USA   2008   USA2008
     0   0   1   0   0   0   1   USA   2010   USA2010
     0   1   0   1   0   0   0   USA   2012   USA2012
     1   0   1   0   0   1   0   BLG   2008   BLG2008
     0   1   0   1   1   0   1   BEL   2008   BEL2008
    1   0   1   0   0   1   0   BEL   2010   BEL2010
    0   1   1   1   0   1   0   NLD   2010   NLD2010
    1   0   0   0   1   0   1   NLD   2014   NLD2014
    0   0   0   1   1   0   0   AUS   2010   AUS2010
    1   0   1   0   0   1   0   AUS   2006   AUS2006
    0   1   0   1   0   1   1   USA   2008   USA2008
    0   0   1   0   0   0   1   USA   2010   USA2010
    0   1   0   1   0   0   0   USA   2012   USA2012
    1   0   1   0   0   1   0   BLG   2008   BLG2008
    0   1   0   1   1   0   1   BEL   2008   BEL2008
    1   0   1   0   0   1   0   BEL   2010   BEL2010",
  header = TRUE
)


# Country-level sample table `dfB` (one row per iso/year combination), read
# from a literal string.
# NOTE(review): the USA 2011 row carries matchcode "USA2010", which does not
# equal iso + year -- confirm whether this is intentional.
dfB <- fread(
  "A   B   C   D   H   I   J   iso   year   matchcode
     0   1   1   1   0   1   0   NLD   2009   NLD2009
     1   0   0   0   1   0   1   NLD   2014   NLD2014
     0   0   0   1   1   0   0   AUS   2011   AUS2011
     1   0   1   0   0   1   0   AUS   2007   AUS2007
     0   1   0   1   0   1   1   USA   2007   USA2007
     0   0   1   0   0   0   1   USA   2011   USA2010
     0   1   0   1   0   0   0   USA   2013   USA2013
     1   0   1   0   0   1   0   BLG   2007   BLG2007
     0   1   0   1   1   0   1   BEL   2009   BEL2009
     1   0   1   0   0   1   0  BEL   2012   BEL2012",
  header = TRUE
)

您似乎想为 dfA 中的每一行在 dfB 中找到匹配项。这是否为您提供了所需的输出:
dfB[dfA, on = .(iso, year), roll = "nearest", nomatch = 0]
?感谢您的关注,mt1022!就示例数据而言确实如此,但遗憾的是,我在实际数据集中仍然丢失了大约 14000 个观察值。顺便说一句,这已经比以前少了很多!我猜 dfA 中的一些代码在 dfB 中不存在。您可以在连接(join)中设置 `nomatch = NA`,并检查哪些行得到了 NA 值。我删除了前一列,因为我认为有问题。经过更彻底的检查,它确实工作得很好。非常感谢你的帮助。一个很小的问题。我注意到除了
matchcodeB
,还有
year
iso
a是空的(这是我的困惑)。是否可以调整代码,使
iso
year
dfA
保持不变?@Tom当然可以给他们唯一的名字。请参阅更新的答案,希望您仍然在这里:我一直在使用您的答案合并许多数据库。例如,A的数字为1-8,然后B的数字为1-8。第一次合并进行得很顺利,但在
此之后,`colorder` 中出现了重复项的问题。为了解决这件事我简直要疯了。你知道它为什么会发生以及如何预防吗?我是否也应该从其他数据库中删除 `i.` 和 `.join` 列?不过,在进行第二次合并之前,我通过删除每个 `dfB` 中不需要的所有变量来修复它。这似乎奏效了。
# A tibble: 11 x 18
       A     B     C     D     E     F     G iso    year matchcode     K     L     M     N     O     P     Q i.matchcode
   <int> <int> <int> <int> <int> <int> <int> <fct> <int> <fct>     <int> <int> <int> <int> <int> <int> <int> <fct>      
 1     0     1     1     1     0     1     0 NLD    2009 NLD2010       0     1     1     1     0     1     0 NLD2009    
 2     1     0     0     0     1     0     1 NLD    2014 NLD2014       1     0     0     0     1     0     1 NLD2014    
 3     1     0     0     0     1     0     1 NLD    2014 NLD2014       1     0     0     0     1     0     1 NLD2014    
 4     0     0     0     1     1     0     0 AUS    2011 AUS2010       0     0     0     1     1     0     0 AUS2011    
 5     1     0     1     0     0     1     0 AUS    2007 AUS2006       1     0     1     0     0     1     0 AUS2007    
 6     0     1     0     1     0     1     1 USA    2007 USA2008       0     1     0     1     0     1     1 USA2007    
 7     0     0     1     0     0     0     1 USA    2011 USA2010       0     0     1     0     0     0     1 USA2010    
 8     0     1     0     1     0     0     0 USA    2013 USA2012       0     1     0     1     0     0     0 USA2013    
 9     1     0     1     0     0     1     0 BLG    2007 BLG2008       1     0     1     0     0     1     0 BLG2007    
10     0     1     0     1     1     0     1 BEL    2009 BEL2008       0     1     0     1     1     0     1 BEL2009    
11     1     0     1     0     0     1     0 BEL    2012 BEL2010       1     0     1     0     0     1     0 BEL2012   
#    A B C D E F G iso year matchcodeA H I J matchcodeB
# 1: 1 0 0 0 1 0 1 NLD  2014  NLD2014  1 0 1    NLD2014
# 2: 0 0 0 1 1 0 0 AUS  2011  AUS2010  1 0 0    AUS2011
# 3: 1 0 1 0 0 1 0 AUS  2007  AUS2006  0 1 0    AUS2007
# 4: 0 0 1 0 0 0 1 USA  2011  USA2010  0 0 1    USA2010
# 5: 0 1 0 1 0 0 0 USA  2013  USA2012  0 0 0    USA2013
# 6: 0 1 0 1 1 0 1 BEL  2009  BEL2008  1 0 1    BEL2009
# 7: 0 1 1 1 0 1 0 NLD  2009  NLD2010  0 1 0    NLD2009
# 8: 0 1 0 1 0 1 1 USA  2007  USA2008  0 1 1    USA2007
# 9: 0 1 0 1 0 0 0 USA  2011  USA2012  0 0 1    USA2010
#10: 1 0 1 0 0 1 0 BEL  2009  BEL2010  1 0 1    BEL2009
#11: 1 0 0 0 1 0 1 NLD  2014  NLD2014  1 0 1    NLD2014
#12: 0 0 0 1 1 0 0 AUS  2011  AUS2010  1 0 0    AUS2011
#13: 1 0 1 0 0 1 0 AUS  2007  AUS2006  0 1 0    AUS2007
#14: 0 0 1 0 0 0 1 USA  2011  USA2010  0 0 1    USA2010
#15: 0 1 0 1 0 0 0 USA  2013  USA2012  0 0 0    USA2013
#16: 0 1 0 1 1 0 1 BEL  2009  BEL2008  1 0 1    BEL2009
#17: 0 1 1 1 0 1 0 NLD  2009  NLD2010  0 1 0    NLD2009
#18: 0 1 0 1 0 1 1 USA  2007  USA2008  0 1 1    USA2007
#19: 0 1 0 1 0 0 0 USA  2011  USA2012  0 0 1    USA2010
#20: 1 0 1 0 0 1 0 BEL  2009  BEL2010  1 0 1    BEL2009
library(data.table)

# Give the key columns table-specific names so that, after the join, the
# values from dfA and from dfB both survive side by side.
setnames(
  dfA,
  old = c("matchcode", "iso", "year"),
  new = c("matchcodeA", "isoA", "yearA")
)
setnames(
  dfB,
  old = c("matchcode", "iso", "year"),
  new = c("matchcodeB", "isoB", "yearB")
)

# Record the desired final column layout now, before the helper join columns
# are added: all of dfA's columns first, then the columns only dfB has.
namesA <- as.character(names(dfA))
namesB <- as.character(setdiff(names(dfB), names(dfA)))
colorder <- c(namesA, namesB)

# Temporary join keys; joining on copies leaves isoA/yearA and isoB/yearB
# untouched in the result.
dfA[, c("iso.join", "year.join") := list(isoA, yearA)]
dfB[, c("iso.join", "year.join") := list(isoB, yearB)]

# Left join that keeps every row of dfA: iso.join is matched exactly and
# year.join (the last `on` column) is rolled to the nearest year in dfB.
result <- dfB[dfA, on = c("iso.join", "year.join"), roll = "nearest"]

# Remove the `i.`-prefixed duplicates (dfA's copies of the columns that exist
# in both tables, i.e. A-D), so the shared columns keep dfB's values ...
result[, grep("^i\\.", names(result)) := NULL]
# ... and remove the temporary join keys.
result[, grep("join$", names(result)) := NULL]

# Restore the column layout recorded above.
setcolorder(result, neworder = colorder)
#     A B C D E F G isoA yearA matchcodeA H I J isoB yearB matchcodeB
#  1: 0 1 1 1 0 1 0  NLD  2010    NLD2010 0 1 0  NLD  2009    NLD2009
#  2: 1 0 0 0 1 0 1  NLD  2014    NLD2014 1 0 1  NLD  2014    NLD2014
#  3: 0 0 0 1 1 0 0  AUS  2010    AUS2010 1 0 0  AUS  2011    AUS2011
#  4: 1 0 1 0 0 1 0  AUS  2006    AUS2006 0 1 0  AUS  2007    AUS2007
#  5: 0 1 0 1 0 1 1  USA  2008    USA2008 0 1 1  USA  2007    USA2007
#  6: 0 0 1 0 0 0 1  USA  2010    USA2010 0 0 1  USA  2011    USA2010
#  7: 0 0 1 0 0 0 0  USA  2012    USA2012 0 0 1  USA  2011    USA2010
#  8: 1 0 1 0 0 1 0  BLG  2008    BLG2008 0 1 0  BLG  2007    BLG2007
#  9: 0 1 0 1 1 0 1  BEL  2008    BEL2008 1 0 1  BEL  2009    BEL2009
# 10: 0 1 0 1 0 1 0  BEL  2010    BEL2010 1 0 1  BEL  2009    BEL2009
# 11: 0 1 1 1 0 1 0  NLD  2010    NLD2010 0 1 0  NLD  2009    NLD2009
# 12: 1 0 0 0 1 0 1  NLD  2014    NLD2014 1 0 1  NLD  2014    NLD2014
# 13: 0 0 0 1 1 0 0  AUS  2010    AUS2010 1 0 0  AUS  2011    AUS2011
# 14: 1 0 1 0 0 1 0  AUS  2006    AUS2006 0 1 0  AUS  2007    AUS2007
# 15: 0 1 0 1 0 1 1  USA  2008    USA2008 0 1 1  USA  2007    USA2007
# 16: 0 0 1 0 0 0 1  USA  2010    USA2010 0 0 1  USA  2011    USA2010
# 17: 0 0 1 0 0 0 0  USA  2012    USA2012 0 0 1  USA  2011    USA2010
# 18: 1 0 1 0 0 1 0  BLG  2008    BLG2008 0 1 0  BLG  2007    BLG2007
# 19: 0 1 0 1 1 0 1  BEL  2008    BEL2008 1 0 1  BEL  2009    BEL2009
# 20: 0 1 0 1 0 1 0  BEL  2010    BEL2010 1 0 1  BEL  2009    BEL2009
# Observation-level sample table `dfA`, read straight from a literal string
# (fread() treats input that contains newlines as the data itself).
# Rows 11-20 repeat the values of rows 1-10.
dfA <- fread(
  "A   B   C   D   E   F   G   iso   year   matchcode
  0   1   1   1   0   1   0   NLD   2010   NLD2010
     1   0   0   0   1   0   1   NLD   2014   NLD2014
     0   0   0   1   1   0   0   AUS   2010   AUS2010
     1   0   1   0   0   1   0   AUS   2006   AUS2006
     0   1   0   1   0   1   1   USA   2008   USA2008
     0   0   1   0   0   0   1   USA   2010   USA2010
     0   1   0   1   0   0   0   USA   2012   USA2012
     1   0   1   0   0   1   0   BLG   2008   BLG2008
     0   1   0   1   1   0   1   BEL   2008   BEL2008
    1   0   1   0   0   1   0   BEL   2010   BEL2010
    0   1   1   1   0   1   0   NLD   2010   NLD2010
    1   0   0   0   1   0   1   NLD   2014   NLD2014
    0   0   0   1   1   0   0   AUS   2010   AUS2010
    1   0   1   0   0   1   0   AUS   2006   AUS2006
    0   1   0   1   0   1   1   USA   2008   USA2008
    0   0   1   0   0   0   1   USA   2010   USA2010
    0   1   0   1   0   0   0   USA   2012   USA2012
    1   0   1   0   0   1   0   BLG   2008   BLG2008
    0   1   0   1   1   0   1   BEL   2008   BEL2008
    1   0   1   0   0   1   0   BEL   2010   BEL2010",
  header = TRUE
)


# Country-level sample table `dfB` (one row per iso/year combination), read
# from a literal string.
# NOTE(review): the USA 2011 row carries matchcode "USA2010", which does not
# equal iso + year -- confirm whether this is intentional.
dfB <- fread(
  "A   B   C   D   H   I   J   iso   year   matchcode
     0   1   1   1   0   1   0   NLD   2009   NLD2009
     1   0   0   0   1   0   1   NLD   2014   NLD2014
     0   0   0   1   1   0   0   AUS   2011   AUS2011
     1   0   1   0   0   1   0   AUS   2007   AUS2007
     0   1   0   1   0   1   1   USA   2007   USA2007
     0   0   1   0   0   0   1   USA   2011   USA2010
     0   1   0   1   0   0   0   USA   2013   USA2013
     1   0   1   0   0   1   0   BLG   2007   BLG2007
     0   1   0   1   1   0   1   BEL   2009   BEL2009
     1   0   1   0   0   1   0  BEL   2012   BEL2012",
  header = TRUE
)