R函数在左连接中回切？_R - Fatal编程技术网

R函数在左连接中回切？

R函数在左连接中回切？,r,R,我有20多个不同的数据表，它们的数据由同一个编码系统（北美工业分类系统，NAICS）索引，我想将它们合并到一个表中问题是，每个表中都有不同的详细级别，当我加入时，我希望通过在找到匹配之前破坏编码系统的层次结构来找到最佳匹配常规的left\u连接将不起作用，因为不会始终存在精确匹配。我已经看过了fuzzyjoin包，但它有点让我不知所措我想从一个代码表开始： t_master # A tibble: 360 x 1 NAICS17 <chr> 1 311111

我有20多个不同的数据表，它们的数据由同一个编码系统（北美工业分类系统，NAICS）索引，我想将它们合并到一个表中

问题是，每个表的详细程度各不相同。在进行连接时，我希望沿着编码系统的层次结构逐级回退，直到找到匹配为止，从而得到最佳匹配。

常规的 `left_join` 行不通，因为并不总是存在精确匹配。我已经看过 fuzzyjoin 包，但它让我有点不知所措。
我想从一个代码表开始：
t_master
# A tibble: 360 x 1
   NAICS17
   <chr>  
 1 311111 
 2 311119 
 3 311211 
 4 311212 
 5 311213 
 6 311221 
 7 311224 
 8 311225 
 9 311230 
10 311313 
# ... with 350 more rows

# Sample of the master code table: one integer column of six-digit NAICS
# codes, stored as a plain data.frame with default row names.
t_master <- data.frame(
  NAICS17 = c(
    311111L, 311119L, 311211L, 311212L, 311213L,
    311221L, 311224L, 311225L, 311230L, 311313L
  )
)

t_master
# A tibble: 360 x 1
NAICS17
1 311111 
2 311119 
3 311211 
4 311212 
5 311213 
6 311221 
7 311224 
8 311225 
9 311230 
10 311313 
# ... 还有350行
t_master

我将执行一系列更新联接：
library(data.table)

# Candidate prefix lengths: 1 up to the character count of the longest
# NAICS17 value in the master table.
ncs <- seq_len(max(nchar(t_master$NAICS17)))

# Column bookkeeping for the update join:
#   nms  - column names of the lookup table
#   xnms - the same names with the "x." prefix, used to fetch them in the join
#   tnms - destination names in `out`; the lookup key is stored as `m`
nms <- copy(names(t_asm))
xnms <- sprintf("x.%s", nms)
tnms <- replace(nms, nms == "NAICS17", "m")

t_asm2 <- data.table(t_asm)
out <- data.table(t_master)

# Join on character codes, and start with every row marked as unmatched.
out[, `:=`(NAICS17 = as.character(NAICS17), m = NA_character_)]

# Try the longest prefix first and shorten it one character per pass. Each
# pass only touches rows whose `m` is still NA.
for (nc in rev(ncs)) {
  out[is.na(m), target := substr(NAICS17, 1, nc)]
  out[
    is.na(m),
    (tnms) := t_asm2[.SD, on = .(NAICS17 = target), mget(xnms)][]
  ]
  # Stop as soon as no row is left unmatched.
  if (!anyNA(out$m)) {
    break
  }
}

# Drop the helper column and print the result.
out[, target := NULL][]

    NAICS17      m  CEXBLD  CEXMCH
 1:  311111 311111   49756  225494
 2:  311119 311119   42673  291702
 3:  311211  31121   75310  267693
 4:  311212  31121   75310  267693
 5:  311213  31121   75310  267693
 6:  311221  31122   94339  546407
 7:  311224  31122   94339  546407
 8:  311225  31122   94339  546407
 9:  311230   3112  192911 1016770
10:  311313    311 2099542 9063451

library(data.table)
ncs  = seq_len(max(nchar(t_master$NAICS17)))
nms  = copy(names(t_asm))
xnms = sprintf("x.%s", nms)
tnms = replace(nms, nms == "NAICS17", "m")
抱歉，修好了。你完全正确。这太棒了！非常感谢你的帮助！按照我的理解，循环基本上是在开始时先填入 NA，然后逐渐地将代码构建得越来越详细，看看是否可以在此基础上进行改进？很抱歉，我花了这么长时间才逐行弄清楚发生了什么。
@Paul 哦，我忘了将其转换为 data.table。我已经编辑为使用 t_asm2。
非常感谢！你帮我节省了大量时间。
@Paul 嗯，这听起来更适合作为一个单独的问题提出，因为我不太理解这个扩展，其他人可能对如何解决这个问题有不同的想法。
t_brdis_2015
# A tibble: 90 x 3
   NAICS17 rdcost_total rdcost_wage
   <chr>   <chr>        <chr>      
 1 0       355821       204170     
 2 31      236132       129375     
 3 32      236132       129375     
 4 33      236132       129375     
 5 311     4838         2945       
 6 312     1002         532        
 7 313     748          481        
 8 314     748          481        
 9 315     748          481        
10 316     748          481        
# ... with 80 more rows

# Sample of the 2015 BRDIS table: NAICS codes at mixed levels of detail plus
# two cost columns. Every column is character, as in the original literal.
t_brdis_2015 <- data.frame(
  NAICS17 = c(
    "0", "31", "32", "33", "311", "312", "313", "314", "315", "316"
  ),
  rdcost_total = c(
    "355821", "236132", "236132", "236132", "4838",
    "1002", "748", "748", "748", "748"
  ),
  rdcost_wage = c(
    "204170", "129375", "129375", "129375", "2945",
    "532", "481", "481", "481", "481"
  ),
  stringsAsFactors = FALSE
)
# Restore the tibble class vector carried by the original object.
class(t_brdis_2015) <- c("tbl_df", "tbl", "data.frame")

library(data.table)

# Candidate prefix lengths: 1 up to the character count of the longest
# NAICS17 value in the master table.
ncs <- seq_len(max(nchar(t_master$NAICS17)))

# Column bookkeeping for the update join:
#   nms  - column names of the lookup table
#   xnms - the same names with the "x." prefix, used to fetch them in the join
#   tnms - destination names in `out`; the lookup key is stored as `m`
nms <- copy(names(t_asm))
xnms <- sprintf("x.%s", nms)
tnms <- replace(nms, nms == "NAICS17", "m")

t_asm2 <- data.table(t_asm)
out <- data.table(t_master)

# Join on character codes, and start with every row marked as unmatched.
out[, `:=`(NAICS17 = as.character(NAICS17), m = NA_character_)]

# Try the longest prefix first and shorten it one character per pass. Each
# pass only touches rows whose `m` is still NA.
for (nc in rev(ncs)) {
  out[is.na(m), target := substr(NAICS17, 1, nc)]
  out[
    is.na(m),
    (tnms) := t_asm2[.SD, on = .(NAICS17 = target), mget(xnms)][]
  ]
  # Stop as soon as no row is left unmatched.
  if (!anyNA(out$m)) {
    break
  }
}

# Drop the helper column and print the result.
out[, target := NULL][]

    NAICS17      m  CEXBLD  CEXMCH
 1:  311111 311111   49756  225494
 2:  311119 311119   42673  291702
 3:  311211  31121   75310  267693
 4:  311212  31121   75310  267693
 5:  311213  31121   75310  267693
 6:  311221  31122   94339  546407
 7:  311224  31122   94339  546407
 8:  311225  31122   94339  546407
 9:  311230   3112  192911 1016770
10:  311313    311 2099542 9063451