R 按组列出的重复观察的百分比

R 按组列出的重复观察的百分比,r,dplyr,data.table,R,Dplyr,Data.table,我有一个公司员工随时间变化的数据集,如下所示 data.table(firm = c(rep("A", 8), rep("B", 8)), employee = c(1, 2, 3, 4, 1, 2, 3, NA, 5, 6, NA, NA, 5, 6, 7, 8), year = c(rep(1, 4), rep(2, 4))) firm employee_id year A 1 1 A 2

我有一个公司员工随时间变化的数据集,如下所示

# Toy panel: two firms (A and B), each observed in year 1 and year 2 with
# four employee slots per year; NA marks a slot without a known employee.
data.table(
  firm = rep(c("A", "B"), each = 8),
  employee = c(1, 2, 3, 4, 1, 2, 3, NA, 5, 6, NA, NA, 5, 6, 7, 8),
  # Spelled out for all 16 rows instead of relying on recycling.
  year = rep(rep(c(1, 2), each = 4), times = 2)
)

  firm employee_id year
    A        1    1
    A        2    1
    A        3    1
    A        4    1
    A        1    2
    A        2    2
    A        3    2
    A       NA    2
    B        5    1
    B        6    1
    B       NA    1
    B       NA    1
    B        5    2
    B        6    2
    B        7    2
    B        8    2
firm year continued_employees
 A    2     0.75
 B    2     1
我想计算每家公司在 year == 1 的员工中,到 year == 2 时仍在职的员工所占的百分比

输出应该是这样的

# Toy panel: two firms (A and B), each observed in year 1 and year 2 with
# four employee slots per year; NA marks a slot without a known employee.
data.table(
  firm = rep(c("A", "B"), each = 8),
  employee = c(1, 2, 3, 4, 1, 2, 3, NA, 5, 6, NA, NA, 5, 6, 7, 8),
  # Spelled out for all 16 rows instead of relying on recycling.
  year = rep(rep(c(1, 2), each = 4), times = 2)
)

  firm employee_id year
    A        1    1
    A        2    1
    A        3    1
    A        4    1
    A        1    2
    A        2    2
    A        3    2
    A       NA    2
    B        5    1
    B        6    1
    B       NA    1
    B       NA    1
    B        5    2
    B        6    2
    B        7    2
    B        8    2
firm year continued_employees
 A    2     0.75
 B    2     1
我可以每年循环使用

sum(employee_id[year == 1] %in% employee_id[year == 2]) / length(employee_id[year == 1])


但我有大约 4 万家公司和 10 年的观测数据。关于如何用 dplyr 或 data.table 语法实现这一点,您有什么想法吗?

这里有一个不太漂亮的 data.table 方法,可以用于任意数量的公司和年份:

# Every year except the last one: each of them is compared with the year
# that follows it.
years <- head(sort(unique(dt$year)), -1)
# One result table per starting year, returned as a list named "Year-<y>".
setNames(
  lapply(years, function(y) {
    # (firm, employee) pairs that are present in year y.
    staff_y <- dt[(year == y), .(firm, employee)]
    # All rows (any year) that belong to those pairs.
    history <- dt[staff_y, on = .(firm, employee)]
    # Per known employee: TRUE when both year y and year y + 1 are observed.
    stayed <- history[
      !is.na(employee),
      all(c(y, y + 1) %in% year),
      by = .(employee, firm)
    ]
    # Share of year-y employees per firm that are still there in year y + 1.
    stayed[, .(continued = mean(V1), year = y + 1), by = firm]
  }),
  paste("Year", years, sep = "-")
)

#$`Year-1`
#   firm continued year
#1:    A      0.75    2
#2:    B      1.00    2

这里有一种稍微不同的方法:

# Distinct, non-missing employees for every (year, firm) pair, stored as a
# list column. NAs are dropped here so that an unknown employee that shows up
# in both years is never counted as "continued".
dt <- dat[, list(all = .(unique(employee[!is.na(employee)]))),
          by = list(year, firm)]
# One row per firm: year1 / year2 hold the employee sets of the first and the
# second (year, firm) group found for that firm.
# NOTE(review): assumes `dat` lists the years of each firm in ascending order
# and that only two years are compared -- confirm for real data.
dt <- dt[, list(year1 = all[1L], year2 = all[2L]), by = firm]
# Share of first-year employees that are still present in the second year.
# The sets are already unique, so plain length() is enough; this also yields 0
# (instead of an error) when no employee continues.
dt[, length(intersect(year1[[1L]], year2[[1L]])) / length(year1[[1L]]),
   by = firm]
使用移位年份的连接(join)
这是一种使用带移位年份的自联接的方法:

library(data.table)
# Print the column classes below the column names.
options(datatable.print.class = TRUE)
# Self join with shifted year: the lookup table holds every row of DT with
# its year reduced by one. A row (firm, employee, y) of DT therefore finds a
# match exactly when a row (firm, employee, y + 1) exists, and only those
# rows get cont = TRUE (all other rows keep NA). `:=` updates DT by
# reference; the trailing [] prints the updated table.
DT[.(firm = firm, employee = employee, year = year - 1), 
   on = .(firm, employee, year), cont := TRUE][]
# Aggregate: per firm and year, the number of flagged rows divided by the
# number of rows with a known employee. The result is labelled with the
# following year (year + 1).
DT[!is.na(employee), sum(cont, na.rm = TRUE) / .N, by = .(firm, year = year + 1)][
  # Beautify result: drop the group labelled with a year beyond the last
  # observed one (the last year has no following year to compare with).
  year <= max(DT$year)]
使用 shift()
或者,可以使用 shift() 函数计算 cont 列。聚合部分与上面的连接方法相同。
使用 shift() 时需要确保数据按年份排序。

# Variant with shift(): rows are taken in year order and, within each
# (firm, employee) group, cont is TRUE when the next observed year equals
# year + 1. The last row of a group has no "lead" value, so its cont is NA.
DT[order(year), cont := shift(year, type = "lead") == year + 1, by = .(firm, employee)][
  # Aggregate as in the join approach: flagged rows / rows with a known
  # employee, labelled with the following year.
  !is.na(employee), sum(cont, na.rm = TRUE) / .N, by = .(firm, year = year + 1)][
    # Drop the group labelled with a year beyond the last observed one.
    year <= max(DT$year)]
样本数据集由 400 万(4 M)行组成,将其从长格式重塑为宽格式后最便于查看:

# Reshape the rows with a known employee to wide format: one row per
# (firm, employee) and one column per year. No value.var is given, so dcast
# reports which column it picks as the value column.
dcast(DT0[!is.na(employee)], firm + employee ~ year)
基准代码:

# Benchmark of the three approaches on DT0. Every expression starts from its
# own copy of DT0 and returns a table with columns firm, year, continued.
microbenchmark::microbenchmark(
  # lapply over the years, one join per year; results stacked with rbindlist.
  dd = {
    dt <- copy(DT0)
    years <- head(sort(unique(dt$year)), -1)
    rbindlist(
      setNames(lapply(years, function(y) {
        dt[dt[(year == y), .(firm, employee)], on = .(firm, employee)][
          !is.na(employee), all(c(y, y+1) %in% year), by = .(employee, firm)][
            , .(continued = mean(V1), year = y+1), by = firm]
      }), paste("Year", years, sep="-"))
    )
  },
  # Self join with the year shifted by one; cont is added by reference.
  join = {
    DT <- copy(DT0)
    DT[.(firm = firm, employee = employee, year = year - 1), 
       on = .(firm, employee, year), cont := TRUE][
         !is.na(employee), .(continued = sum(cont, na.rm = TRUE) / .N), 
         by = .(firm, year = year + 1)][
           year <= max(DT$year)]
  },
  # shift() within each (firm, employee) group, rows taken in year order.
  shift = {
    DT <- copy(DT0)
    DT[order(year), cont := shift(year, type = "lead") == year + 1, 
       by = .(firm, employee)][
         !is.na(employee), .(continued = sum(cont, na.rm = TRUE) / .N), 
         by = .(firm, year = year + 1)][
           year <= max(DT$year)]
  },
  # my_check receives the results of all expressions and must return TRUE.
  check = my_check,
  # Each expression is evaluated three times.
  times = 3L
)

完美!我在尝试用 data.table 语法写第一个连接时就卡住了。
这个方法不太好看,但简洁高效;您可以对结果使用 rbindlist 来得到单个表输出。
Using 'year' as value column. Use 'value.var' to override
          firm employee  2001  2002  2003  2004  2005  2006  2007  2008  2009  2010
        <char>    <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
     1: 000001        1  2001  2002  2003  2004  2005  2006  2007  2008  2009  2010
     2: 000001        2  2001  2002  2003    NA  2005  2006  2007    NA  2009    NA
     3: 000001        3  2001  2002    NA    NA  2005  2006  2007  2008  2009  2010
     4: 000001        4  2001    NA    NA    NA  2005  2006  2007  2008    NA  2010
    ---                                                                            
399996: 040000        6  2001  2002    NA  2004  2005    NA    NA    NA  2009  2010
399997: 040000        7    NA  2002    NA    NA  2005  2006  2007  2008  2009  2010
399998: 040000        8  2001  2002  2003    NA    NA    NA  2007    NA    NA  2010
399999: 040000        9  2001  2002  2003    NA  2005  2006  2007  2008  2009    NA
400000: 040000       10  2001  2002  2003    NA    NA  2006  2007  2008  2009  2010
# Check function for microbenchmark(): returns TRUE when every benchmarked
# expression produced the same result as the first one.
#
# values: list of results, each a data.table with the columns firm, year and
#         continued.
# Each result is first reshaped to wide format (one row per firm, one column
# per year), then compared against the first reshaped result.
my_check <- function(values) {
  wide <- lapply(values, function(res) {
    res[, dcast(.SD, firm ~ year, value.var = "continued")]
  })
  all(vapply(
    wide[-1],
    function(other) identical(wide[[1]], other),
    logical(1)
  ))
}
# Benchmark of the three approaches on DT0. Every expression starts from its
# own copy of DT0 and returns a table with columns firm, year, continued.
microbenchmark::microbenchmark(
  # lapply over the years, one join per year; results stacked with rbindlist.
  dd = {
    dt <- copy(DT0)
    years <- head(sort(unique(dt$year)), -1)
    rbindlist(
      setNames(lapply(years, function(y) {
        dt[dt[(year == y), .(firm, employee)], on = .(firm, employee)][
          !is.na(employee), all(c(y, y+1) %in% year), by = .(employee, firm)][
            , .(continued = mean(V1), year = y+1), by = firm]
      }), paste("Year", years, sep="-"))
    )
  },
  # Self join with the year shifted by one; cont is added by reference.
  join = {
    DT <- copy(DT0)
    DT[.(firm = firm, employee = employee, year = year - 1), 
       on = .(firm, employee, year), cont := TRUE][
         !is.na(employee), .(continued = sum(cont, na.rm = TRUE) / .N), 
         by = .(firm, year = year + 1)][
           year <= max(DT$year)]
  },
  # shift() within each (firm, employee) group, rows taken in year order.
  shift = {
    DT <- copy(DT0)
    DT[order(year), cont := shift(year, type = "lead") == year + 1, 
       by = .(firm, employee)][
         !is.na(employee), .(continued = sum(cont, na.rm = TRUE) / .N), 
         by = .(firm, year = year + 1)][
           year <= max(DT$year)]
  },
  # my_check receives the results of all expressions and must return TRUE.
  check = my_check,
  # Each expression is evaluated three times.
  times = 3L
)
Unit: seconds
  expr       min        lq      mean    median        uq       max neval cld
    dd 11.756114 11.919959 12.083042 12.083805 12.246506 12.409207     3   c
  join  1.054293  1.239829  1.303971  1.425366  1.428810  1.432254     3 a  
 shift  6.105725  6.105906  6.148136  6.106087  6.169342  6.232596     3  b