R 多列中的第一个滞后

R 多列中的第一个滞后,r,data.table,R,Data.table,我有两列中出现的factor变量,现在我想要第一个滞后,不管factor最后出现在哪个列中 考虑以下数据表 require(data.table) set.seed(21) dt <- data.table(item1 = c(rep(sample(letters[1:5]), 2), sample(letters[6:10])), item2 = c(rep(sample(letters[6:10]), 2), sample(letters[1:5])),

我有两列中出现的factor变量,现在我想要第一个滞后,不管factor最后出现在哪个列中

考虑以下数据表

require(data.table)
set.seed(21)
dt <- data.table(item1 = c(rep(sample(letters[1:5]), 2), sample(letters[6:10])),
           item2 = c(rep(sample(letters[6:10]), 2), sample(letters[1:5])),
           value1 = rnorm(15, 5, 2),
           value2 = rnorm(15, 5, 2),
           iteration = rep(1:3, each = 5))
> dt
    item1 item2    value1   value2 iteration
 1:     d     i 0.4464375 6.491179         1
 2:     b     j 6.5148245 5.665638         1
 3:     c     f 3.9031889 2.751919         1
 4:     a     g 5.3450990 3.587738         1
 5:     e     h 6.1257061 3.544912         1
 6:     d     i 8.0236359 1.331371         2
 7:     b     j 6.3180503 4.184624         2
 8:     c     f 7.2440561 5.053722         2
 9:     a     g 3.4307173 6.823257         2
10:     e     h 4.1486154 8.268693         2
11:     j     a 5.7859952 5.121371         3
12:     f     c 5.0735143 8.695145         3
13:     i     e 2.9358327 5.160250         3
14:     g     d 2.4702771 7.837112         3
15:     h     b 4.5460694 7.917232         3
我可能可以通过为item和value创建一列来解决这个问题,但是有更好的解决方案吗

为了清楚起见,我对第11行lag1的期望值是4.184624

我还需要第2项的延迟,并完成了这个dplyr

dt %>% 
  mutate(nr = 1:nrow(dt)) %>% 
  gather(key, value, -nr, -iteration) %>% 
  mutate(key = ifelse(key == "item1" | key == "item2", "item", "value"),
         variabel = rep(c(1, 2), 2, each = nrow(dt))) %>% 
  spread(key, value) %>% 
  group_by(item) %>% 
  arrange(nr) %>%
  mutate(lag = lag(value)) %>% 
  gather(key, value, -iteration, -nr, -variabel) %>%
  unite(key, c("key", "variabel"), sep = "") %>% 
  spread(key, value)

   iteration    nr item1 item2             lag1              lag2            value1           value2
 *     <int> <int> <chr> <chr>            <chr>             <chr>             <chr>            <chr>
 1         1     1     e     f             <NA>              <NA>  4.48327811883486 5.98823833422944
 2         1     2     b     i             <NA>              <NA>  6.21252978898878  3.6803830789734
 3         1     3     d     g             <NA>              <NA>  5.62689643314086 7.00228385274896
 4         1     4     c     h             <NA>              <NA>  5.10720616395708 7.14416894881173
 5         1     5     a     j             <NA>              <NA>  7.25650757535391 6.51153141154262
 6         2     6     e     f 4.48327811883486  5.98823833422944  3.88373308164829 2.08907058913021
 7         2     7     b     i 6.21252978898878   3.6803830789734  8.07191789162847 6.88574195362948
 8         2     8     d     g 5.62689643314086  7.00228385274896  4.87510729533042 1.25944984673148
 9         2     9     c     h 5.10720616395708  7.14416894881173   5.0431504307243  4.4934555124612
10         2    10     a     j 7.25650757535391  6.51153141154262 0.820345123625779 4.41487625686153
11         3    11     g     d 1.25944984673148  4.87510729533042  3.37822264689098 5.43753611910662
12         3    12     j     a 4.41487625686153 0.820345123625779 -0.88757977661203 2.28986114731552
13         3    13     i     e 6.88574195362948  3.88373308164829  4.96240860503556 4.75454561215201
14         3    14     h     b  4.4934555124612  8.07191789162847  4.29063975464589 4.09626986248512
15         3    15     f     c 2.08907058913021   5.0431504307243  5.07114037497055 5.19449624162733
dt%>%
突变(nr=1:nrow(dt))%>%
聚集(键,值,-nr,-迭代)%>%
变异(key=ifelse(key==“item1”| key==“item2”、“item”、“value”),
变量=rep(c(1,2),2,各=nrow(dt))%>%
排列(键,值)%>%
分组依据(项目)%>%
排列(nr)%>%
突变(滞后=滞后(值))%>%
聚集(键,值,-迭代,-nr,-variabel)%>%
联合(键,c(“键”,“变量”),sep=“”)%>%
排列(键、值)
迭代编号项目1项目2 lag1 lag2值1值2
*                                                          
1 1 e f 4.48327811883486 5.98823833422944
2 1 2 b i 6.21252978898878 3.6803830789734
3 1 3 d g 5.62689643314086 7.00228385274896
4 1 4 c h 5.10720616395708 7.14416894881173
5 1 5 a j 7.256507535391 6.51153141154262
6 2 6 e f 4.48327811883486 5.98823833422944 3.88373308164829 2.08907058913021
7 2 7 b i 6.21252978898878 3.68038307897348.07191789162847 6.88574195362948
8 2 8 d g 5.62689643314086 7.00228385274896 4.87510729533042 1.25944984673148
9 2 9 c h 5.10720616395708 7.14416894881173 5.0431504307243 4.4934555124612
10 2 10 a j 7.25650755391 6.51153141154262 0.820345123625779 4.41487625686153
11 3 11 g d 1.25944984673148 4.87510729533042 3.37822264689098 5.43753611910662
12 3 12 j a 4.41487625686153 0.820345123625779-0.88757977661203 2.28986114731552
13 3 13即6.88574195362948 3.88373308164829 4.96240860503556 4.75454561215201
14 3 14 h b 4.4934555124612 8.07191789162847 4.29063975464589 4.09626986248512
15 3 15 f c 2.08907058913021 5.0431504307243 5.07114037497055.19449624162733

在为
行号添加列后,可以使用
melt
找到解决方案

library(data.table)

#Add a column to represent row number in current table. This will be used
#later to sort data.table to find correct `lag`
dt[,rn:=.I]

#Use melt to transfer values in long format. value1 contains "items" and 
#value2 contains "values" 
dt<-melt(dt, id=c("iteration","rn"), measure=list(1:2,c("value1","value2")),
value.name = c("item","value"))

#The order in original table is row number, item1 and item2. The same can be 
#achieved by sorting on "rn" and "variable"
dt[order(rn,variable), lag := shift(value), by = item]

dt[variable==1,][dt[variable==2,],.(item1 = item, item2 = i.item, value1 = value, 
        value2=i.value, iteration, lag1 = lag, lag2 = i.lag), on=("rn")]

#   item1 item2    value1   value2 iteration      lag1     lag2
# 1:     d     i 0.4464375 6.491179         1        NA       NA
# 2:     b     j 6.5148245 5.665638         1        NA       NA
# 3:     c     f 3.9031889 2.751919         1        NA       NA
# 4:     a     g 5.3450990 3.587738         1        NA       NA
# 5:     e     h 6.1257061 3.544912         1        NA       NA
# 6:     d     i 8.0236359 1.331371         2 0.4464375 6.491179
# 7:     b     j 6.3180503 4.184624         2 6.5148245 5.665638
# 8:     c     f 7.2440561 5.053722         2 3.9031889 2.751919
# 9:     a     g 3.4307173 6.823257         2 5.3450990 3.587738
# 10:    e     h 4.1486154 8.268693         2 6.1257061 3.544912
# 11:    j     a 5.7859952 5.121371         3 4.1846241 3.430717
# 12:    f     c 5.0735143 8.695145         3 5.0537224 7.244056
# 13:    i     e 2.9358327 5.160250         3 1.3313712 4.148615
# 14:    g     d 2.4702771 7.837112         3 6.8232573 8.023636
# 15:    h     b 4.5460694 7.917232         3 8.2686930 6.318050
库(data.table)
#添加一列以表示当前表中的行号。这将被使用
#稍后对data.table进行排序,以找到正确的'lag'`
dt[,rn:=.I]
#使用melt以长格式传输值。值1包含“项目”和
#value2包含“值”

dt发布了另一个类似的方法。类似于在长data.table中使用item1+item2的加长版本。区别在于使用联接

有两种可能的情况:

1) 延迟总是在紧接前一个迭代中出现,那么使用普通联接的以下代码应该可以工作:

library(data.table)
set.seed(21)
dt <- data.table(item1 = c(rep(sample(letters[1:5]), 2), sample(letters[6:10])),
    item2 = c(rep(sample(letters[6:10]), 2), sample(letters[1:5])),
    value1 = rnorm(15, 5, 2),
    value2 = rnorm(15, 5, 2),
    iteration = rep(1:3, each = 5))

#if that first lag can always be found in previous iteration
dt[.(iitem=c(item1, item2), ivalue=c(value1, value2), iiteration=c(iteration + 1, iteration + 1)),
    lag1 := ivalue,
    on=c(item1="iitem", iteration="iiteration")]
dt[.(iitem=c(item1, item2), ivalue=c(value1, value2), iiteration=c(iteration + 1, iteration + 1)),
    lag2 := ivalue,
    on=c(item2="iitem", iteration="iiteration")]
dt

#     item1 item2             value1            value2 iteration               lag1              lag2
#  1:     d     i 0.4464375195067456 6.491178609416053         1                 NA                NA
#  2:     b     j 6.5148244502509627 5.665638360665036         1                 NA                NA
#  3:     c     f 3.9031888919439428 2.751919085284464         1                 NA                NA
#  4:     a     g 5.3450989557007524 3.587738435542055         1                 NA                NA
#  5:     e     h 6.1257061355108435 3.544912270783058         1                 NA                NA
#  6:     d     i 8.0236359188753603 1.331371229451156         2 0.4464375195067456 6.491178609416053
#  7:     b     j 6.3180503383288116 4.184624119479032         2 6.5148244502509627 5.665638360665036
#  8:     c     f 7.2440561493491140 5.053722389597528         2 3.9031888919439428 2.751919085284464
#  9:     a     g 3.4307172617070858 6.823257275121762         2 5.3450989557007524 3.587738435542055
# 10:     e     h 4.1486154223793710 8.268692951017332         2 6.1257061355108435 3.544912270783058
# 11:     j     a 5.7859951827443368 5.121371228719468         3 4.1846241194790323 3.430717261707086
# 12:     f     c 5.0735142596491132 8.695145055731583         3 5.0537223895975281 7.244056149349114
# 13:     i     e 2.9358326775434151 5.160249909302514         3 1.3313712294511557 4.148615422379371
# 14:     g     d 2.4702770572371642 7.837111765957783         3 6.8232572751217617 8.023635918875360
# 15:     h     b 4.5460694295527579 7.917231870893728         3 8.2686929510173321 6.318050338328812
库(data.table)
种子(21)

太好了,谢谢。我不清楚在开始,但我也将需要第2项滞后。请参阅更新的问题。@我已经更新了答案,提供了
lag1
lag2
。看一看。
library(data.table)
set.seed(21)
dt <- data.table(item1 = c(rep(sample(letters[1:5]), 2), sample(letters[6:10])),
    item2 = c(rep(sample(letters[6:10]), 2), sample(letters[1:5])),
    value1 = rnorm(15, 5, 2),
    value2 = rnorm(15, 5, 2),
    iteration = rep(1:3, each = 5))

#if that first lag can always be found in previous iteration
dt[.(iitem=c(item1, item2), ivalue=c(value1, value2), iiteration=c(iteration + 1, iteration + 1)),
    lag1 := ivalue,
    on=c(item1="iitem", iteration="iiteration")]
dt[.(iitem=c(item1, item2), ivalue=c(value1, value2), iiteration=c(iteration + 1, iteration + 1)),
    lag2 := ivalue,
    on=c(item2="iitem", iteration="iiteration")]
dt

#     item1 item2             value1            value2 iteration               lag1              lag2
#  1:     d     i 0.4464375195067456 6.491178609416053         1                 NA                NA
#  2:     b     j 6.5148244502509627 5.665638360665036         1                 NA                NA
#  3:     c     f 3.9031888919439428 2.751919085284464         1                 NA                NA
#  4:     a     g 5.3450989557007524 3.587738435542055         1                 NA                NA
#  5:     e     h 6.1257061355108435 3.544912270783058         1                 NA                NA
#  6:     d     i 8.0236359188753603 1.331371229451156         2 0.4464375195067456 6.491178609416053
#  7:     b     j 6.3180503383288116 4.184624119479032         2 6.5148244502509627 5.665638360665036
#  8:     c     f 7.2440561493491140 5.053722389597528         2 3.9031888919439428 2.751919085284464
#  9:     a     g 3.4307172617070858 6.823257275121762         2 5.3450989557007524 3.587738435542055
# 10:     e     h 4.1486154223793710 8.268692951017332         2 6.1257061355108435 3.544912270783058
# 11:     j     a 5.7859951827443368 5.121371228719468         3 4.1846241194790323 3.430717261707086
# 12:     f     c 5.0735142596491132 8.695145055731583         3 5.0537223895975281 7.244056149349114
# 13:     i     e 2.9358326775434151 5.160249909302514         3 1.3313712294511557 4.148615422379371
# 14:     g     d 2.4702770572371642 7.837111765957783         3 6.8232572751217617 8.023635918875360
# 15:     h     b 4.5460694295527579 7.917231870893728         3 8.2686929510173321 6.318050338328812
library(data.table)
set.seed(21)
dt <- data.table(item1 = c(rep(sample(letters[1:5]), 2), sample(letters[6:10])),
    item2 = c(rep(sample(letters[6:10]), 2), sample(letters[1:5])),
    value1 = rnorm(15, 5, 2),
    value2 = rnorm(15, 5, 2),
    iteration = rep(1:3, each = 5))
#remove iteration=2, item1=c, item2=f to show finding lag from earlier iterations
dt <- dt[-8,]

#if that first lag can be found in even earlier iteration, using non-equi joins as follows:
elongated <- dt[,.(item=c(item1, item2), ivalue=c(value1, value2), iteration=c(iteration, iteration), cpyalliter=c(iteration, iteration))]

dt[, lag1 := elongated[.SD, on=.(item=item1, iteration < iteration)][,
    last(ivalue), by=.(item1=item, item2, value1, value2, iteration)]$V1 ]

dt[, lag2 := elongated[.SD, on=.(item=item2, iteration < iteration)][,
    last(ivalue), by=.(item1, item2=item, value1, value2, iteration)]$V1 ]

dt

#     item1 item2             value1            value2 iteration               lag1              lag2
#  1:     d     i 0.4464375195067456 6.491178609416053         1                 NA                NA
#  2:     b     j 6.5148244502509627 5.665638360665036         1                 NA                NA
#  3:     c     f 3.9031888919439428 2.751919085284464         1                 NA                NA
#  4:     a     g 5.3450989557007524 3.587738435542055         1                 NA                NA
#  5:     e     h 6.1257061355108435 3.544912270783058         1                 NA                NA
#  6:     d     i 8.0236359188753603 1.331371229451156         2 0.4464375195067456 6.491178609416053
#  7:     b     j 6.3180503383288116 4.184624119479032         2 6.5148244502509627 5.665638360665036
#  8:     a     g 3.4307172617070858 6.823257275121762         2 5.3450989557007524 3.587738435542055
#  9:     e     h 4.1486154223793710 8.268692951017332         2 6.1257061355108435 3.544912270783058
# 10:     j     a 5.7859951827443368 5.121371228719468         3 4.1846241194790323 3.430717261707086
# 11:     f     c 5.0735142596491132 8.695145055731583         3 2.7519190852844644 3.903188891943943
# 12:     i     e 2.9358326775434151 5.160249909302514         3 1.3313712294511557 4.148615422379371
# 13:     g     d 2.4702770572371642 7.837111765957783         3 6.8232572751217617 8.023635918875360
# 14:     h     b 4.5460694295527579 7.917231870893728         3 8.2686929510173321 6.318050338328812