R-创建对方频率矩阵_R_Frequency

R-创建对方频率矩阵

R-创建对方频率矩阵,r,frequency,R,Frequency,我有易货经济的数据。我试图创建一个矩阵，计算项目与其他项目作为交易对手的频率例如： myDat <- data.frame( TradeID = as.factor(c(1,1,1,2,2,2,3,3,4,4,5,5,6,6,7,7,8,8,8)), Origin = as.factor(c(1,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0)), ItemID = as.facto

我有易货经济的数据。我试图创建一个矩阵，计算项目与其他项目作为交易对手的频率

例如：

  myDat <- data.frame(
             TradeID = as.factor(c(1,1,1,2,2,2,3,3,4,4,5,5,6,6,7,7,8,8,8)),
             Origin = as.factor(c(1,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0)),
             ItemID = as.factor(c(1,2,3,4,5,1,1,6,7,1,1,8,7,5,1,1,2,3,4))
            )


     TradeID Origin ItemID
  1        1      1      1
  2        1      0      2
  3        1      0      3
  4        2      1      4
  5        2      1      5
  6        2      0      1
  7        3      1      1
  8        3      0      6
  9        4      1      7
  10       4      0      1
  11       5      1      1
  12       5      0      8
  13       6      1      7
  14       6      0      5
  15       7      1      1
  16       7      0      1
  17       8      1      2
  18       8      0      3
  19       8      0      4
  20       9      1      1
  21       9      0      8

myDat这将为您提供每个TradeID和ItemID的观察次数
myDat <- data.frame(
  TradeID = as.factor(c(1,1,1,2,2,2,3,3,4,4,5,5,6,6,7,7,8,8,8)),
  Origin = as.factor(c(1,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0)),
  ItemID = as.factor(c(1,2,3,4,5,1,1,6,7,1,1,8,7,5,1,1,2,3,4))
)
result = tapply(myDat$Origin, list(myDat$ItemID,myDat$TradeID), length)
result[is.na(result)] = 0
result["1","7"]

result = tapply(myDat$Origin, list(myDat$ItemID,myDat$TradeID), function(x) { sum(as.numeric(as.character(x)))/length(x) })

这将为您提供每个TradeID和ItemID的1
Origin

myDat <- data.frame(
  TradeID = as.factor(c(1,1,1,2,2,2,3,3,4,4,5,5,6,6,7,7,8,8,8)),
  Origin = as.factor(c(1,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0)),
  ItemID = as.factor(c(1,2,3,4,5,1,1,6,7,1,1,8,7,5,1,1,2,3,4))
)
result = tapply(myDat$Origin, list(myDat$ItemID,myDat$TradeID), length)
result[is.na(result)] = 0
result["1","7"]

result = tapply(myDat$Origin, list(myDat$ItemID,myDat$TradeID), function(x) { sum(as.numeric(as.character(x)))/length(x) })

您可以使用result[is.NA（result）]=0将最后一个矩阵中的NA
值设置为0
，但这会将没有观察值与只有0
原产地交易的值混淆。
这将为您提供每个连续ItemID
s的观察值：
idxList <-  with(myDat, tapply(ItemID, TradeID, FUN = function(items) 
  lapply(seq(length(items) - 1), 
         function(i) sort(c(items[i], items[i + 1])))))

# indices of observations  
idx <- do.call(rbind, unlist(idxList, recursive = FALSE))

# create a matrix
ids <- unique(myDat$ItemID)
mat <- matrix(0, length(ids), length(ids))

# place values in matrix
for (i in seq(nrow(idx))) {
  mat[idx[i, , drop = FALSE]] <- mat[idx[i, , drop = FALSE]] + 1      
}

# create symmatric marix 
mat[lower.tri(mat)] <- t(mat)[lower.tri(mat)]


     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,]    1    1    0    0    1    1    1    1
[2,]    1    0    2    0    0    0    0    0
[3,]    0    2    0    1    0    0    0    0
[4,]    0    0    1    0    1    0    0    0
[5,]    1    0    0    1    0    0    1    0
[6,]    1    0    0    0    0    0    0    0
[7,]    1    0    0    0    1    0    0    0
[8,]    1    0    0    0    0    0    0    0

idxList好吧，我想我已经弄明白了。简单的回答是：
Reduce("+",by(myDat, myDat$TradeID, function(x) pmin(table(x$ItemID[x$Origin==0]) %o% table(x$ItemID[x$Origin==1]) + table(x$ItemID[x$Origin==1]) %o% table(x$ItemID[x$Origin==0]),1)))

这将给出以下矩阵，与所需结果相匹配：
  1 2 3 4 5 6 7 8
1 1 1 1 1 1 1 1 2
2 1 0 1 1 0 0 0 0
3 1 1 0 0 0 0 0 0
4 1 1 0 0 0 0 0 0
5 1 0 0 0 0 0 1 0
6 1 0 0 0 0 0 0 0
7 1 0 0 0 1 0 0 0
8 2 0 0 0 0 0 0 0

答案很长。使用by
和outer
（%o%
）和table
函数，您可以获得每个TradeID
的矩阵列表。但是这个双重计数交易7，其中项目1被交易为项目1，所以我使用pmax
函数来解决这个问题。然后我使用Reduce
函数对列表进行求和
这是到达那里的步骤。注意添加了TradeID#9，这被排除在问题代码之外
# Data
myDat <- data.frame(
  TradeID = as.factor(c(1,1,1,2,2,2,3,3,4,4,5,5,6,6,7,7,8,8,8,9,9)),
  Origin = as.factor(c(1,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0)),
  ItemID = as.factor(c(1,2,3,4,5,1,1,6,7,1,1,8,7,5,1,1,2,3,4,1,8))
)

# Sum in 1 direction
by(myDat, myDat$TradeID, function(x) table(x$ItemID[x$Origin==0]) %o% table(x$ItemID[x$Origin==1]))

# Sum in both directions
by(myDat, myDat$TradeID, function(x) table(x$ItemID[x$Origin==1]) %o% table(x$ItemID[x$Origin==0]) + table(x$ItemID[x$Origin==0]) %o% table(x$ItemID[x$Origin==1]))

# Remove double-count in trade 7
by(myDat, myDat$TradeID, function(x) pmin(table(x$ItemID[x$Origin==0]) %o% table(x$ItemID[x$Origin==1]) + table(x$ItemID[x$Origin==1]) %o% table(x$ItemID[x$Origin==0]),1))

# Sum across lists
Reduce("+",by(myDat, myDat$TradeID, function(x) pmin(table(x$ItemID[x$Origin==0]) %o% table(x$ItemID[x$Origin==1]) + table(x$ItemID[x$Origin==1]) %o% table(x$ItemID[x$Origin==0]),1)))

这似乎是速度的两倍
> microbenchmark(Reduce("+",by(myDat, myDat$TradeID, function(x) pmin(table(x$ItemID[x$Origin==0]) %o% table(x$ItemID[x$Origin==1]) + table(x$ItemID[x$Origin==1]) %o% table(x$ItemID[x$Origin==0]),1))))
Unit: milliseconds
      min       lq   median       uq     max neval
 7.489092 7.733382 7.955861 8.536359 9.83216   100

> microbenchmark(Reduce("+",by(myDat, myDat$TradeID, function(x) table(x$ItemID[x$Origin==0]) %o% table(x$ItemID[x$Origin==1]))))
Unit: milliseconds

      min      lq   median       uq      max neval
 4.023964 4.18819 4.277767 4.452824 5.801171   100

你能解释一下矩阵到底应该显示什么吗？例如，2在[1,8]处表示什么？[5,7]处的1表示什么？等等，谢谢你，尼克。我在示例矩阵下添加了一些额外的解释。很抱歉我不够清晰。谢谢你的有效解决方案。我很好奇，@Nick，我是否可以大胆地征求你的建议，让代码运行得更快和/或内存要求更低？我试着在我的样本上运行它（平均每天50000-150000笔交易，约3500项），但我访问的服务器往往会耗尽内存。利用对称性，只需在一个方向上求和，就可以将其几乎减半。所以类似于Reduce（“+”），通过（myDat，myDat$TradeID，函数（x）表（x$ItemID[x$Origin==0]）%o%table（x$ItemID[x$Origin==1]）
然后清理结果。我将把这个添加到我的答案中。你也可以尝试分批运行代码，然后在最后对结果求和，因为它是跨TradeID
求和。