Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/75.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
提高R中社交网络分析的处理性能_R_Performance - Fatal编程技术网

提高R中社交网络分析的处理性能

提高R中社交网络分析的处理性能,r,performance,R,Performance,我正在使用R中的igraph软件包进行社交网络分析,我正在处理近200万个顶点和边。还计算了近800万个顶点和边的分离度。通常,执行需要2到3个小时,这太高了。我需要一些意见和建议来提高这个性能。下面是我正在使用的示例代码: g <- graph.data.frame( ids, directed = F) # ids contains approximately 2 million records distances(graph = g, v = t_ids$ID_from[x], to

我正在使用R中的igraph软件包进行社交网络分析,我正在处理近200万个顶点和边。还计算了近800万个顶点和边的分离度。通常,执行需要2到3个小时,这太高了。我需要一些意见和建议来提高这个性能。下面是我正在使用的示例代码:

g <- graph.data.frame( ids, directed = F) # ids contains approximately 2 million records
distances(graph = g, v = t_ids$ID_from[x], to = t_ids$ID_to[x], weights = NA)
# t_ids contains approximately 8 million records for which degrees of separation is to be calculated using Shortest Path Algorithms

我不这么认为，但我很高兴被证明是错的

您应该研究优化正在运行的代码的其他方法

如果您的数据是固定的,您可以计算一次距离,保存(可能相当大的)距离矩阵,并询问分离度

如果您的分析不需要所有顶点之间的距离，您应该通过缩短 t_ids$ID_from[x] 来优化代码，只获取你需要的距离。不过，我怀疑你已经这么做了。

distance()
实际上计算速度相当快。在10000个节点上(相当于4,99*10^6个无向距离),我的蹩脚机器在几秒钟内得到一个完整的700MB大距离矩阵

我首先想到了在
distance()
中可以选择的不同算法,但现在我怀疑它们是否会对您有所帮助。我对不同的算法进行了速度测试,看看是否可以向您推荐其中任何一种算法,但它们似乎都以大致相同的速度运行(结果与使用上述代码中使用的自动算法计算的时间有关):

我不认为可以从中得出任何结论,但它是在Erdős-Rényi模型上运行的。您的网络结构可能偏向于一种算法而非另一种算法,但它们仍然无法给您带来您所希望的性能提升

代码如下:

# igraph
library(igraph)

# Setup: sample sizes to benchmark and how often to repeat each run ----
samplesizes <- c(10, 100, 1000, 5000, 10000)
reps <- c(100, 100, 15, 3, 1)  # fewer repetitions for the larger graphs
# Algorithm names accepted by distances()' `algorithm` argument
algorithms <- c("automatic", "unweighted", "dijkstra", "bellman-ford", "johnson")
# Empty results frame: one column per algorithm, one row per sample size
df <- as.data.frame(matrix(ncol = length(algorithms), nrow = 0),
                    stringsAsFactors = FALSE)
names(df) <- algorithms

# Any random graph: G(n, m) Erdos-Renyi with 10000 vertices and 10000 edges
g <- erdos.renyi.game(10000, 10000, "gnm")

# These are the different algorithms selectable in distances():
m.auto <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "automatic")
m.unwg <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "unweighted")
m.dijk <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "dijkstra")
m.belm <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "bellman-ford")
m.john <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "johnson")

# They all produce the same result (all() is the direct way to state
# "every cell agrees", instead of counting matches with sum()):
all(m.auto == m.unwg & m.auto == m.dijk & m.auto == m.belm & m.auto == m.john)


# Time one distances() run on the global graph `g` with the given algorithm.
# The distance matrix is discarded (we only care about elapsed time);
# TRUE gives replicate() a cheap, constant value to collect.
test_distances <- function(alg) {
  distances(g, v = V(g), to = V(g), weights = NA, algorithm = alg)
  TRUE
}

# Build test results: elapsed time for each (sample size, algorithm) pair
for (i.sample in seq_along(samplesizes)) {
  # Create a random G(n, m) network to test: ~1.5 edges per vertex
  g <- erdos.renyi.game(samplesizes[i.sample], (samplesizes[i.sample] * 1.5),
                        type = "gnm", directed = FALSE, loops = FALSE)

  i.rep <- reps[i.sample]

  for (i.alg in seq_along(algorithms)) {
    # Wall-clock time over i.rep replications of one distances() run
    df[i.sample, i.alg] <- system.time(
      replicate(i.rep, test_distances(algorithms[i.alg]))
    )[["elapsed"]]
  }
}

# Normalize benchmark results: express every algorithm's time relative to
# the "automatic" algorithm (column 1) at the same sample size
dfn <- df

dfn[, seq_len(ncol(df))] <- df[, seq_len(ncol(df))] / df[, 1]
dfn$sample <- samplesizes
dfn <- dfn[, c(6, 1:5)]  # move the sample-size column to the front
dfn
# igraph
library(igraph)
# setup:
samplesizes
# igraph
library(igraph)

# Setup: sample sizes to benchmark and how often to repeat each run ----
samplesizes <- c(10, 100, 1000, 5000, 10000)
reps <- c(100, 100, 15, 3, 1)  # fewer repetitions for the larger graphs
# Algorithm names accepted by distances()' `algorithm` argument
algorithms <- c("automatic", "unweighted", "dijkstra", "bellman-ford", "johnson")
# Empty results frame: one column per algorithm, one row per sample size
df <- as.data.frame(matrix(ncol = length(algorithms), nrow = 0),
                    stringsAsFactors = FALSE)
names(df) <- algorithms

# Any random graph: G(n, m) Erdos-Renyi with 10000 vertices and 10000 edges
g <- erdos.renyi.game(10000, 10000, "gnm")

# These are the different algorithms selectable in distances():
m.auto <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "automatic")
m.unwg <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "unweighted")
m.dijk <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "dijkstra")
m.belm <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "bellman-ford")
m.john <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "johnson")

# They all produce the same result (all() is the direct way to state
# "every cell agrees", instead of counting matches with sum()):
all(m.auto == m.unwg & m.auto == m.dijk & m.auto == m.belm & m.auto == m.john)


# Time one distances() run on the global graph `g` with the given algorithm.
# The distance matrix is discarded (we only care about elapsed time);
# TRUE gives replicate() a cheap, constant value to collect.
test_distances <- function(alg) {
  distances(g, v = V(g), to = V(g), weights = NA, algorithm = alg)
  TRUE
}

# Build test results: elapsed time for each (sample size, algorithm) pair
for (i.sample in seq_along(samplesizes)) {
  # Create a random G(n, m) network to test: ~1.5 edges per vertex
  g <- erdos.renyi.game(samplesizes[i.sample], (samplesizes[i.sample] * 1.5),
                        type = "gnm", directed = FALSE, loops = FALSE)

  i.rep <- reps[i.sample]

  for (i.alg in seq_along(algorithms)) {
    # Wall-clock time over i.rep replications of one distances() run
    df[i.sample, i.alg] <- system.time(
      replicate(i.rep, test_distances(algorithms[i.alg]))
    )[["elapsed"]]
  }
}

# Normalize benchmark results: express every algorithm's time relative to
# the "automatic" algorithm (column 1) at the same sample size
dfn <- df

dfn[, seq_len(ncol(df))] <- df[, seq_len(ncol(df))] / df[, 1]
dfn$sample <- samplesizes
dfn <- dfn[, c(6, 1:5)]  # move the sample-size column to the front
dfn