Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/75.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
提高R中社交网络分析的处理性能_R_Performance - Fatal编程技术网

提高R中社交网络分析的处理性能

提高R中社交网络分析的处理性能,r,performance,R,Performance,我正在使用R中的igraph软件包进行社交网络分析,我正在处理近200万个顶点和边。还计算了近800万个顶点和边的分离度。通常,执行需要2到3个小时,这太高了。我需要一些意见和建议来提高这个性能。下面是我正在使用的示例代码: g <- graph.data.frame( ids, directed = F) # ids contains approximately 2 million records distances(graph = g, v = t_ids$ID_from[x], to

我正在使用R中的igraph软件包进行社交网络分析,我正在处理近200万个顶点和边。还计算了近800万个顶点和边的分离度。通常,执行需要2到3个小时,这太高了。我需要一些意见和建议来提高这个性能。下面是我正在使用的示例代码:

g <- graph.data.frame( ids, directed = F) # ids contains approximately 2 million records
distances(graph = g, v = t_ids$ID_from[x], to = t_ids$ID_to[x], weights = NA)
# t_ids contains approximately 8 million records for which degrees of separation is to be calculated using Shortest Path Algorithms

我不这么认为，但我很高兴被证明是错的

您应该研究优化正在运行的代码的其他方法

如果您的数据是固定的,您可以计算一次距离,保存(可能相当大的)距离矩阵,并询问分离度

如果您的分析不需要所有顶点之间的距离，您应该通过缩短 t_ids$ID_from[x] 来优化代码，只获取你需要的距离。不过，我怀疑你已经这么做了。

distance()
实际上计算速度相当快。在10000个节点上(相当于4,99*10^6个无向距离),我的蹩脚机器在几秒钟内得到一个完整的700MB大距离矩阵

我首先想到了在
distance()
中可以选择的不同算法,但现在我怀疑它们是否会对您有所帮助。我对不同的算法进行了速度测试,看看是否可以向您推荐其中任何一种算法,但它们似乎都以大致相同的速度运行(结果与使用上述代码中使用的自动算法计算的时间有关):

我不认为可以从中得出任何结论,但它是在Erdős-Rényi模型上运行的。您的网络结构可能偏向于一种算法而非另一种算法,但它们仍然无法给您带来您所希望的性能提升

代码如下:

# igraph
library(igraph)

# Setup: sample sizes to benchmark and how often to repeat each run ----
samplesizes <- c(10, 100, 1000, 5000, 10000)
reps <- c(100, 100, 15, 3, 1)  # fewer repetitions for the larger graphs
# Algorithm names accepted by distances()' `algorithm` argument
algorithms <- c("automatic", "unweighted", "dijkstra", "bellman-ford", "johnson")
# Empty results frame: one column per algorithm, one row per sample size
df <- as.data.frame(matrix(ncol = length(algorithms), nrow = 0),
                    stringsAsFactors = FALSE)
names(df) <- algorithms

# Any random graph: G(n, m) Erdos-Renyi with 10000 vertices and 10000 edges
g <- erdos.renyi.game(10000, 10000, "gnm")

# These are the different algorithms selectable in distances():
m.auto <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "automatic")
m.unwg <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "unweighted")
m.dijk <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "dijkstra")
m.belm <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "bellman-ford")
m.john <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "johnson")

# They all produce the same result (all() is the direct way to state
# "every cell agrees", instead of counting matches with sum()):
all(m.auto == m.unwg & m.auto == m.dijk & m.auto == m.belm & m.auto == m.john)


# Time one distances() run on the global graph `g` with the given algorithm.
# The distance matrix is discarded (we only care about elapsed time);
# TRUE gives replicate() a cheap, constant value to collect.
test_distances <- function(alg) {
  distances(g, v = V(g), to = V(g), weights = NA, algorithm = alg)
  TRUE
}

# Build test results: elapsed time for each (sample size, algorithm) pair
for (i.sample in seq_along(samplesizes)) {
  # Create a random G(n, m) network to test: ~1.5 edges per vertex
  g <- erdos.renyi.game(samplesizes[i.sample], (samplesizes[i.sample] * 1.5),
                        type = "gnm", directed = FALSE, loops = FALSE)

  i.rep <- reps[i.sample]

  for (i.alg in seq_along(algorithms)) {
    # Wall-clock time over i.rep replications of one distances() run
    df[i.sample, i.alg] <- system.time(
      replicate(i.rep, test_distances(algorithms[i.alg]))
    )[["elapsed"]]
  }
}

# Normalize benchmark results: express every algorithm's time relative to
# the "automatic" algorithm (column 1) at the same sample size
dfn <- df

dfn[, seq_len(ncol(df))] <- df[, seq_len(ncol(df))] / df[, 1]
dfn$sample <- samplesizes
dfn <- dfn[, c(6, 1:5)]  # move the sample-size column to the front
dfn
# igraph
library(igraph)
# setup:
samplesizes
# igraph
library(igraph)

# Setup: sample sizes to benchmark and how often to repeat each run ----
samplesizes <- c(10, 100, 1000, 5000, 10000)
reps <- c(100, 100, 15, 3, 1)  # fewer repetitions for the larger graphs
# Algorithm names accepted by distances()' `algorithm` argument
algorithms <- c("automatic", "unweighted", "dijkstra", "bellman-ford", "johnson")
# Empty results frame: one column per algorithm, one row per sample size
df <- as.data.frame(matrix(ncol = length(algorithms), nrow = 0),
                    stringsAsFactors = FALSE)
names(df) <- algorithms

# Any random graph: G(n, m) Erdos-Renyi with 10000 vertices and 10000 edges
g <- erdos.renyi.game(10000, 10000, "gnm")

# These are the different algorithms selectable in distances():
m.auto <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "automatic")
m.unwg <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "unweighted")
m.dijk <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "dijkstra")
m.belm <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "bellman-ford")
m.john <- distances(g, v = V(g), to = V(g), weights = NA, algorithm = "johnson")

# They all produce the same result (all() is the direct way to state
# "every cell agrees", instead of counting matches with sum()):
all(m.auto == m.unwg & m.auto == m.dijk & m.auto == m.belm & m.auto == m.john)


# Time one distances() run on the global graph `g` with the given algorithm.
# The distance matrix is discarded (we only care about elapsed time);
# TRUE gives replicate() a cheap, constant value to collect.
test_distances <- function(alg) {
  distances(g, v = V(g), to = V(g), weights = NA, algorithm = alg)
  TRUE
}

# Build test results: elapsed time for each (sample size, algorithm) pair
for (i.sample in seq_along(samplesizes)) {
  # Create a random G(n, m) network to test: ~1.5 edges per vertex
  g <- erdos.renyi.game(samplesizes[i.sample], (samplesizes[i.sample] * 1.5),
                        type = "gnm", directed = FALSE, loops = FALSE)

  i.rep <- reps[i.sample]

  for (i.alg in seq_along(algorithms)) {
    # Wall-clock time over i.rep replications of one distances() run
    df[i.sample, i.alg] <- system.time(
      replicate(i.rep, test_distances(algorithms[i.alg]))
    )[["elapsed"]]
  }
}

# Normalize benchmark results: express every algorithm's time relative to
# the "automatic" algorithm (column 1) at the same sample size
dfn <- df

dfn[, seq_len(ncol(df))] <- df[, seq_len(ncol(df))] / df[, 1]
dfn$sample <- samplesizes
dfn <- dfn[, c(6, 1:5)]  # move the sample-size column to the front
dfn