R 加速用于创建方便的映射和图表组的函数 背景
以下是SO同事就以下方面进行的有益讨论和提供的帮助:R 加速用于创建方便的映射和图表组的函数 背景,r,performance,function,vector,hmisc,R,Performance,Function,Vector,Hmisc,以下是SO同事就以下方面进行的有益讨论和提供的帮助: , , 及 我结合了一个方便的功能。它接受一个数值向量,并生成与组相关的分解向量 作用 函数体如下所示 nice.cuts <- function(variable, cuts = 10, thousands.separator = FALSE) { # Load required packages (useful when used independently of context) Vectorize(require
- ,
- ,
- 及
nice.cuts <- function(variable, cuts = 10, thousands.separator = FALSE) {
# Load required packages (useful when used independently of context)
Vectorize(require)(package = c("gsubfn", "Hmisc", "scales"),
character.only = TRUE)
# Destring this variable
destring <- function(x) {
## convert factor to strings
if (is.character(x)) {
as.numeric(x)
} else if (is.factor(x)) {
as.numeric(levels(x))[x]
} else if (is.numeric(x)) {
x
} else {
stop("could not convert to numeric")
}
}
# Apply function
variable <- destring(variable)
# Check whether to disable scientific notation
if (mean(variable) > 100000) {
options(scipen = 999)
} else {
options(scipen = 0)
}
# Create pretty breaks
cut_breaks <- pretty_breaks(n = cuts)(variable)
# Round it two decimal places
variable <- round(variable, digits = 2)
# Develop cuts according to the provided object
cuts_variable <- cut2(x = variable, cuts = cut_breaks)
# Check if variable is total or with decimals
if (all(cut_breaks %% 1 == 0)) {
# Variable is integer
clean_cuts <- gsubfn('\\[\\s*(\\d+),\\s*(\\d+)[^0-9]+',
~paste0(x, '-',as.numeric(y)-1),
as.character(cuts_variable))
} else {
# Variable is not integer
# Create clean cuts
clean_cuts <- gsubfn('\\[\\s*([0-9]+\\.*[0-9]*),\\s*(\\d+\\.\\d+).*',
~paste0(x, '-', as.numeric(y)- 0.01),
as.character(cuts_variable))
}
# Clean Inf
clean_cuts <- gsub("Inf", max(variable), clean_cuts)
# Clean punctuation
clean_cuts <- sub("\\[(.*), (.*)\\]", "\\1 - \\2", clean_cuts)
# Replace strings with spaces
clean_cuts <- gsub("-"," - ",clean_cuts, fixed = TRUE)
# Trim white spaces
clean_cuts <- trimws(clean_cuts)
# Order factor before returning
clean_cuts <- factor(clean_cuts, levels = unique(clean_cuts[order(variable)]))
if (thousands.separator == TRUE) {
levels(clean_cuts) <- sapply(strsplit(levels(clean_cuts), " - "),
function(x) paste(prettyNum(x,
big.mark = ",",
preserve.width = "none"),
collapse = " - "))
}
# Return
return(clean_cuts)
}
nice.cuts完成完整输入向量上标签的所有清理:首先在cut2
之后生成一个字符向量,然后在此向量上执行大量正则表达式。但是,您只是在修改标签
因此,在生成cut\u breaks
之后,我将首先以正确的格式生成标签:cut\u labels
。我在下面新版本的cut.labels
中这样做了。与原始版本相比,基准测试显示了巨大的改进:
> require(microbenchmark)
> dta <- data.frame(values=floor(runif(1000, 10000,90000)))
> microbenchmark(nice.cuts(dta$values, thousands.separator = TRUE),
+ nice.cuts2(dta$values, thousands.separator = TRUE))
Unit: milliseconds
expr min lq mean median uq max neval cld
nice.cuts(dta$values, thousands.separator = TRUE) 720.1378 815.51782 902.9218 923.97881 968.39036 1208.00434 100 b
nice.cuts2(dta$values, thousands.separator = TRUE) 11.4147 15.18232 16.6196 16.46937 17.05305 29.91089 100 a
>
这似乎更适合姐妹网站。如果你想把它放在这里,你应该把它归结为代码问题,即更简单的问题。@Roland谢谢你的建议,我很乐意删除并在CR上重新发布。也许我会等几分钟,看看SO社区的成员是否感兴趣。我被标签的可用性误导了,因为这就是我的问题所在。@非常感谢您的建议。事实上,我对构建有序因子向量感兴趣。实际上,我经常对形状文件中的data.frames
afterggfortify
中已有的值使用该函数,因此我希望为所有具有括号值而非核心值的观察值设置一个因子列。基本理念是在地图/图表图例上使用一组整洁的颜色。非常感谢您表现出的兴趣和宝贵的贡献。您是否愿意发布基准测试结果以显示速度的提高?@Konrad添加了新版本的nice.cuts
,并将其与原始版本进行了基准测试。非常感谢您的评论,这对性能来说是一个非常显著的提高!
> dta$cuts <- nice.cuts(dta$values, thousands.separator = TRUE)
> t(t(table(dta$cuts))) #' t() for presentation
[,1]
10,000 - 19,999 9
20,000 - 29,999 11
30,000 - 39,999 12
40,000 - 49,999 20
50,000 - 59,999 6
60,000 - 69,999 15
70,000 - 79,999 17
80,000 - 89,999 10
> require(microbenchmark)
> dta <- data.frame(values=floor(runif(100, 10000,90000)))
> microbenchmark(nice.cuts(dta$values, thousands.separator = TRUE))
Unit: milliseconds
expr min lq mean median uq max neval
nice.cuts(dta$values, thousands.separator = TRUE) 32.67988 58.25709 99.26317 95.25195 136.7998 222.2178 100
> dta <- data.frame(values=floor(runif(1000, 10000,90000)))
> microbenchmark(nice.cuts(dta$values, thousands.separator = TRUE),
+ times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
nice.cuts(dta$values, thousands.separator = TRUE) 428.6821 901.2123 1154.097 1068.845 1679.052 1708.836 10
> require(microbenchmark)
> dta <- data.frame(values=floor(runif(1000, 10000,90000)))
> microbenchmark(nice.cuts(dta$values, thousands.separator = TRUE),
+ nice.cuts2(dta$values, thousands.separator = TRUE))
Unit: milliseconds
expr min lq mean median uq max neval cld
nice.cuts(dta$values, thousands.separator = TRUE) 720.1378 815.51782 902.9218 923.97881 968.39036 1208.00434 100 b
nice.cuts2(dta$values, thousands.separator = TRUE) 11.4147 15.18232 16.6196 16.46937 17.05305 29.91089 100 a
>
nice.cuts2 <- function(variable, cuts = 10, thousands.separator = FALSE) {
# Load required packages (useful when used independently of context)
Vectorize(require)(package = c("gsubfn", "Hmisc", "scales"),
character.only = TRUE)
# Destring this variable
destring <- function(x) {
## convert factor to strings
if (is.character(x)) {
as.numeric(x)
} else if (is.factor(x)) {
as.numeric(levels(x))[x]
} else if (is.numeric(x)) {
x
} else {
stop("could not convert to numeric")
}
}
# Apply function
variable <- destring(variable)
# Check whether to disable scientific notation
if (mean(variable) > 100000) {
options(scipen = 999)
} else {
options(scipen = 0)
}
# Create pretty breaks
cut_breaks <- pretty_breaks(n = cuts)(variable)
# Round it two decimal places
variable <- round(variable, digits = 2)
# Develop cuts according to the provided object
cuts_variable <- cut2(x = variable, cuts = cut_breaks)
cuts_labels <- levels(cuts_variable)
# Check if variable is total or with decimals
if (all(cut_breaks %% 1 == 0)) {
# Variable is integer
cuts_labels <- gsubfn('\\[\\s*(\\d+),\\s*(\\d+)[^0-9]+',
~paste0(x, '-',as.numeric(y)-1),
as.character(cuts_labels))
} else {
# Variable is not integer
# Create clean cuts
cuts_labels <- gsubfn('\\[\\s*([0-9]+\\.*[0-9]*),\\s*(\\d+\\.\\d+).*',
~paste0(x, '-', as.numeric(y)- 0.01),
as.character(cuts_labels))
}
# Clean Inf
cuts_labels <- gsub("Inf", max(variable), cuts_labels)
# Clean punctuation
cuts_labels <- sub("\\[(.*), (.*)\\]", "\\1 - \\2", cuts_labels)
# Replace strings with spaces
cuts_labels <- gsub("-"," - ",cuts_labels, fixed = TRUE)
# Trim white spaces
cuts_labels <- trimws(cuts_labels)
if (thousands.separator == TRUE) {
cuts_labels <- sapply(strsplit(cuts_labels, " - "),
function(x) paste(prettyNum(x,
big.mark = ",",
preserve.width = "none"),
collapse = " - "))
}
levels(cuts_variable) <- cuts_labels
cuts_variable
}