R 如何查找数据框中两列的异常值
我需要得到每种类型的variable2和variable3的variable1的异常值总数,然后将其显示在表中。它还需要只显示variable4大于1.5的情况。我让它工作,但我认为我的代码有问题,因为每个的输出都是0,这是不正确的 当我执行boxplot.stats(df$variable1)$out时,我会得到一个巨大的异常值列表。但是,当我使用下面的代码时,它只表示每个代码为0R 如何查找数据框中两列的异常值,r,R,我需要得到每种类型的variable2和variable3的variable1的异常值总数,然后将其显示在表中。它还需要只显示variable4大于1.5的情况。我让它工作,但我认为我的代码有问题,因为每个的输出都是0,这是不正确的 当我执行boxplot.stats(df$variable1)$out时,我会得到一个巨大的异常值列表。但是,当我使用下面的代码时,它只表示每个代码为0 high <- mean(df$variable1) + sd(df$variable1) * 3 low
high <- mean(df$variable1) + sd(df$variable1) * 3
low <- mean(df$variable1) - sd(df$variable1) * 3
df%>%
filter(varaible4>1.5)%>%
group_by(variable2, variable3) %>%
tally(variable1 < low ||variable1 > high)
高1.5)%
分组依据(变量2,变量3)%>%
计数(可变1<低| |可变1>高)
一个表格显示了每种类型的变量2和变量3…但计数只是表示每种类型为0。也许您可以使用
比例
,而不是定义高
和低
阈值并使用计数
以下是基于一些随机数据的实现:
library(dplyr)
df = data.frame(variable1 = runif(100,1,10),
variable2 = round(runif(100,1,3)),
variable3 = round(runif(100,1,3)),
variable4 = runif(100,1,5))
df$variable1[c(5,13,95)] = 1000
df1 <- df %>%
filter(variable4>1.5)%>%
group_by(variable2, variable3) %>%
mutate(individual_outliers = abs(scale(variable1) > 3),
total_outliers = sum(individual_outliers))
> df1
# A tibble: 91 x 6
# Groups: variable2, variable3 [9]
variable1 variable2 variable3 variable4 individual_outliers total_outliers
<dbl> <dbl> <dbl> <dbl> <int> <int>
1 6.86 2 3 2.82 0 0
2 4.89 1 2 3.27 0 0
3 4.19 2 3 3.03 0 0
4 2.05 2 3 2.31 0 0
5 1000 3 2 2.08 1 1
6 9.36 2 2 3.85 0 0
7 8.40 3 3 3.81 0 0
8 8.33 3 2 2.32 0 1
9 7.92 2 1 4.58 0 0
10 8.13 3 1 2.48 0 0
# ... with 81 more rows
库(dplyr)
df=数据帧(variable1=runif(100,1,10),
变量2=四舍五入(runif(100,1,3)),
变量3=四舍五入(runif(100,1,3)),
variable4=runif(100,1,5))
df$variable1[c(5,13,95)]=1000
df1%
过滤器(变量4>1.5)%>%
分组依据(变量2,变量3)%>%
突变(单个异常值=绝对值(标度(变量1)>3),
总异常值=总和(单个异常值))
>df1
#A tibble:91 x 6
#组:变量2,变量3[9]
变量1变量2变量3变量4单个异常值总计异常值
1 6.86 2 3 2.82 0 0
2 4.89 1 2 3.27 0 0
3 4.19 2 3 3.03 0 0
4 2.05 2 3 2.31 0 0
5 1000 3 2 2.08 1 1
6 9.36 2 2 3.85 0 0
7 8.40 3 3 3.81 0 0
8 8.33 3 2 2.32 0 1
9 7.92 2 1 4.58 0 0
10 8.13 3 1 2.48 0 0
# ... 还有81行
数据:
df请查看并提供一些原始数据。
df <- data.frame(variable1 = runif(1000,1,10),
variable2 = round(runif(1000,1,3)),
variable3 = round(runif(1000,1,3)),
variable4 = runif(1000,1,5),
variable5 = rep(LETTERS[1:4], 250),
variable6 = rep(LETTERS[5:9], 200), stringsAsFactors = F)
df$variable1[c(5,13,95)] = 1000
# Create a grouping vector:
grouping_vars <- c("variable5", "variable6")
# Split apply combine function:
tmp_df <- do.call(rbind, lapply(split(df[,sapply(df, is.numeric)], df[,grouping_vars]), function(x){
# Calculate mahalanobis distance:
md <- mahalanobis(x, colMeans(x), cov(x), inverted = FALSE)
# Calculate the iqr of the md:
iqr <- quantile(md, .75) - quantile(md, .25)
# Classify the lower threshold outliers:
lwr <- ifelse(md > (quantile(md, .75) + (1.5 * iqr)) | (md < (quantile(md, .25) - (1.5 * iqr))),
"outlier",
"not outlier")
# Classify the upper threshold outliers:
upr <- ifelse(md > (quantile(md, .75) + (3 * iqr)) | (md < (quantile(md, .25) - (3 * iqr))),
"outlier",
"not outlier")
# Bind all of the vecs together:
cbind(x, md, lwr, upr)
}
)
)
# Extract the group from the row names:
tmp_df <- data.frame(cbind(df[,!(sapply(df, is.numeric))],
grouping_vars = row.names(tmp_df), tmp_df), row.names = NULL)
df <- tmp_df[,c(names(df), setdiff(names(tmp_df), names(df)))]
# Use boxplot stats mean(x) +- 1.5 * IQR:
outliers_classified <- do.call("rbind", lapply(split(df, df[,grouping_vars]), function(x){
if(is.numeric(x)){
ifelse(x %in% boxplot.stats(x)$out, NA, x)
}else{
x
}
}
)
)