R 如何在每次试验中有效地去除异常值
我是R新手,所以我只知道如何为循环编写代码,但我肯定认为有一种更有效的方法来完成我正在尝试的工作 以下是我现在掌握的代码:R 如何在每次试验中有效地去除异常值,r,performance,loops,dataframe,R,Performance,Loops,Dataframe,我是R新手,所以我只知道如何为循环编写代码,但我肯定认为有一种更有效的方法来完成我正在尝试的工作 以下是我现在掌握的代码: for (i in 1:length(unique(poo$TRIAL_INDEX))) { zz <- subset(poo, TRIAL_INDEX==i) sds <- sd(zz$RIGHT_PUPIL_SIZE, na.rm = TRUE) avgpupil <- mean(zz$RIGHT_PUPIL_SIZE, na.rm = TRUE) #
for (i in 1:length(unique(poo$TRIAL_INDEX))) {
zz <- subset(poo, TRIAL_INDEX==i)
sds <- sd(zz$RIGHT_PUPIL_SIZE, na.rm = TRUE)
avgpupil <- mean(zz$RIGHT_PUPIL_SIZE, na.rm = TRUE)
#what im trying to do in the lines above is subset the data for every trial
#so that I can calculate the standard deviation and average for each trial
for (j in 1:length(zz$RIGHT_PUPIL_SIZE)) {
if (zz$RIGHT_PUPIL_SIZE[j] > 3*sds+avgpupil | zz$RIGHT_PUPIL_SIZE[j] < avgpupil-3*sds | is.na(zz$RIGHT_PUPIL_SIZE[j])) {
zz$RIGHT_PUPIL_SIZE[j] <- NA_character_
goo <- rbind(zz[j],goo)
} else {
goo <- rbind(zz[j],goo)
}
}
}
#then I want it to replace the value in RIGHT_PUPIL_SIZE with NA if it is
# 3 SD above or under the mean, and if it's NA. Then I bind it to a new dataframe
for(1中的i:长度(唯一(poo$TRIAL_索引))){
zz这可能满足您的大部分要求。我不理解您问题的rbind
部分:
poo <- read.table(text = '
TRIAL_INDEX RIGHT_PUPIL_SIZE
1 10
1 8
1 6
1 4
1 NA
2 1
2 2
2 NA
2 4
2 5
', header = TRUE, stringsAsFactors = FALSE, na.strings = "NA")
my.summary <- as.data.frame(do.call("rbind", tapply(poo$RIGHT_PUPIL_SIZE, poo$TRIAL_INDEX,
function(x) c(index.sd = sd(x, na.rm = TRUE), index.mean = mean(x, na.rm = TRUE)))))
my.summary$TRIAL_INDEX <- rownames(my.summary)
poo <- merge(poo, my.summary, by = 'TRIAL_INDEX')
poo$RIGHT_PUPIL_SIZE <- ifelse( (poo$RIGHT_PUPIL_SIZE > (poo$index.mean + 3 * poo$index.sd)) |
(poo$RIGHT_PUPIL_SIZE < (poo$index.mean - 3 * poo$index.sd)) |
is.na(poo$RIGHT_PUPIL_SIZE), NA, poo$RIGHT_PUPIL_SIZE)
poo
# TRIAL_INDEX RIGHT_PUPIL_SIZE index.sd index.mean
#1 1 10 2.581989 7
#2 1 8 2.581989 7
#3 1 6 2.581989 7
#4 1 4 2.581989 7
#5 1 NA 2.581989 7
#6 2 1 1.825742 3
#7 2 2 1.825742 3
#8 2 NA 1.825742 3
#9 2 4 1.825742 3
#10 2 5 1.825742 3
下面是一个使用ave
的解决方案:
index.mean <- ave(poo$RIGHT_PUPIL_SIZE, poo$TRIAL_INDEX, FUN = function(x) mean(x, na.rm = TRUE))
index.sd <- ave(poo$RIGHT_PUPIL_SIZE, poo$TRIAL_INDEX, FUN = function(x) sd(x, na.rm = TRUE))
poo <- data.frame(poo, index.mean, index.sd)
poo$RIGHT_PUPIL_SIZE <- ifelse((poo$RIGHT_PUPIL_SIZE > (poo$index.mean + 3 * poo$index.sd)) |
(poo$RIGHT_PUPIL_SIZE < (poo$index.mean - 3 * poo$index.sd)) |
is.na(poo$RIGHT_PUPIL_SIZE), NA, poo$RIGHT_PUPIL_SIZE)
这里有一个使用数据.table
的解决方案。可能有更好的解决方案使用数据.table
。我想在发布这个答案之前,我只使用了数据.table
一次
library(dplyr)
my.summary <- poo %>%
group_by(TRIAL_INDEX) %>%
summarise(index.mean = mean(RIGHT_PUPIL_SIZE, na.rm = TRUE),
index.sd = sd(RIGHT_PUPIL_SIZE, na.rm = TRUE))
my.summary
poo <- merge(poo, as.data.frame(my.summary), by = 'TRIAL_INDEX')
poo$RIGHT_PUPIL_SIZE <- ifelse((poo$RIGHT_PUPIL_SIZE > (poo$index.mean + 3 * poo$index.sd)) |
(poo$RIGHT_PUPIL_SIZE < (poo$index.mean - 3 * poo$index.sd)) |
is.na(poo$RIGHT_PUPIL_SIZE), NA, poo$RIGHT_PUPIL_SIZE)
poo
poo <- read.table(text = '
TRIAL_INDEX RIGHT_PUPIL_SIZE
1 10
1 8
1 6
1 4
1 NA
2 1
2 2
2 NA
2 4
2 5
', header = TRUE, stringsAsFactors = FALSE, na.strings = "NA")
library(data.table)
my.summary <- data.frame(setDT(poo)[, .(index.mean = mean(RIGHT_PUPIL_SIZE, na.rm = TRUE),
index.sd = sd(RIGHT_PUPIL_SIZE, na.rm = TRUE)),
.(TRIAL_INDEX)])
poo <- merge(poo, my.summary, by = 'TRIAL_INDEX')
poo$RIGHT_PUPIL_SIZE <- ifelse((poo$RIGHT_PUPIL_SIZE > (poo$index.mean + 3 * poo$index.sd)) |
(poo$RIGHT_PUPIL_SIZE < (poo$index.mean - 3 * poo$index.sd)) |
is.na(poo$RIGHT_PUPIL_SIZE), NA, poo$RIGHT_PUPIL_SIZE)
poo
poo这可能会满足您的大部分需求。我不理解您问题的rbind
部分:
poo <- read.table(text = '
TRIAL_INDEX RIGHT_PUPIL_SIZE
1 10
1 8
1 6
1 4
1 NA
2 1
2 2
2 NA
2 4
2 5
', header = TRUE, stringsAsFactors = FALSE, na.strings = "NA")
my.summary <- as.data.frame(do.call("rbind", tapply(poo$RIGHT_PUPIL_SIZE, poo$TRIAL_INDEX,
function(x) c(index.sd = sd(x, na.rm = TRUE), index.mean = mean(x, na.rm = TRUE)))))
my.summary$TRIAL_INDEX <- rownames(my.summary)
poo <- merge(poo, my.summary, by = 'TRIAL_INDEX')
poo$RIGHT_PUPIL_SIZE <- ifelse( (poo$RIGHT_PUPIL_SIZE > (poo$index.mean + 3 * poo$index.sd)) |
(poo$RIGHT_PUPIL_SIZE < (poo$index.mean - 3 * poo$index.sd)) |
is.na(poo$RIGHT_PUPIL_SIZE), NA, poo$RIGHT_PUPIL_SIZE)
poo
# TRIAL_INDEX RIGHT_PUPIL_SIZE index.sd index.mean
#1 1 10 2.581989 7
#2 1 8 2.581989 7
#3 1 6 2.581989 7
#4 1 4 2.581989 7
#5 1 NA 2.581989 7
#6 2 1 1.825742 3
#7 2 2 1.825742 3
#8 2 NA 1.825742 3
#9 2 4 1.825742 3
#10 2 5 1.825742 3
下面是一个使用ave
的解决方案:
index.mean <- ave(poo$RIGHT_PUPIL_SIZE, poo$TRIAL_INDEX, FUN = function(x) mean(x, na.rm = TRUE))
index.sd <- ave(poo$RIGHT_PUPIL_SIZE, poo$TRIAL_INDEX, FUN = function(x) sd(x, na.rm = TRUE))
poo <- data.frame(poo, index.mean, index.sd)
poo$RIGHT_PUPIL_SIZE <- ifelse((poo$RIGHT_PUPIL_SIZE > (poo$index.mean + 3 * poo$index.sd)) |
(poo$RIGHT_PUPIL_SIZE < (poo$index.mean - 3 * poo$index.sd)) |
is.na(poo$RIGHT_PUPIL_SIZE), NA, poo$RIGHT_PUPIL_SIZE)
这里有一个使用数据.table
的解决方案。可能有更好的解决方案使用数据.table
。我想在发布这个答案之前,我只使用了数据.table
一次
library(dplyr)
my.summary <- poo %>%
group_by(TRIAL_INDEX) %>%
summarise(index.mean = mean(RIGHT_PUPIL_SIZE, na.rm = TRUE),
index.sd = sd(RIGHT_PUPIL_SIZE, na.rm = TRUE))
my.summary
poo <- merge(poo, as.data.frame(my.summary), by = 'TRIAL_INDEX')
poo$RIGHT_PUPIL_SIZE <- ifelse((poo$RIGHT_PUPIL_SIZE > (poo$index.mean + 3 * poo$index.sd)) |
(poo$RIGHT_PUPIL_SIZE < (poo$index.mean - 3 * poo$index.sd)) |
is.na(poo$RIGHT_PUPIL_SIZE), NA, poo$RIGHT_PUPIL_SIZE)
poo
poo <- read.table(text = '
TRIAL_INDEX RIGHT_PUPIL_SIZE
1 10
1 8
1 6
1 4
1 NA
2 1
2 2
2 NA
2 4
2 5
', header = TRUE, stringsAsFactors = FALSE, na.strings = "NA")
library(data.table)
my.summary <- data.frame(setDT(poo)[, .(index.mean = mean(RIGHT_PUPIL_SIZE, na.rm = TRUE),
index.sd = sd(RIGHT_PUPIL_SIZE, na.rm = TRUE)),
.(TRIAL_INDEX)])
poo <- merge(poo, my.summary, by = 'TRIAL_INDEX')
poo$RIGHT_PUPIL_SIZE <- ifelse((poo$RIGHT_PUPIL_SIZE > (poo$index.mean + 3 * poo$index.sd)) |
(poo$RIGHT_PUPIL_SIZE < (poo$index.mean - 3 * poo$index.sd)) |
is.na(poo$RIGHT_PUPIL_SIZE), NA, poo$RIGHT_PUPIL_SIZE)
poo
poo以下是一些示例数据:
#dput(poo)
poo<-structure(list(TRIAL_INDEX = structure(c(1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("A",
"B"), class = "factor"), RIGHT_PUPIL_SIZE = c(10.2043651385866,
20.9885863196198, NA, 199, 8.83696635172232, 18.7815785751864,
10.3610991868418, 19.6540748580446, 8.5323332390802, 20.2930866405183,
8.74706048647041, 17.6785303413612, 10.0699206520888, 21.359973619746,
10.1517982308973, 18.7513452694493, 8.44732655940166, 20.5369556689887,
8.63612148828901, 22.2712027851507)), .Names = c("TRIAL_INDEX",
"RIGHT_PUPIL_SIZE"), row.names = c(NA, -20L), class = "data.frame")
#dput(poo)
poo以下是一些样本数据:
#dput(poo)
poo<-structure(list(TRIAL_INDEX = structure(c(1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("A",
"B"), class = "factor"), RIGHT_PUPIL_SIZE = c(10.2043651385866,
20.9885863196198, NA, 199, 8.83696635172232, 18.7815785751864,
10.3610991868418, 19.6540748580446, 8.5323332390802, 20.2930866405183,
8.74706048647041, 17.6785303413612, 10.0699206520888, 21.359973619746,
10.1517982308973, 18.7513452694493, 8.44732655940166, 20.5369556689887,
8.63612148828901, 22.2712027851507)), .Names = c("TRIAL_INDEX",
"RIGHT_PUPIL_SIZE"), row.names = c(NA, -20L), class = "data.frame")
#dput(poo)
你能给我们一份你的poo
的样品吗