R 数据帧中每组的平均值
我有一个R 数据帧中每组的平均值,r,aggregate,R,Aggregate,我有一个数据框,我需要计算每组的平均值(即每个月,如下) 我期望的输出如下所示,其中Rate1和Rate2的值是组平均值。请忽略此值,我已为示例补足了此值 Name Rate1 Rate2 Aira 23.21 12.2 Ben 45.23 43.9 Cat 33.22 32.2 这种类型的操作正是aggregate设计的目的: d <- read.table(text= 'Name
数据框
,我需要计算每组的平均值(即每个月
,如下)
我期望的输出如下所示,其中Rate1
和Rate2
的值是组平均值。请忽略此值,我已为示例补足了此值
Name Rate1 Rate2
Aira 23.21 12.2
Ben 45.23 43.9
Cat 33.22 32.2
这种类型的操作正是
aggregate
设计的目的:
d <- read.table(text=
'Name Month Rate1 Rate2
Aira 1 12 23
Aira 2 18 73
Aira 3 19 45
Ben 1 53 19
Ben 2 22 87
Ben 3 19 45
Cat 1 22 87
Cat 2 67 43
Cat 3 45 32', header=TRUE)
aggregate(d[, 3:4], list(d$Name), mean)
Group.1 Rate1 Rate2
1 Aira 16.33333 47.00000
2 Ben 31.33333 50.33333
3 Cat 44.66667 54.00000
您还可以使用package
plyr
,它在某种程度上更加通用:
library(plyr)
ddply(d, .(Name), summarize, Rate1=mean(Rate1), Rate2=mean(Rate2))
Name Rate1 Rate2
1 Aira 16.33333 47.00000
2 Ben 31.33333 50.33333
3 Cat 44.66667 54.00000
第三个很好的替代方法是使用包
data.table
,它也有类data.frame,但是像您正在寻找的操作的计算速度要快得多
library(data.table)
mydt <- structure(list(Name = c("Aira", "Aira", "Aira", "Ben", "Ben", "Ben", "Cat", "Cat", "Cat"), Month = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), Rate1 = c(15.6396600443877, 2.15649279424609, 6.24692918928743, 2.37658797276116, 34.7500663272292, 3.28750138697048, 29.3265553981065, 17.9821839334431, 10.8639802575958), Rate2 = c(17.1680489538369, 5.84231656330206, 8.54330866437461, 5.88415184986176, 3.02064294862551, 17.2053351400752, 16.9552950199166, 2.56058000170089, 15.7496228048122)), .Names = c("Name", "Month", "Rate1", "Rate2"), row.names = c(NA, -9L), class = c("data.table", "data.frame"))
库(data.table)
mydt或使用包中的分组和总结
:
库(dplyr)
d%>%
分组单位(名称)%>%
总结(变量(-月),funs(平均值(,na.rm=TRUE)))
#一个tibble:3x3
名称费率1费率2
1 Aira 16.3 47.0
2本31.3 50.3
3类44.7 54.0
有关指定要作用的变量的多种方法,请参见
?在
处总结。在这里,vars(-Month)
表示除Month
之外的所有变量。您也可以使用通用函数cbind()
和lm()
,而不使用截取:
cbind(lm(d$Rate1~-1+d$Name)$coef,lm(d$Rate2~-1+d$Name)$coef)
> [,1] [,2]
>d$NameAira 16.33333 47.00000
>d$NameBen 31.33333 50.33333
>d$NameCat 44.66667 54.00000
您还可以使用
sqldf
包来完成此操作,如下所示:
library(sqldf)
x <- read.table(text='Name Month Rate1 Rate2
Aira 1 12 23
Aira 2 18 73
Aira 3 19 45
Ben 1 53 19
Ben 2 22 87
Ben 3 19 45
Cat 1 22 87
Cat 2 67 43
Cat 3 45 32', header=TRUE)
sqldf("
select
Name
,avg(Rate1) as Rate1_float
,avg(Rate2) as Rate2_float
,avg(Rate1) as Rate1
,avg(Rate2) as Rate2
from x
group by
Name
")
# Name Rate1_float Rate2_float Rate1 Rate2
#1 Aira 16.33333 47.00000 16 47
#2 Ben 31.33333 50.33333 31 50
#3 Cat 44.66667 54.00000 44 54
库(sqldf)
x我描述了两种方法,一种基于数据表,另一种基于重塑2包。data.table方法已经有了答案,但我已经尝试让它更清晰、更详细
数据如下:
d <- structure(list(Name = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L), .Label = c("Aira", "Ben", "Cat"), class = "factor"),
Month = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), Rate1 = c(12L,
18L, 19L, 53L, 22L, 19L, 22L, 67L, 45L), Rate2 = c(23L, 73L,
45L, 19L, 87L, 45L, 87L, 43L, 32L)), .Names = c("Name", "Month",
"Rate1", "Rate2"), class = "data.frame", row.names = c(NA, -9L
))
head(d)
Name Month Rate1 Rate2
1 Aira 1 12 23
2 Aira 2 18 73
3 Aira 3 19 45
4 Ben 1 53 19
5 Ben 2 22 87
6 Ben 3 19 45
library("reshape2")
mym <- melt(d, id = c("Name"))
res <- dcast(mym, Name ~ variable, mean)
res
#Name Month Rate1 Rate2
#1 Aira 2 16.33333 47.00000
#2 Ben 2 31.33333 50.33333
#3 Cat 2 44.66667 54.00000
另一种方法是避免使用.SD在data.table中为j编写许多参数
d[, lapply(.SD, mean), by = .(Name)]
# Name Month Rate1 Rate2
#1: Aira 2 16.33333 47.00000
#2: Ben 2 31.33333 50.33333
#3: Cat 2 44.66667 54.00000
如果我们只想要Rate1和Rate2,那么我们可以使用.SDcols,如下所示:
d[, lapply(.SD, mean), by = .(Name), .SDcols = 3:4]
# Name Rate1 Rate2
#1: Aira 16.33333 47.00000
#2: Ben 31.33333 50.33333
#3: Cat 44.66667 54.00000
下面是在baseR
中执行此操作的多种方法,包括另一种aggregate
方法。下面的例子是每月退货,我想这正是您所要求的。尽管如此,同样的方法可用于返回人均收入:
使用ave
:
my.data <- read.table(text = '
Name Month Rate1 Rate2
Aira 1 12 23
Aira 2 18 73
Aira 3 19 45
Ben 1 53 19
Ben 2 22 87
Ben 3 19 45
Cat 1 22 87
Cat 2 67 43
Cat 3 45 32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')
Rate1.mean <- with(my.data, ave(Rate1, Month, FUN = function(x) mean(x, na.rm = TRUE)))
Rate2.mean <- with(my.data, ave(Rate2, Month, FUN = function(x) mean(x, na.rm = TRUE)))
my.data <- data.frame(my.data, Rate1.mean, Rate2.mean)
my.data
使用lappy
和split
:
my.data <- read.table(text = '
Name Month Rate1 Rate2
Aira 1 12 23
Aira 2 18 73
Aira 3 19 45
Ben 1 53 19
Ben 2 22 87
Ben 3 19 45
Cat 1 22 87
Cat 2 67 43
Cat 3 45 32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')
ly.mean <- lapply(split(my.data, my.data$Month), function(x) c(Mean = colMeans(x[,3:4])))
ly.mean <- as.data.frame(do.call("rbind", ly.mean))
ly.mean <- cbind(Month = rownames(ly.mean), ly.mean)
my.data <- merge(my.data, ly.mean, by = 'Month')
my.data
my.data <- read.table(text = '
Name Month Rate1 Rate2
Aira 1 12 23
Aira 2 18 73
Aira 3 19 45
Ben 1 53 19
Ben 2 22 87
Ben 3 19 45
Cat 1 22 87
Cat 2 67 43
Cat 3 45 32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')
my.data
sy.mean <- t(sapply(split(my.data, my.data$Month), function(x) colMeans(x[,3:4])))
colnames(sy.mean) <- c('Rate1.mean', 'Rate2.mean')
sy.mean <- data.frame(Month = rownames(sy.mean), sy.mean, stringsAsFactors = FALSE)
my.data <- merge(my.data, sy.mean, by = 'Month')
my.data
使用聚合
:
my.data <- read.table(text = '
Name Month Rate1 Rate2
Aira 1 12 23
Aira 2 18 73
Aira 3 19 45
Ben 1 53 19
Ben 2 22 87
Ben 3 19 45
Cat 1 22 87
Cat 2 67 43
Cat 3 45 32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')
my.summary <- with(my.data, aggregate(list(Rate1, Rate2), by = list(Month),
FUN = function(x) { mon.mean = mean(x, na.rm = TRUE) } ))
my.summary <- do.call(data.frame, my.summary)
colnames(my.summary) <- c('Month', 'Rate1.mean', 'Rate2.mean')
my.summary
my.data <- merge(my.data, my.summary, by = 'Month')
my.data
如果有更多的列,如FirstName、LastName和Address,而不仅仅是要分组的名称,这可能会很有用。是的,通过将其更改为聚合(d[,3:4],list(Name=d$Name),mean)
当使用聚合(as.numeric(matrix$value),list(matrix$hour),mean)
时,出于某种原因,我得到了一些NaN
-值。检查my data.table中的is.nan()
和is.na()
不会显示任何结果。有什么想法吗?@jdeTypere-想不出为什么,但仔细看看split(矩阵$value,矩阵$hour)
的元素,这些元素对应于从聚合中返回的NaN
(即split(矩阵$value,矩阵$hour)[is.NaN(聚合为.numeric(矩阵$value),列表(矩阵$hour),平均值)答案很好。aggregate(d[,3:4],list(d[,1]),mean)
或者aggregate(d[,c('Rate1','Rate2')],list(d[,c('Name')),mean)
更“一致”,imho.@FabianHabersack您将?将结果与原始数据合并,但您需要为聚合对象使用不同的名称。使用dplyr更容易完成所需操作:d%%>%groupby(Name)%%>%mutate(mean1=mean(Rate1),mean2=mean(Rate2))
刚刚意识到已经有了一个答案:您能对舍入进行评论吗?Hi@partickt(很抱歉延迟),这里描述的是SQL的细微差别:。正如您所看到的,由于SQL是以int
开头的,所以它将其保持为int
平均值(cast(Ratei As float))
应该得到一个可以根据需要用四舍五入括起来的十进制值。avg(Ratei*1.0)
还应该强制使用一个float
…还没有运行它summarise.\u在新版本中,每个都不推荐使用。我应该使用什么intead?我尝试了在或如果或全部汇总,但没有成功。尝试d%>%group\u by(Name)%>%summary\u在(.vars=name(.)[3:4],.funs=c(mean=“mean”))
为什么要汇总而不是仅仅汇总?
允许您指定要汇总的特定变量范围即使在数据表中也必须使用lapply?Hermantouthrot我会这么说,至少对我来说是这样。在这种情况下,尤其是通过使用.SDcols
,他们精确地指定了要应用该变量的列函数to。如果您可以想象这样一种情况,即data.table的列数较大,则可以首先获取所有数值列的名称,然后使用.SDcols=numericVars
,而不必尝试将它们全部命名在j列中。另一个实例是使用:=
就地赋值e> .SD:=lappy(.SD,f)
d[, lapply(.SD, mean), by = .(Name)]
# Name Month Rate1 Rate2
#1: Aira 2 16.33333 47.00000
#2: Ben 2 31.33333 50.33333
#3: Cat 2 44.66667 54.00000
d[, lapply(.SD, mean), by = .(Name), .SDcols = 3:4]
# Name Rate1 Rate2
#1: Aira 16.33333 47.00000
#2: Ben 31.33333 50.33333
#3: Cat 44.66667 54.00000
my.data <- read.table(text = '
Name Month Rate1 Rate2
Aira 1 12 23
Aira 2 18 73
Aira 3 19 45
Ben 1 53 19
Ben 2 22 87
Ben 3 19 45
Cat 1 22 87
Cat 2 67 43
Cat 3 45 32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')
Rate1.mean <- with(my.data, ave(Rate1, Month, FUN = function(x) mean(x, na.rm = TRUE)))
Rate2.mean <- with(my.data, ave(Rate2, Month, FUN = function(x) mean(x, na.rm = TRUE)))
my.data <- data.frame(my.data, Rate1.mean, Rate2.mean)
my.data
my.data <- read.table(text = '
Name Month Rate1 Rate2
Aira 1 12 23
Aira 2 18 73
Aira 3 19 45
Ben 1 53 19
Ben 2 22 87
Ben 3 19 45
Cat 1 22 87
Cat 2 67 43
Cat 3 45 32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')
by.month <- as.data.frame(do.call("rbind", by(my.data, my.data$Month, FUN = function(x) colMeans(x[,3:4]))))
colnames(by.month) <- c('Rate1.mean', 'Rate2.mean')
by.month <- cbind(Month = rownames(by.month), by.month)
my.data <- merge(my.data, by.month, by = 'Month')
my.data
my.data <- read.table(text = '
Name Month Rate1 Rate2
Aira 1 12 23
Aira 2 18 73
Aira 3 19 45
Ben 1 53 19
Ben 2 22 87
Ben 3 19 45
Cat 1 22 87
Cat 2 67 43
Cat 3 45 32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')
ly.mean <- lapply(split(my.data, my.data$Month), function(x) c(Mean = colMeans(x[,3:4])))
ly.mean <- as.data.frame(do.call("rbind", ly.mean))
ly.mean <- cbind(Month = rownames(ly.mean), ly.mean)
my.data <- merge(my.data, ly.mean, by = 'Month')
my.data
my.data <- read.table(text = '
Name Month Rate1 Rate2
Aira 1 12 23
Aira 2 18 73
Aira 3 19 45
Ben 1 53 19
Ben 2 22 87
Ben 3 19 45
Cat 1 22 87
Cat 2 67 43
Cat 3 45 32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')
my.data
sy.mean <- t(sapply(split(my.data, my.data$Month), function(x) colMeans(x[,3:4])))
colnames(sy.mean) <- c('Rate1.mean', 'Rate2.mean')
sy.mean <- data.frame(Month = rownames(sy.mean), sy.mean, stringsAsFactors = FALSE)
my.data <- merge(my.data, sy.mean, by = 'Month')
my.data
my.data <- read.table(text = '
Name Month Rate1 Rate2
Aira 1 12 23
Aira 2 18 73
Aira 3 19 45
Ben 1 53 19
Ben 2 22 87
Ben 3 19 45
Cat 1 22 87
Cat 2 67 43
Cat 3 45 32
', header = TRUE, stringsAsFactors = FALSE, na.strings = 'NA')
my.summary <- with(my.data, aggregate(list(Rate1, Rate2), by = list(Month),
FUN = function(x) { mon.mean = mean(x, na.rm = TRUE) } ))
my.summary <- do.call(data.frame, my.summary)
colnames(my.summary) <- c('Month', 'Rate1.mean', 'Rate2.mean')
my.summary
my.data <- merge(my.data, my.summary, by = 'Month')
my.data
my.group <- c(1,2,1,2,2,3,1,2,3,3)
my.data <- matrix(c( 1, 2, 3, 4, 5,
10, 20, 30, 40, 50,
2, 4, 6, 8, 10,
20, 30, 40, 50, 60,
20, 18, 16, 14, 12,
1000, 1100, 1200, 1300, 1400,
2, 3, 4, 3, 2,
50, 40, 30, 20, 10,
1001, 2001, 3001, 4001, 5001,
1000, 2000, 3000, 4000, 5000), nrow = 10, ncol = 5, byrow = TRUE)
my.data
my.summary <- aggregate(list(my.data), by = list(my.group), FUN = function(x) { my.mean = mean(x, na.rm = TRUE) } )
my.summary
# Group.1 X1 X2 X3 X4 X5
#1 1 1.666667 3.000 4.333333 5.000 5.666667
#2 2 25.000000 27.000 29.000000 31.000 33.000000
#3 3 1000.333333 1700.333 2400.333333 3100.333 3800.333333