聚合R中的数据帧子集_R_Dataframe_Subset_Aggregation

聚合R中的数据帧子集

r dataframe

聚合R中的数据帧子集,r,dataframe,subset,aggregation,R,Dataframe,Subset,Aggregation,我有数据帧ds CountyID ZipCode Value1 Value2 Value3 ... Value25 1 1 0 etc etc etc 2 1 3 3 1 0 4 1 1 5 2 2

我有数据帧

ds

CountyID  ZipCode   Value1    Value2    Value3 ...   Value25
   1        1         0        etc        etc          etc
   2        1         3       
   3        1         0       
   4        1         1       
   5        2         2       
   6        3         3       
   7        4         7
   8        4         2       
   9        5         1       
   10       6         0

并且希望基于 `ds$ZipCode` 进行聚合，并根据每个邮政编码中 `ds$Value1` 最高的那一行，将 `ds$CountyID` 设置为该邮政编码的主要县。对于上面的示例，结果将如下所示：

CountyID  ZipCode   Value1    Value2    Value3 ...   Value25
   2        1         4        etc        etc          etc
   5        2         2       
   6        3         3       
   7        4         9       
   9        5         1       
   10       6         0

所有 `ValueX` 列都是按 `ZipCode` 分组后各列的总和。

在过去的几天里，我尝试了很多不同的策略，但都不管用。我想到的最好的办法是

# Collapse `ds` to one row per ZipCode:
#   * columns 3 to 27 (Value1 ... Value25) are summed within each zip code;
#   * CountyID becomes the county whose row has the highest Value1 in that
#     zip code (the "primary" county).

# Build one aggregated row per unique zip code. The rows are collected in a
# list and bound once at the end instead of growing a data frame in a loop.
zip_rows <- lapply(unique(ds$ZipCode), function(zip) {
  # which() drops NA comparisons, so rows with a missing ZipCode are skipped.
  sub <- ds[which(ds$ZipCode == zip), , drop = FALSE]

  # Row with the largest Value1 identifies the primary county.
  max_index <- which.max(sub$Value1)

  aggregate(
    sub[3:27],
    by = list(
      # Grouping vectors need one entry per ROW of `sub`; nrow() is required
      # here because length() of a data frame is its number of columns.
      CountyID = rep(sub$CountyID[max_index], nrow(sub)),
      ZipCode = sub$ZipCode
    ),
    FUN = sum
  )
})

# do.call(rbind, list()) would return NULL, so keep an empty data frame
# when `ds` has no rows.
ds_temp <- if (length(zip_rows) > 0) do.call(rbind, zip_rows) else data.frame()

ds <- ds_temp

#初始化数据帧
ds_temp = data.frame()
#基于唯一zipcodes遍历每个子集
for (zip in unique(ds$ZipCode)) {
    sub <- ...

您可以尝试以下方法：
# Sum every value column (columns 3 to 27) within each ZipCode.
zip_sums <- aggregate(df[, 3:27], by = list(df$ZipCode), FUN = sum)

# For each ZipCode, pick the CountyID of the row with the largest Value1.
primary_county <- unlist(lapply(
  split(df, df$ZipCode),
  function(grp) grp$CountyID[which.max(grp$Value1)]
))

# Combine the per-zip sums with the primary county of each zip code.
data.frame(zip_sums, CountyID = primary_county)

完全可复制的样本数据：
# Reproducible sample data: ten counties across six zip codes, one value column.
df <- read.table(header = TRUE, text = "
CountyID  ZipCode   Value1    
   1        1         0   
   2        1         3       
   3        1         0       
   4        1         1       
   5        2         2       
   6        3         3       
   7        4         7
   8        4         2       
   9        5         1       
   10       6         0")

# Total Value1 per zip code. The single column is passed as a plain vector,
# so the result columns are named `Group.1` and `x`.
value_by_zip <- aggregate(df[, 3], by = list(df$ZipCode), FUN = sum)

# CountyID of the row with the largest Value1 inside each zip code.
top_county <- unlist(lapply(
  split(df, df$ZipCode),
  function(grp) grp$CountyID[which.max(grp$Value1)]
))

data.frame(value_by_zip, CountyID = top_county)

#  Group.1 x CountyID
#1       1 4        2
#2       2 2        5
#3       3 3        6
#4       4 9        7
#5       5 1        9
#6       6 0       10

您可以尝试以下方法：
# Sum every value column (columns 3 to 27) within each ZipCode.
zip_sums <- aggregate(df[, 3:27], by = list(df$ZipCode), FUN = sum)

# For each ZipCode, pick the CountyID of the row with the largest Value1.
primary_county <- unlist(lapply(
  split(df, df$ZipCode),
  function(grp) grp$CountyID[which.max(grp$Value1)]
))

# Combine the per-zip sums with the primary county of each zip code.
data.frame(zip_sums, CountyID = primary_county)

完全可复制的样本数据：
# Reproducible sample data: ten counties across six zip codes, one value column.
df <- read.table(header = TRUE, text = "
CountyID  ZipCode   Value1    
   1        1         0   
   2        1         3       
   3        1         0       
   4        1         1       
   5        2         2       
   6        3         3       
   7        4         7
   8        4         2       
   9        5         1       
   10       6         0")

# Total Value1 per zip code. The single column is passed as a plain vector,
# so the result columns are named `Group.1` and `x`.
value_by_zip <- aggregate(df[, 3], by = list(df$ZipCode), FUN = sum)

# CountyID of the row with the largest Value1 inside each zip code.
top_county <- unlist(lapply(
  split(df, df$ZipCode),
  function(grp) grp$CountyID[which.max(grp$Value1)]
))

data.frame(value_by_zip, CountyID = top_county)

#  Group.1 x CountyID
#1       1 4        2
#2       2 2        5
#3       3 3        6
#4       4 9        7
#5       5 1        9
#6       6 0       10

您可以尝试以下方法：
# Sum every value column (columns 3 to 27) within each ZipCode.
zip_sums <- aggregate(df[, 3:27], by = list(df$ZipCode), FUN = sum)

# For each ZipCode, pick the CountyID of the row with the largest Value1.
primary_county <- unlist(lapply(
  split(df, df$ZipCode),
  function(grp) grp$CountyID[which.max(grp$Value1)]
))

# Combine the per-zip sums with the primary county of each zip code.
data.frame(zip_sums, CountyID = primary_county)

完全可复制的样本数据：
# Reproducible sample data: ten counties across six zip codes, one value column.
df <- read.table(header = TRUE, text = "
CountyID  ZipCode   Value1    
   1        1         0   
   2        1         3       
   3        1         0       
   4        1         1       
   5        2         2       
   6        3         3       
   7        4         7
   8        4         2       
   9        5         1       
   10       6         0")

# Total Value1 per zip code. The single column is passed as a plain vector,
# so the result columns are named `Group.1` and `x`.
value_by_zip <- aggregate(df[, 3], by = list(df$ZipCode), FUN = sum)

# CountyID of the row with the largest Value1 inside each zip code.
top_county <- unlist(lapply(
  split(df, df$ZipCode),
  function(grp) grp$CountyID[which.max(grp$Value1)]
))

data.frame(value_by_zip, CountyID = top_county)

#  Group.1 x CountyID
#1       1 4        2
#2       2 2        5
#3       3 3        6
#4       4 9        7
#5       5 1        9
#6       6 0       10

您可以尝试以下方法：
# Sum every value column (columns 3 to 27) within each ZipCode.
zip_sums <- aggregate(df[, 3:27], by = list(df$ZipCode), FUN = sum)

# For each ZipCode, pick the CountyID of the row with the largest Value1.
primary_county <- unlist(lapply(
  split(df, df$ZipCode),
  function(grp) grp$CountyID[which.max(grp$Value1)]
))

# Combine the per-zip sums with the primary county of each zip code.
data.frame(zip_sums, CountyID = primary_county)

完全可复制的样本数据：
# Reproducible sample data: ten counties across six zip codes, one value column.
df <- read.table(header = TRUE, text = "
CountyID  ZipCode   Value1    
   1        1         0   
   2        1         3       
   3        1         0       
   4        1         1       
   5        2         2       
   6        3         3       
   7        4         7
   8        4         2       
   9        5         1       
   10       6         0")

# Total Value1 per zip code. The single column is passed as a plain vector,
# so the result columns are named `Group.1` and `x`.
value_by_zip <- aggregate(df[, 3], by = list(df$ZipCode), FUN = sum)

# CountyID of the row with the largest Value1 inside each zip code.
top_county <- unlist(lapply(
  split(df, df$ZipCode),
  function(grp) grp$CountyID[which.max(grp$Value1)]
))

data.frame(value_by_zip, CountyID = top_county)

#  Group.1 x CountyID
#1       1 4        2
#2       2 2        5
#3       3 3        6
#4       4 9        7
#5       5 1        9
#6       6 0       10

针对您对 Frank 答案的评论，您可以使用 `aggregate` 中的公式方法保留列名。使用 Frank 的数据 `df`，代码如下：
# Sum Value1 per ZipCode with the formula interface, which keeps the original
# column names (ZipCode, Value1), then attach the CountyID of the row holding
# the largest Value1 in each zip code.
cbind(
  aggregate(Value1 ~ ZipCode, df, sum),
  CountyID = sapply(split(df, df$ZipCode), function(x) {
    # which.max() yields exactly one index per group. Comparing against
    # max(Value1) would return several CountyIDs when the maximum is tied,
    # giving a ragged result that cannot be bound as a column.
    x$CountyID[which.max(x$Value1)]
  })
)
#   ZipCode Value1 CountyID
# 1       1      4        2
# 2       2      2        5
# 3       3      3        6
# 4       4      9        7
# 5       5      1        9
# 6       6      0       10

为了回应您对 Frank 答案的评论，您可以使用 `aggregate` 中的公式方法保留列名。使用 Frank 的数据 `df`，代码如下：
# Sum Value1 per ZipCode with the formula interface, which keeps the original
# column names (ZipCode, Value1), then attach the CountyID of the row holding
# the largest Value1 in each zip code.
cbind(
  aggregate(Value1 ~ ZipCode, df, sum),
  CountyID = sapply(split(df, df$ZipCode), function(x) {
    # which.max() yields exactly one index per group. Comparing against
    # max(Value1) would return several CountyIDs when the maximum is tied,
    # giving a ragged result that cannot be bound as a column.
    x$CountyID[which.max(x$Value1)]
  })
)
#   ZipCode Value1 CountyID
# 1       1      4        2
# 2       2      2        5
# 3       3      3        6
# 4       4      9        7
# 5       5      1        9
# 6       6      0       10

为了回应您对 Frank 答案的评论，您可以使用 `aggregate` 中的公式方法保留列名。使用 Frank 的数据 `df`，代码如下：
# Sum Value1 per ZipCode with the formula interface, which keeps the original
# column names (ZipCode, Value1), then attach the CountyID of the row holding
# the largest Value1 in each zip code.
cbind(
  aggregate(Value1 ~ ZipCode, df, sum),
  CountyID = sapply(split(df, df$ZipCode), function(x) {
    # which.max() yields exactly one index per group. Comparing against
    # max(Value1) would return several CountyIDs when the maximum is tied,
    # giving a ragged result that cannot be bound as a column.
    x$CountyID[which.max(x$Value1)]
  })
)
#   ZipCode Value1 CountyID
# 1       1      4        2
# 2       2      2        5
# 3       3      3        6
# 4       4      9        7
# 5       5      1        9
# 6       6      0       10

为了回应您对 Frank 答案的评论，您可以使用 `aggregate` 中的公式方法保留列名。使用 Frank 的数据 `df`，代码如下：
# Sum Value1 per ZipCode with the formula interface, which keeps the original
# column names (ZipCode, Value1), then attach the CountyID of the row holding
# the largest Value1 in each zip code.
cbind(
  aggregate(Value1 ~ ZipCode, df, sum),
  CountyID = sapply(split(df, df$ZipCode), function(x) {
    # which.max() yields exactly one index per group. Comparing against
    # max(Value1) would return several CountyIDs when the maximum is tied,
    # giving a ragged result that cannot be bound as a column.
    x$CountyID[which.max(x$Value1)]
  })
)
#   ZipCode Value1 CountyID
# 1       1      4        2
# 2       2      2        5
# 3       3      3        6
# 4       4      9        7
# 5       5      1        9
# 6       6      0       10

似乎对样本数据很有效。有没有办法保留列名？
@aquamole 你可以做一些类似 `names(newdf)[1:26]` 的事情。
对样本数据似乎很有效。有没有办法保留列名？
@aquamole 你可以做一些类似 `names(newdf)[1:26]` 的事情。
似乎对样本数据很有效。有没有办法保留列名？
@aquamole 你可以做一些类似 `names(newdf)[1:26]` 的事情。
对样本数据似乎很有效。有没有办法保留列名？
@aquamole 你可以做一些类似 `names(newdf)[1:26]` 的事情。
为了回应您的评论，我添加了一个显示列名称的答案。
为了回应您的评论，我添加了一个显示列名称的答案。
为了回应您的评论，我添加了一个显示列名称的答案。
为了回应您的评论，我添加了一个显示列名称的答案。
很好。这是可行的，但不幸的是，我必须拼出所有 25 个值的列名。
不，你不必这样做。只需将 `names(df)` 的子集与 `setNames` 一起使用即可。
这很有效，但不幸的是，我必须拼出所有 25 个值的列名。
不，你不必这样做。只需将 `names(df)` 的子集与 `setNames` 一起使用即可。
这很有效，但不幸的是，我必须拼出所有 25 个值的列名。
不，你不必这样做。只需将 `names(df)` 的子集与 `setNames` 一起使用即可。
这很有效，但不幸的是，我必须拼出所有 25 个值的列名。
不，你不必这样做。只需将 `names(df)` 的子集与 `setNames` 一起使用即可。