在RStudio中使用条件分组

在RStudio中使用条件分组,r,conditional-statements,grouping,R,Conditional Statements,Grouping,大家早上好,我有一个csv文件df2.csv,其中包含几个变量,如下所示,例如: CLASSE Variables Terms Number 1 DAT_1 20160701q 5 1 DAT_1 20160802q 2 1 DAT_1 20160901q 1 1 DAT_2 20161001q 1 1 DAT_2 20161201q 2

大家早上好,我有一个csv文件df2.csv,其中包含几个变量,如下所示,例如:

CLASSE  Variables   Terms   Number    
1       DAT_1    20160701q   5    
1       DAT_1    20160802q   2    
1       DAT_1    20160901q   1    
1       DAT_2    20161001q   1    
1       DAT_2    20161201q   2    
1       DAT_2    20170301q   3    
2       DAT_1    20161001q   1    
2       DAT_1    20161201q   2    
2       DAT_1    20170301q   1 
在这种情况下,对于每个类1或2,对于每个不同的日期变量,如果个体数小于3,则将个体与下一个日期分组。如果我的周期超过3个人,在这种情况下,我希望有一个类似于“20160701q-20160901q”的日期,而不是分别为20160701q和20160901q。在这种情况下,我们将两个或两个以上的日期分组,以获得一个包含3个以上个体的时段,如果该类的下一个日期包含的个体少于3个,则我们也将该日期与之前的时段分组。 我是从这个代码开始的

for (n in df2$CLASSE){
  for (k in df2$Variables){
    for (i in 1:nrow(df2)){
      if (df2$Number[i]<3){
        rempl_date=paste(df2$Terms[i],df2$Terms[i+1], sep="-")
        df2$Terms[i]<-rempl_date
        next  
      }
    }
  }
}
我不知道我必须改变什么,如果你能帮助我,我希望我是清楚的。提前感谢

我们可以在这里使用MESS::cumsumbining函数创建组,直到达到阈值

library(dplyr)
thresh <- 3

temp <- df %>%
         group_by(CLASSE, Variables, 
                  group = MESS::cumsumbinning(Number, thresh)) %>%
         summarise(Terms = if(n() > 1) 
                           paste(first(Terms), last(Terms), sep = "-") else Terms,
                   Number = sum(Number)) %>%
         select(-group)
这将返回:

temp
# A tibble: 6 x 4
# Groups:   CLASSE, Variables [3]
#  CLASSE Variables Terms               Number
#   <int> <chr>     <chr>                <int>
#1      1 DAT_1     20160701q                5
#2      1 DAT_1     20160802q-20160901q      3
#3      1 DAT_2     20161001q-20161201q      3
#4      1 DAT_2     20170301q                3
#5      2 DAT_1     20161001q-20161201q      3
#6      2 DAT_1     20170301q                1
要合并最后一行,我们可以执行以下操作:

n <- nrow(temp) 
if(temp$Number[n] < 3) {
   temp$Terms[n-1] <- sub("-.*", paste0('-', temp$Terms[n]), temp$Terms[n -1])
   temp$Number[n-1] <- sum(temp$Number[n-1], temp$Number[n])
   temp <- temp[-n,]
}


#  CLASSE Variables Terms               Number
#   <int> <chr>     <chr>                <int>
#1      1 DAT_1     20160701q                5
#2      1 DAT_1     20160802q-20160901q      3
#3      1 DAT_2     20161001q-20161201q      3
#4      1 DAT_2     20170301q                3
#5      2 DAT_1     20161001q-20170301q      4

这是我创建的一个相当麻烦的解决方案,可以满足您的要求。我确信它可以被优化,或者可以使用其他包中的函数

代码中插入了解释

# new dataframe
df_new <- data.frame(
  CLASSE = numeric(nrow(df2)),
  Variables = character(nrow(df2)),
  Terms = character(nrow(df2)),
  Number = numeric(nrow(df2)),
  stringsAsFactors = FALSE
)

# temporary dataframe
temp_df <- data.frame(
  CLASSE = numeric(0),
  Variables = character(0),
  Terms = character(0),
  Number = numeric(0),
  stringsAsFactors = FALSE
)

temp_sum <- 0
present_row_temp_df <- 1
for (i in 1:nrow(df2)){
  # if the row doesn't have to be grouped, just paste it in the new dataframe
  if (df2$Number[i] >= 3){
    df_new[i,] <- df2[i,]
    next
  }
  # if the row has to be grouped, add it to a temporary dataframe
  if (df2$Number[i] < 3){
    temp_df[present_row_temp_df,] <- df2[i,]
    temp_sum <- temp_sum + df2$Number[i]
    present_row_temp_df <- present_row_temp_df + 1
    # if the rows in the temporary dataframe need to be grouped now
    if(temp_sum >= 3){
      Terms_new <- paste(temp_df$Terms[1], temp_df$Terms[nrow(temp_df)], sep = "-")
      Number_new <- sum(temp_df$Number)
      df_new[i, c(1:3)] <- c(df2$CLASSE[i], df2$Variables[i], Terms_new)
      df_new[i, 4] <- Number_new
      # re-initialize temporary variables
      temp_df <- data.frame(
        CLASSE = numeric(0),
        Variables = character(0),
        Terms = character(0),
        Number = numeric(0),
        stringsAsFactors = FALSE
      )
      temp_sum <- 0
      present_row_temp_df <- 1
    }
    # for the case in which the last row is not united with the previous rows
    if (i == nrow(df2) & df2$Number[i] < 3){
      Terms_new <- paste(stringr::str_extract(df_new$Terms[i-1], "^[^-]*"), df2$Terms[i], sep = "-")
      Number_new <- df_new$Number[i-1] + df2$Number[i]
      df_new[i, c(1:3)] <- c(df_new$CLASSE[i-1], df_new$Variables[i-1], Terms_new)
      df_new[i, 4] <- Number_new
      df_new[i-1,] <- c("0", "0", "0", 0)
    }
  }
}

# filter only relevant rows
df_new <- df_new[df_new$Number != 0,]

下面是一个基本的R解决方案:

定义用于分组的自定义函数 使用order将dfout格式化为所需样式 资料

以致

> dfout
  Classe Variables Number               Terms
3      1     DAT_1      5           20160701q
4      1     DAT_1      3 20160802q-20160901q
1      1     DAT_2      3 20161001q-20161201q
5      1     DAT_2      3           20170301q
2      2     DAT_1      4 20161001q-20170301q

感谢您的回复,我在下面的另一个案例中尝试了它,但它给了我两个人的小组,可能是代码类别变量术语编号1 DAT_1 20160701q 1 DAT_1 20160802q 1 DAT_1 20160901q2 1 DAT_1 20160902q2 1 DAT_1 20161001q3 1 DAT_1 201610021q1 DAT_1 2016100201q1 2 DAT_2 20161001q1 2 DAT_2 2016100201q2 DAT2 DAT_2 20170301q1@Abdel_El注释中的格式不清楚。如果你用这个例子更新你的帖子,那就很容易理解了。thresh的定义如果我想应用代码将前一行与最后一行合并,那么我该怎么做?如果个体数少于3,但每个类中的每个术语不只是数据帧的最后一行?@ThomaslsCoding:谢谢,我尝试了你的答案,我如何连接Terms变量的内容,使第一个和最后一个元素有一个周期,如:20160802q-20160901q???@ThomaslsCoding:我发现了它,多亏了你,我现在将代码改编为data@Abdel_El请看我的更新。如果您认为我的答案有帮助,请随时投票/接受,谢谢@汤马斯:谢谢,这正是我要寻找的-@Abdel_El这是阈值的两倍,即在您的帖子中为3。我在回答中更新了函数f,因此你可以根据需要更改阈值我尝试了你的解决方案,但在其他情况下,它合并了类,它从类1中提取一些个体,从类2中提取其他个体,而不是按类按项分组?@Abdel_El是的,你是对的,它适用于样本数据,但不能很好地推广。很抱歉没有正确回答您的问题
df_new

#CLASSE Variables               Terms Number
#     1     DAT_1           20160701q      5
#     1     DAT_1 20160802q-20160901q      3
#     1     DAT_2 20161001q-20161201q      3
#     1     DAT_2           20170301q      3
#     2     DAT_1 20161001q-20170301q      4
f <- function(v, th = 3) {
  k <- 1
  r <- c()
  repeat {
    if (length(v)==0) break
    ind<-seq(head(which(cumsum(v)>=th),1))
    if (sum(v)<2*th) {
      r <- c(r,rep(k,length(v)))
      v <- c()
    } else {
      r <- c(r,rep(k,length(ind)))
      v <- v[-ind]
    }
    k <- k+1
  }
  r
}

dfout <- subset(aggregate(Terms~.,
                          within(within(df,grp <- ave(Number,Classe, Variables, FUN = f)),
                                 Number <- ave(Number,Classe,Variables,grp,FUN = sum)),
                          c),
                select = -grp)

dfout <- dfout[order(dfout$Classe,dfout$Variables),]
> dfout
  Classe Variables Number                           Terms
3      1     DAT_1      5                       20160701q
4      1     DAT_1      3            20160802q, 20160901q
1      1     DAT_2      3            20161001q, 20161201q
5      1     DAT_2      3                       20170301q
2      2     DAT_1      4 20161001q, 20161201q, 20170301q
df <- structure(list(Classe = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), 
    Variables = c("DAT_1", "DAT_1", "DAT_1", "DAT_2", "DAT_2", 
    "DAT_2", "DAT_1", "DAT_1", "DAT_1"), Terms = c("20160701q", 
    "20160802q", "20160901q", "20161001q", "20161201q", "20170301q", 
    "20161001q", "20161201q", "20170301q"), Number = c(5L, 2L, 
    1L, 1L, 2L, 3L, 1L, 2L, 1L)), class = "data.frame", row.names = c(NA, 
-9L))
dfout <- subset(aggregate(Terms~.,
                          within(within(df,grp <- ave(Number,Classe, Variables, FUN = f)),
                                 Number <- ave(Number,Classe,Variables,grp,FUN = sum)),
                          FUN = function(v) ifelse(length(v)==1,v,paste0(c(v[1],v[length(v)]),collapse = "-"))),
                select = -grp)

dfout <- dfout[order(dfout$Classe,dfout$Variables),]
> dfout
  Classe Variables Number               Terms
3      1     DAT_1      5           20160701q
4      1     DAT_1      3 20160802q-20160901q
1      1     DAT_2      3 20161001q-20161201q
5      1     DAT_2      3           20170301q
2      2     DAT_1      4 20161001q-20170301q