R 添加新列并基于另一列中的值在列中插入值

R 添加新列并基于另一列中的值在列中插入值,r,dataframe,R,Dataframe,我有一个R数据框data1,如下所示: prodID storeID Term Exit 1 1001 5 0 1 1002 4 1 1 1003 3 1 1 1004 5 0 2 1001 4 1 2 1002 3 1 2 1003 5

我有一个R数据框
data1
,如下所示:

prodID   storeID   Term    Exit
1        1001      5       0
1        1002      4       1
1        1003      3       1
1        1004      5       0
2        1001      4       1
2        1002      3       1
2        1003      5       0
3        1001      4       1
3        1002      3       1
3        1003      5       0
4        1001      4       1
4        1002      3       1
5        1001      5       0
5        1002      4       1
5        1003      3       1
这当然是我的真实数据的高度简化格式,大约有300万行。我必须做到以下几点:

  • 根据
    Term
    列中的最大值,在
    data1
    中插入具有
    NA
    值的多个列。列名应为
    Week1
    Week2
    Week3
    ,等等
  • 对于每一行,使用以下规则在新列中填入0、1或
    NA
    :1)如果
    术语
    为5,则在
    Week1
    Week2
    、直到
    Week4
    Week5
    中插入0 2) 如果
    术语
    为4,则在
    第1周
    第2周
    第3周
    中插入0,在
    第4周
    中插入1,并在
    第5周
    中保留
    NA
    。等等
  • 最终输出应如下所示:

    prodID   storeID   Term    Exit  Week1   Week2   Week3   Week4   Week5
    1        1001      5       0     0       0       0       0       1
    1        1002      4       1     0       0       0       1       NA
    1        1003      3       1     0       0       1       NA      NA
    1        1004      5       0     0       0       0       0       1
    2        1001      4       1     0       0       0       1       NA
    2        1002      3       1     0       0       1       NA      NA
    2        1003      5       0     0       0       0       0       1
    3        1001      4       1     0       0       0       1       NA
    3        1002      3       1     0       0       1       NA      NA
    3        1003      5       0     0       0       0       0       1
    4        1001      4       1     0       0       0       1       NA
    4        1002      3       1     0       0       1       NA      NA
    5        1001      5       0     0       0       0       0       1
    5        1002      4       1     0       0       0       1       NA
    5        1003      3       1     0       0       1       NA      NA
    
    这就是我所尝试的:

    variant <- c("Week1","Week2","Week3","Week4","Week5")
    
    data1[variant] <- NA
    
    for (i in 1:length(data1$prodID)){
      data1$Week1 <- ifelse(data1$Term==1,1,0)
      data1$Week2 <- ifelse(data1$Term==2,1,0)
      data1$Week3 <- ifelse(data1$Term==3,1,0)
      data1$Week4 <- ifelse(data1$Term==4,1,0)
      data1$Week5 <- ifelse(data1$Term==5,1,0)
    }
    

    variant这里有一个想法。我们可以创建您需要的内容,然后拆分列

    library(dplyr)
    library(data.table)
    library(splitstackshape)
    
    dat2 <- dat %>%
      mutate(Week = case_when(
        Term == 5       ~"0,0,0,0,1",
        Term == 4       ~"0,0,0,1,NA",
        Term == 3       ~"0,0,1,NA,NA",
        Term == 2       ~"0,1,NA,NA,NA",
        Term == 1       ~"1,NA,NA,NA,NA"
      )) %>%
      cSplit(splitCols = "Week")
    dat2
    #     prodID storeID Term Exit Week_1 Week_2 Week_3 Week_4 Week_5
    #  1:      1    1001    5    0      0      0      0      0      1
    #  2:      1    1002    4    1      0      0      0      1     NA
    #  3:      1    1003    3    1      0      0      1     NA     NA
    #  4:      1    1004    5    0      0      0      0      0      1
    #  5:      2    1001    4    1      0      0      0      1     NA
    #  6:      2    1002    3    1      0      0      1     NA     NA
    #  7:      2    1003    5    0      0      0      0      0      1
    #  8:      3    1001    4    1      0      0      0      1     NA
    #  9:      3    1002    3    1      0      0      1     NA     NA
    # 10:      3    1003    5    0      0      0      0      0      1
    # 11:      4    1001    4    1      0      0      0      1     NA
    # 12:      4    1002    3    1      0      0      1     NA     NA
    # 13:      5    1001    5    0      0      0      0      0      1
    # 14:      5    1002    4    1      0      0      0      1     NA
    # 15:      5    1003    3    1      0      0      1     NA     NA
    
    更新

    我们可以在排列周列之前使用
    stru pad
    stringr
    包到pad 0对列名进行排序

    library(tidyverse)
    
    dat2 <- dat %>%
      mutate(Week = map2(1, Term, `:`)) %>%
      unnest() %>%
      group_by(prodID, Term) %>%
      mutate(Week_Value = as.integer(Week == max(Week)),
             Week = paste0("Week", str_pad(Week, width = 3, pad = "0"))) %>%
      spread(Week, Week_Value) %>%
      ungroup()
    dat2
    # # A tibble: 15 x 9
    #   prodID storeID  Term  Exit Week001 Week002 Week003 Week004 Week005
    #     <int>   <int> <int> <int>   <int>   <int>   <int>   <int>   <int>
    #  1      1    1001     5     0       0       0       0       0       1
    #  2      1    1002     4     1       0       0       0       1      NA
    #  3      1    1003     3     1       0       0       1      NA      NA
    #  4      1    1004     5     0       0       0       0       0       1
    #  5      2    1001     4     1       0       0       0       1      NA
    #  6      2    1002     3     1       0       0       1      NA      NA
    #  7      2    1003     5     0       0       0       0       0       1
    #  8      3    1001     4     1       0       0       0       1      NA
    #  9      3    1002     3     1       0       0       1      NA      NA
    # 10      3    1003     5     0       0       0       0       0       1
    # 11      4    1001     4     1       0       0       0       1      NA
    # 12      4    1002     3     1       0       0       1      NA      NA
    # 13      5    1001     5     0       0       0       0       0       1
    # 14      5    1002     4     1       0       0       0       1      NA
    # 15      5    1003     3     1       0       0       1      NA      NA
    
    库(tidyverse)
    dat2%
    突变(周=map2(1,术语,`:`))%>%
    unest()%>%
    分组依据(项目ID,期限)%>%
    mutate(Week_Value=as.integer(Week==max(Week)),
    周=粘贴0(“周”,str_pad(周,宽度=3,pad=“0”))%>%
    价差(周,周价值)%>%
    解组()
    dat2
    ##A tibble:15 x 9
    #prodID storeID期限退出周001周002周003周004周005
    #                         
    #  1      1    1001     5     0       0       0       0       0       1
    #211002 41001 NA
    #3110031001NA
    #  4      1    1004     5     0       0       0       0       0       1
    #5210014001 NA
    #6 2 1002 3 1 0 1 NA
    #  7      2    1003     5     0       0       0       0       0       1
    #8310014001 NA
    #9 3 1002 3 1 0 1 NA
    # 10      3    1003     5     0       0       0       0       0       1
    #11100141001 NA
    #12 4 1002 3 1 0 1 NA
    # 13      5    1001     5     0       0       0       0       0       1
    #14 5 1002 4 1 0 0 1 NA
    #15 5 1003 3 1 0 1 NA
    
    数据

    dat <- read.table(text = "prodID   storeID   Term    Exit
    1        1001      5       0
                      1        1002      4       1
                      1        1003      3       1
                      1        1004      5       0
                      2        1001      4       1
                      2        1002      3       1
                      2        1003      5       0
                      3        1001      4       1
                      3        1002      3       1
                      3        1003      5       0
                      4        1001      4       1
                      4        1002      3       1
                      5        1001      5       0
                      5        1002      4       1
                      5        1003      3       1",
                      header = TRUE)
    

    dat这里有一个想法。我们可以创建您需要的内容,然后拆分列

    library(dplyr)
    library(data.table)
    library(splitstackshape)
    
    dat2 <- dat %>%
      mutate(Week = case_when(
        Term == 5       ~"0,0,0,0,1",
        Term == 4       ~"0,0,0,1,NA",
        Term == 3       ~"0,0,1,NA,NA",
        Term == 2       ~"0,1,NA,NA,NA",
        Term == 1       ~"1,NA,NA,NA,NA"
      )) %>%
      cSplit(splitCols = "Week")
    dat2
    #     prodID storeID Term Exit Week_1 Week_2 Week_3 Week_4 Week_5
    #  1:      1    1001    5    0      0      0      0      0      1
    #  2:      1    1002    4    1      0      0      0      1     NA
    #  3:      1    1003    3    1      0      0      1     NA     NA
    #  4:      1    1004    5    0      0      0      0      0      1
    #  5:      2    1001    4    1      0      0      0      1     NA
    #  6:      2    1002    3    1      0      0      1     NA     NA
    #  7:      2    1003    5    0      0      0      0      0      1
    #  8:      3    1001    4    1      0      0      0      1     NA
    #  9:      3    1002    3    1      0      0      1     NA     NA
    # 10:      3    1003    5    0      0      0      0      0      1
    # 11:      4    1001    4    1      0      0      0      1     NA
    # 12:      4    1002    3    1      0      0      1     NA     NA
    # 13:      5    1001    5    0      0      0      0      0      1
    # 14:      5    1002    4    1      0      0      0      1     NA
    # 15:      5    1003    3    1      0      0      1     NA     NA
    
    更新

    我们可以在排列周列之前使用
    stru pad
    stringr
    包到pad 0对列名进行排序

    library(tidyverse)
    
    dat2 <- dat %>%
      mutate(Week = map2(1, Term, `:`)) %>%
      unnest() %>%
      group_by(prodID, Term) %>%
      mutate(Week_Value = as.integer(Week == max(Week)),
             Week = paste0("Week", str_pad(Week, width = 3, pad = "0"))) %>%
      spread(Week, Week_Value) %>%
      ungroup()
    dat2
    # # A tibble: 15 x 9
    #   prodID storeID  Term  Exit Week001 Week002 Week003 Week004 Week005
    #     <int>   <int> <int> <int>   <int>   <int>   <int>   <int>   <int>
    #  1      1    1001     5     0       0       0       0       0       1
    #  2      1    1002     4     1       0       0       0       1      NA
    #  3      1    1003     3     1       0       0       1      NA      NA
    #  4      1    1004     5     0       0       0       0       0       1
    #  5      2    1001     4     1       0       0       0       1      NA
    #  6      2    1002     3     1       0       0       1      NA      NA
    #  7      2    1003     5     0       0       0       0       0       1
    #  8      3    1001     4     1       0       0       0       1      NA
    #  9      3    1002     3     1       0       0       1      NA      NA
    # 10      3    1003     5     0       0       0       0       0       1
    # 11      4    1001     4     1       0       0       0       1      NA
    # 12      4    1002     3     1       0       0       1      NA      NA
    # 13      5    1001     5     0       0       0       0       0       1
    # 14      5    1002     4     1       0       0       0       1      NA
    # 15      5    1003     3     1       0       0       1      NA      NA
    
    库(tidyverse)
    dat2%
    突变(周=map2(1,术语,`:`))%>%
    unest()%>%
    分组依据(项目ID,期限)%>%
    mutate(Week_Value=as.integer(Week==max(Week)),
    周=粘贴0(“周”,str_pad(周,宽度=3,pad=“0”))%>%
    价差(周,周价值)%>%
    解组()
    dat2
    ##A tibble:15 x 9
    #prodID storeID期限退出周001周002周003周004周005
    #                         
    #  1      1    1001     5     0       0       0       0       0       1
    #211002 41001 NA
    #3110031001NA
    #  4      1    1004     5     0       0       0       0       0       1
    #5210014001 NA
    #6 2 1002 3 1 0 1 NA
    #  7      2    1003     5     0       0       0       0       0       1
    #8310014001 NA
    #9 3 1002 3 1 0 1 NA
    # 10      3    1003     5     0       0       0       0       0       1
    #11100141001 NA
    #12 4 1002 3 1 0 1 NA
    # 13      5    1001     5     0       0       0       0       0       1
    #14 5 1002 4 1 0 0 1 NA
    #15 5 1003 3 1 0 1 NA
    
    数据

    dat <- read.table(text = "prodID   storeID   Term    Exit
    1        1001      5       0
                      1        1002      4       1
                      1        1003      3       1
                      1        1004      5       0
                      2        1001      4       1
                      2        1002      3       1
                      2        1003      5       0
                      3        1001      4       1
                      3        1002      3       1
                      3        1003      5       0
                      4        1001      4       1
                      4        1002      3       1
                      5        1001      5       0
                      5        1002      4       1
                      5        1003      3       1",
                      header = TRUE)
    

    dat这里有一个带
    base R
    的选项,我们在“术语”中循环,
    制表
    得到每个元素的0和1,在末尾附加
    NA
    length这里有一个带
    base R
    的选项,我们在“术语”中循环,
    制表
    得到每个元素的0和1,使用
    dplyr::mutate_at
    case_在
    末尾添加
    NA
    ,然后检查列号是否大于/等于/小于
    项的值

    # First add additional columns based on maximum value of Term
    df[,paste("Week", 1:max(df$Term), sep="")] <- NA
    
    library(dplyr)
    
    df %>% mutate_at(vars(starts_with("Week")), funs(case_when(
      as.integer(sub(".*(\\d+)","\\1",quo_name(quo(.)))) < Term ~ 0L,
      as.integer(sub(".*(\\d+)","\\1",quo_name(quo(.)))) == Term ~ 1L,
      TRUE                                                      ~ NA_integer_
    )))
    
    #    prodID storeID Term Exit Week1 Week2 Week3 Week4 Week5
    # 1       1    1001    5    0     0     0     0     0     1
    # 2       1    1002    4    1     0     0     0     1    NA
    # 3       1    1003    3    1     0     0     1    NA    NA
    # 4       1    1004    5    0     0     0     0     0     1
    # 5       2    1001    4    1     0     0     0     1    NA
    # 6       2    1002    3    1     0     0     1    NA    NA
    # 7       2    1003    5    0     0     0     0     0     1
    # 8       3    1001    4    1     0     0     0     1    NA
    # 9       3    1002    3    1     0     0     1    NA    NA
    # 10      3    1003    5    0     0     0     0     0     1
    # 11      4    1001    4    1     0     0     0     1    NA
    # 12      4    1002    3    1     0     0     1    NA    NA
    # 13      5    1001    5    0     0     0     0     0     1
    # 14      5    1002    4    1     0     0     0     1    NA
    # 15      5    1003    3    1     0     0     1    NA    NA
    
    #首先根据术语的最大值添加其他列
    df[,粘贴(“周”,1:max(df$Term),sep=“”)]%在(vars(以(“周”)开始)、funs(case_当(
    作为.integer(“.*(\\d+)”、“\\1”、quo_名称(“))df <- read.table(text="
    prodID   storeID   Term    Exit
    1        1001      5       0
    1        1002      4       1
    1        1003      3       1
    1        1004      5       0
    2        1001      4       1
    2        1002      3       1
    2        1003      5       0
    3        1001      4       1
    3        1002      3       1
    3        1003      5       0
    4        1001      4       1
    4        1002      3       1
    5        1001      5       0
    5        1002      4       1
    5        1003      3       1",
    header = TRUE, stringsAsFactors = FALSE)