R 添加新列并基于另一列中的值在列中插入值
我有一个R数据框R 添加新列并基于另一列中的值在列中插入值,r,dataframe,R,Dataframe,我有一个R数据框data1,如下所示: prodID storeID Term Exit 1 1001 5 0 1 1002 4 1 1 1003 3 1 1 1004 5 0 2 1001 4 1 2 1002 3 1 2 1003 5
data1
,如下所示:
prodID storeID Term Exit
1 1001 5 0
1 1002 4 1
1 1003 3 1
1 1004 5 0
2 1001 4 1
2 1002 3 1
2 1003 5 0
3 1001 4 1
3 1002 3 1
3 1003 5 0
4 1001 4 1
4 1002 3 1
5 1001 5 0
5 1002 4 1
5 1003 3 1
这当然是我的真实数据的高度简化格式,大约有300万行。我必须做到以下几点:
Term
列中的最大值,在data1
中插入具有NA
值的多个列。列名应为Week1
、Week2
、Week3
,等等NA
:1)如果术语
为5,则在Week1
、Week2
、直到Week4
和Week5
中插入0
2) 如果术语
为4,则在第1周
、第2周
和第3周
中插入0,在第4周
中插入1,并在第5周
中保留NA
。等等prodID storeID Term Exit Week1 Week2 Week3 Week4 Week5
1 1001 5 0 0 0 0 0 1
1 1002 4 1 0 0 0 1 NA
1 1003 3 1 0 0 1 NA NA
1 1004 5 0 0 0 0 0 1
2 1001 4 1 0 0 0 1 NA
2 1002 3 1 0 0 1 NA NA
2 1003 5 0 0 0 0 0 1
3 1001 4 1 0 0 0 1 NA
3 1002 3 1 0 0 1 NA NA
3 1003 5 0 0 0 0 0 1
4 1001 4 1 0 0 0 1 NA
4 1002 3 1 0 0 1 NA NA
5 1001 5 0 0 0 0 0 1
5 1002 4 1 0 0 0 1 NA
5 1003 3 1 0 0 1 NA NA
这就是我所尝试的:
variant <- c("Week1","Week2","Week3","Week4","Week5")
data1[variant] <- NA
for (i in 1:length(data1$prodID)){
data1$Week1 <- ifelse(data1$Term==1,1,0)
data1$Week2 <- ifelse(data1$Term==2,1,0)
data1$Week3 <- ifelse(data1$Term==3,1,0)
data1$Week4 <- ifelse(data1$Term==4,1,0)
data1$Week5 <- ifelse(data1$Term==5,1,0)
}
variant这里有一个想法。我们可以创建您需要的内容,然后拆分列
library(dplyr)
library(data.table)
library(splitstackshape)
dat2 <- dat %>%
mutate(Week = case_when(
Term == 5 ~"0,0,0,0,1",
Term == 4 ~"0,0,0,1,NA",
Term == 3 ~"0,0,1,NA,NA",
Term == 2 ~"0,1,NA,NA,NA",
Term == 1 ~"1,NA,NA,NA,NA"
)) %>%
cSplit(splitCols = "Week")
dat2
# prodID storeID Term Exit Week_1 Week_2 Week_3 Week_4 Week_5
# 1: 1 1001 5 0 0 0 0 0 1
# 2: 1 1002 4 1 0 0 0 1 NA
# 3: 1 1003 3 1 0 0 1 NA NA
# 4: 1 1004 5 0 0 0 0 0 1
# 5: 2 1001 4 1 0 0 0 1 NA
# 6: 2 1002 3 1 0 0 1 NA NA
# 7: 2 1003 5 0 0 0 0 0 1
# 8: 3 1001 4 1 0 0 0 1 NA
# 9: 3 1002 3 1 0 0 1 NA NA
# 10: 3 1003 5 0 0 0 0 0 1
# 11: 4 1001 4 1 0 0 0 1 NA
# 12: 4 1002 3 1 0 0 1 NA NA
# 13: 5 1001 5 0 0 0 0 0 1
# 14: 5 1002 4 1 0 0 0 1 NA
# 15: 5 1003 3 1 0 0 1 NA NA
更新
我们可以在排列周列之前使用stru pad
从stringr
包到pad 0对列名进行排序
library(tidyverse)
dat2 <- dat %>%
mutate(Week = map2(1, Term, `:`)) %>%
unnest() %>%
group_by(prodID, Term) %>%
mutate(Week_Value = as.integer(Week == max(Week)),
Week = paste0("Week", str_pad(Week, width = 3, pad = "0"))) %>%
spread(Week, Week_Value) %>%
ungroup()
dat2
# # A tibble: 15 x 9
# prodID storeID Term Exit Week001 Week002 Week003 Week004 Week005
# <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 1 1001 5 0 0 0 0 0 1
# 2 1 1002 4 1 0 0 0 1 NA
# 3 1 1003 3 1 0 0 1 NA NA
# 4 1 1004 5 0 0 0 0 0 1
# 5 2 1001 4 1 0 0 0 1 NA
# 6 2 1002 3 1 0 0 1 NA NA
# 7 2 1003 5 0 0 0 0 0 1
# 8 3 1001 4 1 0 0 0 1 NA
# 9 3 1002 3 1 0 0 1 NA NA
# 10 3 1003 5 0 0 0 0 0 1
# 11 4 1001 4 1 0 0 0 1 NA
# 12 4 1002 3 1 0 0 1 NA NA
# 13 5 1001 5 0 0 0 0 0 1
# 14 5 1002 4 1 0 0 0 1 NA
# 15 5 1003 3 1 0 0 1 NA NA
库(tidyverse)
dat2%
突变(周=map2(1,术语,`:`))%>%
unest()%>%
分组依据(项目ID,期限)%>%
mutate(Week_Value=as.integer(Week==max(Week)),
周=粘贴0(“周”,str_pad(周,宽度=3,pad=“0”))%>%
价差(周,周价值)%>%
解组()
dat2
##A tibble:15 x 9
#prodID storeID期限退出周001周002周003周004周005
#
# 1 1 1001 5 0 0 0 0 0 1
#211002 41001 NA
#3110031001NA
# 4 1 1004 5 0 0 0 0 0 1
#5210014001 NA
#6 2 1002 3 1 0 1 NA
# 7 2 1003 5 0 0 0 0 0 1
#8310014001 NA
#9 3 1002 3 1 0 1 NA
# 10 3 1003 5 0 0 0 0 0 1
#11100141001 NA
#12 4 1002 3 1 0 1 NA
# 13 5 1001 5 0 0 0 0 0 1
#14 5 1002 4 1 0 0 1 NA
#15 5 1003 3 1 0 1 NA
数据
dat <- read.table(text = "prodID storeID Term Exit
1 1001 5 0
1 1002 4 1
1 1003 3 1
1 1004 5 0
2 1001 4 1
2 1002 3 1
2 1003 5 0
3 1001 4 1
3 1002 3 1
3 1003 5 0
4 1001 4 1
4 1002 3 1
5 1001 5 0
5 1002 4 1
5 1003 3 1",
header = TRUE)
dat这里有一个想法。我们可以创建您需要的内容,然后拆分列
library(dplyr)
library(data.table)
library(splitstackshape)
dat2 <- dat %>%
mutate(Week = case_when(
Term == 5 ~"0,0,0,0,1",
Term == 4 ~"0,0,0,1,NA",
Term == 3 ~"0,0,1,NA,NA",
Term == 2 ~"0,1,NA,NA,NA",
Term == 1 ~"1,NA,NA,NA,NA"
)) %>%
cSplit(splitCols = "Week")
dat2
# prodID storeID Term Exit Week_1 Week_2 Week_3 Week_4 Week_5
# 1: 1 1001 5 0 0 0 0 0 1
# 2: 1 1002 4 1 0 0 0 1 NA
# 3: 1 1003 3 1 0 0 1 NA NA
# 4: 1 1004 5 0 0 0 0 0 1
# 5: 2 1001 4 1 0 0 0 1 NA
# 6: 2 1002 3 1 0 0 1 NA NA
# 7: 2 1003 5 0 0 0 0 0 1
# 8: 3 1001 4 1 0 0 0 1 NA
# 9: 3 1002 3 1 0 0 1 NA NA
# 10: 3 1003 5 0 0 0 0 0 1
# 11: 4 1001 4 1 0 0 0 1 NA
# 12: 4 1002 3 1 0 0 1 NA NA
# 13: 5 1001 5 0 0 0 0 0 1
# 14: 5 1002 4 1 0 0 0 1 NA
# 15: 5 1003 3 1 0 0 1 NA NA
更新
我们可以在排列周列之前使用stru pad
从stringr
包到pad 0对列名进行排序
library(tidyverse)
dat2 <- dat %>%
mutate(Week = map2(1, Term, `:`)) %>%
unnest() %>%
group_by(prodID, Term) %>%
mutate(Week_Value = as.integer(Week == max(Week)),
Week = paste0("Week", str_pad(Week, width = 3, pad = "0"))) %>%
spread(Week, Week_Value) %>%
ungroup()
dat2
# # A tibble: 15 x 9
# prodID storeID Term Exit Week001 Week002 Week003 Week004 Week005
# <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 1 1001 5 0 0 0 0 0 1
# 2 1 1002 4 1 0 0 0 1 NA
# 3 1 1003 3 1 0 0 1 NA NA
# 4 1 1004 5 0 0 0 0 0 1
# 5 2 1001 4 1 0 0 0 1 NA
# 6 2 1002 3 1 0 0 1 NA NA
# 7 2 1003 5 0 0 0 0 0 1
# 8 3 1001 4 1 0 0 0 1 NA
# 9 3 1002 3 1 0 0 1 NA NA
# 10 3 1003 5 0 0 0 0 0 1
# 11 4 1001 4 1 0 0 0 1 NA
# 12 4 1002 3 1 0 0 1 NA NA
# 13 5 1001 5 0 0 0 0 0 1
# 14 5 1002 4 1 0 0 0 1 NA
# 15 5 1003 3 1 0 0 1 NA NA
库(tidyverse)
dat2%
突变(周=map2(1,术语,`:`))%>%
unest()%>%
分组依据(项目ID,期限)%>%
mutate(Week_Value=as.integer(Week==max(Week)),
周=粘贴0(“周”,str_pad(周,宽度=3,pad=“0”))%>%
价差(周,周价值)%>%
解组()
dat2
##A tibble:15 x 9
#prodID storeID期限退出周001周002周003周004周005
#
# 1 1 1001 5 0 0 0 0 0 1
#211002 41001 NA
#3110031001NA
# 4 1 1004 5 0 0 0 0 0 1
#5210014001 NA
#6 2 1002 3 1 0 1 NA
# 7 2 1003 5 0 0 0 0 0 1
#8310014001 NA
#9 3 1002 3 1 0 1 NA
# 10 3 1003 5 0 0 0 0 0 1
#11100141001 NA
#12 4 1002 3 1 0 1 NA
# 13 5 1001 5 0 0 0 0 0 1
#14 5 1002 4 1 0 0 1 NA
#15 5 1003 3 1 0 1 NA
数据
dat <- read.table(text = "prodID storeID Term Exit
1 1001 5 0
1 1002 4 1
1 1003 3 1
1 1004 5 0
2 1001 4 1
2 1002 3 1
2 1003 5 0
3 1001 4 1
3 1002 3 1
3 1003 5 0
4 1001 4 1
4 1002 3 1
5 1001 5 0
5 1002 4 1
5 1003 3 1",
header = TRUE)
dat这里有一个带base R
的选项,我们在“术语”中循环,制表
得到每个元素的0和1,在末尾附加NA
和length这里有一个带base R
的选项,我们在“术语”中循环,制表
得到每个元素的0和1,使用dplyr::mutate_at
和case_在末尾添加NA
,然后检查列号是否大于/等于/小于项的值
# First add additional columns based on maximum value of Term
df[,paste("Week", 1:max(df$Term), sep="")] <- NA
library(dplyr)
df %>% mutate_at(vars(starts_with("Week")), funs(case_when(
as.integer(sub(".*(\\d+)","\\1",quo_name(quo(.)))) < Term ~ 0L,
as.integer(sub(".*(\\d+)","\\1",quo_name(quo(.)))) == Term ~ 1L,
TRUE ~ NA_integer_
)))
# prodID storeID Term Exit Week1 Week2 Week3 Week4 Week5
# 1 1 1001 5 0 0 0 0 0 1
# 2 1 1002 4 1 0 0 0 1 NA
# 3 1 1003 3 1 0 0 1 NA NA
# 4 1 1004 5 0 0 0 0 0 1
# 5 2 1001 4 1 0 0 0 1 NA
# 6 2 1002 3 1 0 0 1 NA NA
# 7 2 1003 5 0 0 0 0 0 1
# 8 3 1001 4 1 0 0 0 1 NA
# 9 3 1002 3 1 0 0 1 NA NA
# 10 3 1003 5 0 0 0 0 0 1
# 11 4 1001 4 1 0 0 0 1 NA
# 12 4 1002 3 1 0 0 1 NA NA
# 13 5 1001 5 0 0 0 0 0 1
# 14 5 1002 4 1 0 0 0 1 NA
# 15 5 1003 3 1 0 0 1 NA NA
#首先根据术语的最大值添加其他列
df[,粘贴(“周”,1:max(df$Term),sep=“”)]%在(vars(以(“周”)开始)、funs(case_当(
作为.integer(“.*(\\d+)”、“\\1”、quo_名称(“))df <- read.table(text="
prodID storeID Term Exit
1 1001 5 0
1 1002 4 1
1 1003 3 1
1 1004 5 0
2 1001 4 1
2 1002 3 1
2 1003 5 0
3 1001 4 1
3 1002 3 1
3 1003 5 0
4 1001 4 1
4 1002 3 1
5 1001 5 0
5 1002 4 1
5 1003 3 1",
header = TRUE, stringsAsFactors = FALSE)