R:对列名中日期字符串部分匹配的列求和
我有一个数据框df,其中包含ID变量,并以每日日期(格式为XYYYYMMDD)作为列标题:
# Example data: one row per ID and one column per calendar day.
# Column names are "X" followed by the date as YYYYMMDD; NA marks a missing
# observation for that ID on that day.
library(data.table)  # provides data.table(), which is not part of base R

ID <- c(101, 102, 203, 207, 209)
X20170101 <- c(1, NA, NA, 2, 1)
X20170102 <- c(NA, 1, 1, 1, NA)
X20170103 <- c(NA, NA, NA, 2, 1)
X20170201 <- c(NA, 2, NA, NA, 1)
X20170202 <- c(NA, 1, 1, NA, NA)
X20170301 <- c(NA, 1, NA, NA, NA)

# Column names of df are taken from the variable names passed in.
df <- data.table(
  ID,
  X20170101, X20170102, X20170103,
  X20170201, X20170202,
  X20170301
)
ID X20170101 X20170102 X20170103 X20170201 X20170202 X20170301
101 1 NA NA NA NA NA
102 NA 1 NA 2 1 1
203 NA 1 NA NA 1 NA
207 2 1 2 NA NA NA
209 1 NA 1 1 NA NA
我的想法是避免重塑数据集的格式,并使用函数lapply和grepl对列名做部分字符串匹配,但我遗漏了一些东西
# Asker's attempt, kept as posted (the question reports it does not work).
# lapply(df, ...) iterates over the columns of df, so `x` receives each
# column's values, not a column-name pattern; those values are then handed to
# grepl() as the pattern that is matched against names(df).
test = lapply(df, function(x) colSums(df[,grepl(x, names(df))]))
非常感谢。 您不想重塑(reshape)数据有什么原因吗?
library(tidyverse)

# Sum the daily columns of `df` per ID and per month.
# The result has one row per ID and one column per month key ("X" + YYYYMM).
want <- df %>%
  # Long format: one row per (ID, daily column); `key` holds the column name.
  pivot_longer(-ID, names_to = "key", values_to = "value") %>%
  # Keep the first 7 characters ("X" plus YYYYMM) so that all days of the
  # same month share one key.
  mutate(key = substr(key, 1, 7)) %>%
  group_by(ID, key) %>%
  # na.rm = TRUE: a month containing only NA values sums to 0.
  # .groups = "drop_last" spells out summarise()'s default, so the result
  # stays grouped by ID.
  summarise(value = sum(value, na.rm = TRUE), .groups = "drop_last") %>%
  # Back to wide format: one column per month key.
  pivot_wider(names_from = key, values_from = value)
# A tibble: 5 x 4
# Groups: ID [5]
ID X201701 X201702 X201703
* <dbl> <dbl> <dbl> <dbl>
1 101 1 0 0
2 102 1 3 1
3 203 1 1 0
4 207 5 0 0
5 209 2 1 0
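library(tidyverse)
want <- df %>%
gather(key, value, -ID) %>%
mutate(key = substr(key, 1, 7)) %>%
group_by(ID, key) %>%
summarise(value = sum(value, na.rm=TRUE)) %>%
spread(key, value)
# A tibble: 5 x 4
# Groups: ID [5]
ID X201701 X201702 X201703
* <dbl> <dbl> <dbl> <dbl>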
1 101 1 0 0
2 102 1 3 1
3 203 1 1 0
4 207 5 0 0
5 209 2 1 0
这里有一个使用lubridate包来解析日期的方法,
并用split.default将data.frame按相同月份划分为多个组
library(lubridate)

# Month key ("YYYYMM") for every daily column of `df` (all columns but the
# first). ymd() and format() are vectorised, so no per-element loop is needed.
factors <- format(ymd(sub("^X", "", names(df)[-1])), "%Y%m")

# Split the daily columns into one group per month key and sum each group
# row-wise; the first column of `df` is carried over unchanged.
data.frame(
  df[, 1],
  lapply(split.default(df[, -1], factors), function(x) {
    # rowSums(..., na.rm = TRUE) yields 0 for a row whose values are all NA.
    # NA^TRUE is NA and NA^FALSE is 1, so multiplying by this term turns
    # exactly those all-NA rows back into NA and leaves other rows unchanged.
    rowSums(x, na.rm = TRUE) * (NA^(rowSums(is.na(x)) == NCOL(x)))
  })
)
# ID X201701 X201702 X201703
#1 101 1 NA NA
#2 102 1 3 1
#3 203 1 1 NA
#4 207 5 NA NA
#5 209 2 1 NA
library(tidyverse)

# Sum the daily columns of `df` per ID and per month.
# The result has one row per ID and one column per month key ("X" + YYYYMM).
want <- df %>%
  # Long format: one row per (ID, daily column); `key` holds the column name.
  pivot_longer(-ID, names_to = "key", values_to = "value") %>%
  # Keep the first 7 characters ("X" plus YYYYMM) so that all days of the
  # same month share one key.
  mutate(key = substr(key, 1, 7)) %>%
  group_by(ID, key) %>%
  # na.rm = TRUE: a month containing only NA values sums to 0.
  # .groups = "drop_last" spells out summarise()'s default, so the result
  # stays grouped by ID.
  summarise(value = sum(value, na.rm = TRUE), .groups = "drop_last") %>%
  # Back to wide format: one column per month key.
  pivot_wider(names_from = key, values_from = value)
# A tibble: 5 x 4
# Groups: ID [5]
ID X201701 X201702 X201703
* <dbl> <dbl> <dbl> <dbl>
1 101 1 0 0
2 102 1 3 1
3 203 1 1 0
4 207 5 0 0
5 209 2 1 0
library(lubridate)

# Month key ("YYYYMM") for every daily column of `df` (all columns but the
# first). ymd() and format() are vectorised, so no per-element loop is needed.
factors <- format(ymd(sub("^X", "", names(df)[-1])), "%Y%m")

# Split the daily columns into one group per month key and sum each group
# row-wise; the first column of `df` is carried over unchanged.
data.frame(
  df[, 1],
  lapply(split.default(df[, -1], factors), function(x) {
    # rowSums(..., na.rm = TRUE) yields 0 for a row whose values are all NA.
    # NA^TRUE is NA and NA^FALSE is 1, so multiplying by this term turns
    # exactly those all-NA rows back into NA and leaves other rows unchanged.
    rowSums(x, na.rm = TRUE) * (NA^(rowSums(is.na(x)) == NCOL(x)))
  })
)
# ID X201701 X201702 X201703
#1 101 1 NA NA
#2 102 1 3 1
#3 203 1 1 NA
#4 207 5 NA NA
#5 209 2 1 NA