R 重新排列数据以与数据对齐
我在R中有以下数据帧R 重新排列数据以与数据对齐,r,dataframe,reshape,R,Dataframe,Reshape,我在R中有以下数据帧 df <- data.frame( "DateValue" = c("2016-07-01", "2016-07-02", "2016-07-03", "2016-07-04", "2016-07-05", "2016-07-06","2017-07-01", "2017-07-02", "2017-07-03", "2017-07-04", "2017-07-05", "2017-07-06", "2018-07-01", "2018-07-02", "201
df <- data.frame(
"DateValue" = c("2016-07-01", "2016-07-02", "2016-07-03", "2016-07-04", "2016-07-05", "2016-07-06","2017-07-01", "2017-07-02", "2017-07-03", "2017-07-04", "2017-07-05", "2017-07-06", "2018-07-01", "2018-07-02", "2018-07-03", "2018-07-04", "2018-07-05", "2018-07-06"),
"Age1" = seq(1:18),
"Age2" = c(seq(14,36,2), rep(NA, 6)),
"Age3" = c(seq(45,50),rep(NA, 12))
)
DateValue Age1 Age2 Age3
# 1 2016-07-01 1 14 45
# 2 2016-07-02 2 16 46
# 3 2016-07-03 3 18 47
# 4 2016-07-04 4 20 48
# 5 2016-07-05 5 22 49
# 6 2016-07-06 6 24 50
# 7 2017-07-01 7 26 NA
# 8 2017-07-02 8 28 NA
# 9 2017-07-03 9 30 NA
# 10 2017-07-04 10 32 NA
# 11 2017-07-05 11 34 NA
# 12 2017-07-06 12 36 NA
# 13 2018-07-01 13 NA NA
# 14 2018-07-02 14 NA NA
# 15 2018-07-03 15 NA NA
# 16 2018-07-04 16 NA NA
# 17 2018-07-05 17 NA NA
# 18 2018-07-06 18 NA NA
我正试图想出一个代码,将Age2和Age3列中的数据对齐,以便日期对齐。以下是我正在寻找的输出:
df <- data.frame(
"DateValue" = c("07-01", "07-02", "07-03", "07-04", "07-05", "07-06"),
"Age1" = seq(13:18),
"Age2" = seq(26,36,2),
"Age3" = seq(45,50)
)
# DateValue Age1 Age2 Age3
# 1 07-01 13 26 45
# 2 07-02 14 28 46
# 3 07-03 15 30 47
# 4 07-04 16 32 48
# 5 07-05 17 34 49
# 6 07-06 18 36 50
我基本上保留了2018年当前年份的所有日期和值,并将其与前几年的日期相匹配。请注意,我上一年可能有更多的日期。但是我需要删除所有没有本年度数据的行。我回顾了下面的线程,然后重新排列数据帧,但是上下文与我的情况大不相同。
我试着看了看R重塑包,但没有任何运气。如有任何建议/建议,将不胜感激 这里有一种方法。这肯定是可以重构的,但它是有效的
library(dplyr)
# DateValue is a factor; convert to date format
df$DateValue <- as.Date(as.character(df$DateValue), format="%Y-%m-%d")
# grab the month and day from DateValue, sort by Age1
df <- df %>%
mutate(MonthAndDay = format(DateValue, "%m-%d")) %>%
arrange(desc(Age1))
# get vector of dates
dates <- df$MonthAndDay[which(!duplicated(df$MonthAndDay))]
# define age columns
agecols <- c("Age1","Age2","Age3")
# initialize empty df to be populated in loop
temp <- data.frame(MonthAndDay = dates)
# for each column, get values that a) are in the target dates, b) aren't NA, and c) only get the first ones (not duplicates--that's why we arranged by Age1 before). Select the values and add them as a new column to the new dataframe.
for (col in agecols) {
temp_col <- filter(df, MonthAndDay %in% dates & !is.na(df[,col]))
temp_col <- filter(temp_col[-which(duplicated(df$MonthAndDay)), ]) %>%
select(col)
temp[,col] <- temp_col
}
temp %>% arrange(MonthAndDay)
# MonthAndDay Age1 Age2 Age3
# 1 07-01 13 26 45
# 2 07-02 14 28 46
# 3 07-03 15 30 47
# 4 07-04 16 32 48
# 5 07-05 17 34 49
# 6 07-06 18 36 50
使用base R,这里有一种方法
#Get age columns
age_cols <- grep("^Age", names(df))
#Convert date to actual object
df$DateValue <- as.Date(df$DateValue)
#Get year from date
df$year <- as.integer(format(df$DateValue, "%Y"))
#Get month-date from Date
df$month_date <- format(df$DateValue, "%m-%d")
#Select dates which are present in max year
subset_date <- with(df, month_date[year == max(year)])
#For each age_cols select the non NA values which match subset_date
cbind.data.frame(DateValue = subset_date,
sapply(df[age_cols], function(x) {
x <- x[order(df$year, decreasing = TRUE)]
x <- x[!is.na(x)]
x[match(subset_date, df$month_date)]
}))
# DateValue Age1 Age2 Age3
#1 07-01 13 26 45
#2 07-02 14 28 46
#3 07-03 15 30 47
#4 07-04 16 32 48
#5 07-05 17 34 49
#6 07-06 18 36 50
以下是一个不可靠的解决方案:
df$DateValue = format(as.Date(df$DateValue), '%m-%d')
Age3_non_NA <- sum(!is.na(df[['Age3']]))
df <- as.data.frame(lapply(df, function(l) tail(na.omit(l), Age3_non_NA)))
df
DateValue Age1 Age2 Age3
1 07-01 13 26 45
2 07-02 14 28 46
3 07-03 15 30 47
4 07-04 16 32 48
5 07-05 17 34 49
6 07-06 18 36 50
您的输出看起来像原始df中的前6行。究竟应该做些什么?请解释一下,我不明白你在问什么。您似乎只是在删除缺少年龄值的行。这就是你的意图吗?对不起。让我更正所需的输出。
library(tidyr)
library(dplyr)
library(lubridate)
df%>%
mutate(DateValue = as.Date(DateValue),
Year = year(DateValue),
Mon_Day = format(DateValue, '%m-%d'))%>%
select(-DateValue)%>%
gather(Age, val, -Year, -Mon_Day, na.rm = T)%>%
group_by(Age, Mon_Day)%>%
filter(Year == max(Year))%>%
ungroup()%>%
select(-Year)%>%
spread(Age, val)
# A tibble: 6 x 4
Mon_Day Age1 Age2 Age3
<chr> <dbl> <dbl> <dbl>
1 07-01 13 26 45
2 07-02 14 28 46
3 07-03 15 30 47
4 07-04 16 32 48
5 07-05 17 34 49
6 07-06 18 36 50