检查日期是否在R中的间隔内
我定义了这三个时间间隔:检查日期是否在R中的间隔内,r,lubridate,R,Lubridate,我定义了这三个时间间隔: YEAR_1 <- interval(ymd('2002-09-01'), ymd('2003-08-31')) YEAR_2 <- interval(ymd('2003-09-01'), ymd('2004-08-31')) YEAR_3 <- interval(ymd('2004-09-01'), ymd('2005-08-31')) 但我不知道该怎么做。我想我需要用一个apply 这是我的数据框: structure(c(1055289
YEAR_1 <- interval(ymd('2002-09-01'), ymd('2003-08-31'))
YEAR_2 <- interval(ymd('2003-09-01'), ymd('2004-08-31'))
YEAR_3 <- interval(ymd('2004-09-01'), ymd('2005-08-31'))
但我不知道该怎么做。我想我需要用一个apply
这是我的数据框:
structure(c(1055289600, 1092182400, 1086220800, 1074556800, 1109289600,
1041897600, 1069200000, 1047427200, 1072656000, 1048636800, 1092873600,
1090195200, 1051574400, 1052179200, 1130371200, 1242777600, 1140652800,
1137974400, 1045526400, 1111104000, 1073952000, 1052870400, 1087948800,
1053993600, 1039564800, 1141603200, 1074038400, 1105315200, 1060560000,
1072051200, 1046217600, 1107129600, 1088553600, 1071619200, 1115596800,
1050364800, 1147046400, 1083628800, 1056412800, 1159747200, 1087257600,
1201478400, 1120521600, 1066176000, 1034553600, 1057622400, 1078876800,
1010880000, 1133913600, 1098230400, 1170806400, 1037318400, 1070409600,
1091577600, 1057708800, 1182556800, 1091059200, 1058227200, 1061337600,
1034121600, 1067644800, 1039478400, 1022198400, 1063065600, 1096329600,
1049760000, 1081728000, 1016150400, 1029801600, 1059350400, 1087257600,
1181692800, 1310947200, 1125446400, 1057104000, NA, 1085529600,
1037664000, 1091577600, 1080518400, 1110758400, 1092787200, 1094601600,
1169424000, 1232582400, 1058918400, 1021420800, 1133136000, 1030320000,
1060732800, 1035244800, 1090800000, 1129161600, 1055808000, 1060646400,
1028678400, 1075852800, 1144627200, 1111363200, 1070236800), class = c("POSIXct",
"POSIXt"), tzone = "UTC")
您可以使用package
Purr
中的walk
进行以下操作:
purrr::walk(1:3, ~(df$Year[as.POSIXlt(df$DATE) %within% get(paste0("YEAR_", .))] <<- .))
df$year = lapply(df$dates,FUN = function(x){
x = as.Date(x)
if(is.na(x)){
return(NA)
}
for(i in 1:nrow(intervals){
if(df.intervals[i,"Start"]<=x & x<= df.intervals[i,"End"]){
return(paste0(YEAR_,i))}
}})
purrr::walk(1:3,~(df$Year[as.POSIXlt(df$DATE)%in%get(paste0(“Year”,)))您可以尝试以下方法:
df = as.data.frame(structure(c(1055289600, 1092182400, 1086220800, 1074556800, 1109289600,
1041897600, 1069200000, 1047427200, 1072656000, 1048636800, 1092873600,
1090195200, 1051574400, 1052179200, 1130371200, 1242777600, 1140652800,
1137974400, 1045526400, 1111104000, 1073952000, 1052870400, 1087948800,
1053993600, 1039564800, 1141603200, 1074038400, 1105315200, 1060560000,
1072051200, 1046217600, 1107129600, 1088553600, 1071619200, 1115596800,
1050364800, 1147046400, 1083628800, 1056412800, 1159747200, 1087257600,
1201478400, 1120521600, 1066176000, 1034553600, 1057622400, 1078876800,
1010880000, 1133913600, 1098230400, 1170806400, 1037318400, 1070409600,
1091577600, 1057708800, 1182556800, 1091059200, 1058227200, 1061337600,
1034121600, 1067644800, 1039478400, 1022198400, 1063065600, 1096329600,
1049760000, 1081728000, 1016150400, 1029801600, 1059350400, 1087257600,
1181692800, 1310947200, 1125446400, 1057104000, NA, 1085529600,
1037664000, 1091577600, 1080518400, 1110758400, 1092787200, 1094601600,
1169424000, 1232582400, 1058918400, 1021420800, 1133136000, 1030320000,
1060732800, 1035244800, 1090800000, 1129161600, 1055808000, 1060646400,
1028678400, 1075852800, 1144627200, 1111363200, 1070236800), class = c("POSIXct",
"POSIXt"), tzone = "UTC"))
colnames(df)[1] = "dates"
YEAR_1_Start = as.Date('2002-09-01')
YEAR_1_End = as.Date('2003-08-31')
YEAR_2_Start = as.Date('2003-09-01')
YEAR_2_End = as.Date('2004-08-31')
YEAR_3_Start = as.Date('2004-09-01')
YEAR_3_End = as.Date('2005-08-31')
df$year = lapply(df$dates,FUN = function(x){
x = as.Date(x)
if(is.na(x)){
return(NA)
}else if(YEAR_1_Start <= x & x <= YEAR_1_End){
return("YEAR_1")
}else if(YEAR_2_Start <= x & x <= YEAR_2_End){
return("YEAR_2")
}else if(YEAR_3_Start <= x & x <= YEAR_3_End){
return("YEAR_3")
}else{
return("Other")
}
})
df
dates year
1 2003-06-11 YEAR_1
2 2004-08-11 YEAR_2
3 2004-06-03 YEAR_2
4 2004-01-20 YEAR_2
5 2005-02-25 YEAR_3
6 2003-01-07 YEAR_1
7 2003-11-19 YEAR_2
8 2003-03-12 YEAR_1
9 2003-12-29 YEAR_2
10 2003-03-26 YEAR_1
11 2004-08-19 YEAR_2
12 2004-07-19 YEAR_2
13 2003-04-29 YEAR_1
14 2003-05-06 YEAR_1
15 2005-10-27 Other
16 2009-05-20 Other
17 2006-02-23 Other
18 2006-01-23 Other
19 2003-02-18 YEAR_1
20 2005-03-18 YEAR_3
21 2004-01-13 YEAR_2
22 2003-05-14 YEAR_1
23 2004-06-23 YEAR_2
24 2003-05-27 YEAR_1
25 2002-12-11 YEAR_1
26 2006-03-06 Other
27 2004-01-14 YEAR_2
28 2005-01-10 YEAR_3
29 2003-08-11 YEAR_1
30 2003-12-22 YEAR_2
31 2003-02-26 YEAR_1
32 2005-01-31 YEAR_3
33 2004-06-30 YEAR_2
34 2003-12-17 YEAR_2
35 2005-05-09 YEAR_3
36 2003-04-15 YEAR_1
37 2006-05-08 Other
38 2004-05-04 YEAR_2
39 2003-06-24 YEAR_1
40 2006-10-02 Other
41 2004-06-15 YEAR_2
42 2008-01-28 Other
43 2005-07-05 YEAR_3
44 2003-10-15 YEAR_2
45 2002-10-14 YEAR_1
46 2003-07-08 YEAR_1
47 2004-03-10 YEAR_2
48 2002-01-13 Other
49 2005-12-07 Other
50 2004-10-20 YEAR_3
51 2007-02-07 Other
52 2002-11-15 YEAR_1
53 2003-12-03 YEAR_2
54 2004-08-04 YEAR_2
55 2003-07-09 YEAR_1
56 2007-06-23 Other
57 2004-07-29 YEAR_2
58 2003-07-15 YEAR_1
59 2003-08-20 YEAR_1
60 2002-10-09 YEAR_1
61 2003-11-01 YEAR_2
62 2002-12-10 YEAR_1
63 2002-05-24 Other
64 2003-09-09 YEAR_2
65 2004-09-28 YEAR_3
66 2003-04-08 YEAR_1
67 2004-04-12 YEAR_2
68 2002-03-15 Other
69 2002-08-20 Other
70 2003-07-28 YEAR_1
71 2004-06-15 YEAR_2
72 2007-06-13 Other
73 2011-07-18 Other
74 2005-08-31 YEAR_3
75 2003-07-02 YEAR_1
76 <NA> NA
77 2004-05-26 YEAR_2
78 2002-11-19 YEAR_1
79 2004-08-04 YEAR_2
80 2004-03-29 YEAR_2
81 2005-03-14 YEAR_3
82 2004-08-18 YEAR_2
83 2004-09-08 YEAR_3
84 2007-01-22 Other
85 2009-01-22 Other
86 2003-07-23 YEAR_1
87 2002-05-15 Other
88 2005-11-28 Other
89 2002-08-26 Other
90 2003-08-13 YEAR_1
91 2002-10-22 YEAR_1
92 2004-07-26 YEAR_2
93 2005-10-13 Other
94 2003-06-17 YEAR_1
95 2003-08-12 YEAR_1
96 2002-08-07 Other
97 2004-02-04 YEAR_2
98 2006-04-10 Other
99 2005-03-21 YEAR_3
100 2003-12-01 YEAR_2
df=as.data.frame(结构,
1041897600, 1069200000, 1047427200, 1072656000, 1048636800, 1092873600,
1090195200, 1051574400, 1052179200, 1130371200, 1242777600, 1140652800,
1137974400, 1045526400, 1111104000, 1073952000, 1052870400, 1087948800,
1053993600, 1039564800, 1141603200, 1074038400, 1105315200, 1060560000,
1072051200, 1046217600, 1107129600, 1088553600, 1071619200, 1115596800,
1050364800, 1147046400, 1083628800, 1056412800, 1159747200, 1087257600,
1201478400, 1120521600, 1066176000, 1034553600, 1057622400, 1078876800,
1010880000, 1133913600, 1098230400, 1170806400, 1037318400, 1070409600,
1091577600, 1057708800, 1182556800, 1091059200, 1058227200, 1061337600,
1034121600, 1067644800, 1039478400, 1022198400, 1063065600, 1096329600,
1049760000, 1081728000, 1016150400, 1029801600, 1059350400, 1087257600,
118169280013109472011254464001057104000,北美1085529600,
1037664000, 1091577600, 1080518400, 1110758400, 1092787200, 1094601600,
1169424000, 1232582400, 1058918400, 1021420800, 1133136000, 1030320000,
1060732800, 1035244800, 1090800000, 1129161600, 1055808000, 1060646400,
1028678400、1075852800、1144627200、1111363200、1070236800),class=c(“POSIXct”,
“POSIXt”),tzone=“UTC”))
colnames(df)[1]=“日期”
第一年开始=截止日期('2002-09-01')
年份1年末=截止日期('2003-08-31')
年份开始日期=截止日期('2003-09-01')
第二年结束=截止日期('2004-08-31')
第三年开始=截止日期('2004-09-01')
第三年年底=截止日期('2005-08-31')
df$year=lappy(df$dates,FUN=function(x){
x=截止日期(x)
if(is.na(x)){
返回(NA)
}否则,如果(第1年开始)每个人都有他们最喜欢的工具,我的工具恰好是因为它称为itsdt[i,j,by]
逻辑
library(data.table)
dt <- data.table(date = as.IDate(pt))
dt[, YR := 0.0 ] # I am using a numeric for year here...
dt[ date >= as.IDate("2002-09-01") & date <= as.IDate("2003-08-31"), YR := 1 ]
dt[ date >= as.IDate("2003-09-01") & date <= as.IDate("2004-08-31"), YR := 2 ]
dt[ date >= as.IDate("2004-09-01") & date <= as.IDate("2005-08-31"), YR := 3 ]
我们也可以通过计算日期差异和截短来实现这一点,但有时稍微明确一点也不错
编辑:更通用的表单只在日期上使用算术:
R> dt[, YR2 := trunc(as.numeric(difftime(as.Date(date),
+ as.Date("2001-09-01"),
+ unit="days"))/365.25)]
R> table(dt[, YR2])
0 1 2 3 4 5 6 7 9
7 31 31 12 9 5 1 2 1
R>
这在一行中完成工作。使用lubridate
和mapply
:
library(lubridate)
dates <- # your data here
# no idea how you generated these, so let's just copy them
YEAR_1 <- interval(ymd('2002-09-01'), ymd('2003-08-31'))
YEAR_2 <- interval(ymd('2003-09-01'), ymd('2004-08-31'))
YEAR_3 <- interval(ymd('2004-09-01'), ymd('2005-08-31'))
# this should scale nicely
sapply(c(YEAR_1, YEAR_2, YEAR_3), function(x) { mapply(`%within%`, dates, x) })
也许有一种更好的方法可以用purr
来编写代码,但我太新手了,看不懂purr
。我们可以:
1st:创建一个数据。表包含所有年份
> interval.dt <- data.table(Interval = c(YEAR_1, YEAR_2, YEAR_3))
> interval.dt
# Interval
#1: 2002-09-01 UTC--2003-08-31 UTC
#2: 2003-09-01 UTC--2004-08-31 UTC
#3: 2004-09-01 UTC--2005-08-31 UTC
3rd:将FINDEARINDEX
函数应用于年日期数据中的每个元素。表格
> dt <- data.table(year = df)
> dt$YearIndex <- paste("YEAR", sapply(dt$year, findYearIndex), sep = "_")
> dt
# year YearIndex
#1: 2003-06-11 YEAR_1
#2: 2004-08-11 YEAR_2
#3: 2004-06-03 YEAR_2
#4: 2004-01-20 YEAR_2
#5: 2005-02-25 YEAR_3
#6: 2003-01-07 YEAR_1
#7: 2003-11-19 YEAR_2
#8: 2003-03-12 YEAR_1
#9: 2003-12-29 YEAR_2
#10: 2003-03-26 YEAR_1
#11: 2004-08-19 YEAR_2
#12: 2004-07-19 YEAR_2
#13: 2003-04-29 YEAR_1
#14: 2003-05-06 YEAR_1
#15: 2005-10-27 YEAR_integer(0)
#ignore the rest of dt
>dt$YearIndex dt
#年份索引
#1:2003-06-11第1年
#2:2004-08-11第2年
#3:2004-06-03第二年
#4:2004-01-20第2年
#5:2005-02-25第3年
#6:2003-01-07第1年
#7:2003-11-19岁
#8:2003-03-12第1年
#9:2003-12-29岁
#10:2003-03-26岁
#11:2004-08-19第二年
#12:2004-07-19第2年
#13:2003-04-29第1年
#14:2003-05-06第1年
#15:2005-10-27岁整数(0)
#忽略dt的其余部分
这是我的看法。我喜欢保持东西整洁;)
##加载库
>图书馆(tidyverse)
>图书馆(lubridate)
>
>##定义时间
>倍倍%
+变异(时间=截止日期(时间),
+重复的=重复的(时间)##有重复的时间!
>
>
>##定义年份
>年间隔年
>##检查数据
>时代
#一个tibble:100x2
时间重复
1 2003-06-11假
2 2004-08-11假
3 2004-06-03假
4 2004-01-20假
5 2005-02-25假
6 2003-01-07假
7 2003-11-19假
8 2003-03-12假
9 2003-12-29假
10 2003-03-26假
#…还有90行
>年头
#一个tibble:3x2
年间隔
1年世界协调时2002-09-01--世界协调时2003-08-31
2年2 2003-09-01 UTC--2004-08-31 UTC
三年世界协调时2004-09-01--世界协调时2005-08-31
>
>##创建新的指示器变量
> ##
>##连接数据集(长度=3 x 100)
>##年度指标
>##放下NAs
>##保持“时间”和“活跃”
>##加入《时代》杂志,回到完整的数据集
>##作为复制品,只保留其中一个
>穿越(次数,年)%>%
+变异(活动=如果其他(时间%在%间隔内,年份,不包含字符))%>%
+下拉菜单(活动)%>%
+选择(时间,活动)%>%
+右键联接(次数,by=“time”)%>%
+不同的()%>%
+选择(-duplicated)
#一个tibble:100x2
时间活动
1 2003-06-11岁
2 2004-08-11岁
3 2004-06-03第二年
4 2004-01-20岁
5 2005-02-25岁
6 2003-01-07第1年
7 2003-11-19岁2
8 2003-03-12学年1
9 2003-12-29岁
10 2003-03-26岁1
#…还有90行
David在这里的data.table-中使用了非等联接,给出了一个很好的版本,只要在带有开始/停止/年份列的data.table中指定了间隔。什么是pt
?谢谢如果您要使用data.table
,也可以使用%到%
;-)@MichaelChirico说得很对。我知道还有另一个操作符(我很少使用),但我看错了地方。@MonicaHeddneck在我的回答中pt
是你保存的结构中的向量,尽管你有
library(lubridate)
dates <- # your data here
# no idea how you generated these, so let's just copy them
YEAR_1 <- interval(ymd('2002-09-01'), ymd('2003-08-31'))
YEAR_2 <- interval(ymd('2003-09-01'), ymd('2004-08-31'))
YEAR_3 <- interval(ymd('2004-09-01'), ymd('2005-08-31'))
# this should scale nicely
sapply(c(YEAR_1, YEAR_2, YEAR_3), function(x) { mapply(`%within%`, dates, x) })
[,1] [,2] [,3]
[1,] TRUE FALSE FALSE
[2,] FALSE TRUE FALSE
[3,] FALSE TRUE FALSE
[4,] FALSE TRUE FALSE
... etc. (100 rows in your example data)
> interval.dt <- data.table(Interval = c(YEAR_1, YEAR_2, YEAR_3))
> interval.dt
# Interval
#1: 2002-09-01 UTC--2003-08-31 UTC
#2: 2003-09-01 UTC--2004-08-31 UTC
#3: 2004-09-01 UTC--2005-08-31 UTC
> findYearIndex <- function(year) {
interval.dt[,which(int_start(interval.dt$Interval) < year & year < int_end(interval.dt$Interval))]
}
> dt <- data.table(year = df)
> dt$YearIndex <- paste("YEAR", sapply(dt$year, findYearIndex), sep = "_")
> dt
# year YearIndex
#1: 2003-06-11 YEAR_1
#2: 2004-08-11 YEAR_2
#3: 2004-06-03 YEAR_2
#4: 2004-01-20 YEAR_2
#5: 2005-02-25 YEAR_3
#6: 2003-01-07 YEAR_1
#7: 2003-11-19 YEAR_2
#8: 2003-03-12 YEAR_1
#9: 2003-12-29 YEAR_2
#10: 2003-03-26 YEAR_1
#11: 2004-08-19 YEAR_2
#12: 2004-07-19 YEAR_2
#13: 2003-04-29 YEAR_1
#14: 2003-05-06 YEAR_1
#15: 2005-10-27 YEAR_integer(0)
#ignore the rest of dt
> ## load libraries
> library(tidyverse)
> library(lubridate)
>
> ## define times
> times <- c(1055289600, 1092182400, 1086220800, 1074556800, 1109289600,
+ 1041897600, 1069200000, 1047427200, 1072656000, 1048636800, 1092873600,
+ 1090195200, 1051574400, 1052179200, 1130371200, 1242777600, 1140652800,
+ 1137974400, 1045526400, 1111104000, 1073952000, 1052870400, 1087948800,
+ 1053993600, 1039564800, 1141603200, 1074038400, 1105315200, 1060560000,
+ 1072051200, 1046217600, 1107129600, 1088553600, 1071619200, 1115596800,
+ 1050364800, 1147046400, 1083628800, 1056412800, 1159747200, 1087257600,
+ 1201478400, 1120521600, 1066176000, 1034553600, 1057622400, 1078876800,
+ 1010880000, 1133913600, 1098230400, 1170806400, 1037318400, 1070409600,
+ 1091577600, 1057708800, 1182556800, 1091059200, 1058227200, 1061337600,
+ 1034121600, 1067644800, 1039478400, 1022198400, 1063065600, 1096329600,
+ 1049760000, 1081728000, 1016150400, 1029801600, 1059350400, 1087257600,
+ 1181692800, 1310947200, 1125446400, 1057104000, NA, 1085529600,
+ 1037664000, 1091577600, 1080518400, 1110758400, 1092787200, 1094601600,
+ 1169424000, 1232582400, 1058918400, 1021420800, 1133136000, 1030320000,
+ 1060732800, 1035244800, 1090800000, 1129161600, 1055808000, 1060646400,
+ 1028678400, 1075852800, 1144627200, 1111363200, 1070236800)
> times <- tibble(time = as.POSIXct(times, origin = "1970-01-01", tz = "UTC")) %>%
+ mutate(time = as_date(time),
+ duplicated = duplicated(time)) ## there are duplicated times!
>
>
> ## define years
> year <- c("YEAR_1", "YEAR_2", "YEAR_3")
> interval <- c(interval(ymd("2002-09-01", tz = "UTC"), ymd("2003-08-31", tz = "UTC")),
+ interval(ymd("2003-09-01", tz = "UTC"), ymd("2004-08-31", tz = "UTC")),
+ interval(ymd("2004-09-01", tz = "UTC"), ymd("2005-08-31", tz = "UTC")))
> years <- tibble(year, interval)
>
> ## check data
> times
# A tibble: 100 x 2
time duplicated
<date> <lgl>
1 2003-06-11 FALSE
2 2004-08-11 FALSE
3 2004-06-03 FALSE
4 2004-01-20 FALSE
5 2005-02-25 FALSE
6 2003-01-07 FALSE
7 2003-11-19 FALSE
8 2003-03-12 FALSE
9 2003-12-29 FALSE
10 2003-03-26 FALSE
# ... with 90 more rows
> years
# A tibble: 3 x 2
year interval
<chr> <S4: Interval>
1 YEAR_1 2002-09-01 UTC--2003-08-31 UTC
2 YEAR_2 2003-09-01 UTC--2004-08-31 UTC
3 YEAR_3 2004-09-01 UTC--2005-08-31 UTC
>
> ## create new indicator variavble
> ##
> ## join datasets (length = 3 x 100)
> ## indicator for year
> ## drop NAs
> ## keep "time" and "active"
> ## join with times to get back at full dataset
> ## as duplications, keep only one of them
> crossing(times, years) %>%
+ mutate(active = if_else(time %within% interval, year, NA_character_)) %>%
+ drop_na(active) %>%
+ select(time, active) %>%
+ right_join(times, by = "time") %>%
+ distinct() %>%
+ select(-duplicated)
# A tibble: 100 x 2
time active
<date> <chr>
1 2003-06-11 YEAR_1
2 2004-08-11 YEAR_2
3 2004-06-03 YEAR_2
4 2004-01-20 YEAR_2
5 2005-02-25 YEAR_3
6 2003-01-07 YEAR_1
7 2003-11-19 YEAR_2
8 2003-03-12 YEAR_1
9 2003-12-29 YEAR_2
10 2003-03-26 YEAR_1
# ... with 90 more rows