在R中解析时间并提取满足特定条件的元素
标签:r、datetime
我有一个包含两列的大型数据帧的子集,看起来像这样:
c1 c2
gym1 Thu:8:00 AM -10:30 PM;Fri: 8:00 AM -9:00 PM
gym2 Wed:7:00 AM-4:00 PM
gym3 Mon:12:00 PM - 6:00 PM;Tue:12:00 PM - 7:00 PM;Wed:10:00 AM -10:00 PM
gym4 Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -9:00 PM
gym5 Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -12:00 PM
c1 包含健身房名称,c2 以字符格式给出每周的营业日以及各营业日的开放时间。
我该如何解析 c2,并且:1. 找出哪些健身房的开放时间超过 x 小时;2. 找出在上午 9 点之后开门的健身房?我猜我会在数据帧的末尾添加两列,其中包含 TRUE 或 FALSE 值,但我不知道该如何做到这一点。
任何帮助或指导都将不胜感激。谢谢。
下面的示例比较冗长,当然还可以简化,但我相信它能满足你的要求。它假设你的日期和时间都符合下面示例数据中(格式并不完全一致)的写法。它返回的数据帧比预期的大,因为它提供了:
- 每天的开放时间
- 每天的关闭时间
- 健身房每天开放的持续时间
- 健身房在某一天的开放时间是否达到或超过 x 小时
最后,它确定健身房是否在任何一天于上午 9 点或之后开门,
以及健身房在任何一天的开放时间是否达到或超过 x 小时。
# Sample data: one row per gym. c1 is the gym name; c2 lists, for every open
# day, "<Day>:<open time>-<close time>", with the days separated by ";".
df <- data.frame(
  c1 = c("gym1", "gym2", "gym3", "gym4", "gym5"),
  c2 = c(
    "Thu:8:00 AM -10:30 PM;Fri: 8:00 AM -9:00 PM",
    "Wed:7:00 AM-4:00 PM",
    "Mon:12:00 PM - 6:00 PM;Tue:12:00 PM - 7:00 PM;Wed:10:00 AM -10:00 PM",
    "Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -9:00 PM",
    "Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -12:00 PM"
  )
)

# Bring every schedule string into one fixed-width, lower-case layout such as
# "thu:08:00am-10:30pm", so that times can later be cut out by position.
df$c2 <- local({
  schedule <- gsub(" +", "", df$c2)                     # drop all spaces
  schedule <- gsub(":([1-9]):", ":0\\1:", schedule)     # pad opening hour
  schedule <- gsub("-([1-9]):", "-0\\1:", schedule)     # pad closing hour
  tolower(schedule)
})
library(stringr)
# Threshold, in hours, for a day to count as "long". The comparison used
# below is inclusive: a day that lasts exactly x hours is flagged.
x <- 7

week_days <- c("mon", "tue", "wed", "thu", "fri", "sat", "sun")
time_format <- "%I:%M%p"
# Only the time of day matters, so parse everything in UTC; a daylight-saving
# change in the local time zone can then never affect a duration.
nine_am <- strptime("09:00am", time_format, tz = "UTC")

# For every weekday add five columns to df:
#   <day>_open, <day>_closed            times as text (NA if closed that day)
#   <day>_duration                      opening duration in hours
#   <day>_open_greater_than_x_hours     TRUE when duration >= x
#   <day>_open_after_9am                TRUE when opening at or after 9:00 am
#                                       (NA if closed that day)
# Assumes df$c2 holds fixed-width, lower-case entries such as
# "thu:08:00am-10:30pm"; the times are cut out by character position.
for (day in week_days) {
  open_col <- paste0(day, "_open")
  closed_col <- paste0(day, "_closed")
  duration_col <- paste0(day, "_duration")
  long_col <- paste0(day, "_open_greater_than_x_hours")
  after_9_col <- paste0(day, "_open_after_9am")

  # Index of the last character of the day label, NA where the day is absent.
  day_end <- stringr::str_locate(df$c2, day)[, "end"]
  has_day <- !is.na(day_end)

  # Opening time: 7 characters starting 2 past the label (skipping ":").
  # Closing time: 7 characters starting 10 past the label (skipping "-").
  df[[open_col]] <- NA_character_
  df[[closed_col]] <- NA_character_
  df[[open_col]][has_day] <- str_sub(
    df$c2[has_day], day_end[has_day] + 2, day_end[has_day] + 8
  )
  df[[closed_col]][has_day] <- str_sub(
    df$c2[has_day], day_end[has_day] + 10, day_end[has_day] + 16
  )

  opened_at <- strptime(df[[open_col]], time_format, tz = "UTC")
  closed_at <- strptime(df[[closed_col]], time_format, tz = "UTC")
  df[[duration_col]] <- as.numeric(
    difftime(closed_at, opened_at, units = "hours")
  )

  # Closed days have an NA duration and are reported as FALSE.
  df[[long_col]] <- !is.na(df[[duration_col]]) & df[[duration_col]] >= x

  df[[after_9_col]] <- opened_at >= nine_am
}

# Per-gym summaries: TRUE when the condition holds on at least one day.
# (Counting with "> 1" would wrongly demand two or more matching days.)
after_9_cols <- paste0(week_days, "_open_after_9am")
long_cols <- paste0(week_days, "_open_greater_than_x_hours")
df$any_day_open_after_9am <-
  rowSums(df[, after_9_cols, drop = FALSE], na.rm = TRUE) > 0
df$open_greater_than_x_hours <-
  rowSums(df[, long_cols, drop = FALSE], na.rm = TRUE) > 0
df
第一步是将数据转换成更整洁的格式:先用 strsplit 把各天拆分到一个由字符向量组成的列表列中,再用 unnest 为每一天生成一行。之后还需要再做一些清理。
library("lubridate")
library("tidyverse")
# Example data: one row per gym; c2 lists the hours of every open day,
# separated by ";". The table is built directly with tibble() because c2
# itself contains spaces, so it cannot be read reliably as
# whitespace-delimited text.
df <- tibble(
  c1 = c("gym1", "gym2", "gym3", "gym4", "gym5"),
  c2 = c(
    "Thu:8:00 AM -10:30 PM;Fri: 8:00 AM -9:00 PM",
    "Wed:7:00 AM-4:00 PM",
    "Mon:12:00 PM - 6:00 PM;Tue:12:00 PM - 7:00 PM;Wed:10:00 AM -10:00 PM",
    "Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -9:00 PM",
    "Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -12:00 PM"
  )
)

# Tidy the schedule into one row per gym and open day, with parsed opening and
# closing times and the opening duration in hours.
df_tidied <-
  df %>%
  # One list element per day, then one row per day.
  mutate(c2 = strsplit(c2, ";", fixed = TRUE)) %>%
  unnest(c2) %>%
  # Split at the first ":" only; the remaining colons belong to the times.
  separate(c2, c("open_day", "times"), sep = ":", extra = "merge") %>%
  mutate(times = gsub(" ", "", times, fixed = TRUE)) %>%
  separate(times, c("open_time", "close_time"), sep = "-") %>%
  mutate(
    open_time = parse_date_time(open_time, "%I:%M%p"),
    # Fallback kept from the original: a closing time that cannot be parsed
    # is replaced by "one hour after opening".
    close_time = coalesce(
      parse_date_time(close_time, "%I:%M%p"),
      open_time + hours(1)
    ),
    # Fix the unit explicitly so the column is always expressed in hours.
    opening_hours = difftime(close_time, open_time, units = "hours")
  )

df_tidied
# Expected result (the printed layout may vary with package versions):
#> # A tibble: 10 x 5
#>    c1    open_day open_time           close_time          opening_hours
#>    <chr> <chr>    <dttm>              <dttm>              <time>
#>  1 gym1  Thu      0000-01-01 08:00:00 0000-01-01 22:30:00 14.5
#>  2 gym1  Fri      0000-01-01 08:00:00 0000-01-01 21:00:00 13
#>  3 gym2  Wed      0000-01-01 07:00:00 0000-01-01 16:00:00 9
#>  4 gym3  Mon      0000-01-01 12:00:00 0000-01-01 18:00:00 6
#>  5 gym3  Tue      0000-01-01 12:00:00 0000-01-01 19:00:00 7
#>  6 gym3  Wed      0000-01-01 10:00:00 0000-01-01 22:00:00 12
#>  7 gym4  Sat      0000-01-01 08:00:00 0000-01-01 22:30:00 14.5
#>  8 gym4  Sun      0000-01-01 08:00:00 0000-01-01 21:00:00 13
#>  9 gym5  Sat      0000-01-01 08:00:00 0000-01-01 22:30:00 14.5
#> 10 gym5  Sun      0000-01-01 08:00:00 0000-01-01 12:00:00 4
df=dat%>%tidytext::unnest_tokens(word, c2, token = strsplit, split = ";")%>%
separate(word,c("day","open_time","close_time"),"(
(请考虑规范化你的数据,把每个健身房的每个开放时段放到数据帧的单独一行。)也许有人能针对你现有的数据给出答案,但你应该认真考虑改变数据的设计。我认为解决方案很可能会用到 strsplit(…, ";")、strsplit(…, "-")(再加上重复星期几)、as.POSIXct(…),
以及某种从宽到长的数据帧转换(例如 tidyr::gather)。
但我愿意做这一切的唯一原因,是你完全无法控制这种数据格式。
很遗憾,我无法控制传入数据的格式……
# Total opening hours per gym: group the tidied rows by gym name and let
# tally() sum the opening_hours weights into column `n`.
tally(group_by(df_tidied, c1), wt = opening_hours)
#> # A tibble: 5 x 2
#> c1 n
#> <chr> <time>
#> 1 gym1 27.5
#> 2 gym2 9
#> 3 gym3 14
#> 4 gym4 27.5
#> 5 gym5 18.5
# NOTE: this captured output comes from a run in which gym3's Wed closing time
# fell back to "opening + 1 hour" (6 + 7 + 1 = 14). With 10:00 PM parsed
# correctly, gym3's total is 6 + 7 + 12 = 25.
# Reshape `dat` (defined outside this snippet) into one row per gym and day.
# Step 1: split c2 on ";" so that every day gets its own row in `word`.
df <- tidytext::unnest_tokens(dat, word, c2, token = strsplit, split = ";")
# Step 2: cut each entry into day / opening time / closing time. The pattern
# splits on a ":" that directly follows a lower-case letter (the day label,
# which appears in lower case in the printed result) or on "-".
df <- separate(
  df,
  word,
  into = c("day", "open_time", "close_time"),
  sep = "(?<=[a-z]):|-"
)
# Step 3: opening duration as the difference of the two parsed times.
df <- mutate(
  df,
  duration = strptime(close_time, "%I:%M %p") - strptime(open_time, "%I:%M %p")
)
df
c1 day open_time close_time duration
1 gym1 thu 8:00 am 10:30 pm 14.5 hours
2 gym1 fri 8:00 am 9:00 pm 13.0 hours
3 gym2 wed 7:00 am 4:00 pm 9.0 hours
4 gym3 mon 12:00 pm 6:00 pm 6.0 hours
5 gym3 tue 12:00 pm 7:00 pm 7.0 hours
6 gym3 wed 10:00 am 10:00 pm 12.0 hours
7 gym4 sat 8:00 am 10:30 pm 14.5 hours
8 gym4 sun 8:00 am 9:00 pm 13.0 hours
# For every gym (c1), flag each day on which it opens later than 9:00 am and
# attach the gym's total opening hours. The result stays grouped by c1 and is
# printed rather than assigned.
df %>%
  group_by(c1) %>%
  mutate(
    # Compare on the 24-hour clock with minutes included ("%H%M" gives e.g.
    # 0930 or 1300). The 12-hour "%I" value would rank 1:00 pm (1) below 9,
    # treat 12:00 am as 12, and ignore minutes such as 9:30 am.
    openafter9 =
      as.numeric(format(strptime(open_time, "%I:%M %p"), "%H%M")) > 900,
    # mutate() keeps one row per day. summarise() could be used instead, but
    # the opening time may differ from day to day, so collapse with care.
    Tot_Hrs_opn = sum(duration)
  )
# A tibble: 8 x 7
# Groups: c1 [4]
c1 day open_time close_time duration openafter9 Tot_Hrs_opn
<chr> <chr> <chr> <chr> <time> <lgl> <time>
1 gym1 thu 8:00 am 10:30 pm 14.5 hours FALSE 27.5 hours
2 gym1 fri 8:00 am 9:00 pm 13.0 hours FALSE 27.5 hours
3 gym2 wed 7:00 am 4:00 pm 9.0 hours FALSE 9.0 hours
4 gym3 mon 12:00 pm 6:00 pm 6.0 hours TRUE 25.0 hours
5 gym3 tue 12:00 pm 7:00 pm 7.0 hours TRUE 25.0 hours
6 gym3 wed 10:00 am 10:00 pm 12.0 hours TRUE 25.0 hours
7 gym4 sat 8:00 am 10:30 pm 14.5 hours FALSE 27.5 hours
8 gym4 sun 8:00 am 9:00 pm 13.0 hours FALSE 27.5 hours