在R中解析时间并提取满足特定条件的元素_R_Datetime

在R中解析时间并提取满足特定条件的元素

r datetime

在R中解析时间并提取满足特定条件的元素,r,datetime,R,Datetime,我有一个包含两列的大型数据帧的子集，看起来像这样 c1 c2 gym1 Thu:8:00 AM -10:30 PM;Fri: 8:00 AM -9:00 PM gym2 Wed:7:00 AM-4:00 PM gym3 Mon:12:00 PM - 6:00 PM;Tue:12:00 PM - 7:00 PM;Wed:10:00 AM -10:00 PM gym4

我有一个包含两列的大型数据帧的子集，看起来像这样

c1                      c2
gym1               Thu:8:00 AM -10:30 PM;Fri: 8:00 AM -9:00 PM
gym2               Wed:7:00 AM-4:00 PM
gym3               Mon:12:00 PM - 6:00 PM;Tue:12:00 PM - 7:00 PM;Wed:10:00 AM -10:00 PM
gym4               Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -9:00 PM
gym5               Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -12:00 PM

c1包含健身房名称，c2以字符串形式包含每周的营业日及对应的营业时间

我该如何解析c2，并且：1. 找出哪些健身房的开放时间超过x小时；2. 找出上午9点之后开门的健身房？我猜需要在数据帧的末尾添加两列，用来存放真值或假值，但我不知道该如何做到这一点

任何帮助或指导都将不胜感激。谢谢。

下面的例子很冗长，当然可以简化。然而，我相信它能满足你的要求

它假设您的日期都符合以下（格式不一致）方式

它返回比预期更大的数据帧，因为它提供：

每天的开放时间
每天的关闭时间
健身房每天开放的持续时间
如果健身房在某一天的开放时间超过x小时
最后，它确定健身房是否在任何一天的上午9点之后（或正好9点）开门
最后，它确定健身房在任何一天的开放时间是否超过x小时

# Parse free-text opening hours into per-day open/close times, durations and
# flags, then summarise per gym. Uses base R string functions only.

# Sample data: c1 is the gym name, c2 holds "Day:open-close" entries
# separated by ";" with inconsistent spacing.
df <- data.frame(
  c1 = c("gym1", "gym2", "gym3", "gym4", "gym5"),
  c2 = c(
    "Thu:8:00 AM -10:30 PM;Fri: 8:00 AM -9:00 PM",
    "Wed:7:00 AM-4:00 PM",
    "Mon:12:00 PM - 6:00 PM;Tue:12:00 PM - 7:00 PM;Wed:10:00 AM -10:00 PM",
    "Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -9:00 PM",
    "Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -12:00 PM"
  ),
  stringsAsFactors = FALSE
)

# Standardise c2 so that every time is exactly seven characters ("hh:mmam" or
# "hh:mmpm"); the fixed-width extraction below relies on this layout.
df$c2 <- gsub(" +", "", df$c2)               # remove white space
df$c2 <- gsub(":([1-9]):", ":0\\1:", df$c2)  # zero-pad opening hour
df$c2 <- gsub("-([1-9]):", "-0\\1:", df$c2)  # zero-pad closing hour
df$c2 <- tolower(df$c2)

# Threshold (hours): a day is flagged when the gym is open for at least x hours.
x <- 7

# Times are parsed in UTC so that a local daylight-saving change on the day
# the script runs cannot skew the open-to-close difference.
# NOTE(review): "%p" matching is locale-dependent; assumes "am"/"pm" are valid.
clock_format <- "%I:%M%p"
nine_am <- as.POSIXct(strptime("09:00am", clock_format, tz = "UTC"))

for (day in c("mon", "tue", "wed", "thu", "fri", "sat", "sun")) {
  # Index of the last character of the day label; NA when the day is absent.
  label_start <- as.integer(regexpr(day, df$c2, fixed = TRUE))
  label_start[label_start < 0L] <- NA_integer_
  label_end <- label_start + nchar(day) - 1L

  # Layout after the label is ":hh:mmxm-hh:mmxm", so the opening time spans
  # offsets +2..+8 and the closing time +10..+16. substr() yields NA for rows
  # where the day is missing.
  open_col <- paste0(day, "_open")
  closed_col <- paste0(day, "_closed")
  df[[open_col]] <- substr(df$c2, label_end + 2L, label_end + 8L)
  df[[closed_col]] <- substr(df$c2, label_end + 10L, label_end + 16L)

  opens_at <- as.POSIXct(strptime(df[[open_col]], clock_format, tz = "UTC"))
  closes_at <- as.POSIXct(strptime(df[[closed_col]], clock_format, tz = "UTC"))

  # Hours open on this day (NA when the gym is closed that day).
  hours_open <- as.numeric(difftime(closes_at, opens_at, units = "hours"))
  df[[paste0(day, "_duration")]] <- hours_open

  # Open for at least x hours? Days without hours count as FALSE.
  df[[paste0(day, "_open_greater_than_x_hours")]] <-
    !is.na(hours_open) & hours_open >= x

  # Opens at or after 9 am? NA when the gym is closed that day.
  df[[paste0(day, "_open_after_9am")]] <- opens_at >= nine_am
}

# Determine if a gym opens after (or at) 9am on at least one day
after_9_cols <- grepl("after_9", names(df))
df$any_day_open_after_9am <-
  rowSums(df[, after_9_cols, drop = FALSE], na.rm = TRUE) > 0

# Determine if a gym is open for at least x hours on at least one day
long_day_cols <- grepl("open_greater_tha", names(df))
df$open_greater_than_x_hours <-
  rowSums(df[, long_day_cols, drop = FALSE], na.rm = TRUE) > 0
df
第一步是将数据转换成更整洁的格式：先用 strsplit 把各天拆分到一个由字符向量组成的列表列中，然后用 unnest 为每一天生成一行。之后还需要做一些进一步的清理。
library("lubridate")
library("tidyverse")

# Sample data built directly as a tibble. Parsing it from whitespace-aligned
# text is unsafe here because the c2 values themselves contain spaces, so a
# whitespace-based reader can split or truncate them.
df <- tibble(
  c1 = c("gym1", "gym2", "gym3", "gym4", "gym5"),
  c2 = c(
    "Thu:8:00 AM -10:30 PM;Fri: 8:00 AM -9:00 PM",
    "Wed:7:00 AM-4:00 PM",
    "Mon:12:00 PM - 6:00 PM;Tue:12:00 PM - 7:00 PM;Wed:10:00 AM -10:00 PM",
    "Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -9:00 PM",
    "Sat:8:00 AM -10:30 PM;Sun: 8:00 AM -12:00 PM"
  )
)

# One row per gym per day, with parsed opening/closing times and the number
# of hours open on that day.
df_tidied <-
  df %>%
  # Split "day:hours;day:hours" into a list column, then one row per day.
  mutate(c2 = strsplit(c2, ";")) %>%
  unnest(c2) %>%
  # Split on the first ":" only; extra = "merge" keeps "8:00 AM -10:30 PM"
  # intact in `times`.
  separate(c2, c("open_day", "times"), sep = ":", extra = "merge") %>%
  mutate(times = gsub(" ", "", times)) %>%
  separate(times, c("open_time", "close_time"), sep = "-") %>%
  mutate(
    open_time = parse_date_time(open_time, "%I:%M%p"),
    # NOTE(review): when the closing time cannot be parsed this silently
    # substitutes "one hour after opening"; confirm that fallback is wanted
    # rather than NA.
    close_time = coalesce(
      parse_date_time(close_time, "%I:%M%p"),
      open_time + hours(1)),
    # Explicit units so the column is always in hours, regardless of how
    # small the differences are.
    opening_hours = difftime(close_time, open_time, units = "hours"))
df_tidied
#> # A tibble: 10 x 5
#>    c1    open_day open_time           close_time          opening_hours
#>    <chr> <chr>    <dttm>              <dttm>              <time>       
#>  1 gym1  Thu      0000-01-01 08:00:00 0000-01-01 22:30:00 14.5         
#>  2 gym1  Fri      0000-01-01 08:00:00 0000-01-01 21:00:00 13           
#>  3 gym2  Wed      0000-01-01 07:00:00 0000-01-01 16:00:00 9            
#>  4 gym3  Mon      0000-01-01 12:00:00 0000-01-01 18:00:00 6            
#>  5 gym3  Tue      0000-01-01 12:00:00 0000-01-01 19:00:00 7            
#>  6 gym3  Wed      0000-01-01 10:00:00 0000-01-01 22:00:00 12           
#>  7 gym4  Sat      0000-01-01 08:00:00 0000-01-01 22:30:00 14.5         
#>  8 gym4  Sun      0000-01-01 08:00:00 0000-01-01 21:00:00 13           
#>  9 gym5  Sat      0000-01-01 08:00:00 0000-01-01 22:30:00 14.5         
#> 10 gym5  Sun      0000-01-01 08:00:00 0000-01-01 12:00:00 4    

df=dat%>%tidytext::unnest_tokens(word, c2, token = strsplit, split = ";")%>%
    separate(word,c("day","open_time","close_time"),"(?<=[a-z]):|-")
（评论）请考虑对数据进行规范化，把每个健身房的每一个营业时段放到数据帧中单独的一行。也许有人能针对你目前的数据给出答案，但你应该认真考虑改变数据的设计。
（评论）我认为解决方案很可能会用到 strsplit(..., ";")、strsplit(..., "-")（再加上重复星期几）、as.POSIXct(...)，以及某种形式的数据帧宽转长变换（例如 tidyr::gather）。不过，只有在完全无法控制数据格式的情况下，我才会去做这些。
（提问者回复）很遗憾，我无法控制传入数据的格式……
# Total opening hours per gym: count rows per c1, weighting each row by its
# opening_hours so that `n` is the sum rather than the number of rows.
# The output below is the run as originally captured; the totals follow
# whatever df_tidied holds when this is executed.
df_tidied %>%
  count(c1, wt = opening_hours)
#> # A tibble: 5 x 2
#>   c1    n     
#>   <chr> <time>
#> 1 gym1  27.5  
#> 2 gym2  9     
#> 3 gym3  14    
#> 4 gym4  27.5  
#> 5 gym5  18.5 

# Tidy the raw opening hours into one row per gym per day and compute how
# long the gym is open on that day.
# NOTE(review): `dat` is not defined in this snippet; presumably it is the
# original two-column data frame (c1, c2) from the question.
df <- dat %>%
  # Split c2 on ";" so each day gets its own row. The printed result is
  # lower-case, which is why the separator pattern below only checks [a-z].
  tidytext::unnest_tokens(word, c2, token = strsplit, split = ";") %>%
  # Split on the ":" that directly follows the day label, or on "-" between
  # the opening and closing time.
  separate(word, c("day", "open_time", "close_time"), "(?<=[a-z]):|-") %>%
  mutate(
    # Explicit hours (not auto-selected units) and UTC parsing, so the result
    # is stable for short durations and on daylight-saving change days.
    duration = difftime(
      strptime(close_time, "%I:%M %p", tz = "UTC"),
      strptime(open_time, "%I:%M %p", tz = "UTC"),
      units = "hours"
    )
  )
  df
   c1 day open_time close_time   duration
1 gym1 thu  8:00 am    10:30 pm 14.5 hours
2 gym1 fri  8:00 am     9:00 pm 13.0 hours
3 gym2 wed   7:00 am    4:00 pm  9.0 hours
4 gym3 mon 12:00 pm     6:00 pm  6.0 hours
5 gym3 tue 12:00 pm     7:00 pm  7.0 hours
6 gym3 wed 10:00 am    10:00 pm 12.0 hours
7 gym4 sat  8:00 am    10:30 pm 14.5 hours
8 gym4 sun  8:00 am     9:00 pm 13.0 hours

# Per gym: flag each day whose opening time is later than 9:00 am and attach
# the gym's total opening hours across all listed days.
df %>%
  group_by(c1) %>%
  mutate(
    # Compare on the 24-hour clock as HHMM. Using the 12-hour "%I" value
    # would treat 1:00 pm as hour 1 (not after 9), 12:00 am as hour 12
    # (after 9), and would ignore minutes such as 9:30 am.
    openafter9 = as.numeric(
      format(strptime(open_time, "%I:%M %p"), "%H%M")
    ) > 900,
    # You can decide to use summarize, but remember the opening hour may
    # depend on the day, so you need to be careful.
    Tot_Hrs_opn = sum(duration)
  )
# A tibble: 8 x 7
# Groups:   c1 [4]
     c1   day open_time close_time   duration openafter9 Tot_Hrs_opn
  <chr> <chr>     <chr>      <chr>     <time>      <lgl>      <time>
1  gym1   thu  8:00 am    10:30 pm 14.5 hours      FALSE  27.5 hours
2  gym1   fri  8:00 am     9:00 pm 13.0 hours      FALSE  27.5 hours
3  gym2   wed   7:00 am    4:00 pm  9.0 hours      FALSE   9.0 hours
4  gym3   mon 12:00 pm     6:00 pm  6.0 hours       TRUE  25.0 hours
5  gym3   tue 12:00 pm     7:00 pm  7.0 hours       TRUE  25.0 hours
6  gym3   wed 10:00 am    10:00 pm 12.0 hours       TRUE  25.0 hours
7  gym4   sat  8:00 am    10:30 pm 14.5 hours      FALSE  27.5 hours
8  gym4   sun  8:00 am     9:00 pm 13.0 hours      FALSE  27.5 hours