Apache spark 将变量从';dttm&x27;进入';POSIXCT&x27;
使用时,我将这些变量以一种称为Apache spark 将变量从';dttm&x27;进入';POSIXCT&x27;,apache-spark,r,dplyr,Apache Spark,R,Dplyr,使用时,我将这些变量以一种称为dttm的奇怪格式显示如下: tpep_pickup_datetime tpep_dropoff_datetime <dttm> <dttm> 2015-01-15 18:05:39 2015-01-15 18:23:42 2015-01-10 19:33:38 2015-01-10 1
dttm
的奇怪格式显示如下:
tpep_pickup_datetime tpep_dropoff_datetime
<dttm> <dttm>
2015-01-15 18:05:39 2015-01-15 18:23:42
2015-01-10 19:33:38 2015-01-10 19:53:28
2015-01-10 19:33:38 2015-01-10 19:43:41
2015-01-10 19:33:39 2015-01-10 19:35:31
我得到这个错误:
与此相同,我们可以使用dplyr
、lubridate::seconds\u to_period
和base::difftime
来获得输出
库(dplyr)
图书馆(lubridate)
df1%>%
变异(dur=秒到周期(如数字(difftime)(tpep\u dropoff\u datetime,
tpep_皮卡_日期时间),
单位(秒)
#>#tibble:4 x 3
#>tpep_皮卡_日期时间tpep_下车_日期时间dur
#>
#>1 2015-01-15 18:05:39 2015-01-15 18:23:42 18米3秒
#>2 2015-01-10 19:33:38 2015-01-10 19:53:28 19米50秒
#>3 2015-01-10 19:33:38 2015-01-10 19:43:41
#>4 2015-01-10 19:33:39 2015-01-10 19:35:31 1M 52S
如果您希望以秒
的形式输出,而不是以.period
的形式输出,则下面的代码可以工作
库(dplyr)
df1%>%
变异(dur=as.numeric(difftime)(tpep\u dropoff\u datetime,
tpep_皮卡_日期时间),
单位(秒)
#>#tibble:4 x 3
#>tpep_皮卡_日期时间tpep_下车_日期时间dur
#>
#> 1 2015-01-15 18:05:39 2015-01-15 18:23:42 1083
#> 2 2015-01-10 19:33:38 2015-01-10 19:53:28 1190
#> 3 2015-01-10 19:33:38 2015-01-10 19:43:41 603
#> 4 2015-01-10 19:33:39 2015-01-10 19:35:31 112
数据:
read.table(text=“tpep\u pickup\u datetime tpep\u dropoff\u datetime
2015-01-15_18:05:39 2015-01-15_18:23:42
2015-01-10_19:33:38 2015-01-10_19:53:28
2015-01-10_19:33:38 2015-01-10_19:43:41
2015-01-10_19:33:39 2015-01-10_19:35:31“,stringsAsFactors=F,header=T)->df1
df1%>%
变异时间(VAR(“tpep拾取日期时间”、“tpep衰减日期时间”),
列表(~as.POSIXct(,format=“%Y-%m-%d_%H:%m:%S”,tz=Sys.timezone())))%>%
as_tible->df1
df1#只是为了显示数据
#>tpep_取货_日期时间tpep_卸货_日期时间
#>
#> 1 2015-01-15 18:05:39 2015-01-15 18:23:42
#> 2 2015-01-10 19:33:38 2015-01-10 19:53:28
#> 3 2015-01-10 19:33:38 2015-01-10 19:43:41
#> 4 2015-01-10 19:33:39 2015-01-10 19:35:31
与此相同,我们可以使用dplyr
、lubridate::seconds\u to_period
和base::difftime
获得输出
库(dplyr)
图书馆(lubridate)
df1%>%
变异(dur=秒到周期(如数字(difftime)(tpep\u dropoff\u datetime,
tpep_皮卡_日期时间),
单位(秒)
#>#tibble:4 x 3
#>tpep_皮卡_日期时间tpep_下车_日期时间dur
#>
#>1 2015-01-15 18:05:39 2015-01-15 18:23:42 18米3秒
#>2 2015-01-10 19:33:38 2015-01-10 19:53:28 19米50秒
#>3 2015-01-10 19:33:38 2015-01-10 19:43:41
#>4 2015-01-10 19:33:39 2015-01-10 19:35:31 1M 52S
如果您希望以秒
的形式输出,而不是以.period
的形式输出,则下面的代码可以工作
库(dplyr)
df1%>%
变异(dur=as.numeric(difftime)(tpep\u dropoff\u datetime,
tpep_皮卡_日期时间),
单位(秒)
#>#tibble:4 x 3
#>tpep_皮卡_日期时间tpep_下车_日期时间dur
#>
#> 1 2015-01-15 18:05:39 2015-01-15 18:23:42 1083
#> 2 2015-01-10 19:33:38 2015-01-10 19:53:28 1190
#> 3 2015-01-10 19:33:38 2015-01-10 19:43:41 603
#> 4 2015-01-10 19:33:39 2015-01-10 19:35:31 112
数据:
read.table(text=“tpep\u pickup\u datetime tpep\u dropoff\u datetime
2015-01-15_18:05:39 2015-01-15_18:23:42
2015-01-10_19:33:38 2015-01-10_19:53:28
2015-01-10_19:33:38 2015-01-10_19:43:41
2015-01-10_19:33:39 2015-01-10_19:35:31“,stringsAsFactors=F,header=T)->df1
df1%>%
变异时间(VAR(“tpep拾取日期时间”、“tpep衰减日期时间”),
列表(~as.POSIXct(,format=“%Y-%m-%d_%H:%m:%S”,tz=Sys.timezone())))%>%
as_tible->df1
df1#只是为了显示数据
#>tpep_取货_日期时间tpep_卸货_日期时间
#>
#> 1 2015-01-15 18:05:39 2015-01-15 18:23:42
#> 2 2015-01-10 19:33:38 2015-01-10 19:53:28
#> 3 2015-01-10 19:33:38 2015-01-10 19:43:41
#> 4 2015-01-10 19:33:39 2015-01-10 19:35:31
试试这个:
all_data <- all_data %>%
mutate(new_pickup = as.POSIXct(tpep_pickup_datetime)) %>%
mutate(day_pickup = as.Date(new_pickup)) %>%
mutate(time_pickup = paste(hour(new_pickup), minute(new_pickup),second(new_pickup),sep="-")) %>%
mutate(new_dropoff = as.POSIXct(tpep_dropoff_datetime)) %>%
mutate(day_dropoff = as.Date(new_dropoff)) %>%
mutate(time_dropoff = paste(hour(new_dropoff), minute(new_dropoff),second(new_dropoff),sep="-")) %>%
mutate(trip_duration = ((hour(new_dropoff) - hour(new_pickup))*3600 + (minute(new_dropoff) - minute(new_pickup))*60 + (second(new_dropoff) - second(new_pickup))))
所有_数据%
突变(新拾取=as.POSIXct(tpep拾取日期时间))%>%
变异(日期提取=截止日期(新提取))%>%
变异(时间拾取=粘贴(小时(新拾取)、分钟(新拾取)、秒(新拾取)、sep=“-”)%%>%
突变(新衰减=as.POSIXct(tpep衰减\u日期时间))%>%
变异(日期衰减=截止日期(新衰减))%>%
变异(时间衰减=粘贴(小时(新衰减)、分钟(新衰减)、秒(新衰减)、sep=“-”)%%>%
变异(行程持续时间=((小时(新下车)-小时(新上车))*3600+(分钟(新下车)-分钟(新上车))*60+(秒(新下车)-秒(新上车)))
试试这个:
all_data <- all_data %>%
mutate(new_pickup = as.POSIXct(tpep_pickup_datetime)) %>%
mutate(day_pickup = as.Date(new_pickup)) %>%
mutate(time_pickup = paste(hour(new_pickup), minute(new_pickup),second(new_pickup),sep="-")) %>%
mutate(new_dropoff = as.POSIXct(tpep_dropoff_datetime)) %>%
mutate(day_dropoff = as.Date(new_dropoff)) %>%
mutate(time_dropoff = paste(hour(new_dropoff), minute(new_dropoff),second(new_dropoff),sep="-")) %>%
mutate(trip_duration = ((hour(new_dropoff) - hour(new_pickup))*3600 + (minute(new_dropoff) - minute(new_pickup))*60 + (second(new_dropoff) - second(new_pickup))))
所有_数据%
突变(新拾取=as.POSIXct(tpep拾取日期时间))%>%
变异(日期提取=截止日期(新提取))%>%
变异(时间拾取=粘贴(小时(新拾取)、分钟(新拾取)、秒(新拾取)、sep=“-”)%%>%
突变(新衰减=as.POSIXct(tpep衰减\u日期时间))%>%
变异(日期衰减=截止日期(新衰减))%>%
变异(时间衰减=粘贴(小时(新衰减)、分钟(新衰减)、秒(新衰减)、sep=“-”)%%>%
变异(行程持续时间=((小时(新行程下降)-小时(新行程上升))*3600+(分钟(新行程下降)-分钟(新行程下降)-分钟
all_data <- all_data %>%
mutate(new_pickup = as.POSIXct(tpep_pickup_datetime)) %>%
mutate(day_pickup = as.Date(new_pickup)) %>%
mutate(time_pickup = paste(hour(new_pickup), minute(new_pickup),second(new_pickup),sep="-")) %>%
mutate(new_dropoff = as.POSIXct(tpep_dropoff_datetime)) %>%
mutate(day_dropoff = as.Date(new_dropoff)) %>%
mutate(time_dropoff = paste(hour(new_dropoff), minute(new_dropoff),second(new_dropoff),sep="-")) %>%
mutate(trip_duration = ((hour(new_dropoff) - hour(new_pickup))*3600 + (minute(new_dropoff) - minute(new_pickup))*60 + (second(new_dropoff) - second(new_pickup))))