如何像R中的SQL Windows函数那样计算日差

如何像R中的SQL Windows函数那样计算日差,r,date-difference,R,Date Difference,输入: df <- structure(list(id = c(4848L, 4887L, 4899L, 4811L, 4834L, 4892L ), item = structure(c(2L, 2L, 2L, 1L, 1L, 1L), .Label = c("Pasta", "Pizza"), class = "factor"), city = structure(c(1L, 1L, 2L, 2L, 2L, 1L), .Label = c("Berlin", "Hamburg"),

输入:

df <- structure(list(id = c(4848L, 4887L, 4899L, 4811L, 4834L, 4892L
), item = structure(c(2L, 2L, 2L, 1L, 1L, 1L), .Label = c("Pasta", 
"Pizza"), class = "factor"), city = structure(c(1L, 1L, 2L, 2L, 
2L, 1L), .Label = c("Berlin", "Hamburg"), class = "factor"), 
    date = structure(c(17199, 17201, -643892, 17449, 17459, 17515
    ), class = "Date")), .Names = c("id", "item", "city", "date"
), row.names = c(NA, -6L), class = "data.frame")

目标:

df <- structure(list(id = c(4848L, 4887L, 4899L, 4811L, 4834L, 4892L
), item = structure(c(2L, 2L, 2L, 1L, 1L, 1L), .Label = c("Pasta", 
"Pizza"), class = "factor"), city = structure(c(1L, 1L, 2L, 2L, 
2L, 1L), .Label = c("Berlin", "Hamburg"), class = "factor"), 
    date = structure(c(17199, 17201, -643892, 17449, 17459, 17515
    ), class = "Date")), .Names = c("id", "item", "city", "date"
), row.names = c(NA, -6L), class = "data.frame")
使用以下规则创建一个名为“dayDifference”的新列:对于每对“item city”对,计算相关对的日差

所需输出:

df <- structure(list(id = c(4848L, 4887L, 4899L, 4811L, 4834L, 4892L
), item = structure(c(2L, 2L, 2L, 1L, 1L, 1L), .Label = c("Pasta", 
"Pizza"), class = "factor"), city = structure(c(1L, 1L, 2L, 2L, 
2L, 1L), .Label = c("Berlin", "Hamburg"), class = "factor"), 
    date = structure(c(17199, 17201, -643892, 17449, 17459, 17515
    ), class = "Date")), .Names = c("id", "item", "city", "date"
), row.names = c(NA, -6L), class = "data.frame")


  • 第1行和第2行[Pair Piza Berlin]对应于第3行,因为2月2日和2月4日之间有3天
  • 第3行[Pair Pizza Hambourg]对应于0,因为没有日差
  • 第4行和第5行[Pair Pasta Hambourg]对应21天,因为从10到20有21天
  • 第6行[Pair]对应于0,因为没有日差
信息:当然可以有两行以上的一对[例如,我可以有一对'pizza berlin'100行:如果是这样,总是取最大(日期)并减去最小(日期)pizza berlin对

约束:

df <- structure(list(id = c(4848L, 4887L, 4899L, 4811L, 4834L, 4892L
), item = structure(c(2L, 2L, 2L, 1L, 1L, 1L), .Label = c("Pasta", 
"Pizza"), class = "factor"), city = structure(c(1L, 1L, 2L, 2L, 
2L, 1L), .Label = c("Berlin", "Hamburg"), class = "factor"), 
    date = structure(c(17199, 17201, -643892, 17449, 17459, 17515
    ), class = "Date")), .Names = c("id", "item", "city", "date"
), row.names = c(NA, -6L), class = "data.frame")
需要在R中完成[例如,没有与数据库的外部连接]

源代码:

df <- structure(list(id = c(4848L, 4887L, 4899L, 4811L, 4834L, 4892L
), item = structure(c(2L, 2L, 2L, 1L, 1L, 1L), .Label = c("Pasta", 
"Pizza"), class = "factor"), city = structure(c(1L, 1L, 2L, 2L, 
2L, 1L), .Label = c("Berlin", "Hamburg"), class = "factor"), 
    date = structure(c(17199, 17201, -643892, 17449, 17459, 17515
    ), class = "Date")), .Names = c("id", "item", "city", "date"
), row.names = c(NA, -6L), class = "data.frame")
df不漂亮,但是

i<-unique(lapply(1:nrow(df),function(x) which(paste(df[,2],df[,3]) %in% paste(df[x,2],df[x,3]))))
for(j in 1:length(i)) df[i[[j]],"days"]<-abs(difftime(df[i[[j]],][1,"date"],df[i[[j]],][2,"date"]))

> df
    id  item    city       date days
1 4848 Pizza  Berlin 2017-02-02    2
2 4887 Pizza  Berlin 2017-02-04    2
3 4899 Pizza Hamburg 0207-02-01   NA
4 4811 Pasta Hamburg 2017-10-10   10
5 4834 Pasta Hamburg 2017-10-20   10
6 4892 Pasta  Berlin 2017-12-15   NA
i不漂亮,但是

i<-unique(lapply(1:nrow(df),function(x) which(paste(df[,2],df[,3]) %in% paste(df[x,2],df[x,3]))))
for(j in 1:length(i)) df[i[[j]],"days"]<-abs(difftime(df[i[[j]],][1,"date"],df[i[[j]],][2,"date"]))

> df
    id  item    city       date days
1 4848 Pizza  Berlin 2017-02-02    2
2 4887 Pizza  Berlin 2017-02-04    2
3 4899 Pizza Hamburg 0207-02-01   NA
4 4811 Pasta Hamburg 2017-10-10   10
5 4834 Pasta Hamburg 2017-10-20   10
6 4892 Pasta  Berlin 2017-12-15   NA

i我会使用
数据来完成。表

library(data.table)
setDT(df)
df[, min_date := min(date), by = c("item", "city")]
df[, max_date := max(date), by = c("item", "city")]
df[, dayDifference := difftime(max_date, min_date, units = "days")]
df[, c("min_date", "max_date") := NULL]
它将为您提供所需的输出:

id  item    city       date             dayDifference
1: 4848 Pizza  Berlin 2017-02-02        2 days
2: 4887 Pizza  Berlin 2017-02-04        2 days
3: 4899 Pizza Hamburg 0207-02-01        0 days
4: 4811 Pasta Hamburg 2017-10-10       10 days
5: 4834 Pasta Hamburg 2017-10-20       10 days
6: 4892 Pasta  Berlin 2017-12-15        0 days

您也可以使用
df[,dayDifference:=max\u date-min\u date]
而不是
df[,dayDifference:=difftime(max\u date,min\u date,units=“days”)
我会使用
数据。表
:

library(data.table)
setDT(df)
df[, min_date := min(date), by = c("item", "city")]
df[, max_date := max(date), by = c("item", "city")]
df[, dayDifference := difftime(max_date, min_date, units = "days")]
df[, c("min_date", "max_date") := NULL]
它将为您提供所需的输出:

id  item    city       date             dayDifference
1: 4848 Pizza  Berlin 2017-02-02        2 days
2: 4887 Pizza  Berlin 2017-02-04        2 days
3: 4899 Pizza Hamburg 0207-02-01        0 days
4: 4811 Pasta Hamburg 2017-10-10       10 days
5: 4834 Pasta Hamburg 2017-10-20       10 days
6: 4892 Pasta  Berlin 2017-12-15        0 days

您也可以使用
df[,dayDifference:=max\u date-min\u date]
而不是
df[,dayDifference:=difftime(max\u date,min\u date,units=“days”)]
Reduce
是一个很棒的功能

library(dplyr)
df %>% 
  group_by(item, city) %>% 
  mutate(dayDifference=abs(Reduce(`-`, as.numeric(range(date)))))

# A tibble: 6 x 5
# Groups:   item, city [4]
     id   item    city       date dayDifference
  <int> <fctr>  <fctr>     <date>         <dbl>
1  4848  Pizza  Berlin 2017-02-02             2
2  4887  Pizza  Berlin 2017-02-04             2
3  4899  Pizza Hamburg 0207-02-01             0
4  4811  Pasta Hamburg 2017-10-10            10
5  4834  Pasta Hamburg 2017-10-20            10
6  4892  Pasta  Berlin 2017-12-15             0
库(dplyr)
df%>%
按(项目、城市)分组%>%
mutate(dayDifference=abs(Reduce(`-`,as.numeric(range(date)'))))
#一个tibble:6x5
#分组:项目,城市[4]
id项目城市日期日差
14848披萨柏林2017-02-02 2
24887披萨柏林2017-02-04 2
34899汉堡比萨饼0207-02-01 0
4 4811汉堡意大利面2017-10-10
54834汉堡面食2017-10-20 10
64892柏林意大利面2017-12-15 0

Reduce
是一个很棒的功能

library(dplyr)
df %>% 
  group_by(item, city) %>% 
  mutate(dayDifference=abs(Reduce(`-`, as.numeric(range(date)))))

# A tibble: 6 x 5
# Groups:   item, city [4]
     id   item    city       date dayDifference
  <int> <fctr>  <fctr>     <date>         <dbl>
1  4848  Pizza  Berlin 2017-02-02             2
2  4887  Pizza  Berlin 2017-02-04             2
3  4899  Pizza Hamburg 0207-02-01             0
4  4811  Pasta Hamburg 2017-10-10            10
5  4834  Pasta Hamburg 2017-10-20            10
6  4892  Pasta  Berlin 2017-12-15             0
库(dplyr)
df%>%
按(项目、城市)分组%>%
mutate(dayDifference=abs(Reduce(`-`,as.numeric(range(date)'))))
#一个tibble:6x5
#分组:项目,城市[4]
id项目城市日期日差
14848披萨柏林2017-02-02 2
24887披萨柏林2017-02-04 2
34899汉堡比萨饼0207-02-01 0
44811汉堡面食2017-10-10 10
54834汉堡面食2017-10-20 10
64892柏林意大利面2017-12-15 0

第1行和第2行[Pair Piza Berlin]不对应于第2行,因为2月2日和2月4日之间相差2天?而对于[Pair Pota Hambourg]来说,相差应该是10天?@suchait I的说法,你完全正确。它已更新。第1行和第2行[Pair Piza Berlin]因为2月2日和2月4日之间有2天的时间差,所以与2不对应?对于[Pair Potas Hambourg]差异应该是10?@suchait I Appologys,你完全正确。它已更新。您好,谢谢,我认为这是一个很好的解决方案。我发现减少的函数方法很有趣。不过,我很惊讶我们需要使用group by。通常在SQL中,group by将创建一个agregation。如果我只运行到group-根据你给出的代码,我得到了与表df相同的观察值…这让我有点困惑。dplyr中的group_by函数与SQL有点不同吗?group_by没有聚合,它只是设置了后续函数的范围。我想我找到了一个很好的教程:file:///C:/Users/i342097/Downloads/data-transformation.pdf@dmi3k不太感谢这些有用的信息。对于SQL用户来说,这确实有点让人困惑:)但现在我得到了它。谢谢,我认为这是一个很好的解决方案。我觉得减少的函数方法很有趣。但我很惊讶,我们需要使用group by。通常在SQL中,group by将创建一个agregation。如果我un直到group by您给出的代码,我才得到与表df相同的观察值…这让我有点困惑。dplyr中的group_by函数与SQL有点不同吗?group_by没有聚合,它只是设置了后续函数的范围。我想我找到了一个很好的教程:file:///C:/Users/i342097/Downloads/data-transfo@dmi3kno非常感谢提供帮助的信息。对于SQL用户来说,这个名称确实有点让人困惑:)但现在我知道了