R 返回因子变量第一次和最后一次出现的日期
问题 我有一个数据框架,其中每一行都标记着公司之间的交换,公司在给定的日期给予和接受某些东西(他们可以给予不同的公司或自己)。由此,我想创建一个新的数据框,其中的列指示公司何时开始提供、何时停止提供、何时开始接收以及何时停止接收。下面是我开始介绍的示例数据框: 样本起始数据R 返回因子变量第一次和最后一次出现的日期,r,date,R,Date,问题 我有一个数据框架,其中每一行都标记着公司之间的交换,公司在给定的日期给予和接受某些东西(他们可以给予不同的公司或自己)。由此,我想创建一个新的数据框,其中的列指示公司何时开始提供、何时停止提供、何时开始接收以及何时停止接收。下面是我开始介绍的示例数据框: 样本起始数据 samp <- structure(list(giver = structure(c(1L, 2L, 6L, 3L, 1L, 3L, 4L, 1L, 6L, 1L, 5L), .Label = c("A", "B",
samp <- structure(list(giver = structure(c(1L, 2L, 6L, 3L, 1L, 3L, 4L, 1L, 6L, 1L, 5L), .Label = c("A", "B", "C", "X", "Y", "Z"), class = "factor"), receiver = structure(c(1L, 2L, 2L, 3L, 1L, 3L, 3L, 1L, 2L, 1L, 2L), .Label = c("A", "B", "C"), class = "factor"), date = structure(c(1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 9L), .Label = c("2000-01-01", "2000-01-02", "2000-01-03", "2000-01-04", "2000-01-05", "2000-01-06", "2000-01-07", "2000-01-08", "2000-01-09"), class = "factor")), .Names = c("giver", "receiver", "date"), class = "data.frame", row.names = c(NA, -11L))
samp$date <- as.Date(samp$date, "%Y-%m-%d") # Format date variable
samp
giver receiver date
A A 2000-01-01
B B 2000-01-01
Z B 2000-01-02
C C 2000-01-03
A A 2000-01-04
C C 2000-01-05
X C 2000-01-06
A A 2000-01-07
Z B 2000-01-08
A A 2000-01-09
Y B 2000-01-09
desire <- structure(list(company = structure(1:6, .Label = c("A", "B", "C", "X", "Y", "Z"), class = "factor"), start.giving = structure(c(1L, 1L, 3L, 4L, 5L, 2L), .Label = c("2000-01-01", "2000-01-02", "2000-01-03", "2000-01-05", "2000-01-09"), class = "factor"), stop.giving = structure(c(5L, 1L, 2L, 3L, 5L, 4L), .Label = c("2000-01-01", "2000-01-05", "2000-01-06", "2000-01-08", "2000-01-09"), class = "factor"), start.receiving = structure(c(1L, 1L, 2L, NA, NA, NA), .Label = c("2000-01-01", "2000-01-03"), class = "factor"), stop.receiving = structure(c(2L, 2L, 1L, NA, NA, NA), .Label = c("2000-01-06", "2000-01-09"), class = "factor")), .Names = c("company", "start.giving", "stop.giving", "start.receiving", "stop.receiving"), class = "data.frame", row.names = c(NA, -6L))
desire
company start.giving stop.giving start.receiving stop.receiving
A 2000-01-01 2000-01-09 2000-01-01 2000-01-09
B 2000-01-01 2000-01-01 2000-01-01 2000-01-09
C 2000-01-03 2000-01-05 2000-01-03 2000-01-06
X 2000-01-05 2000-01-06 <NA> <NA>
Y 2000-01-09 2000-01-09 <NA> <NA>
Z 2000-01-02 2000-01-08 <NA> <NA>
这可以通过
聚合
和合并
命令在基本R中完成:
# Import starting sample data
samp <- structure(list(giver = structure(c(1L, 2L, 6L, 3L, 1L, 3L, 4L, 1L, 6L, 1L, 5L), .Label = c("A", "B", "C", "X", "Y", "Z"), class = "factor"), receiver = structure(c(1L, 2L, 2L, 3L, 1L, 3L, 3L, 1L, 2L, 1L, 2L), .Label = c("A", "B", "C"), class = "factor"), date = structure(c(1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 9L), .Label = c("2000-01-01", "2000-01-02", "2000-01-03", "2000-01-04", "2000-01-05", "2000-01-06", "2000-01-07", "2000-01-08", "2000-01-09"), class = "factor")), .Names = c("giver", "receiver", "date"), class = "data.frame", row.names = c(NA, -11L))
samp$date <- as.Date(samp$date, "%Y-%m-%d") # Format date variable
# Find first and last occurrence by date
g1 <- aggregate(samp$date, list(samp$giver), min)
colnames(g1)[1] = "company"
colnames(g1)[2] = "start.giving"
g2 <- aggregate(samp$date, list(samp$giver), max)
colnames(g2)[1] = "company"
colnames(g2)[2] = "stop.giving"
s1 <- aggregate(samp$date, list(samp$receiver), min)
colnames(s1)[1] = "company"
colnames(s1)[2] = "start.receiving"
s2 <- aggregate(samp$date, list(samp$receiver), max)
colnames(s2)[1] = "company"
colnames(s2)[2] = "stop.receiving"
# Merge data frames by company name
a1 <- merge(g1, g2, by=c("company"))
b1 <- merge(s1, s2, by=c("company"))
c1 <- merge(a1, b1, by=c("company"), all.x = TRUE)
c1 # Display desired data frame
company start.giving stop.giving start.receiving stop.receiving
A 2000-01-01 2000-01-09 2000-01-01 2000-01-09
B 2000-01-01 2000-01-01 2000-01-01 2000-01-09
C 2000-01-03 2000-01-05 2000-01-03 2000-01-06
X 2000-01-06 2000-01-06 <NA> <NA>
Y 2000-01-09 2000-01-09 <NA> <NA>
Z 2000-01-02 2000-01-08 <NA> <NA>
#导入起始样本数据
samp这里有一个使用data.table
包的简化版本
library(data.table)
setDT(samp)
Res1 <- samp[, .(start = min(date), stop = max(date)), by = .(company = giver)]
Res2 <- samp[, .(start = min(date), stop = max(date)), by = .(company = receiver)]
merge(Res1, Res2, by = "company", all = TRUE, suffixes = c(".giving", ".receiving"))
# company start.giving stop.giving start.receiving stop.receiving
# 1: A 2000-01-01 2000-01-09 2000-01-01 2000-01-09
# 2: B 2000-01-01 2000-01-01 2000-01-01 2000-01-09
# 3: C 2000-01-03 2000-01-05 2000-01-03 2000-01-06
# 4: X 2000-01-06 2000-01-06 <NA> <NA>
# 5: Y 2000-01-09 2000-01-09 <NA> <NA>
# 6: Z 2000-01-02 2000-01-08 <NA> <NA>
库(data.table)
setDT(samp)
Res1,下面是另一个使用dcast新功能的版本:
require(data.table) ## v1.9.5+
foo <- function(x, col) {
ans <- dcast(x, paste(col, "~ ."), value.var="date", fun=list(min, max))
setnames(ans, c("company", "start", "stop"))
}
setDT(samp)
merge(foo(samp, "giver"), foo(samp, "receiver"), by = "company",
all=TRUE, suffixes=c(".giving", ".receiving"))
# company start.giving stop.giving start.receiving stop.receiving
# 1: A 2000-01-01 2000-01-09 2000-01-01 2000-01-09
# 2: B 2000-01-01 2000-01-01 2000-01-01 2000-01-09
# 3: C 2000-01-03 2000-01-05 2000-01-03 2000-01-06
# 4: X 2000-01-06 2000-01-06 <NA> <NA>
# 5: Y 2000-01-09 2000-01-09 <NA> <NA>
# 6: Z 2000-01-02 2000-01-08 <NA> <NA>
require(data.table)#v1.9.5+
foo该dplyr
版本:
library("dplyr")
giving <- samp %>% group_by(giver) %>%
summarise(start.giving=min(date),
stop.giving=max(date)) %>%
rename(company=giver)
receiving <- samp %>% group_by(receiver) %>%
summarise(start.receiving=min(date),
stop.receiving=max(date)) %>%
rename(company=receiver)
full_join(giving,receiving)
。。。虽然代码现在不那么透明,实际上也不更短。。。如果你打算经常做这类事情,那是值得的。在创建接收
对象时,我认为在(start.receiving=min(date)),
之后有一个额外的括号。它应该是(start.receiving=min(date),
,对吗?另外,由于某种原因,我在full\u join()
命令中遇到一个错误:“错误:找不到函数'full\u join'”尽管显然已加载了dplyr
。Thx以进行更正。在最新版本的dplyr
中,联接/合并函数的名称相当流畅。您可以尝试使用outer_join()
。
foo <- function(x,f) {
ss <- c("start","stop")
group_by_(x,.dots=f) %>%
summarise(start=min(date),
stop=max(date)) %>%
rename_(.dots=c(company=f,
setNames(ss,paste(ss,f,sep="."))))
}
full_join(foo(samp,"giver"),foo(samp,"receiver"))