如何在R中创建数据透视表
我试着学习R已经有一段时间了,但我的知识还没有达到一个像样的水平。请帮我做这个透视表。我有一个包含5000行的csv数据文件,其中包含以下数据字段:名称、频道(内部或外部)、调查发送日期和调查接收日期。基本数据如下所示。我希望把它转换成下面的格式。我试过这个。如何在R中创建数据透视表,r,reshape,R,Reshape,我试着学习R已经有一段时间了,但我的知识还没有达到一个像样的水平。请帮我做这个透视表。我有一个包含5000行的csv数据文件,其中包含以下数据字段:名称、频道(内部或外部)、调查发送日期和调查接收日期。基本数据如下所示。我希望把它转换成下面的格式。我试过这个: library("reshape2") dcast(w, Recruiter~channel) 这工作正常,但我不知道如何添加“已发送调查”、“已接收调查”和“已发送调查-已接收调查”的计数。请尝试以下简单代码: outdf = d
# Load reshape2, which provides dcast() for long-to-wide reshaping.
library("reshape2")
# One row per Recruiter and one column per channel. No aggregation function
# is passed, so dcast() falls back to its documented default when several
# input rows land in the same cell.
# (The stray trailing double quote of the original line was removed: it
# started an unterminated string and made the snippet unparseable.)
dcast(w, Recruiter ~ channel)
这工作正常,但我不知道如何添加“已发送调查”、“已接收调查”和“已发送调查-已接收调查”的计数。请尝试以下简单代码:
# Pivot ddf to one row per name with one count column per channel.
# value.var is named explicitly so dcast() does not have to guess which
# column to aggregate; length() only counts rows, so any column works.
outdf <- dcast(ddf, name ~ channel, fun.aggregate = length,
               value.var = "survey_sent")
outdf$total_channel <- outdf$external + outdf$internal

# Tabulate against the names present in outdf so that every count lines up
# with its row, and a name with no received survey gets 0 instead of being
# dropped (which would make the new column shorter than outdf).
name_levels <- as.character(outdf$name)
is_received <- !is.na(ddf$survey_rcd) & ddf$survey_rcd != ""
outdf$survey_sent <- as.vector(table(factor(ddf$name, levels = name_levels)))
outdf$survey_rcd <- as.vector(
  table(factor(ddf$name[is_received], levels = name_levels))
)
outdf$survey_pending <- outdf$survey_sent - outdf$survey_rcd
outdf
# name external internal total_channel survey_sent survey_rcd survey_pending
#1 a 0 4 4 4 2 2
#2 b 4 1 5 5 2 3
#3 c 2 2 4 4 3 1
样本数据:
# Sample data: 13 surveys for three names (a, b, c). Dates are kept as
# dd/mm/yy strings; an empty survey_rcd string means "not received yet".
ddf <- data.frame(
  name = rep(c("a", "b", "c"), times = c(4L, 5L, 4L)),
  channel = rep(
    c("internal", "external", "internal", "external"),
    times = c(4L, 4L, 3L, 2L)
  ),
  # Consecutive send dates 15/02/13 .. 27/02/13.
  survey_sent = sprintf("%d/02/13", 15:27),
  survey_rcd = c(
    "26/03/14", "", "", "29/03/14",
    "30/03/14", "", "", "", "03/04/14",
    "04/04/14", "", "06/04/14", "07/04/14"
  ),
  stringsAsFactors = FALSE
)
ddf
name channel survey_sent survey_rcd
1 a internal 15/02/13 26/03/14
2 a internal 16/02/13
3 a internal 17/02/13
4 a internal 18/02/13 29/03/14
5 b external 19/02/13 30/03/14
6 b external 20/02/13
7 b external 21/02/13
8 b external 22/02/13
9 b internal 23/02/13 03/04/14
10 c internal 24/02/13 04/04/14
11 c internal 25/02/13
12 c external 26/02/13 06/04/14
13 c external 27/02/13 07/04/14
尝试以下简单代码:
# Pivot ddf to one row per name with one count column per channel.
# value.var is named explicitly so dcast() does not have to guess which
# column to aggregate; length() only counts rows, so any column works.
outdf <- dcast(ddf, name ~ channel, fun.aggregate = length,
               value.var = "survey_sent")
outdf$total_channel <- outdf$external + outdf$internal

# Tabulate against the names present in outdf so that every count lines up
# with its row, and a name with no received survey gets 0 instead of being
# dropped (which would make the new column shorter than outdf).
name_levels <- as.character(outdf$name)
is_received <- !is.na(ddf$survey_rcd) & ddf$survey_rcd != ""
outdf$survey_sent <- as.vector(table(factor(ddf$name, levels = name_levels)))
outdf$survey_rcd <- as.vector(
  table(factor(ddf$name[is_received], levels = name_levels))
)
outdf$survey_pending <- outdf$survey_sent - outdf$survey_rcd
outdf
# name external internal total_channel survey_sent survey_rcd survey_pending
#1 a 0 4 4 4 2 2
#2 b 4 1 5 5 2 3
#3 c 2 2 4 4 3 1
样本数据:
# Sample data: 13 surveys for three names (a, b, c). Dates are kept as
# dd/mm/yy strings; an empty survey_rcd string means "not received yet".
ddf <- data.frame(
  name = rep(c("a", "b", "c"), times = c(4L, 5L, 4L)),
  channel = rep(
    c("internal", "external", "internal", "external"),
    times = c(4L, 4L, 3L, 2L)
  ),
  # Consecutive send dates 15/02/13 .. 27/02/13.
  survey_sent = sprintf("%d/02/13", 15:27),
  survey_rcd = c(
    "26/03/14", "", "", "29/03/14",
    "30/03/14", "", "", "", "03/04/14",
    "04/04/14", "", "06/04/14", "07/04/14"
  ),
  stringsAsFactors = FALSE
)
ddf
name channel survey_sent survey_rcd
1 a internal 15/02/13 26/03/14
2 a internal 16/02/13
3 a internal 17/02/13
4 a internal 18/02/13 29/03/14
5 b external 19/02/13 30/03/14
6 b external 20/02/13
7 b external 21/02/13
8 b external 22/02/13
9 b internal 23/02/13 03/04/14
10 c internal 24/02/13 04/04/14
11 c internal 25/02/13
12 c external 26/02/13 06/04/14
13 c external 27/02/13 07/04/14
dplyr
解决方案
> head(data)
Name Channel Sent Recd
1 A Internal 2014-07-10 2014-07-12
2 A Internal 2014-07-16 <NA>
3 A External 2014-08-04 2014-08-10
4 A Internal 2014-08-16 2014-08-18
5 A Internal 2014-07-29 <NA>
6 A External 2014-08-05 2014-08-14
给出:
Name External Internal Total Sent Recd Pending
1 A 6 4 10 10 8 2
2 B 2 7 9 9 6 3
3 C 4 5 9 9 4 5
注意,我使用真正的 Date
对象表示日期,并使用 NA
表示缺失的数据
由此产生的数据:
# Sample data for the dplyr answer: 28 surveys for three names (A, B, C).
# Sent and Recd are Date objects given as day counts since 1970-01-01;
# NA in Recd marks a survey with no received date.
data <- data.frame(
  Name = factor(
    rep(c("A", "B", "C"), times = c(10L, 9L, 9L)),
    levels = c("A", "B", "C")
  ),
  Channel = c(
    # A
    "Internal", "Internal", "External", "Internal", "Internal",
    "External", "External", "External", "External", "External",
    # B
    "Internal", "External", "Internal", "Internal", "Internal",
    "External", "Internal", "Internal", "Internal",
    # C
    "Internal", "Internal", "External", "Internal", "External",
    "External", "External", "Internal", "Internal"
  ),
  Sent = as.Date(
    c(
      16261, 16267, 16286, 16298, 16280, 16287, 16294, 16292, 16291, 16282,
      16304, 16297, 16262, 16274, 16264, 16270, 16252, 16276, 16279,
      16275, 16277, 16293, 16253, 16272, 16288, 16283, 16281, 16296
    ),
    origin = "1970-01-01"
  ),
  Recd = as.Date(
    c(
      16263.5024573486, NA, 16292.4899729695, 16300.3446546271, NA,
      16296.9054549634, 16301.318120582, 16301.4672047794, 16295.238142278,
      16286.8117301762,
      NA, 16306.6499495078, NA, 16282.0412430186, 16272.4275530744,
      16273.9005153924, 16255.7532094959, NA, 16284.9287535194,
      NA, 16279.182732366, 16302.4864703286, NA, NA, 16296.6838856321,
      NA, 16290.3657759354, NA
    ),
    origin = "1970-01-01"
  ),
  stringsAsFactors = FALSE
)
dplyr
解决方案
> head(data)
Name Channel Sent Recd
1 A Internal 2014-07-10 2014-07-12
2 A Internal 2014-07-16 <NA>
3 A External 2014-08-04 2014-08-10
4 A Internal 2014-08-16 2014-08-18
5 A Internal 2014-07-29 <NA>
6 A External 2014-08-05 2014-08-14
给出:
Name External Internal Total Sent Recd Pending
1 A 6 4 10 10 8 2
2 B 2 7 9 9 6 3
3 C 4 5 9 9 4 5
注意,我使用真正的 Date
对象表示日期,并使用 NA
表示缺失的数据
由此产生的数据:
# Sample data for the dplyr answer: 28 surveys for three names (A, B, C).
# Sent and Recd are Date objects given as day counts since 1970-01-01;
# NA in Recd marks a survey with no received date.
data <- data.frame(
  Name = factor(
    rep(c("A", "B", "C"), times = c(10L, 9L, 9L)),
    levels = c("A", "B", "C")
  ),
  Channel = c(
    # A
    "Internal", "Internal", "External", "Internal", "Internal",
    "External", "External", "External", "External", "External",
    # B
    "Internal", "External", "Internal", "Internal", "Internal",
    "External", "Internal", "Internal", "Internal",
    # C
    "Internal", "Internal", "External", "Internal", "External",
    "External", "External", "Internal", "Internal"
  ),
  Sent = as.Date(
    c(
      16261, 16267, 16286, 16298, 16280, 16287, 16294, 16292, 16291, 16282,
      16304, 16297, 16262, 16274, 16264, 16270, 16252, 16276, 16279,
      16275, 16277, 16293, 16253, 16272, 16288, 16283, 16281, 16296
    ),
    origin = "1970-01-01"
  ),
  Recd = as.Date(
    c(
      16263.5024573486, NA, 16292.4899729695, 16300.3446546271, NA,
      16296.9054549634, 16301.318120582, 16301.4672047794, 16295.238142278,
      16286.8117301762,
      NA, 16306.6499495078, NA, 16282.0412430186, 16272.4275530744,
      16273.9005153924, 16255.7532094959, NA, 16284.9287535194,
      NA, 16279.182732366, 16302.4864703286, NA, NA, 16296.6838856321,
      NA, 16290.3657759354, NA
    ),
    origin = "1970-01-01"
  ),
  stringsAsFactors = FALSE
)
或者使用
data.table
(使用@Spacedman的数据)
library(data.table)
DT1
或者使用 data.table
(使用@Spacedman的数据)
library(data.table)
DT1
您应该在此处发布示例数据,例如dput(head(data,20))的输出,以便其他人更好地帮助您。这取决于:您的问题是“如何使用reshape
将数据合并到两列中”,还是“如何编写逻辑表达式,如if(已发送调查 且 已接收调查)
”?对于“已发送调查”和“已接收调查”,您可以使用aggregate,例如aggregate(Survey.sent~Name,w,length)
。您应该在此处发布示例数据,例如dput(head(data,20))的输出,以便其他人更好地帮助您。这取决于:您的问题是“如何使用reshape
将数据合并到两列中”,还是“如何编写类似if(已发送调查 且 已接收调查)
的逻辑表达式”?对于“已发送调查”和“已接收调查”,您可以使用aggregate,例如aggregate(Survey.Sent~Name,w,length)
。