如何基于R中的另一个现有表推导百分位数
我有以下收入(Revenue)和产品(Prod)的百分位数分布:
Percentile Revenue Prod
0 344 1
0.1 10000 1
0.2 15413 1
0.3 19918 1
0.4 28729 2
0.5 41136 2
0.6 60000 3
0.7 90429 5
0.8 125684 7
0.9 202231 10
1 3515000 80
对于以下记录,需要根据上述分布得出两个指标:rev_percentile 和 prod_percentile。
Code Revenue prod freq
A6696 57657 3 3
A6828 184552 12 4
A843 101632 2 2
A141 58551 2 2
B579 166668 7 6
B625 98641 3 3
M257 92664 5 4
P50 12173 2 2
S339 81494 3 3
S06283 100290 4 2
例如,对于代码A6696,rev_percentile 的值为0.6,prod_percentile 的值也为0.6。
同样,对于第二个代码(A6828),两个值都是0.9。
有人能帮我在R中实现吗?
我们可以在这里使用 cut 或 findInterval 将数据分组:
# Label every record in `df` with a percentile looked up in `perct_tab`.
#
# Revenue: prepending 0 to the breaks shifts each interval index up by one,
# so a value lying between two breaks is labelled with the percentile of the
# upper break (e.g. 57657 lies in [41136, 60000) and gets 0.6).
# `rightmost.closed = TRUE` keeps a revenue equal to the largest break inside
# the last interval, so it is labelled with the last row's percentile instead
# of indexing one past the end of `Percentile`, which produced NA.
df$rev_percentile <- perct_tab$Percentile[
  findInterval(df$Revenue, c(0, perct_tab$Revenue), rightmost.closed = TRUE)
]

# prod: no shift, so a value is labelled with the percentile of the last
# break that is less than or equal to it (e.g. 3 -> 0.6, 12 -> 0.9).
df$prod_percentile <- perct_tab$Percentile[
  findInterval(df$prod, perct_tab$Prod)
]
df
# Code Revenue prod freq rev_percentile prod_percentile
#1 A6696 57657 3 3 0.6 0.6
#2 A6828 184552 12 4 0.9 0.9
#3 A843 101632 2 2 0.8 0.5
#4 A141 58551 2 2 0.6 0.5
#5 B579 166668 7 6 0.9 0.8
#6 B625 98641 3 3 0.8 0.6
#7 M257 92664 5 4 0.8 0.7
#8 P50 12173 2 2 0.2 0.5
#9 S339 81494 3 3 0.7 0.6
#10 S06283 100290 4 2 0.8 0.6
使用 cut 计算 df$rev_percentile 的替代解决方案:
# Alternative using cut(): bin Revenue into the right-closed intervals
# (0, 344], (344, 10000], ... formed by prepending 0 to the table's Revenue
# breaks, and label each bin with the percentile of its upper break.
# Labels are taken from the table itself rather than a hard-coded
# seq(0, 1, .1), so they always agree with the table's rows.
# The result is a factor; a Revenue outside (0, largest break] becomes NA.
df$Revenue_percentiles <- cut(
  df$Revenue,
  breaks = c(0, perct_tab$Revenue),
  labels = perct_tab$Percentile
)
df$Revenue_percentiles
在这里,我提供了两种方法:
- 方法1:与上述解决方案类似,使用 findInterval()
数据:
# Percentile lookup table: one row per percentile step (0, 0.1, ..., 1) with
# the Revenue and Prod break recorded for that step.
df1 <- data.frame(
  Percentile = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  Revenue = c(
    344L, 10000L, 15413L, 19918L, 28729L, 41136L,
    60000L, 90429L, 125684L, 202231L, 3515000L
  ),
  Prod = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 5L, 7L, 10L, 80L)
)

# Records to be labelled. `Code` is a factor whose levels are the codes in
# alphabetical order. The two *_percentile columns already carry the values
# shown in the expected output.
df2 <- data.frame(
  Code = factor(
    c(
      "A6696", "A6828", "A843", "A141", "B579",
      "B625", "M257", "P50", "S339", "S06283"
    ),
    levels = c(
      "A141", "A6696", "A6828", "A843", "B579",
      "B625", "M257", "P50", "S06283", "S339"
    )
  ),
  Revenue = c(
    57657L, 184552L, 101632L, 58551L, 166668L,
    98641L, 92664L, 12173L, 81494L, 100290L
  ),
  prod = c(3L, 12L, 2L, 2L, 7L, 3L, 5L, 2L, 3L, 4L),
  freq = c(3L, 4L, 2L, 2L, 6L, 3L, 4L, 2L, 3L, 2L),
  prod_percentile = c(0.6, 0.9, 0.5, 0.5, 0.8, 0.6, 0.7, 0.5, 0.6, 0.6),
  rev_percentile = c(0.6, 0.9, 0.8, 0.6, 0.9, 0.8, 0.8, 0.2, 0.7, 0.8)
)
df1
# Approach 1: findInterval(), evaluated inside df2 via within().
#
# rev_percentile: findInterval() counts the breaks that are <= Revenue; adding
# 1 selects the next break, i.e. the upper bound of the interval the value
# falls in. `rightmost.closed = TRUE` keeps a Revenue equal to the largest
# break inside the last interval, so it maps to the last row of df1 instead of
# indexing one past the end of `Percentile`, which produced NA.
#
# prod_percentile: no shift, so a value gets the percentile of the last break
# that is <= prod.
df2 <- within(df2, {
  rev_percentile <- df1$Percentile[
    findInterval(Revenue, df1$Revenue, rightmost.closed = TRUE) + 1L
  ]
  prod_percentile <- df1$Percentile[findInterval(prod, df1$Prod)]
})
# Approach 2: for each value, search element-wise for the first break that is
# strictly greater than it.
#
# Changes from the original snippet:
#   * a stray `]` made the expression unparseable;
#   * which.max() returns 1 when no break is greater than the value, which
#     silently mapped such revenues to percentile 0 and turned such prod
#     values into index 0 (dropping the element and breaking the assignment).
#     match(TRUE, ...) returns NA when there is no match, so those cases are
#     now handled explicitly;
#   * vapply() replaces sapply() so the index vector is always integer.
df2 <- within(df2, {
  # Revenue is labelled with the percentile of the first break above it.
  rev_percentile <- df1$Percentile[vapply(
    Revenue,
    function(x) {
      hit <- match(TRUE, x < df1$Revenue)
      # No break above x: only a value equal to the largest break is still
      # inside the table, and it belongs to the last row. Anything larger
      # stays NA.
      if (is.na(hit) && isTRUE(x == df1$Revenue[nrow(df1)])) {
        nrow(df1)
      } else {
        hit
      }
    },
    integer(1)
  )]

  # prod is labelled with the percentile of the break just before the first
  # break above it. Prepending NA to the percentiles performs that one-step
  # shift and yields NA for values below the smallest break; the Inf sentinel
  # makes values at or above the largest break resolve to the last row.
  prod_percentile <- c(NA, df1$Percentile)[vapply(
    prod,
    function(x) match(TRUE, x < c(df1$Prod, Inf)),
    integer(1)
  )]
})
# Sample data; identical to the df1/df2 definitions earlier in the file.

# Percentile lookup table: one row per percentile step (0, 0.1, ..., 1) with
# the Revenue and Prod break recorded for that step.
df1 <- data.frame(
  Percentile = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  Revenue = c(
    344L, 10000L, 15413L, 19918L, 28729L, 41136L,
    60000L, 90429L, 125684L, 202231L, 3515000L
  ),
  Prod = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 5L, 7L, 10L, 80L)
)

# Records to be labelled. `Code` is a factor whose levels are the codes in
# alphabetical order.
df2 <- data.frame(
  Code = factor(
    c(
      "A6696", "A6828", "A843", "A141", "B579",
      "B625", "M257", "P50", "S339", "S06283"
    ),
    levels = c(
      "A141", "A6696", "A6828", "A843", "B579",
      "B625", "M257", "P50", "S06283", "S339"
    )
  ),
  Revenue = c(
    57657L, 184552L, 101632L, 58551L, 166668L,
    98641L, 92664L, 12173L, 81494L, 100290L
  ),
  prod = c(3L, 12L, 2L, 2L, 7L, 3L, 5L, 2L, 3L, 4L),
  freq = c(3L, 4L, 2L, 2L, 6L, 3L, 4L, 2L, 3L, 2L),
  prod_percentile = c(0.6, 0.9, 0.5, 0.5, 0.8, 0.6, 0.7, 0.5, 0.6, 0.6),
  rev_percentile = c(0.6, 0.9, 0.8, 0.6, 0.9, 0.8, 0.8, 0.2, 0.7, 0.8)
)