如何基于R中的另一个现有表推导百分位数

如何基于R中的另一个现有表推导百分位数,r,R,我有以下收入、产品的百分位数分布: Percentile Revenue Prod 0 344 1 0.1 10000 1 0.2 15413 1 0.3 19918 1 0.4 28729 2 0.5 41136 2 0.6 60000 3 0.7 90429 5 0.8 125684 7 0.9

我有以下收入、产品的百分位数分布:

Percentile  Revenue Prod
0           344     1
0.1         10000   1
0.2         15413   1
0.3         19918   1
0.4         28729   2
0.5         41136   2
0.6         60000   3
0.7         90429   5
0.8         125684  7
0.9         202231  10
1           3515000 80
对于以下记录,需要根据上述分布得出两个指标:rev_percentile 和 prod_percentile:

Code    Revenue prod freq
A6696   57657   3       3
A6828   184552  12      4
A843    101632  2       2
A141    58551   2       2
B579    166668  7       6
B625    98641   3       3
M257    92664   5       4
P50     12173   2       2
S339    81494   3       3
S06283  100290  4       2
例如,对于代码 A6696,rev_percentile 的值为 0.6,prod_percentile 的值也为 0.6。同样,对于第二个代码,两个值都是 0.9。


有人能帮我在R中实现吗?

我们可以在这里使用 cut 或 findInterval 将数据分组:

# Look up, for every record in `df`, the percentile bucket of its Revenue and
# of its prod value in the reference table `perct_tab`.
#
# Revenue: the breaks are c(0, perct_tab$Revenue), so interval i is
# (break[i], break[i + 1]] and maps to perct_tab$Percentile[i].
#   * left.open = TRUE makes the intervals right-closed, so a Revenue that is
#     exactly equal to a table value gets that value's own percentile (the
#     same binning that cut() uses by default).
#   * rightmost.closed = TRUE (with left.open this closes the leftmost
#     interval) keeps a Revenue of exactly 0 in the first bucket.
#   Without these arguments a Revenue equal to the table maximum produced
#   index length(Percentile) + 1 and therefore NA.
df$rev_percentile <- perct_tab$Percentile[
  findInterval(
    df$Revenue,
    c(0, perct_tab$Revenue),
    left.open = TRUE,
    rightmost.closed = TRUE
  )
]

# prod: pick the last table row whose Prod is <= the record's prod.
# NOTE(review): a prod below min(perct_tab$Prod) yields index 0, which is
# dropped by `[` and makes the assignment fail -- confirm such values cannot
# occur in the input.
df$prod_percentile <- perct_tab$Percentile[
  findInterval(df$prod, perct_tab$Prod)
]
df

#     Code Revenue prod freq rev_percentile prod_percentile
#1   A6696   57657    3    3            0.6             0.6
#2   A6828  184552   12    4            0.9             0.9
#3    A843  101632    2    2            0.8             0.5
#4    A141   58551    2    2            0.6             0.5
#5    B579  166668    7    6            0.9             0.8
#6    B625   98641    3    3            0.8             0.6
#7    M257   92664    5    4            0.8             0.7
#8     P50   12173    2    2            0.2             0.5
#9    S339   81494    3    3            0.7             0.6
#10 S06283  100290    4    2            0.8             0.6

使用 cut 计算 df$rev_percentile 的替代解决方案:

# Alternative: bin Revenue with cut(). The breaks are 0 followed by the
# Revenue column of the percentile table, and each bin is labelled with the
# matching value from 0, 0.1, ..., 1. The new column is a factor.
df[["Revenue_percentiles"]] <- cut(
  x = df[["Revenue"]],
  breaks = c(0, perct_tab[["Revenue"]]),
  labels = seq(from = 0, to = 1, by = 0.1)
)

df$Revenue_percentiles

在这里,我提供了两种方法:

  • 方法1:与上面的解决方案类似,使用
    findInterval()
  • 方法2:对每个元素使用
    which.max()
数据

# Example data ----

# df1: reference percentile table (Percentile 0, 0.1, ..., 1 with the Revenue
# and Prod value recorded for each percentile).
df1 <- structure(list(Percentile = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 
0.7, 0.8, 0.9, 1), Revenue = c(344L, 10000L, 15413L, 19918L, 
28729L, 41136L, 60000L, 90429L, 125684L, 202231L, 3515000L), 
    Prod = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 5L, 7L, 10L, 80L)), class = "data.frame", row.names = c(NA, 
-11L))

# df2: records to score. The prod_percentile / rev_percentile columns already
# hold the expected results; both methods below recompute and overwrite them.
df2 <- structure(list(Code = structure(c(2L, 3L, 4L, 1L, 5L, 6L, 7L, 
8L, 10L, 9L), .Label = c("A141", "A6696", "A6828", "A843", "B579", 
"B625", "M257", "P50", "S06283", "S339"), class = "factor"), 
    Revenue = c(57657L, 184552L, 101632L, 58551L, 166668L, 98641L, 
    92664L, 12173L, 81494L, 100290L), prod = c(3L, 12L, 2L, 2L, 
    7L, 3L, 5L, 2L, 3L, 4L), freq = c(3L, 4L, 2L, 2L, 6L, 3L, 
    4L, 2L, 3L, 2L), prod_percentile = c(0.6, 0.9, 0.5, 0.5, 
    0.8, 0.6, 0.7, 0.5, 0.6, 0.6), rev_percentile = c(0.6, 0.9, 
    0.8, 0.6, 0.9, 0.8, 0.8, 0.2, 0.7, 0.8)), row.names = c(NA, 
-10L), class = "data.frame")
df1

# Method 1: findInterval() ----
# Revenue: findInterval() counts the table values <= Revenue; adding 1 moves
# to the next row of the table.
# prod: take the last row whose Prod is <= prod.
# A braced block is used so that within() picks the new columns up from
# ordinary assignments.
df2 <- within(df2, {
  rev_percentile <- df1$Percentile[findInterval(Revenue, df1$Revenue) + 1]
  prod_percentile <- df1$Percentile[findInterval(prod, df1$Prod)]
})

# Method 2: which.max() ----
# which.max(x < v) is the index of the first element of v that is strictly
# greater than x. For prod, one is subtracted to step back to the previous
# row. vapply() guarantees an integer index vector.
# NOTE(review): when no element of v exceeds x, which.max() returns 1, so the
# lookup silently falls on the first row (index 0 for prod) -- confirm inputs
# stay below the table maximum.
df2 <- within(df2, {
  rev_percentile <- df1$Percentile[
    vapply(Revenue, function(x) which.max(x < df1$Revenue), integer(1))
  ]
  prod_percentile <- df1$Percentile[
    vapply(prod, function(x) which.max(x < df1$Prod) - 1L, integer(1))
  ]
})
# Reference percentile table: Percentile runs from 0 to 1 in steps of 0.1,
# with the Revenue and Prod value recorded for each percentile.
df1 <- data.frame(
  Percentile = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  Revenue = c(
    344L, 10000L, 15413L, 19918L, 28729L, 41136L,
    60000L, 90429L, 125684L, 202231L, 3515000L
  ),
  Prod = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 5L, 7L, 10L, 80L)
)

# Records to score. Code is a factor with alphabetically ordered levels; the
# two *_percentile columns hold the expected lookup results.
df2 <- data.frame(
  Code = factor(
    c(
      "A6696", "A6828", "A843", "A141", "B579",
      "B625", "M257", "P50", "S339", "S06283"
    ),
    levels = c(
      "A141", "A6696", "A6828", "A843", "B579",
      "B625", "M257", "P50", "S06283", "S339"
    )
  ),
  Revenue = c(
    57657L, 184552L, 101632L, 58551L, 166668L,
    98641L, 92664L, 12173L, 81494L, 100290L
  ),
  prod = c(3L, 12L, 2L, 2L, 7L, 3L, 5L, 2L, 3L, 4L),
  freq = c(3L, 4L, 2L, 2L, 6L, 3L, 4L, 2L, 3L, 2L),
  prod_percentile = c(0.6, 0.9, 0.5, 0.5, 0.8, 0.6, 0.7, 0.5, 0.6, 0.6),
  rev_percentile = c(0.6, 0.9, 0.8, 0.6, 0.9, 0.8, 0.8, 0.2, 0.7, 0.8)
)