R:使用 ggplot2 以自定义分箱(bin)叠加绘制密度曲线和直方图

R:使用 ggplot2 以自定义分箱(bin)叠加绘制密度曲线和直方图。标签:r, ggplot2, histogram, kernel-density, R, Ggplot2, Histogram, Kernel Density。摘要:我有一些数据——若干样本中的基因表达量——想以一种有意义的方式绘制直方图,并在其上叠加一条密度曲线,效果类似于这幅图,但要使用 cut() 来设置分箱。我的数据如下所示: df <- structure(list(gene_id = c("0610005C13Rik", "0610007C21Rik", "0610007L01Rik", "0610007N19Rik", "0610007P08Rik", "0610007P14Rik", "0610007P22Rik", "061…

我有一些数据——若干样本中的基因表达量——想以一种有意义的方式绘制直方图,并在其上叠加一条密度曲线,效果类似于这幅图:

但要使用 cut() 来设置分箱(bin)。我的数据如下所示:

# Example expression data: 20 rows x 7 columns, written as a structure() literal
# (the layout looks like dput() output -- TODO confirm how it was generated).
#   gene_id       character identifiers, one per row
#   sample_name   factor with 8 levels; rows 1-10 carry code 8 ("GFP"),
#                 rows 11-20 carry code 7 ("SRSF7")
#   fpkm, conf_hi, conf_lo, stdev
#                 numeric columns
#   quant_status  character, "OK" in every row
# Row names are 1-10 and 185871-185880; presumably these are row indices of a
# larger table this sample was taken from -- verify against the full data.
df <- structure(list(gene_id = c("0610005C13Rik", "0610007C21Rik", 
"0610007L01Rik", "0610007N19Rik", "0610007P08Rik", "0610007P14Rik", 
"0610007P22Rik", "0610008F07Rik", "0610009B14Rik", "0610009B22Rik", 
"Zwilch", "Zwint", "Zxda", "Zxdb", "Zxdc", "Zyg11a", "Zyg11b", 
"Zyx", "Zzef1", "Zzz3"), sample_name = structure(c(8L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L), .Label = c("SRSF1", "SRSF2", "SRSF3", "SRSF4", "SRSF5", 
"SRSF6", "SRSF7", "GFP"), class = "factor"), fpkm = c(0, 21.7863, 
27.9083, 3.83815, 8.65554, 40.7533, 20.2477, 0, 0.670717, 41.5058, 
33.6888, 17.7709, 0.756213, 1.38326, 4.07163, 5.53123, 8.50196, 
40.8762, 2.79376, 14.6917), conf_hi = c(0, 28.6113, 33.396, 5.43846, 
10.5892, 50.3002, 25.7052, 0, 1.27673, 52.502, 40.9057, 21.8586, 
1.0982, 1.95332, 5.68713, 7.53352, 10.3355, 49.5294, 3.49622, 
17.7908), conf_lo = c(0, 14.9613, 22.4206, 2.23784, 6.72186, 
31.2064, 14.7903, 0, 0.0647031, 30.5095, 26.4719, 13.6831, 0.414229, 
0.813193, 2.45612, 3.52894, 6.66839, 32.2231, 2.0913, 11.5925
), quant_status = c("OK", "OK", "OK", "OK", "OK", "OK", "OK", 
"OK", "OK", "OK", "OK", "OK", "OK", "OK", "OK", "OK", "OK", "OK", 
"OK", "OK"), stdev = c(0, 3.4125, 2.74385, 0.800155, 0.96683, 
4.77345, 2.72875, 0, 0.3030065, 5.4981, 3.60845, 2.04385, 0.1709935, 
0.28503, 0.80775, 1.001145, 0.91677, 4.3266, 0.35123, 1.54955
)), .Names = c("gene_id", "sample_name", "fpkm", "conf_hi", "conf_lo", 
"quant_status", "stdev"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 185871L, 185872L, 185873L, 185874L, 185875L, 
185876L, 185877L, 185878L, 185879L, 185880L), class = "data.frame")

df

评论:
- 你是否也在问(你大概也应该问)如何使用简单高斯分布以外的分布?
- 运行你的代码时,我得到以下错误:rename(x, .base_to_ggplot, warn_missing = FALSE) 出错:找不到函数 "revalue"。
- 这将非常困难,因为密度层的 x 轴与条形层的 x 轴不同。它们不仅范围不同,而且条形层基本上是在对数尺度上。
- 说得很好,@DWin。以对数尺度绘制时,分布接近高斯分布。当然这里的数据并非如此,但我完全忘了这一点。这里更像是一个双峰分布。
- 目前是这样的,@Ista。
- 另一种方法是用我的 cut 函数在 ggplot2 内部定义 binwidth。这可行吗?
# Example expression table: 20 rows x 7 columns. The first ten rows belong to
# the "GFP" sample and the last ten to the "SRSF7" sample; sample_name keeps
# all eight factor levels even though only two of them occur.
df <- data.frame(
  gene_id = c(
    "0610005C13Rik", "0610007C21Rik", "0610007L01Rik", "0610007N19Rik",
    "0610007P08Rik", "0610007P14Rik", "0610007P22Rik", "0610008F07Rik",
    "0610009B14Rik", "0610009B22Rik",
    "Zwilch", "Zwint", "Zxda", "Zxdb", "Zxdc",
    "Zyg11a", "Zyg11b", "Zyx", "Zzef1", "Zzz3"
  ),
  sample_name = factor(
    rep(c("GFP", "SRSF7"), each = 10L),
    levels = c(
      "SRSF1", "SRSF2", "SRSF3", "SRSF4", "SRSF5", "SRSF6", "SRSF7", "GFP"
    )
  ),
  fpkm = c(
    0, 21.7863, 27.9083, 3.83815, 8.65554,
    40.7533, 20.2477, 0, 0.670717, 41.5058,
    33.6888, 17.7709, 0.756213, 1.38326, 4.07163,
    5.53123, 8.50196, 40.8762, 2.79376, 14.6917
  ),
  conf_hi = c(
    0, 28.6113, 33.396, 5.43846, 10.5892,
    50.3002, 25.7052, 0, 1.27673, 52.502,
    40.9057, 21.8586, 1.0982, 1.95332, 5.68713,
    7.53352, 10.3355, 49.5294, 3.49622, 17.7908
  ),
  conf_lo = c(
    0, 14.9613, 22.4206, 2.23784, 6.72186,
    31.2064, 14.7903, 0, 0.0647031, 30.5095,
    26.4719, 13.6831, 0.414229, 0.813193, 2.45612,
    3.52894, 6.66839, 32.2231, 2.0913, 11.5925
  ),
  quant_status = rep("OK", 20L),
  stdev = c(
    0, 3.4125, 2.74385, 0.800155, 0.96683,
    4.77345, 2.72875, 0, 0.3030065, 5.4981,
    3.60845, 2.04385, 0.1709935, 0.28503, 0.80775,
    1.001145, 0.91677, 4.3266, 0.35123, 1.54955
  ),
  # Non-contiguous integer row names are kept as they appear in the sample.
  row.names = c(1:10, 185871:185880),
  # Keep gene_id and quant_status as character vectors on every R version.
  stringsAsFactors = FALSE
)
# Copy the expression table and add expression_bin: each fpkm value is
# assigned to one of eight labelled intervals. cut() uses its default
# right-closed intervals, so e.g. a value of exactly 0.01 falls in "<0.01".
df.bin <- df
df.bin$expression_bin <- cut(
  df.bin$fpkm,
  breaks = c(-Inf, 0.01, 0.1, 1, 10, 100, 1000, 10000, Inf),
  labels = c(
    "<0.01", "0.01-0.1", "0.1-1", "1-10",
    "10-100", "100-1000", "1000-10.000", ">10.000"
  )
)
# Bar chart of the number of genes per expression category, drawn as one
# panel per sample. The plot object is stored in p.fpkm.bin and then printed.
library(ggplot2)

p.fpkm.bin <- ggplot(df.bin, aes(x = expression_bin)) +
  # expression_bin is a factor, so rows have to be counted per level.
  # geom_bar() does exactly that (stat = "count"); stat = "bin" only accepts
  # a continuous x and is rejected by current ggplot2 for a discrete one.
  geom_bar(aes(fill = expression_bin)) +
  # Disabled density overlay, kept for reference:
  #   geom_density(aes(x = fpkm), alpha = .2, fill = "#FF6666")
  # It cannot simply be added here: fpkm is continuous, whereas the x axis
  # of this plot is the discrete expression_bin factor.
  facet_wrap(~ sample_name) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
  xlab("Expression category (fpkm)") +
  ylab("Number of genes") +
  scale_fill_brewer()
p.fpkm.bin