R 为基准数据创建图表

R 为基准数据创建图表,r,charts,ggplot2,aggregate,R,Charts,Ggplot2,Aggregate,我正在使用我的前任编写的现有R代码。代码用于生成PDF报告,以显示软件测试运行的数据 我正在尝试创建的一组图表应该是绘制“基准”结果的百分比变化。这个基准应该是我们有数据支持的最早版本 以下是当前用于构建基准偏差图的代码部分。 library(ggplot2) dbhandle <- SQLConn_remote(DBName = "DATABASE", ServerName = "SERVER") Testdf<-sqlQuery(dbha

我正在使用我的前任编写的现有R代码。代码用于生成PDF报告,以显示软件测试运行的数据

我正在尝试创建的一组图表应该是绘制“基准”结果的百分比变化。这个基准应该是我们有数据支持的最早版本

以下是当前用于构建基准偏差图的代码部分。

library(ggplot2)

# Build the "deviation from benchmark" chart for the test-run report.
# SQLConn_remote() is a project helper that is not shown here; sqlQuery() is
# presumably RODBC's -- TODO confirm against the rest of the report script.
dbhandle <- SQLConn_remote(DBName = "DATABASE", ServerName = "SERVER")
Testdf <- sqlQuery(dbhandle, 'select * from TABLENAME 
                order by FileName, Number, Category', stringsAsFactors = FALSE)

# Version labels in run order (sorted by Number). The column is addressed by
# name so the result does not depend on the column order `select *` returns.
versions <- unique(Testdf$Version[order(Testdf$Number)])

# Benchmark per file: the mean of Value over every row whose Number is 1 or 2.
benchmarks <- aggregate(
  Value ~ FileName,
  subset(Testdf, Number == 1 | Number == 2)[, c("FileName", "Value")],
  mean
)
names(benchmarks)[2] <- "Benchmark"

# Attach each file's benchmark to all of its rows (merge key: FileName, the
# only column the two frames share).
Testdf <- merge(Testdf, benchmarks)
# Fix the x-axis order to the run order computed above.
Testdf$Version <- factor(Testdf$Version, levels = versions)
# Absolute deviation, and deviation as a fraction of the benchmark.
Testdf$Deviation <- Testdf$Value - Testdf$Benchmark
Testdf$DeviationP <- Testdf$Deviation / Testdf$Benchmark

# Only rows with a Value and a Deviation below 0.5 are plotted; one line per
# FileName, coloured by Value, point shape by Build.
g <- ggplot(
  subset(Testdf, !is.na(Value) & Deviation < .5),
  aes(color = Value, x = Version, y = Deviation, group = FileName)
) +
  geom_line() +
  geom_point(aes(shape = Build), size = 1.5) +
  scale_shape_manual(values = c(1, 15)) +
  # `fun` replaces the `fun.y` argument deprecated in ggplot2 3.3.0.
  stat_summary(fun = sum, geom = "line") +
  ylab("Run Time Deviation from Benchmark (min)") +
  scale_colour_gradient(name = "Run Time", low = "blue", high = "red") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5),
    axis.title.y = element_text(vjust = 1)
  )
g
# Reproducible sample of the test-run table: 27 rows covering
# 3 files x 3 categories x 3 runs.
rw1 <- rep(rep(c("File1", "File2", "File3"), each = 3), times = 3)
# NOTE(review): the split is 9/8/10, so row 18 carries "0.03" although its
# Number is 2 -- looks like a typo in the sample; kept exactly as given.
rw2 <- rep(c("0.01", "0.02", "0.03"), times = c(9, 8, 10))
rw3 <- rep(c("Time", "Size", "Final"), times = 9)
rw4 <- c(
  123, 456, 789, 312, 645, 978, 741, 852, 963,
  369, 258, 147, 753, 498, 951, 753, 915, 438,
  978, 741, 852, 963, 369, 258, 147, 753, 498
)
rw5 <- rep("01/01/12", times = 27)
rw6 <- rep(c(1, 2, 3), each = 9)
rw7 <- rep(c("Iteration", "Release"), times = c(18, 9))
# Every run is error-free except row 19.
rw8 <- rep(c("None", "Cannot Connect to Database", "None"), times = c(18, 1, 8))

# Assemble the frame and give the columns their database names in one step.
Testdf <- setNames(
  data.frame(rw1, rw2, rw3, rw4, rw5, rw6, rw7, rw8),
  c("FileName", "Version", "Category", "Value", "Date", "Number", "Build", "Error")
)

如果您想自己重新创建,可以在R中使用此示例数据帧。

library(ggplot2)

# Build the "deviation from benchmark" chart for the test-run report.
# SQLConn_remote() is a project helper that is not shown here; sqlQuery() is
# presumably RODBC's -- TODO confirm against the rest of the report script.
dbhandle <- SQLConn_remote(DBName = "DATABASE", ServerName = "SERVER")
Testdf <- sqlQuery(dbhandle, 'select * from TABLENAME 
                order by FileName, Number, Category', stringsAsFactors = FALSE)

# Version labels in run order (sorted by Number). The column is addressed by
# name so the result does not depend on the column order `select *` returns.
versions <- unique(Testdf$Version[order(Testdf$Number)])

# Benchmark per file: the mean of Value over every row whose Number is 1 or 2.
benchmarks <- aggregate(
  Value ~ FileName,
  subset(Testdf, Number == 1 | Number == 2)[, c("FileName", "Value")],
  mean
)
names(benchmarks)[2] <- "Benchmark"

# Attach each file's benchmark to all of its rows (merge key: FileName, the
# only column the two frames share).
Testdf <- merge(Testdf, benchmarks)
# Fix the x-axis order to the run order computed above.
Testdf$Version <- factor(Testdf$Version, levels = versions)
# Absolute deviation, and deviation as a fraction of the benchmark.
Testdf$Deviation <- Testdf$Value - Testdf$Benchmark
Testdf$DeviationP <- Testdf$Deviation / Testdf$Benchmark

# Only rows with a Value and a Deviation below 0.5 are plotted; one line per
# FileName, coloured by Value, point shape by Build.
g <- ggplot(
  subset(Testdf, !is.na(Value) & Deviation < .5),
  aes(color = Value, x = Version, y = Deviation, group = FileName)
) +
  geom_line() +
  geom_point(aes(shape = Build), size = 1.5) +
  scale_shape_manual(values = c(1, 15)) +
  # `fun` replaces the `fun.y` argument deprecated in ggplot2 3.3.0.
  stat_summary(fun = sum, geom = "line") +
  ylab("Run Time Deviation from Benchmark (min)") +
  scale_colour_gradient(name = "Run Time", low = "blue", high = "red") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5),
    axis.title.y = element_text(vjust = 1)
  )
g
# Reproducible sample of the test-run table: 27 rows covering
# 3 files x 3 categories x 3 runs.
rw1 <- rep(rep(c("File1", "File2", "File3"), each = 3), times = 3)
# NOTE(review): the split is 9/8/10, so row 18 carries "0.03" although its
# Number is 2 -- looks like a typo in the sample; kept exactly as given.
rw2 <- rep(c("0.01", "0.02", "0.03"), times = c(9, 8, 10))
rw3 <- rep(c("Time", "Size", "Final"), times = 9)
rw4 <- c(
  123, 456, 789, 312, 645, 978, 741, 852, 963,
  369, 258, 147, 753, 498, 951, 753, 915, 438,
  978, 741, 852, 963, 369, 258, 147, 753, 498
)
rw5 <- rep("01/01/12", times = 27)
rw6 <- rep(c(1, 2, 3), each = 9)
rw7 <- rep(c("Iteration", "Release"), times = c(18, 9))
# Every run is error-free except row 19.
rw8 <- rep(c("None", "Cannot Connect to Database", "None"), times = c(18, 1, 8))

# Assemble the frame and give the columns their database names in one step.
Testdf <- setNames(
  data.frame(rw1, rw2, rw3, rw4, rw5, rw6, rw7, rw8),
  c("FileName", "Version", "Category", "Value", "Date", "Number", "Build", "Error")
)

我假设您的问题主要在于基准变量(Benchmark)的计算方式

首先,这段代码的目的似乎是按文件名(FileName)计算
Number==1
或
Number==2
的所有行的
Value
平均值

这是通过两个步骤完成的

  • subset(Testdf, Number==1 | Number==2)[, c('FileName','Value')]
    返回 Number 为
    1
    或
    2
    的行,并且只保留
    FileName 和
    Value 这两列
  • aggregate(Value~FileName, subset(*如上*), mean)
    按 FileName 计算 Value 的平均值。因为我们先进行了筛选,所以它只考虑满足 Number 条件的行
  • 写入的行结果为:

    >benchmarks 
      FileName Benchmark
    1    File1 357.0
    2    File2 689.5
    3    File3 777.0
    
    然后将其合并回文件名上的帧。这里更明确的代码是:

    Testdf<-merge(Testdf, benchmarks, by = "FileName")
    
    然后,每行都有该文件名的平均值

    然后,代码计算与该基准的偏差,分别以两种形式给出:
    绝对差值(Deviation)
    以及相对于基准的比例(DeviationP)

    备用方式

    data.table
    语法可能更容易理解:

    library(data.table)
    setDT(Testdf)
    Testdf[, Benchmark := mean(Value[Number == 1 | Number == 2]), by = "FileName"]
    
    分解如下:

    Testdf[,
    因为逗号左边没有任何内容,所以我们将其应用于每一行

    Benchmark:=mean(Value[Number==1 | Number==2])
    这将创建一个名为 Benchmark 的新列。Benchmark 的值是列
    Value
    的平均值,但只使用 Number 为
    1
    或
    2
    的行

    , by = "FileName"]
    我们将分别为每个文件名计算基准。考虑这一点的一种方法是,我们将取
    filename==File1
    的所有行,然后取
    Value
    的平均值。然后取
    filename==File2
    的所有行,并执行相同的操作。
    by=
    参数为文件名的每个唯一值执行此操作

    下一步


    问题是:代码应该做什么?平均值是正确的基准吗?如果是这样的话,上面的代码就起作用了。图形看起来很混乱,因此ggplot代码可能存在问题。进一步澄清这一点将有助于我们帮助您。

    您是对的,
    |
    表示“或”。aggregate
    将函数应用于 data.frame 的各个切片。切片由波浪号
    ~
    右侧的变量定义。
    Number==1 | Number==2
    也可以写为
    Number %in% c(1,2)
    。谢谢你的回答。我注意到图表乱七八糟。我认为
    Number==1 | Number==2
    选择了两个值作为基准。这不是我想要的正确方法。我想使用最早的条目作为基准。这样,图表将绘制与原始值的偏差。问题是我不想只选择
    Number==1
    ,因为有时会出现错误和崩溃,导致条目没有任何值。如果
    Number==1
    中没有条目,那么代码应该移动到
    Number==2
    @David“最早的”在数据中如何表示?它将是该特定
    文件名的列中最低的
    Number
    。换句话说,代码应该根据
    Number
    列中的最小值设置基准。此外,我怎样才能只绘制
    Time
    类别?@David看看您提供的示例,您需要想出一种处理冲突的方法。也就是说,在前几行中有三个数字1,都是针对file1的。应该使用哪一种?对于图形问题,我建议开始一个新问题,因为它比嵌套在现有的无关问题中更有可能得到回答