Scala Spark job returns different results on each run


I am working on some Scala code that performs linear regression on a couple of datasets. Right now I am running with 20 cores and 25 executors, and every time I run the Spark job I get a different result.

The input files are 2 GB and 400 MB in size. However, when I run the job with 20 cores and a single executor, I get consistent results.

Has anyone run into anything like this?

The code is below:

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.SQLContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SchemaRDD
import org.apache.spark.Partitioner
import org.apache.spark.storage.StorageLevel

object TextProcess{
  def main(args: Array[String]){
            val conf = new SparkConf().set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            val sc = new SparkContext(conf)
            val sqlContext = new org.apache.spark.sql.SQLContext(sc)
            val numExecutors=(conf.get("spark.executor.instances").toInt)
            // Read the 2 input files
            // First file is either cases / controls
            val input1 = sc.textFile(args(0))
            // Second file is Gene Expression
            val input2 = sc.textFile(args(1))

              //collecting header information
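            // take(1) pulls the first line back to the driver; parallelize()
            // wraps it in a one-element RDD so it can be subtracted from and
            // unioned with the data below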
            val header1=sc.parallelize(input1.take(1))
            val header2=sc.parallelize(input2.take(1))

            //mapping data without the header information
            val map1 = input1.subtract(header1).map(x => (x.split(" ")(0)+x.split(" ")(1), x))
            val map2 = input2.subtract(header2).map(x => (x.split(" ")(0)+x.split(" ")(1), x))


            //joining data. here is where the order was getting affected. 
            val joinedMap = map1.join(map2)

            //adding the header back to the top of RDD
            val x = header1.union(joinedMap.map{case(x,(y,z))=>y})

            val y = header2.union(joinedMap.map{case(x,(y,z))=>z})

            //removing irrelevant columns
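            // NOTE: zipWithIndex numbers rows by partition order. After
            // subtract() and join(), that order is not guaranteed to be stable
            // across runs, so these indices can pair different rows together
            // on each execution -- a likely source of the varying results.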
            val rddX = x.map(x=>x.split(" ").drop(3)).zipWithIndex.map{case(a,b)=> a.map(x=>b.toString+" "+x.toString)}
            val rddY = y.map(x=>x.split(" ").drop(2)).zipWithIndex.map{case(a,b)=> a.map(x=>b.toString+" "+x.toString)}


            //transposing and cross joining data. This keeps the identifier at the start
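            // StorageLevel(false, true, false, false, numExecutors) caches the
            // RDD in memory in serialized form, replicated numExecutors times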
            val transposedX = rddX.flatMap(x => x.zipWithIndex.map(x=>x.swap)).reduceByKey((a,b)=> a+":"+b).map{case(a,b)=>b.split(":").sorted}
            val transposedY = rddY.flatMap(x => x.zipWithIndex.map(x=>x.swap)).reduceByKey((a,b)=> a+":"+b).map{case(a,b)=>b.split(":").sorted}.persist(StorageLevel.apply(false, true, false, false, numExecutors))

            val cleanedX = transposedX.map(x=>x.map(x=>x.slice(x.indexOfSlice(" ")+1,x.length)))
            val cleanedY = transposedY.map(x=>x.map(x=>x.slice(x.indexOfSlice(" ")+1,x.length))).persist(StorageLevel.apply(false, true, false, false, numExecutors))


            val cartXY = cleanedX.cartesian(cleanedY)
            val finalDataSet= cartXY.map{case(a,b)=>a zip b} 
            //convert to key value pair
            val regressiondataset = finalDataSet.map(x=>(x(0),x.drop(1).filter{case(a,b)=> a!="NA" && b!="NA" && a!="null" && b!="null"}.map{case(a,b)=> (a.toDouble, b.toDouble)}))


            val linearOutput = regressiondataset.map(s => new LinearRegression(s._1 ,s._2).outputVal)

            linearOutput.saveAsTextFile(args(2))
            cleanedY.unpersist()
            transposedY.unpersist()

  }
}


class LinearRegression (val keys: (String, String),val pairs: Array[(Double,Double)]) {


  val size = pairs.size

  // first pass: read in data, compute xbar and ybar
  val sums = pairs.aggregate(new X_X2_Y(0D, 0D, 0D))((acc, p) => acc + new X_X2_Y(p), _ + _)
  val bars = (sums.x / size, sums.y / size)

  // second pass: compute summary statistics
  val sumstats = pairs.foldLeft(new X2_Y2_XY(0D, 0D, 0D))((acc, p) => acc + new X2_Y2_XY(p, bars))

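  // OLS estimates: slope = Sxy / Sxx, intercept = ybar - slope * xbar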
  val beta1 = sumstats.xy / sumstats.x2
  val beta0 = bars._2 - (beta1 * bars._1)
  val betas = (beta0, beta1)

  //println("y = " + ("%4.3f" format beta1) + " * x + " + ("%4.3f" format beta0))


  // analyze results
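  // R2 = SSR / SSTO; svar = RSS / (n - 2) is the residual variance;
  // svar1 and svar0 are the sampling variances of the slope and intercept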
  val correlation = pairs.aggregate(new RSS_SSR(0D, 0D))((acc, p) => acc + RSS_SSR.build(p, bars, betas), _ + _)
  val R2 = correlation.ssr / sumstats.y2
  val svar = correlation.rss / (size - 2)
  val svar1 = svar / sumstats.x2
  val svar0 = ( svar / size ) + ( bars._1 * bars._1 * svar1)
  val svar0bis = svar * sums.x2 / (size * sumstats.x2)

  /* println("R^2                 = " + R2)
   println("std error of beta_1 = " + Math.sqrt(svar1))
   println("std error of beta_0 = " + Math.sqrt(svar0))
   println("std error of beta_0 = " + Math.sqrt(svar0bis))
   println("SSTO = " + sumstats.y2)
   println("SSE  = " + correlation.rss)
   println("SSR  = " + correlation.ssr)*/


  def outputVal = keys._1 +
                  "\t" + keys._2 +
                  "\t" + beta1 +
                  "\t" + beta0 +
                  "\t" + R2 +
                  "\t" + Math.sqrt(svar1) +
                  "\t" + Math.sqrt(svar0) +
                  "\t" + sumstats.y2 +
                  "\t" + correlation.rss +
                  "\t" + correlation.ssr + "\t"

}


object RSS_SSR {
  def build(p: (Double,Double), bars: (Double,Double), betas: (Double,Double)): RSS_SSR = {
    val fit = (betas._2 * p._1) + betas._1
    val rss = (fit-p._2) * (fit-p._2)
    val ssr = (fit-bars._2) * (fit-bars._2)
    new RSS_SSR(rss, ssr)
  }
}

class RSS_SSR(val rss: Double, val ssr: Double) {
  def +(p: RSS_SSR): RSS_SSR = new RSS_SSR(rss+p.rss, ssr+p.ssr)
}

class X_X2_Y(val x: Double, val x2: Double, val y: Double) {
  def this(p: (Double,Double)) = this(p._1, p._1*p._1, p._2)
  def +(p: X_X2_Y): X_X2_Y = new X_X2_Y(x+p.x,x2+p.x2,y+p.y)
}

class X2_Y2_XY(val x2: Double, val y2: Double, val xy: Double) {
  def this(p: (Double,Double), bars: (Double,Double)) = this((p._1-bars._1)*(p._1-bars._1), (p._2-bars._2)*(p._2-bars._2),(p._1-bars._1)*(p._2-bars._2))
  def +(p: X2_Y2_XY): X2_Y2_XY = new X2_Y2_XY(x2+p.x2,y2+p.y2,xy+p.xy)
}

Just a wild guess, but as far as I know .union may not preserve order, so your header could end up anywhere in the output. Could that change your results? – Victor

Thank you for the reply, Victor. Is there an alternative to union?

What do you mean by "different results"? @VictorMoroz is right: Spark is parallel and distributed by design, and it makes no promise about producing output in the same order (any record in an RDD can move between partitions). But the results themselves should be the same; only their order may vary.

I have about 2 columns acting as keys and 4 other columns as values. Every time I run the job I get different values for the same key. I hope that helps?

I think this question has been asked before. In any case, please include the definition of LinearRegression.
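
Following up on the comments: the root of the inconsistency is most likely zipWithIndex, whose indices depend on the row order at the moment it runs; after subtract() and join() that order can change from run to run. Below is a minimal sketch of one way to stabilize it (my own suggestion, not code from the thread; the file paths are hypothetical, and sorting on the join key is an assumption about which order is intended):

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

val sc = new SparkContext(new SparkConf())
val input1 = sc.textFile("cases_controls.txt")    // hypothetical input paths
val input2 = sc.textFile("gene_expression.txt")

// Drop the header with filter() instead of subtract(): subtract() shuffles
// the whole RDD just to remove one line, reordering the data as a side effect.
val headerLine1 = input1.first()
val headerLine2 = input2.first()
val map1 = input1.filter(_ != headerLine1).map(x => (x.split(" ")(0) + x.split(" ")(1), x))
val map2 = input2.filter(_ != headerLine2).map(x => (x.split(" ")(0) + x.split(" ")(1), x))

// sortByKey() imposes a deterministic order on the join output, so the
// downstream zipWithIndex assigns the same index to the same row on every
// run, regardless of how many executors the job uses.
val joinedMap = map1.join(map2).sortByKey()
val indexed = joinedMap.zipWithIndex

The same idea applies anywhere zipWithIndex follows a shuffle: either sort on a stable key first, or carry an explicit row identifier through the pipeline instead of relying on positional indices.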