Multithreading: Spark Streaming won't start multiple threads

I am using Spark Streaming to receive data from Kafka, like this:

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils

val conf = new SparkConf()
conf.setMaster("local[*]").setAppName("KafkaStreamExample")
  .setSparkHome("/home/kufu/spark/spark-1.5.2-bin-hadoop2.6")
  .setExecutorEnv("spark.executor.extraClassPath",
    "target/scala-2.11/sparkstreamexamples_2.11-1.0.jar")

val threadNum = 3

val ssc = new StreamingContext(conf, Seconds(2))
val topicMap = Map(consumeTopic -> 1)

// consumeTopic, zkOrBrokers and approachType are defined elsewhere in the job.
val dataRDDs: IndexedSeq[InputDStream[(String, String)]] = approachType match {
  case KafkaStreamJob.ReceiverBasedApproach =>
    (1 to threadNum).map(_ =>
      KafkaUtils.createStream(ssc, zkOrBrokers, "testKafkaGroupId", topicMap))
  case KafkaStreamJob.DirectApproach =>
    (1 to threadNum).map(_ =>
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
        ssc, Map[String, String]("metadata.broker.list" -> zkOrBrokers),
        Set[String](consumeTopic)))
}

//dataRDDs.foreach(_.foreachRDD(genProcessing(approachType)))
val dataRDD = ssc.union(dataRDDs)
dataRDD.foreachRDD(genProcessing(approachType))

ssc.start()
ssc.awaitTermination()

genProcessing builds the function that processes each batch of data; the processing takes 5 seconds (it sleeps for 5 seconds). The code looks like this:

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD

// genProcessing wraps the per-batch function handed to foreachRDD above.
// (approachType's concrete type is not shown in the post; Int is assumed.)
def genProcessing(approachType: Int): RDD[(String, String)] => Unit = {
  def eachRDDProcessing(rdd: RDD[(String, String)]): Unit = {
    // count and max are assumed to be defined in the enclosing scope.
    if (count > max) throw new Exception("Stop here")
    println("--------- num: " + count + " ---------")

    val batchNum = count
    val curTime = System.currentTimeMillis()

    // Simulates 5 seconds of work; note this sleep runs on the driver.
    Thread.sleep(5000)

    val family = approachType match {
      case KafkaStreamJob.DirectApproach => KafkaStreamJob.DirectFamily
      case KafkaStreamJob.ReceiverBasedApproach => KafkaStreamJob.NormalFamily
    }

    val families = KafkaStreamJob.DirectFamily :: KafkaStreamJob.NormalFamily :: Nil

    val time = System.currentTimeMillis().toString

    val messageCount = rdd.count()

    // This closure runs on the executors and opens one HBase connection per record.
    rdd.foreach(tuple => {
      val hBaseConn = new HBaseConnection(KafkaStreamJob.rawDataTable,
        KafkaStreamJob.zookeeper, families)
      hBaseConn.openOrCreateTable()
      val puts = new java.util.ArrayList[Put]()
      val strs = tuple._2.split(":")
      val row = strs(1) + ":" + strs(0) + ":" + time
      val put = new Put(Bytes.toBytes(row))
      put.add(Bytes.toBytes(family), Bytes.toBytes(KafkaStreamJob.tableQualifier),
        Bytes.toBytes("batch " + batchNum.toString + ":" + strs(1)))
      puts.add(put)
      hBaseConn.puts(puts)
      hBaseConn.close()
    })

    count += 1
    println("--------- add " + messageCount + " messages ---------")
  }
  eachRDDProcessing
}

But Spark Streaming does not start multiple threads: the tasks are processed one after another, and each takes about 5 seconds. My machine has 8 cores, and Spark runs on a single node.

I don't think Spark Streaming will start multiple threads for you, especially not on the driver. The point is that if you have multiple nodes, your genProcessing will run on different nodes.
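
To make that concrete, here is a sketch (not part of the original post) of where each piece of the question's pipeline actually executes:

dataRDD.foreachRDD { rdd =>
  // The body of foreachRDD is an output operation: it is invoked on the
  // driver, one batch at a time, so the Thread.sleep(5000) in
  // eachRDDProcessing serializes the batches.
  rdd.foreach { tuple =>
    // Only this inner closure is shipped to the executors, where it runs
    // in parallel across the RDD's partitions (different cores/nodes).
  }
}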

Also, if you call rdd.foreachPartition(...), supposedly it should get better parallelism.
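
For example, the per-record HBase write in eachRDDProcessing could be reshaped as below; this is only a sketch reusing the question's own HBaseConnection helper, with family, families, time and batchNum captured from the enclosing scope as before:

rdd.foreachPartition { iter =>
  // One HBase connection per partition instead of one per record.
  val hBaseConn = new HBaseConnection(KafkaStreamJob.rawDataTable,
    KafkaStreamJob.zookeeper, families)
  hBaseConn.openOrCreateTable()
  val puts = new java.util.ArrayList[Put]()
  iter.foreach { tuple =>
    val strs = tuple._2.split(":")
    val row = strs(1) + ":" + strs(0) + ":" + time
    val put = new Put(Bytes.toBytes(row))
    put.add(Bytes.toBytes(family), Bytes.toBytes(KafkaStreamJob.tableQualifier),
      Bytes.toBytes("batch " + batchNum.toString + ":" + strs(1)))
    puts.add(put)
  }
  hBaseConn.puts(puts) // one batched write per partition
  hBaseConn.close()
}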