
Spark Streaming (Scala) performance drops sharply

Scala, Redis, Apache Kafka, Spark Streaming, Spark Cassandra Connector

I have the following code:

case class event(imei: String, date: String, gpsdt: String,dt: String,id: String)
case class historyevent(imei: String, date: String, gpsdt: String)
object kafkatesting {
def main(args: Array[String]) {

val clients = new RedisClientPool("192.168.0.40", 6379)
val conf = new SparkConf()
  .setAppName("KafkaReceiver")
  .set("spark.cassandra.connection.host", "192.168.0.40")
  .set("spark.cassandra.connection.keep_alive_ms", "20000")
  .set("spark.executor.memory", "3g")
  .set("spark.driver.memory", "4g")
  .set("spark.submit.deployMode", "cluster")
  .set("spark.executor.instances", "4")
  .set("spark.executor.cores", "3")
  .set("spark.streaming.backpressure.enabled", "true")
  .set("spark.streaming.backpressure.initialRate", "100")
  .set("spark.streaming.kafka.maxRatePerPartition", "7")

val sc = SparkContext.getOrCreate(conf)
val ssc = new StreamingContext(sc, Seconds(10))
val sqlContext = new SQLContext(sc)
val kafkaParams = Map[String, String](
  "bootstrap.servers" -> "192.168.0.113:9092",
  "group.id" -> "test-group-aditya",
  "auto.offset.reset" -> "largest")

val topics = Set("random")
val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)

kafkaStream.foreachRDD { rdd =>

  val updatedRDD = rdd.map(a =>
    {
      implicit val formats = DefaultFormats
      val jValue = parse(a._2)
      val fleetrecord = jValue.extract[historyevent]
      val hash = fleetrecord.imei + fleetrecord.date + fleetrecord.gpsdt
      val md5Hash = DigestUtils.md5Hex(hash).toUpperCase()
      val now = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime())

      event(fleetrecord.imei, fleetrecord.date, fleetrecord.gpsdt, now, md5Hash)
    })
    .collect()

  updatedRDD.foreach(f =>
    {
      clients.withClient {
        client =>
          {
            val value = f.imei + " , " + f.gpsdt
            val zscore = Calendar.getInstance().getTimeInMillis
            val key = new SimpleDateFormat("yyyy-MM-dd").format(Calendar.getInstance().getTime())
            val dt = new SimpleDateFormat("HH:mm:ss").format(Calendar.getInstance().getTime())
            val q1 = "00:00:00"
            val q2 = "06:00:00"
            val q3 = "12:00:00"
            val q4 = "18:00:00"
            val quater = if (dt > q1 && dt < q2) {
              System.out.println(dt + " lies in quarter 1");
              " -> 1"
            } else if (dt > q2 && dt < q3) {
              System.out.println(dt + " lies in quarter 2");
              " -> 2"
            } else if (dt > q3 && dt < q4) {
              System.out.println(dt + " lies in quarter 3");
              " -> 3"
            } else {
              System.out.println(dt + " lies in quarter 4");
              " -> 4"
            }
            client.zadd(key + quater, zscore, value)
            println(f.toString())
          }
      }
    })
  val collection = sc.parallelize(updatedRDD)
  collection.saveToCassandra("db", "table", SomeColumns("imei", "date", "gpsdt","dt","id"))
}

ssc.start()
ssc.awaitTermination()
}
}
I am using this code to insert data from Kafka into Cassandra and Redis, but I am facing the following issues:

1) While the previous batch is still being processed, the application builds up a long queue of active batches. I only want the next batch to start once the previous batch has finished executing.

2) I have a four-node cluster that is processing each batch, but it takes around 30-40 seconds to process 700 records.


Is my code already optimized, or do I need to improve it to get better performance?

Yes, you can do all of that work inside mapPartitions. The DataStax connector also has an API that lets you save the DStream directly. Here is how to do it for C*:

val partitionedDstream = kafkaStream.repartition(5) //change this value as per your data and spark cluster

//Now instead of iterating each RDD work on each partition.
val eventsStream: DStream[event] = partitionedDstream.mapPartitions(x => {
  val lst = scala.collection.mutable.ListBuffer[event]()
  while (x.hasNext) {
    val a = x.next()
    implicit val formats = DefaultFormats
    val jValue = parse(a._2)
    val fleetrecord = jValue.extract[historyevent]
    val hash = fleetrecord.imei + fleetrecord.date + fleetrecord.gpsdt
    val md5Hash = DigestUtils.md5Hex(hash).toUpperCase()
    val now = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime())
    lst += event(fleetrecord.imei, fleetrecord.date, fleetrecord.gpsdt, now, md5Hash)
  }
  lst.toList.iterator
})

eventsStream.cache() //because you are using same Dstream for C* and Redis

//instead of collecting each RDD save whole Dstream at once
import com.datastax.spark.connector.streaming._
eventsStream.saveToCassandra("db", "table", SomeColumns("imei", "date", "gpsdt", "dt", "id"))
Also, Cassandra accepts a timestamp as a Long value, so you can change that part of the code as follows:

val now = System.currentTimeMillis()

//also change your case class to take `Long` instead of `String`
case class event(imei: String, date: String, gpsdt: String, dt: Long, id: String)

Similarly, you can change the Redis part of the code as well.
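
For illustration, a minimal sketch of what the Redis side could look like when writing from each partition of the DStream instead of collecting to the driver. It assumes the scala-redis RedisClientPool from the question and the eventsStream: DStream[event] built above; the pool and the formatter are created inside foreachPartition so nothing non-serializable has to be shipped from the driver, and the quarter-suffix logic from the question could be appended to key in the same way:

import java.text.SimpleDateFormat
import java.util.Calendar
import com.redis.RedisClientPool

eventsStream.foreachRDD { rdd =>
  rdd.foreachPartition { partition =>
    // Sketch only: create the connection pool and the date formatter once per
    // partition, not once per record, and reuse them for every event.
    val pool = new RedisClientPool("192.168.0.40", 6379)
    val dayFormat = new SimpleDateFormat("yyyy-MM-dd")
    partition.foreach { f =>
      val key = dayFormat.format(Calendar.getInstance().getTime())
      val zscore = Calendar.getInstance().getTimeInMillis
      val value = f.imei + " , " + f.gpsdt
      pool.withClient { client =>
        client.zadd(key, zscore, value)
      }
    }
    // Release the pool once the partition has been written.
    pool.close
  }
}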

I'm not sure about Redis, but the way you are saving data to Cassandra is wrong. There are APIs that let you save a DStream directly to Cassandra, with no need to collect it and convert it to an RDD. Also try using mapPartitions instead of foreachRDD. Have a look at the package com.datastax.spark.connector.streaming.

@vindev Hi, as you can see I am actually extracting data from the DStream, adding some columns to each row, and then saving it to Cassandra. Is that still possible the way you suggest? Could you share the updated code you have in mind?

Not sure about the Spark Streaming side, but creating a SimpleDateFormat and a Calendar for every event is very wasteful of resources, and calling println for every event is also very slow (see the sketch after these comments).

I get "not found: type DStream".

Use import org.apache.spark.streaming.dstream.DStream. I added the type just for reference; you can remove it.

OK, I did that. What does this statement mean: val partitionedDstream = kafkaStream.repartition(5)?

It creates 5 partitions of the DStream. Read more about partitioning.

Actually I have started my job now, and I can see that repartitioning the records is taking a lot of time.
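
On the comment above about per-event SimpleDateFormat and Calendar creation: a minimal sketch (not from the original thread) of how the mapPartitions body from the answer could hoist the formatter and the json4s Formats out of the per-record loop. It assumes json4s for parse/extract (the jackson backend here, the native one works the same way) and Apache Commons Codec for DigestUtils, matching the snippets above:

import java.text.SimpleDateFormat
import java.util.Calendar
import org.apache.commons.codec.digest.DigestUtils
import org.apache.spark.streaming.dstream.DStream
import org.json4s._
import org.json4s.jackson.JsonMethods.parse

val eventsStream: DStream[event] = partitionedDstream.mapPartitions { records =>
  // Created once per partition and reused for every record in it.
  implicit val formats = DefaultFormats
  val tsFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
  records.map { a =>
    val fleetrecord = parse(a._2).extract[historyevent]
    val hash = fleetrecord.imei + fleetrecord.date + fleetrecord.gpsdt
    val md5Hash = DigestUtils.md5Hex(hash).toUpperCase()
    val now = tsFormat.format(Calendar.getInstance().getTime())
    event(fleetrecord.imei, fleetrecord.date, fleetrecord.gpsdt, now, md5Hash)
  }
}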