Apache Spark: Spark Streaming from SQS with checkpointing enabled


I have gone through multiple sites, such as:


Some of those links discuss how the code should be written, but they are too abstract, and I had to spend a lot of time figuring out how it actually works.

After a long struggle I was able to get the streaming code working with checkpointing. I am adding it here to help others:

import java.net.URI
import java.nio.file.{Files, Paths}
import java.util.concurrent.Executors

import com.amazonaws.auth.DefaultAWSCredentialsProviderChain
import com.amazonaws.regions.Regions
import com.amazonaws.services.sqs.model.Message
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.log4j.LogManager
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Duration, Seconds, StreamingContext}

object StreamingApp extends scala.Serializable {
  @transient private final val mapper = new ObjectMapper
  @transient private final val LOG = LogManager.getLogger(getClass)
  @transient private final val executor = Executors.newFixedThreadPool(Runtime.getRuntime.availableProcessors)
  var s3 = "s3"
  private var shutdownMarker: String = _
  private var stopFlag: Boolean = false

  def main(args: Array[String]): Unit = {
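    // Expected positional arguments:
    //   0: SQS queue name            1: AWS region
    //   2: max messages per poll     3: SQS visibility timeout (seconds)
    //   4: wait between polls (ms)   5: run locally (true/false)
    //   6: S3 bucket                 7: path to broadcast objects (wgPath)
    //   8: staging/output prefix     9: optional shutdown-marker path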
    val queueName = args(0)
    val region = args(1)
    val fetchMaxMessage = args(2).toInt
    val visibilityTimeOutSeconds = args(3).toInt
    val waitTimeoutInMillis = args(4).toLong
    val isLocal = args(5).toBoolean
    val bucket = args(6)
    if (args.length >= 10)
      shutdownMarker = args(9)
    val sparkConf = initialiseSparkConf(isLocal)
    sparkConf.set(Constants.QUEUE_NAME, queueName)
    sparkConf.set(Constants.REGION, region)
    sparkConf.set(Constants.FETCH_MAX_MESSAGE, fetchMaxMessage.toString)
    sparkConf.set(Constants.VISIBILITY_TIMEOUT_SECONDS, visibilityTimeOutSeconds.toString)
    sparkConf.set(Constants.WAIT_TIMEOUT_IN_MILLIS, waitTimeoutInMillis.toString)

    shutdownMarker = s"$s3://$bucket/streaming/shutdownmarker"
    val checkpointDirectory = s"$s3://$bucket/streaming/checkpoint/"
    var context: StreamingContext = null

    try {
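      // Restore the StreamingContext from the checkpoint directory if one exists;
      // otherwise build a fresh context (and its DStream graph) via createContext.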
      context = StreamingContext.getOrCreate(checkpointDirectory, () => createContext(sparkConf, waitTimeoutInMillis, checkpointDirectory, args))
      context.start
      val checkIntervalMillis = 10000
      var isStopped = false

      while (!isStopped) {
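        // Poll every 10 seconds: exit once the context terminates on its own, or stop it
        // gracefully when the shutdown-marker file appears so in-flight batches can finish.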
        println("calling awaitTerminationOrTimeout")
        isStopped = context.awaitTerminationOrTimeout(checkIntervalMillis)
        if (isStopped)
          println("confirmed! The streaming context is stopped. Exiting application...")
        checkShutdownMarker(context.sparkContext)
        if (!isStopped && stopFlag) {
          println("stopping ssc right now")
          context.stop(stopSparkContext = true, stopGracefully = true)
          println("ssc is stopped!!!!!!!")
        }
      }
    }
    finally {
      LOG.info("Exiting the Application")
      if (context != null && org.apache.spark.streaming.StreamingContextState.STOPPED != context.getState) {
        context.stop(stopSparkContext = true, stopGracefully = true)
      }
      if (!executor.isShutdown)
        executor.shutdown()
    }
  }

  def checkShutdownMarker(sparkContext: SparkContext): Unit = {
    if (!stopFlag) {
      stopFlag = isFileExists(shutdownMarker, sparkContext)
    }
    println(s"Stop marker $shutdownMarker file found: $stopFlag at time ${System.currentTimeMillis()}")
  }

  def isFileExists(path: String, sparkContext: SparkContext): Boolean = {
    isValidPath(isDir = false, path, getFileSystem(path, sparkContext))
  }

  def getFileSystem(path: String, sparkContext: SparkContext): FileSystem = {
    FileSystem.get(URI.create(path), sparkContext.hadoopConfiguration)
  }

  def isValidPath(isDir: Boolean, path: String, fileSystem: FileSystem): Boolean = {
    LOG.info("Validating path {}", path)
    if (path.startsWith(Constants.S3) || path.startsWith(Constants.HDFS) || path.startsWith(Constants.FILE)) {
      val fsPath = new Path(path)
      if (isDir) {
        fileSystem isDirectory fsPath
      } else {
        fileSystem isFile fsPath
      }
    } else {
      Files.exists(Paths.get(path))
    }
  }

  def createContext(sparkConf: SparkConf, waitTimeoutInMillis: Long, checkpointDirectory: String, args: Array[String]): StreamingContext = {
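    // Invoked only when no checkpoint exists: the entire DStream graph must be defined here
    // so that it can be reconstructed from the checkpoint after a restart.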

    val context = new StreamingContext(sparkConf, Duration(waitTimeoutInMillis + 1000))
    processMessage(context, args)
    context.checkpoint(checkpointDirectory) // set checkpoint directory
    context
  }

  def processMessage(context: StreamingContext, args: Array[String]): Unit = {

    val bucket = args(6)
    val wgPath = args(7)
    var stagingPath = args(8)
    val waitTimeoutInMillis = args(4).toLong
    if (context != null) {

      if (!stagingPath.endsWith("/")) {
        stagingPath = s"$stagingPath/"
      }
      val outputPrefix = s"$s3://$bucket/$stagingPath"

      LOG.info(s"Number of cores for driver: ${Runtime.getRuntime.availableProcessors}")

      val sparkContext: SparkContext = context.sparkContext

      val broadcasts = BroadCaster.getInstance(sparkContext, s"$s3://$bucket/$wgPath")

      val input = context.receiverStream(broadcasts(Constants.SQS_RECEIVER).value.asInstanceOf[SQSReceiver])
      //input.checkpoint(interval = Seconds(60))
      LOG.info(s"Scheduling mode ${sparkContext.getSchedulingMode.toString}")
      input.foreachRDD(r => {
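        // For each micro-batch, pull the JSON-serialized SQS messages to the driver
        // and deserialize them back into Message objects before applying the business logic.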
        val sparkSession = SparkSession.builder.config(r.sparkContext.getConf).getOrCreate()

        val messages = r.collect().map(message => mapper.readValue(message, classOf[Message]))

        val broadcasts = BroadCaster.getInstance(r.sparkContext, s"$s3://$bucket/$wgPath")
        //Application logic
      })
    }
  }


  def initialiseSparkConf(local: Boolean): SparkConf = {
    val sparkConf = new SparkConf()
      .setAppName("Spark Streaming")
      .set("spark.scheduler.mode", "FAIR")
      .set("spark.sql.parquet.filterpushdown", "true")
      .set("spark.executor.hearbeatInterval", "20")
      .set("spark.streaming.driver.writeAheadLog.closeFileAfterWrite", "true")
      .set("spark.streaming.receiver.writeAheadLog.closeFileAfterWrite", "true")
      .set("spark.streaming.receiver.writeAheadLog.enable", "true")
      .set("spark.streaming.stopGracefullyOnShutdown", "true")
      .set("spark.streaming.backpressure.enabled","true")
      .set("spark.streaming.backpressure.pid.minRate","10") //SQS support batch of 10

    if (local) {
      s3 = "s3a"
      sparkConf.setMaster("local[*]")
    } else {
      sparkConf.set("hive.metastore.client.factory.class",
        "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory")
    }
    sparkConf
  }
}

object BroadCaster {

  @volatile private var instance: Map[String, Broadcast[Any]] = _

  def getInstance(sparkContext: SparkContext, wgPath: String): Map[String, Broadcast[Any]] = {
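    // Lazily builds the broadcast map once per JVM (double-checked locking) so the same
    // broadcast variables are reused across micro-batches and after a restart from checkpoint.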
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = Utils.createBroadcastObjects(wgPath, sparkContext)
          instance += (Constants.SQS_RECEIVER -> sparkContext.broadcast(getSQSReceiver(sparkContext.getConf)))
        }
      }
    }
    instance
  }

  private def getSQSReceiver(conf: SparkConf): SQSReceiver = {
    val javaSQSReceiver = new SQSReceiver(conf.get(Constants.QUEUE_NAME))
      .withRegion(Regions.fromName(conf.get(Constants.REGION)))
      .withCredential(new DefaultAWSCredentialsProviderChain())
      .withFetchMaxMessage(conf.getInt(Constants.FETCH_MAX_MESSAGE, 10))
      .withVisibilityTimeOutSeconds(conf.getInt(Constants.VISIBILITY_TIMEOUT_SECONDS, 1800))
      .withWaitTimeoutinMillis(conf.getLong(Constants.WAIT_TIMEOUT_IN_MILLIS, 1000))
    javaSQSReceiver
  }
}
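
The listing above refers to a Constants object and a Utils helper that are not included in the post. A minimal sketch of what they could look like is below; the configuration-key strings and the Utils stub are my assumptions, not the original code.

import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast

object Constants {
  // Hypothetical configuration keys -- only the identifiers are taken from the post, the values are assumptions
  val QUEUE_NAME = "sqs.queue.name"
  val REGION = "sqs.region"
  val FETCH_MAX_MESSAGE = "sqs.fetch.max.message"
  val VISIBILITY_TIMEOUT_SECONDS = "sqs.visibility.timeout.seconds"
  val WAIT_TIMEOUT_IN_MILLIS = "sqs.wait.timeout.millis"
  val SQS_RECEIVER = "sqsReceiver"
  // URI scheme prefixes checked by isValidPath
  val S3 = "s3"
  val HDFS = "hdfs"
  val FILE = "file"
}

object Utils {
  // Hypothetical stub: the real version loads application objects from wgPath and broadcasts them
  def createBroadcastObjects(wgPath: String, sparkContext: SparkContext): Map[String, Broadcast[Any]] =
    Map.empty
}

The custom receiver wired in through BroadCaster is the Java SQSReceiver shown next.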

import java.util.List;

import org.apache.log4j.Logger;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.receiver.Receiver;

import com.amazonaws.auth.AWSCredentialsProvider;
import com.amazonaws.regions.Regions;
import com.amazonaws.services.sqs.AmazonSQS;
import com.amazonaws.services.sqs.AmazonSQSClientBuilder;
import com.amazonaws.services.sqs.model.DeleteMessageBatchRequest;
import com.amazonaws.services.sqs.model.DeleteMessageRequest;
import com.amazonaws.services.sqs.model.Message;
import com.amazonaws.services.sqs.model.ReceiveMessageRequest;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;

// Custom Spark Streaming receiver that long-polls an SQS queue and stores the received
// messages into Spark's block manager.
public class SQSReceiver extends Receiver<String> {

    private String queueName;
    private transient AWSCredentialsProvider credential;
    private Regions region = Regions.US_EAST_1;
    private long waitTimeoutinMillis = 0L;
    private ObjectMapper mapper = new ObjectMapper();
    private transient Logger logger = Logger.getLogger(SQSReceiver.class);
    private boolean deleteOnReceive = false;
    private int fetchMaxMessage = 100;
    private int visibilityTimeOutSeconds = 60;
    private String sqsQueueUrl;
    private transient AmazonSQS amazonSQS;

    public SQSReceiver(String queueName) {
        this(queueName, false);
    }

    public SQSReceiver(String queueName, boolean deleteOnReceive) {
        super(StorageLevel.MEMORY_AND_DISK_SER());
        this.queueName = queueName;
        this.deleteOnReceive = deleteOnReceive;
        setupSQS(queueName);
    }

    private void setupSQS(String queueName) {
        AmazonSQSClientBuilder amazonSQSClientBuilder = AmazonSQSClientBuilder.standard();
        if (credential != null) {
            amazonSQSClientBuilder.withCredentials(credential);
        }
        amazonSQSClientBuilder.withRegion(region);
        amazonSQS = amazonSQSClientBuilder.build();
        sqsQueueUrl = amazonSQS.getQueueUrl(queueName).getQueueUrl();
    }

    public void onStart() {
        new Thread(this::receive).start();
    }

    public void onStop() {
        // Nothing to do: the thread calling receive() stops itself once isStopped() returns true.
    }

    private void receive() {
        try {
            setupSQS(queueName);
            ReceiveMessageRequest receiveMessageRequest = new ReceiveMessageRequest(sqsQueueUrl)
                    .withMaxNumberOfMessages(fetchMaxMessage)
                    .withVisibilityTimeout(visibilityTimeOutSeconds)
                    .withWaitTimeSeconds(20); // long polling, see https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/examples-sqs-long-polling.html
            receiveMessagesFromSQS(amazonSQS, sqsQueueUrl, receiveMessageRequest);
        } catch (Throwable e) {
            stop("Error encountered while initializing", e);
        }
    }

    private void receiveMessagesFromSQS(final AmazonSQS amazonSQS, final String sqsQueueUrl,
                                        ReceiveMessageRequest receiveMessageRequest) {
        try {
            while (!isStopped()) {
                List<Message> messages = amazonSQS.receiveMessage(receiveMessageRequest).getMessages();
                if (deleteOnReceive) {
                    String receiptHandle = messages.get(0).getReceiptHandle();
                    messages.forEach(m -> store(m.getBody()));
                    amazonSQS.deleteMessage(new DeleteMessageRequest(sqsQueueUrl, receiptHandle));
                } else {
                    messages.forEach(this::storeMessage);
                }
                if (waitTimeoutinMillis > 0L)
                    Thread.sleep(waitTimeoutinMillis);
            }
            restart("Trying to connect again");
        } catch (IllegalArgumentException | InterruptedException e) {
            restart("Could not connect", e);
        } catch (Throwable e) {
            restart("Error receiving data", e);
        }
    }

    private void storeMessage(Message m) {
        try {
            if (m != null)
                store(mapper.writeValueAsString(m));
        } catch (JsonProcessingException e) {
            logger.error("Unable to write message to streaming context");
        }
    }

    public SQSReceiver withVisibilityTimeOutSeconds(int visibilityTimeOutSeconds) {
        this.visibilityTimeOutSeconds = visibilityTimeOutSeconds;
        return this;
    }

    public SQSReceiver withFetchMaxMessage(int fetchMaxMessage) {
        if (fetchMaxMessage > 10) {
            throw new IllegalArgumentException("FetchMaxMessage can't be greater than 10");
        }
        this.fetchMaxMessage = fetchMaxMessage;
        return this;
    }

    public SQSReceiver withWaitTimeoutinMillis(long waitTimeoutinMillis) {
        this.waitTimeoutinMillis = waitTimeoutinMillis;
        return this;
    }

    public SQSReceiver withRegion(Regions region) {
        this.region = region;
        return this;
    }

    public SQSReceiver withCredential(AWSCredentialsProvider credential) {
        this.credential = credential;
        return this;
    }

    public void deleteMessages(DeleteMessageBatchRequest request) {
        request.withQueueUrl(sqsQueueUrl);
        amazonSQS.deleteMessageBatch(request);
    }
}
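
To stop the job gracefully, create the shutdown-marker file that checkShutdownMarker polls for. A minimal sketch, assuming the default marker path built in main (the bucket placeholder and the scheme choice are assumptions):

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object CreateShutdownMarker {
  def main(args: Array[String]): Unit = {
    // Must match the marker path the streaming job polls for (assumption: default path from main;
    // use "s3://" on EMR and "s3a://" when running locally).
    val marker = "s3a://<bucket>/streaming/shutdownmarker"
    val fs = FileSystem.get(URI.create(marker), new Configuration())
    fs.create(new Path(marker)).close() // write an empty marker file
  }
}

On its next 10-second poll the application sees the marker, calls context.stop(stopSparkContext = true, stopGracefully = true), and exits after the current batch completes.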
