Apache spark 启用检查点的Spark streaming SQS
我浏览过多个网站,比如Apache spark 启用检查点的Spark streaming SQS,apache-spark,spark-streaming,spark-checkpoint,Apache Spark,Spark Streaming,Spark Checkpoint,我浏览过多个网站,比如 一些链接讨论了我们如何编码,但它太抽象了,我需要花很多时间来弄清楚它到底是如何工作的经过长时间的斗争,我能够使用检查点设置流式代码,添加到这里以帮助其他人 import java.util.concurrent.Executors import com.amazonaws.auth.DefaultAWSCredentialsProviderChain import com.amazonaws.regions.Regions import com.amazonaws
一些链接讨论了我们如何编码,但它太抽象了,我需要花很多时间来弄清楚它到底是如何工作的经过长时间的斗争,我能够使用检查点设置流式代码,添加到这里以帮助其他人
import java.util.concurrent.Executors
import com.amazonaws.auth.DefaultAWSCredentialsProviderChain
import com.amazonaws.regions.Regions
import com.amazonaws.services.sqs.model.Message
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.log4j.LogManager
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Duration, Seconds, StreamingContext}
object StreamingApp extends scala.Serializable {
@transient private final val mapper = new ObjectMapper
@transient private final val LOG = LogManager.getLogger(getClass)
@transient private final val executor = Executors.newFixedThreadPool(Runtime.getRuntime.availableProcessors)
var s3 = "s3"
private var shutdownMarker: String = _
private var stopFlag: Boolean = false
def main(args: Array[String]): Unit = {
val queueName = args(0)
val region = args(1)
val fetchMaxMessage = args(2).toInt
val visibilityTimeOutSeconds = args(3).toInt
val waitTimeoutInMillis = args(4).toLong
val isLocal = args(5).toBoolean
val bucket = args(6)
if (args.length >= 10)
shutdownMarker = args(9)
val sparkConf = initialiseSparkConf(isLocal)
sparkConf.set(Constants.QUEUE_NAME, queueName)
sparkConf.set(Constants.REGION, region)
sparkConf.set(Constants.FETCH_MAX_MESSAGE, fetchMaxMessage.toString)
sparkConf.set(Constants.VISIBILITY_TIMEOUT_SECONDS, visibilityTimeOutSeconds.toString)
sparkConf.set(Constants.WAIT_TIMEOUT_IN_MILLIS, waitTimeoutInMillis.toString)
shutdownMarker = s"$s3://$bucket/streaming/shutdownmarker"
val checkpointDirectory = s"$s3://$bucket/streaming/checkpoint/"
var context: StreamingContext = null
try {
context = StreamingContext.getOrCreate(checkpointDirectory, () => createContext(sparkConf, waitTimeoutInMillis, checkpointDirectory, args))
context.start
val checkIntervalMillis = 10000
var isStopped = false
while (!isStopped) {
println("calling awaitTerminationOrTimeout")
isStopped = context.awaitTerminationOrTimeout(checkIntervalMillis)
if (isStopped)
println("confirmed! The streaming context is stopped. Exiting application...")
checkShutdownMarker(context.sparkContext)
if (!isStopped && stopFlag) {
println("stopping ssc right now")
context.stop(stopSparkContext = true, stopGracefully = true)
println("ssc is stopped!!!!!!!")
}
}
}
finally {
LOG.info("Exiting the Application")
if (context != null && org.apache.spark.streaming.StreamingContextState.STOPPED != context.getState) {
context.stop(stopSparkContext = true, stopGracefully = true)
}
if (!executor.isShutdown)
executor.shutdown()
}
}
def checkShutdownMarker(sparkContext: SparkContext): Unit = {
if (!stopFlag) {
stopFlag = isFileExists(shutdownMarker, sparkContext)
}
println(s"Stop marker $shutdownMarker file found: $stopFlag at time ${System.currentTimeMillis()}")
}
def isFileExists(path: String, sparkContext: SparkContext): Boolean = {
isValidPath(isDir = false, path, getFileSystem(path,sparkContext))
}
def getFileSystem(path: String, sparkContext: SparkContext): FileSystem = {
FileSystem.get(URI.create(path), sparkContext.hadoopConfiguration)
}
def isValidPath(isDir: Boolean, path: String, fileSystem: FileSystem): Boolean = {
LOG.info("Validating path {}", path)
if (path.startsWith(Constants.S3) || path.startsWith(Constants.HDFS) || path.startsWith(Constants.FILE)) {
val fsPath = new Path(path)
if (isDir) {
fileSystem isDirectory fsPath
} else {
fileSystem isFile fsPath
}
} else {
Files.exists(Paths.get(path))
}
}
def createContext(sparkConf: SparkConf, waitTimeoutInMillis: Long, checkpointDirectory: String, args: Array[String]): StreamingContext = {
val context = new StreamingContext(sparkConf, Duration(waitTimeoutInMillis + 1000))
processMessage(context, args)
context.checkpoint(checkpointDirectory) // set checkpoint directory
context
}
def processMessage(context: StreamingContext, args: Array[String]): Unit = {
val bucket = args(6)
val wgPath = args(7)
var stagingPath = args(8)
val waitTimeoutInMillis = args(4).toLong
if (context != null) {
if (!stagingPath.endsWith("/")) {
stagingPath = s"$stagingPath/"
}
val outputPrefix = s"$s3://$bucket/$stagingPath"
LOG.info(s"Number of cores for driver: ${Runtime.getRuntime.availableProcessors}")
val sparkContext: SparkContext = context.sparkContext
val broadcasts = BroadCaster.getInstance(sparkContext, s"$s3://$bucket/$wgPath")
val input = context.receiverStream(broadcasts(Constants.SQS_RECEIVER).value.asInstanceOf[SQSReceiver])
//input.checkpoint(interval = Seconds(60))
LOG.info(s"Scheduling mode ${sparkContext.getSchedulingMode.toString}")
input.foreachRDD(r => {
val sparkSession = SparkSession.builder.config(r.sparkContext.getConf).getOrCreate()
val messages = r.collect().map(message => mapper.readValue(message, classOf[Message]))
val broadcasts = BroadCaster.getInstance(r.sparkContext, s"$s3://$bucket/$wgPath")
//Application logic
})
}
}
def initialiseSparkConf(local: Boolean): SparkConf = {
val sparkConf = new SparkConf()
.setAppName("Spark Streaming")
.set("spark.scheduler.mode", "FAIR")
.set("spark.sql.parquet.filterpushdown", "true")
.set("spark.executor.hearbeatInterval", "20")
.set("spark.streaming.driver.writeAheadLog.closeFileAfterWrite", "true")
.set("spark.streaming.receiver.writeAheadLog.closeFileAfterWrite", "true")
.set("spark.streaming.receiver.writeAheadLog.enable", "true")
.set("spark.streaming.stopGracefullyOnShutdown", "true")
.set("spark.streaming.backpressure.enabled","true")
.set("spark.streaming.backpressure.pid.minRate","10") //SQS support batch of 10
if (local) {
s3 = "s3a"
sparkConf.setMaster("local[*]")
} else {
sparkConf.set("hive.metastore.client.factory.class",
"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory")
}
}
}
object BroadCaster {
@volatile private var instance: Map[String, Broadcast[Any]] = _
def getInstance(sparkContext: SparkContext, wgPath: String): Map[String, Broadcast[Any]] = {
if (instance == null) {
synchronized {
if (instance == null) {
instance = Utils.createBroadcastObjects(wgPath, sparkContext)
instance += (Constants.SQS_RECEIVER -> sparkContext.broadcast(getSQSReceiver(sparkContext.getConf)))
}
}
}
instance
}
private def getSQSReceiver(conf: SparkConf): SQSReceiver = {
val javaSQSReceiver = new SQSReceiver(conf.get(Constants.QUEUE_NAME)).withRegion(Regions.fromName(conf.get(Constants.REGION))).withCredential(new DefaultAWSCredentialsProviderChain())
.withFetchMaxMessage(conf.getInt(Constants.FETCH_MAX_MESSAGE, 10)).withVisibilityTimeOutSeconds(conf.getInt(Constants.VISIBILITY_TIMEOUT_SECONDS, 1800)).withWaitTimeoutinMillis(conf.getLong(Constants.WAIT_TIMEOUT_IN_MILLIS, 1000))
javaSQSReceiver
}
}
导入java.util.List;
导入org.apache.log4j.Logger;
导入org.apache.spark.storage.StorageLevel;
导入org.apache.spark.streaming.receiver.receiver;
导入com.amazonaws.auth.AWSCredentialsProvider;
导入com.amazonaws.regions.regions;
导入com.amazonaws.services.sqs.AmazonSQS;
导入com.amazonaws.services.sqs.AmazonSQSClientBuilder;
导入com.amazonaws.services.sqs.model.DeleteMessageBatchRequest;
导入com.amazonaws.services.sqs.model.DeleteMessageRequest;
导入com.amazonaws.services.sqs.model.Message;
导入com.amazonaws.services.sqs.model.ReceiveMessageRequest;
导入com.fasterxml.jackson.core.JsonProcessingException;
导入com.fasterxml.jackson.databind.ObjectMapper;
公共类SQSReceiver扩展了接收方{
私有字符串队列名称;
私人临时AWSCredentialsProvider凭证;
私人区域=Regions.US\u EAST\u 1;
私用长时间waitTimeoutinMillis=0L;
私有对象映射器映射器=新对象映射器();
私有瞬态记录器=Logger.getLogger(SQSReceiver.class);
私有布尔deleteonReceive=false;
private int fetchMaxMessage=100;
私有int可视性timeoutseconds=60;
私有字符串sqqueueurl;
私有瞬态AmazonSQS AmazonSQS;
公共SQSReceiver(字符串队列名称){
此(队列名称,false);
}
公共SQSReceiver(字符串queueName,布尔DeleteOnReceive){
super(StorageLevel.MEMORY_和_DISK_SER());
this.queueName=队列名称;
this.deleteonReceive=deleteonReceive;
setupsq(队列名称);
}
专用无效设置SQS(字符串队列名称){
AmazonSQSClientBuilder AmazonSQSClientBuilder=AmazonSQSClientBuilder.standard();
如果(凭证!=null){
amazonSQSClientBuilder.withCredentials(凭证);
}
amazonSQSClientBuilder.withRegion(地区);
amazonSQS=amazonSQSClientBuilder.build();
sqsqueurl=amazonSQS.getQueueUrl(queueName.getQueueUrl();
}
public void onStart(){
新线程(this::receive.start();
}
公共void onStop(){
//除了调用receive()的线程之外,没有什么可做的
//被设计为在isStopped()返回false时自行停止
}
私有无效接收(){
试一试{
setupsq(队列名称);
ReceiveMessageRequest ReceiveMessageRequest=新的ReceiveMessageRequest(sqsQueueUrl)。withMaxNumberOfMessages(fetchMaxMessage)。withVisibilityTimeout(visibilityTimeOutSeconds)
。withWaitTimeSeconds(20)//https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/examples-sqs-long-polling.html
receiveMessagesFromSQS(amazonSQS、sqsQueueUrl、receiveMessageRequest);
}捕获(可丢弃的e){
停止(“初始化时遇到错误”,e);
}
}
private void receivemessages fromsqs(最终的AmazonSQS AmazonSQS,最终的字符串sqsqueurl,
ReceiveMessageRequest(接收消息请求){
试一试{
而(!isStopped()){
List messages=amazonSQS.receiveMessage(receiveMessageRequest.getMessages();
如果(删除接收){
字符串receiptHandle=messages.get(0).getReceiptHandle();
messages.forEach(m->store(m.getBody());
deleteMessage(新的DeleteMessageRequest(sqsqueUrl,receiptHandle));
}否则{
messages.forEach(this::storeMessage);
}
如果(waitTimeoutinMillis>0L)
睡眠(waitTimeoutinMillis);
}
重新启动(“尝试再次连接”);
}捕获(IllegalArgumentException | InterruptedException e){
重新启动(“无法连接”,e);
}捕获(可丢弃的e){
重新启动(“接收数据错误”,e);
}
}
私有消息(消息m){
试一试{
如果(m!=null)
存储(mapper.writeValueAsString(m));
}捕获(JsonProcessingException e){
logger.error(“无法将消息写入流上下文”);
}
}
具有visibilityTimeOutSeconds(int visibilityTimeOutSeconds)的公共SQS接收器{
this.visibilityTimeOutSeconds=visibilityTimeOutSeconds;
归还这个;
}
带有fetchMaxMessage(int fetchMaxMessage)的公共SQS接收器{
如果(fetchMaxMessage>10){
抛出新的IllegalArgumentException(“FetchMaxMessage不能大于10”);
}
this.fetchMaxMessage=fetchMaxMessage;
归还这个;
}
带有waitTimeoutinMillis(长waitTimeoutinMillis)的公共SQS接收器{
this.waitTimeoutinMillis=waitTimeoutinMillis;
归还这个;
}
带区域的公共SQS接收器(区域){
这个区域=区域;
归还这个;
}
带有凭证的公共SQS接收器(AWSCredentialsProvider凭证){
this.credential=凭证;
归还这个;
}
public void deleteMessages(DeleteMessageBatchRequest请求){
请求。withQueueUrl(sqqueueURL);
amazonSQS.deleteMessageBatch(请求);
}
}
经过长时间的斗争,我能够使用checkpoint设置流式代码,在这里添加代码以帮助其他人
import java.util.concurrent.Executors
import com.amazonaws.auth.DefaultAWSCredentialsProviderChain
import com.amazonaws.regions.Regions
import com.amazonaws.services.sqs.model.Message
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.log4j.LogManager
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Duration, Seconds, StreamingContext}
object StreamingApp extends scala.Serializable {
@transient private final val mapper = new ObjectMapper
@transient private final val LOG = LogManager.getLogger(getClass)
@transient private final val executor = Executors.newFixedThreadPool(Runtime.getRuntime.availableProcessors)
var s3 = "s3"
private var shutdownMarker: String = _
private var stopFlag: Boolean = false
def main(args: Array[String]): Unit = {
val queueName = args(0)
val region = args(1)
val fetchMaxMessage = args(2).toInt
val visibilityTimeOutSeconds = args(3).toInt
val waitTimeoutInMillis = args(4).toLong
val isLocal = args(5).toBoolean
val bucket = args(6)
if (args.length >= 10)
shutdownMarker = args(9)
val sparkConf = initialiseSparkConf(isLocal)
sparkConf.set(Constants.QUEUE_NAME, queueName)
sparkConf.set(Constants.REGION, region)
sparkConf.set(Constants.FETCH_MAX_MESSAGE, fetchMaxMessage.toString)
sparkConf.set(Constants.VISIBILITY_TIMEOUT_SECONDS, visibilityTimeOutSeconds.toString)
sparkConf.set(Constants.WAIT_TIMEOUT_IN_MILLIS, waitTimeoutInMillis.toString)
shutdownMarker = s"$s3://$bucket/streaming/shutdownmarker"
val checkpointDirectory = s"$s3://$bucket/streaming/checkpoint/"
var context: StreamingContext = null
try {
context = StreamingContext.getOrCreate(checkpointDirectory, () => createContext(sparkConf, waitTimeoutInMillis, checkpointDirectory, args))
context.start
val checkIntervalMillis = 10000
var isStopped = false
while (!isStopped) {
println("calling awaitTerminationOrTimeout")
isStopped = context.awaitTerminationOrTimeout(checkIntervalMillis)
if (isStopped)
println("confirmed! The streaming context is stopped. Exiting application...")
checkShutdownMarker(context.sparkContext)
if (!isStopped && stopFlag) {
println("stopping ssc right now")
context.stop(stopSparkContext = true, stopGracefully = true)
println("ssc is stopped!!!!!!!")
}
}
}
finally {
LOG.info("Exiting the Application")
if (context != null && org.apache.spark.streaming.StreamingContextState.STOPPED != context.getState) {
context.stop(stopSparkContext = true, stopGracefully = true)
}
if (!executor.isShutdown)
executor.shutdown()
}
}
def checkShutdownMarker(sparkContext: SparkContext): Unit = {
if (!stopFlag) {
stopFlag = isFileExists(shutdownMarker, sparkContext)
}
println(s"Stop marker $shutdownMarker file found: $stopFlag at time ${System.currentTimeMillis()}")
}
def isFileExists(path: String, sparkContext: SparkContext): Boolean = {
isValidPath(isDir = false, path, getFileSystem(path,sparkContext))
}
def getFileSystem(path: String, sparkContext: SparkContext): FileSystem = {
FileSystem.get(URI.create(path), sparkContext.hadoopConfiguration)
}
def isValidPath(isDir: Boolean, path: String, fileSystem: FileSystem): Boolean = {
LOG.info("Validating path {}", path)
if (path.startsWith(Constants.S3) || path.startsWith(Constants.HDFS) || path.startsWith(Constants.FILE)) {
val fsPath = new Path(path)
if (isDir) {
fileSystem isDirectory fsPath
} else {
fileSystem isFile fsPath
}
} else {
Files.exists(Paths.get(path))
}
}
def createContext(sparkConf: SparkConf, waitTimeoutInMillis: Long, checkpointDirectory: String, args: Array[String]): StreamingContext = {
val context = new StreamingContext(sparkConf, Duration(waitTimeoutInMillis + 1000))
processMessage(context, args)
context.checkpoint(checkpointDirectory) // set checkpoint directory
context
}
def processMessage(context: StreamingContext, args: Array[String]): Unit = {
val bucket = args(6)
val wgPath = args(7)
var stagingPath = args(8)
val waitTimeoutInMillis = args(4).toLong
if (context != null) {
if (!stagingPath.endsWith("/")) {
stagingPath = s"$stagingPath/"
}
val outputPrefix = s"$s3://$bucket/$stagingPath"
LOG.info(s"Number of cores for driver: ${Runtime.getRuntime.availableProcessors}")
val sparkContext: SparkContext = context.sparkContext
val broadcasts = BroadCaster.getInstance(sparkContext, s"$s3://$bucket/$wgPath")
val input = context.receiverStream(broadcasts(Constants.SQS_RECEIVER).value.asInstanceOf[SQSReceiver])
//input.checkpoint(interval = Seconds(60))
LOG.info(s"Scheduling mode ${sparkContext.getSchedulingMode.toString}")
input.foreachRDD(r => {
val sparkSession = SparkSession.builder.config(r.sparkContext.getConf).getOrCreate()
val messages = r.collect().map(message => mapper.readValue(message, classOf[Message]))
val broadcasts = BroadCaster.getInstance(r.sparkContext, s"$s3://$bucket/$wgPath")
//Application logic
})
}
}
def initialiseSparkConf(local: Boolean): SparkConf = {
val sparkConf = new SparkConf()
.setAppName("Spark Streaming")
.set("spark.scheduler.mode", "FAIR")
.set("spark.sql.parquet.filterpushdown", "true")
.set("spark.executor.hearbeatInterval", "20")
.set("spark.streaming.driver.writeAheadLog.closeFileAfterWrite", "true")
.set("spark.streaming.receiver.writeAheadLog.closeFileAfterWrite", "true")
.set("spark.streaming.receiver.writeAheadLog.enable", "true")
.set("spark.streaming.stopGracefullyOnShutdown", "true")
.set("spark.streaming.backpressure.enabled","true")
.set("spark.streaming.backpressure.pid.minRate","10") //SQS support batch of 10
if (local) {
s3 = "s3a"
sparkConf.setMaster("local[*]")
} else {
sparkConf.set("hive.metastore.client.factory.class",
"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory")
}
}
}
object BroadCaster {
@volatile private var instance: Map[String, Broadcast[Any]] = _
def getInstance(sparkContext: SparkContext, wgPath: String): Map[String, Broadcast[Any]] = {
if (instance == null) {
synchronized {
if (instance == null) {
instance = Utils.createBroadcastObjects(wgPath, sparkContext)
instance += (Constants.SQS_RECEIVER -> sparkContext.broadcast(getSQSReceiver(sparkContext.getConf)))
}
}
}
instance
}
private def getSQSReceiver(conf: SparkConf): SQSReceiver = {
val javaSQSReceiver = new SQSReceiver(conf.get(Constants.QUEUE_NAME)).withRegion(Regions.fromName(conf.get(Constants.REGION))).withCredential(new DefaultAWSCredentialsProviderChain())
.withFetchMaxMessage(conf.getInt(Constants.FETCH_MAX_MESSAGE, 10)).withVisibilityTimeOutSeconds(conf.getInt(Constants.VISIBILITY_TIMEOUT_SECONDS, 1800)).withWaitTimeoutinMillis(conf.getLong(Constants.WAIT_TIMEOUT_IN_MILLIS, 1000))
javaSQSReceiver
}
}
导入java.util.List;
导入org.apache.log4j.Logger;
导入org.apache.spark.storage.StorageLevel;
导入org.apache.spark.streaming.receiver.receiver;
导入com.amazonaws.auth.AWSCredentialsProvider;
导入com.amazonaws.regions.regions;
导入com.amazonaws.services.sqs.AmazonSQS;
导入com.a