Apache Flink:连接流(ConnectedStreams)中的背压

Apache flink 连通弗林克流中的背压,apache-flink,Apache Flink,我正在试验如何正确传播背压,当我连接了流作为我计算图的一部分时。问题是:我有两个数据源,其中一个比另一个更快地接收数据,我认为我们需要重播一些数据,而其中一个数据源有我们用来丰富另一个数据源的罕见事件。然后将这两个源连接到一个流中,该流期望它们至少在某种程度上是同步的,以某种方式将它们合并在一起(生成元组,丰富,…),并返回结果 对于单输入流,实现背压相当容易,您只需在processElement函数中花费很长时间即可。对于connectedstreams,我最初的想法是在每个ProcessFu

我正在试验:当计算图中包含连接流(ConnectedStreams)时,如何正确地传播背压。问题是:我有两个数据源,其中一个接收数据的速度比另一个快——设想我们需要重放一些数据,而其中一个数据源只有少量罕见事件,我们用这些事件来丰富另一个数据源。随后这两个源被连接成一个流,该流期望它们至少在某种程度上保持同步,以某种方式将它们合并在一起(生成元组、丰富数据……),并返回结果。

对于单输入流,实现背压相当容易:只需在 processElement 函数中花费较长的时间即可。对于 ConnectedStreams,我最初的想法是在每个处理函数中加入一些逻辑,等待另一个流跟上。例如,我可以维护一个时间跨度有限的缓冲区(跨度大到足以容纳一个水印),并且函数不会接受会使该跨度超过阈值的事件。例如:

// Excerpt: buffering of a left-input element while holding the left lock.
leftLock.aquire { nonEmptySignal =>
  // Wait while the buffered time span exceeds the capacity and this element is
  // newer than the newest element already buffered.
  while (queueSpan() > capacity.toMillis && lastTs() < ctx.timestamp()) {
    println("WAITING")
    nonEmptySignal.await()
  }

  // Add the element to the shared queue, tagged as coming from the left input.
  queueOp { queue =>
    println(s"Left Event $value recieved ${Thread.currentThread()}")
    queue.add(Left(value))
  }
  // Register an event-time timer at the element's own timestamp.
  ctx.timerService().registerEventTimeTimer(value.ts)
}
leftLock.aquire{nonEmptySignal=>
while(queueSpan()>capacity.toMillis&&lastTs()
println(s“收到的左事件$value${Thread.currentThread()}”)
queue.add(左(值))
}
ctx.timerService().RegisterEventTimer(value.ts)
}
下面是我的示例的完整代码(它使用了两个锁,这是在假设会有两个不同线程进行访问的前提下编写的,但我认为实际情况并非如此):

import java.util.concurrent.atomic.{AtomicBoolean,AtomicLong}
导入java.util.concurrent.locks.{Condition,ReentrantLock}
导入scala.collection.JavaConverters_
导入com.google.common.collect.MinMaxPriorityQueue
导入org.apache.flink.api.common.state.{ValueState,ValueStateDescriptor}
导入org.apache.flink.api.common.typeinfo.{TypeHint,TypeInformation}
导入org.apache.flink.api.java.utils.ParameterTool
导入org.apache.flink.api.scala_
导入org.apache.flink.configuration.configuration
导入org.apache.flink.streaming.api.TimeCharacteristic
导入org.apache.flink.streaming.api.environment.LocalStreamEnvironment
导入org.apache.flink.streaming.api.functions.co.CoProcessFunction
导入org.apache.flink.streaming.api.functions.source.{RichSourceFunction,SourceFunction}
导入org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
导入org.apache.flink.streaming.api.watermark.watermark
导入org.apache.flink.util.Collector
导入scala.collection.mutable
导入scala.concurrent.duration_
特征时间戳{
长
}
case类StateObject(ts:Long,state:String)扩展了时间戳
case类DataObject(ts:Long,data:String)扩展了时间戳
case类StatefulDataObject(ts:Long,state:Option[String],data:String)扩展了时间戳
类数据源[A](工厂:Long=>A,速率:Int,加速因子:Long=0)扩展了RichSourceFunction[A]{
private val max=new AtomicLong()
private val isRunning=新原子布尔值(false)
私有val加速比=新原子长(0)
专用val水印延迟=5秒
覆盖def cancel():单位={
isRunning.set(false)
}
覆盖def运行(ctx:SourceFunction.SourceContext[A]):单位={
isRunning.set(真)
同时(正在运行.get()){
val time=System.currentTimeMillis()+加速比.addAndGet(加速系数)
val事件=工厂(时间)
ctx.collectWithTimestamp(事件、时间)
println(s“事件$事件来源$加速因子”)
val watermark=时间-WatermarkDelay.toMillis
if(max.get()<水印){
emitWatermark(新水印(time-WatermarkDelay.toMillis))
最大设置(水印)
}
线程睡眠(速率)
}
}
}
类条件运算符{
private val lock=new ReentrantLock()
专用val信号:条件=锁。新条件()
定义获取[B](函数:条件=>B):B={
lock.lock()
试一试{
职能(信号)
}最后{
lock.unlock()
}
}
}
类阻塞协处理函数(容量:FiniteDuration=20秒)
扩展协处理函数[StateObject,DataObject,StatefulDataObject]{
私有类型MergedType=任一[StateObject,DataObject]
private lazy val leftLock=新条件运算符()
private lazy val rightLock=新条件运算符()
私有变量queueState:ValueState[MinMaxPriorityQueue[MergedType]=_
私有var数据状态:ValueState[StateObject]=_
覆盖def open(参数:配置):装置={
super.open(参数)
queueState=getRuntimeContext.getState(新的ValueStateDescriptor[MinMaxPriorityQueue[MergedType]](
“事件队列”,
of(新类型提示[MinMaxPriorityQueue[MergedType]](){})
))
dataState=getRuntimeContext.getState(新的ValueStateDescriptor[StateObject](
“事件状态”,
of(新类型提示[StateObject](){})
))
}
重写def processElement1(值:StateObject,
ctx:协处理函数[StateObject,DataObject,StatefulDataObject]#上下文,
输出:收集器[StatefulDataObject]:单位={
leftLock.aquire{nonEmptySignal=>
while(queueSpan()>capacity.toMillis&&lastTs()
println(s“收到的左事件$value${Thread.currentThread()}”)
queue.add(左(值))
}
ctx.timerService().RegisterEventTimer(value.ts)
}
}
重写def processElement2(值:DataObject,
ctx:协处理函数[StateObject,DataObject,StatefulDataObject]#上下文,
输出:收集器[StatefulDataObject]:单位={
rightLock.aquire{nonEmptySignal=>
while(queueSpan()>capacity.toMillis&&lastTs()
println(s“Right E
import java.util.concurrent.atomic.{AtomicBoolean, AtomicLong}
import java.util.concurrent.locks.{Condition, ReentrantLock}

import scala.collection.JavaConverters._
import com.google.common.collect.MinMaxPriorityQueue
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.api.common.typeinfo.{TypeHint, TypeInformation}
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.api.scala._
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.environment.LocalStreamEnvironment
import org.apache.flink.streaming.api.functions.co.CoProcessFunction
import org.apache.flink.streaming.api.functions.source.{RichSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.watermark.Watermark
import org.apache.flink.util.Collector

import scala.collection.mutable
import scala.concurrent.duration._

/** Common shape of every event in this example: it carries a timestamp. */
trait Timestamped {
  /** Timestamp of the event (the sources in this file derive it from `System.currentTimeMillis()`). */
  val ts: Long
}

/** Event of the first ("state") input: a timestamp plus the state payload. */
case class StateObject(
  ts: Long,
  state: String
) extends Timestamped

/** Event of the second ("data") input: a timestamp plus the data payload. */
case class DataObject(
  ts: Long,
  data: String
) extends Timestamped

/** Output record: a data event combined with an optional state value. */
case class StatefulDataObject(
  ts: Long,
  state: Option[String],
  data: String
) extends Timestamped

/**
 * Synthetic source that produces one event per loop iteration until cancelled.
 *
 * Each iteration takes the current wall-clock time, adds an offset that grows by
 * `speedUpFactor` on every iteration, builds an event from that time via `factory`,
 * emits it with that time as its timestamp, and then sleeps for `rate` milliseconds.
 * A watermark trailing the event time by a fixed delay is emitted whenever it advances.
 *
 * @param factory       builds the event for a given timestamp
 * @param rate          pause between two events, in milliseconds
 * @param speedUpFactor amount added to the accumulated time offset on every iteration
 * @tparam A type of the produced events
 */
class DataSource[A](factory: Long => A, rate: Int, speedUpFactor: Long = 0) extends RichSourceFunction[A] {

  // Largest watermark emitted so far.
  private val max = new AtomicLong()
  // Starts as `true` and is only ever cleared: a cancel() that arrives before run()
  // has started must not be overwritten, otherwise the loop would never terminate.
  private val isRunning = new AtomicBoolean(true)
  // Accumulated artificial offset added on top of the wall clock.
  private val speedUp = new AtomicLong(0)
  private val WatermarkDelay = 5.seconds

  /** Asks the emit loop in [[run]] to stop after its current iteration. */
  override def cancel(): Unit = {
    isRunning.set(false)
  }

  /** Emits events and watermarks until [[cancel]] has been called. */
  override def run(ctx: SourceFunction.SourceContext[A]): Unit = {
    while (isRunning.get()) {
      val time = System.currentTimeMillis() + speedUp.addAndGet(speedUpFactor)
      val event = factory(time)
      ctx.collectWithTimestamp(event, time)
      println(s"Event $event sourced $speedUpFactor")

      // Only emit a watermark when it is strictly greater than the last one emitted.
      val watermark = time - WatermarkDelay.toMillis
      if (max.get() < watermark) {
        ctx.emitWatermark(new Watermark(watermark))
        max.set(watermark)
      }
      Thread.sleep(rate)
    }
  }
}

/**
 * Pairs a reentrant lock with a single condition created from it and runs
 * caller-supplied code while the lock is held.
 */
class ConditionalOperator {
  private val mutex = new ReentrantLock()
  private val condition: Condition = mutex.newCondition()

  /**
   * Runs `func` with the lock held, handing it the lock's condition.
   *
   * The lock is released again whether `func` returns normally or throws.
   *
   * @param func code to run under the lock; receives the condition bound to that lock
   * @tparam B result type of `func`
   * @return whatever `func` returns
   */
  def aquire[B](func: Condition => B): B = {
    mutex.lock()
    try func(condition)
    finally mutex.unlock()
  }
}

/**
 * Two-input function that buffers elements of both inputs in one timestamp-ordered
 * queue and releases them from event-time timers.
 *
 * Elements of either input are added to the queue and a timer is registered at the
 * element's timestamp. Before adding, the calling thread waits while the time span
 * covered by the queue exceeds `capacity` and the new element is newer than the
 * newest buffered one. When a timer fires, all buffered elements with a timestamp
 * up to the timer's timestamp are drained: a [[StateObject]] replaces the stored
 * state, a [[DataObject]] is emitted as a [[StatefulDataObject]] carrying the state
 * stored at that moment (if any).
 *
 * NOTE(review): the waiting elements are only woken by `signal()` calls made in
 * `onTimer`. If the runtime invokes `processElement1`, `processElement2` and
 * `onTimer` from one and the same thread (as the author of this example suspects),
 * a thread blocked in `await()` can never be woken — confirm against the Flink
 * threading model before relying on this.
 *
 * @param capacity maximum time span the buffer may cover before new, newer elements wait
 */
class BlockingCoProcessFunction(capacity: FiniteDuration = 20.seconds)
  extends CoProcessFunction[StateObject, DataObject, StatefulDataObject] {

  private type MergedType = Either[StateObject, DataObject]
  private lazy val leftLock = new ConditionalOperator()
  private lazy val rightLock = new ConditionalOperator()
  // Buffered elements of both inputs, ordered by timestamp.
  private var queueState: ValueState[MinMaxPriorityQueue[MergedType]] = _
  // Most recent StateObject drained from the queue.
  private var dataState: ValueState[StateObject] = _

  /** Obtains the two state handles ("event-queue" and "event-state") from the runtime context. */
  override def open(parameters: Configuration): Unit = {
    super.open(parameters)

    queueState = getRuntimeContext.getState(new ValueStateDescriptor[MinMaxPriorityQueue[MergedType]](
      "event-queue",
      TypeInformation.of(new TypeHint[MinMaxPriorityQueue[MergedType]]() {})
    ))

    dataState = getRuntimeContext.getState(new ValueStateDescriptor[StateObject](
      "event-state",
      TypeInformation.of(new TypeHint[StateObject]() {})
    ))
  }

  /** Buffers an element of the first input under the left lock. */
  override def processElement1(value: StateObject,
                               ctx: CoProcessFunction[StateObject, DataObject, StatefulDataObject]#Context,
                               out: Collector[StatefulDataObject]): Unit = {
    buffer("Left", value, Left(value), leftLock, ctx)
  }

  /** Buffers an element of the second input under the right lock. */
  override def processElement2(value: DataObject,
                               ctx: CoProcessFunction[StateObject, DataObject, StatefulDataObject]#Context,
                               out: Collector[StatefulDataObject]): Unit = {
    buffer("Right", value, Right(value), rightLock, ctx)
  }

  /**
   * Drains every buffered element whose timestamp is at most `timestamp`, holding
   * both locks while doing so, and signals the matching condition per drained element.
   */
  override def onTimer(timestamp: Long,
                       ctx: CoProcessFunction[StateObject, DataObject, StatefulDataObject]#OnTimerContext,
                       out: Collector[StatefulDataObject]): Unit = {
    println(s"Watermarked $timestamp")
    leftLock.aquire { leftSignal =>
      rightLock.aquire { rightSignal =>
        queueOp { queue =>
          while (Option(queue.peekFirst()).exists(x => timestampOf(x) <= timestamp)) {
            queue.poll() match {
              case Left(state) =>
                // A state element replaces the stored state used for later data elements.
                dataState.update(state)
                leftSignal.signal()
              case Right(event) =>
                println(s"Event $event emitted ${Thread.currentThread()}")
                out.collect(
                  StatefulDataObject(
                    event.ts,
                    Option(dataState.value()).map(_.state),
                    event.data
                  )
                )
                rightSignal.signal()
            }
          }
        }
      }
    }
  }

  /**
   * Shared implementation of `processElement1` / `processElement2`.
   *
   * @param side    label used in the log line ("Left" or "Right")
   * @param value   the incoming element, used for logging and for its timestamp
   * @param element the same element wrapped for the merged queue
   * @param lock    lock (and condition) belonging to the input the element came from
   * @param ctx     context of the current element
   */
  private def buffer(side: String,
                     value: Timestamped,
                     element: MergedType,
                     lock: ConditionalOperator,
                     ctx: CoProcessFunction[StateObject, DataObject, StatefulDataObject]#Context): Unit = {
    lock.aquire { nonEmptySignal =>
      // Wait while the buffer is over capacity and this element would extend it further.
      while (queueSpan() > capacity.toMillis && lastTs() < ctx.timestamp()) {
        println("WAITING")
        nonEmptySignal.await()
      }

      queueOp { queue =>
        println(s"$side Event $value recieved ${Thread.currentThread()}")
        queue.add(element)
      }
      ctx.timerService().registerEventTimeTimer(value.ts)
    }
  }

  /**
   * Runs `func` on the buffered queue (creating an empty one if the state holds none),
   * writes the queue back to state afterwards and returns `func`'s result.
   * The whole operation synchronizes on the `queueState` handle.
   */
  private def queueOp[B](func: MinMaxPriorityQueue[MergedType] => B): B = queueState.synchronized {
    val queue = Option(queueState.value()).
      getOrElse(
        MinMaxPriorityQueue.
          orderedBy(Ordering.by((x: MergedType) => timestampOf(x))).create[MergedType]()
      )
    val result = func(queue)
    queueState.update(queue)
    result
  }

  /** Timestamp of a buffered element, regardless of which input it came from. */
  private def timestampOf(data: MergedType): Long = data match {
    case Left(y) =>
      y.ts
    case Right(y) =>
      y.ts
  }

  /**
   * Difference between the timestamps of the last and the first buffered element.
   *
   * An empty queue has span 0. The sentinel-based computation used previously
   * (`Long.MinValue - Long.MaxValue`) overflowed to 1 for an empty queue.
   */
  private def queueSpan(): Long = {
    queueOp { queue =>
      (Option(queue.peekFirst()), Option(queue.peekLast())) match {
        case (Some(first), Some(last)) =>
          val firstTs = timestampOf(first)
          val lastTs = timestampOf(last)
          println(s"Span: $firstTs - $lastTs = ${lastTs - firstTs}")
          lastTs - firstTs
        case _ =>
          println("Span: empty queue = 0")
          0L
      }
    }
  }

  /** Timestamp of the last buffered element, or `Long.MinValue` when nothing is buffered. */
  private def lastTs(): Long = {
    queueOp { queue =>
      Option(queue.peekLast()).map(timestampOf).getOrElse(Long.MinValue)
    }
  }
}

/**
 * Entry point of the experiment: wires a slow state source and a fast data source
 * into a [[BlockingCoProcessFunction]] on a local environment and prints the result.
 */
object BackpressureTest {

  // Not referenced anywhere in the visible code.
  var data = new mutable.ArrayBuffer[DataObject]()

  /**
   * Builds and runs the job with event time and parallelism 1.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val localEnv = new LocalStreamEnvironment(new Configuration())
    val env = new StreamExecutionEnvironment(localEnv)

    env.getConfig.disableSysoutLogging()
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // State events: one per 1000 ms, no artificial time offset.
    val stateEvents = env.addSource(new DataSource(ts => StateObject(ts, ts.toString), 1000))
    // Data events: one per 100 ms, with a time offset growing by 100 per event.
    val dataEvents = env.addSource(new DataSource(ts => DataObject(ts, ts.toString), 100, 100))

    // Both inputs are keyed by the same constant key.
    val enriched = stateEvents
      .connect(dataEvents)
      .keyBy(_ => "", _ => "")
      .process(new BlockingCoProcessFunction())
    enriched.print()

    env.execute()
  }
}