Apache Flink:连接流(connected streams)中的背压
我正在试验:当计算图中包含连接流(connected streams)时,如何正确地传播背压。问题是:我有两个数据源,其中一个接收数据的速度比另一个快(可以设想我们需要重放一部分数据的情况),而其中一个数据源只有一些罕见事件,我们用这些事件来丰富另一个数据源。然后这两个源被连接成一个流,该流期望它们至少在某种程度上保持同步,以某种方式将它们合并在一起(生成元组、丰富数据……),并返回结果。对于单输入流,实现背压相当容易:只需让 processElement 函数执行很长时间即可。对于 ConnectedStreams,我最初的想法是在每个处理函数中加入一些逻辑,等待另一个流跟上。例如,我可以维护一个时间跨度有限的缓冲区(大到足以容纳一个水印间隔),并且函数不会接受会使该跨度超过阈值的事件。例如:Apache Flink:连接流(connected streams)中的背压,apache-flink,Apache Flink,我正在试验:当计算图中包含连接流(connected streams)时,如何正确地传播背压。问题是:我有两个数据源,其中一个接收数据的速度比另一个快(可以设想我们需要重放一部分数据的情况),而其中一个数据源只有一些罕见事件,我们用这些事件来丰富另一个数据源。然后这两个源被连接成一个流,该流期望它们至少在某种程度上保持同步,以某种方式将它们合并在一起(生成元组、丰富数据……),并返回结果。对于单输入流,实现背压相当容易:只需让 processElement 函数执行很长时间即可。对于 ConnectedStreams,我最初的想法是在每个ProcessFu
// Excerpt of the processElement1 body: the whole admission decision runs while holding
// the left input's lock; `nonEmptySignal` is the Condition handed in by `aquire`.
leftLock.aquire { nonEmptySignal =>
// Keep waiting while the buffered span exceeds the capacity AND the newest buffered
// timestamp is older than the timestamp of the element being processed.
while (queueSpan() > capacity.toMillis && lastTs() < ctx.timestamp()) {
println("WAITING")
// Parks on the lock's Condition until some other code path calls signal() on it.
nonEmptySignal.await()
}
// Buffer the element, tagged as coming from the left (first) input.
queueOp { queue =>
println(s"Left Event $value recieved ${Thread.currentThread()}")
queue.add(Left(value))
}
// Ask for an event-time timer at the element's own timestamp.
ctx.timerService().registerEventTimeTimer(value.ts)
}
leftLock.aquire{nonEmptySignal=>
while(queueSpan()>capacity.toMillis&&lastTs()
println(s“收到的左事件$value${Thread.currentThread()}”)
queue.add(左(值))
}
ctx.timerService().RegisterEventTimer(value.ts)
}
下面是我的示例的完整代码(代码使用了两个锁,这是基于两个输入会被两个不同线程访问的假设编写的,但我认为实际情况并非如此):
import java.util.concurrent.atomic.{AtomicBoolean,AtomicLong}
导入java.util.concurrent.locks.{Condition,ReentrantLock}
导入scala.collection.JavaConverters_
导入com.google.common.collect.MinMaxPriorityQueue
导入org.apache.flink.api.common.state.{ValueState,ValueStateDescriptor}
导入org.apache.flink.api.common.typeinfo.{TypeHint,TypeInformation}
导入org.apache.flink.api.java.utils.ParameterTool
导入org.apache.flink.api.scala_
导入org.apache.flink.configuration.configuration
导入org.apache.flink.streaming.api.TimeCharacteristic
导入org.apache.flink.streaming.api.environment.LocalStreamEnvironment
导入org.apache.flink.streaming.api.functions.co.CoProcessFunction
导入org.apache.flink.streaming.api.functions.source.{RichSourceFunction,SourceFunction}
导入org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
导入org.apache.flink.streaming.api.watermark.watermark
导入org.apache.flink.util.Collector
导入scala.collection.mutable
导入scala.concurrent.duration_
特征时间戳{
长
}
case类StateObject(ts:Long,state:String)扩展了时间戳
case类DataObject(ts:Long,data:String)扩展了时间戳
case类StatefulDataObject(ts:Long,state:Option[String],data:String)扩展了时间戳
类数据源[A](工厂:Long=>A,速率:Int,加速因子:Long=0)扩展了RichSourceFunction[A]{
private val max=new AtomicLong()
private val isRunning=新原子布尔值(false)
私有val加速比=新原子长(0)
专用val水印延迟=5秒
覆盖def cancel():单位={
isRunning.set(false)
}
覆盖def运行(ctx:SourceFunction.SourceContext[A]):单位={
isRunning.set(真)
同时(正在运行.get()){
val time=System.currentTimeMillis()+加速比.addAndGet(加速系数)
val事件=工厂(时间)
ctx.collectWithTimestamp(事件、时间)
println(s“事件$事件来源$加速因子”)
val watermark=时间-WatermarkDelay.toMillis
if(max.get()<水印){
emitWatermark(新水印(time-WatermarkDelay.toMillis))
最大设置(水印)
}
线程睡眠(速率)
}
}
}
类条件运算符{
private val lock=new ReentrantLock()
专用val信号:条件=锁。新条件()
定义获取[B](函数:条件=>B):B={
lock.lock()
试一试{
职能(信号)
}最后{
lock.unlock()
}
}
}
类阻塞协处理函数(容量:FiniteDuration=20秒)
扩展协处理函数[StateObject,DataObject,StatefulDataObject]{
私有类型MergedType=任一[StateObject,DataObject]
private lazy val leftLock=新条件运算符()
private lazy val rightLock=新条件运算符()
私有变量queueState:ValueState[MinMaxPriorityQueue[MergedType]=_
私有var数据状态:ValueState[StateObject]=_
覆盖def open(参数:配置):装置={
super.open(参数)
queueState=getRuntimeContext.getState(新的ValueStateDescriptor[MinMaxPriorityQueue[MergedType]](
“事件队列”,
of(新类型提示[MinMaxPriorityQueue[MergedType]](){})
))
dataState=getRuntimeContext.getState(新的ValueStateDescriptor[StateObject](
“事件状态”,
of(新类型提示[StateObject](){})
))
}
重写def processElement1(值:StateObject,
ctx:协处理函数[StateObject,DataObject,StatefulDataObject]#上下文,
输出:收集器[StatefulDataObject]:单位={
leftLock.aquire{nonEmptySignal=>
while(queueSpan()>capacity.toMillis&&lastTs()
println(s“收到的左事件$value${Thread.currentThread()}”)
queue.add(左(值))
}
ctx.timerService().RegisterEventTimer(value.ts)
}
}
重写def processElement2(值:DataObject,
ctx:协处理函数[StateObject,DataObject,StatefulDataObject]#上下文,
输出:收集器[StatefulDataObject]:单位={
rightLock.aquire{nonEmptySignal=>
while(queueSpan()>capacity.toMillis&&lastTs()
println(s“Right E
import java.util.concurrent.atomic.{AtomicBoolean, AtomicLong}
import java.util.concurrent.locks.{Condition, ReentrantLock}
import scala.collection.JavaConverters._
import com.google.common.collect.MinMaxPriorityQueue
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.api.common.typeinfo.{TypeHint, TypeInformation}
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.api.scala._
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.environment.LocalStreamEnvironment
import org.apache.flink.streaming.api.functions.co.CoProcessFunction
import org.apache.flink.streaming.api.functions.source.{RichSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.watermark.Watermark
import org.apache.flink.util.Collector
import scala.collection.mutable
import scala.concurrent.duration._
/** Anything that carries an event timestamp (the sources in this file use epoch milliseconds). */
trait Timestamped {
  // Declared as `def` rather than an abstract `val`: implementors may still use a `val`
  // (the case classes below do), but the trait no longer depends on initialization order.
  def ts: Long
}

/** Rarely changing state event used to enrich data events. */
final case class StateObject(ts: Long, state: String) extends Timestamped

/** Plain data event that is to be enriched. */
final case class DataObject(ts: Long, data: String) extends Timestamped

/** A data event combined with the state that was current when it was emitted, if any. */
final case class StatefulDataObject(ts: Long, state: Option[String], data: String) extends Timestamped
/**
 * Test source that emits one event per loop iteration, stamped with a synthetic event time.
 *
 * @param factory       builds the event for a given timestamp
 * @param rate          pause between two events, in milliseconds (passed to `Thread.sleep`)
 * @param speedUpFactor amount added to the accumulated time offset on every iteration, so a
 *                      non-zero value makes this source's event time run ahead of wall-clock time
 */
class DataSource[A](factory: Long => A, rate: Int, speedUpFactor: Long = 0) extends RichSourceFunction[A] {
  // Highest watermark emitted so far; watermarks are only emitted when they advance.
  private val max = new AtomicLong()
  // Starts as `true` and is only ever cleared by cancel(): re-arming the flag inside run()
  // would silently discard a cancel() that arrives before run() starts.
  private val isRunning = new AtomicBoolean(true)
  // Accumulated offset added to the wall clock.
  private val speedUp = new AtomicLong(0)
  // Method-call syntax instead of the postfix form `5 seconds`, which needs
  // `scala.language.postfixOps` and is fragile at the end of a line.
  private val WatermarkDelay = 5.seconds

  override def cancel(): Unit = {
    isRunning.set(false)
  }

  override def run(ctx: SourceFunction.SourceContext[A]): Unit = {
    while (isRunning.get()) {
      val time = System.currentTimeMillis() + speedUp.addAndGet(speedUpFactor)
      val event = factory(time)
      ctx.collectWithTimestamp(event, time)
      println(s"Event $event sourced $speedUpFactor")
      // The watermark trails the emitted timestamp by WatermarkDelay.
      val watermark = time - WatermarkDelay.toMillis
      if (max.get() < watermark) {
        ctx.emitWatermark(new Watermark(watermark))
        max.set(watermark)
      }
      Thread.sleep(rate)
    }
  }
}
/**
 * Pairs a [[java.util.concurrent.locks.ReentrantLock]] with one
 * [[java.util.concurrent.locks.Condition]] created from it, and only exposes them
 * through a scoped "run while locked" operation.
 */
class ConditionalOperator {
  private val mutex = new ReentrantLock()
  private val condition: Condition = mutex.newCondition()

  /**
   * Runs `func` while holding the lock and hands it the lock's condition.
   *
   * The lock is released when `func` finishes, whether it returns or throws.
   *
   * @param func code to run under the lock; receives the condition to await or signal on
   * @return whatever `func` returns
   */
  def aquire[B](func: Condition => B): B = {
    mutex.lock()
    try func(condition)
    finally mutex.unlock()
  }
}
/**
 * Two-input function that buffers state events (input 1) and data events (input 2) in one
 * timestamp-ordered queue kept in keyed state, and drains that queue when event-time timers
 * fire: state events replace the current state, data events are emitted enriched with it.
 *
 * Before buffering an element, each input waits while the queue spans more than `capacity`
 * of event time and the element is newer than the newest buffered one.
 *
 * NOTE(review): Flink documents that `processElement` and `onTimer` calls are synchronized
 * with each other. If they also run on the same task thread, a caller parked in `await()`
 * below can never be woken by `onTimer` and the operator stalls for good. Confirm before
 * relying on this for back-pressure.
 *
 * @param capacity maximum event-time span the buffer may cover before inputs start waiting
 */
class BlockingCoProcessFunction(capacity: FiniteDuration = 20.seconds)
  extends CoProcessFunction[StateObject, DataObject, StatefulDataObject] {

  private type MergedType = Either[StateObject, DataObject]
  private type ElementContext = CoProcessFunction[StateObject, DataObject, StatefulDataObject]#Context

  // ConditionalOperator is not Serializable; @transient keeps the lock holders out of the
  // serialized function, and `lazy` re-creates them on first use.
  @transient private lazy val leftLock = new ConditionalOperator()
  @transient private lazy val rightLock = new ConditionalOperator()

  private var queueState: ValueState[MinMaxPriorityQueue[MergedType]] = _
  private var dataState: ValueState[StateObject] = _

  override def open(parameters: Configuration): Unit = {
    super.open(parameters)
    queueState = getRuntimeContext.getState(new ValueStateDescriptor[MinMaxPriorityQueue[MergedType]](
      "event-queue",
      TypeInformation.of(new TypeHint[MinMaxPriorityQueue[MergedType]]() {})
    ))
    dataState = getRuntimeContext.getState(new ValueStateDescriptor[StateObject](
      "event-state",
      TypeInformation.of(new TypeHint[StateObject]() {})
    ))
  }

  override def processElement1(value: StateObject,
                               ctx: CoProcessFunction[StateObject, DataObject, StatefulDataObject]#Context,
                               out: Collector[StatefulDataObject]): Unit =
    enqueue(leftLock, "Left", Left(value), ctx)

  override def processElement2(value: DataObject,
                               ctx: CoProcessFunction[StateObject, DataObject, StatefulDataObject]#Context,
                               out: Collector[StatefulDataObject]): Unit =
    enqueue(rightLock, "Right", Right(value), ctx)

  override def onTimer(timestamp: Long,
                       ctx: CoProcessFunction[StateObject, DataObject, StatefulDataObject]#OnTimerContext,
                       out: Collector[StatefulDataObject]): Unit = {
    println(s"Watermarked $timestamp")
    // Both locks are held (left first, then right) because draining signals both conditions.
    leftLock.aquire { leftSignal =>
      rightLock.aquire { rightSignal =>
        queueOp { queue =>
          // Drain every buffered element whose timestamp is not after the firing timer.
          while (Option(queue.peekFirst()).exists(x => timestampOf(x) <= timestamp)) {
            queue.poll() match {
              case Left(state) =>
                dataState.update(state)
                leftSignal.signal()
              case Right(event) =>
                println(s"Event $event emitted ${Thread.currentThread()}")
                out.collect(
                  StatefulDataObject(
                    event.ts,
                    Option(dataState.value()).map(_.state),
                    event.data
                  )
                )
                rightSignal.signal()
            }
          }
        }
      }
    }
  }

  /** Shared body of both inputs: wait for room, buffer the element, register its timer. */
  private def enqueue(lock: ConditionalOperator,
                      side: String,
                      element: MergedType,
                      ctx: ElementContext): Unit = {
    // Unwrapped payload, used for the log line and the timer timestamp.
    val payload: Timestamped = element match {
      case Left(state) => state
      case Right(data) => data
    }
    lock.aquire { nonEmptySignal =>
      while (queueSpan() > capacity.toMillis && lastTs() < ctx.timestamp()) {
        println("WAITING")
        nonEmptySignal.await()
      }
      queueOp { queue =>
        println(s"$side Event $payload recieved ${Thread.currentThread()}")
        queue.add(element)
      }
      ctx.timerService().registerEventTimeTimer(payload.ts)
    }
  }

  /** Creates an empty queue ordered by element timestamp. */
  private def newQueue(): MinMaxPriorityQueue[MergedType] =
    MinMaxPriorityQueue.
      orderedBy(Ordering.by((x: MergedType) => timestampOf(x))).create[MergedType]()

  /** Runs a mutating operation on the queue and writes the queue back to state. */
  private def queueOp[B](func: MinMaxPriorityQueue[MergedType] => B): B = queueState.synchronized {
    val queue = Option(queueState.value()).getOrElse(newQueue())
    val result = func(queue)
    queueState.update(queue)
    result
  }

  /** Runs a read-only inspection of the queue; unlike `queueOp` it does not rewrite state. */
  private def inspectQueue[B](func: MinMaxPriorityQueue[MergedType] => B): B = queueState.synchronized {
    func(Option(queueState.value()).getOrElse(newQueue()))
  }

  private def timestampOf(data: MergedType): Long = data match {
    case Left(y) =>
      y.ts
    case Right(y) =>
      y.ts
  }

  /**
   * Event-time distance between the oldest and newest buffered element.
   *
   * An empty queue has span 0. Subtracting the `Long.MaxValue` / `Long.MinValue`
   * placeholders would overflow to 1.
   */
  private def queueSpan(): Long = {
    inspectQueue { queue =>
      val oldestTs = Option(queue.peekFirst()).map(timestampOf)
      val newestTs = Option(queue.peekLast()).map(timestampOf)
      val span = (for (oldest <- oldestTs; newest <- newestTs) yield newest - oldest).getOrElse(0L)
      println(s"Span: ${oldestTs.getOrElse(Long.MaxValue)} - ${newestTs.getOrElse(Long.MinValue)} = $span")
      span
    }
  }

  /** Timestamp of the newest buffered element, or `Long.MinValue` when the queue is empty. */
  private def lastTs(): Long = {
    inspectQueue { queue =>
      Option(queue.peekLast()).map(timestampOf).getOrElse(Long.MinValue)
    }
  }
}
/**
 * Local demo job: a slow state source and a fast, time-accelerated data source are
 * connected, keyed onto a single constant key and fed through BlockingCoProcessFunction.
 */
object BackpressureTest {
  var data = new mutable.ArrayBuffer[DataObject]()

  def main(args: Array[String]): Unit = {
    val localConfig = new Configuration()
    val localEnvironment = new LocalStreamEnvironment(localConfig)
    val env = new StreamExecutionEnvironment(localEnvironment)

    env.getConfig.disableSysoutLogging()
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // One state event every 1000 ms, no time acceleration.
    val stateSource = env.addSource(new DataSource(ts => StateObject(ts, ts.toString), 1000))
    // One data event every 100 ms, event time pushed ahead by a further 100 ms per event.
    val dataSource = env.addSource(new DataSource(ts => DataObject(ts, ts.toString), 100, 100))

    // Both inputs use the same constant key, so every element lands in one keyed partition.
    val connected = stateSource.connect(dataSource)
    val keyed = connected.keyBy(_ => "", _ => "")
    val enriched = keyed.process(new BlockingCoProcessFunction())
    enriched.print()

    env.execute()
  }
}