Apache spark fromOffset/untilOffset/offset.count与RDD分区中记录总数之间的差异
我的Spark Kafka流逻辑如下所示 当我查看偏移量和记录时,分区的offset.count()和使用foreach循环时实际打印的记录总数是不同的。(在RDD分区上) 这些数字与我的逻辑不一致 有人能指导我纠正我的逻辑吗Apache spark fromOffset/untilOffset/offset.count与RDD分区中记录总数之间的差异,apache-spark,apache-kafka,spark-streaming,Apache Spark,Apache Kafka,Spark Streaming,我的Spark Kafka流逻辑如下所示 当我查看偏移量和记录时,分区的offset.count()和使用foreach循环时实际打印的记录总数是不同的。(在RDD分区上) 这些数字与我的逻辑不一致 有人能指导我纠正我的逻辑吗 JavaInputDStream<byte[]> directKafkaStream = KafkaUtils.createDirectStream(jsc, String.class, byte[].class, StringDecoder.
// Create a direct Kafka stream whose payload is the raw message bytes.
JavaInputDStream<byte[]> directKafkaStream = KafkaUtils.createDirectStream(jsc, String.class, byte[].class,
        StringDecoder.class, DefaultDecoder.class, byte[].class, kafkaParams, topicMap,
        (Function<MessageAndMetadata<String, byte[]>, byte[]>) MessageAndMetadata::message);

directKafkaStream.foreachRDD(rdd -> {
    // FIX: read offsetRanges() on the DRIVER, on the batch RDD itself, before any
    // executor-side closure captures `rdd`. The original code called it inside
    // foreachPartition, which drags the whole RDD into the task closure — the
    // documented anti-pattern in the Spark/Kafka integration guide, and the reason
    // the per-partition counts did not line up with what each task iterated.
    final OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();

    rdd.foreachPartition(itr -> {
        // The i-th element of offsetRanges corresponds to the i-th RDD partition,
        // so each task can index its own range directly — no scan/compare needed.
        int partnId = TaskContext.get().partitionId();
        OffsetRange offset = offsetRanges[partnId];

        // Drain this partition's iterator into a batch. After the loop,
        // recordBatch.size() equals offset.count() for this partition.
        List<byte[]> recordBatch = new ArrayList<>();
        while (itr.hasNext()) {
            recordBatch.add(itr.next());
        }

        recordBatch.forEach(rec -> {
            // Business logic goes here.
        });
        // NOTE: the original snippet was missing the "});" that closes the
        // forEach lambda, so its braces did not balance and it could not
        // compile as posted — the stray closers belonged to the (now removed)
        // for/if scan over all offset ranges.
    });
});
JavaInputDStream<byte[]> directKafkaStream = KafkaUtils.createDirectStream(jsc, String.class, byte[].class,
StringDecoder.class, DefaultDecoder.class, byte[].class, kafkaParams, topicMap,
(Function<MessageAndMetadata<String, byte[]>, byte[]>) MessageAndMetadata::message);
directKafkaStream.foreachRDD(rdd -> {
rdd.foreachPartition(itr -> {
Integer partnId = TaskContext.get().partitionId();
ArrayList<byte[]> recordBatch = new ArrayList<byte[]>();
while (itr.hasNext()) {
byte[] record = itr.next();
recordBatch.add(record);
}
OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
// Partition 0 fromOffset 0 untilOffset 2 offset.count(): 2
// Partition 1 fromOffset 0 untilOffset 3 offset.count(): 3
for (OffsetRange offset : offsets) {
if (offset.partition() == partnId) {
recordBatch.forEach((rec) -> {
// Partition 0 fromOffset 0 untilOffset 2 offset.count(): 2 --> TOTAL RECORDS after LOOPING: 3
// Partition 1 fromOffset 0 untilOffset 3 offset.count(): 3 --> TOTAL RECORDS after LOOPING: 3
// Business logic goes here.
}
}
}
});
});