Apache Spark: Spark Streaming direct Kafka consumers are not evenly distributed across executors


I created a sample direct Kafka stream in Spark. Kafka has 30 partitions for the given topic, but all the consumers run on the same executor machine.

[Kafka Manager screenshot]

My understanding of the direct Kafka stream is that the driver hands the offsets to the executors, and the executors then poll Kafka using those offsets.
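
For context, the location strategy passed to createDirectStream is the knob that decides how those partitions are scheduled onto executors. Below is a minimal sketch of the options exposed by the spark-streaming-kafka-0-10 API; the topic name and host used for PreferFixed are made-up examples, not values from this setup.

import java.util.HashMap;
import java.util.Map;

import org.apache.kafka.common.TopicPartition;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import org.apache.spark.streaming.kafka010.LocationStrategy;

public final class LocationStrategyExamples {

    public static void main(String[] args) {
        // Default recommendation: distribute partitions evenly across the available executors.
        LocationStrategy consistent = LocationStrategies.PreferConsistent();

        // Only makes sense when the executors run on the same hosts as the Kafka brokers.
        LocationStrategy brokers = LocationStrategies.PreferBrokers();

        // Pin specific TopicPartitions to specific hosts (topic and hostname are hypothetical).
        Map<TopicPartition, String> hostMap = new HashMap<>();
        hostMap.put(new TopicPartition("test-topic-1", 0), "executor-host-1");
        LocationStrategy fixed = LocationStrategies.PreferFixed(hostMap);
    }
}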

Spark version: 2.4

The sample code is below:



import com.google.common.collect.ImmutableList;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.ByteArrayDeserializer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.TaskContext;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.*;

import java.util.Arrays;
import java.util.HashMap;


public class Main {


    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setAppName("StreamingTest");

        conf.set("spark.shuffle.service.enabled", "true");
        conf.set("spark.streaming.kafka.maxRatePerPartition", "100");
        conf.set("spark.streaming.backpressure.enabled", "true");
        conf.set("spark.streaming.concurrentJobs", "1");
        conf.set("spark.executor.extraJavaOptions", "-XX:+UseConcMarkSweepGC");
        conf.set("spark.streaming.backpressure.pid.minRate", "1500");


        JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(5));



        // Direct stream for test-topic-1.
        JavaInputDStream<ConsumerRecord<Object, Object>> kafkaStream1 = createKafkaStream(ssc, "test-topic-1");

        // First output action: walk every record and simulate work with a short sleep.
        kafkaStream1.foreachRDD(rdd -> rdd.foreachPartition(p -> p.forEachRemaining(e -> {
            System.out.println("Processing test-topic-1");
            try {
                Thread.sleep(2);
            } catch (InterruptedException ex) {
                ex.printStackTrace();
            }
        })));

        // Second output action: build "begin" ranges (0 .. fromOffset) for each partition
        // and commit those back to Kafka from the driver.
        kafkaStream1.foreachRDD(rdd -> {
            OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
            final OffsetRange[] beginOffsets = Arrays.stream(offsetRanges).map(o -> OffsetRange.create(o.topicPartition(), 0, o.fromOffset())).toArray(OffsetRange[]::new);
            rdd.foreachPartition(partition -> {
                // Look up the begin range for this partition (currently unused).
                OffsetRange o = beginOffsets[TaskContext.get().partitionId()];
            });
            ((CanCommitOffsets) kafkaStream1.inputDStream()).commitAsync(beginOffsets);
        });



        ssc.start();
        ssc.awaitTermination();
    }

    public static JavaInputDStream<ConsumerRecord<Object, Object>> createKafkaStream(JavaStreamingContext ssc, String topic) {
        HashMap<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "<broker-ids>");
        kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, topic+"hrishi-testing-nfr-7");
        kafkaParams.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG, 5000);
        kafkaParams.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 80000);
        kafkaParams.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1000);
        kafkaParams.put(ConsumerConfig.MAX_PARTITION_FETCH_BYTES_CONFIG, 10000000);
        kafkaParams.put(ConsumerConfig.MAX_POLL_INTERVAL_MS_CONFIG, 5000);
        kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        kafkaParams.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false);
        kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class);

        return KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent(), ConsumerStrategies.Subscribe(ImmutableList.of(topic), kafkaParams));
    }
}



I found the cause of the problem: it was happening because I was committing the offsets from the driver. This is the code:

((CanCommitOffsets)kafkaStream1.inputDStream()).commitAsync(offsetRanges)

   kafkaStream1.foreachRDD(rdd -> {
        OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
        // Process the records on the executors.
        rdd.foreachPartition(partition -> {
            partition.forEachRemaining(e -> {
                try {
                    System.out.println("hrishi mess" + e);
                    Thread.sleep(2);
                } catch (InterruptedException ex) {
                    ex.printStackTrace();
                }
            });
        });
        // Commit the offset ranges of this batch; commitAsync is invoked on the driver.
        ((CanCommitOffsets) kafkaStream1.inputDStream()).commitAsync(offsetRanges);
    });

Next, I enabled debug logging on the executors and found that the KafkaRDD was polling Kafka there, which was clearly visible in the logs.
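
For reference, one way to get that logging is to raise the log level of the kafka010 classes via log4j. This is only a minimal sketch: it assumes the log4j 1.x API bundled with Spark 2.x, the logger names are assumptions taken from the package names used above, and on a real cluster the executor JVMs usually pick this up from a log4j.properties shipped to them rather than from code.

import org.apache.log4j.Level;
import org.apache.log4j.Logger;

public final class KafkaDebugLogging {

    // Sketch: raise logging for the direct-stream integration and the Kafka consumer client
    // so that KafkaRDD poll activity becomes visible in the logs of the JVM this runs in.
    public static void enable() {
        Logger.getLogger("org.apache.spark.streaming.kafka010").setLevel(Level.DEBUG);
        Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.DEBUG);
    }
}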

The DStream API is deprecated. Have you tried Structured Streaming?

Yes, I am aware that DStream is deprecated, but my use case only needs mapPartitions, so that is not the problem, and I cannot afford a migration at the moment.

I suspect it is because of DStreams. You can map over DataFrames as well. In any case, please show the full code; this property looks suspicious: spark.streaming.concurrentJobs

Added the full sample code @cricket_007

It looks like your code does not actually do anything except commit offsets. Have you tried with a very basic set of properties? What are you trying to optimize?
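
For what it is worth, a "very basic set of properties" in the sense of that last comment could look like the sketch below. All the keys already appear in the question; the broker list is the same placeholder used there, and the group id is made up.

import java.util.HashMap;
import java.util.Map;

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.ByteArrayDeserializer;
import org.apache.kafka.common.serialization.StringDeserializer;

public final class MinimalKafkaParams {

    public static Map<String, Object> build() {
        Map<String, Object> kafkaParams = new HashMap<>();
        // Only what the consumer strictly needs; every tuning knob is left at its default.
        kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "<broker-ids>");   // placeholder from the question
        kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, "basic-test-group");        // hypothetical group id
        kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class);
        kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        kafkaParams.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false);
        return kafkaParams;
    }
}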