Java Kafka主题划分和Spark执行器映射
我正在使用 Spark Streaming 消费 Kafka 主题。该主题创建时有 5 个分区。我的所有消息都以 tablename 作为键发布到 Kafka 主题。 鉴于此,我假设同一张表的所有消息都应该进入同一个分区。 但我注意到,在 Spark 日志中,同一张表的消息有时由 node-1 上的 executor 处理,有时由 node-2 上的 executor 处理 我使用以下命令在 YARN 集群(yarn-cluster)模式下运行代码:Java Kafka主题划分和Spark执行器映射,java,apache-spark,apache-kafka,spark-streaming,kafka-consumer-api,Java,Apache Spark,Apache Kafka,Spark Streaming,Kafka Consumer Api,我正在使用 Spark Streaming 消费 Kafka 主题。该主题创建时有 5 个分区。我的所有消息都以 tablename 作为键发布到 Kafka 主题。 鉴于此,我假设同一张表的所有消息都应该进入同一个分区。 但我注意到,在 Spark 日志中,同一张表的消息有时由 node-1 上的 executor 处理,有时由 node-2 上的 executor 处理 我使用以下命令在 YARN 集群(yarn-cluster)模式下运行代码: spark-submit --name DataProcessor --master yarn-cluster --files /opt/ETL_JAR/ex
spark-submit --name DataProcessor --master yarn-cluster --files /opt/ETL_JAR/executor-log4j-spark.xml,/opt/ETL_JAR/driver-log4j-spark.xml,/opt/ETL_JAR/application.properties --conf "spark.driver.extraJavaOptions=-Dlog4j.configuration=driver-log4j-spark.xml" --conf "spark.executor.extraJavaOptions=-Dlog4j.configuration=executor-log4j-spark.xml" --class com.test.DataProcessor /opt/ETL_JAR/etl-all-1.0.jar
这个提交在节点1上创建了一个驱动程序,在节点1和节点2上创建了两个执行器
我不希望节点1和节点2执行器读取同一分区。但这种情况正在发生
还尝试按照以下配置指定消费者组(consumer group),但没有区别
kafkaParams.put("group.id", "app1");
这就是我们使用createDirectStream方法创建流的方式
*不是通过 ZooKeeper
// Kafka settings handed to the direct stream below.
HashMap<String, String> kafkaParams = new HashMap<String, String>();
// `brokers` is defined outside this fragment.
kafkaParams.put("metadata.broker.list", brokers);
// NOTE(review): presumably "largest" means "start from the newest offsets" when no
// offset is known -- confirm against the Kafka consumer configuration reference.
kafkaParams.put("auto.offset.reset", "largest");
// Same consumer group as the line quoted in the question text.
kafkaParams.put("group.id", "app1");
// Create the stream with String keys and values, both decoded by StringDecoder.
// `jssc` and `topicsSet` are defined outside this fragment.
JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
jssc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
kafkaParams,
topicsSet
);
HashMap<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("metadata.broker.list", brokers);
kafkaParams.put("auto.offset.reset", "largest");
kafkaParams.put("group.id", "app1");
JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
jssc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
kafkaParams,
topicsSet
);
完整代码:
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.api.java.JavaStreamingContextFactory;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import kafka.serializer.StringDecoder;
import scala.Tuple2;
/**
 * Application entry point: obtains a streaming context for the configured checkpoint
 * directory (via {@code JavaStreamingContext.getOrCreate}, with
 * {@link DataProcessorContextFactory3} as the factory) and runs it until termination.
 */
public class DataProcessor2 implements Serializable {
    private static final long serialVersionUID = 3071125481526170241L;
    private static Logger log = LoggerFactory.getLogger("DataProcessor");

    /**
     * Starts the streaming job and blocks until it terminates.
     *
     * @param args command-line arguments (not read by this method)
     */
    public static void main(String[] args) {
        // Arguments are evaluated left to right: the checkpoint directory is looked up
        // first, then the factory is instantiated.
        JavaStreamingContext streamingContext = JavaStreamingContext.getOrCreate(
                ApplicationProperties.getProperty(Consts.SPARK_CHECKPOINTING_DIR),
                new DataProcessorContextFactory3());

        streamingContext.start();
        streamingContext.awaitTermination();
    }
}
/**
 * Factory that builds the {@link JavaStreamingContext} for the data processor: it reads
 * the Kafka/Spark settings from {@code ApplicationProperties}, creates a direct Kafka
 * stream for the configured topic(s) and wires the per-batch processing.
 *
 * <p>The anonymous function classes created in {@link #processRDD} are inner classes of
 * this factory, which is why the factory itself is {@link Serializable}.
 */
class DataProcessorContextFactory3 implements JavaStreamingContextFactory, Serializable {
    private static final long serialVersionUID = 6070911284191531450L;

    // Registered under this class; the logger previously pointed at
    // DataProcessorContextFactory, which is a different class.
    private static final Logger logger = LoggerFactory.getLogger(DataProcessorContextFactory3.class);

    /** Master URL applied only when no master was supplied externally (local dev runs). */
    private static final String DEFAULT_MASTER = "local[*]";

    DataProcessorContextFactory3() {
    }

    /**
     * Creates and configures a new streaming context.
     *
     * <p>Exits the JVM with status 1 (after printing a usage message to stderr) when the
     * broker list or the topic is blank.
     *
     * @return the configured, not yet started, streaming context
     * @throws NumberFormatException if the configured batch duration is not a valid long
     */
    @Override
    public JavaStreamingContext create() {
        logger.debug("creating new context..!");
        final String brokers = ApplicationProperties.getProperty(Consts.KAFKA_BROKERS_NAME);
        final String topic = ApplicationProperties.getProperty(Consts.KAFKA_TOPIC_NAME);
        final String app = "app1";
        final String offset = ApplicationProperties.getProperty(Consts.KAFKA_CONSUMER_OFFSET, "largest");
        logger.debug("Data processing configuration. brokers={}, topic={}, app={}, offset={}", brokers, topic, app,
                offset);

        if (StringUtils.isBlank(brokers) || StringUtils.isBlank(topic) || StringUtils.isBlank(app)) {
            System.err.println("Usage: DataProcessor <brokers> <topic>\n" + Consts.KAFKA_BROKERS_NAME
                    + " is a list of one or more Kafka brokers separated by comma\n" + Consts.KAFKA_TOPIC_NAME
                    + " is a kafka topic to consume from \n\n\n");
            System.exit(1);
        }

        final String majorVersion = "1.0";
        final String minorVersion = "3";
        final String version = majorVersion + "." + minorVersion;
        final String applicationName = "DataProcessor-" + topic + "-" + version;

        // Do not hard-code the master: a value set on SparkConf takes precedence over
        // `spark-submit --master ...`, so setMaster("local[*]") kept the job local even
        // when submitted to YARN. Fall back to local mode only when nothing was supplied.
        SparkConf sparkConf = new SparkConf()
                .setAppName(applicationName)
                .setIfMissing("spark.master", DEFAULT_MASTER);

        final long sparkBatchDuration = Long
                .parseLong(ApplicationProperties.getProperty(Consts.SPARK_BATCH_DURATION, "10"));
        final String sparkCheckPointDir = ApplicationProperties.getProperty(Consts.SPARK_CHECKPOINTING_DIR);

        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(sparkBatchDuration));
        logger.debug("setting checkpoint directory={}", sparkCheckPointDir);
        jssc.checkpoint(sparkCheckPointDir);

        // The topic property may hold several comma-separated topic names.
        HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(topic.split(",")));

        HashMap<String, String> kafkaParams = new HashMap<String, String>();
        kafkaParams.put("metadata.broker.list", brokers);
        kafkaParams.put("auto.offset.reset", offset);
        // Use the same value that is logged and validated above instead of a second literal.
        kafkaParams.put("group.id", app);

        // @formatter:off
        JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
                jssc,
                String.class,
                String.class,
                StringDecoder.class,
                StringDecoder.class,
                kafkaParams,
                topicsSet
        );
        // @formatter:on

        processRDD(messages, app);
        return jssc;
    }

    /**
     * Maps every Kafka record to a {@link MsgStruct} and, for each non-empty batch,
     * prints every message of every RDD partition to stdout.
     *
     * @param messages the (key, value) stream read from Kafka
     * @param app      application/group name (currently not used by the processing)
     */
    private void processRDD(JavaPairInputDStream<String, String> messages, final String app) {
        JavaDStream<MsgStruct> rdd = messages.map(new MessageProcessFunction());
        rdd.foreachRDD(new Function<JavaRDD<MsgStruct>, Void>() {
            private static final long serialVersionUID = 250647626267731218L;

            @Override
            public Void call(JavaRDD<MsgStruct> currentRdd) throws Exception {
                if (!currentRdd.isEmpty()) {
                    logger.debug("Receive RDD. Create JobDispatcherFunction at HOST={}", FunctionUtil.getHostName());
                    currentRdd.foreachPartition(new VoidFunction<Iterator<MsgStruct>>() {
                        @Override
                        public void call(Iterator<MsgStruct> arg0) throws Exception {
                            while (arg0.hasNext()) {
                                System.out.println(arg0.next().toString());
                            }
                        }
                    });
                } else {
                    logger.debug("Current RDD is empty.");
                }
                return null;
            }
        });
    }

    /** Converts a Kafka (key, value) pair into a {@link MsgStruct}; the key is ignored. */
    public static class MessageProcessFunction implements Function<Tuple2<String, String>, MsgStruct> {
        @Override
        public MsgStruct call(Tuple2<String, String> data) throws Exception {
            String message = data._2();
            System.out.println("message:" + message);
            return MsgStruct.parse(message);
        }
    }

    /** Minimal serializable wrapper around the raw message text. */
    public static class MsgStruct implements Serializable {
        private String message;

        /**
         * Wraps the given text without any further parsing.
         *
         * @param msg raw message text
         * @return a new instance holding {@code msg}
         */
        public static MsgStruct parse(String msg) {
            MsgStruct m = new MsgStruct();
            m.message = msg;
            return m;
        }

        @Override
        public String toString() {
            return "content inside=" + message;
        }
    }
}
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.api.java.JavaStreamingContextFactory;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import kafka.serializer.StringDecoder;
import scala.Tuple2;
public class DataProcessor2 implements Serializable {
private static final long serialVersionUID = 3071125481526170241L;
private static Logger log = LoggerFactory.getLogger("DataProcessor");
public static void main(String[] args) {
final String sparkCheckPointDir = ApplicationProperties.getProperty(Consts.SPARK_CHECKPOINTING_DIR);
DataProcessorContextFactory3 factory = new DataProcessorContextFactory3();
JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(sparkCheckPointDir, factory);
// Start the process
jssc.start();
jssc.awaitTermination();
}
}
class DataProcessorContextFactory3 implements JavaStreamingContextFactory, Serializable {
private static final long serialVersionUID = 6070911284191531450L;
private static Logger logger = LoggerFactory.getLogger(DataProcessorContextFactory.class);
DataProcessorContextFactory3() {
}
@Override
public JavaStreamingContext create() {
logger.debug("creating new context..!");
final String brokers = ApplicationProperties.getProperty(Consts.KAFKA_BROKERS_NAME);
final String topic = ApplicationProperties.getProperty(Consts.KAFKA_TOPIC_NAME);
final String app = "app1";
final String offset = ApplicationProperties.getProperty(Consts.KAFKA_CONSUMER_OFFSET, "largest");
logger.debug("Data processing configuration. brokers={}, topic={}, app={}, offset={}", brokers, topic, app,
offset);
if (StringUtils.isBlank(brokers) || StringUtils.isBlank(topic) || StringUtils.isBlank(app)) {
System.err.println("Usage: DataProcessor <brokers> <topic>\n" + Consts.KAFKA_BROKERS_NAME
+ " is a list of one or more Kafka brokers separated by comma\n" + Consts.KAFKA_TOPIC_NAME
+ " is a kafka topic to consume from \n\n\n");
System.exit(1);
}
final String majorVersion = "1.0";
final String minorVersion = "3";
final String version = majorVersion + "." + minorVersion;
final String applicationName = "DataProcessor-" + topic + "-" + version;
// for dev environment
SparkConf sparkConf = new SparkConf().setMaster("local[*]").setAppName(applicationName);
// for cluster environment
//SparkConf sparkConf = new SparkConf().setAppName(applicationName);
final long sparkBatchDuration = Long
.valueOf(ApplicationProperties.getProperty(Consts.SPARK_BATCH_DURATION, "10"));
final String sparkCheckPointDir = ApplicationProperties.getProperty(Consts.SPARK_CHECKPOINTING_DIR);
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(sparkBatchDuration));
logger.debug("setting checkpoint directory={}", sparkCheckPointDir);
jssc.checkpoint(sparkCheckPointDir);
HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(topic.split(",")));
HashMap<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("metadata.broker.list", brokers);
kafkaParams.put("auto.offset.reset", offset);
kafkaParams.put("group.id", "app1");
// @formatter:off
JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
jssc,
String.class,
String.class,
StringDecoder.class,
StringDecoder.class,
kafkaParams,
topicsSet
);
// @formatter:on
processRDD(messages, app);
return jssc;
}
private void processRDD(JavaPairInputDStream<String, String> messages, final String app) {
JavaDStream<MsgStruct> rdd = messages.map(new MessageProcessFunction());
rdd.foreachRDD(n
// Pin each Kafka topic-partition to a preferred executor host.
// partition 0 -> h1, partition 1 and 2 -> h2
Map<TopicPartition, String> partitionMapToHost = new HashMap<>();
partitionMapToHost.put(new TopicPartition("topic-name", 0), "h1");
partitionMapToHost.put(new TopicPartition("topic-name", 1), "h2");
partitionMapToHost.put(new TopicPartition("topic-name", 2), "h2");

List<String> topicCollection = Arrays.asList("topic-name");

// Fixed typo: the original instantiated `HasMap`, which does not exist and does not compile.
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put("bootstrap.servers", "10.0.0.2:9092,10.0.0.3:9092");
kafkaParams.put("group.id", "group-id-name");
kafkaParams.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
kafkaParams.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");

// `jssc` is the JavaStreamingContext defined outside this fragment.
JavaInputDStream<ConsumerRecord<String, String>> records = KafkaUtils.createDirectStream(jssc,
        LocationStrategies.PreferFixed(partitionMapToHost), // PreferFixed is the key
        // Explicit type witness so the key/value types do not depend on inference.
        ConsumerStrategies.<String, String>Subscribe(topicCollection, kafkaParams));