Java:从 Kafka 消息中获取主题名称
问题:消费者同时订阅了多个主题时,如何从收到的每条消息中识别出它所属的主题名称?(标签:java, cassandra, apache-spark, apache-kafka)
// Topics to consume; each is assigned 3 consumer threads via topicMap.
String[] topics = { "test", "test1", "test2" };
for (String t : topics) {
topicMap.put(t, new Integer(3));
}
// Spark configuration: local 4-core master, receiver WAL disabled,
// Cassandra connector pointed at localhost.
SparkConf conf = new SparkConf().setAppName("KafkaReceiver")
.set("spark.streaming.receiver.writeAheadLog.enable", "false")
.setMaster("local[4]")
.set("spark.cassandra.connection.host", "localhost");
;
final JavaSparkContext sc = new JavaSparkContext(conf);
// 1000 ms micro-batch interval.
JavaStreamingContext jssc = new JavaStreamingContext(sc, new Duration(
1000));
/* Receive Kafka streaming inputs */
// NOTE(review): the receiver-based createStream only exposes (key, value)
// pairs — the topic name is not part of the stream elements, which is
// exactly the problem this question asks about.
JavaPairReceiverInputDStream<String, String> messages = KafkaUtils
.createStream(jssc, "localhost:2181", "test-group",
topicMap);
JavaDStream<MessageAndMetadata> data =
messages.map(new Function<Tuple2<String, String>, MessageAndMetadata>()
{
public MessageAndMetadata call(Tuple2<String, String> message)
{
System.out.println("message ="+message._2);
// NOTE(review): returning null fills the resulting DStream with
// nulls; no MessageAndMetadata is ever constructed here.
return null;
}
}
);
String[] topics = { "test", "test1", "test2" };
for (String t : topics) {
    topicMap.put(t, new Integer(3));
}
SparkConf conf = new SparkConf().setAppName("KafkaReceiver")
        .set("spark.streaming.receiver.writeAheadLog.enable", "false")
        .setMaster("local[4]")
        .set("spark.cassandra.connection.host", "localhost");
final JavaSparkContext sc = new JavaSparkContext(conf);
JavaStreamingContext jssc = new JavaStreamingContext(sc, new Duration(1000));
/* Receive Kafka streaming inputs */
JavaPairReceiverInputDStream<String, String> messages = KafkaUtils
        .createStream(jssc, "localhost:2181", "test-group", topicMap);
JavaDStream<MessageAndMetadata> data =
        messages.map(new Function<Tuple2<String, String>, MessageAndMetadata>() {
            public MessageAndMetadata call(Tuple2<String, String> message) {
                System.out.println("message =" + message._2);
                return null;
            }
        });
我可以从 Kafka 生产者那里收到消息。但由于消费者现在同时从三个主题中消费,因此需要识别每条消息所属的主题名称。不幸的是,这并不简单:在 Spark 源代码中,KafkaReceiver 和 ReliableKafkaReceiver 只存储了 messageAndMetadata.key 和 message,并没有保留 topic。Spark 的 JIRA 中有两张与此问题相关的公开工单。
package org.apache.spark.streaming.kafka
import java.lang.{Integer => JInt}
import java.util.{Map => JMap, Properties}

import scala.collection.JavaConverters._
import scala.collection.Map
import scala.reflect._
import scala.util.control.NonFatal

import kafka.consumer.{KafkaStream, Consumer, ConsumerConfig, ConsumerConnector}
import kafka.serializer.{Decoder, StringDecoder}
import kafka.utils.VerifiableProperties

import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.util.WriteAheadLogUtils
import org.apache.spark.util.ThreadUtils
object MoreKafkaUtils {

  /**
   * Creates a receiver-based Kafka input stream whose elements carry the
   * topic name alongside the key and value.
   *
   * @param jssc         streaming context to attach the stream to
   * @param zkQuorum     ZooKeeper quorum (host:port,host:port,...)
   * @param groupId      Kafka consumer group id
   * @param topics       map of topic name -> number of consumer threads
   * @param storageLevel storage level for received blocks
   * @return a DStream of (key, value, topic) triples
   */
  def createStream(
    jssc: JavaStreamingContext,
    zkQuorum: String,
    groupId: String,
    topics: JMap[String, JInt],
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2
  ): JavaReceiverInputDStream[(String, String, String)] = {
    val kafkaParams = Map[String, String](
      "zookeeper.connect" -> zkQuorum, "group.id" -> groupId,
      "zookeeper.connection.timeout.ms" -> "10000")
    val walEnabled = WriteAheadLogUtils.enableReceiverLog(jssc.ssc.conf)
    // Strict map instead of `mapValues`: `mapValues` yields a lazy view
    // (deprecated in newer Scala), re-unboxing the JInt on every access.
    val topicThreads = topics.asScala.map { case (topic, n) => topic -> n.intValue() }
    new KafkaInputDStreamWithTopic[String, String, StringDecoder, StringDecoder](
      jssc.ssc, kafkaParams, topicThreads, walEnabled, storageLevel)
  }
}
private[streaming]
class KafkaInputDStreamWithTopic[
  K: ClassTag,
  V: ClassTag,
  U <: Decoder[_] : ClassTag,
  T <: Decoder[_] : ClassTag](
    @transient ssc_ : StreamingContext,
    kafkaParams: Map[String, String],
    topics: Map[String, Int],
    useReliableReceiver: Boolean,
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[(K, V, String)](ssc_) with Logging {

  /**
   * Builds the receiver that backs this DStream: the reliable (write-ahead
   * log aware) variant when requested, the plain receiver otherwise.
   */
  def getReceiver(): Receiver[(K, V, String)] = {
    if (useReliableReceiver) {
      new ReliableKafkaReceiverWithTopic[K, V, U, T](kafkaParams, topics, storageLevel)
    } else {
      new KafkaReceiverWithTopic[K, V, U, T](kafkaParams, topics, storageLevel)
    }
  }
}
private[streaming]
class KafkaReceiverWithTopic[
  K: ClassTag,
  V: ClassTag,
  U <: Decoder[_] : ClassTag,
  T <: Decoder[_] : ClassTag](
    kafkaParams: Map[String, String],
    topics: Map[String, Int],
    storageLevel: StorageLevel
  ) extends Receiver[(K, V, String)](storageLevel) with Logging {

  // Connection to Kafka; created in onStart, torn down in onStop.
  var consumerConnector: ConsumerConnector = null

  def onStop() {
    if (consumerConnector != null) {
      consumerConnector.shutdown()
      consumerConnector = null
    }
  }

  /**
   * Connects to ZooKeeper/Kafka, instantiates the key/value decoders via
   * reflection, and submits one MessageHandler per Kafka stream to a
   * daemon thread pool (one thread per configured topic thread).
   */
  def onStart() {
    logInfo("Starting Kafka Consumer Stream with group: " + kafkaParams("group.id"))
    // Kafka connection properties
    val props = new Properties()
    kafkaParams.foreach(param => props.put(param._1, param._2))
    val zkConnect = kafkaParams("zookeeper.connect")
    // Create the connection to the cluster
    logInfo("Connecting to Zookeeper: " + zkConnect)
    val consumerConfig = new ConsumerConfig(props)
    consumerConnector = Consumer.create(consumerConfig)
    logInfo("Connected to " + zkConnect)
    // Decoders are looked up by their ClassTag and must expose a
    // (VerifiableProperties) constructor, as Kafka's decoders do.
    val keyDecoder = classTag[U].runtimeClass.getConstructor(classOf[VerifiableProperties])
      .newInstance(consumerConfig.props)
      .asInstanceOf[Decoder[K]]
    val valueDecoder = classTag[T].runtimeClass.getConstructor(classOf[VerifiableProperties])
      .newInstance(consumerConfig.props)
      .asInstanceOf[Decoder[V]]
    // Create threads for each topic/message Stream we are listening
    val topicMessageStreams = consumerConnector.createMessageStreams(
      topics, keyDecoder, valueDecoder)
    val executorPool =
      ThreadUtils.newDaemonFixedThreadPool(topics.values.sum, "KafkaMessageHandler")
    try {
      // Start the messages handler for each partition
      topicMessageStreams.values.foreach { streams =>
        streams.foreach { stream => executorPool.submit(new MessageHandler(stream)) }
      }
    } finally {
      executorPool.shutdown() // Just causes threads to terminate after work is done
    }
  }

  // Handles Kafka messages: drains one KafkaStream and stores each record
  // as a (key, message, topic) triple — the topic is the point of this class.
  private class MessageHandler(stream: KafkaStream[K, V])
    extends Runnable {
    def run() {
      logInfo("Starting MessageHandler.")
      try {
        val streamIterator = stream.iterator()
        while (streamIterator.hasNext()) {
          val msgAndMetadata = streamIterator.next()
          store((msgAndMetadata.key, msgAndMetadata.message, msgAndMetadata.topic))
        }
      } catch {
        // Trap only non-fatal errors; catching Throwable would swallow fatal
        // JVM errors (OutOfMemoryError, InterruptedException, ...).
        case NonFatal(e) => reportError("Error handling message; exiting", e)
      }
    }
  }
}
package org.apache.spark.streaming.kafka

import java.lang.{Integer => JInt}
import java.util.{Map => JMap, Properties}
import kafka.consumer.{KafkaStream, Consumer, ConsumerConfig, ConsumerConnector}
import kafka.serializer.{Decoder, StringDecoder}
import kafka.utils.VerifiableProperties
import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.util.WriteAheadLogUtils
import org.apache.spark.util.ThreadUtils
import scala.collection.JavaConverters._
import scala.collection.Map
import scala.reflect._

object MoreKafkaUtils {
  def createStream(
    jssc: JavaStreamingContext,
    zkQuorum: String,
    groupId: String,
    topics: JMap[String, JInt],
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2
  ): JavaReceiverInputDStream[(String, String, String)] = {
    val kafkaParams = Map[String, String](
      "zookeeper.connect" -> zkQuorum, "group.id" -> groupId,
      "zookeeper.connection.timeout.ms" -> "10000")
    val walEnabled = WriteAheadLogUtils.enableReceiverLog(jssc.ssc.conf)
    new KafkaInputDStreamWithTopic[String, String, StringDecoder, StringDecoder](
      jssc.ssc, kafkaParams, topics.asScala.mapValues(_.intValue()), walEnabled, storageLevel)
  }
}

private[streaming]
class KafkaInputDStreamWithTopic[
  K: ClassTag,
  V: ClassTag,
  U <: Decoder[_] : ClassTag,
  T <: Decoder[_] : ClassTag](
    @transient ssc_ : StreamingContext,
    kafkaParams: Map[String, String],
    topics: Map[String, Int],
    useReliableReceiver: Boolean,
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[(K, V, String)](ssc_) with Logging {
  def getReceiver(): Receiver[(K, V, String)] = {
    if (!useReliableReceiver) {
      new KafkaReceiverWithTopic[K, V, U, T](kafkaParams, topics, storageLevel)
    } else {
      new ReliableKafkaReceiverWithTopic[K, V, U, T](kafkaParams, topics, storageLevel)
    }
  }
}

private[streaming]
class KafkaReceiverWithTopic[
  K: ClassTag,
  V: ClassTag,
  U <: Decoder[_] : ClassTag,
  T <: Decoder[_] : ClassTag](
    kafkaParams: Map[String, String],
    topics: Map[String, Int],
    storageLevel: StorageLevel
  ) extends Receiver[(K, V, String)](storageLevel) with Logging {
  var consumerConnector: ConsumerConnector = null

  def onStop() {
    if (consumerConnector != null) {
      consumerConnector.shutdown()
      consumerConnector = null
    }
  }

  def onStart() {
    logInfo("Starting Kafka Consumer Stream with group: " + kafkaParams("group.id"))
    val props = new Properties()
    kafkaParams.foreach(param => props.put(param._1, param._2))
    val zkConnect = kafkaParams("zookeeper.connect")
    logInfo("Connecting to Zookeeper: " + zkConnect)
    val consumerConfig = new ConsumerConfig(props)
    consumerConnector = Consumer.create(consumerConfig)
    logInfo("Connected to " + zkConnect)
    val keyDecoder = classTag[U].runtimeClass.getConstructor(classOf[VerifiableProperties])
      .newInstance(consumerConfig.props)
      .asInstanceOf[Decoder[K]]
    val valueDecoder = classTag[T].runtimeClass.getConstructor(classOf[VerifiableProperties])
      .newInstance(consumerConfig.props)
      .asInstanceOf[Decoder[V]]
    val topicMessageStreams = consumerConnector.createMessageStreams(
      topics, keyDecoder, valueDecoder)
    val executorPool =
      ThreadUtils.newDaemonFixedThreadPool(topics.values.sum, "KafkaMessageHandler")
    try {
      topicMessageStreams.values.foreach { streams =>
        streams.foreach { stream => executorPool.submit(new MessageHandler(stream)) }
      }
    } finally {
      executorPool.shutdown()
    }
  }

  private class MessageHandler(stream: KafkaStream[K, V]) extends Runnable {
    def run() {
      logInfo("Starting MessageHandler.")
      try {
        val streamIterator = stream.iterator()
        while (streamIterator.hasNext()) {
          val msgAndMetadata = streamIterator.next()
          store((msgAndMetadata.key, msgAndMetadata.message, msgAndMetadata.topic))
        }
      } catch {
        case e: Throwable => reportError("Error handling message; exiting", e)
      }
    }
  }
}
从 Spark 1.5.0 开始,官方鼓励使用无接收器(receiver-less)的 Direct 方式;该 API 在 1.5.0 中已从实验性(experimental)状态毕业为正式特性。
这个新的 Direct API 让您可以轻松获取消息及其元数据,此外还有其他好处。
评论:我对这个问题的答案很感兴趣,您找到方法了吗?
@Arun:找到解决方案了吗?如果找到了,能分享一下吗?谢谢!
您还可以尝试实验性的 KafkaUtils.createDirectStream,它接受 messageHandler: JFunction[MessageAndMetadata[K, V], R] 作为参数,可以在其中读取每条消息的主题。
我使用的是 Direct 方式,但不知道如何获取消息元数据,您能详细说明一下吗?
@BrandonBradley:请查看官方文档中上述链接后面的最后一段代码。基本上,您必须在获得 RDD 后立即将其转换为 HasOffsetRanges。