Apache Storm 字段分组(fields grouping)示例
我使用的是 Kafka + Storm:Kafka 向 Storm 发送/发出 JSON 字符串。在 Storm 中,我想根据 JSON 中的某个键/字段把负载分配给两个 worker——在我的例子中,就是 JSON 字符串里的 groupid 字段。该怎么做?例如,我有这样的 JSON:
{groupid: 1234, userid: 145, comments:"I want to distribute all this group 1234 to one worker", size:50,type:"group json"}
{groupid: 1235, userid: 134, comments:"I want to distribute all this group 1234 to another worker", size:90,type:"group json"}
{groupid: 1234, userid: 158, comments:"I want to be sent to same worker as group 1234", size:50,type:"group json"}
=== 使用的是 Storm 0.9.4 ===
我的源代码如下:
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.spout.SchemeAsMultiScheme;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import storm.kafka.KafkaSpout;
import storm.kafka.SpoutConfig;
import storm.kafka.StringScheme;
import storm.kafka.ZkHosts;
/**
 * Example topology: Kafka spout -> JSON-parser bolt -> analysis bolt, where the
 * analysis bolt is fields-grouped on the "groupid" field extracted from the JSON.
 *
 * <p>Why the intermediate parser bolt exists: {@code KafkaSpout} configured with
 * {@code StringScheme} declares a single output field named "str" (the raw
 * message). Subscribing to the spout directly with
 * {@code fieldsGrouping(new Fields("groupid"))} therefore fails at submission
 * time with {@code InvalidTopologyException ... non-existent fields: #{"groupid"}}.
 * The parser bolt pulls {@code groupid} out of the JSON string and re-emits a
 * two-value ("groupid", "log") tuple that CAN be fields-grouped, so all messages
 * with the same groupid are routed to the same AnalysisWorker task.
 */
public class KafkaBoltMain {
    private static final String SPOUTNAME = "TopicSpout";
    private static final String PARSERBOLT = "JsonParserBolt";
    private static final String ANALYSISBOLT = "AnalysisWorker";
    private static final String CLIENTID = "Storm";
    private static final String TOPOLOGYNAME = "LocalTopology";

    /**
     * Intermediate bolt whose only job is to extract the numeric {@code groupid}
     * from the raw JSON string and emit ("groupid", "log") so downstream bolts
     * can fields-group on "groupid".
     */
    private static class JsonParserBolt extends BaseRichBolt {
        private static final long serialVersionUID = 1L;
        // Matches both quoted and unquoted keys, e.g. {groupid: 1234, ...}
        // or {"groupid": 1234, ...}. Compiled once (Pattern is thread-safe).
        private static final Pattern GROUPID_PATTERN =
                Pattern.compile("\"?groupid\"?\\s*:\\s*(\\d+)");
        private OutputCollector _collector;

        public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
            _collector = collector;
        }

        public void execute(Tuple tuple) {
            String json = tuple.getString(0); // StringScheme emits one field: the raw message
            Matcher m = GROUPID_PATTERN.matcher(json);
            if (m.find()) {
                long groupid = Long.parseLong(m.group(1));
                // Anchor to the input tuple so ack/fail propagates to the spout.
                _collector.emit(tuple, new Values(groupid, json));
            }
            // Messages without a groupid are dropped (acked, not failed) on purpose:
            // failing them would make Kafka replay malformed messages forever.
            _collector.ack(tuple);
        }

        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("groupid", "log"));
        }
    }

    /**
     * Terminal bolt; currently just prints each tuple value for demonstration.
     * Because it subscribes via fieldsGrouping("groupid"), every tuple with the
     * same groupid arrives at the same task instance.
     */
    private static class AppAnalysisBolt extends BaseRichBolt {
        private static final long serialVersionUID = -6885792881303198646L;
        private OutputCollector _collector;

        public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
            _collector = collector;
        }

        public void execute(Tuple tuple) {
            List<Object> objs = tuple.getValues();
            int i = 0;
            for (Object obj : objs) {
                System.out.println("" + i + "th object's value is:" + obj.toString());
                i++;
            }
            _collector.ack(tuple);
        }

        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            // This bolt is terminal; it declares fields but emits nothing.
            declarer.declare(new Fields("groupid", "log"));
        }
    }

    /**
     * Builds and submits the local topology.
     *
     * @param args {@code args[0]} = kafka zookeeper host:port, {@code args[1]} = topic name;
     *             a single "help" argument prints usage and exits.
     */
    public static void main(String[] args) {
        String zookeepers = null;
        String topicName = null;
        if (args.length == 2) {
            zookeepers = args[0];
            topicName = args[1];
        } else if (args.length == 1 && args[0].equalsIgnoreCase("help")) {
            System.out.println("xxxx");
            System.exit(0);
        } else {
            System.out.println("You need to have two arguments: kafka zookeeper:port and topic name");
            System.out.println("xxxx");
            System.exit(-1);
        }

        SpoutConfig spoutConfig = new SpoutConfig(new ZkHosts(zookeepers),
                topicName,
                "", // zookeeper root path for offset storing
                CLIENTID);
        spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
        KafkaSpout kafkaSpout = new KafkaSpout(spoutConfig);

        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout(SPOUTNAME, kafkaSpout, 1);
        // The parser bolt can subscribe any way it likes (the spout only has "str"),
        // so shuffle the raw messages across its tasks.
        builder.setBolt(PARSERBOLT, new JsonParserBolt(), 1)
                .shuffleGrouping(SPOUTNAME);
        // Fields-group on the field the PARSER declares — this is what the original
        // code tried to do directly on the spout, causing InvalidTopologyException.
        builder.setBolt(ANALYSISBOLT, new AppAnalysisBolt(), 2)
                .fieldsGrouping(PARSERBOLT, new Fields("groupid"));

        // Configuration
        Config conf = new Config();
        conf.setDebug(false);
        // Topology run
        conf.put(Config.TOPOLOGY_MAX_SPOUT_PENDING, 1);
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology(TOPOLOGYNAME, conf, builder.createTopology());
    }
}
import java.util.List;
导入java.util.Map;
导入java.util.concurrent.AtomicInteger;
导入storm.kafka.KafkaSpout;
导入storm.kafka.SpoutConfig;
导入storm.kafka.StringScheme;
导入storm.kafka.ZkHosts;
导入backtype.storm.Config;
导入backtype.storm.LocalCluster;
导入backtype.storm.spout.SchemeAsMultiScheme;
导入backtype.storm.task.OutputCollector;
导入backtype.storm.task.TopologyContext;
导入backtype.storm.topology.outputfieldsclarer;
导入backtype.storm.topology.TopologyBuilder;
导入backtype.storm.topology.base.BaseRichBolt;
导入backtype.storm.tuple.Fields;
导入backtype.storm.tuple.tuple;
导入backtype.storm.tuple.Values;
公共级卡夫卡布尔顿酒店{
私有静态最终字符串SPOUTNAME=“TopicSpout”;
私有静态最终字符串ANALYSISBOLT=“AnalysisWorker”;
私有静态最终字符串CLIENTID=“Storm”;
私有静态最终字符串TOPOLOGYNAME=“LocalTopology”;
私有静态类AppAnalysisBolt扩展了BaseRichBolt{
私有静态最终长serialVersionUID=-6885792881303198646L;
专用输出采集器\u采集器;
专用长组ID=-1L;
私有字符串log=“test”;
public void prepare(地图配置、拓扑上下文、OutputCollector){
_收集器=收集器;
}
公共void执行(元组){
List objs=tuple.getValues();
int i=0;
用于(对象对象对象:对象对象对象){
System.out.println(“+i+”对象的值为:“+obj.toString());
i++;
}
//_collector.emit(新值(groupid,log));
_collector.ack(元组);
}
公共无效申报输出字段(OutputFields申报器申报器){
declarer.declare(新字段(“groupid”、“log”));
}
}
公共静态void main(字符串[]args){
字符串zookeepers=null;
字符串topicName=null;
如果(args.length==2){
zookeepers=args[0];
topicName=args[1];
}else if(args.length==1&&args[0].equalsIgnoreCase(“帮助”){
系统输出打印项次(“xxxx”);
系统出口(0);
}
否则{
println(“您需要有两个参数:kafka zookeeper:port和topic name”);
系统输出打印项次(“xxxx”);
系统退出(-1);
}
喷动配置喷动配置=新喷动配置(新ZkHosts(动物园管理员),
主题名,
“”,//用于偏移存储的zookeeper根路径
CLIENTID);
spoutConfig.scheme=new SchemeAsMultiScheme(new StringScheme());
KafkaSpout KafkaSpout=新的KafkaSpout(spoutConfig);
TopologyBuilder=新TopologyBuilder();
建造商固定管道(管道名称,卡夫卡斯波特,1);
builder.setBolt(ANALYSISBOLT,新AppAnalysisBolt(),2)
.Fields分组(SPOUTNAME,新字段(“groupid”);
//配置
Config conf=new Config();
conf.setDebug(false);
//拓扑运行
conf.put(Config.TOPOLOGY\u MAX\u SPOUT\u PENDING,1);
LocalCluster cluster=新的LocalCluster();
submitTopology(TOPOLOGYNAME,conf,builder.createTopology());
}
}
但当我提交拓扑时,会出现以下错误:
12794 [main] WARN backtype.storm.daemon.nimbus - Topology submission exception. (topology name='LocalTopology') #<InvalidTopologyException InvalidTopologyException(msg:Component:
[AnalysisWorker] subscribes from stream: [default] of component [TopicSpout] with non-existent fields: #{"groupid"})>
12800 [main] ERROR org.apache.storm.zookeeper.server.NIOServerCnxnFactory - Thread Thread[main,5,main] died
backtype.storm.generated.InvalidTopologyException: null
12794[main]警告backtype.storm.daemon.nimbus-拓扑提交异常。(拓扑名称='LocalTopology')#
12800[main]错误org.apache.storm.zookeeper.server.NIOServerCnxnFactory-线程[main,5,main]已死亡
backtype.storm.generated.InvalidTopologyException:null
为什么会出现“字段不存在(non-existent fields)”的警告消息?有什么提示吗?

回答:您需要从 JSON 字符串中提取 groupId 属性,并把两个值(JSON 字符串本身和 groupId)作为一个二元组发出。在拓扑定义中声明该流时,把第二个字段命名为 "groupid",这样字段分组就能正常工作。如果您不想修改 Kafka spout,就需要一个中间 bolt,其唯一目的是把 groupId 从 JSON 对象中分离出来。该中间 bolt 也可以使用定向流(emitDirect() 方法),根据 JSON 对象中的 groupId 决定目标任务。这也是我通常不直接重用 Kafka spout 的一个原因——除了盲目地把数据写入流之外,我通常还想做其他处理。不过这就是题外话了。