Java 时间序列数据的Cassandra Map Reduce
如何从 mapper 中访问 Cassandra 列族(column family)?具体来说,如何将 map 方法的参数转换回我期望的 Java 类型?数据模型为:键 {logType} -> {列名: timeUUID,列值: CSV 日志行,TTL: 1 年}。感谢 @Chris 和 @rs_atl,我已成功运行 Hadoop 作业,以下是完整的代码(标签:java, hadoop, mapreduce, cassandra, hector):
package com.xxx.hadoop;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.SortedMap;
import org.apache.cassandra.db.IColumn;
import org.apache.cassandra.hadoop.ColumnFamilyInputFormat;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.thrift.SlicePredicate;
import org.apache.cassandra.thrift.SliceRange;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.thrift.TBaseHelper;
import com.xxx.parser.LogParser;
import com.netflix.astyanax.serializers.StringSerializer;
/**
 * Hadoop MapReduce job that reads the {@code LogBlock} column family of the
 * {@code LogKS} Cassandra keyspace through {@link ColumnFamilyInputFormat}.
 *
 * <p>For every column handed to the mapper, two records are emitted under the
 * row key: a {@code LineCount} of 1 and the {@code Minutes} value reported by
 * {@link LogParser#getTotalDuration()}. A single reducer sums the values per
 * key and the result is written to {@link #OUTPUT_PATH}.
 *
 * <p>Note: the slice predicate requests at most {@link #COLUMNS_PER_ROW}
 * columns per row, so rows with more columns are only partially processed.
 */
public class LogTypeCounterByDate extends Configured implements Tool {

    private static final String KEYSPACE = "LogKS";
    private static final String COLUMN_FAMILY = "LogBlock";
    private static final String JOB_NAME = "LOG_LINE_COUNT";
    private static final String INPUT_PARTITIONER = "org.apache.cassandra.dht.RandomPartitioner";
    private static final String INPUT_RPC_PORT = "9160";
    private static final String INPUT_INITIAL_ADDRESS = "192.168.1.21";
    private static final String OUTPUT_PATH = "/logOutput/results";

    /** Value passed to {@code ConfigHelper.setRangeBatchSize}. */
    private static final int RANGE_BATCH_SIZE = 1000;

    /** Column count limit of the slice range applied to every row. */
    private static final int COLUMNS_PER_ROW = 1000;

    /**
     * Configures and runs the job, blocking until it finishes.
     *
     * @param args command-line arguments left over after generic option parsing (unused)
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] args) throws Exception {
        Job job = new Job(getConf(), JOB_NAME);
        // Job works on its own copy of the configuration, so every setting made
        // after construction has to go through job.getConfiguration().
        Configuration jobConf = job.getConfiguration();

        job.setJarByClass(LogTypeCounterByDate.class);
        job.setMapperClass(LogTypeCounterByDateMapper.class);
        job.setReducerClass(LogTypeCounterByDateReducer.class);
        job.setInputFormatClass(ColumnFamilyInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Declare the reducer's output types explicitly; they match
        // Reducer<Text, LongWritable, Text, LongWritable> below.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setNumReduceTasks(1);

        ConfigHelper.setRangeBatchSize(jobConf, RANGE_BATCH_SIZE);

        // Empty start and finish select the whole column range; the third
        // argument is the Thrift "reversed" flag, the fourth the column limit.
        SliceRange sliceRange = new SliceRange(
                ByteBuffer.wrap(new byte[0]),
                ByteBuffer.wrap(new byte[0]),
                true,
                COLUMNS_PER_ROW);
        SlicePredicate slicePredicate = new SlicePredicate();
        slicePredicate.setSlice_range(sliceRange);

        ConfigHelper.setInputColumnFamily(jobConf, KEYSPACE, COLUMN_FAMILY);
        ConfigHelper.setInputRpcPort(jobConf, INPUT_RPC_PORT);
        ConfigHelper.setInputInitialAddress(jobConf, INPUT_INITIAL_ADDRESS);
        ConfigHelper.setInputPartitioner(jobConf, INPUT_PARTITIONER);
        ConfigHelper.setInputSlicePredicate(jobConf, slicePredicate);

        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Command-line entry point; the process exit status is the value returned
     * by {@link #run(String[])}, so a failed job yields a non-zero status.
     *
     * @param args command-line arguments, forwarded to {@link ToolRunner}
     * @throws Exception if the tool fails with an exception
     */
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Configuration(), new LogTypeCounterByDate(), args);
        System.exit(exitCode);
    }

    /**
     * Emits, for each column of a row, a line count of 1 and the duration
     * parsed from the column value, both keyed by the row key.
     */
    public static class LogTypeCounterByDateMapper
            extends Mapper<ByteBuffer, SortedMap<ByteBuffer, IColumn>, Text, LongWritable> {

        private static final LongWritable ONE = new LongWritable(1L);

        /**
         * Processes the columns handed over for one row key.
         *
         * @param key     row key; decoded as a string
         * @param columns columns of the row; each value is decoded as a string
         *                and handed to {@link LogParser}
         * @param context sink for the emitted (key, value) pairs
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void map(ByteBuffer key, SortedMap<ByteBuffer, IColumn> columns, Context context)
                throws IOException, InterruptedException {
            String rowKey = StringSerializer.get().fromByteBuffer(TBaseHelper.rightSize(key));

            // Both output keys depend only on the row key, so build them once.
            Text lineCountKey = new Text(rowKey + "\t" + "LineCount");
            Text minutesKey = new Text(rowKey + "\t" + "Minutes");

            for (IColumn column : columns.values()) {
                String line = StringSerializer.get()
                        .fromByteBuffer(TBaseHelper.rightSize(column.value()));
                LogParser parser = new LogParser(line);
                context.write(lineCountKey, ONE);
                context.write(minutesKey, new LongWritable(parser.getTotalDuration()));
            }
        }
    }

    /** Sums all values received for a key and emits the total under that key. */
    public static class LogTypeCounterByDateReducer
            extends Reducer<Text, LongWritable, Text, LongWritable> {

        /**
         * Adds up the values of one key.
         *
         * @param key     key produced by the mapper
         * @param values  all values emitted for {@code key}
         * @param context sink for the (key, total) pair
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long total = 0;
            for (LongWritable value : values) {
                total += value.get();
            }
            context.write(key, new LongWritable(total));
        }
    }
}
上面的代码对每一行只向 mapper 提供前 1000 列,而我希望每一行的所有列都能以每批 1000 列的方式全部提供给 mapper。
请有人在这方面帮助我。给定参数:
// Row key as raw bytes (first parameter of the mapper's map method).
ByteBuffer key;
// Columns of the row (second parameter of map); presumably keyed by the raw column name.
SortedMap<ByteBuffer, IColumn> columns;
先获取反序列化后的行键,例如:String rowKey = StringSerializer.get().fromByteBuffer(TBaseHelper.rightSize(key)); 注意,这里假设行键是字符串;如果是其他类型,则必须使用相应的序列化程序类。
要获取列值,请执行以下操作:
// Walk over every column of the row and deserialize its value.
// "xxx" / "xxxSerializer" are placeholders for the value's Java type and its serializer.
Iterator<ByteBuffer> iter = columns.keySet().iterator();
while (iter.hasNext()) {
    IColumn col = columns.get(iter.next());
    xxx colVal = xxxSerializer.get().fromByteBuffer(TBaseHelper.rightSize(col.value()));
}
其中xxx是列值的Java类型,xxxSerializer是相应的序列化程序
顺便说一句,TBaseHelper 类用于把值在内部字节数组中的偏移量校正为零,从而满足序列化程序实现所依赖的假设。
还有一件事。。。如果您正在检索时间序列,那么每一列都是它自己的时间序列值,您需要包括适当的映射器逻辑,比如某种数学运算和对列上迭代循环内的上下文的写入。相反,如果您有一个更静态的列族,更像传统的sql表,那么您可能会对整行执行一次对上下文的写入 给定参数:
// Row key as raw bytes (first parameter of the mapper's map method).
ByteBuffer key;
// Columns of the row (second parameter of map); presumably keyed by the raw column name.
SortedMap<ByteBuffer, IColumn> columns;
先获取反序列化后的行键,例如:String rowKey = StringSerializer.get().fromByteBuffer(TBaseHelper.rightSize(key)); 注意,这里假设行键是字符串;如果是其他类型,则必须使用相应的序列化程序类。
要获取列值,请执行以下操作:
// Walk over every column of the row and deserialize its value.
// "xxx" / "xxxSerializer" are placeholders for the value's Java type and its serializer.
Iterator<ByteBuffer> iter = columns.keySet().iterator();
while (iter.hasNext()) {
    IColumn col = columns.get(iter.next());
    xxx colVal = xxxSerializer.get().fromByteBuffer(TBaseHelper.rightSize(col.value()));
}
其中xxx是列值的Java类型,xxxSerializer是相应的序列化程序
顺便说一句,TBaseHelper 类用于把值在内部字节数组中的偏移量校正为零,从而满足序列化程序实现所依赖的假设。
还有一件事……如果您正在检索时间序列,那么每一列本身就是一个时间序列值,您需要在遍历列的循环内部加入相应的 mapper 逻辑,例如某种数学运算以及对 context 的写入。相反,如果您的列族更偏静态、更像传统的 SQL 表,那么您可能只需对整行执行一次对 context 的写入。 评论:谢谢 @Chris,你能看一下我的代码吗?原始问题已更新。我不明白 Mapper 和 Reducer 的 setup 方法里应该写什么。 回复:setup 和 cleanup 方法里不需要写任何东西。如果您想在处理输入分片(input split)之前或之后分别运行某些逻辑,才需要用到它们。 评论:感谢您的确认,我将运行我的第一个针对 Cassandra 的 Map Reduce 作业,并在此处分享结果。再次感谢。我已经提出了另一个问题[链接],你可以在 @rs_atl 回答的评论部分找到这个问题。