MapReduce Hadoop: map-side join of 2 datasets using a custom InputFormat


My input File 1:

    cdd8dde3-0349-4f0d-b97a-7ae84b687f9c,Esther,Garner,4071 Haven Lane,Okemos,MI
    81a43486-07e1-4b92-b92b-03d0caa87b5f,Timothy,Duncan,753 Stadium Drive,Taunton,MA

File 2:

    cdd8dde3-0349-4f0d-b97a-7ae84b687f9c,517-706-9565,EstherJGarner@teleworm.us,Waskepter38,Nollegi,MasterCard,5305687295670850
    81a43486-07e1-4b92-b92b-03d0caa87b5f,508-307-3433,TimothyDDuncan@einrot.com,Konaes,Gifediba,MasterCard,5265896533330445

I am learning the Hadoop MapReduce framework and am trying to join two datasets that have the first field (Text) of each line as the key. I searched previous Stack Overflow posts, but nothing worked out. Here I am trying to customize the InputFormat and to join on the ID, which is the first field in each line of both datasets.


In the logs, Map input records is 0, and there is no output to be seen on HDFS. Could someone please help me understand this issue? Thanks.

The problem is in the driver class: in the strJoinStmt expression, the InputFormat is given as KeyValueLongInputFormat.class, which is meant for a LongWritable key and a Text value. Since both the key and the value here are of type Text, KeyValueTextInputFormat.class can be used instead. Because the input is a comma-separated file, the separator can also be specified by setting a property on the job's Configuration object in the driver class: conf.set("key.value.separator.in.input.line", ","); For complete details, check the example below:
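A minimal sketch of the suggested fix in the driver, assuming the old org.apache.hadoop.mapred API; the Path variables (dirEmployeesData, dirSalaryData, dirOutput) are taken from the asker's driver, and the driver class name is hypothetical:

    JobConf conf = new JobConf(DriverMapSideJoin.class); // hypothetical driver class name

    // Split each input line into (Text key, Text value) at the first comma,
    // using the stock KeyValueTextInputFormat instead of the custom one.
    conf.set("key.value.separator.in.input.line", ",");
    String strJoinStmt = CompositeInputFormat.compose("inner",
            KeyValueTextInputFormat.class, dirEmployeesData, dirSalaryData);
    conf.set("mapred.join.expr", strJoinStmt);
    conf.setInputFormat(CompositeInputFormat.class);

    conf.setMapperClass(MapperMapSideJoinLargeDatasets.class);
    conf.setNumReduceTasks(0); // map-only join

    dirOutput.getFileSystem(conf).delete(dirOutput, true);
    TextOutputFormat.setOutputPath(conf, dirOutput);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(TextOutputFormat.class);

    JobClient.runJob(conf);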

Sudha, I tried KeyValueTextInputFormat but missed the conf setting. I will try it... thank you.
**Driver class:**
         conf.setInputFormat(CompositeInputFormat.class);
         String strJoinStmt = CompositeInputFormat.compose("inner",
         KeyValueLongInputFormat.class, dirEmployeesData, dirSalaryData);
         conf.set("mapred.join.expr", strJoinStmt);
         conf.setNumReduceTasks(0);
         dirOutput.getFileSystem(conf).delete(dirOutput);
         TextOutputFormat.setOutputPath(conf, dirOutput);
         conf.setOutputKeyClass(Text.class);
         conf.setOutputValueClass(Text.class);
         conf.setOutputFormat(TextOutputFormat.class);
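For reference, compose() only builds the join expression string that CompositeInputFormat later parses out of the mapred.join.expr property; with two input directories it renders something roughly of the form below (the HDFS paths are placeholders, and the fully qualified class name is abbreviated):

    inner(tbl(KeyValueLongInputFormat,"hdfs://.../employees"),tbl(KeyValueLongInputFormat,"hdfs://.../salary"))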

**Custom RecordReader class:**

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.FileSplit;
    import org.apache.hadoop.mapred.LineRecordReader;
    import org.apache.hadoop.mapred.RecordReader;

    public class KeyValueLongLineRecordReader implements RecordReader<Text, Text> {

        private final LineRecordReader lineRecordReader;
        private byte separator = (byte) ',';
        private LongWritable dummyKey; // byte offset from LineRecordReader, discarded here
        private Text innerValue;       // raw line text from LineRecordReader

        public KeyValueLongLineRecordReader(Configuration job, FileSplit split)
                throws IOException {
            lineRecordReader = new LineRecordReader(job, split);
            dummyKey = lineRecordReader.createKey();
            innerValue = lineRecordReader.createValue();
            String sepStr = job.get("key.value.separator.in.input.line", ",");
            this.separator = (byte) sepStr.charAt(0);
        }

        public Class getKeyClass() {
            return Text.class;
        }

        public Text createKey() {
            return new Text("");
        }

        public Text createValue() {
            return new Text();
        }

        /** Returns the index of the first occurrence of sep, or -1 if it is absent. */
        public static int findSeparator(byte[] utf, int start, int length, byte sep) {
            for (int i = start; i < (start + length); i++) {
                if (utf[i] == sep) {
                    return i;
                }
            }
            return -1;
        }

        /** Read key/value pair in a line. */
        public synchronized boolean next(Text key, Text value) throws IOException {
            if (!lineRecordReader.next(dummyKey, innerValue)) {
                return false;
            }
            byte[] line = innerValue.getBytes();
            int lineLen = innerValue.getLength();

            int pos = findSeparator(line, 0, lineLen, this.separator);
            if (pos == -1) {
                // No separator: the whole line becomes the key, the value stays empty.
                key.set(new String(line, 0, lineLen));
                value.set("");
            } else {
                // Key = bytes before the first separator, value = everything after it.
                int keyLen = pos;
                byte[] keyBytes = new byte[keyLen];
                System.arraycopy(line, 0, keyBytes, 0, keyLen);
                int valLen = lineLen - keyLen - 1;
                byte[] valBytes = new byte[valLen];
                System.arraycopy(line, pos + 1, valBytes, 0, valLen);
                key.set(new String(keyBytes));
                value.set(valBytes);
            }
            return true;
        }

        // Required by the RecordReader interface; delegate to the wrapped LineRecordReader.
        public long getPos() throws IOException {
            return lineRecordReader.getPos();
        }

        public float getProgress() throws IOException {
            return lineRecordReader.getProgress();
        }

        public synchronized void close() throws IOException {
            lineRecordReader.close();
        }
    }
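As a quick standalone sanity check of the split logic (not part of the job), running findSeparator over one of the sample lines should put the UUID in the key and everything after the first comma in the value:

    byte[] line = "cdd8dde3-0349-4f0d-b97a-7ae84b687f9c,Esther,Garner".getBytes();
    int pos = KeyValueLongLineRecordReader.findSeparator(line, 0, line.length, (byte) ',');
    // pos == 36 -> key: "cdd8dde3-0349-4f0d-b97a-7ae84b687f9c", value: "Esther,Garner"
    System.out.println(new String(line, 0, pos));
    System.out.println(new String(line, pos + 1, line.length - pos - 1));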
**InputFormat class:**

    import java.io.IOException;

    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.compress.CompressionCodecFactory;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.FileSplit;
    import org.apache.hadoop.mapred.InputSplit;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.JobConfigurable;
    import org.apache.hadoop.mapred.RecordReader;
    import org.apache.hadoop.mapred.Reporter;

    public class KeyValueLongInputFormat extends FileInputFormat<Text, Text>
            implements JobConfigurable {

        private CompressionCodecFactory compressionCodecs = null;

        @Override
        public void configure(JobConf conf) {
            compressionCodecs = new CompressionCodecFactory(conf);
        }

        @Override
        protected boolean isSplitable(FileSystem fs, Path file) {
            // Compressed files cannot be split line by line.
            return compressionCodecs.getCodec(file) == null;
        }

        @Override
        public RecordReader<Text, Text> getRecordReader(InputSplit genericSplit,
                JobConf job, Reporter reporter) throws IOException {
            reporter.setStatus(genericSplit.toString());
            return new KeyValueLongLineRecordReader(job, (FileSplit) genericSplit);
        }
    }

**Finally, the Mapper class:**

    import java.io.IOException;

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.MapReduceBase;
    import org.apache.hadoop.mapred.Mapper;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.hadoop.mapred.join.TupleWritable;

    public class MapperMapSideJoinLargeDatasets extends MapReduceBase implements
            Mapper<Text, TupleWritable, Text, Text> {

        Text txtKey = new Text("");
        Text txtValue = new Text("");

        @Override
        public void map(Text key, TupleWritable value,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            if (value.toString().length() > 0) {
                txtKey.set(key.toString());
                // value.get(0) holds the record from File 1, value.get(1) from File 2.
                String arrEmpAttributes[] = value.get(0).toString().split(",");
                String arrDeptAttributes[] = value.get(1).toString().split(",");
                txtValue.set(arrEmpAttributes[1] + "\t"
                        + arrEmpAttributes[2] + "\t"
                        + arrDeptAttributes[0]);
                output.collect(txtKey, txtValue);
            }
        }
    }
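Note that the TupleWritable entries hold only the value portion of each source record (the UUID key has already been stripped by the record reader), so arrEmpAttributes[1] and arrEmpAttributes[2] are the last name and street here, and arrDeptAttributes[0] is the phone number. With the sample inputs above, once the KeyValueTextInputFormat fix from the answer is applied, the map-only output written by TextOutputFormat should look like:

    cdd8dde3-0349-4f0d-b97a-7ae84b687f9c    Garner    4071 Haven Lane    517-706-9565
    81a43486-07e1-4b92-b92b-03d0caa87b5f    Duncan    753 Stadium Drive    508-307-3433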