Hadoop 如何在MapReduce作业中导入存储在配置单元中的表数据?

Hadoop 如何在MapReduce作业中导入存储在配置单元中的表数据?,hadoop,mapreduce,hive,hcatalog,Hadoop,Mapreduce,Hive,Hcatalog,我在Ubuntu 14.04上使用Apache Hadoop 2.5.0的单节点群集设置 我使用Flume将推文存储在HDFS中。 然后,我使用以下配置单元命令在配置单元中创建一个表,以表格格式存储所有tweet: CREATE EXTERNAL TABLE tweets ( id BIGINT, created_at STRING, source STRING, favorited BOOLEAN, retweet_count INT, retweeted_

我在Ubuntu 14.04上使用Apache Hadoop 2.5.0搭建了单节点集群,并使用Flume将推文存储到HDFS中。然后,我使用以下Hive命令在Hive中创建一个表,以表格形式存储所有推文:

-- External Hive table over the JSON tweets that Flume writes to HDFS.
-- EXTERNAL + LOCATION: Hive only points at /user/flume/tweets, so dropping
-- the table leaves the underlying files in place. The JSONSerDe maps each
-- JSON tweet document onto the columns below.
CREATE EXTERNAL TABLE tweets (
   id BIGINT,
   created_at STRING,
   source STRING,
   favorited BOOLEAN,
   retweet_count INT,
   -- nested struct: the retweeted tweet, when present
   retweeted_status STRUCT<
      text:STRING,
      user:STRUCT<screen_name:STRING,name:STRING>>,
   -- URLs, user mentions and hashtags embedded in the tweet
   entities STRUCT<
      urls:ARRAY<STRUCT<expanded_url:STRING>>,
      user_mentions:ARRAY<STRUCT<screen_name:STRING,name:STRING>>,
      hashtags:ARRAY<STRUCT<text:STRING>>>,
   text STRING,
   -- author profile fields
   user STRUCT<
      screen_name:STRING,
      name:STRING,
      friends_count:INT,
      followers_count:INT,
      statuses_count:INT,
      verified:BOOLEAN,
      utc_offset:INT,
      time_zone:STRING>,
   in_reply_to_screen_name STRING
)
ROW FORMAT SERDE 'com.cloudera.hive.serde.JSONSerDe'
LOCATION '/user/flume/tweets';
我使用的是Apache Hive 0.13.1,其中已经自带了HCatalog。完成这些之后,我尝试在Eclipse中用Java编写一个MapReduce作业,并已将以下库作为外部JAR添加到我的项目中:

  • hadoop/share/hadoop/common安装路径中的所有库
  • hadoop/share/hadoop/mapreduce安装路径中的所有库
  • 配置单元的lib文件夹中存在的所有库
  • 配置单元/hcatalog/share/hcatalog安装路径中存在的所有库
  • 我的MapReduce代码正在尝试从表“tweets”导入tweets的文本,然后对其进行处理。我的MapReduce代码是:

    import java.io.IOException;
    import java.util.*;
    
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.conf.*;
    import org.apache.hadoop.io.*;
    import org.apache.hadoop.mapreduce.*;
    import org.apache.hadoop.util.*;
    import org.apache.hcatalog.common.*;
    import org.apache.hcatalog.mapreduce.*;
    import org.apache.hcatalog.data.*;
    import org.apache.hcatalog.data.schema.*;
    
    public class UseHCat extends Configured implements Tool {
    
        public static class Map extends Mapper<WritableComparable, HCatRecord, Text, IntWritable> {
        String tweetText;
    
        @Override
          protected void map( WritableComparable key,
                          HCatRecord value,
                          org.apache.hadoop.mapreduce.Mapper<WritableComparable, HCatRecord,
                          Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
            tweetText = (String) value.get(7);
            int i = 1;
            context.write(new Text(tweetText), new IntWritable(i));
        }
        }
    
        public static class Reduce extends Reducer<Text, IntWritable,
                                       WritableComparable, HCatRecord> {
    
        protected void reduce( Text key,
                               java.lang.Iterable<IntWritable> values,
                               org.apache.hadoop.mapreduce.Reducer<Text, IntWritable,
                               WritableComparable, HCatRecord>.Context context)
            throws IOException, InterruptedException {
            Iterator<IntWritable> iter = values.iterator();
            IntWritable iw = iter.next();
            int id = iw.get();
            HCatRecord record = new DefaultHCatRecord(2);
            record.set(0, key.toString());
            record.set(1, id);
            context.write(null, record);
        }
        }
    
        public int run(String[] args) throws Exception {
        Configuration conf = getConf();
    
        String inputTableName = "tweets";
        String outputTableName = "outputtable";
        String dbName = null;
    
        Job job = new Job(conf, "UseHCat");
        HCatInputFormat.setInput(job, InputJobInfo.create(dbName, inputTableName, null));
        job.setJarByClass(UseHCat.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
    
        // An HCatalog record as input
        job.setInputFormatClass(HCatInputFormat.class);
    
        // Mapper emits a string as key and an integer as value
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
    
        // Ignore the key for the reducer output; emitting an HCatalog record as value
        job.setOutputKeyClass(WritableComparable.class);
        job.setOutputValueClass(DefaultHCatRecord.class);
        job.setOutputFormatClass(HCatOutputFormat.class);
    
        HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, outputTableName, null));
        HCatSchema s = HCatOutputFormat.getTableSchema(job);
        System.err.println("INFO: output schema explicitly set for writing:" + s);
        HCatOutputFormat.setSchema(job, s);
        return (job.waitForCompletion(true) ? 0 : 1);
        }
    
        public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new UseHCat(), args);
        System.exit(exitCode);
        }
    }
    
    我们得到了以下错误:

    14/11/16 17:17:29 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/hadoop/hive/ql/metadata/HiveStorageHandler
        at java.lang.ClassLoader.defineClass1(Native Method)
        at java.lang.ClassLoader.defineClass(ClassLoader.java:800)
        at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
        at java.net.URLClassLoader.defineClass(URLClassLoader.java:449)
        at java.net.URLClassLoader.access$100(URLClassLoader.java:71)
        at java.net.URLClassLoader$1.run(URLClassLoader.java:361)
        at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
        at java.security.AccessController.doPrivileged(Native Method)
        at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
        at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
        at org.apache.hcatalog.mapreduce.InitializeInput.getInputJobInfo(InitializeInput.java:146)
        at org.apache.hcatalog.mapreduce.InitializeInput.setInput(InitializeInput.java:86)
        at org.apache.hcatalog.mapreduce.HCatInputFormat.setInput(HCatInputFormat.java:86)
        at org.apache.hcatalog.mapreduce.HCatInputFormat.setInput(HCatInputFormat.java:55)
        at org.apache.hcatalog.mapreduce.HCatInputFormat.setInput(HCatInputFormat.java:47)
        at UseHCat.run(UseHCat.java:64)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
        at UseHCat.main(UseHCat.java:91)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:606)
        at org.apache.hadoop.util.RunJar.main(RunJar.java:212)
    Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.hive.ql.metadata.HiveStorageHandler
        at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
        at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
        at java.security.AccessController.doPrivileged(Native Method)
        at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
        at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
        ... 26 more
    

    Hive的设计初衷就是尽量减少手写MapReduce程序的需要。您可以直接用Hive查询完成这类处理,它会在内部自动转换为MapReduce作业。

    但是,如果您确实想访问Hive中的数据,也是可以做到的。Hive并不是传统意义上的数据库:所有数据都以可读格式存储在仓库目录(warehouse)下,您可以把这些数据文件的完整路径直接作为MapReduce程序的输入。


    您是否在eclipse中尝试过一个示例mapreduce程序。因为您已经构建了Hadoop插件,或者您可以使用eclipse中现有的插件来运行mapreduce。

    也可以使用指向某个博客的任何链接。可能其他节点中没有安装hive,因此包含
    HiveStorageHandler
    类的jar在那里不可用。尝试使用
    job.addFileToClassPath
    将外部JAR添加到缓存中,看看是否有帮助。不过这只是猜测而已!嘿,铁匠,我正在使用单节点群集设置,我能够运行示例MapReduce程序,如WordCount。我必须解决的主要问题是-如何在MapReduce作业中导入存储在配置单元中的表数据?嘿,Harshit,您是否找到了直接从自定义map reduce程序处理配置单元表数据的解决方案。
    hadoop jar MyProject.jar
    
    14/11/16 17:17:29 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/hadoop/hive/ql/metadata/HiveStorageHandler
        at java.lang.ClassLoader.defineClass1(Native Method)
        at java.lang.ClassLoader.defineClass(ClassLoader.java:800)
        at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
        at java.net.URLClassLoader.defineClass(URLClassLoader.java:449)
        at java.net.URLClassLoader.access$100(URLClassLoader.java:71)
        at java.net.URLClassLoader$1.run(URLClassLoader.java:361)
        at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
        at java.security.AccessController.doPrivileged(Native Method)
        at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
        at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
        at org.apache.hcatalog.mapreduce.InitializeInput.getInputJobInfo(InitializeInput.java:146)
        at org.apache.hcatalog.mapreduce.InitializeInput.setInput(InitializeInput.java:86)
        at org.apache.hcatalog.mapreduce.HCatInputFormat.setInput(HCatInputFormat.java:86)
        at org.apache.hcatalog.mapreduce.HCatInputFormat.setInput(HCatInputFormat.java:55)
        at org.apache.hcatalog.mapreduce.HCatInputFormat.setInput(HCatInputFormat.java:47)
        at UseHCat.run(UseHCat.java:64)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
        at UseHCat.main(UseHCat.java:91)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:606)
        at org.apache.hadoop.util.RunJar.main(RunJar.java:212)
    Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.hive.ql.metadata.HiveStorageHandler
        at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
        at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
        at java.security.AccessController.doPrivileged(Native Method)
        at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
        at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
        ... 26 more