HBase with Spark SQL


I have a table named "sample" in HBase. I need to query that table using Apache Spark SQL.
Is there any way to read the HBase data with an Apache Spark SQL query?

Spark SQL is an in-memory query engine. To run query operations with Spark SQL on top of an HBase table, you need to:

  • Fetch the data from HBase with Spark and build a Spark RDD (a sketch of the StudentBean class this code assumes follows the snippet)

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.client.Result;
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
    import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
    import org.apache.hadoop.hbase.util.Bytes;
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import scala.Tuple2;
    
    SparkConf sparkConf = new SparkConf();
    sparkConf.setAppName("SparkApp");
    sparkConf.setMaster("local[*]");
    
    JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    
    // Point the Hadoop configuration at the cluster and at the "sample" table
    Configuration config = HBaseConfiguration.create();
    config.addResource(new Path("/etc/hbase/hbase-site.xml"));
    config.addResource(new Path("/etc/hadoop/core-site.xml"));
    config.set(TableInputFormat.INPUT_TABLE, "sample");
    
    // Read the table as an RDD of (row key, Result) pairs
    JavaPairRDD<ImmutableBytesWritable, Result> hbaseRDD = javaSparkContext.newAPIHadoopRDD(
            config, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);
    
    // Map each HBase Result into a plain JavaBean
    JavaRDD<StudentBean> sampleRDD = hbaseRDD.map(new Function<Tuple2<ImmutableBytesWritable, Result>, StudentBean>() {
        private static final long serialVersionUID = -2021713021648730786L;
        public StudentBean call(Tuple2<ImmutableBytesWritable, Result> tuple) {
            StudentBean bean = new StudentBean();
            Result result = tuple._2;
            bean.setRowKey(Bytes.toString(result.getRow()));
            bean.setFirstName(Bytes.toString(result.getValue(Bytes.toBytes("details"), Bytes.toBytes("firstName"))));
            bean.setLastName(Bytes.toString(result.getValue(Bytes.toBytes("details"), Bytes.toBytes("lastName"))));
            bean.setBranch(Bytes.toString(result.getValue(Bytes.toBytes("details"), Bytes.toBytes("branch"))));
            bean.setEmailId(Bytes.toString(result.getValue(Bytes.toBytes("details"), Bytes.toBytes("emailId"))));
            return bean;
        }
    });
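
The snippets assume a StudentBean class that is not shown in the question. A minimal sketch could look like the following; the field names simply mirror the getters and setters used above, and the class has to be Serializable and follow JavaBean conventions so that Spark can infer a schema from it.

    import java.io.Serializable;
    
    // Hypothetical JavaBean backing the RDD and DataFrame above.
    // Spark infers the DataFrame schema from the getters, so every field
    // needs a public getter/setter and the class must be Serializable.
    public class StudentBean implements Serializable {
        private static final long serialVersionUID = 1L;
    
        private String rowKey;
        private String firstName;
        private String lastName;
        private String branch;
        private String emailId;
    
        public String getRowKey() { return rowKey; }
        public void setRowKey(String rowKey) { this.rowKey = rowKey; }
        public String getFirstName() { return firstName; }
        public void setFirstName(String firstName) { this.firstName = firstName; }
        public String getLastName() { return lastName; }
        public void setLastName(String lastName) { this.lastName = lastName; }
        public String getBranch() { return branch; }
        public void setBranch(String branch) { this.branch = branch; }
        public String getEmailId() { return emailId; }
        public void setEmailId(String emailId) { this.emailId = emailId; }
    }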
    
  • Use this RDD to create a DataFrame, register it under a temporary table name, and then you can run your queries against it

    import org.apache.spark.sql.DataFrame;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SQLContext;
    
    // Spark 1.x API: build a SQLContext on top of the existing SparkContext
    SQLContext sqlContext = new SQLContext(javaSparkContext.sc());
    
    // Infer the schema from the JavaBean and expose the data as a temp table
    DataFrame schema = sqlContext.createDataFrame(sampleRDD, StudentBean.class);
    schema.registerTempTable("spark_sql_temp_table");
    
    DataFrame schemaRDD = sqlContext.sql("YOUR_QUERY_GOES_HERE");
    
    // Map the query result rows back into beans if you need typed objects
    JavaRDD<StudentBean> result = schemaRDD.toJavaRDD().map(new Function<Row, StudentBean>() {
    
        private static final long serialVersionUID = -2558736294883522519L;
    
        public StudentBean call(Row row) throws Exception {
            StudentBean bean = new StudentBean();
            // Do the mapping stuff here
            return bean;
        }
    });
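
As a concrete illustration, the placeholder query can be replaced by an ordinary SQL statement over the bean's fields. This is only a sketch: the table and column names below follow the example above, and the 'CSE' branch value is made up for the sake of the example.

    // Hypothetical query: select a few columns for students in a given branch
    DataFrame csStudents = sqlContext.sql(
            "SELECT firstName, lastName, emailId FROM spark_sql_temp_table WHERE branch = 'CSE'");
    
    for (Row row : csStudents.collectAsList()) {
        System.out.println(row.getString(0) + " " + row.getString(1) + " <" + row.getString(2) + ">");
    }

Note that this answer targets the Spark 1.x API: in Spark 2.x, DataFrame became Dataset<Row>, registerTempTable was replaced by createOrReplaceTempView, and SQLContext is superseded by SparkSession.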
    
