HBase with Spark SQL


I have a table named "sample" in HBase. I need to query that table using Apache Spark SQL.
Is there any way to read the HBase data with an Apache Spark SQL query?

Spark SQL is an in-memory query engine. To run query operations with Spark SQL on top of an HBase table, you need to:

  • Fetch the data from HBase with Spark and build a Spark RDD (a sketch of the StudentBean class this code assumes follows the snippet)

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.client.Result;
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
    import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
    import org.apache.hadoop.hbase.util.Bytes;
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import scala.Tuple2;
    
    SparkConf sparkConf = new SparkConf();
    sparkConf.setAppName("SparkApp");
    sparkConf.setMaster("local[*]");
    
    JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    
    // Point the Hadoop configuration at the cluster and at the "sample" table
    Configuration config = HBaseConfiguration.create();
    config.addResource(new Path("/etc/hbase/hbase-site.xml"));
    config.addResource(new Path("/etc/hadoop/core-site.xml"));
    config.set(TableInputFormat.INPUT_TABLE, "sample");
    
    // Read the table as an RDD of (row key, Result) pairs
    JavaPairRDD<ImmutableBytesWritable, Result> hbaseRDD = javaSparkContext.newAPIHadoopRDD(
            config, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);
    
    // Map each HBase Result into a plain JavaBean
    JavaRDD<StudentBean> sampleRDD = hbaseRDD.map(new Function<Tuple2<ImmutableBytesWritable, Result>, StudentBean>() {
        private static final long serialVersionUID = -2021713021648730786L;
        public StudentBean call(Tuple2<ImmutableBytesWritable, Result> tuple) {
            StudentBean bean = new StudentBean();
            Result result = tuple._2;
            bean.setRowKey(Bytes.toString(result.getRow()));
            bean.setFirstName(Bytes.toString(result.getValue(Bytes.toBytes("details"), Bytes.toBytes("firstName"))));
            bean.setLastName(Bytes.toString(result.getValue(Bytes.toBytes("details"), Bytes.toBytes("lastName"))));
            bean.setBranch(Bytes.toString(result.getValue(Bytes.toBytes("details"), Bytes.toBytes("branch"))));
            bean.setEmailId(Bytes.toString(result.getValue(Bytes.toBytes("details"), Bytes.toBytes("emailId"))));
            return bean;
        }
    });
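
The snippets assume a StudentBean class that is not shown in the question. A minimal sketch could look like the following; the field names simply mirror the getters and setters used above, and the class has to be Serializable and follow JavaBean conventions so that Spark can infer a schema from it.

    import java.io.Serializable;
    
    // Hypothetical JavaBean backing the RDD and DataFrame above.
    // Spark infers the DataFrame schema from the getters, so every field
    // needs a public getter/setter and the class must be Serializable.
    public class StudentBean implements Serializable {
        private static final long serialVersionUID = 1L;
    
        private String rowKey;
        private String firstName;
        private String lastName;
        private String branch;
        private String emailId;
    
        public String getRowKey() { return rowKey; }
        public void setRowKey(String rowKey) { this.rowKey = rowKey; }
        public String getFirstName() { return firstName; }
        public void setFirstName(String firstName) { this.firstName = firstName; }
        public String getLastName() { return lastName; }
        public void setLastName(String lastName) { this.lastName = lastName; }
        public String getBranch() { return branch; }
        public void setBranch(String branch) { this.branch = branch; }
        public String getEmailId() { return emailId; }
        public void setEmailId(String emailId) { this.emailId = emailId; }
    }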
    
  • Use this RDD to create a DataFrame, register it under a temporary table name, and then you can run your queries against it

    import org.apache.spark.sql.DataFrame;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SQLContext;
    
    // Spark 1.x API: build a SQLContext on top of the existing SparkContext
    SQLContext sqlContext = new SQLContext(javaSparkContext.sc());
    
    // Infer the schema from the JavaBean and expose the data as a temp table
    DataFrame schema = sqlContext.createDataFrame(sampleRDD, StudentBean.class);
    schema.registerTempTable("spark_sql_temp_table");
    
    DataFrame schemaRDD = sqlContext.sql("YOUR_QUERY_GOES_HERE");
    
    // Map the query result rows back into beans if you need typed objects
    JavaRDD<StudentBean> result = schemaRDD.toJavaRDD().map(new Function<Row, StudentBean>() {
    
        private static final long serialVersionUID = -2558736294883522519L;
    
        public StudentBean call(Row row) throws Exception {
            StudentBean bean = new StudentBean();
            // Do the mapping stuff here
            return bean;
        }
    });
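
As a concrete illustration, the placeholder query can be replaced by an ordinary SQL statement over the bean's fields. This is only a sketch: the table and column names below follow the example above, and the 'CSE' branch value is made up for the sake of the example.

    // Hypothetical query: select a few columns for students in a given branch
    DataFrame csStudents = sqlContext.sql(
            "SELECT firstName, lastName, emailId FROM spark_sql_temp_table WHERE branch = 'CSE'");
    
    for (Row row : csStudents.collectAsList()) {
        System.out.println(row.getString(0) + " " + row.getString(1) + " <" + row.getString(2) + ">");
    }

Note that this answer targets the Spark 1.x API: in Spark 2.x, DataFrame became Dataset<Row>, registerTempTable was replaced by createOrReplaceTempView, and SQLContext is superseded by SparkSession.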
    
