使用Scala将SparkRDD写入HBase表
我正在尝试使用scala(以前从未使用过)将SparkRDD写入HBase表。整个代码如下所示:使用Scala将SparkRDD写入HBase表,scala,apache-spark,hbase,rdd,Scala,Apache Spark,Hbase,Rdd,我正在尝试使用scala(以前从未使用过)将SparkRDD写入HBase表。整个代码如下所示: import org.apache.hadoop.hbase.client.{HBaseAdmin, Result} import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor} import org.apache.hadoop.hbase.mapreduce.TableInputFormat import org.apa
import org.apache.hadoop.hbase.client.{HBaseAdmin, Result}
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import scala.collection.JavaConverters._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark._
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.rdd.PairRDDFunctions
import org.apache.spark.SparkContext._
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.client._
object HBaseWrite {
def main(args: Array[String]) {
val sparkConf = new SparkConf().setAppName("HBaseWrite").setMaster("local").set("spark.driver.allowMultipleContexts","true").set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
val sc = new SparkContext(sparkConf)
val conf = HBaseConfiguration.create()
val outputTable = "tablename"
System.setProperty("user.name", "hdfs")
System.setProperty("HADOOP_USER_NAME", "hdfs")
conf.set("hbase.master", "localhost:60000")
conf.setInt("timeout", 120000)
conf.set("hbase.zookeeper.quorum", "localhost")
conf.set("zookeeper.znode.parent", "/hbase-unsecure")
conf.setInt("hbase.client.scanner.caching", 10000)
sparkConf.registerKryoClasses(Array(classOf[org.apache.hadoop.hbase.client.Result]))
val jobConfig: JobConf = new JobConf(conf,this.getClass)
jobConfig.setOutputFormat(classOf[TableOutputFormat])
jobConfig.set(TableOutputFormat.OUTPUT_TABLE,outputTable)
val x = 12
val y = 15
val z = 25
var newarray = Array(x,y,z)
val newrddtohbase = sc.parallelize(newarray)
def convert(a:Int) : Tuple2[ImmutableBytesWritable,Put] = {
val p = new Put(Bytes.toBytes(a))
p.add(Bytes.toBytes("columnfamily"),
Bytes.toBytes("col_1"), Bytes.toBytes(a))
new Tuple2[ImmutableBytesWritable,Put](new ImmutableBytesWritable(a.toString.getBytes()), p);
}
new PairRDDFunctions(newrddtohbase.map(convert)).saveAsHadoopDataset(jobConfig)
sc.stop()
}
}
在执行HBaseWrite(main(Array())之后,我遇到的错误如下:
如何继续完成它?您在这里做错的事情是在
main
内部定义convert
如果以这种方式编写此代码,它可能会工作:
object HBaseWrite {
def main(args: Array[String]) {
val sparkConf = new SparkConf().setAppName("HBaseWrite").setMaster("local").set("spark.driver.allowMultipleContexts","true").set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
val sc = new SparkContext(sparkConf)
val conf = HBaseConfiguration.create()
val outputTable = "tablename"
System.setProperty("user.name", "hdfs")
System.setProperty("HADOOP_USER_NAME", "hdfs")
conf.set("hbase.master", "localhost:60000")
conf.setInt("timeout", 120000)
conf.set("hbase.zookeeper.quorum", "localhost")
conf.set("zookeeper.znode.parent", "/hbase-unsecure")
conf.setInt("hbase.client.scanner.caching", 10000)
sparkConf.registerKryoClasses(Array(classOf[org.apache.hadoop.hbase.client.Result]))
val jobConfig: JobConf = new JobConf(conf,this.getClass)
jobConfig.setOutputFormat(classOf[TableOutputFormat])
jobConfig.set(TableOutputFormat.OUTPUT_TABLE,outputTable)
val x = 12
val y = 15
val z = 25
var newarray = Array(x,y,z)
val newrddtohbase = sc.parallelize(newarray)
val convertFunc = convert _
new PairRDDFunctions(newrddtohbase.map(convertFunc)).saveAsHadoopDataset(jobConfig)
sc.stop()
}
def convert(a:Int) : Tuple2[ImmutableBytesWritable,Put] = {
val p = new Put(Bytes.toBytes(a))
p.add(Bytes.toBytes("columnfamily"),
Bytes.toBytes("col_1"), Bytes.toBytes(a))
new Tuple2[ImmutableBytesWritable,Put](new ImmutableBytesWritable(a.toString.getBytes()), p);
}
}
注意:代码没有经过测试,但应该可以工作!例如,下面的方法将Int作为参数并返回Double
var toDouble: (Int) => Double = a => {
a.toDouble
}
您可以使用toDouble(2)
并返回2.0
与将方法转换为函数文本的方法相同,如下所示
val convert: (Int) => Tuple2[ImmutableBytesWritable,Put] = a => {
val p = new Put(Bytes.toBytes(a))
p.add(Bytes.toBytes("columnfamily"),
Bytes.toBytes("col_1"), Bytes.toBytes(a))
new Tuple2[ImmutableBytesWritable,Put](new ImmutableBytesWritable(a.toString.getBytes()), p);
}
感谢您的回复,但错误仍然是一样的。您可以将错误Stackk Toorg.apache.spark.sparkeException:Task not serializable at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:166)粘贴到org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:158)上吗org.apache.spark.SparkContext.clean(SparkContext.scala:1446)org.apache.spark.rdd.rdd.map(rdd.scala:286)我无法添加整个错误,因为它说问题中有很多代码,并拒绝了编辑提交。@RaviRanjan:将其作为要点添加,然后在此处提供链接!将您的
convert
方法作为function literal
insidemap
方法传递给map
方法,这就解决了问题。我想知道为什么会有人否决该方法s、 它为我消除了这个错误。
val convert: (Int) => Tuple2[ImmutableBytesWritable,Put] = a => {
val p = new Put(Bytes.toBytes(a))
p.add(Bytes.toBytes("columnfamily"),
Bytes.toBytes("col_1"), Bytes.toBytes(a))
new Tuple2[ImmutableBytesWritable,Put](new ImmutableBytesWritable(a.toString.getBytes()), p);
}