Scala: How to write an RDD[List[(ImmutableBytesWritable, Put)]] to HBase using saveAsNewAPIHadoopDataset


I know that we can use saveAsNewAPIHadoopDataset with an RDD[(ImmutableBytesWritable, Put)] to write to an HBase table with Spark.
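
For reference, the single-table pattern I mean looks roughly like this (a minimal sketch; the table name, column family and sample rows are placeholders, and the HBase 0.98-style Put.add API from my code below is assumed):

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkConf, SparkContext}

object SingleTableWriteSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SingleTableWrite").setMaster("local[2]"))

    // TableOutputFormat writes every Put to the one table named in the configuration.
    val conf = HBaseConfiguration.create()
    conf.set(TableOutputFormat.OUTPUT_TABLE, "posts")
    val job = Job.getInstance(conf, "SingleTableWrite")
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

    // saveAsNewAPIHadoopDataset expects one (ImmutableBytesWritable, Put) pair per row.
    val puts = sc.parallelize(Seq("row1", "row2")).map { rowKey =>
      val put = new Put(Bytes.toBytes(rowKey))
      put.add(Bytes.toBytes("f"), Bytes.toBytes("dStatus"), Bytes.toBytes("1"))
      (new ImmutableBytesWritable(Bytes.toBytes(rowKey)), put)
    }
    puts.saveAsNewAPIHadoopDataset(job.getConfiguration)
  }
}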

But what I have is an RDD[List[(ImmutableBytesWritable, Put)]], and I want to write it to two different HBase tables. How can I do that?

Here is the code:

package com.scryAnalytics.FeatureExtractionController

import com.scryAnalytics.FeatureExtractionController.DAO.{DocumentEntitiesDAO, NLPEntitiesDAO, SegmentFeaturesDAO}
import com.scryAnalytics.NLPGeneric.{GateGenericNLP, NLPEntities}
import com.sun.xml.bind.v2.TODO
import com.vocp.ner.main.GateNERImpl
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{HBaseAdmin, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{MultiTableOutputFormat, TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.mapreduce.Job
import com.scryAnalytics.FeatureExtraction.SegmentsFeatureExtraction
import com.scryAnalytics.FeatureExtraction.DAO.VOCPEntities

import scala.collection.JavaConversions._
import gate.FeatureMap
import java.util.Map.Entry

import scala.collection.JavaConversions
import scala.util.control.Breaks.break
import scala.util.control.ControlThrowable

/**
 * Created by sahil on 1/12/16.
 */

object Main {
  def main(args: Array[String]): Unit = {
    val inputTableName = "posts"
    val outputTableName = "drugSegmentNew1"
    val pluginHome = "/home/sahil/Voice-of-Cancer-Patients/VOCP Modules/bin/plugins"
val sc = new SparkContext(new SparkConf().setAppName("HBaseRead").setMaster("local[4]"))
val conf = HBaseConfiguration.create()
conf.set(HConstants.ZOOKEEPER_QUORUM, "localhost")
conf.set(TableInputFormat.INPUT_TABLE, inputTableName)

val admin = new HBaseAdmin(conf)
if (!admin.isTableAvailable(inputTableName)) {
  val tableDesc = new HTableDescriptor(TableName.valueOf(inputTableName))
  admin.createTable(tableDesc)
}
val job: Job = Job.getInstance(conf, "FeatureExtractionJob")
job.setOutputFormatClass(classOf[MultiTableOutputFormat])
val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
  classOf[ImmutableBytesWritable], classOf[Result])
val resultRDD = hBaseRDD.map(x => x._2)

// TODO: Add filters

val entity: VOCPEntities = VOCPEntities.DRUG
val nlpRDD = resultRDD.mapPartitions { iter =>
  val nlpEntities: NLPEntitiesDAO = new NLPEntitiesDAO
  iter.map {
    result =>
      val message = Bytes.toString(result.getValue(Bytes.toBytes("p"), Bytes.toBytes("message")))
      val row_key = Bytes.toString(result.getRow)
      nlpEntities.setToken(Utility.jsonToAnnotations(Bytes.toString(
        result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("token")))))
      nlpEntities.setSpaceToken(Utility.jsonToAnnotations(Bytes.toString(
        result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("spaceToken")))))
      nlpEntities.setSentence(Utility.jsonToAnnotations(Bytes.toString(
        result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("sentence")))))
      nlpEntities.setVG(Utility.jsonToAnnotations(Bytes.toString(
        result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("verbGroup")))))
      nlpEntities.setSplit(Utility.jsonToAnnotations(Bytes.toString(
        result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("split")))))
      nlpEntities.setNounChunk(Utility.jsonToAnnotations(Bytes.toString(
        result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("nounChunk")))))

      nlpEntities.setDrugs(Utility.jsonToAnnotations(Bytes.toString(
        result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("drug")))))
      nlpEntities.setRegimen(Utility.jsonToAnnotations(Bytes.toString(
        result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("regimen")))))
      nlpEntities.setSideEffects(Utility.jsonToAnnotations(Bytes.toString(
        result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("sideEffect")))))
      nlpEntities.setALT_DRUG(Utility.jsonToAnnotations(Bytes.toString(
        result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("altDrug")))))
      nlpEntities.setALT_THERAPY(Utility.jsonToAnnotations(Bytes.toString(
        result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("altTherapy")))))
      (row_key, message, nlpEntities)
  }
}
val featureExtractionOld: SegmentsFeatureExtraction = new SegmentsFeatureExtraction(
  pluginHome, entity)
val outputRDD = nlpRDD.mapPartitions { iter =>
  val featureExtraction: SegmentsFeatureExtraction = new SegmentsFeatureExtraction(
    pluginHome, entity)
  iter.map { x =>
    val featuresJson = featureExtraction.generateFeatures(x._2, Utility.objectToJson(x._3))
    val segmentFeatures: SegmentFeaturesDAO = Utility.jsonToSegmentFeatures(featuresJson)
    val documentEntities: DocumentEntitiesDAO = new DocumentEntitiesDAO
    documentEntities.setSystemId(x._1)
    documentEntities.setToken(x._3.getToken)
    documentEntities.setSpaceToken(x._3.getSpaceToken)
    documentEntities.setSentence(x._3.getSentence)
    documentEntities.setVG(x._3.getVG)
    documentEntities.setNounChunk(x._3.getNounChunk)
    documentEntities.setSplit(x._3.getSplit)
    documentEntities.setDRUG(x._3.getDrugs)
    documentEntities.setSE(x._3.getSideEffects)
    documentEntities.setREG(x._3.getRegimen)
    documentEntities.setALT_DRUG(x._3.getALT_DRUG)
    documentEntities.setALT_THERAPY(x._3.getALT_THERAPY)
    documentEntities.setSegment(segmentFeatures.getSegment)
    documentEntities.setSegmentClass(segmentFeatures.getSegmentClass)
    documentEntities.setSegmentInstance(segmentFeatures.getSegmentInstance)
    (x._1, documentEntities)
  }
}
val newRDD = outputRDD.map { k => convertToPut(k) }
newRDD.saveAsNewAPIHadoopDataset(job.getConfiguration())
  }
      def convertToPut(NlpWithRowKey: (String, DocumentEntitiesDAO)):   List[(ImmutableBytesWritable, Put)] = {
    val rowkey = NlpWithRowKey._1
val documentEntities = NlpWithRowKey._2
var returnList: List[(ImmutableBytesWritable, Put)] = List()
val segmentInstances = documentEntities.getSegmentInstance
val segments = documentEntities.getSegment
if(segments != null) {
  var count = 0
  for(segment <- segmentInstances) {
    val  keyString: String = documentEntities.getSystemId + "#" + Integer.toString(count)
    count = count + 1
    val outputKey: ImmutableBytesWritable = new ImmutableBytesWritable(keyString.getBytes())
    val put = new Put(outputKey.get())

    val features: FeatureMap = segment.getFeatures
    val it: Iterator[Entry[Object, Object]] = features.entrySet.iterator()
    var sideEffect_offset = "NULL"
    var entity_offset = "NULL"
      while(it.hasNext) {
          val pair = it.next()
        if(pair.getKey.equals("sideEffect-offset")) {
          sideEffect_offset = pair.getValue().toString()
        }
        else if(pair.getKey.equals("drug-offset")) {
          entity_offset = pair.getValue().toString()
        }
        else if(pair.getKey().equals("drug") ||  pair.getKey().equals("sideEffect")){
          put.add(Bytes.toBytes("seg"), Bytes.toBytes(pair.getKey.toString), Bytes
            .toBytes(pair.getValue().toString))
        }
        else {
          put.add(Bytes.toBytes("segFeatures"), Bytes.toBytes(pair.getKey.toString), Bytes
            .toBytes(pair.getValue().toString))
        }
      }
    put.add(Bytes.toBytes("seg"), Bytes.toBytes("RelationId"),
        Bytes.toBytes(documentEntities.getSystemId() + "-" + entity_offset + "-" + sideEffect_offset))
    put.add(Bytes.toBytes("segInst"),Bytes.toBytes("id"), Bytes.toBytes(segment.getId()))
    put.add(Bytes.toBytes("segInst"), Bytes.toBytes("type"), Bytes.toBytes(segment.getType()))
    put.add(Bytes.toBytes("segInst"), Bytes.toBytes("startNodeId"), Bytes.toBytes(
      segment.getStartNode().getId()))
    put.add(Bytes.toBytes("segInst"), Bytes.toBytes("startNodeOffset"),
      Bytes.toBytes(segment.getStartNode().getOffset()))
    put.add(Bytes.toBytes("segInst"),Bytes.toBytes("endNodeId"),
      Bytes.toBytes(segment.getEndNode().getId()))
    put.add(Bytes.toBytes("segInst"), Bytes.toBytes("endNodeOffset"),
      Bytes.toBytes(segment.getEndNode().getOffset()))

    put.add(Bytes.toBytes("seg"),Bytes.toBytes("system_id"),
      Bytes.toBytes(documentEntities.getSystemId()))
    put.add(Bytes.toBytes("seg"), Bytes.toBytes("segmentText"),
      Bytes.toBytes(segment.getAnnotatedText()))

    for(segmentClassAnnots <- documentEntities.getSegmentClass) {
      try {
        if (segment.getId().equals(segmentClassAnnots.getFeatures().get("instance-id"))) {
          put.add(Bytes.toBytes("segClass"), Bytes.toBytes("id"),
            Bytes.toBytes(segmentClassAnnots.getId()))
          put.add(Bytes.toBytes("segClass"), Bytes.toBytes("type"),
            Bytes.toBytes(segmentClassAnnots.getType()))
          put.add(Bytes.toBytes("segClass"), Bytes.toBytes("startNodeId"), Bytes
            .toBytes(segmentClassAnnots.getStartNode()
              .getId()))
          put.add(Bytes.toBytes("segClass"), Bytes.toBytes("startNodeOffset"), Bytes
            .toBytes(segmentClassAnnots.getStartNode()
              .getOffset()))
          put.add(Bytes.toBytes("segClass"), Bytes.toBytes("endNodeId"), Bytes
            .toBytes(segmentClassAnnots.getEndNode()
              .getId()))
          put.add(Bytes.toBytes("segClass"), Bytes.toBytes("endNodeOffset"), Bytes
            .toBytes(segmentClassAnnots.getEndNode()
              .getOffset()))
          break
        }
      } catch {
        case t: Throwable => t.printStackTrace
      }
      returnList = returnList:+((new ImmutableBytesWritable(Bytes.toBytes("drugSegmentNew1")), put))
    }
  }
  }
val PUT = new Put(Bytes.toBytes(rowkey))
PUT.add(Bytes.toBytes("f"), Bytes.toBytes("dStatus"), Bytes.toBytes("1"))
returnList = returnList:+((new ImmutableBytesWritable(Bytes.toBytes("posts")), PUT))
(returnList)
  }
}
Since convertToPut returns a List[(ImmutableBytesWritable, Put)], mapping over outputRDD gives you an RDD[List[(ImmutableBytesWritable, Put)]], which saveAsNewAPIHadoopDataset cannot consume. Flatten the lists with flatMap instead of map, so that you end up with an RDD[(ImmutableBytesWritable, Put)]. Change

val newRDD = outputRDD.map { k => convertToPut(k) }

to

val newRDD = outputRDD.flatMap { k => convertToPut(k) }

Because each key carries the name of the target table and the job already sets MultiTableOutputFormat as its output format, saveAsNewAPIHadoopDataset(job.getConfiguration()) will then route every Put to the right table (posts or drugSegmentNew1).
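
For completeness, here is a minimal self-contained sketch of the two-table write (the table names, column families and the pre-1.0 Put.add calls mirror the question; the sample row keys, the local master and the assumption that hbase-site.xml supplies the ZooKeeper quorum are mine):

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.MultiTableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkConf, SparkContext}

object MultiTableWriteSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("MultiTableWrite").setMaster("local[2]"))

    // MultiTableOutputFormat routes each Put to the table named by its ImmutableBytesWritable key.
    val job = Job.getInstance(HBaseConfiguration.create(), "MultiTableWrite")
    job.setOutputFormatClass(classOf[MultiTableOutputFormat])

    // One input record fans out to Puts for two different tables.
    def toPuts(rowKey: String): List[(ImmutableBytesWritable, Put)] = {
      val segPut = new Put(Bytes.toBytes(rowKey + "#0"))
      segPut.add(Bytes.toBytes("seg"), Bytes.toBytes("system_id"), Bytes.toBytes(rowKey))
      val postPut = new Put(Bytes.toBytes(rowKey))
      postPut.add(Bytes.toBytes("f"), Bytes.toBytes("dStatus"), Bytes.toBytes("1"))
      List(
        (new ImmutableBytesWritable(Bytes.toBytes("drugSegmentNew1")), segPut),
        (new ImmutableBytesWritable(Bytes.toBytes("posts")), postPut)
      )
    }

    // flatMap flattens RDD[List[(ImmutableBytesWritable, Put)]] into RDD[(ImmutableBytesWritable, Put)].
    val pairs = sc.parallelize(Seq("row1", "row2")).flatMap(toPuts)
    pairs.saveAsNewAPIHadoopDataset(job.getConfiguration)
  }
}

The only thing MultiTableOutputFormat inspects when routing is the ImmutableBytesWritable key, so any number of target tables can be mixed in the same RDD.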