
Scala Spark YARN remote submission

Tags: scala, apache-spark, apache-kafka, yarn, cloudera

I'm currently working on a Spark Streaming project. I've only just started, and I'm still new to the Spark/Kafka world. To try out (or see) the results of my program, I currently have to build the project's jar, upload it to the cluster, and then spark-submit, which I don't think is an efficient way to work.

Can I run this program programmatically from my IDE [remotely]? I use Scala IDE. I've looked for some sample code to follow, but still haven't found anything suitable.

My environment:
- Cloudera 5.8.2 [OS RedHat 7.2, Kerberos 5, Spark 2.1, Scala 2.11]
- Windows 7

Follow the steps below to unit test your application:

  • Download winutils for Windows and set the HADOOP_HOME environment variable (see the sketch after the snippet below)
  • Give Spark Streaming the exact Kafka broker URL and topic name
  • Make sure the correct offset management properties are set
  • Use the IntelliJ IDE (Scala IDE works too). Just run the Scala application:

    val kafkaParams = Map(
      "metadata.broker.list" -> "168.172.72.128:9092",
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "smallest",
      "group.id" -> UUID.randomUUID().toString())

    val topicSet = Set("test") // topic name
    val kafkaStream = KafkaUtils
      .createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicSet)
    // Create the BSON data structure and load the data into a MongoDB collection
    kafkaStream.foreachRDD(
      rdd => { /* business logic code */ })
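    For the winutils step above, if setting the HADOOP_HOME environment variable system-wide is inconvenient, Hadoop also honors the hadoop.home.dir system property, which you can set from code before creating the Spark context. A minimal sketch; the C:\winutils path is an assumption (winutils.exe must sit in its bin subdirectory):

    // Hypothetical install location: winutils.exe lives in C:\winutils\bin
    System.setProperty("hadoop.home.dir", "C:\\winutils")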

  • I followed this tutorial

    Below is my code:

    import org.apache.kafka.clients.consumer.ConsumerRecord
    import org.apache.kafka.common.serialization.StringDeserializer
    import org.apache.spark.streaming.kafka010._
    import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
    import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
    
    import scala.collection.mutable.ListBuffer
    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.StreamingContext
    import org.apache.spark.streaming.Seconds
    import org.apache.spark.sql.types.{StringType, StructType, TimestampType}
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.count
    
    object SparkKafkaExample {
    
      def main(args: Array[String]): Unit =
      {
    
      val brokers = "broker1.com:9092,broker2.com:9092," +
        "broker3.com:9092,broker4.com:9092,broker5.com:9092"
      // Create Spark Session
      val spark = SparkSession
        .builder()
        .appName("KafkaSparkDemo")
        .master("local[*]")
        .getOrCreate()
    
      import spark.implicits._
    
      // Create Streaming Context and Kafka Direct Stream with provided settings and 10 seconds batches
      val ssc = new StreamingContext(spark.sparkContext, Seconds(10))
    
      val kafkaParams = Map(
        "bootstrap.servers" -> brokers,
        "key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
        "value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
        "group.id" -> "test",
        "security.protocol" -> "SASL_PLAINTEXT",
        "sasl.kerberos.service.name" -> "kafka",
        "auto.offset.reset" -> "earliest")
    
      val topics = Array("sparkstreaming")
      val stream = KafkaUtils.createDirectStream[String, String](
        ssc,
        PreferConsistent,
        Subscribe[String, String](topics, kafkaParams))
    
      // Define a schema for JSON data
      val schema = new StructType()
        .add("action", StringType)
        .add("timestamp", TimestampType)
    
      // Process batches:
      // Parse JSON and create Data Frame
      // Execute computation on that Data Frame and print result
      stream.foreachRDD { (rdd, time) =>
        val data = rdd.map(record => record.value)
        val json = spark.read.schema(schema).json(data)
        val result = json.groupBy($"action").agg(count("*").alias("count"))
        result.show
      }
    
      ssc.start
      ssc.awaitTermination
    
    }
    }
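    To run this directly from the IDE, the project also needs the matching Spark and Kafka integration artifacts on its classpath. A minimal build.sbt sketch, assuming Spark 2.1.0 / Scala 2.11 to match the cluster above (the exact versions are assumptions; align them with your distribution):

    scalaVersion := "2.11.8"

    libraryDependencies ++= Seq(
      "org.apache.spark" %% "spark-core" % "2.1.0",
      "org.apache.spark" %% "spark-sql" % "2.1.0",
      "org.apache.spark" %% "spark-streaming" % "2.1.0",
      "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.1.0"
    )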
    
    Because my cluster uses Kerberos, I pass this configuration file (kafka_jaas.conf) to my IDE (Eclipse -> in the VM arguments).
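    For reference, the standard JVM property that points the Kafka and ZooKeeper clients at a JAAS file is java.security.auth.login.config, so the VM argument looks something like the line below (the path is an assumption; point it at wherever you saved the file):

    -Djava.security.auth.login.config=C:\path\to\kafka_jaas.conf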

    kafka-jaas.conf contents:

    KafkaClient {
        com.sun.security.auth.module.Krb5LoginModule required
        useKeyTab=true
        keyTab="user.keytab"
        serviceName="kafka"
        principal="user@HOST.COM";
    };
    Client {
       com.sun.security.auth.module.Krb5LoginModule required
       useKeyTab=true
       keyTab="user.keytab"
       storeKey=true
       useTicketCache=false
       serviceName="zookeeper"
       principal="user@HOST.COM";
    };
    