Apache Spark:在 Spark 和 Hive shell 中执行相同的操作,效果却不同,为什么?
此代码从 Spark 插入数据。(标签:apache-spark、hadoop、hive、orc)代码如下: String warehouseLocation = new File("spark-warehouse").getAbsolutePath(); SparkSession sparkSession = SparkSession.builder() .appName(appName) .config("spark.sql.warehouse.dir",
// Build a Hive-enabled SparkSession. Dynamic partitioning is switched on so the
// INSERT further down can derive the year/mounth/day partition values from the data.
String warehouseLocation = new File("spark-warehouse").getAbsolutePath();
SparkSession sparkSession = SparkSession.builder()
.appName(appName)
// NOTE(review): this points Spark at a local "spark-warehouse" directory, which is
// why tables created here land under a different path than tables created from the
// Hive shell (the Hive shell uses the metastore's managed-warehouse location).
.config("spark.sql.warehouse.dir", warehouseLocation)
.config("spark.sql.catalogImplementation","hive")
.enableHiveSupport()
// Allow INSERT ... PARTITION(year,mounth,day) without static partition values.
.config("hive.exec.dynamic.partition", "true")
.config("hive.exec.dynamic.partition.mode", "nonstrict")
.getOrCreate();
// Streaming context with micro-batches of `duration` seconds on the same SparkContext.
JavaStreamingContext jssc = new JavaStreamingContext(new JavaSparkContext(sparkSession.sparkContext()),
Durations.seconds(duration));
SQLContext sqlContext = sparkSession.sqlContext();
// ORC-backed table partitioned by year/mounth/day.
// NOTE(review): "mounth" is a typo for "month", but it is used consistently in the
// DDL and the INSERT below, so renaming it here would break the existing schema.
sqlContext.sql("CREATE TABLE IF NOT EXISTS " + tableName + " (value1 STRING, value2 STRING, value3 STRING, " +
"value4 STRING, value5 STRING, value6 STRING, value7 STRING) PARTITIONED BY (year STRING, mounth STRING, day STRING)" +
" STORED AS ORC");
// NOTE(review): the hive.merge.* options below configure Hive's own Tez/MapReduce
// merge jobs; Spark SQL does not run those jobs, so these SETs are presumably
// no-ops here — verify. One file per task (the .c000 files) is expected from Spark.
sqlContext.sql("SET hive.merge.tezfiles=true");
sqlContext.sql("SET hive.merge.mapfiles=true");
sqlContext.sql( "SET hive.merge.size.per.task=256000000");
sqlContext.sql ( "SET hive.merge.smallfiles.avgsize=16000000");
sqlContext.sql("SET hive.merge.orcfile.stripe.level=true;");
// Kafka consumer configuration: manual offset management (auto-commit disabled),
// starting from the latest offsets when no committed offset exists.
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put("bootstrap.servers", broker);
kafkaParams.put("key.deserializer", StringDeserializer.class);
kafkaParams.put("value.deserializer", StringDeserializer.class);
kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
kafkaParams.put("auto.offset.reset", "latest");
kafkaParams.put("enable.auto.commit", false);
Collection<String> topicsSet = Collections.singletonList(topic);
// Create direct kafka stream with brokers and topics
JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(
jssc,
LocationStrategies.PreferConsistent(),
ConsumerStrategies.Subscribe(topicsSet, kafkaParams));
// Extract the message payloads; each record value is one raw input line.
JavaDStream<String> lines = messages.map(ConsumerRecord::value);
// For every non-empty micro-batch: parse each line into a Data bean, register the
// batch as a temp view, and append it into the partitioned Hive table.
lines.foreachRDD(new VoidFunction<JavaRDD<String>>() {
@Override
public void call(JavaRDD<String> rdd) {
if (!rdd.isEmpty()) {
JavaRDD<Data> dataRDD = rdd.map(new Function<String, Data>() {
@Override
public Data call(String msg) {
try {
return Data.insertDataByString(msg);
} catch (ParseException e) {
// NOTE(review): unparseable messages become null elements in the RDD;
// downstream createDataFrame may fail or produce null rows — confirm.
e.printStackTrace();
}
return null;
}
});
// Reflect the Data bean's fields into a DataFrame schema.
Dataset<Row> dataRow = sqlContext.createDataFrame(dataRDD, Data.class);
dataRow.createOrReplaceTempView("temp_table");
// Dynamic-partition append: the trailing year/mounth/day columns select the partition.
sqlContext.sql("insert into " + tableName + " partition(year,mounth,day) select value1, value2, " +
"value3, value4, value5, value6, value7, year, mounth, day from temp_table");
//dataRow.write().format("orc").partitionBy("year", "day").mode(SaveMode.Append).insertInto(tableName);
//sqlContext.sql("ALTER TABLE " + tableName + " PARTITION(year='2020', mounth='4', day='26') " + " CONCATENATE");
}
}
String warehouseLocation=新文件(“spark warehouse”).getAbsolutePath();
SparkSession SparkSession=SparkSession.builder()
.appName(appName)
.config(“spark.sql.warehouse.dir”,warehouseLocation)
.config(“spark.sql.catalogImplementation”、“配置单元”)
.enableHiveSupport()
.config(“hive.exec.dynamic.partition”,“true”)
.config(“hive.exec.dynamic.partition.mode”,“非严格”)
.getOrCreate();
JavaStreamingContext jssc=新的JavaStreamingContext(新的JavaSparkContext(sparkSession.sparkContext()),
持续时间。秒(持续时间);
SQLContext SQLContext=sparkSession.SQLContext();
sqlContext.sql(“如果不存在则创建表”+tableName+”(value1字符串、value2字符串、value3字符串、+
value4字符串、value5字符串、value6字符串、value7字符串)按(年份字符串、月份字符串、日期字符串)分区+
“存储为ORC”);
sqlContext.sql(“SET-hive.merge.tezfiles=true”);
sqlContext.sql(“SET-hive.merge.mapfiles=true”);
sql(“SET hive.merge.size.per.task=256000000”);
sqlContext.sql(“SET-hive.merge.smallfiles.avgsize=16000000”);
sqlContext.sql(“SET hive.merge.orcfile.stripe.level=true;”;
Map kafkaParams=新HashMap();
kafkaParams.put(“bootstrap.servers”,broker);
kafkaParams.put(“key.deserializer”,StringDeserializer.class);
kafkaParams.put(“value.deserializer”,StringDeserializer.class);
kafkaParams.put(“group.id”,“为每个流使用单独的组id”);
kafkaParams.put(“自动偏移重置”、“最新”);
kafkaParams.put(“enable.auto.commit”,false);
Collection topicset=Collections.singletonList(主题);
//创建带有代理和主题的直接卡夫卡流
JavaInputDStream消息=KafkaUtils.createDirectStream(
jssc,
LocationStrategies.PreferConsistent(),
ConsumerStrategies.Subscribe(TopicSet,kafkaParams));
//把行数出来,分成几个字,数一数,然后打印出来
JavadStreamLines=messages.map(ConsumerRecord::value);
lines.foreachRDD(新的VoidFunction(){
@凌驾
公共无效调用(JavaRDD){
如果(!rdd.isEmpty()){
javardddatardd=rdd.map(新函数(){
@凌驾
公共数据调用(字符串消息){
试一试{
返回Data.insertdataystring(msg);
}捕获(解析异常){
e、 printStackTrace();
}
返回null;
}
});
Dataset dataRow=sqlContext.createDataFrame(dataRDD,Data.class);
创建或替换临时视图(“临时表”);
sql(“插入“+tableName+”分区(年、月、日)选择值1、值2,”+
“价值3、价值4、价值5、价值6、价值7,从临时表开始的年份、月份、日期”);
//dataRow.write().format(“orc”).partitionBy(“年”、“日”).mode(SaveMode.Append).insertInto(tableName);
//sql(“altertable”+tableName+”分区(year='2020',mounth='4',day='26')”+“CONCATENATE”);
}
}
执行此代码时,表会创建在以下位置:
hdfs://master.vmware.local:8020/apps/spark/warehouse/tablename/year=2020/mounth=4/day=26
并且在 day=26 目录下会出现多个 .c000 文件
如果改为从 Hive shell 创建表,则表位于其他位置:
hdfs://master.vmware.local:8020/warehouse/tablespace/managed/hive/table_name/year=2020/mounth=4/day=26/
在day=26中有文件:_orc_acid_version和_bucket_000000
我的目标是使用 Spark 创建 ORC 文件,但我认为使用 Spark 时数据是按 Hive 的默认方式保存的。如何将来自 Spark(配合 Hive)的数据保存为 ORC 文件?(评论:我建议用 NiFi 而不是 Spark 把 Kafka 数据写入 HDFS/Hive。)