Hive:在 Hive 中运行 HQL 时抛出异常
我想运行一个select语句并将结果放入一个表中,我确信这不是语法错误 HQL:Hive 在配置单元中运行HQL时抛出异常,hive,query-optimization,hiveql,apache-tez,Hive,Query Optimization,Hiveql,Apache Tez,我想运行一个select语句并将结果放入一个表中,我确信这不是语法错误 HQL: INSERT overwrite table datalake_rci.MID_DealerVehicleOutputValue --MIDDealerVehicleOutputValueID int, select b.MIDDealerVehicleOutputValueID ,b.DealerID --string, ,b.CHASSIS --string, ,b.DIMDealerID --bigint,
-- Overwrites datalake_rci.MID_DealerVehicleOutputValue with its own rows.
-- All columns are copied through except DIMOutputValueID, which is set to 1
-- when the dealer has at least one row in the flagged subquery.
-- The join key is DIMDealerID only, so a target row is emitted once for
-- every flagged row of the same dealer.
INSERT OVERWRITE TABLE datalake_rci.MID_DealerVehicleOutputValue
SELECT
    tgt.MIDDealerVehicleOutputValueID,   -- int
    tgt.DealerID,                        -- string
    tgt.CHASSIS,                         -- string
    tgt.DIMDealerID,                     -- bigint
    tgt.DIMVehicleID,                    -- bigint
    CASE
        WHEN flagged.DIMDealerID IS NOT NULL THEN 1
        ELSE tgt.DIMOutputValueID
    END AS DIMOutputValueID,             -- int
    tgt.OutputValueName,                 -- string
    tgt.OutputValueName_CN,              -- string
    tgt.OutputValueCode,                 -- varchar(50)
    tgt.OutputValueOrder                 -- int
FROM datalake_rci.MID_DealerVehicleOutputValue tgt
LEFT JOIN (
    -- Keep the rows whose per-dealer rank does not exceed the dealer threshold.
    SELECT
        ranked.low,
        ranked.DIMDealerID,
        ranked.DIMVehicleID,
        ranked.OutputValueOrder,
        ranked.row_num
    FROM (
        -- Rank the rows of each dealer by OutputValueOrder, highest first.
        SELECT
            joined.low,
            joined.DIMDealerID,
            joined.DIMVehicleID,
            joined.OutputValueOrder,
            ROW_NUMBER() OVER (
                PARTITION BY joined.DIMDealerID
                ORDER BY joined.OutputValueOrder DESC
            ) AS row_num
        FROM (
            -- Attach the dealer threshold to every row of that dealer.
            SELECT
                thresholds.low,
                vehicles.DIMDealerID,
                vehicles.DIMVehicleID,
                vehicles.OutputValueOrder
            FROM (
                -- threshold = dealer row count * Rate read from DIM_OutputValue
                -- where OutputValueCode is Low
                SELECT
                    dealer_counts.DIMDealerID,
                    dealer_counts.cnt * low_rate.Rate AS low
                FROM (
                    SELECT
                        DIMDealerID,
                        COUNT(*) AS cnt
                    FROM datalake_rci.MID_DealerVehicleOutputValue
                    GROUP BY DIMDealerID
                ) dealer_counts
                CROSS JOIN (
                    SELECT Rate
                    FROM datalake_rci.DIM_OutputValue
                    WHERE OutputValueCode = 'Low'
                ) low_rate
            ) thresholds
            INNER JOIN (
                SELECT
                    DIMDealerID,
                    DIMVehicleID,
                    OutputValueOrder
                FROM datalake_rci.MID_DealerVehicleOutputValue
            ) vehicles
                ON thresholds.DIMDealerID = vehicles.DIMDealerID
        ) joined
    ) ranked
    WHERE ranked.row_num <= ranked.low
) flagged
    ON tgt.DIMDealerID = flagged.DIMDealerID;
我试了两次,但结果都一样。顺便说一下,
MID_DealerVehicleOutputValue
表中总共有 336258079 行。这是导致错误的原因吗?在此之前,其他一些类似的语句都能成功运行,只是它们要处理的行数没有这么多。
您应该调整 mapreduce.reduce.shuffle.memory.limit.percent 的值。
此参数指定单次 shuffle(即从单个 map 任务复制过来的输出)最多可以占用上述内存缓冲区的百分比。大小超过该限制的 shuffle 输出不会被复制到内存缓冲区,而是直接写入 reducer 所在节点的磁盘。
尝试减少此参数的值,然后再次运行查询
还要确保mapreduce.reduce.shuffle.merge.percent
的值低于mapreduce.reduce.shuffle.memory.limit.percent
在进行任何内存调整之前,请先尝试以更好的方式重写查询。多余的联接会带来不必要的工作。首先,您可以删除不必要的内部联接以及用于计算 count(*) 的子查询,从而大大简化查询:改用分析函数 count(*) over(partition by DIMDealerID)
:
-- Overwrites datalake_rci.MID_DealerVehicleOutputValue with its own rows.
-- Variant that replaces the grouped count and its inner join with an
-- analytic COUNT(*) per dealer. The outer left join is still present, so a
-- target row is emitted once for every flagged row of the same dealer.
INSERT OVERWRITE TABLE datalake_rci.MID_DealerVehicleOutputValue
SELECT
    tgt.MIDDealerVehicleOutputValueID,   -- int
    tgt.DealerID,                        -- string
    tgt.CHASSIS,                         -- string
    tgt.DIMDealerID,                     -- bigint
    tgt.DIMVehicleID,                    -- bigint
    CASE
        WHEN flagged.DIMDealerID IS NOT NULL THEN 1
        ELSE tgt.DIMOutputValueID
    END AS DIMOutputValueID,             -- int
    tgt.OutputValueName,                 -- string
    tgt.OutputValueName_CN,              -- string
    tgt.OutputValueCode,                 -- varchar(50)
    tgt.OutputValueOrder                 -- int
FROM datalake_rci.MID_DealerVehicleOutputValue tgt
LEFT JOIN (
    -- Keep the rows whose per-dealer rank does not exceed the dealer threshold.
    SELECT
        ranked.low,
        ranked.DIMDealerID,
        ranked.DIMVehicleID,
        ranked.OutputValueOrder,
        ranked.row_num
    FROM (
        -- Rank the rows of each dealer by OutputValueOrder, highest first.
        SELECT
            counted.low,
            counted.DIMDealerID,
            counted.DIMVehicleID,
            counted.OutputValueOrder,
            ROW_NUMBER() OVER (
                PARTITION BY counted.DIMDealerID
                ORDER BY counted.OutputValueOrder DESC
            ) AS row_num
        FROM (
            -- threshold = per-dealer row count * Rate read from DIM_OutputValue
            -- where OutputValueCode is Low
            SELECT
                src.DIMDealerID,
                src.DIMVehicleID,
                src.OutputValueOrder,
                COUNT(*) OVER (PARTITION BY src.DIMDealerID) * low_rate.Rate AS low
            FROM datalake_rci.MID_DealerVehicleOutputValue src
            CROSS JOIN (
                SELECT Rate
                FROM datalake_rci.DIM_OutputValue
                WHERE OutputValueCode = 'Low'
            ) low_rate
        ) counted
    ) ranked
    WHERE ranked.row_num <= ranked.low
) flagged
    ON tgt.DIMDealerID = flagged.DIMDealerID;
当然,我的查询可能包含一些 bug,需要仔细测试,但希望您已经明白:自联接几乎总是可以消除的。这样改写之后,您的查询对每个表只读取一次,并且可以省去许多其他开销很大的步骤。我预计您至少可以去掉两个 reducer 顶点和两个 mapper 顶点。
我还建议提高 mapper 的并行度。调整以下设置,尝试逐步减小这些数值,直到运行的 mapper 数量增多:
-- Tune mapper parallelism.
-- The surrounding text advises lowering the two tez.grouping sizes until
-- more mappers are launched.
-- NOTE(review): the values are presumably split-group sizes in bytes
-- (67108864 = 64 * 1024 * 1024) - confirm against the Tez documentation
-- for the deployed version.
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
set tez.grouping.max-size=67108864;
set tez.grouping.min-size=32000000;
运行这些查询时,您使用了哪些 Hive 设置?如果执行引擎是 tez,请把它改成 mr 再试一次。
@Vijiy 更好的做法是研究如何调优 Tez。Tez 的性能比 MR 好得多,而 MR 已不再推荐使用。
非常感谢,按照你的方式完成这个任务只需不到 500 秒,效果很好。再次感谢。
-- Overwrites datalake_rci.MID_DealerVehicleOutputValue with its own rows,
-- reading the table once and using window functions instead of a self-join.
-- DIMOutputValueID is set to 1 for rows whose per-dealer rank (ordered by
-- OutputValueOrder descending) does not exceed the dealer threshold, where
-- threshold = per-dealer row count * Rate read from DIM_OutputValue where
-- OutputValueCode is Low. All other columns are copied through unchanged.
--
-- Fixes relative to the earlier draft of this statement:
--   * added the missing commas before ROW_NUMBER() and COUNT(*)
--   * OutputValueOrder was projected twice in each derived table
--   * DIMOutputValueID is now projected by the derived tables, because the
--     outer CASE reads it
--   * every nesting level has its own alias instead of reusing one name
INSERT OVERWRITE TABLE datalake_rci.MID_DealerVehicleOutputValue
SELECT
    ranked.MIDDealerVehicleOutputValueID,   -- int
    ranked.DealerID,                        -- string
    ranked.CHASSIS,                         -- string
    ranked.DIMDealerID,                     -- bigint
    ranked.DIMVehicleID,                    -- bigint
    -- no join is needed to calculate this flag
    CASE
        WHEN ranked.row_num <= ranked.low THEN 1
        ELSE ranked.DIMOutputValueID
    END AS DIMOutputValueID,                -- int
    ranked.OutputValueName,                 -- string
    ranked.OutputValueName_CN,              -- string
    ranked.OutputValueCode,                 -- varchar(50)
    ranked.OutputValueOrder                 -- int
FROM (
    -- Rank the rows of each dealer by OutputValueOrder, highest first.
    SELECT
        counted.low,
        counted.MIDDealerVehicleOutputValueID,
        counted.DealerID,
        counted.CHASSIS,
        counted.DIMDealerID,
        counted.DIMVehicleID,
        counted.DIMOutputValueID,
        counted.OutputValueName,
        counted.OutputValueName_CN,
        counted.OutputValueCode,
        counted.OutputValueOrder,
        ROW_NUMBER() OVER (
            PARTITION BY counted.DIMDealerID
            ORDER BY counted.OutputValueOrder DESC
        ) AS row_num
    FROM (
        -- Attach the dealer threshold to every row of that dealer.
        SELECT
            src.MIDDealerVehicleOutputValueID,
            src.DealerID,
            src.CHASSIS,
            src.DIMDealerID,
            src.DIMVehicleID,
            src.DIMOutputValueID,
            src.OutputValueName,
            src.OutputValueName_CN,
            src.OutputValueCode,
            src.OutputValueOrder,
            COUNT(*) OVER (PARTITION BY src.DIMDealerID) * low_rate.Rate AS low
        FROM datalake_rci.MID_DealerVehicleOutputValue src
        CROSS JOIN (
            SELECT Rate
            FROM datalake_rci.DIM_OutputValue
            WHERE OutputValueCode = 'Low'
        ) low_rate
    ) counted
) ranked;
-- Tune mapper parallelism.
-- The surrounding text advises lowering the two tez.grouping sizes until
-- more mappers are launched.
-- NOTE(review): the values are presumably split-group sizes in bytes
-- (67108864 = 64 * 1024 * 1024) - confirm against the Tez documentation
-- for the deployed version.
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
set tez.grouping.max-size=67108864;
set tez.grouping.min-size=32000000;