
Hive throws an exception when running HQL
Tags: hive, query-optimization, hiveql, apache-tez


I want to run a select statement and put its result into a table, and I am sure the problem is not a syntax error.

HQL:

INSERT overwrite table datalake_rci.MID_DealerVehicleOutputValue
--MIDDealerVehicleOutputValueID int,
select
 b.MIDDealerVehicleOutputValueID
,b.DealerID --string,
,b.CHASSIS --string,
,b.DIMDealerID --bigint,
,b.DIMVehicleID --bigint,

,case when a.DIMDealerID is not null
      then 1 else b.DIMOutputValueID
 end DIMOutputValueID --int

,b.OutputValueName --string,
,b.OutputValueName_CN --string,
,b.OutputValueCode --varchar(50),
,b.OutputValueOrder --int
from datalake_rci.MID_DealerVehicleOutputValue b
left outer join
(
    --rows whose descending rank within the dealer falls under the dealer's 'Low' threshold
    select w.low, w.DIMDealerID, w.DIMVehicleID, w.OutputValueOrder, w.row_num
    from (
        select z.low, z.DIMDealerID, z.DIMVehicleID, z.OutputValueOrder,
               row_number() over(partition by z.DIMDealerID order by z.OutputValueOrder desc) row_num
        from
        (
            --attach each row to its dealer-level threshold t1.low
            select t1.low, y.DIMDealerID, y.DIMVehicleID, y.OutputValueOrder
            from
            (
                --per-dealer threshold: row count multiplied by the 'Low' rate
                select b.DIMDealerID, b.cnt*l.Rate low
                from
                    (select DIMDealerID, count(*) cnt
                        from datalake_rci.MID_DealerVehicleOutputValue
                        group by DIMDealerID) b
                    cross join
                        (select Rate from datalake_rci.DIM_OutputValue where OutputValueCode = 'Low') l
            ) t1
            inner join
            (select DIMDealerID, DIMVehicleID, OutputValueOrder
                from datalake_rci.MID_DealerVehicleOutputValue) y
            on t1.DIMDealerID = y.DIMDealerID
        ) z
    ) w
    where w.row_num <= w.low
) a on b.DIMDealerID = a.DIMDealerID;

I tried twice and got the same result both times. By the way, the MID_DealerVehicleOutputValue table contains 336,258,079 rows in total. Could that be what causes the error? Some similar statements ran successfully before this, but they processed far fewer rows.

You should change the value of mapreduce.reduce.shuffle.memory.limit.percent.

This parameter sets the maximum percentage of the reducer's in-memory shuffle buffer that a single shuffle segment (the output copied from a single map task) may occupy. A segment larger than this limit is not copied into the memory buffer but is written directly to the reducer's disk.

Try lowering the value of this parameter and run the query again.

Also make sure that the value of mapreduce.reduce.shuffle.merge.percent is lower than that of mapreduce.reduce.shuffle.memory.limit.percent.
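
For example, from the Hive session (a sketch only; the values below are illustrative assumptions, not recommendations, and the right numbers depend on your cluster and data):

set mapreduce.reduce.shuffle.memory.limit.percent=0.15;  --illustrative: below the 0.25 default, so large segments spill straight to disk
set mapreduce.reduce.shuffle.merge.percent=0.10;         --illustrative: kept below memory.limit.percent, per the advice above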
Before doing any memory tuning, try rewriting the query in a better way; the extra join is doing unnecessary work. To begin with, you can simplify it a lot by removing the unnecessary inner join and the subquery that computes count(*): use the analytic count(*) over(partition by DIMDealerID) instead.

INSERT overwrite table datalake_rci.MID_DealerVehicleOutputValue
--MIDDealerVehicleOutputValueID int,
select
b.MIDDealerVehicleOutputValueID
,b.DealerID --string,
,b.CHASSIS --string,
,b.DIMDealerID --bigint,
,b.DIMVehicleID --bigint,

,case when a.DIMDealerID is not null
      then 1 else b.DIMOutputValueID
end DIMOutputValueID  --int     

,b.OutputValueName --string,
,b.OutputValueName_CN --string,
,b.OutputValueCode --varchar(50),
,b.OutputValueOrder --int
from datalake_rci.MID_DealerVehicleOutputValue b
left outer join
(
    select w.low,w.DIMDealerID, w.DIMVehicleID,w.OutputValueOrder,w.row_num from 
    (
        select z.low,z.DIMDealerID, z.DIMVehicleID, z.OutputValueOrder,
               row_number() over(partition by z.DIMDealerID order by z.OutputValueOrder desc) row_num
        from
        (
           select DIMDealerID, DIMVehicleID, OutputValueOrder,
                  count(*) over(partition by DIMDealerID) * l.Rate low  --analytic count replaces the GROUP BY subquery and the join back on DIMDealerID
           from datalake_rci.MID_DealerVehicleOutputValue
           cross join
                  (select Rate from datalake_rci.DIM_OutputValue where OutputValueCode = 'Low') l

        ) z
    ) w
    where w.row_num <= w.low
) a on b.DIMDealerID = a.DIMDealerID; 
Of course, my query may contain some bugs and should be tested carefully, but I hope you get the main idea: a self-join can almost always be eliminated. In the end your query will read each table only once, and many other heavy steps will go away. I would expect you to lose at least two reducer vertices and two mapper vertices.
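
One way to verify that is to prefix each version of the statement with explain and compare the number of mapper and reducer vertices in the two Tez plans. A minimal sketch against the same table (the cut-down query here is just an illustration):

explain
select DIMDealerID,
       count(*) over(partition by DIMDealerID) cnt
from datalake_rci.MID_DealerVehicleOutputValue;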

I also suggest increasing mapper parallelism. Tune these settings, lowering the figures until more mappers are running:

--tune mapper parallelism
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
set tez.grouping.max-size=67108864;
set tez.grouping.min-size=32000000;
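
As a rough illustration (assuming, purely hypothetically, around 30 GB of input for the 336M-row table): with tez.grouping.max-size=67108864 (64 MB) you would get on the order of 30 * 1024 / 64 ≈ 480 mappers, and halving max-size roughly doubles that count.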

From the comments:

"What Hive settings do you have when running these queries? If your execution engine is tez, change it to mr and give it a try."

"@Vijiy, better to study how to tune Tez instead. Tez performs much better than MR, and MR is deprecated."

"Thank you very much: done your way, the whole job takes less than 500 seconds and works well. Thanks again."
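For reference, switching the execution engine is a one-line session setting (a sketch; as the follow-up comment says, tuning Tez is normally the better route):

set hive.execution.engine=mr;
--and back:
set hive.execution.engine=tez;

The final, working version of the query: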
INSERT overwrite table datalake_rci.MID_DealerVehicleOutputValue
--MIDDealerVehicleOutputValueID int,
select
 s.MIDDealerVehicleOutputValueID
,s.DealerID --string,
,s.CHASSIS --string,
,s.DIMDealerID --bigint,
,s.DIMVehicleID --bigint,

,case when s.row_num <= s.low        --you do not need join to calculate this
      then 1 else s.DIMOutputValueID
  end DIMOutputValueID  --int     

,s.OutputValueName --string,
,s.OutputValueName_CN --string,
,s.OutputValueCode --varchar(50),
,s.OutputValueOrder --int
from 
    (
        select s.low, s.DIMDealerID, s.DIMVehicleID, s.OutputValueOrder, s.MIDDealerVehicleOutputValueID, s.DealerID, s.CHASSIS, s.DIMOutputValueID, s.OutputValueName, s.OutputValueName_CN, s.OutputValueCode, --duplicate OutputValueOrder removed; DIMOutputValueID and the trailing comma added so the query compiles
               row_number() over(partition by s.DIMDealerID order by s.OutputValueOrder desc) row_num
        from
        (
           select s.DIMDealerID, s.DIMVehicleID, s.OutputValueOrder, s.MIDDealerVehicleOutputValueID, s.DealerID, s.CHASSIS, s.DIMOutputValueID, s.OutputValueName, s.OutputValueName_CN, s.OutputValueCode, --same fixes as above
                    count(*) over(partition by DIMDealerID) * l.Rate low
                from datalake_rci.MID_DealerVehicleOutputValue s
                cross join
                        (select Rate from datalake_rci.DIM_OutputValue where OutputValueCode = 'Low') l

        ) s
    ) s; --one or two subqueries also can be removed