数据聚合和采样

发布时间：2020-12-14 05:05:53 所属栏目：百科来源：网络整理
导读：基础聚合常见的聚合函数 count , sum , avg , max , min set hive.map.aggr = true; // mapper端预聚合，提高性能，但消耗较多内存注意：只能select group by从句的字段，否则报错 select name,gender_age.gender, count ( * ) as row_cnt -- error! from e
基础聚合 
常见的聚合函数 count,sum,avg,max,min
set hive.map.aggr=true;      //mapper端预聚合，提高性能，但消耗较多内存
注意：只能select group by从句的字段，否则报错
select name,gender_age.gender,count(*) as row_cnt  --error!
from employee
group by gender_age.gender;

如何select没有出现在group by中的字段？
    1)使用collect_set函数；
select gender_age.gender,collect_set(gender_age.age)[0] as random_age,count(*) as row_cnt
from employee
group by gender_age.gender;

select中使用多个聚合函数
select gender_age.gender,avg(gender_age.age) as avg_age,count(*) as row_cnt
from employee
group by gender_age.gender;

    2)使用分析函数(不需要 group by)
聚合函数和 case when 一起使用
select sum(case when gender_age.gender = "Male" then gender_age.age else 0 end) /
sum(case when gender_age.gender = "Male" then 1 else null end) as man_age_avg    -- 0 == null
from employee;

聚合函数和coalesce,if连用。 case when和if的区别？
coalesce(a1,a2,...) - Returns the first non-null argument
求age_sum,woman_age_sum
select sum(coalesce(gender_age.age,0)) as age_sum,sum(if(gender_age.gender = "Female",gender_age.age,0)) as woman_age_sum
from employee;

聚合函数不允许嵌套 如：avg(count(*)) error!

聚合函数中使用distinct 
select count(distinct gender_age.gender) as sex_uni_cnt,count(distinct name) as name_uni_cnt
from employee;
注意：count和distinct一起使用时，只会使用一个reducer，降低性能，解决方法：
select count(*) as sex_uni_cnt
from (
select distinct gender_age.gender from employee
) a;
注意：聚合时若遇到含有null值的列，则忽略该行，如：
--创建含有null行的表
create table t as select * from (
select employee_id - 99 as val1,employee_id - 98 as val2 from employee_hr
where employee_id <= 101
union all
select null as val1,2 as val2 from employee_hr
where employee_id = 100
) a;
+---------+---------+--+
| t.val1  | t.val2  |
+---------+---------+--+
| 1       | 2       |
| NULL    | 2       |
| 2       | 3       |
+---------+---------+--+
select sum(val1 + val2) from t;     --第二行会被忽略，解决方法：
select sum(coalesce(val1,0) + val2) from t;

高级聚合
grouping sets 自定义聚合
内层的grouping sets定义了每个union all中的group by要实现的数据。
select name,work_place[0] as main_place,count(employee_id) as emp_id_cnt
from employee_id
group by name,work_place[0] grouping sets((name,work_place[0]));
<==>
select name,work_place[0];

外层的grouping sets定义了union all要实现的数据。
select name,work_place[0] grouping sets((name),(work_place[0]));
<==>
select name,null as main_place,count(employee_id) as emp_id_cnt
from employee_id
group by name
union all
select null as name,count(employee_id) as employee_id_cnt
from employee_id
group by work_place[0];

内外层grouping by混合使用
select name,count(employee_id) as employee_id_cnt
from employee_id
group by name,work_place[0]),name,work_place[0],());

煮鱼：grouping sets的内容为表或记录的别名所指向的列时，可能会出现问题，已修复。如：
select gender_age.gender,count(name) as name_uni_cnt
from employee
group by gender_age.gender,gender_age.age grouping sets((gender_age.gender,gender_age.age));

rollup and cube

rollup 创建n + 1级聚合，n为group by中的分组列
group by a,b,c with rollup <==> grouping set((a,c),(a,b),(a),())
cube  创建2^n级聚合
group by a,c with cube 

grouping_id 表示行的列是否聚合的位向量的10进制形式??????????
select grouping_id,bin(cast(grouping_id as bigint)) as bit_vector,start_date,count(employee_id) emp_id_cont
from employee_hr
group by start_date,name
with cube 
order by start_date;

having - 对group by的结果进行过滤，避免分组以后再使用子查询
将员工按年龄分组，统计:人数<=1的年龄组
select gender_age.age 
from employee_id
group by gender_age.age having count(*) <= 1;
<==>
select a.age  
from (
select gender_age.age as age,count(*) as cnt    --内查询中的列一定要给个别名
from employee_id
group by gender_age.age
) a
where a.cnt <= 1;  --where中不支持UDAF函数，要起个别名

分析函数
分析函数与聚合函数的区别？
聚合函数会分组，然后为每个组生成一个结果；分析函数不对结果集分组
Function(arg1,...,argn) OVER ([PARTITION BY <...>] [ORDER BY <....>] [<window_clause>])
或者 P184
function(arg1,argn) over w as w_n
...
window
w as ([PARTITION BY <...>] [ORDER BY <....>] [<window_clause>])

Function(arg1,argn):
    标准的聚合函数
    分析函数：不能在窗口从句中使用，还有lead，lag。
        ntile:将排序好的数据集分桶，并且给每行分配一个合适的桶号，适用于将数据等分

        rank:组内元素进行编号
        dense_rank:组内元素进行编号，并排的下一个元素步进1
        row_number:安照排序和分区给元素编号，每组从1开始
        percent_rank:??

        cume_dist:??计算 <= 给定值的行数， 给定值 = 总行数 / 当前行            
    窗口函数：
        lead(value_expr,[offset],[default]):窗口(组)内指定列的行向下前进offset行
        lag(value_expr,[default]):窗口(组)内指定列的行向上滞后offset行
        first_value:
        last_value:
over + :
    标准的聚合函数
    注意order by只支持一个排序键，变通方法：使用rows unbounded preceding窗口从句
    partition by + :基本类型的一个或多个分区列 why?
    partition by + sorted by：任意类型的一个或多个分区列/排序列
    窗口从句：窗口和组的区别？窗口是分区内的某个范围
        作用：细颗粒度
        类型：
          ??：升序时，star_expr必须小于end_expr,否则整列值为null或报异常
          行类型窗口    ROWS BETWEEN <start_expr> AND <end_expr> 当前行之前或之后的行数
          范围类型窗口     RANGE BETWEEN <start_expr> AND <end_expr>
              两窗口的不同点：range仅支持一个排序列，且此列只能为数字或日期。窗口的范围由排序列决定

        若定义了order by,缺失窗口从句。默认为：range between unbounded preceding and current row 窗口为当前分区的起点到当前行
        都缺失。默认为：rows between unbounded preceding and unbounded following 窗口为当前分区

create table if not exists employee_contract (
name string,dept_num int,employee_id int,salary int,type string,start_date date
)
row format delimited
fields terminated by ‘|‘
stored as textfile;
load data local inpath "/home/centos/hiveessential/ch06/employee_contract.txt"
overwrite into table employee_contract;

select name,dept_num,salary,count(*) over (partition by dept_num) as row_cnt,--组内求和
sum(salary) over (partition by dept_num order by dept_num) as deptTotal,--    组内数据累加
sum(salary) over (order by dept_num) as runingTotol1,--各组之和累加， ？？？？
sum(salary) over (order by dept_num,name rows unbounded preceding) as runingTotol2  --看不懂
from employee_contract
order by dept_num,name;

set hive.strict.checks.large.query=false;
set hive.mapred.mode=nonstrict;

select name,count(*) over (partition by dept_num) as row_cnt,sum(salary) over(partition by dept_num order by dept_num) as deptTotal,sum(salary) over(order by dept_num) as runingTotol1,sum(salary) over(order by dept_num,name rows unbounded preceding) as runingTotol2
from employee_contract
order by dept_num,name;
<==>独立的窗口从句
select name,count(*) over row_cnt,sum(salary) over deptTotal,sum(salary) over runingTotol1,sum(salary) over runingTotol2
from employee_contract
order by dept_num,name
window
row_cnt as (partition by dept_num),overdeptTotal as (partition by dept_num order by dept_num),runingTotol1 as (order by dept_num),runingTotol2 as (order by dept_num,name rows unbounded preceding);

//范围查询
select name,start_year,max(salary) over (partition by dept_num order by start_year 
range between 2 preceding and current row) as win1
from (
select name,year(start_date) as start_year
from employee_contract
) a;

抽样
1. 随机取样??
select name from employee_id
distribute by rand() sort by rand()
limit 5;
2. 分桶表抽样
SELECT * FROM <BucketTable_Name> 
TABLESAMPLE(BUCKET <specified bucket number to sample> OUT OF <total number of buckets> ON [colname|RAND()]) table_alias;
select name from employee_id_buckets
tablesample(bucket 1 out of 2 on rand()) a;
3. 块抽样
SELECT * FROM <Table_Name> 
TABLESAMPLE(N PERCENT|ByteLengthLiteral|N ROWS) s;
按百分比大小抽样
select name from employee_id_buckets
tablesample (20 percent) a;
按行数抽样
select name from employee_id
tablesample (5 rows) a;
按字节大小抽样
select name from employee_id
tablesample (700B) a;
（编辑：李大同）
【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时与联系站长删除相关内容!