精华内容
下载资源
问答
  • Hive 开窗函数

    万次阅读 多人点赞 2018-08-06 00:19:44
    工作中用到了几个hive开窗函数,便想把hive开窗函数系统梳理一遍。 开窗函数 普通的聚合函数聚合的行集是组,开窗函数聚合的行集是窗口。因此,普通的聚合函数每组(Group by)只返回一个值,而开窗函数则可为窗口中...
    工作中用到了几个hive开窗函数,便想把hive开窗函数系统梳理一遍。
    

    开窗函数

    普通的聚合函数聚合的行集是组,开窗函数聚合的行集是窗口。因此,普通的聚合函数每组(Group by)只返回一个值,而开窗函数则可为窗口中的每行都返回一个值。简单理解,就是对查询的结果多出一列,这一列可以是聚合值,也可以是排序值。
    开窗函数一般分为两类,聚合开窗函数和排序开窗函数。
    

    测试数据

    -- 建表
    create table student_scores(
    id int,
    studentId int,
    language int,
    math int,
    english int,
    classId string,
    departmentId string
    );
    -- 写入数据
    insert into table student_scores values 
      (1,111,68,69,90,'class1','department1'),
      (2,112,73,80,96,'class1','department1'),
      (3,113,90,74,75,'class1','department1'),
      (4,114,89,94,93,'class1','department1'),
      (5,115,99,93,89,'class1','department1'),
      (6,121,96,74,79,'class2','department1'),
      (7,122,89,86,85,'class2','department1'),
      (8,123,70,78,61,'class2','department1'),
      (9,124,76,70,76,'class2','department1'),
      (10,211,89,93,60,'class1','department2'),
      (11,212,76,83,75,'class1','department2'),
      (12,213,71,94,90,'class1','department2'),
      (13,214,94,94,66,'class1','department2'),
      (14,215,84,82,73,'class1','department2'),
      (15,216,85,74,93,'class1','department2'),
      (16,221,77,99,61,'class2','department2'),
      (17,222,80,78,96,'class2','department2'),
      (18,223,79,74,96,'class2','department2'),
      (19,224,75,80,78,'class2','department2'),
      (20,225,82,85,63,'class2','department2');

    聚合开窗函数

    count开窗函数

    -- count 开窗函数
    
    select studentId,math,departmentId,classId,
    -- 以符合条件的所有行作为窗口
    count(math) over() as count1,
     -- 以按classId分组的所有行作为窗口
    count(math) over(partition by classId) as count2,
     -- 以按classId分组、按math排序的所有行作为窗口
    count(math) over(partition by classId order by math) as count3,
     -- 以按classId分组、按math排序、按 当前行+往前1行+往后2行的行作为窗口
    count(math) over(partition by classId order by math rows between 1 preceding and 2 following) as count4
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid count1  count2  count3  count4
    111         69      department1     class1  9       5       1       3
    113         74      department1     class1  9       5       2       4
    112         80      department1     class1  9       5       3       4
    115         93      department1     class1  9       5       4       3
    114         94      department1     class1  9       5       5       2
    124         70      department1     class2  9       4       1       3
    121         74      department1     class2  9       4       2       4
    123         78      department1     class2  9       4       3       3
    122         86      department1     class2  9       4       4       2
    
    结果解释:
    studentid=115,count1为所有的行数9,count2为分区class1中的行数5,count3为分区class1中math值<=93的行数4,
    count4为分区class1中math值向前+1行向后+2行(实际只有1行)的总行数3

    sum开窗函数

    -- sum开窗函数
    
    select studentId,math,departmentId,classId,
    -- 以符合条件的所有行作为窗口
    sum(math) over() as sum1,
    -- 以按classId分组的所有行作为窗口
    sum(math) over(partition by classId) as sum2,
     -- 以按classId分组、按math排序后、按到当前行(含当前行)的所有行作为窗口
    sum(math) over(partition by classId order by math) as sum3,
     -- 以按classId分组、按math排序后、按当前行+往前1行+往后2行的行作为窗口
    sum(math) over(partition by classId order by math rows between 1 preceding and 2 following) as sum4
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid sum1    sum2    sum3    sum4
    111         69      department1     class1  718     410     69      223
    113         74      department1     class1  718     410     143     316
    112         80      department1     class1  718     410     223     341
    115         93      department1     class1  718     410     316     267
    114         94      department1     class1  718     410     410     187
    124         70      department1     class2  718     308     70      222
    121         74      department1     class2  718     308     144     308
    123         78      department1     class2  718     308     222     238
    122         86      department1     class2  718     308     308     164
    
    结果解释:
        同count开窗函数

    min开窗函数

    -- min 开窗函数
    
    select studentId,math,departmentId,classId,
    -- 以符合条件的所有行作为窗口
    min(math) over() as min1,
    -- 以按classId分组的所有行作为窗口
    min(math) over(partition by classId) as min2,
     -- 以按classId分组、按math排序后、按到当前行(含当前行)的所有行作为窗口
    min(math) over(partition by classId order by math) as min3,
     -- 以按classId分组、按math排序后、按当前行+往前1行+往后2行的行作为窗口
    min(math) over(partition by classId order by math rows between 1 preceding and 2 following) as min4
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid min1    min2    min3    min4
    111         69      department1     class1  69      69      69      69
    113         74      department1     class1  69      69      69      69
    112         80      department1     class1  69      69      69      74
    115         93      department1     class1  69      69      69      80
    114         94      department1     class1  69      69      69      93
    124         70      department1     class2  69      70      70      70
    121         74      department1     class2  69      70      70      70
    123         78      department1     class2  69      70      70      74
    122         86      department1     class2  69      70      70      78
    
    结果解释:
        同count开窗函数

    max开窗函数

    -- max 开窗函数
    
    select studentId,math,departmentId,classId,
    -- 以符合条件的所有行作为窗口
    max(math) over() as max1,
    -- 以按classId分组的所有行作为窗口
    max(math) over(partition by classId) as max2,
     -- 以按classId分组、按math排序后、按到当前行(含当前行)的所有行作为窗口
    max(math) over(partition by classId order by math) as max3,
     -- 以按classId分组、按math排序后、按当前行+往前1行+往后2行的行作为窗口
    max(math) over(partition by classId order by math rows between 1 preceding and 2 following) as max4
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid max1    max2    max3    max4
    111         69      department1     class1  94      94      69      80
    113         74      department1     class1  94      94      74      93
    112         80      department1     class1  94      94      80      94
    115         93      department1     class1  94      94      93      94
    114         94      department1     class1  94      94      94      94
    124         70      department1     class2  94      86      70      78
    121         74      department1     class2  94      86      74      86
    123         78      department1     class2  94      86      78      86
    122         86      department1     class2  94      86      86      86
    
    结果解释:
        同count开窗函数

    avg开窗函数

    -- avg 开窗函数
    
    select studentId,math,departmentId,classId,
    -- 以符合条件的所有行作为窗口
    avg(math) over() as avg1,
    -- 以按classId分组的所有行作为窗口
    avg(math) over(partition by classId) as avg2,
     -- 以按classId分组、按math排序后、按到当前行(含当前行)的所有行作为窗口
    avg(math) over(partition by classId order by math) as avg3,
     -- 以按classId分组、按math排序后、按当前行+往前1行+往后2行的行作为窗口
    avg(math) over(partition by classId order by math rows between 1 preceding and 2 following) as avg4
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid avg1                avg2    avg3                avg4
    111         69      department1     class1  79.77777777777777   82.0    69.0                74.33333333333333
    113         74      department1     class1  79.77777777777777   82.0    71.5                79.0
    112         80      department1     class1  79.77777777777777   82.0    74.33333333333333   85.25
    115         93      department1     class1  79.77777777777777   82.0    79.0                89.0
    114         94      department1     class1  79.77777777777777   82.0    82.0                93.5
    124         70      department1     class2  79.77777777777777   77.0    70.0                74.0
    121         74      department1     class2  79.77777777777777   77.0    72.0                77.0
    123         78      department1     class2  79.77777777777777   77.0    74.0                79.33333333333333
    122         86      department1     class2  79.77777777777777   77.0    77.0                82.0
    
    结果解释:
        同count开窗函数

    first_value开窗函数

    返回分区中的第一个值。
    
    -- first_value 开窗函数
    
    select studentId,math,departmentId,classId,
    -- 以符合条件的所有行作为窗口
    first_value(math) over() as first_value1,
    -- 以按classId分组的所有行作为窗口
    first_value(math) over(partition by classId) as first_value2,
     -- 以按classId分组、按math排序后、按到当前行(含当前行)的所有行作为窗口
    first_value(math) over(partition by classId order by math) as first_value3,
     -- 以按classId分组、按math排序后、按当前行+往前1行+往后2行的行作为窗口
    first_value(math) over(partition by classId order by math rows between 1 preceding and 2 following) as first_value4
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid first_value1    first_value2    first_value3    first_value4
    111         69      department1     class1  69              69              69              69
    113         74      department1     class1  69              69              69              69
    112         80      department1     class1  69              69              69              74
    115         93      department1     class1  69              69              69              80
    114         94      department1     class1  69              69              69              93
    124         70      department1     class2  69              74              70              70
    121         74      department1     class2  69              74              70              70
    123         78      department1     class2  69              74              70              74
    122         86      department1     class2  69              74              70              78
    
    结果解释:
        studentid=124 first_value1:第一个值是69,first_value2:classId=class1分区 math的第一个值是69

    last_value开窗函数

    返回分区中的第一个值。
    
    -- last_value 开窗函数
    
    select studentId,math,departmentId,classId,
    -- 以符合条件的所有行作为窗口
    last_value(math) over() as last_value1,
    -- 以按classId分组的所有行作为窗口
    last_value(math) over(partition by classId) as last_value2,
     -- 以按classId分组、按math排序后、按到当前行(含当前行)的所有行作为窗口
    last_value(math) over(partition by classId order by math) as last_value3,
     -- 以按classId分组、按math排序后、按当前行+往前1行+往后2行的行作为窗口
    last_value(math) over(partition by classId order by math rows between 1 preceding and 2 following) as last_value4
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid last_value1 last_value2 last_value3 last_value4
    111         69      department1     class1  70          93          69          80
    113         74      department1     class1  70          93          74          93
    112         80      department1     class1  70          93          80          94
    115         93      department1     class1  70          93          93          94
    114         94      department1     class1  70          93          94          94
    124         70      department1     class2  70          70          70          78
    121         74      department1     class2  70          70          74          86
    123         78      department1     class2  70          70          78          86
    122         86      department1     class2  70          70          86          86

    lag开窗函数

    lag(col,n,default) 用于统计窗口内往上第n个值。
        col:列名
        n:往上第n行
        default:往上第n行为NULL时候,取默认值,不指定则取NULL
    
    -- lag 开窗函数
    
    select studentId,math,departmentId,classId,
     --窗口内 往上取第二个 取不到时赋默认值60
    lag(math,2,60) over(partition by classId order by math) as lag1,
     --窗口内 往上取第二个 取不到时赋默认值NULL
    lag(math,2) over(partition by classId order by math) as lag2
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid lag1    lag2
    111         69      department1     class1  60      NULL
    113         74      department1     class1  60      NULL
    112         80      department1     class1  69      69
    115         93      department1     class1  74      74
    114         94      department1     class1  80      80
    124         70      department1     class2  60      NULL
    121         74      department1     class2  60      NULL
    123         78      department1     class2  70      70
    122         86      department1     class2  74      74
    
    结果解释:
        第3行 lag1:窗口内(69 74 80) 当前行80 向上取第二个值为69
        倒数第3行 lag2:窗口内(70 74) 当前行74 向上取第二个值为NULL

    lead开窗函数

    lead(col,n,default) 用于统计窗口内往下第n个值。
        col:列名
        n:往下第n行
        default:往下第n行为NULL时候,取默认值,不指定则取NULL
    
    -- lead开窗函数
    
    select studentId,math,departmentId,classId,
     --窗口内 往下取第二个 取不到时赋默认值60
    lead(math,2,60) over(partition by classId order by math) as lead1,
     --窗口内 往下取第二个 取不到时赋默认值NULL
    lead(math,2) over(partition by classId order by math) as lead2
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid lead1   lead2
    111         69      department1     class1  80      80
    113         74      department1     class1  93      93
    112         80      department1     class1  94      94
    115         93      department1     class1  60      NULL
    114         94      department1     class1  60      NULL
    124         70      department1     class2  78      78
    121         74      department1     class2  86      86
    123         78      department1     class2  60      NULL
    122         86      department1     class2  60      NULL
    
    结果解释:
        第4行lead1 窗口内向下第二个值为空,赋值60

    cume_dist开窗函数

    计算某个窗口或分区中某个值的累积分布。假定升序排序,则使用以下公式确定累积分布:
    小于等于当前值x的行数 / 窗口或partition分区内的总行数。其中,x 等于 order by 子句中指定的列的当前行中的值。
    
    -- cume_dist 开窗函数
    
    select studentId,math,departmentId,classId,
    -- 统计小于等于当前分数的人数占总人数的比例
    cume_dist() over(order by math) as cume_dist1,
    -- 统计大于等于当前分数的人数占总人数的比例
    cume_dist() over(order by math desc) as cume_dist2,
    -- 统计分区内小于等于当前分数的人数占总人数的比例
    cume_dist() over(partition by classId order by math) as cume_dist3
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid cume_dist1              cume_dist2          cume_dist3
    111         69      department1     class1  0.1111111111111111      1.0                 0.2
    113         74      department1     class1  0.4444444444444444      0.7777777777777778  0.4
    112         80      department1     class1  0.6666666666666666      0.4444444444444444  0.6
    115         93      department1     class1  0.8888888888888888      0.2222222222222222  0.8
    114         94      department1     class1  1.0                     0.1111111111111111  1.0
    124         70      department1     class2  0.2222222222222222      0.8888888888888888  0.25
    121         74      department1     class2  0.4444444444444444      0.7777777777777778  0.5
    123         78      department1     class2  0.5555555555555556      0.5555555555555556  0.75
    122         86      department1     class2  0.7777777777777778      0.3333333333333333  1.0
    
    结果解释:
        第三行:
            cume_dist1=小于等于80的人数为6/总人数9=0.6666666666666666
            cume_dist2=大于等于80的人数为4/总人数9=0.4444444444444444
            cume_dist3=分区内小于等于80的人数为3/分区内总人数5=0.6
    

    排序开窗函数

    rank开窗函数

    rank 开窗函数基于 over 子句中的 order by 确定一组值中一个值的排名。如果存在partition by ,则为每个分区组中的每个值排名。排名可能不是连续的。例如,如果两个行的排名为 1,则下一个排名为 3。
    
    -- rank 开窗函数
    
    select *,
    -- 对全部学生按数学分数排序 
    rank() over(order by math) as rank1,
    -- 对院系 按数学分数排序
    rank() over(partition by departmentId order by math) as rank2,
    -- 对每个院系每个班级 按数学分数排序
    rank() over(partition by departmentId,classId order by math) as rank3
    from student_scores;
    
    结果
    
    id  studentid   language    math    english     classid departmentid    rank1   rank2   rank3
    1   111         68          69      90          class1  department1     1       1       1
    3   113         90          74      75          class1  department1     3       3       2
    2   112         73          80      96          class1  department1     9       6       3
    5   115         99          93      89          class1  department1     15      8       4
    4   114         89          94      93          class1  department1     17      9       5
    9   124         76          70      76          class2  department1     2       2       1
    6   121         96          74      79          class2  department1     3       3       2
    8   123         70          78      61          class2  department1     7       5       3
    7   122         89          86      85          class2  department1     14      7       4
    15  216         85          74      93          class1  department2     3       1       1
    14  215         84          82      73          class1  department2     11      5       2
    11  212         76          83      75          class1  department2     12      6       3
    10  211         89          93      60          class1  department2     15      8       4
    12  213         71          94      90          class1  department2     17      9       5
    13  214         94          94      66          class1  department2     17      9       5
    18  223         79          74      96          class2  department2     3       1       1
    17  222         80          78      96          class2  department2     7       3       2
    19  224         75          80      78          class2  department2     9       4       3
    20  225         82          85      63          class2  department2     13      7       4
    16  221         77          99      61          class2  department2     20      11      5

    dense_rank开窗函数

    dense_rank与rank有一点不同,当排名一样的时候,接下来的行是连续的。如两个行的排名为 1,则下一个排名为 2。
    
    -- dense_rank 开窗函数
    
    select *,
    -- 对全部学生按数学分数排序
    dense_rank() over(order by math) as dense_rank1,
    -- 对院系 按数学分数排序
    dense_rank() over(partition by departmentId order by math) as dense_rank2,
    -- 对每个院系每个班级 按数学分数排序
    dense_rank() over(partition by departmentId,classId order by math) as dense_rank3
    from student_scores;
    
    结果:
    id  studentid   language    math    english classid departmentid    dense_rank1 dense_rank2 dense_rank3
    1   111         68          69      90      class1  department1     1           1           1
    3   113         90          74      75      class1  department1     3           3           2
    2   112         73          80      96      class1  department1     5           5           3
    5   115         99          93      89      class1  department1     10          7           4
    4   114         89          94      93      class1  department1     11          8           5
    9   124         76          70      76      class2  department1     2           2           1
    6   121         96          74      79      class2  department1     3           3           2
    8   123         70          78      61      class2  department1     4           4           3
    7   122         89          86      85      class2  department1     9           6           4
    15  216         85          74      93      class1  department2     3           1           1
    14  215         84          82      73      class1  department2     6           4           2
    11  212         76          83      75      class1  department2     7           5           3
    10  211         89          93      60      class1  department2     10          7           4
    12  213         71          94      90      class1  department2     11          8           5
    13  214         94          94      66      class1  department2     11          8           5
    18  223         79          74      96      class2  department2     3           1           1
    17  222         80          78      96      class2  department2     4           2           2
    19  224         75          80      78      class2  department2     5           3           3
    20  225         82          85      63      class2  department2     8           6           4
    16  221         77          99      61      class2  department2     12          9           5

    ntile开窗函数

    将分区中已排序的行划分为大小尽可能相等的指定数量的排名的组,并返回给定行所在的组的排名。
    
    -- ntile 开窗函数
    
    select *,
    -- 对分区内的数据分成两组
    ntile(2) over(partition by departmentid order by math) as ntile1,
    -- 对分区内的数据分成三组
    ntile(3) over(partition by departmentid order by math) as ntile2
    from student_scores;
    
    结果
    id  studentid   language    math    english classid departmentid    ntile1  ntile2
    1   111         68          69      90      class1  department1     1       1
    9   124         76          70      76      class2  department1     1       1
    6   121         96          74      79      class2  department1     1       1
    3   113         90          74      75      class1  department1     1       2
    8   123         70          78      61      class2  department1     1       2
    2   112         73          80      96      class1  department1     2       2
    7   122         89          86      85      class2  department1     2       3
    5   115         99          93      89      class1  department1     2       3
    4   114         89          94      93      class1  department1     2       3
    18  223         79          74      96      class2  department2     1       1
    15  216         85          74      93      class1  department2     1       1
    17  222         80          78      96      class2  department2     1       1
    19  224         75          80      78      class2  department2     1       1
    14  215         84          82      73      class1  department2     1       2
    11  212         76          83      75      class1  department2     1       2
    20  225         82          85      63      class2  department2     2       2
    10  211         89          93      60      class1  department2     2       2
    12  213         71          94      90      class1  department2     2       3
    13  214         94          94      66      class1  department2     2       3
    16  221         77          99      61      class2  department2     2       3
    
    结果解释:
        第8行
            ntile1:对分区的数据均匀分成2组后,当前行的组排名为2
            ntile2:对分区的数据均匀分成3组后,当前行的组排名为3

    row_number开窗函数

    从1开始对分区内的数据排序。
    
    -- row_number 开窗函数
    
    select studentid,departmentid,classid,math,
    -- 对分区departmentid,classid内的数据按math排序
    row_number() over(partition by departmentid,classid order by math) as row_number
    from student_scores;
    
    结果
    studentid   departmentid    classid math    row_number
    111         department1     class1  69      1
    113         department1     class1  74      2
    112         department1     class1  80      3
    115         department1     class1  93      4
    114         department1     class1  94      5
    124         department1     class2  70      1
    121         department1     class2  74      2
    123         department1     class2  78      3
    122         department1     class2  86      4
    216         department2     class1  74      1
    215         department2     class1  82      2
    212         department2     class1  83      3
    211         department2     class1  93      4
    213         department2     class1  94      5
    214         department2     class1  94      6
    223         department2     class2  74      1
    222         department2     class2  78      2
    224         department2     class2  80      3
    225         department2     class2  85      4
    221         department2     class2  99      5
    
    结果解释:
        同一分区,相同值,不同序。如studentid=213 studentid=214 值都为94 排序为5,6。

    percent_rank开窗函数

    计算给定行的百分比排名。可以用来计算超过了百分之多少的人。如360小助手开机速度超过了百分之多少的人。
    (当前行的rank值-1)/(分组内的总行数-1)
    
    -- percent_rank 开窗函数
    
    select studentid,departmentid,classid,math,
    row_number() over(partition by departmentid,classid order by math) as row_number,
    percent_rank() over(partition by departmentid,classid order by math) as percent_rank
    from student_scores;
    
    结果
    studentid   departmentid    classid math    row_number  percent_rank
    111         department1     class1  69      1           0.0
    113         department1     class1  74      2           0.25
    112         department1     class1  80      3           0.5
    115         department1     class1  93      4           0.75
    114         department1     class1  94      5           1.0
    124         department1     class2  70      1           0.0
    121         department1     class2  74      2           0.3333333333333333
    123         department1     class2  78      3           0.6666666666666666
    122         department1     class2  86      4           1.0
    216         department2     class1  74      1           0.0
    215         department2     class1  82      2           0.2
    212         department2     class1  83      3           0.4
    211         department2     class1  93      4           0.6
    213         department2     class1  94      5           0.8
    214         department2     class1  94      6           0.8
    223         department2     class2  74      1           0.0
    222         department2     class2  78      2           0.25
    224         department2     class2  80      3           0.5
    225         department2     class2  85      4           0.75
    221         department2     class2  99      5           1.0
    
    结果解释:
        studentid=115,percent_rank=(4-1)/(5-1)=0.75
        studentid=123,percent_rank=(3-1)/(4-1)=0.6666666666666666
    展开全文
  • HIVE开窗函数

    2021-06-10 20:43:52
    ETL,SQL面试高频考点——HIVE开窗函数 目录标题ETL,SQL面试高频考点——HIVE开窗函数一,窗口函数介绍二,开窗函数三,开窗函数分类排序开窗函数:实列解析:后续更新中~ 一,窗口函数介绍 窗口函数,也叫OLAP函数...

    ETL,SQL面试高频考点——HIVE开窗函数(基础篇)

    一,窗口函数介绍

    • 窗口函数,也叫OLAP函数(Online Anallytical Processing,联机分析处理),可以对数据库数据进行实时分析处理。
    • 窗口函数由开窗函数和分析函数构成,窗口函数就是既要显示聚集前的数据,又要显示聚集后的数据,简单讲,就是你查询的结果上,多出一列值(可以是聚合值或者排序号),所以分析函数可以分为两类:聚合分析函数和排序分析函数
    • 基本语法:
    <窗口分析函数> over (partition by <用于分组的字段> order by<用于排序的列名>) 
    

    二,开窗函数

    1,概要
    开窗函数就是over()函数,就是限定一个窗口,来显示分析函数的结果
    2,开窗函数一般有两种(固定形式,不可更改)

    -- 第一种
    over(partiton by ... order by ...)
    -- 第二种
    over(distribute by ... order by ...)
    

    3,区别

    • partiton by是一个一个reduce处理数据的,所以使用全局排序order by distribute
    • distribute by是多个reduce处理数据的,所以使用局部排序sort by

    三,分析函数分类

    1,排序分析函数:

    • RANK() OVER();
    • ROW_NUMBER() OVER();
    • DENSE_RANK() OVER();
    • NTILE(n) OVER();
    实列解析
    • 创建一个学生成绩表
    create table if not exists sc(name string,score int)
    row format delimited fields terminated by '\t';
    
    • 导入数据到HIVE
      张三 100
      李四 90
      王五 80
      刘六 100
      田七 70
    load data local inpath '数据文件本地路径' into table sc
    

    1.使用 RANK() OVER()

    • 特点:并列跳跃排序
    • 不开窗sql实现rank()
      sql语句
    SELECT b.name,b.Score,(
    select count(a.score)
    from sc a
    WHERE a.score>b.score) +1 as Ranking
    FROM sc b
    ORDER BY b.score DESC
    
    • 结果
      在这里插入图片描述
    • 原理总结:
      第一部分实现表的降序排序

    sql语句

    SELECT b.name,b.Score
    FROM sc b
    ORDER BY b.score DESC
    

    在这里插入图片描述
    第二部分实现,假设给你一个分数,如何计算出他的Rank排名:
    比如图中的王五是95分,大于他的分数有[100,100],count统计个数为2,加1就能得到他的rank排名3,其他如此类推,所以每个同学的rank排名为成绩在他之上的人数+1

    sql语句

    --计算大于分数的集合个数
    (select count(a.score)
    from sc a
    WHERE a.score>b.score) +1 as Ranking
    
    • 使用开窗语句:
    FROM sc
    SELECT name,score,rank()  over(ORDER BY score desc) ranking
    
    • 结果:
      在这里插入图片描述

    2.使用 DENSE_RANK() OVER()

    • 特点:并列连续排序
    • 不开窗sql实现dense_rank()
      sql语句
    SELECT b.name,b.Score
    ,(select count(DISTINCT(a.score))
    from sc a
    WHERE a.score>=b.score) as Ranking
    FROM sc b
    ORDER BY b.score DESC
    
    • 结果
      在这里插入图片描述

    • 原理
      这个与上面的函数不同因为是连续的,所以我们需要对获得到的重复的分数进行去重,比如说李四的成绩为90,那大于等于90的集合为[100,100,95,90,90],去重之后集合为[100,95,90],集合的长度就是他对应的连续排名3

    • DENSE_RANK函数

    FROM sc
    SELECT name,score,dense_rank() OVER(ORDER BY score DESC) ranking
    
    对比总结

    通过上面两个函数,我们发现开窗函数能实现普通函数比较难实现或者无法实现的问题,因为聚合函数只能操作分组的字段,这也是聚合函数最大的特点,窗口函数能够操作所有的字段,不受分组的限制。

    3.ROW_NUMBER() PERCENT_RANK() NTILE() 概述

    • ROW_NUMBER()
      特点:连续排序

    • PERCENT_RANK()
      特点:百分比排序

    • NTILE()
      特点:将有序集分桶(bucket)

    sql语句

    FROM sc
    SELECT name,score,
    ntile(3) OVER(ORDER BY score DESC) ntres,
    percent_rank() OVER(ORDER BY score DESC) prank,
    row_number() OVER(ORDER BY score DESC) rownum
    

    结果
    在这里插入图片描述

    2.聚合分析函数

    函数 作用
    sum 求和
    avg 求平均数
    max 求最大值
    min 求最小值
    count 计数
    first_value 返回分区中的第一个值
    last_value 返回分区中的最后一个值
    lag(col,n,default) col指定列,n用于统计窗口内往上第n个值 ,default不指定查找不到默认为NULL
    lead(col,n,default) col指定列,n用于统计窗口内往下第n个值,default不指定查找不到默认为NULL
    cume_dist 计算某个窗口或分区中某个值的累积分布,值为order by 子句中指定的列的当前行中的值。

    例:sum avg max min count

    FROM sc
    SELECT name,score,
    sum(score) OVER() sumres,
    count(score) OVER() cres ,
    min(score) OVER() minres,
    max(score) OVER() maxres,
    avg(score) OVER() avgres
    

    结果
    在这里插入图片描述

    • 下面的函数的使用是根据over()中的参数变化而变化的

    first_value() 函数

    • 用法是根据partiton by 的字段进行分区,如果忽略partition by,会根据order by排序后的结果返回第一条数据
    • 示例 在每行数据后开窗显示总第一名和每个班的第一名
    FROM sc2 
    SELECT *,first_value(score) OVER(ORDER BY score DESC) as totalFirst,
    first_value(score) OVER(PARTITION BY class ORDER BY score DESC) as classFirst
    

    在这里插入图片描述

    last_value() 函数

    • 作用是返回到当前行的最后一条数据
    • 示例
    FROM SC2
    -- 求分区内到当前行的最后一个值
    SELECT name,score,class,last_value(score) OVER(PARTITION BY class order by score) lastvalue
    

    在这里插入图片描述

    lag() 函数

    • 用法是用于统计分组内的往上前n个值
    • 示例1 排名并显示每个同学和上一位同学的分差
    from sc2
    select *,
    score-lag(score,1,0) over(order by score desc) as gap
    
    

    在这里插入图片描述

    • 示例2 排名并显示同一个班上每个同学和上一位同学的分差
    from sc2
    select *,
    score-lag(score,1,score) over(partition by class order by score desc) as gapByClass
    

    在这里插入图片描述

    lead() 函数

    • 用法是用于统计分组内的往下后n个值
    • 示例 排名并显示每个同学和后一名一位同学的分差
    from sc2
    select *,
    score-lead(score,1,0) over(order by score desc) as gap
    

    在这里插入图片描述

    • 示例2 排名并显示同一个班上每个同学和下一位同学的分差
    from sc2
    select *,
    score-lead(score,1,score) over(partition by class order by score desc) as gapByClass
    

    在这里插入图片描述

    cume_dist() 函数

    • 用法是如果按升序排列,则统计:小于等于当前值的行数/总行数(number of rows ≤ current row)/(total number of rows)。如果是降序排列,则统计:大于等于当前值的行数/总行数。
    • 示例1 统计小于等于当前分数的人数占比
    from sc2
    select *,
    cume_dist() over(order by score) as cume_dist
    

    在这里插入图片描述

    • 示例2 统计每个班小于等于当前分数的人数占比
    from sc2
    select *,
    cume_dist() over(partition by class order by score) as cume_dist
    

    在这里插入图片描述

    3.用spark 自定义HIVE用户自定义函数

    • hive中的表,可以通过如下语句缓存表,提高分析速度
    cache table 表名
    
    • 自定义UDF函数,相对比较简单我用的spark-shell
      例:addName()函数
      在这里插入图片描述
    • 自定义UDAF函数
      需要写一个类继承UserDefinedAggregateFunction.实现抽象方法,注册后使用spark.sql调用
      例: 构造一个自定义的MyAvg()函数
    import org.apache.spark.SparkConf
    import org.apache.spark.sql.{Row, SparkSession}
    import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
    import org.apache.spark.sql.types.{DataType, DoubleType, LongType, StructField, StructType}
    
    object UdadDemo {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setMaster("local").setAppName("UdAd")
        val spark: SparkSession = new SparkSession.Builder().config(conf).enableHiveSupport().getOrCreate()
        val avg = new MyAvg
        spark.udf.register("myAvg",avg)
        spark.sql("select MyAvg(score) from sc").show()
      }
    }
    class MyAvg extends UserDefinedAggregateFunction(){
      //输入结构
      override def inputSchema: StructType = StructType(Seq(StructField("inputColumn",LongType)))
      //结构缓存
      override def bufferSchema: StructType ={
        StructType{
          Seq(StructField("sum",LongType),StructField("count",LongType))
        }
      }
      //返回值类型
      override def dataType: DataType = DoubleType
      //是否是稳定的
      override def deterministic: Boolean = true
      //初始化缓存设置0值的位置
      override def initialize(buffer: MutableAggregationBuffer): Unit = {
        buffer(0)=0L
        buffer(1)=0L
      }
      //更新缓存
      override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
        buffer(0)=buffer.getLong(0)+input.getLong(0)
        buffer(1)=buffer.getLong(1)+1
      }
      //合并缓存
      override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit ={
        buffer1(0)=buffer1.getLong(0)+buffer2.getLong(0)
        buffer1(1)=buffer1.getLong(1)+buffer2.getLong(1)
      }
      //计算输出结果
      override def evaluate(buffer: Row): Any = {
        buffer.getLong(0).toDouble/buffer.getLong(1)
      }
    }
    
    • 结果:
      ±-----------------------------------------+
      |myavg(CAST(score AS BIGINT))|
      ±------------------------------------------+
      | 88.0|
      ±------------------------------------------+

    后续更新中~

    展开全文
  • hive开窗函数

    2020-02-26 14:13:21
    工作中用到了几个hive开窗函数,便想把hive开窗函数系统梳理一遍。 开窗函数 普通的聚合函数聚合的行集是组,开窗函数聚合的行集是窗口。因此,普通的聚合函数每组(Group by)只返回一个值,而开窗函数则可为窗口中的每...

    Hive 开窗函数

    展开

    工作中用到了几个hive开窗函数,便想把hive开窗函数系统梳理一遍。
    

    开窗函数

    普通的聚合函数聚合的行集是组,开窗函数聚合的行集是窗口。因此,普通的聚合函数每组(Group by)只返回一个值,而开窗函数则可为窗口中的每行都返回一个值。
    简单理解,就是对查询的结果多出一列,这一列可以是聚合值,也可以是排序值。
    开窗函数一般分为两类,聚合开窗函数和排序开窗函数。
    

    测试数据

    -- 建表
    create table student_scores(
    id int,
    studentId int,
    language int,
    math int,
    english int,
    classId string,
    departmentId string
    );
    -- 写入数据
    insert into table student_scores values 
      (1,111,68,69,90,'class1','department1'),
      (2,112,73,80,96,'class1','department1'),
      (3,113,90,74,75,'class1','department1'),
      (4,114,89,94,93,'class1','department1'),
      (5,115,99,93,89,'class1','department1'),
      (6,121,96,74,79,'class2','department1'),
      (7,122,89,86,85,'class2','department1'),
      (8,123,70,78,61,'class2','department1'),
      (9,124,76,70,76,'class2','department1'),
      (10,211,89,93,60,'class1','department2'),
      (11,212,76,83,75,'class1','department2'),
      (12,213,71,94,90,'class1','department2'),
      (13,214,94,94,66,'class1','department2'),
      (14,215,84,82,73,'class1','department2'),
      (15,216,85,74,93,'class1','department2'),
      (16,221,77,99,61,'class2','department2'),
      (17,222,80,78,96,'class2','department2'),
      (18,223,79,74,96,'class2','department2'),
      (19,224,75,80,78,'class2','department2'),
      (20,225,82,85,63,'class2','department2');
    

    聚合开窗函数

    count开窗函数

    -- count 开窗函数
    
    select studentId,math,departmentId,classId,
    -- 以符合条件的所有行作为窗口
    count(math) over() as count1,
     -- 以按classId分组的所有行作为窗口
    count(math) over(partition by classId) as count2,
     -- 以按classId分组、按math排序的所有行作为窗口
    count(math) over(partition by classId order by math) as count3,
     -- 以按classId分组、按math排序、按 当前行+往前1行+往后2行的行作为窗口
    count(math) over(partition by classId order by math rows between 1 preceding and 2 following) as count4
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid count1  count2  count3  count4
    111         69      department1     class1  9       5       1       3
    113         74      department1     class1  9       5       2       4
    112         80      department1     class1  9       5       3       4
    115         93      department1     class1  9       5       4       3
    114         94      department1     class1  9       5       5       2
    124         70      department1     class2  9       4       1       3
    121         74      department1     class2  9       4       2       4
    123         78      department1     class2  9       4       3       3
    122         86      department1     class2  9       4       4       2
    
    结果解释:
    studentid=115,count1为所有的行数9,count2为分区class1中的行数5,count3为分区class1中math值<=93的行数4,
    count4为分区class1中math值向前+1行向后+2行(实际只有1行)的总行数3。
    

    sum开窗函数

    -- sum开窗函数
    
    select studentId,math,departmentId,classId,
    -- 以符合条件的所有行作为窗口
    sum(math) over() as sum1,
    -- 以按classId分组的所有行作为窗口
    sum(math) over(partition by classId) as sum2,
     -- 以按classId分组、按math排序后、按到当前行(含当前行)的所有行作为窗口
    sum(math) over(partition by classId order by math) as sum3,
     -- 以按classId分组、按math排序后、按当前行+往前1行+往后2行的行作为窗口
    sum(math) over(partition by classId order by math rows between 1 preceding and 2 following) as sum4
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid sum1    sum2    sum3    sum4
    111         69      department1     class1  718     410     69      223
    113         74      department1     class1  718     410     143     316
    112         80      department1     class1  718     410     223     341
    115         93      department1     class1  718     410     316     267
    114         94      department1     class1  718     410     410     187
    124         70      department1     class2  718     308     70      222
    121         74      department1     class2  718     308     144     308
    123         78      department1     class2  718     308     222     238
    122         86      department1     class2  718     308     308     164
    
    结果解释:
        同count开窗函数
    

    min开窗函数

    -- min 开窗函数
    
    select studentId,math,departmentId,classId,
    -- 以符合条件的所有行作为窗口
    min(math) over() as min1,
    -- 以按classId分组的所有行作为窗口
    min(math) over(partition by classId) as min2,
     -- 以按classId分组、按math排序后、按到当前行(含当前行)的所有行作为窗口
    min(math) over(partition by classId order by math) as min3,
     -- 以按classId分组、按math排序后、按当前行+往前1行+往后2行的行作为窗口
    min(math) over(partition by classId order by math rows between 1 preceding and 2 following) as min4
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid min1    min2    min3    min4
    111         69      department1     class1  69      69      69      69
    113         74      department1     class1  69      69      69      69
    112         80      department1     class1  69      69      69      74
    115         93      department1     class1  69      69      69      80
    114         94      department1     class1  69      69      69      93
    124         70      department1     class2  69      70      70      70
    121         74      department1     class2  69      70      70      70
    123         78      department1     class2  69      70      70      74
    122         86      department1     class2  69      70      70      78
    
    结果解释:
        同count开窗函数
    

    max开窗函数

    -- max 开窗函数
    
    select studentId,math,departmentId,classId,
    -- 以符合条件的所有行作为窗口
    max(math) over() as max1,
    -- 以按classId分组的所有行作为窗口
    max(math) over(partition by classId) as max2,
     -- 以按classId分组、按math排序后、按到当前行(含当前行)的所有行作为窗口
    max(math) over(partition by classId order by math) as max3,
     -- 以按classId分组、按math排序后、按当前行+往前1行+往后2行的行作为窗口
    max(math) over(partition by classId order by math rows between 1 preceding and 2 following) as max4
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid max1    max2    max3    max4
    111         69      department1     class1  94      94      69      80
    113         74      department1     class1  94      94      74      93
    112         80      department1     class1  94      94      80      94
    115         93      department1     class1  94      94      93      94
    114         94      department1     class1  94      94      94      94
    124         70      department1     class2  94      86      70      78
    121         74      department1     class2  94      86      74      86
    123         78      department1     class2  94      86      78      86
    122         86      department1     class2  94      86      86      86
    
    结果解释:
        同count开窗函数
    

    avg开窗函数

    -- avg 开窗函数
    
    select studentId,math,departmentId,classId,
    -- 以符合条件的所有行作为窗口
    avg(math) over() as avg1,
    -- 以按classId分组的所有行作为窗口
    avg(math) over(partition by classId) as avg2,
     -- 以按classId分组、按math排序后、按到当前行(含当前行)的所有行作为窗口
    avg(math) over(partition by classId order by math) as avg3,
     -- 以按classId分组、按math排序后、按当前行+往前1行+往后2行的行作为窗口
    avg(math) over(partition by classId order by math rows between 1 preceding and 2 following) as avg4
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid avg1                avg2    avg3                avg4
    111         69      department1     class1  79.77777777777777   82.0    69.0                74.33333333333333
    113         74      department1     class1  79.77777777777777   82.0    71.5                79.0
    112         80      department1     class1  79.77777777777777   82.0    74.33333333333333   85.25
    115         93      department1     class1  79.77777777777777   82.0    79.0                89.0
    114         94      department1     class1  79.77777777777777   82.0    82.0                93.5
    124         70      department1     class2  79.77777777777777   77.0    70.0                74.0
    121         74      department1     class2  79.77777777777777   77.0    72.0                77.0
    123         78      department1     class2  79.77777777777777   77.0    74.0                79.33333333333333
    122         86      department1     class2  79.77777777777777   77.0    77.0                82.0
    
    结果解释:
        同count开窗函数
    

    first_value开窗函数

    返回分区中的第一个值。
    
    
    -- first_value 开窗函数
    
    select studentId,math,departmentId,classId,
    -- 以符合条件的所有行作为窗口
    first_value(math) over() as first_value1,
    -- 以按classId分组的所有行作为窗口
    first_value(math) over(partition by classId) as first_value2,
     -- 以按classId分组、按math排序后、按到当前行(含当前行)的所有行作为窗口
    first_value(math) over(partition by classId order by math) as first_value3,
     -- 以按classId分组、按math排序后、按当前行+往前1行+往后2行的行作为窗口
    first_value(math) over(partition by classId order by math rows between 1 preceding and 2 following) as first_value4
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid first_value1    first_value2    first_value3    first_value4
    111         69      department1     class1  69              69              69              69
    113         74      department1     class1  69              69              69              69
    112         80      department1     class1  69              69              69              74
    115         93      department1     class1  69              69              69              80
    114         94      department1     class1  69              69              69              93
    124         70      department1     class2  69              74              70              70
    121         74      department1     class2  69              74              70              70
    123         78      department1     class2  69              74              70              74
    122         86      department1     class2  69              74              70              78
    
    结果解释:
        studentid=124 first_value1:第一个值是69,first_value2:classId=class1分区 math的第一个值是69。
    

    last_value开窗函数

    返回分区中的第一个值。
    
    
    -- last_value 开窗函数
    
    select studentId,math,departmentId,classId,
    -- 以符合条件的所有行作为窗口
    last_value(math) over() as last_value1,
    -- 以按classId分组的所有行作为窗口
    last_value(math) over(partition by classId) as last_value2,
     -- 以按classId分组、按math排序后、按到当前行(含当前行)的所有行作为窗口
    last_value(math) over(partition by classId order by math) as last_value3,
     -- 以按classId分组、按math排序后、按当前行+往前1行+往后2行的行作为窗口
    last_value(math) over(partition by classId order by math rows between 1 preceding and 2 following) as last_value4
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid last_value1 last_value2 last_value3 last_value4
    111         69      department1     class1  70          93          69          80
    113         74      department1     class1  70          93          74          93
    112         80      department1     class1  70          93          80          94
    115         93      department1     class1  70          93          93          94
    114         94      department1     class1  70          93          94          94
    124         70      department1     class2  70          70          70          78
    121         74      department1     class2  70          70          74          86
    123         78      department1     class2  70          70          78          86
    122         86      department1     class2  70          70          86          86
    

    lag开窗函数

    lag(col,n,default) 用于统计窗口内往上第n个值。
        col:列名
        n:往上第n行
        default:往上第n行为NULL时候,取默认值,不指定则取NULL
    
    
    -- lag 开窗函数
    
    select studentId,math,departmentId,classId,
     --窗口内 往上取第二个 取不到时赋默认值60
    lag(math,2,60) over(partition by classId order by math) as lag1,
     --窗口内 往上取第二个 取不到时赋默认值NULL
    lag(math,2) over(partition by classId order by math) as lag2
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid lag1    lag2
    111         69      department1     class1  60      NULL
    113         74      department1     class1  60      NULL
    112         80      department1     class1  69      69
    115         93      department1     class1  74      74
    114         94      department1     class1  80      80
    124         70      department1     class2  60      NULL
    121         74      department1     class2  60      NULL
    123         78      department1     class2  70      70
    122         86      department1     class2  74      74
    
    结果解释:
        第3行 lag1:窗口内(69 74 80) 当前行80 向上取第二个值为69
        倒数第3行 lag2:窗口内(70 74) 当前行74 向上取第二个值为NULL
    

    lead开窗函数

    lead(col,n,default) 用于统计窗口内往下第n个值。
        col:列名
        n:往下第n行
        default:往下第n行为NULL时候,取默认值,不指定则取NULL
    
    
    -- lead开窗函数
    
    select studentId,math,departmentId,classId,
     --窗口内 往下取第二个 取不到时赋默认值60
    lead(math,2,60) over(partition by classId order by math) as lead1,
     --窗口内 往下取第二个 取不到时赋默认值NULL
    lead(math,2) over(partition by classId order by math) as lead2
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid lead1   lead2
    111         69      department1     class1  80      80
    113         74      department1     class1  93      93
    112         80      department1     class1  94      94
    115         93      department1     class1  60      NULL
    114         94      department1     class1  60      NULL
    124         70      department1     class2  78      78
    121         74      department1     class2  86      86
    123         78      department1     class2  60      NULL
    122         86      department1     class2  60      NULL
    
    结果解释:
        第4行lead1 窗口内向下第二个值为空,赋值60
    

    cume_dist开窗函数

    计算某个窗口或分区中某个值的累积分布。假定升序排序,则使用以下公式确定累积分布:
    小于等于当前值x的行数 / 窗口或partition分区内的总行数。其中,x 等于 order by 子句中指定的列的当前行中的值。
    
    
    -- cume_dist 开窗函数
    
    select studentId,math,departmentId,classId,
    -- 统计小于等于当前分数的人数占总人数的比例
    cume_dist() over(order by math) as cume_dist1,
    -- 统计大于等于当前分数的人数占总人数的比例
    cume_dist() over(order by math desc) as cume_dist2,
    -- 统计分区内小于等于当前分数的人数占总人数的比例
    cume_dist() over(partition by classId order by math) as cume_dist3
    from student_scores where departmentId='department1';
    
    结果
    studentid   math    departmentid    classid cume_dist1              cume_dist2          cume_dist3
    111         69      department1     class1  0.1111111111111111      1.0                 0.2
    113         74      department1     class1  0.4444444444444444      0.7777777777777778  0.4
    112         80      department1     class1  0.6666666666666666      0.4444444444444444  0.6
    115         93      department1     class1  0.8888888888888888      0.2222222222222222  0.8
    114         94      department1     class1  1.0                     0.1111111111111111  1.0
    124         70      department1     class2  0.2222222222222222      0.8888888888888888  0.25
    121         74      department1     class2  0.4444444444444444      0.7777777777777778  0.5
    123         78      department1     class2  0.5555555555555556      0.5555555555555556  0.75
    122         86      department1     class2  0.7777777777777778      0.3333333333333333  1.0
    
    结果解释:
        第三行:
            cume_dist1=小于等于80的人数为6/总人数9=0.6666666666666666
            cume_dist2=大于等于80的人数为4/总人数9=0.4444444444444444
            cume_dist3=分区内小于等于80的人数为3/分区内总人数5=0.6
    

    排序开窗函数

    rank开窗函数

    rank 开窗函数基于 over 子句中的 order by 确定一组值中一个值的排名。如果存在partition by ,则为每个分区组中的每个值排名。排名可能不是连续的。例如,如果两个行的排名为 1,则下一个排名为 3。
    
    
    -- rank 开窗函数
    
    select *,
    -- 对全部学生按数学分数排序 
    rank() over(order by math) as rank1,
    -- 对院系 按数学分数排序
    rank() over(partition by departmentId order by math) as rank2,
    -- 对每个院系每个班级 按数学分数排序
    rank() over(partition by departmentId,classId order by math) as rank3
    from student_scores;
    
    结果
    
    id  studentid   language    math    english     classid departmentid    rank1   rank2   rank3
    1   111         68          69      90          class1  department1     1       1       1
    3   113         90          74      75          class1  department1     3       3       2
    2   112         73          80      96          class1  department1     9       6       3
    5   115         99          93      89          class1  department1     15      8       4
    4   114         89          94      93          class1  department1     17      9       5
    9   124         76          70      76          class2  department1     2       2       1
    6   121         96          74      79          class2  department1     3       3       2
    8   123         70          78      61          class2  department1     7       5       3
    7   122         89          86      85          class2  department1     14      7       4
    15  216         85          74      93          class1  department2     3       1       1
    14  215         84          82      73          class1  department2     11      5       2
    11  212         76          83      75          class1  department2     12      6       3
    10  211         89          93      60          class1  department2     15      8       4
    12  213         71          94      90          class1  department2     17      9       5
    13  214         94          94      66          class1  department2     17      9       5
    18  223         79          74      96          class2  department2     3       1       1
    17  222         80          78      96          class2  department2     7       3       2
    19  224         75          80      78          class2  department2     9       4       3
    20  225         82          85      63          class2  department2     13      7       4
    16  221         77          99      61          class2  department2     20      11      5
    

    dense_rank开窗函数

    dense_rank与rank有一点不同,当排名一样的时候,接下来的行是连续的。如两个行的排名为 1,则下一个排名为 2。
    
    
    -- dense_rank 开窗函数
    
    select *,
    -- 对全部学生按数学分数排序
    dense_rank() over(order by math) as dense_rank1,
    -- 对院系 按数学分数排序
    dense_rank() over(partition by departmentId order by math) as dense_rank2,
    -- 对每个院系每个班级 按数学分数排序
    dense_rank() over(partition by departmentId,classId order by math) as dense_rank3
    from student_scores;
    
    结果:
    id  studentid   language    math    english classid departmentid    dense_rank1 dense_rank2 dense_rank3
    1   111         68          69      90      class1  department1     1           1           1
    3   113         90          74      75      class1  department1     3           3           2
    2   112         73          80      96      class1  department1     5           5           3
    5   115         99          93      89      class1  department1     10          7           4
    4   114         89          94      93      class1  department1     11          8           5
    9   124         76          70      76      class2  department1     2           2           1
    6   121         96          74      79      class2  department1     3           3           2
    8   123         70          78      61      class2  department1     4           4           3
    7   122         89          86      85      class2  department1     9           6           4
    15  216         85          74      93      class1  department2     3           1           1
    14  215         84          82      73      class1  department2     6           4           2
    11  212         76          83      75      class1  department2     7           5           3
    10  211         89          93      60      class1  department2     10          7           4
    12  213         71          94      90      class1  department2     11          8           5
    13  214         94          94      66      class1  department2     11          8           5
    18  223         79          74      96      class2  department2     3           1           1
    17  222         80          78      96      class2  department2     4           2           2
    19  224         75          80      78      class2  department2     5           3           3
    20  225         82          85      63      class2  department2     8           6           4
    16  221         77          99      61      class2  department2     12          9           5
    

    ntile开窗函数

    将分区中已排序的行划分为大小尽可能相等的指定数量的排名的组,并返回给定行所在的组的排名。
    
    
    -- ntile 开窗函数
    
    select *,
    -- 对分区内的数据分成两组
    ntile(2) over(partition by departmentid order by math) as ntile1,
    -- 对分区内的数据分成三组
    ntile(3) over(partition by departmentid order by math) as ntile2
    from student_scores;
    
    结果
    id  studentid   language    math    english classid departmentid    ntile1  ntile2
    1   111         68          69      90      class1  department1     1       1
    9   124         76          70      76      class2  department1     1       1
    6   121         96          74      79      class2  department1     1       1
    3   113         90          74      75      class1  department1     1       2
    8   123         70          78      61      class2  department1     1       2
    2   112         73          80      96      class1  department1     2       2
    7   122         89          86      85      class2  department1     2       3
    5   115         99          93      89      class1  department1     2       3
    4   114         89          94      93      class1  department1     2       3
    18  223         79          74      96      class2  department2     1       1
    15  216         85          74      93      class1  department2     1       1
    17  222         80          78      96      class2  department2     1       1
    19  224         75          80      78      class2  department2     1       1
    14  215         84          82      73      class1  department2     1       2
    11  212         76          83      75      class1  department2     1       2
    20  225         82          85      63      class2  department2     2       2
    10  211         89          93      60      class1  department2     2       2
    12  213         71          94      90      class1  department2     2       3
    13  214         94          94      66      class1  department2     2       3
    16  221         77          99      61      class2  department2     2       3
    
    结果解释:
        第8行
            ntile1:对分区的数据均匀分成2组后,当前行的组排名为2
            ntile2:对分区的数据均匀分成3组后,当前行的组排名为3
    

    row_number开窗函数

    从1开始对分区内的数据排序。
    
    
    -- row_number 开窗函数
    
    select studentid,departmentid,classid,math,
    -- 对分区departmentid,classid内的数据按math排序
    row_number() over(partition by departmentid,classid order by math) as row_number
    from student_scores;
    
    结果
    studentid   departmentid    classid math    row_number
    111         department1     class1  69      1
    113         department1     class1  74      2
    112         department1     class1  80      3
    115         department1     class1  93      4
    114         department1     class1  94      5
    124         department1     class2  70      1
    121         department1     class2  74      2
    123         department1     class2  78      3
    122         department1     class2  86      4
    216         department2     class1  74      1
    215         department2     class1  82      2
    212         department2     class1  83      3
    211         department2     class1  93      4
    213         department2     class1  94      5
    214         department2     class1  94      6
    223         department2     class2  74      1
    222         department2     class2  78      2
    224         department2     class2  80      3
    225         department2     class2  85      4
    221         department2     class2  99      5
    
    结果解释:
        同一分区,相同值,不同序。如studentid=213 studentid=214 值都为94 排序为5,6。
    

    percent_rank开窗函数

    计算给定行的百分比排名。可以用来计算超过了百分之多少的人。如360小助手开机速度超过了百分之多少的人。
    (当前行的rank值-1)/(分组内的总行数-1)
    
    
    -- percent_rank 开窗函数
    
    select studentid,departmentid,classid,math,
    row_number() over(partition by departmentid,classid order by math) as row_number,
    percent_rank() over(partition by departmentid,classid order by math) as percent_rank
    from student_scores;
    
    结果
    studentid   departmentid    classid math    row_number  percent_rank
    111         department1     class1  69      1           0.0
    113         department1     class1  74      2           0.25
    112         department1     class1  80      3           0.5
    115         department1     class1  93      4           0.75
    114         department1     class1  94      5           1.0
    124         department1     class2  70      1           0.0
    121         department1     class2  74      2           0.3333333333333333
    123         department1     class2  78      3           0.6666666666666666
    122         department1     class2  86      4           1.0
    216         department2     class1  74      1           0.0
    215         department2     class1  82      2           0.2
    212         department2     class1  83      3           0.4
    211         department2     class1  93      4           0.6
    213         department2     class1  94      5           0.8
    214         department2     class1  94      6           0.8
    223         department2     class2  74      1           0.0
    222         department2     class2  78      2           0.25
    224         department2     class2  80      3           0.5
    225         department2     class2  85      4           0.75
    221         department2     class2  99      5           1.0
    
    结果解释:
        studentid=115,percent_rank=(4-1)/(5-1)=0.75
        studentid=123,percent_rank=(3-1)/(4-1)=0.6666666666666666
    
    展开全文
  • Hive开窗函数

    2020-06-28 22:35:48
    Hive开窗函数的基本使用

    1、概述

    开窗函数即将一个大的表分成多个小的表并且分别对其进行处理,即开窗.

    2、相关函数说明

    OVER():指定分析函数工作的数据窗口大小,这个数据窗口大小可能会随着行的变而变化。

    CURRENT ROW:当前行

    n PRECEDING:往前n行数据

    n FOLLOWING:往后n行数据

    UNBOUNDED:起点,UNBOUNDED PRECEDING 表示从前面的起点, UNBOUNDED FOLLOWING表示到后面的终点

    LAG(col,n,default_val):往前第n行数据

    LEAD(col,n, default_val):往后第n行数据

    NTILE(n):把有序分区中的行分发到指定数据的组中,各个组有编号,编号从1开始,对于每一行,NTILE返回此行所属的组的编号。注意:n必须为int类型。

    3、案例实操

    数据准备:name,orderdate,cost

    jack,2017-01-01,10
    tony,2017-01-02,15
    jack,2017-02-03,23
    tony,2017-01-04,29
    jack,2017-01-05,46
    jack,2017-04-06,42
    tony,2017-01-07,50
    jack,2017-01-08,55
    mart,2017-04-08,62
    mart,2017-04-09,68
    neil,2017-05-10,12
    mart,2017-04-11,75
    neil,2017-06-12,80
    mart,2017-04-13,94
    

    创建hive表并导入数据

    create table business(
    name string, 
    orderdate string,
    cost int
    ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
    load data local inpath "/opt/module/datas/business.txt" into table business;
    

    (1)查询在2017年4月份购买过的顾客及总人数

    select name,count(*) over () 
    from business 
    where substring(orderdate,1,7) = '2017-04' 
    group by name;
    

    (2)查询顾客的购买明细及月购买总额(month返回月份)

    select name,orderdate,cost,sum(cost) over(partition by month(orderdate)) from business;
    

    (3)上述的场景, 将每个顾客的cost按照日期进行累加

    select name,orderdate,cost, 
    sum(cost) over() as sample1,--所有行相加 
    sum(cost) over(partition by name) as sample2,--按name分组,组内数据相加 
    sum(cost) over(partition by name order by orderdate) as sample3,--按name分组,组内数据累加 
    sum(cost) over(partition by name order by orderdate rows between UNBOUNDED PRECEDING and current row ) as sample4 ,--和sample3一样,由起点到当前行的聚合 
    sum(cost) over(partition by name order by orderdate rows between 1 PRECEDING and current row) as sample5, --当前行和前面一行做聚合 
    sum(cost) over(partition by name order by orderdate rows between 1 PRECEDING AND 1 FOLLOWING ) as sample6,--当前行和前边一行及后面一行 
    sum(cost) over(partition by name order by orderdate rows between current row and UNBOUNDED FOLLOWING ) as sample7 --当前行及后面所有行 
    from business;
    //rows必须跟在Order by 子句之后,对排序的结果进行限制,使用固定的行数来限制分区中的数据行数量

    (4)查看顾客上次的购买时间

    select name,orderdate,cost, 
    lag(orderdate,1,'1900-01-01') over(partition by name order by orderdate ) as time1, lag(orderdate,2) over (partition by name order by orderdate) as time2 
    from business;
    

    (5)查询前20%时间的订单信息

    select * from (
        select name,orderdate,cost, ntile(5) over(order by orderdate) sorted
        from business
    ) t
    where sorted = 1;
    

     

    展开全文
  • HIVE 开窗函数

    2020-04-18 14:40:04
    开窗函数出现之前,存在着很多用 SQL 语句很难解决的问题,很多都要通过复杂的相关子查询或者存储过程来完成。为了解决这些问题,在2003年ISO SQL标准加入了开窗函数开窗函数的使用使得这些经典的难题可以被轻松...

空空如也

空空如也

1 2 3 4 5 ... 20
收藏数 630
精华内容 252
关键字:

hive开窗函数