Notes
P.S.: All the SQL in these notes is written in Hive SQL, so it may differ a little from the 61A SQL material, MySQL, Oracle, etc.
Most of the content in these notes is based on my own work experience. JUST A RECORD.
Execution order of SQL
From → Join → On → Where → Group by → aggregate functions (Avg/Sum) → Having → Select → Distinct → Order by → Limit
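Because WHERE is evaluated before SELECT, a column alias defined in SELECT cannot be referenced in WHERE. A minimal sketch (the `orders` table and its columns are hypothetical):

```sql
-- "revenue" is only created at the SELECT step, so WHERE cannot see it (Hive and most dialects)
SELECT price * units AS revenue
FROM orders
WHERE revenue > 100;           -- fails

-- repeat the expression instead (or wrap the query in a subquery)
SELECT price * units AS revenue
FROM orders
WHERE price * units > 100;     -- works
```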
Basic syntax
CREATE
- CREATE (TEMP or TEMPORARY) TABLE table_name (col_name data_type, …) ;
- CREATE (TEMP or TEMPORARY) TABLE table_name AS SELECT … ;
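A minimal sketch of both forms (the table and column names are made up for illustration):

```sql
-- column-list form
CREATE TEMPORARY TABLE staff_info (
    staff_id   STRING,
    staff_name STRING
);

-- CTAS form: the schema is taken from the SELECT
CREATE TEMPORARY TABLE staff_copy AS
SELECT staff_id, staff_name
FROM staff_info;
```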
INSERT
- INSERT INTO table_name VALUES() ;
- INSERT OVERWRITE TABLE table_name VALUES() ; - For a partitioned table, INSERT OVERWRITE only overwrites the data in the specified partition and does not affect the other partitions: **INSERT OVERWRITE TABLE table_name PARTITION(target_partition) SELECT …** - For a normal (non-partitioned) table, INSERT OVERWRITE completely overwrites all data in the table, regardless of partitions: everything is deleted and replaced with the newly inserted data.
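A hedged sketch of the partition-level overwrite (sales_daily and sales_staging are hypothetical tables, with sales_daily partitioned by dt):

```sql
-- only the dt='2024-06-01' partition is rewritten; other partitions keep their data
INSERT OVERWRITE TABLE sales_daily PARTITION (dt = '2024-06-01')
SELECT c_id, revenue
FROM sales_staging
WHERE order_dt = '2024-06-01';
```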
JOIN
- Separator ',' - SELECT a.*, b.* FROM a, b ; - If a has n rows and b has m rows, the result will be n*m rows (a Cartesian product). - No shared key columns are needed to connect the two tables. - When a and b have no comparable key columns, we use ',' to combine them.
- a FULL JOIN b ON a.col_name = b.col_name (and …) - Different from ',' : if a has n rows and b has m rows, the result has at least max(n,m) rows (unmatched rows from both sides are kept with NULLs). - ON can be regarded as the join condition.
- a LEFT JOIN b ON a.col_name = b.col_name (and …) - If a has n rows and b has m rows, the result has at least n rows (every row of a is kept). - LEFT JOIN plus a WHERE filter can be used to replace INNER JOIN (see the sketch after this list).
- a RIGHT JOIN b ON a.col_name = b.col_name (and …) - If a has n rows and b has m rows, the result has at least m rows.
- a INNER JOIN b ON a.col_name = b.col_name (and …)
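A minimal sketch comparing the join types on two hypothetical tables, a with ids 1,2,3 and b with ids 2,3:

```sql
SELECT a.id, b.id FROM a, b;                           -- 3 * 2 = 6 rows (Cartesian product)
SELECT a.id, b.id FROM a LEFT  JOIN b ON a.id = b.id;  -- 3 rows; id = 1 gets NULL from b
SELECT a.id, b.id FROM a RIGHT JOIN b ON a.id = b.id;  -- 2 rows
SELECT a.id, b.id FROM a FULL  JOIN b ON a.id = b.id;  -- 3 rows here (ids 1, 2, 3)
SELECT a.id, b.id FROM a INNER JOIN b ON a.id = b.id;  -- 2 rows (ids 2, 3)
```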
UNION
- UNION - SELECT … UNION SELECT … - Automatically removes duplicate rows. - The data type of each column must be consistent. - Certainly the number of columns must be the same.
- **UNION ALL** - SELECT … UNION ALL SELECT … - Keeps duplicate rows. - Otherwise the same as UNION (see the sketch below).
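A minimal sketch of the difference:

```sql
SELECT 1 AS x UNION     SELECT 1 AS x;  -- 1 row: duplicates removed
SELECT 1 AS x UNION ALL SELECT 1 AS x;  -- 2 rows: duplicates kept
```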
Conditional Statement
- WHERE- SELECT … FROM table_name WHERE …(AND … OR…)
- HAVING- SELECT col1,col2… FROM table_name GROUP BY col1,col2… HAVING SUM(col1)>1 OR COUNT(col2)>0 …
- **CASE WHEN** - SELECT CASE WHEN col1 IS NOT NULL THEN col1 ELSE ' ' END AS new_col1 FROM table_name - WHEN branches are evaluated from top to bottom and the first matching branch wins (see the sketch after this list).
- IF(condition, true_return, false_return)- select if(1<>1,2,3); => 3
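A minimal sketch of the top-down evaluation of CASE WHEN:

```sql
SELECT CASE
           WHEN 10 > 1 THEN 'first branch'   -- matches first, so this wins
           WHEN 10 > 5 THEN 'never reached'
           ELSE 'default'
       END AS result;  -- 'first branch'
```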
Function
About DATE
Change time’s format
- yyyyMMdd⇒ yyyy-MM-dd or yyyy-MM-dd => yyyyMMdd
-- yyyyMMdd => yyyy-MM-dd
TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(target_date,'yyyyMMdd'),'yyyy-MM-dd')) AS new_date
-- yyyy-MM-dd => yyyyMMdd
FROM_UNIXTIME(UNIX_TIMESTAMP(target_date,'yyyy-MM-dd'),'yyyyMMdd') AS new_string
- Change STRING to DATE - CAST('string_time' AS DATE)
Counting dates
SET hivevar:NOW_DATE=TO_DATE('2024-02-18');                   -- sunday
SET hivevar:END_DATE=TO_DATE(ADD_MONTHS(${NOW_DATE},+1));     -- 2024-3-18

-- eg calculate age: DATEDIFF(BIG_DATE,SMALL_DATE) = DAYS
SELECT floor(datediff(${NOW_DATE},to_date(birthday))/365) as age;

-- calculate the weekend - 2023-01-01 was a sunday
SELECT CASE
           WHEN pmod(datediff(${NOW_DATE},TO_DATE('2023-01-01')),7)=0 THEN 'WEEKEND'
           WHEN pmod(datediff(${NOW_DATE},TO_DATE('2023-01-01')),7)=6 THEN 'WEEKEND'
           ELSE 'WEEKDAY'
       END AS WEEK_TAG;

-- the last day of the month
SELECT LAST_DAY(${NOW_DATE});                -- '2024-02-29'

-- extract year/month/date
SELECT DATE_FORMAT(${NOW_DATE},'yyyy-MM');   -- 2024-02
SELECT DATE_FORMAT(${NOW_DATE},'MM');        -- 02
SELECT DATE_FORMAT(${NOW_DATE},'dd');        -- 18

-- add days (OR subtract days with DATE_SUB())
SELECT DATE_ADD(LAST_DAY(${NOW_DATE}),1);    -- 2024-03-01

-- next monday - sunday
SELECT NEXT_DAY(${NOW_DATE},'MO');           -- 2024-02-19
SELECT NEXT_DAY(${NOW_DATE},'we');           -- 2024-02-21
About STRING
Lpad & Rpad
To fix the string-comparison problem where '2' > '10' (strings compare character by character), pad values to a fixed width.
- LPAD/RPAD(target_str, total_len[int], pad_character) - select RPAD('2',3,'0'); => '200' - select LPAD('2',3,'0'); => '002'
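A minimal sketch of the pitfall and the fix:

```sql
SELECT '2' > '10';                              -- true: strings compare character by character
SELECT LPAD('2', 2, '0') > LPAD('10', 2, '0');  -- '02' > '10' => false, the intended numeric order
```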
Substring & Substr
- SUBSTRING(target_string, start_index, cut_len) - The start_index is 1, not 0. - If cut_len is omitted, it returns everything from the start index to the end of the string.
- SUBSTR(target_string, start_index, cut_len) - Same as SUBSTRING. Note: in some SQL dialects SUBSTRING() differs from SUBSTR() (the last parameter may be an end index instead of a length). In the version I tested, both functions return the same result for the same parameters. JUST A NOTICE, it may depend on your environment (see the sketch below).
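A minimal sketch of the 1-based indexing and the optional length:

```sql
SELECT SUBSTRING('20240506', 5, 2);  -- '05'
SELECT SUBSTRING('20240506', 5);     -- '0506' (no length: to the end)
SELECT SUBSTR('20240506', 1, 4);     -- '2024'
```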
Concat & Concat_ws
- CONCAT(str1, str2, …), or we can use || instead - Make sure none of the inputs is NULL, or you will get NULL. - || is equivalent to CONCAT.
- CONCAT_WS(separator, str1, str2, …) - The separator must not be NULL, while the strings have no such restriction (see the sketch below).
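A minimal sketch: CONCAT propagates NULL, while CONCAT_WS skips NULL arguments:

```sql
SELECT CONCAT('A', '-', NULL);          -- NULL
SELECT CONCAT_WS('-', 'A', NULL, 'B');  -- 'A-B'
```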
Column to row
CONCAT_WS(separator , COLLECT_LIST/COLLECT_SET(col_name))
- collect_list(col_name)- KEEP the duplicate data
- collect_set(col_name)- REMOVE the duplicate data
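A minimal sketch that rolls each customer's products up into one string (test_orders is a hypothetical table with columns c_id and product):

```sql
SELECT c_id
      ,CONCAT_WS(',', COLLECT_LIST(product)) AS all_products   -- keeps duplicates
      ,CONCAT_WS(',', COLLECT_SET(product))  AS uniq_products  -- removes duplicates
FROM test_orders
GROUP BY c_id;
```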
Row to column
The simplest but most tedious way: CASE WHEN.
- EXPLODE() - Converts a single array or map column into multiple rows; on its own it can only output the exploded column (use LATERAL VIEW, below, to keep other columns). - EXPLODE(MAP(…)) : each key/value pair of the map becomes one row, with one column for the key and one column for the value.
SELECT EXPLODE(ARRAY('1','E'));
-- will get a new column:
-- new_col
-- 1
-- E
SELECT EXPLODE(MAP('A','1','B','2'));
-- key  value
-- A    1
-- B    2
- **LATERAL VIEW** - lateral view udtf(expression) tableAlias as columnAlias (, columnAlias) - udtf(expression) : e.g. explode(), split() - tableAlias : the name of the virtual table produced by the UDTF - columnAlias : the virtual column name(s) of that table. If the split produces one column, declare one alias; if it produces several columns, declare all the aliases in parentheses in column order, separated by commas. - Position: after FROM, before WHERE.
DROP TABLE IF EXISTS test_list;
CREATE TEMPORARY TABLE test_list
(u_id   STRING,
 a_list STRING)
row format delimited
fields terminated by "\t"
collection items terminated by ",";

INSERT INTO TABLE test_list VALUES(('A,B'),('L1,L2,L3'));
INSERT INTO TABLE test_list VALUES(('C,B,D'),('L4,L5,L6'));

select uid_split, new_list
from (select u_id, new_list
      from test_list
      lateral view explode(split(a_list,",")) tmpTable as new_list
     ) a
lateral view explode(split(u_id,",")) m as uid_split;
-- ans: 15 rows * 2 cols
| uid_split | new_list |
| --- | --- |
| A | L1 |
| B | L1 |
| … | … |
| C | L6 |
| B | L6 |
| D | L6 |
- **POSEXPLODE()** - explodes a column together with its element index, so two exploded columns can be aligned (i.e. change two columns at the same time).
DROP TABLE IF EXISTS test_list;
CREATE TEMPORARY TABLE test_list
(u_id   STRING,
 a_list STRING)
row format delimited
fields terminated by "\t"
collection items terminated by ",";

INSERT INTO TABLE test_list VALUES(('A,B'),('L1,L2'));
INSERT INTO TABLE test_list VALUES(('C,E,D'),('L4,L5,L6'));

select uid_split, new_list
from (select index_l, index_u, new_list, uid_split
      from test_list
      lateral view posexplode(split(a_list,",")) ln_1 as index_l, new_list
      lateral view posexplode(split(u_id,",")) u_1 as index_u, uid_split) a
where index_l = index_u;
-- ans: 5 rows * 2 cols
| uid_split | new_list |
| --- | --- |
| A | L1 |
| B | L2 |
| C | L4 |
| E | L5 |
| D | L6 |
Extracting data
Using regular expression
- Func regexp_extract : regexp_extract(str, regexp, index) - str : the target string or target column name - regexp : the regular expression - * : matches the preceding character 0 or more times - . : matches any character - ? : matches the preceding character 0 or 1 time - index : which part of the match to return - if index = 0, the whole match is returned - if index >= 1, the text captured by the index-th pair of parentheses is returned (capture groups are numbered by the order of their opening parentheses).
regexp_extract('{"c_id":"D","order_time":"20240506"}','"order_time":"(.*?)(")',1) AS order_time
-- expected to get '20240506'
Using json data
- Func get_json_object : get_json_object(str, json_path) - str : the target string or target column name - json_path : use '$.key' (as a quoted string) - if it is a nested structure, use '$.key1.key2[0]'
get_json_object('{"c_id":"D","order_time":"20240506"}','$.order_time') AS order_time
-- expected to get '20240506'
Analysis function
Ranking
- ROW_NUMBER() OVER( PARTITION BY … ORDER BY …) : 1 2 3 4 …
- RANK() OVER(PARTITION BY … ORDER BY …) : 1 2 2 4
- DENSE_RANK() OVER(PARTITION BY … ORDER BY …) : 1 2 2 3
- NTILE(group_num) OVER(PARTITION BY … ORDER BY …) : splits the rows into group_num buckets; every bucket gets at least one row, and if the rows do not divide evenly, the earlier buckets receive the extra rows by default.
- CUME_DIST() OVER(PARTITION BY … ORDER BY …) : returns (the number of rows less than or equal to the current value) / (the total number of rows in the group)
Lag&Lead
- LAG(col_name, n, default_val) : - col_name : the target column - n : look n rows up (optional, default is 1) - default_val : the value returned when there is no row n positions up (optional; if not specified, NULL)
- LEAD(col_name, n, default_val) : same parameters - LAG looks up to previous rows, LEAD looks down to following rows
First/Last value
- FIRST_VALUE(target_col) OVER(PARTITION BY … ORDER BY …) : after sorting within the group, the first value of the window from the start of the partition up to the current row
- LAST_VALUE(target_col) OVER(PARTITION BY … ORDER BY …) : after sorting within the group, the last value of the window up to the current row (with the default frame this is the current row's value)
DROP TABLE IF EXISTS TEMP_TEST;
CREATE TEMPORARY TABLE TEMP_TEST AS
SELECT EXPLODE(ARRAY('1','2','3','2','3','4')) AS TEST;

SELECT A.*
      ,ROW_NUMBER() OVER(ORDER BY TEST) AS ROW_RK
      ,RANK() OVER(ORDER BY TEST) AS RK_RK
      ,DENSE_RANK() OVER(ORDER BY TEST) AS DEN_RK
      ,NTILE(2) OVER(ORDER BY TEST) AS GROUP_2   -- NTILE buckets shown as group_2 / group_4 below
      ,NTILE(4) OVER(ORDER BY TEST) AS GROUP_4
      ,round(CUME_DIST() OVER(ORDER BY TEST),2) AS less_or_equ_rt
      ,LAG(TEST,3,'LAG') OVER(ORDER BY TEST) AS lag_test
      ,LAG(TEST) OVER(ORDER BY TEST DESC) AS lag_2
      ,LEAD(TEST,2,'LEAD') OVER(ORDER BY TEST) AS lead_test
      ,LEAD(TEST) OVER(ORDER BY TEST DESC) AS lead_2
      ,FIRST_VALUE(TEST) OVER(ORDER BY TEST) AS first_test
      ,LAST_VALUE(TEST) OVER(ORDER BY TEST) AS last_test
FROM TEMP_TEST A;
| test | row_rk | rk_rk | den_rk | group_2 | group_4 | less_or_equ_rt | lag_test | lag_2 | lead_test | lead_2 | first_test | last_test |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 1 | 1 | 1 | 1 | 1 | 1 | 0.17 | LAG | 2 | 2 | [NULL] | 1 | 1 |
| 2 | 2 | 2 | 2 | 1 | 1 | 0.5 | LAG | 2 | 3 | 1 | 1 | 2 |
| 2 | 3 | 2 | 2 | 1 | 2 | 0.5 | LAG | 3 | 3 | 2 | 1 | 2 |
| 3 | 4 | 4 | 3 | 2 | 2 | 0.83 | 1 | 3 | 4 | 2 | 1 | 3 |
| 3 | 5 | 4 | 3 | 2 | 3 | 0.83 | 2 | 4 | LEAD | 3 | 1 | 3 |
| 4 | 6 | 6 | 4 | 2 | 4 | 1 | 2 | [NULL] | LEAD | 3 | 1 | 4 |
Handling NULL
- NVL(expr1, default_expr) : returns default_expr if expr1 is null, otherwise returns expr1 - the parameters can be numbers, strings, dates, etc.
SELECT NVL(NULL,'TEST'); -- 'TEST'
SELECT NVL('',NULL);     -- ''
SELECT NVL(NULL,NULL);   -- NULL
- NULLIF(target_num, compare_num) : returns NULL if target_num = compare_num, otherwise returns target_num - shorthand for: CASE WHEN target_num = compare_num THEN NULL ELSE target_num END
SELECT 12/NULLIF(0,0); -- NULL
- COALESCE(expr1, expr2, expr3, …) : checks the expressions in order and returns the first non-null value; if all the expressions are null, returns null
SELECT COALESCE(NULL,NULL,NULL);                  -- NULL
SELECT COALESCE(0,NULL,1);                        -- 0
SELECT COALESCE(NULL,NULL,NULL,'first','second'); -- 'first'
SELECT COALESCE(NULL,12/NULLIF(0,0),'third');     -- 'third'
GROUP BY
- basic syntax:
-- example data
CREATE TEMPORARY TABLE TEST_GROUP(
    C_ID       string comment 'CUSTOMER ID',
    ORDER_YEAR string,
    PRODUCT    string,
    REVENUE    int comment 'Amount of consumption');

INSERT OVERWRITE TABLE TEST_GROUP VALUES
    ('A','2020','pro_1','50'),
    ('A','2020','pro_2','40'),
    ('A','2019','pro_2','40'),
    ('B','2020','pro_1','20'),
    ('B','2021','pro_2','50'),
    ('C','2020','pro_2','100');

select c_id, order_year, product   -- dimensions
      ,sum(revenue)
from TEST_GROUP
group by c_id, order_year, product;
- Using GROUP BY to remove duplicate data is faster than using DISTINCT (see the sketch below)
- All dimensions such as *c_id, order_year, product* must appear after the GROUP BY statement
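A minimal sketch of the two deduplication styles on the TEST_GROUP table above:

```sql
SELECT DISTINCT c_id, order_year FROM TEST_GROUP;                  -- DISTINCT dedup
SELECT c_id, order_year FROM TEST_GROUP GROUP BY c_id, order_year; -- GROUP BY dedup, usually cheaper at scale
```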
grouping__id sorting order
grouping__id : note the two underscores. grouping__id must be used together with GROUPING SETS, CUBE, or ROLLUP; it cannot be used alone.
If we group by 3 columns, the grouping__id values correspond to the dimension combinations as follows (different SQL/Hive versions may assign them differently):
0 : (col1, col2, col3)
1 : (col1, col2)
2 : (col1, col3)
3 : (col1)
4 : ( col2, col3)
5 : (col2)
6 : (col3)
7: ()
CUBE
- The data are aggregated over every combination of the dimensions, with no priority among them
- Basic syntax: SELECT col1, col2, col3 FROM temp_table GROUP BY CUBE(col1, col2, col3) or SELECT col1, col2, col3 FROM temp_table GROUP BY col1, col2, col3 WITH CUBE
-- find the sales for every year and every product
SELECT ORDER_YEAR, PRODUCT, SUM(REVENUE) AS SALES, grouping__id
FROM TEST_GROUP
GROUP BY ORDER_YEAR, PRODUCT
with cube;

SELECT ORDER_YEAR, PRODUCT, SUM(REVENUE) AS SALES, grouping__id
FROM TEST_GROUP
GROUP BY CUBE(ORDER_YEAR, PRODUCT);
| order_year | product | sales | grouping__id |
| --- | --- | --- | --- |
| 2019 | pro_2 | 40 | 0 |
| 2020 | pro_1 | 70 | 0 |
| 2020 | pro_2 | 140 | 0 |
| 2021 | pro_2 | 50 | 0 |
| 2019 | [NULL] | 40 | 1 |
| 2020 | [NULL] | 210 | 1 |
| 2021 | [NULL] | 50 | 1 |
| [NULL] | pro_1 | 70 | 2 |
| [NULL] | pro_2 | 230 | 2 |
| [NULL] | [NULL] | 300 | 3 |
GROUPING SETS
- When we only need some particular dimensions, we can use GROUPING SETS to choose exactly which combinations to aggregate
- GROUPING SETS comes after GROUP BY
- Basic syntax : SELECT col1, col2, col3 FROM temp_table GROUP BY col1, col2, col3 GROUPING SETS ((col1,col2,col3),())
- () means taking all the data of the table as one group, but it cannot be used alone: GROUPING SETS(()) is not allowed.
SELECT c_id, order_year, product, sum(revenue) as sales, grouping__id
FROM test_group
GROUP BY c_id, order_year, product
GROUPING SETS((), (c_id, order_year), product)
ORDER BY grouping__id;
| c_id | order_year | product | sales | grouping__id |
| --- | --- | --- | --- | --- |
| A | 2019 | [NULL] | 40 | 1 |
| B | 2020 | [NULL] | 20 | 1 |
| C | 2020 | [NULL] | 100 | 1 |
| A | 2020 | [NULL] | 90 | 1 |
| B | 2021 | [NULL] | 50 | 1 |
| [NULL] | [NULL] | pro_1 | 70 | 6 |
| [NULL] | [NULL] | pro_2 | 230 | 6 |
| [NULL] | [NULL] | [NULL] | 300 | 7 |
ROLLUP
- The ROLLUP function is a subset of the CUBE, dominated by the leftmost dimension, aggregated in order.
- Basic syntax :SELECT col1, col2, col3 FROM temp_table GROUP BY ROLLUP(col1, col2, col3) or SELECT col1, col2, col3 FROM temp_table GROUP BY col1, col2, col3 WITH ROLLUP
- Is equivalent to SELECT col1, col2, col3 FROM temp_table GROUP BY col1, col2, col3 GROUPING SETS((),col1,(col1,col2),(col1,col2,col3))
- There is a priority, and the leftmost column has the highest priority, i.e. col1 > col2 > col3
SELECT ORDER_YEAR, PRODUCT, SUM(REVENUE) AS SALES, grouping__id
FROM TEST_GROUP
GROUP BY ORDER_YEAR, PRODUCT
WITH ROLLUP;

-- is equivalent to
SELECT ORDER_YEAR, PRODUCT, SUM(REVENUE) AS SALES, grouping__id
FROM TEST_GROUP
GROUP BY ROLLUP(ORDER_YEAR, PRODUCT);

-- is equivalent to
SELECT ORDER_YEAR, PRODUCT, SUM(REVENUE) AS SALES, grouping__id
FROM TEST_GROUP
GROUP BY ORDER_YEAR, PRODUCT
GROUPING SETS((), ORDER_YEAR, (ORDER_YEAR, PRODUCT));
| order_year | product | sales | grouping__id |
| --- | --- | --- | --- |
| 2019 | pro_2 | 40 | 0 |
| 2020 | pro_1 | 70 | 0 |
| 2020 | pro_2 | 140 | 0 |
| 2021 | pro_2 | 50 | 0 |
| 2019 | [NULL] | 40 | 1 |
| 2020 | [NULL] | 210 | 1 |
| 2021 | [NULL] | 50 | 1 |
| [NULL] | [NULL] | 300 | 3 |
Example
Based on HIVE SQL
-- Create a new detail table which contains the customers and the goods they bought
DROP TABLE IF EXISTS ORDER_DETAIL;
CREATE TEMPORARY TABLE ORDER_DETAIL(
    C_ID          STRING,
    ORDER_DATE    DATE,
    PRODUCT_NAME  STRING,
    PRODUCT_UNITS INT,
    STAFF         STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS textfile;

INSERT INTO ORDER_DETAIL VALUES('A','2024-06-07','DRESS','2','S1');
INSERT INTO ORDER_DETAIL VALUES('A','2024-05-08','SHOES','1','S3');
INSERT INTO ORDER_DETAIL VALUES('B','2024-06-06','SHIRT','2','S1');
INSERT INTO ORDER_DETAIL VALUES('B','2024-05-09','PANTS','1','S2');
INSERT INTO ORDER_DETAIL VALUES('C','2024-06-09','PANTS','2','S3');
INSERT INTO ORDER_DETAIL VALUES('C','2024-05-08','SHIRT','3','S4');

DROP TABLE IF EXISTS PRODUCT_INFO;
CREATE TEMPORARY TABLE PRODUCT_INFO AS
SELECT CAST('' AS STRING) AS PRODUCT_NAME, CAST('' AS FLOAT) AS PRODUCT_PRICE;
-- We created a null line in table PRODUCT_INFO, so we can use INSERT OVERWRITE to delete this useless line.
INSERT OVERWRITE TABLE PRODUCT_INFO VALUES('DRESS',200);
INSERT INTO PRODUCT_INFO VALUES('SHOES',300);
INSERT INTO PRODUCT_INFO VALUES('SHIRT',500);
INSERT INTO PRODUCT_INFO VALUES('PANTS',100);

-- We found that we lost the data of customer D, and the data have not been parsed yet.
DROP TABLE IF EXISTS MISSING_RAWDATA;
CREATE TEMPORARY TABLE MISSING_RAWDATA AS
SELECT '{"c_id":"D","order_time":"20240506","buy_detail":[{"product_name":"DRESS","units":"4","client":"S4"}]}' AS json_data
UNION ALL
SELECT '{"c_id":"D","order_time":"20240607","buy_detail":[{"product_name":"SHOES","units":"2","client":"S1"}]}' AS json_data
UNION ALL
SELECT '{"c_id":"B","order_time":"20240606","buy_detail":[{"product_name":"SHIRT","units":"2","client":"S1"}]}' AS json_data;

DROP TABLE IF EXISTS PARSE_JSON;
CREATE TEMPORARY TABLE PARSE_JSON AS
SELECT regexp_extract(json_data,'"c_id":"(.*?)(")',1) AS C_ID
      ,get_json_object(json_data,'$.order_time') AS ORDER_DATE
      ,get_json_object(json_data,'$.buy_detail.product_name[0]') AS PRODUCT_NAME
      ,regexp_extract(json_data,'"units":"(.*?)(")',1) AS PRODUCT_UNITS
      ,get_json_object(json_data,'$.buy_detail.client[0]') AS STAFF
FROM MISSING_RAWDATA;

-- change the format of order_date from yyyyMMdd to yyyy-MM-dd
DROP TABLE IF EXISTS OTHERS_DETAIL;
CREATE TEMPORARY TABLE OTHERS_DETAIL AS
SELECT C_ID
      ,TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(ORDER_DATE,'yyyyMMdd'),'yyyy-MM-dd')) AS ORDER_DATE
      ,PRODUCT_NAME
      ,CAST(PRODUCT_UNITS AS INT) AS PRODUCT_UNITS
      ,STAFF
FROM PARSE_JSON;

DROP TABLE IF EXISTS FINAL_DETAIL;
CREATE TEMPORARY TABLE FINAL_DETAIL AS
SELECT A.*, B.PRODUCT_PRICE
FROM ORDER_DETAIL A
LEFT JOIN PRODUCT_INFO B ON A.PRODUCT_NAME = B.PRODUCT_NAME
UNION
SELECT A.*, B.PRODUCT_PRICE
FROM OTHERS_DETAIL A
LEFT JOIN PRODUCT_INFO B ON A.PRODUCT_NAME = B.PRODUCT_NAME;

SELECT * FROM FINAL_DETAIL;
| c_id | order_date | product_name | product_units | staff | product_price |
| --- | --- | --- | --- | --- | --- |
| A | 2024-06-07 | DRESS | 2 | S1 | 200 |
| B | 2024-05-09 | PANTS | 1 | S2 | 100 |
| C | 2024-05-08 | SHIRT | 3 | S4 | 500 |
| C | 2024-06-09 | PANTS | 2 | S3 | 100 |
| D | 2024-05-06 | DRESS | 4 | S4 | 200 |
| A | 2024-05-08 | SHOES | 1 | S3 | 300 |
| B | 2024-06-06 | SHIRT | 2 | S1 | 500 |
| D | 2024-06-07 | SHOES | 2 | S1 | 300 |
-- product assortment
DROP TABLE IF EXISTS PRODUCT_NAME_UNITS_UNION;
CREATE TEMPORARY TABLE PRODUCT_NAME_UNITS_UNION AS
SELECT A.*
      ,PRODUCT_NAME||'×'||PRODUCT_UNITS AS NEW_PRODUCT
      ,CONCAT(PRODUCT_NAME,'×',PRODUCT_UNITS) AS NEW_PRODUCT1
FROM FINAL_DETAIL A;
-- Here new_product1 is the same as new_product.

DROP TABLE IF EXISTS PRODUCT_ASSORTMENT;
CREATE TEMPORARY TABLE PRODUCT_ASSORTMENT AS
SELECT C_ID
      ,SUM(PRODUCT_PRICE*PRODUCT_UNITS) AS REVENUE
      ,CONCAT_WS(",", SORT_ARRAY(COLLECT_LIST(NEW_PRODUCT))) AS ALL_BUY
FROM PRODUCT_NAME_UNITS_UNION
GROUP BY C_ID;

SELECT * FROM PRODUCT_ASSORTMENT;
| c_id | revenue | all_buy |
| --- | --- | --- |
| A | 700 | DRESS×2,SHOES×1 |
| B | 1100 | PANTS×1,SHIRT×2 |
| C | 1700 | PANTS×2,SHIRT×3 |
| D | 1400 | DRESS×4,SHOES×2 |
-- if we do not want to analyze SHIRT, empty the SHIRT rows
DROP TABLE IF EXISTS NO_SHIRT;
CREATE TEMPORARY TABLE NO_SHIRT AS
SELECT C_ID
      ,CASE WHEN PRODUCT_NAME IN ('DRESS','PANTS','SHOES') THEN PRODUCT_NAME END AS NEW_NAME
      ,CASE WHEN PRODUCT_NAME = 'SHIRT' THEN CAST('0' AS INT) ELSE PRODUCT_UNITS END AS NEW_UNITS
FROM FINAL_DETAIL;

-- Now processing the product assortment with CONCAT and || may give different results.
DROP TABLE IF EXISTS NO_SHIRT_ASSORT;
CREATE TEMPORARY TABLE NO_SHIRT_ASSORT AS
SELECT C_ID
      ,NEW_NAME||'×'||NEW_UNITS AS NEW_PRODUCT
      ,CONCAT(NEW_NAME,'×',NEW_UNITS) AS NEW_PRODUCT1
FROM NO_SHIRT;

-- So when we use CONCAT to combine two columns, we need to make sure there is no empty value in those columns.
SELECT * FROM NO_SHIRT_ASSORT WHERE c_id = 'C';
| c_id | new_product | new_product1 |
| --- | --- | --- |
| C | [NULL] | [NULL] |
| C | PANTS×2 | PANTS×2 |
-- FIND THE MONTHLY sales champion
DROP TABLE IF EXISTS MONTHLY_SALES_BY_STAFF;
CREATE TEMPORARY TABLE MONTHLY_SALES_BY_STAFF AS
SELECT SUBSTRING(ORDER_DATE,6,2) AS ORDER_MON   -- substr(ORDER_DATE,6,2) works the same
      ,STAFF
      ,SUM(PRODUCT_PRICE*PRODUCT_UNITS) AS SALES
FROM FINAL_DETAIL
GROUP BY SUBSTRING(ORDER_DATE,6,2), STAFF;

DROP TABLE IF EXISTS MONTHLY_SALES_RANK;
CREATE TEMPORARY TABLE MONTHLY_SALES_RANK AS
SELECT ORDER_MON, STAFF, SALES
      ,DENSE_RANK() OVER(PARTITION BY ORDER_MON ORDER BY SALES DESC) AS MON_RANK
FROM MONTHLY_SALES_BY_STAFF;

SELECT * FROM MONTHLY_SALES_RANK WHERE mon_rank = '1';
| order_mon | staff | sales | mon_rank |
| --- | --- | --- | --- |
| 06 | S1 | 2000 | 1 |
| 05 | S4 | 2300 | 1 |
-- USE NTILE FOR CLUSTERING
DROP TABLE IF EXISTS NTILE_TEST;
CREATE TEMPORARY TABLE NTILE_TEST AS
SELECT A.*
      ,NTILE(3) OVER(PARTITION BY STAFF ORDER BY PRODUCT_PRICE) AS GROUP_1
FROM FINAL_DETAIL A;

SELECT c_id, staff, product_price, group_1 FROM NTILE_TEST;
| c_id | staff | product_price | group_1 |
| --- | --- | --- | --- |
| A | S1 | 200 | 1 |
| D | S1 | 300 | 2 |
| B | S1 | 500 | 3 |
| B | S2 | 100 | 1 |
| C | S3 | 100 | 1 |
| A | S3 | 300 | 2 |
| D | S4 | 200 | 1 |
| C | S4 | 500 | 2 |
DROP TABLE IF EXISTS PRODUCT_LIST;
CREATE TEMPORARY TABLE PRODUCT_LIST
(PRODUCT_LINE STRING,
 ALL_DETAIL   array<STRING>)
row format delimited
fields terminated by "\t"
collection items terminated by ",";

INSERT INTO TABLE PRODUCT_LIST VALUES('LINE1', ARRAY('SHOES','PANTS'));
INSERT INTO TABLE PRODUCT_LIST VALUES('LINE2', ARRAY('SHIRT','DRESS','SKIRT'));

SELECT * FROM PRODUCT_LIST;
-- SELECT EXPLODE(ALL_DETAIL) FROM PRODUCT_LIST;
| product_line | all_detail |
| --- | --- |
| LINE1 | ["SHOES","PANTS"] |
| LINE2 | ["SHIRT","DRESS","SKIRT"] |
SELECT PRODUCT_LINE,DETAIL_NAME
FROM PRODUCT_LIST
LATERAL VIEW EXPLODE(ALL_DETAIL) TMPTABLE AS DETAIL_NAME;
| product_line | detail_name |
| --- | --- |
| LINE1 | SHOES |
| LINE1 | PANTS |
| LINE2 | SHIRT |
| LINE2 | DRESS |
| LINE2 | SKIRT |