從頭學習大數據培訓課程 數據倉儲工具 hive(四)hive 的 select、union、SQL 依賴並發執行
1. HIVE SELECT 語法
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference[WHERE where_condition][GROUP BY col_list][ORDER BY col_list][CLUSTER BY col_list | [DISTRIBUTE BY col_list] [SORT BY col_list] ]
[LIMIT number]2. 新建字典表用於join
http://www.jctrans.com/tool/gjym.htm
create table country_dict(
code string,name string,region string)ROW FORMAT DELIMITED FIELDS TERMINATED BY ;LOAD DATA LOCAL INPATH /home/qingniu/country_dict.dat OVERWRITE INTO TABLE country_dict;
3. 解決hive不支持的非等值連接問題
笛卡爾積的SQL
select from(select from user_install_status where dt=20141117) ainner join(select * from user_install_status where dt=20141228) bwhere a.country = b.country or a.gpcategory = b.gpcategorylimit 10;set hive.mapred.mode=strict;
限制執行可能形成笛卡爾積的SQLorder by全局排序的時候不加limit的情況partition表使用時需要加上分區4. union的使用
創建中間表用於減少SQL執行步驟
create external table user_install_status_mid(aid string,country string,gpcatetory string)partitioned by (dt string)
stored as orclocation /data/hainiu/user_install_status_mid;alter table user_install_status_mid add if not exists partition (dt=20141117) location 20141117;
insert overwrite table user_install_status_mid partition (dt=20141117)select aid,max(country) as country,max(gpcategory) as gpcategory from user_install_status where dt=20141117 group by aid;alter table user_install_status_mid add if not exists partition (dt=20141228) location 20141228;
insert overwrite table user_install_status_mid partition (dt=20141228)select aid,max(country) as country,max(gpcategory) as gpcategory from user_install_status where dt=20141228 group by aid;select * from (
select a.aid,a.gpcatetory from(select aid,gpcatetory from user_install_status_mid where dt=20141117) ainner join(select aid,gpcatetory from user_install_status_mid where dt=20141228) bon a.aid = b.aidunion allselect a.aid,a.gpcatetory from(select aid,gpcatetory from user_install_status_mid where dt=20141117) ainner join(select aid,gpcatetory from user_install_status_mid where dt=20141228) b
on a.aid= b.aid) aa limit 10;優化方法:
hive中類似於jobcontrol的使用增加reducer任務數量set mapred.reduce.tasks=20;開啟任務依賴並發執行set hive.exec.parallel=true;設置reducer內存大小set mapreduce.reduce.memory.mb=4096;
set mapreduce.reduce.java.opts=-Xmx3584m;5. map端的join
/+ MAPJOIN(c) /
set hive.auto.convert.join=true; //將小表刷入內存中,默認是trueset hive.ignore.mapjoin.hint=true;set hive.mapjoin.smalltable.filesize=2500000; //刷入內存表的大小(位元組),根據自己的數據集加大關閉自動mapjoin的方法是要同時把這兩個參數設置為false,不然手動的MAPJOIN參數不起作用
set hive.auto.convert.join=false;set hive.ignore.mapjoin.hint=false;select /+ MAPJOIN(c) / * from user_install_status u
inner join country_dict c
on u.country=c.codewhere u.dt=20141228limit 10;設置太大也不會校驗,所以要根據實際情況來設置
set hive.mapjoin.smalltable.filesize=2500000000000000;6. full outer join
包括兩個表的join結果,左邊在右邊中沒找到的結果(NULL),右邊在左邊沒找到的結果(NULL)
select * from user_install_status u
full outer joincountry_dict c
on u.country=c.codelimit 10;nohup hive -e "use hainiu;select * from user_install_status u full outer join country_dict c on u.country=c.code;" > ~/fulljoin 2>&1 &
7. GROUPING SETS使用
SELECT a, b, SUM(c) FROM tab1 GROUP BY a, b GROUPING SETS ( (a,b) )
等於SELECT a, b, SUM(c) FROM tab1 GROUP BY a, bSELECT a, b, SUM(c) FROM tab1 GROUP BY a, b GROUPING SETS ( (a,b), a)
等於SELECT a, b, SUM(c) FROM tab1 GROUP BY a, bUNION ALLSELECT a, null, SUM( c ) FROM tab1 GROUP BY aSELECT a,b, SUM( c ) FROM tab1 GROUP BY a, b GROUPING SETS (a,b)
等於SELECT a, null, SUM( c ) FROM tab1 GROUP BY aUNION ALLSELECT null, b, SUM( c ) FROM tab1 GROUP BY bSELECT a, b, SUM( c ) FROM tab1 GROUP BY a, b GROUPING SETS ( (a, b), a, b, () )
等於SELECT a, b, SUM( c ) FROM tab1 GROUP BY a, bUNION ALLSELECT a, null, SUM( c ) FROM tab1 GROUP BY aUNION ALLSELECT null, b, SUM( c ) FROM tab1 GROUP BY bUNION ALLSELECT null, null, SUM( c ) FROM tab18. select中的條件判斷
IF( Test Condition, True Value, False Value )
示例:if(a is null,all,a)COALESCE( value1,value2,… )示例:example: COALESCE(NULL,NULL,5,NULL,4) returns 5CASE [ expression ]WHEN condition1 THEN result1WHEN condition2 THEN result2...WHEN conditionn THEN resultnELSE result END示例:1.CASE aWHEN apple THEN 1WHEN banana THEN2...WHEN orange THEN3ELSE 0 END2.CASE WHEN a=apple THEN 1
WHEN a=banana THEN2...WHEN a=orange THEN3ELSE 0 END推薦閱讀:
※【總結】2017年最值得收藏的可視化案例
※用Apache Spark進行大數據處理——第一部分:入門介紹
※大數據雙創行動丨校園裡誕生的大數據創客
※如何讓產品改版評估更智能更高效?
※滴滴拼車路徑的優化