9.3.1 小表和大表 join
在小表和大表 join 的时候, 最好把小表放在左边, 大表放在右边.
原因是在 Join 操作的 Reduce 阶段,位于 Join 操作符左边的表的内容会被加载进内存,将条目少的表放在左边,可以有效减少发生 OOM 错误的几率。
实例操作:小表在前和在后 Join 的效率
步骤1: 创建两张表: 一张大表, 一张小表
// 创建大表
create table bigtable(id bigint, time bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
// 创建小表
create table smalltable(id bigint, time bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
// 创建join后 表的语句(把 join 后的数据放在这个表中)
create table jointable(id bigint, time bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
步骤3: 向大表和小表导入数据
load data local inpath '/opt/module/datas/big_small/bigtable' into table bigtable;
load data local inpath '/opt/module/datas/big_small/smalltable' into table smalltable;
步骤3: 执行小表 join 大表
insert overwrite table jointable
    select b.id, b.time, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url
    from smalltable s
    left join bigtable  b
    on b.id = s.id;

步骤4: 执行大表 join 小表
insert overwrite table jointable
    select b.id, b.time, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url
    from bigtable  b
    left join smalltable s
    on b.id = s.id;
