从0到1搭建数仓DWD层案例实践
关注公众号:大数据技术派,回复资料,领取1024G资料。




drop table if exists dwd_fact_payment_info;create external table dwd_fact_payment_info (`id` string COMMENT 'id',`out_trade_no` string COMMENT '对外业务编号',`order_id` string COMMENT '订单编号',`user_id` string COMMENT '用户编号',`alipay_trade_no` string COMMENT '支付宝交易流水编号',`payment_amount` decimal(16,2) COMMENT '支付金额',`subject` string COMMENT '交易内容',`payment_type` string COMMENT '支付类型',`payment_time` string COMMENT '支付时间',`province_id` string COMMENT '省份ID') COMMENT '支付事实表表'PARTITIONED BY (`dt` string)stored as parquetlocation '/warehouse/gmall/dwd/dwd_fact_payment_info/'tblproperties ("parquet.compression"="lzo");
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;insert overwrite table dwd_fact_payment_info partition(dt='2021-05-03')selectpi.id,pi.out_trade_no,pi.order_id,pi.user_id,pi.alipay_trade_no,pi.total_amount,pi.subject,pi.payment_type,pi.payment_time,oi.province_idfrom(select * from ods_payment_info where dt='2021-05-03')pijoin(select id, province_id from ods_order_info where dt='2021-05-03')oion pi.order_id = oi.id;
drop table if exists dwd_fact_order_refund_info;create external table dwd_fact_order_refund_info(`id` string COMMENT '编号',`user_id` string COMMENT '用户ID',`order_id` string COMMENT '订单ID',`sku_id` string COMMENT '商品ID',`refund_type` string COMMENT '退款类型',`refund_num` bigint COMMENT '退款件数',`refund_amount` decimal(16,2) COMMENT '退款金额',`refund_reason_type` string COMMENT '退款原因类型',`create_time` string COMMENT '退款时间') COMMENT '退款事实表'PARTITIONED BY (`dt` string)stored as parquetlocation '/warehouse/gmall/dwd/dwd_fact_order_refund_info/'tblproperties ("parquet.compression"="lzo");
insert overwrite table dwd_fact_order_refund_info partition(dt='2021-05-03')selectid,user_id,order_id,sku_id,refund_type,refund_num,refund_amount,refund_reason_type,create_timefrom ods_order_refund_infowhere dt='2021-05-03';

drop table if exists dwd_fact_cart_info;create external table dwd_fact_cart_info(`id` string COMMENT '编号',`user_id` string COMMENT '用户id',`sku_id` string COMMENT 'skuid',`cart_price` string COMMENT '放入购物车时价格',`sku_num` string COMMENT '数量',`sku_name` string COMMENT 'sku名称 (冗余)',`create_time` string COMMENT '创建时间',`operate_time` string COMMENT '修改时间',`is_ordered` string COMMENT '是否已经下单。1为已下单;0为未下单',`order_time` string COMMENT '下单时间',`source_type` string COMMENT '来源类型',`srouce_id` string COMMENT '来源编号') COMMENT '加购事实表'PARTITIONED BY (`dt` string)stored as parquetlocation '/warehouse/gmall/dwd/dwd_fact_cart_info/'tblproperties ("parquet.compression"="lzo");
insert overwrite table dwd_fact_cart_info partition(dt='2021-05-03')selectid,user_id,sku_id,cart_price,sku_num,sku_name,create_time,operate_time,is_ordered,order_time,source_type,source_idfrom ods_cart_infowhere dt='2020-06-14';

drop table if exists dwd_fact_coupon_use;create external table dwd_fact_coupon_use(`id` string COMMENT '编号',`coupon_id` string COMMENT '优惠券ID',`user_id` string COMMENT 'userid',`order_id` string COMMENT '订单id',`coupon_status` string COMMENT '优惠券状态',`get_time` string COMMENT '领取时间',`using_time` string COMMENT '使用时间(下单)',`used_time` string COMMENT '使用时间(支付)') COMMENT '优惠券领用事实表'PARTITIONED BY (`dt` string)stored as parquetlocation '/warehouse/gmall/dwd/dwd_fact_coupon_use/'tblproperties ("parquet.compression"="lzo");
`get_time` string COMMENT '领取时间',`using_time` string COMMENT '使用时间(下单)',`used_time` string COMMENT '使用时间(支付)'

insert overwrite table dwd_coupon_use partition(dt)selectid,coupon_id,user_id,order_id,coupon_status,get_time,using_time,used_time,expire_time,coalesce(date_format(used_time,'yyyy-MM-dd'),date_format(expire_time,'yyyy-MM-dd'),'9999-99-99')from ods_coupon_usewhere dt='2021-05-03';

set hive.exec.dynamic.partition.mode=nonstrict;set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;insert overwrite table dwd_fact_coupon_use partition(dt)selectif(new.id is null,old.id,new.id),if(new.coupon_id is null,old.coupon_id,new.coupon_id),if(new.user_id is null,old.user_id,new.user_id),if(new.order_id is null,old.order_id,new.order_id),if(new.coupon_status is null,old.coupon_status,new.coupon_status),if(new.get_time is null,old.get_time,new.get_time),if(new.using_time is null,old.using_time,new.using_time),if(new.used_time is null,old.used_time,new.used_time),date_format(if(new.get_time is null,old.get_time,new.get_time),'yyyy-MM-dd')from(selectid,coupon_id,user_id,order_id,coupon_status,get_time,using_time,used_timefrom dwd_fact_coupon_usewhere dt in(selectdate_format(get_time,'yyyy-MM-dd')from ods_coupon_usewhere dt='2021-05-04'))oldfull outer join(selectid,coupon_id,user_id,order_id,coupon_status,get_time,using_time,used_timefrom ods_coupon_usewhere dt='2021-05-04')newon old.id=new.id;
数据仓库第4版 数据仓库工具箱 DAMA数据管理知识体系指南 华为数据之道
数仓建模—指标体系
数仓建模—宽表的设计
Spark SQL知识点与实战
Hive计算最大连续登陆天数
Flink计算pv和uv的通用方法
评论
