(注:由于静态图片展示受限,如果您对动态交互看板感兴趣,请点击此处查看 Tableau Public 在线版本)
数据处理与深度业务分析
数据清洗与底层数仓建设 (ODS 到 DWD)
由于数据规模高达上亿条,传统 MySQL (OLTP) 导入查询过慢,本项目采用 ClickHouse 列式数据库进行在线分析处理。
首先先检查时间列表是否有异常值:
1 2 3 4 5 6
SELECT MIN(timestamp) AS min_timestamp_raw, toDateTime(MIN(timestamp)) AS min_real_time, MAX(timestamp) AS max_timestamp_raw, toDateTime(MAX(timestamp)) AS max_real_time FROM wangchao_data.user_behavior;
with( selectcount(*) from wangchao_data.user_behavior) as total_sum, (selectsum(action_count) from( select toDate(toDateTime(timestamp)) as real_date, count(*) as action_count from wangchao_data.user_behavior groupby real_date orderby action_count desc limit 10)) as top10_sum select total_sum as total,top10_sum as top10, round((top10_sum/total_sum) *100,2) as represent_percent;
-- 注入精准时间段内的优质数据 INSERT INTO wangchao_data.user_behavior_clean SELECT user_id, item_id, category_id, behavior_type, timestamp, toDate(toDateTime(timestamp)) AS date_id FROM wangchao_data.user_behavior WHEREtimestamp>1511452800ANDtimestamp<1512316800;
注意:
1 2 3 4 5 6 7 8
selectcount(user_id),count(item_id),count(category_id),count(behavior_type),count(timestamp) from wangchao_data.user_behavior_clean;
SELECT count(*) AS total_rows, uniqExact(user_id, item_id, behavior_type, timestamp) AS unique_combinations, total_rows - unique_combinations AS duplicates FROM wangchao_data.user_behavior_clean;
select uniqExactIf(user_id,behavior_type='pv') as pv_people, uniqExactIf(user_id,behavior_type='cart') as cart_people, uniqExactIf(user_id,behavior_type='fav') as fav_people, uniqExactIf(user_id,behavior_type='buy') as buy_people, round((uniqExactIf(user_id,behavior_type='buy')/uniqExactIf(user_id,behavior_type='pv'))*100,2) as Conversion_Rate from wangchao_data.user_behavior_clean;
select toHour(toDateTime(timestamp,'Asia/Shanghai')) ashour, sum(behavior_type ='pv') as sum_pv, sum(behavior_type ='cart') as sum_cart, sum(behavior_type ='fav') as sum_fav, sum(behavior_type ='buy') as sum_buy from wangchao_data.user_behavior_clean groupbyhour orderbyhourasc;
select user_id,date_diff('day',max(date_id),toDate('2017-12-04')) as R, count(*) as F from wangchao_data.user_behavior_clean where behavior_type ='buy' groupby user_id orderby R asc ,F desc limit 20;
WITH new_biao1 AS ( SELECT user_id, date_diff('day', max(date_id), toDate('2017-12-04')) AS R, count(*) AS F FROM wangchao_data.user_behavior_clean WHERE behavior_type ='buy' GROUPBY user_id ), new_biao2 AS ( SELECT user_id, R, F, CASE WHEN R <=2THEN5 WHEN R <=4THEN4 WHEN R <=6THEN3 WHEN R <=8THEN2 ELSE1 ENDAS R_Score, CASE WHEN F >=5THEN5 WHEN F >=3THEN4 WHEN F >=2THEN3 ELSE1 ENDAS F_Score FROM new_biao1 ) SELECT CASE WHEN (R_Score + F_Score) >=8THEN'1.核心高价值SVIP(8-10分)' WHEN (R_Score + F_Score) >=6THEN'2.潜力活跃中产(6-7分)' WHEN (R_Score + F_Score) >=4THEN'3.濒危沉睡用户(4-5分)' ELSE'4.边缘流失客(2-3分)' ENDAS `用户分层`, count(user_id) AS `该层级的人数` FROM new_biao2 GROUPBY `用户分层` ORDERBY `用户分层` ASC;
INSERT INTO wangchao_data.dim_user_rfm WITH UserRF AS ( SELECT user_id, dateDiff('day', max(date_id), toDate('2017-12-04')) AS R, count(*) AS F FROM wangchao_data.user_behavior_clean WHERE behavior_type ='buy' GROUPBY user_id ), UserScored AS ( SELECT user_id, R, F, CASE WHEN R <=2THEN5WHEN R <=4THEN4WHEN R <=6THEN3WHEN R <=8THEN2ELSE1 ENDAS R_Score, CASE WHEN F >=5THEN5WHEN F >=3THEN4WHEN F =2THEN3ELSE1 ENDAS F_Score FROM UserRF ) SELECT user_id, R_Score, F_Score, (R_Score + F_Score) AS total_score, CASE WHEN (R_Score + F_Score) >=8THEN'1.核心高价值SVIP(8-10分)' WHEN (R_Score + F_Score) >=6THEN'2.潜力活跃中产(6-7分)' WHEN (R_Score + F_Score) >=4THEN'3.濒危沉睡用户(4-5分)' ELSE'4.边缘流失客(2-3分)' ENDAS user_level FROM UserScored;
select item_id as `商品id`,category_id as `类目id`,sum(behavior_type ='pv') as `浏览次数_PV`, sum(behavior_type ='buy') as `购买次数_Buy`, round((sum(behavior_type ='buy')/sum(behavior_type ='pv'))*100,2) as `单品转化率` from wangchao_data.user_behavior_clean groupby item_id,category_id having `购买次数_Buy` >0 orderby `购买次数_Buy` desc limit 20;
类目爆款前20:
1 2 3 4 5 6 7
select category_id as `类目id`,uniqExact(item_id) as `该类目下商品总数`, sum(behavior_type ='pv') as `总浏览次数`, sum(behavior_type ='buy') as `总购买次数` from wangchao_data.user_behavior_clean groupby category_id orderby `总购买次数` desc limit 20;
1 2 3 4 5 6 7 8
1. 流量重分配(找算法团队) • 对于第一名(转化率 79% 的潜力爆款): 它现在的瓶颈是“没人看”(只有 1700 多 PV)。必须立刻要求算法推荐团队:强行给这个商品增加曝光权重! 把它挂到首页去!只要给它 1 万的流量,它能给你换回 8000 个订单! • 对于第二名(转化率 5% 的引流款): 它的流量极大,说明头图很吸引人,但很多人点进去不买。你要去揪住运营和设计团队的衣领:立刻优化它的商品详情页,或者给它配一个“下单立减 10 元”的促销! 只要转化率能提 2 个百分点,销售额就能原地起飞。 2. 供应链压价与库存预警(找采购团队) 你看图二的类目榜单,第一名的类目 1,464,116卖了 3.4 万件。 拿着这个数据去找采购总监:“老大,这个赛道的货正在被疯抢。赶紧去跟上游工厂谈独家协议,把进货价往下压 10%!另外,立刻查一下仓库,按现在的日均消耗速度,这几款 Top 10 的单品还能撑几天?绝对不能出现爆款断货的情况!” 3. 关联推荐与打包销售 既然那个卖得最好的类目流量那么大,咱们能不能趁机带动一下其他不好卖的东西?比如买了这个类目里最火的 A 商品的人,大概率还会买什么 B 商品?咱们可以直接把 A 和 B 组合成一个“双十一必买套装”来提升客单价!
与爆款3122135绑定最深的“最佳拍档”top10:
1 2 3 4 5 6 7 8 9 10 11
select item_id as `关联购买商品id`,count(distinct user_id) as `共同购买人数` from wangchao_data.user_behavior_clean where behavior_type ='buy' and item_id !=3122135 and user_id in ( selectdistinct user_id from wangchao_data.user_behavior_clean where item_id =3122135and behavior_type ='buy') groupby item_id orderby `共同购买人数` desc limit 10;
这就是跟 3,122,135 绑定得最深的“最佳拍档”。如果 A 是一部手机,你查出来的第一名 B 大概率就是一个手机壳或者充电头。
with new_biao1 as ( selectdistinct user_id from wangchao_data.dim_user_rfm), new_biao2 as ( select user_id,behavior_type as current_step, lead(behavior_type , 1)over(partitionby user_id orderbytimestamp) as next_step from wangchao_data.user_behavior_clean where user_id in (select user_id from new_biao1)) select current_step as `起点动作`,next_step as `终点动作`,count(*) as `跳转次数` from new_biao2 where next_step isnot nulland current_step != next_step groupby current_step,next_step orderby `跳转次数` desc;
业务思考:留存率不仅仅反映产品粘性,更隐藏着平台的生命周期规律。 计算每日新增用户的 N 日留存情况 (SQL逻辑见源码) 使用 dateDiff 和 JOIN 追踪同期群留存情况:
1 2 3 4 5 6 7 8 9 10 11 12 13 14
with new_biao1 as ( select user_id,min(date_id) as first_day from wangchao_data.user_behavior_clean groupby user_id ), new_biao2 as ( select n.first_day,dateDiff('day',n.first_day,b.date_id) as day_diff, b.user_id as user_id from new_biao1 as n join wangchao_data.user_behavior_clean as b on n.user_id = b.user_id) select first_day as `同期群日期`,day_diff as `第n天留存`,uniqExact(user_id) as `留存人数` from new_biao2 where day_diff >=0and day_diff<11 groupby `同期群日期`,`day_diff` orderby `同期群日期` asc , `day_diff` asc ;
with new_biao as ( select user_id , sum(behavior_type ='pv') as total_pvs, max(if(behavior_type ='buy',1,0)) as is_buyer from wangchao_data.user_behavior_clean groupby user_id ) select total_pvs as `累计浏览次数`, count(user_id) as `该阶段总人数`, sum(is_buyer) as `买过的人数`, round(sum(is_buyer)/count(user_id)*100,2) as `购买转化率` from new_biao where total_pvs between1and100 groupby `累计浏览次数` orderby `累计浏览次数` asc ;
WITH new_biao AS ( SELECT item_id, buy_count, -- 给所有商品按销量打上排名 row_number() OVER (ORDERBY buy_count DESC) AS rn, -- 计算被卖出的商品总种类数 count(*) OVER () AS total_items, -- 计算大盘总销量 sum(buy_count) OVER () AS total_buys FROM wangchao_data.ads_item_buy_count ) SELECT MAX(total_items) AS `总售出商品种类数`, MAX(total_buys) AS `大盘总销量`, SUM(buy_count) AS `前20%商品的总销量`, ROUND(SUM(buy_count) /MAX(total_buys) *100, 2) AS `头部20%商品销量占比(%)` FROM new_biao WHERE rn <= total_items *0.2;