数据预处理 参考sql
生活随笔
收集整理的這篇文章主要介紹了
数据预处理 参考sql
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
with-- 讀取exchange 表 ,里面包含item 和 us_price等信息
exchange as(
select * from `heidao-market.recommend_algorithm.v_mafia1_exchange`
),
charge_item as(
select * from `heidao-market.recommend_algorithm.v_mafia1_charge_item`
),# 讀取一年的活躍用戶,并且去重
sdk_player as (
select distinct player_id from `heidao-market.mafia1_ods.sdk_gift_bag_tracking`
where timestamp >= '2020-07-18' and timestamp < '2020-08-25 05:00:00'
#and MOD(CAST(CONCAT('0x', SUBSTR(TO_HEX(MD5(CONCAT('0j46o', CAST(player_id AS STRING)))),0, 8)) AS INT64), 100) between 30 and 44
),# 購買的禮包記錄
purchase as (
select *, max(giftbag_rn) over (partition by giftbag_id) as max_giftbag_rn from(
select player_id, timestamp, CAST(giftbag_id as STRING) as giftbag_id,
ROW_NUMBER() OVER (PARTITION BY player_id ORDER BY timestamp desc) as rn,
ROW_NUMBER() over(partition by player_id order by timestamp) as purchase_rn,
ROW_NUMBER() over(partition by giftbag_id order by timestamp) as giftbag_rn
from `mafia1_ods.game_log_giftbag`
-- 應該是開始購買的時間 purchase_end_time 購買結束的時間
--where timestamp >= '2020-01-01' and timestamp < '{4}'
where timestamp >= '2020-11-01' and timestamp < '2020-11-04'
and player_id in (select player_id from sdk_player))),--禮包點擊的的數據,只提取點擊停留時長多余1秒的數據
filter_record as (
select player_id, giftbag_id, timestamp, IFNULL(view_time, 1) as view_time, 0 as is_purchase from(
select player_id, value_1 as giftbag_id, timestamp, action,
TIMESTAMP_DIFF(LEAD(timestamp) OVER (PARTITION BY player_id, event_id, value_1 ORDER BY timestamp ASC), timestamp, MILLISECOND)/1000 as view_time,from `heidao-market.mafia1_ods.sdk_gift_bag_tracking` --where timestamp >= '{2}' and timestamp < '{1}' and type = 70-- record_start_time -- end_time
where timestamp >= '2020-10-26' and timestamp < '2020-11-03 14:00:00' and type = 70
and player_id in (select player_id from sdk_player))
where action = 1 and view_time >= 1
and giftbag_id in (select cast(giftbag_id as string) from exchange)
),-- 提取某次曝光之前的購買記錄,并且join ,在兩次購買之間只提取某個禮包最后最后一條點擊記錄
filter_record1 as (
select * except(rn1) from(
select *, row_number() over(partition by player_id, giftbag_id, purchase_rn order by timestamp desc) as rn1,
count(*) over(partition by player_id, giftbag_id, purchase_rn) as view_count
from(
select * except(rn, purchase_rn), ifnull(purchase_rn, 0) as purchase_rn from(
select filter_record.*, purchase.purchase_rn, purchase.giftbag_rn, purchase.max_giftbag_rn, purchase.giftbag_id as purchase_giftbag, purchase.timestamp as purchase_time,
ROW_NUMBER() over (partition by filter_record.player_id, filter_record.giftbag_id, filter_record.timestamp order by purchase.timestamp desc) as rn-- 兩次購買之間只用某個禮包最后一條曝光數據,目的是為了去重
from (filter_record left join purchase on filter_record.player_id = purchase.player_id and filter_record.timestamp >purchase.timestamp))
--這里的rn =1 意味著 最遠離上一次曝光
where rn = 1))
where rn1 = 1
#order by player_id desc, timestamp desc
),--這段sql 是為了聚合出最近購買的禮包列表
record_1 as(
select player_id, giftbag_id, timestamp, is_purchase, giftbag_rn, max_giftbag_rn, purchase_giftbag, purchase_time, purchase_rn, string_agg(view_giftbag, ',') as recent_view,
string_agg(cast(view_delta_time as string), ',') as recent_view_times,
string_agg(cast(view_count as string), ',') as recent_view_count
from(
select t1.* except(view_count), t2.view_count, t2.giftbag_id as view_giftbag, TIMESTAMP_DIFF(t1.timestamp, t2.timestamp, HOUR) + 1 as view_delta_time,
-- 這段row_number 是給用來提取最近多少條數據
row_number() over (partition by t1.player_id, t1.giftbag_id, t1.timestamp order by t2.timestamp desc) as rn
from (select * from filter_record1) as t1
left join (select * from filter_record1) as t2
-- 這段sql 是為了將曝光的表與自己聚合,在某個用戶和他之前的近七天的曝光數據
on t1.player_id = t2.player_id and t2.timestamp between TIMESTAMP_SUB(t1.timestamp, INTERVAL 7*24 HOUR) and TIMESTAMP_SUB(t1.timestamp, INTERVAL 60*5 SECOND))
where rn <= 50-- 實際上并不需要 group by 這么多特征,實際上只要 player_id, giftbag_id, timestamp 就可以了,其他特征是冗余特征
group by player_id, giftbag_id, timestamp, is_purchase, giftbag_rn, max_giftbag_rn, purchase_giftbag, purchase_time, purchase_rn
),record_2 as (
select player_id, giftbag_id, timestamp, is_purchase, giftbag_rn, max_giftbag_rn, purchase_giftbag, purchase_time, purchase_rn, recent_view, recent_view_times,recent_view_count,
string_agg(purchase_giftbag_id, ',') as recent_giftbags,
string_agg(cast(purchase_delta_time as string), ',') as recent_giftbag_times
from(
select record_1.*, purchase.giftbag_id as purchase_giftbag_id, TIMESTAMP_DIFF(record_1.timestamp, purchase.timestamp, HOUR) + 1 as purchase_delta_time,
row_number() over (partition by record_1.player_id, record_1.giftbag_id, record_1.timestamp order by purchase.timestamp desc) as rn
from record_1 left join purchase --將之前的禮包點擊的準備好的點擊的數據和購買的數據join 起來,時間取最近30天購買的禮包
on record_1.player_id = purchase.player_id and purchase.timestamp between TIMESTAMP_SUB(record_1.timestamp, INTERVAL 30*24 HOUR) and TIMESTAMP_SUB(record_1.timestamp, INTERVAL 60*5 SECOND))
where rn <= 10
group by player_id, giftbag_id, timestamp, is_purchase, giftbag_rn, max_giftbag_rn, purchase_giftbag, purchase_time, purchase_rn, recent_view, recent_view_times, recent_view_count),record as ( select * from record_2
where giftbag_rn > max_giftbag_rn - 2000
#or giftbag_rn is null and timestamp >= '{5}'
or timestamp >='2020-11-04 00:00:00'
),item_map as (select * from `heidao-market.recommend_algorithm.mafia1_binary_v20200713_item_map`),-- 本段sql 用來提取某個用戶在一段時間的變化
change_info as (
select player_id, item_map.map_id*sign(change_num) as table_id, change_num*item_map.exchange_val as change_num,
TIMESTAMP_TRUNC(timestamp, HOUR) as timestamp_hour,
TIMESTAMP_TRUNC(timestamp, DAY) as timestamp_day,
item_map.use_log
from(
select player_id, army_id as table_id, army_num as change_num, timestamp
from `heidao-market.mafia1_ods.game_log_army` where timestamp >= '2020-10-29' and timestamp < '2020-11-04 17:00:00' and army_id in (select table_id from item_map)
UNION ALL
select player_id,table_id, change_num, timestamp
from `heidao-market.mafia1_ods.game_log_item` where timestamp >= '2020-10-29' and timestamp < '2020-11-04 17:00:00' and table_id in (select table_id from item_map)) as t
left join item_map on t.table_id = item_map.table_id
),
-- 離散到小時的item變化
item_change_hour as(
select player_id, table_id, timestamp_hour as timestamp,
CASE WHEN use_log <= 0 then abs(sum(change_num)) else round(log10(1 + abs(sum(change_num)))*10) end as change_num from change_info
group by player_id, table_id, timestamp_hour, use_log),-- 離散到天的item 變化
item_change_day as(
select player_id, table_id, timestamp_day as timestamp,
CASE WHEN use_log <= 0 then abs(sum(change_num)) else round(log10(1 + abs(sum(change_num)))*10) end as change_num from change_info
group by player_id, table_id, timestamp_day, use_log),data as(
select * EXCEPT(purchase_weight, purchase_delta_time, purchase_rn, rn),
purchase_weight/sum(purchase_weight) OVER(PARTITION BY player_id, giftbag_id, purchase_rn) as purchase_weight,from (
select record.* except(purchase_rn, purchase_time), purchase.timestamp as purchase_time, purchase.rn as purchase_rn,
TIMESTAMP_DIFF(purchase.timestamp, record.timestamp, MINUTE) as purchase_delta_time,
EXP(-TIMESTAMP_DIFF(purchase.timestamp, record.timestamp, MINUTE)/30 + 0.0) as purchase_weight,
count(*) OVER(PARTITION BY record.player_id, record.giftbag_id, purchase.rn) as share_count,
ROW_NUMBER() OVER (PARTITION BY record.player_id, record.giftbag_id, record.timestamp ORDER BY purchase.timestamp) as rn,
from (record left join purchase -- 將禮包點擊記錄和禮包購買記錄join ,只提取在在點擊后一個小時購買的禮包數據
on record.player_id = purchase.player_id
and record.giftbag_id = purchase.giftbag_id
and TIMESTAMP_DIFF(purchase.timestamp, record.timestamp, MINUTE) BETWEEN 0 AND 60
)
order by player_id, timestamp)
where timestamp >= '2020-10-05' and rn = 1
#order by player_id, timestamp
),data1 as (
select player_id, giftbag_id, timestamp,
STRING_AGG(CAST(table_id as STRING), ',') as item_hour_id,
STRING_AGG(CAST(delta_time as STRING), ',') as item_hour_time,
STRING_AGG(CAST(change_num as STRING), ',') as item_hour_num from(
select data.*, item_change_hour.table_id, item_change_hour.change_num,
TIMESTAMP_DIFF(data.timestamp, item_change_hour.timestamp, HOUR) as delta_time from (--將data 與item_change_hour item 按照小時的變化進行join
data left join item_change_hour
on data.player_id = item_change_hour.player_id
and TIMESTAMP_DIFF(data.timestamp, item_change_hour.timestamp, HOUR) between 1 and 24)
)
group by player_id, giftbag_id, timestamp),data2 as (
select player_id, giftbag_id, timestamp,
STRING_AGG(CAST(table_id as STRING), ',') as item_day_id,
STRING_AGG(CAST(delta_time as STRING), ',') as item_day_time,
STRING_AGG(CAST(item_num as STRING), ',') as item_day_num from(
select data.*, item_change_day.table_id, item_change_day.change_num as item_num,
TIMESTAMP_DIFF(data.timestamp, item_change_day.timestamp, DAY) as delta_time from (
data left join item_change_day
on data.player_id = item_change_day.player_id
and TIMESTAMP_DIFF(data.timestamp, item_change_day.timestamp, DAY) between 1 and 7)
)
group by player_id, giftbag_id, timestamp),
data3 as (
select * except(giftbag_rn, max_giftbag_rn, purchase_giftbag), CAST(FLOOR(100*RAND()) AS INT64) as rand_num from(
select t.*, data2.* except(player_id, giftbag_id, timestamp) from(
select data.*, data1.* except(player_id, giftbag_id, timestamp) from (
data left join data1 on data.player_id = data1.player_id and data.giftbag_id = data1.giftbag_id and data.timestamp = data1.timestamp)) as t
left join data2 on t.player_id = data2.player_id and t.giftbag_id = data2.giftbag_id and t.timestamp = data2.timestamp) as t1
where timestamp >= '2020-10-05' and timestamp < '2020-11-04 17:00:00'
#and player_id not in (select player_id from mafia1_pf.v_internal_player)
),
ratio as (
select giftbag_id, ifnull(sum(purchase_weight), 0) as positive, count(*) as total,
sum(is_purchase)/count(*) as purchase_ratio,
ifnull(sum(purchase_weight)/count(*), 0) as positive_ratio from data3
group by giftbag_id
order by positive_ratio desc),data4 as (
select * from data3
where giftbag_id not in (select giftbag_id from ratio where (purchase_ratio >= 0.9 or positive_ratio <= 0.0))
),status1 as (select player_id ,timestamp ,status from `heidao-market.mafia1_ods.game_log_status_snapshot` where timestamp >= '2020-11-01')select * except(rank, status_time) from(
select *, row_number() over (partition by player_id, timestamp order by status_time desc) as rank from(
select data4.*, status1.timestamp as status_time, status1.status from (data4 left join status1 on data4.player_id = status1.player_id and status1.timestamp between TIMESTAMP_SUB(data4.timestamp, INTERVAL 48 HOUR) and data4.timestamp)))
where rank = 1 and status is not null limit 1000
總結
以上是生活随笔為你收集整理的数据预处理 参考sql的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: sql item_map
- 下一篇: Systemd 入门教程