Last active
July 9, 2018 08:30
-
-
Save fish8/ad638e9da1a638378494f2d764f7baf2 to your computer and use it in GitHub Desktop.
create internal/external table on hive #hive #ss
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE EXTERNAL TABLE `j_crh04_30`( | |
`jid` int, | |
`uid` bigint, | |
`hits` int, | |
`max_mps` double) | |
row format delimited fields terminated by '|' Lines terminated by '\n' null defined as '' stored as textfile | |
location '/user/ss_deploy/hive.db/dm_taozt/j_crh04_30'; | |
-- charactoristic speed origninal | |
CREATE EXTERNAL TABLE `j_cspeed_org`( | |
`jid` int, | |
`uid` bigint, | |
`jvps` int, | |
`crh_hits` int, | |
`speed3` string, | |
`s_order` string, | |
`sec` double, | |
`distance` double) | |
row format delimited fields terminated by '|' Lines terminated by '\n' null defined as '' stored as textfile | |
location '/user/ss_deploy/workspace/wqj/speedJvp'; | |
-- split comma delimited speeds '32,454.323,23.1212' | |
CREATE TABLE j_cspeed | |
row format delimited fields terminated by '|' Lines terminated by '\n' null defined as '' stored as textfile | |
as | |
SELECT jid, uid, jvps, sec, distance, max(cast(s as double)) mps_c | |
FROM (SELECT * FROM j_cspeed_org LATERAL VIEW explode(split(speed3,',')) t1 as s) x | |
group by jid,uid,jvps,sec,distance; | |
create external table ch_1( | |
cell_id int, | |
chr_hits int, | |
jvp_hits int, | |
crh_rate double, | |
crh_prob double | |
) | |
row format delimited fields terminated by '|' Lines terminated by '\n' null defined as '' stored as textfile | |
location '/user/ss_deploy/hive.db/dm_taozt/ch_1'; | |
CREATE TABLE j_cspeed_1 | |
row format delimited fields terminated by '|' Lines terminated by '\n' null defined as '' stored as textfile | |
as | |
select *, 0.1*exp(76.9-mps_c)/power((1+exp(76.9-mps_c)),1.01) crh_prob from j_cspeed | |
-- iter epoch 1 | |
drop table if exists j_crh_1; | |
create table j_crh_1 | |
row format delimited fields terminated by '|' Lines terminated by '\n' null defined as '' stored as textfile | |
as | |
select cs.uid, cs.jid, cs.jvps,cs.sec, cs.distance, cs.mps_c, cs.crh_prob speed_priori, | |
count(distinct v.cell_id) crh_hits, | |
exp(sum(log(ch_1.crh_prob)))*cs.crh_prob/( | |
exp(sum(log(ch_1.crh_prob)))*cs.crh_prob+exp(sum(log(1-ch_1.crh_prob)))*(1-cs.crh_prob) | |
) crh_prob, | |
exp(sum(log(ch_1.crh_prob)))/( | |
exp(sum(log(ch_1.crh_prob)))+exp(sum(log(1-ch_1.crh_prob))) | |
) crh_prob_nospeed | |
from j_cspeed_1 cs inner join cip.journey_via_point_single v on cs.uid=v.user_id and cs.jid=v.journey_id | |
inner join ch_1 on v.cell_id=ch_1.cell_id | |
where v.province='011' and v.date_dt=20180601 | |
group by cs.uid, cs.jid, cs.jvps,cs.sec, cs.distance, cs.mps_c, cs.crh_prob; | |
drop table if exists hit_crh04_30; | |
create table hit_crh04_30 | |
row format delimited fields terminated by '|' Lines terminated by '\n' null defined as '' stored as textfile | |
as | |
select jvp.cell_id,count(*) crh_hits | |
from j_crh04_30 h inner join cip.journey_via_point_single jvp on h.jid=jvp.journey_id and h.uid=jvp.user_id | |
where jvp.date_dt=20180601 and jvp.province='011' | |
group by jvp.cell_id; | |
drop table if exists hit04_30_plus; | |
create table hit04_30_plus | |
row format delimited fields terminated by '|' Lines terminated by '\n' null defined as '' stored as textfile | |
as | |
select distinct h.cell_id, h.crh_hits, v.jvp_hits, h.crh_hits/v.jvp_hits crh_rate, c.lat_wgs84, c.lon_wgs84,c.technology, c.site_type, c.service_to | |
from hit_crh04_30 h | |
inner join cip.cellref_single c on h.cell_id=c.cell_id | |
inner join hit_jvp0 v on h.cell_id=v.cell_id | |
where c.date_dt=20180601 and c.province='011'; | |
-- sorted jvp, on user and time | |
drop table if exists jvp_sorted_04_30; | |
create table jvp_sorted_04_30 | |
row format delimited fields terminated by '|' Lines terminated by '\n' null defined as '' stored as textfile | |
as | |
select v.user_id, v.journey_id, v.journey_via_point_id, v.cell_id, v.location_id, v.start_dt, v.journey_via_point_type_id, | |
ROW_NUMBER() OVER(order by v.user_id, v.start_dt) ts_jvp_id | |
from j_crh04_30 h inner join cip.journey_via_point_single v on h.uid=v.user_id and h.jid=v.journey_id | |
where v.province='011' and v.date_dt=20180601; | |
drop table if exists jvp_sorted_all; | |
create table jvp_sorted_all | |
row format delimited fields terminated by '|' Lines terminated by '\n' null defined as '' stored as textfile | |
as | |
select v.user_id, v.journey_id, v.journey_via_point_id, v.cell_id, v.location_id, v.start_dt, v.journey_via_point_type_id, | |
ROW_NUMBER() OVER(partition by v.user_id order by v.start_dt) ts_jvp_id | |
from cip.journey_via_point_single v | |
where v.province='011' and v.date_dt=20180601; | |
-- transfer matrix | |
drop table if exists crh_cell_trans_04_30; | |
create table crh_cell_trans_04_30 | |
row format delimited fields terminated by '|' Lines terminated by '\n' null defined as '' stored as textfile | |
as | |
select a.cell_id from_cell_id, b.cell_id to_cell_id, count(distinct a.ts_jvp_id) `n`, | |
sum(unix_timestamp(b.start_dt)-unix_timestamp(a.start_dt))/count(distinct a.ts_jvp_id) time_avg, | |
max(unix_timestamp(b.start_dt)-unix_timestamp(a.start_dt)) time_max, | |
min(unix_timestamp(b.start_dt)-unix_timestamp(a.start_dt)) time_min, | |
percentile(unix_timestamp(b.start_dt)-unix_timestamp(a.start_dt),0.5) time_median | |
from jvp_sorted_04_30 a inner join jvp_sorted_04_30 b on a.user_id=b.user_id and a.ts_jvp_id+1=b.ts_jvp_id | |
group by a.cell_id, b.cell_id; | |
drop table if exists crh_cell_trans_all; | |
create table crh_cell_trans_all | |
row format delimited fields terminated by '|' Lines terminated by '\n' null defined as '' stored as textfile | |
as | |
select if(x.cell_id is null, -1, a.cell_id) from_cell_id, if(y.cell_id is null, -1,b.cell_id) to_cell_id, count(distinct a.ts_jvp_id) `n`, | |
sum(unix_timestamp(b.start_dt)-unix_timestamp(a.start_dt))/count(distinct a.ts_jvp_id) time_avg, | |
max(unix_timestamp(b.start_dt)-unix_timestamp(a.start_dt)) time_max, | |
min(unix_timestamp(b.start_dt)-unix_timestamp(a.start_dt)) time_min, | |
percentile(unix_timestamp(b.start_dt)-unix_timestamp(a.start_dt),0.5) time_median | |
from jvp_sorted_all a inner join jvp_sorted_all b on a.user_id=b.user_id and a.ts_jvp_id+1=b.ts_jvp_id | |
left join hit_crh04_30 x on a.cell_id=x.cell_id | |
left join hit_crh04_30 y on b.cell_id=y.cell_id | |
group by if(x.cell_id is null, -1, a.cell_id), if(y.cell_id is null, -1,b.cell_id); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment