This article introduce how to find outliers using Local Outlier Detection (LOF) on Hivemall.
create database lof;
use lof;
create external table hundred_balls (
rowid int,
weight double,
specific_heat double,
reflectance double
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ' '
STORED AS TEXTFILE LOCATION '/dataset/lof/hundred_balls';
Download hundred_balls.txt that is originally provides in this article.
You can find outliers in this picture.
awk '{FS=" "; OFS=" "; print NR,$0}' hundred_balls.txt | \
hadoop fs -put - /dataset/lof/hundred_balls/hundred_balls.txt
create table train
as
select rowid, array(concat("weight:", weight), concat("specific_heat:", specific_heat), concat("reflectance:", reflectance)) as features
from hundred_balls;
create table train_normalized
as
WITH fv as (
select
rowid,
extract_feature(feature) as feature,
extract_weight(feature) as value
from
train
LATERAL VIEW explode(features) exploded AS feature
),
stats as (
select
feature,
-- avg(value) as mean, stddev_pop(value) as stddev
min(value) as min, max(value) as max
from
fv
group by
feature
),
norm as (
select
rowid,
t1.feature,
-- zscore(t1.value, t2.mean, t2.stddev) as zscore
rescale(t1.value, t2.min, t2.max) as minmax
from
fv t1 JOIN
stats t2 ON (t1.feature = t2.feature)
),
norm_fv as (
select
rowid,
-- concat(feature, ":", zscore) as feature
concat(feature, ":", minmax) as feature
from
norm
)
select
rowid,
collect_list(feature) as features
from
norm_fv
group by
rowid
;
-- workaround to deal with a bug in Hive/Tez
-- https://issues.apache.org/jira/browse/HIVE-10729
set hive.auto.convert.join=false;
-- parameter of LoF
set hivevar:k=12;
-- find topk outliers
set hivevar:topk=3;
WITH list_distance as (
select
t1.rowid as target,
t2.rowid as neighbour,
euclid_distance(t1.features, t2.features) as distance
-- cosine_similarity(t1.features, t2.features) as distance
from
-- train t1 CROSS JOIN train t2
train_normalized t1 CROSS JOIN train_normalized t2
where
t1.rowid != t2.rowid
),
list_neighbours as (
select
target,
neighbour,
distance
from (
select
*,
rank() over (partition by target order by distance asc) as rank
from
list_distance
) t
where
rank <= ${k}
),
k_distance as (
select
target,
max(distance) as k_distance
from
list_neighbours
group by
target
),
reach_distance as (
select
t1.target,
-- t1.neighbour,
max2(t2.k_distance, t1.distance) as reach_distance
from
list_neighbours t1 JOIN
k_distance t2 ON (t1.neighbour = t2.target)
),
lrd as (
select
target,
1.0 / avg(reach_distance) as lrd
from
reach_distance
group by
target
),
neighbours_lrd as (
select
t1.target,
-- t1.neighbour,
t2.lrd
from
list_neighbours t1 JOIN
lrd t2 on (t1.neighbour = t2.target)
)
select
t1.target,
sum(t2.lrd / t1.lrd) / count(1) as lof
from
lrd t1 JOIN
neighbours_lrd t2 on (t1.target = t2.target)
group by
t1.target
order by lof desc
limit ${topk};
> 87 3.031143750623693
> 16 1.975556449228491
> 1 1.8415763677073722