Skip to content

Instantly share code, notes, and snippets.

@myui
Last active August 29, 2015 14:22
Show Gist options
  • Save myui/12a1192e77eb54a08f5a to your computer and use it in GitHub Desktop.
Save myui/12a1192e77eb54a08f5a to your computer and use it in GitHub Desktop.
Outlier Detection using Local Outlier Factor on Hivemall

This article introduce how to find outliers using Local Outlier Detection (LOF) on Hivemall.

Data Preparation

create database lof;
use lof;

create external table hundred_balls (
  rowid int, 
  weight double,
  specific_heat double,
  reflectance double
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ' '
STORED AS TEXTFILE LOCATION '/dataset/lof/hundred_balls';

Download hundred_balls.txt that is originally provides in this article.

You can find outliers in this picture.

awk '{FS=" "; OFS=" "; print NR,$0}' hundred_balls.txt | \
hadoop fs -put - /dataset/lof/hundred_balls/hundred_balls.txt
create table train
as
select rowid, array(concat("weight:", weight), concat("specific_heat:", specific_heat), concat("reflectance:", reflectance)) as features
from hundred_balls;

Apply Data Normalization

create table train_normalized
as
WITH fv as (
select 
  rowid, 
  extract_feature(feature) as feature,
  extract_weight(feature) as value
from 
  train 
  LATERAL VIEW explode(features) exploded AS feature
), 
stats as (
select
  feature,
  -- avg(value) as mean, stddev_pop(value) as stddev
  min(value) as min, max(value) as max
from
  fv
group by
  feature
), 
norm as (
select 
  rowid, 
  t1.feature, 
  -- zscore(t1.value, t2.mean, t2.stddev) as zscore
  rescale(t1.value, t2.min, t2.max) as minmax
from 
  fv t1 JOIN
  stats t2 ON (t1.feature = t2.feature) 
),
norm_fv as (
select
  rowid, 
  -- concat(feature, ":", zscore) as feature
  concat(feature, ":", minmax) as feature
from
  norm
)
select 
  rowid, 
  collect_list(feature) as features
from
  norm_fv
group by
  rowid
;

Outlier Detection using Local Outlier Facotor (LOF)

-- workaround to deal with a bug in Hive/Tez
-- https://issues.apache.org/jira/browse/HIVE-10729
set hive.auto.convert.join=false;
-- parameter of LoF
set hivevar:k=12;

-- find topk outliers
set hivevar:topk=3;

WITH list_distance as (
select
  t1.rowid as target, 
  t2.rowid as neighbour, 
  euclid_distance(t1.features, t2.features) as distance
  -- cosine_similarity(t1.features, t2.features) as distance
from 
  -- train t1 CROSS JOIN train t2
  train_normalized t1 CROSS JOIN train_normalized t2
where
  t1.rowid != t2.rowid
), 
list_neighbours as (
select
  target, 
  neighbour,
  distance
from (
  select
    *, 
    rank() over (partition by target order by distance asc) as rank
  from
    list_distance
) t
where 
  rank <= ${k}
), 
k_distance as (
select
  target, 
  max(distance) as k_distance
from
  list_neighbours
group by
  target
), 
reach_distance as (
select
  t1.target,
  -- t1.neighbour,
  max2(t2.k_distance, t1.distance) as reach_distance
from
  list_neighbours t1 JOIN 
  k_distance t2 ON (t1.neighbour = t2.target)
), 
lrd as (
select
  target,   
  1.0 / avg(reach_distance) as lrd
from
  reach_distance
group by
  target
), 
neighbours_lrd as (
select
  t1.target, 
  -- t1.neighbour,
  t2.lrd
from
  list_neighbours t1 JOIN
  lrd t2 on (t1.neighbour = t2.target)
)
select
  t1.target, 
  sum(t2.lrd / t1.lrd) / count(1) as lof
from
  lrd t1 JOIN
  neighbours_lrd t2 on (t1.target = t2.target)
group by
  t1.target
order by lof desc
limit ${topk};
> 87      3.031143750623693
> 16      1.975556449228491
> 1       1.8415763677073722
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment