Skip to content

Instantly share code, notes, and snippets.

@thiagomata
Last active September 22, 2020 01:29
Show Gist options
  • Save thiagomata/a9737c3455d6248bef9f to your computer and use it in GitHub Desktop.
Save thiagomata/a9737c3455d6248bef9f to your computer and use it in GitHub Desktop.
/**
* Based on the mysql cluster
* @link http://jonisalonen.com/2012/k-means-clustering-in-mysql/
*/
-- SELECT
-- * FROM
-- unnest(kmeans(array(SELECT
-- (
-- lat,
-- lng,
-- count(*)
-- )::Pin
-- FROM myschema.mytable group by something, lat, lng ), 50 ))
-- DROP FUNCTION KMEANS( inPins pin[], maxPoints integer );
CREATE TYPE /* IF NOT EXISTS */ pin AS (
lat float,
lng float,
count bigint
);
CREATE OR REPLACE FUNCTION KMEANS( inPins pin[], maxPoints integer )
RETURNS pin[] AS
$$
declare
-- TOTAL OF ROWS CHANGED
countChanged bigint;
-- TOTAL OF WEIGHT DATA COUNT
countPoints bigint;
-- RETURN PINS
returnPins pin[];
-- SUM OF LAT OF ALL PINS
totalLat float;
-- SUM OF LNG OF ALL PINS
totalLng float;
-- STEP OF EACH LAT CLUSTER
stepLat float;
-- STEP OF EACH LNG CLUSTER
stepLng float;
-- MIN LAT OF DATA
minLat float;
-- MIN LNG OF DATA
minLng float;
-- CLUSTER AND DATA COUNT PROPORTION
clusterProportion float;
BEGIN
SELECT array_length( inPins, 1 ) into countPoints;
/**
* IF IS THE EASY WAY, LET IT GO
*/
IF countPoints < maxPoints THEN
RETURN inPins;
END IF;
/**
* CREATE TEMPORARY TABLES
*/
DROP TABLE IF EXISTS km_data;
CREATE TEMPORARY TABLE km_data(
id BIGINT PRIMARY KEY,
cluster_id BIGINT,
lat FLOAT,
lng FLOAT,
count BIGINT
);
CREATE INDEX IF NOT EXISTS KM_DATA_LAT_LNG ON km_data (lat,lng);
CREATE INDEX IF NOT EXISTS KM_DATA_LAT_LNG_COUNT ON km_data (lat,lng,count);
CREATE INDEX IF NOT EXISTS KM_DATA_CLUSTER ON km_data (cluster_id);
CREATE INDEX IF NOT EXISTS KM_DATA_ID ON km_data (id);
DROP TABLE IF EXISTS km_clusters;
CREATE TEMPORARY TABLE km_clusters(
id BIGINT PRIMARY KEY,
lat FLOAT,
lng FLOAT,
count BIGINT
);
CREATE INDEX IF NOT EXISTS KM_CLUSTER_LAT_LNG ON km_clusters (lat,lng);
CREATE INDEX IF NOT EXISTS KM_CLUSTER_LAT_LNG_COUNT ON km_clusters (lat,lng,count);
CREATE INDEX IF NOT EXISTS KM_CLUSTER_ID ON km_data (id);
/**
* CREATE THE PIN GROUP ( GROUP SAME POSITION ELEMENTS )
*/
INSERT INTO km_data
SELECT
row_number() OVER () AS id,
null AS cluster_id,
lat AS lat,
lng AS lng,
sum(count) AS count
FROM
unnest( inPins )
group by
lat, lng;
/**
* GET THE LIMITS OF THE KM_DATA LAT AND LNG
*/
SELECT MIN(lat) into minLat FROM km_data;
SELECT MIN(lng) into minLng FROM km_data;
/**
* DEFINE THE STEP TO EACH CLUSTER
*/
SELECT (MAX(lat)-MIN(lat))/count(*) into stepLat FROM km_data;
SELECT (MAX(lng)-MIN(lng))/count(*) into stepLng FROM km_data;
/**
* DEFINE CLUSTER PROPORTION
*/
SELECT count(*) / maxPoints into clusterProportion FROM km_data;
/**
* CREATE INITAL DATA TO KM_CLUSTER
*/
INSERT INTO km_clusters (id, lat, lng, count)
SELECT
row_number() OVER () AS id,
minLat + stepLat * clusterProportion * num AS lat,
minLng + stepLng * clusterProportion * num AS lng,
0 AS count
FROM
generate_series(0, maxPoints - 1) num;
/**
* FORCE AT LEAST ONE EXECUTION OF THE LOOP
*/
SELECT 1 into countChanged;
/**
* LOOP OF FINAL CHECK
*/
WHILE countChanged > 0 loop
/**
* UPDATE CLUSTER OF KM_DATA BASED ON LOCATION
*/
UPDATE km_data d
SET cluster_id = (
SELECT
id
FROM
km_clusters c
ORDER BY
POW( d.lat - c.lat, 2 ) + POW( d.lng - c.lng, 2 )
ASC LIMIT 1
);
/**
* UPDATE CLUSTER BASED ON KM_DATA
*/
WITH rows AS (
UPDATE km_clusters AS C
SET
lat = D.lat,
lng = D.lng,
count = D.count
FROM
(
SELECT
cluster_id,
SUM(lat*count)/SUM(count) AS lat,
SUM(lng*count)/SUM(count) AS lng,
SUM(count) AS count
FROM km_data GROUP BY cluster_id
) D
WHERE
C.id = D.cluster_id
AND
( C.lat != D.lat OR C.lng != D.lng OR C.count != D.count )
RETURNING 1
)
/**
* CALC HOW MANY ROWS HAS CHANGED
*/
SELECT COUNT(*) INTO countChanged FROM rows;
END LOOP;
/**
* CONVERT TABLE TO ARRAY
*/
SELECT INTO returnPins ARRAY(
SELECT (
pin.lat ,
pin.lng ,
pin.count
)::Pin
FROM
km_clusters AS pin
WHERE COUNT > 0
);
/**
* DROP TEMPORARY TABLE
*/
DROP TABLE km_data;
DROP TABLE km_clusters;
/**
* RETURN ARRAY OF PINS
*/
RETURN returnPins;
end;
$$ language plpgsql;
@thiagomata
Copy link
Author

Working like a charm 💃

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment