Last active
September 22, 2020 01:29
-
-
Save thiagomata/a9737c3455d6248bef9f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Based on the mysql cluster | |
* @link http://jonisalonen.com/2012/k-means-clustering-in-mysql/ | |
*/ | |
-- SELECT | |
-- * FROM | |
-- unnest(kmeans(array(SELECT | |
-- ( | |
-- lat, | |
-- lng, | |
-- count(*) | |
-- )::Pin | |
-- FROM myschema.mytable group by something, lat, lng ), 50 )) | |
-- DROP FUNCTION KMEANS( inPins pin[], maxPoints integer ); | |
CREATE TYPE /* IF NOT EXISTS */ pin AS ( | |
lat float, | |
lng float, | |
count bigint | |
); | |
CREATE OR REPLACE FUNCTION KMEANS( inPins pin[], maxPoints integer ) | |
RETURNS pin[] AS | |
$$ | |
declare | |
-- TOTAL OF ROWS CHANGED | |
countChanged bigint; | |
-- TOTAL OF WEIGHT DATA COUNT | |
countPoints bigint; | |
-- RETURN PINS | |
returnPins pin[]; | |
-- SUM OF LAT OF ALL PINS | |
totalLat float; | |
-- SUM OF LNG OF ALL PINS | |
totalLng float; | |
-- STEP OF EACH LAT CLUSTER | |
stepLat float; | |
-- STEP OF EACH LNG CLUSTER | |
stepLng float; | |
-- MIN LAT OF DATA | |
minLat float; | |
-- MIN LNG OF DATA | |
minLng float; | |
-- CLUSTER AND DATA COUNT PROPORTION | |
clusterProportion float; | |
BEGIN | |
SELECT array_length( inPins, 1 ) into countPoints; | |
/** | |
* IF IS THE EASY WAY, LET IT GO | |
*/ | |
IF countPoints < maxPoints THEN | |
RETURN inPins; | |
END IF; | |
/** | |
* CREATE TEMPORARY TABLES | |
*/ | |
DROP TABLE IF EXISTS km_data; | |
CREATE TEMPORARY TABLE km_data( | |
id BIGINT PRIMARY KEY, | |
cluster_id BIGINT, | |
lat FLOAT, | |
lng FLOAT, | |
count BIGINT | |
); | |
CREATE INDEX IF NOT EXISTS KM_DATA_LAT_LNG ON km_data (lat,lng); | |
CREATE INDEX IF NOT EXISTS KM_DATA_LAT_LNG_COUNT ON km_data (lat,lng,count); | |
CREATE INDEX IF NOT EXISTS KM_DATA_CLUSTER ON km_data (cluster_id); | |
CREATE INDEX IF NOT EXISTS KM_DATA_ID ON km_data (id); | |
DROP TABLE IF EXISTS km_clusters; | |
CREATE TEMPORARY TABLE km_clusters( | |
id BIGINT PRIMARY KEY, | |
lat FLOAT, | |
lng FLOAT, | |
count BIGINT | |
); | |
CREATE INDEX IF NOT EXISTS KM_CLUSTER_LAT_LNG ON km_clusters (lat,lng); | |
CREATE INDEX IF NOT EXISTS KM_CLUSTER_LAT_LNG_COUNT ON km_clusters (lat,lng,count); | |
CREATE INDEX IF NOT EXISTS KM_CLUSTER_ID ON km_data (id); | |
/** | |
* CREATE THE PIN GROUP ( GROUP SAME POSITION ELEMENTS ) | |
*/ | |
INSERT INTO km_data | |
SELECT | |
row_number() OVER () AS id, | |
null AS cluster_id, | |
lat AS lat, | |
lng AS lng, | |
sum(count) AS count | |
FROM | |
unnest( inPins ) | |
group by | |
lat, lng; | |
/** | |
* GET THE LIMITS OF THE KM_DATA LAT AND LNG | |
*/ | |
SELECT MIN(lat) into minLat FROM km_data; | |
SELECT MIN(lng) into minLng FROM km_data; | |
/** | |
* DEFINE THE STEP TO EACH CLUSTER | |
*/ | |
SELECT (MAX(lat)-MIN(lat))/count(*) into stepLat FROM km_data; | |
SELECT (MAX(lng)-MIN(lng))/count(*) into stepLng FROM km_data; | |
/** | |
* DEFINE CLUSTER PROPORTION | |
*/ | |
SELECT count(*) / maxPoints into clusterProportion FROM km_data; | |
/** | |
* CREATE INITAL DATA TO KM_CLUSTER | |
*/ | |
INSERT INTO km_clusters (id, lat, lng, count) | |
SELECT | |
row_number() OVER () AS id, | |
minLat + stepLat * clusterProportion * num AS lat, | |
minLng + stepLng * clusterProportion * num AS lng, | |
0 AS count | |
FROM | |
generate_series(0, maxPoints - 1) num; | |
/** | |
* FORCE AT LEAST ONE EXECUTION OF THE LOOP | |
*/ | |
SELECT 1 into countChanged; | |
/** | |
* LOOP OF FINAL CHECK | |
*/ | |
WHILE countChanged > 0 loop | |
/** | |
* UPDATE CLUSTER OF KM_DATA BASED ON LOCATION | |
*/ | |
UPDATE km_data d | |
SET cluster_id = ( | |
SELECT | |
id | |
FROM | |
km_clusters c | |
ORDER BY | |
POW( d.lat - c.lat, 2 ) + POW( d.lng - c.lng, 2 ) | |
ASC LIMIT 1 | |
); | |
/** | |
* UPDATE CLUSTER BASED ON KM_DATA | |
*/ | |
WITH rows AS ( | |
UPDATE km_clusters AS C | |
SET | |
lat = D.lat, | |
lng = D.lng, | |
count = D.count | |
FROM | |
( | |
SELECT | |
cluster_id, | |
SUM(lat*count)/SUM(count) AS lat, | |
SUM(lng*count)/SUM(count) AS lng, | |
SUM(count) AS count | |
FROM km_data GROUP BY cluster_id | |
) D | |
WHERE | |
C.id = D.cluster_id | |
AND | |
( C.lat != D.lat OR C.lng != D.lng OR C.count != D.count ) | |
RETURNING 1 | |
) | |
/** | |
* CALC HOW MANY ROWS HAS CHANGED | |
*/ | |
SELECT COUNT(*) INTO countChanged FROM rows; | |
END LOOP; | |
/** | |
* CONVERT TABLE TO ARRAY | |
*/ | |
SELECT INTO returnPins ARRAY( | |
SELECT ( | |
pin.lat , | |
pin.lng , | |
pin.count | |
)::Pin | |
FROM | |
km_clusters AS pin | |
WHERE COUNT > 0 | |
); | |
/** | |
* DROP TEMPORARY TABLE | |
*/ | |
DROP TABLE km_data; | |
DROP TABLE km_clusters; | |
/** | |
* RETURN ARRAY OF PINS | |
*/ | |
RETURN returnPins; | |
end; | |
$$ language plpgsql; |
Try comment that drop function line.
This code is from 2015. Looks like many of these /* IF NOT EXISTS */ are now available. Maybe worth some update.
Working like a charm 💃
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
/**
/
-- SELECT
-- * FROM
-- unnest(kmeans(array(SELECT
-- (
-- lat,
-- lng,
-- count()
-- )::Pin
-- FROM myschema.mytable group by something, lat, lng ), 50 ))
DROP FUNCTION KMEANS( inPins pin[], maxPoints integer )