Created
November 2, 2011 14:55
-
-
Save aquilax/1333830 to your computer and use it in GitHub Desktop.
Naive Bayes classifier in PostgreSQL
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- | |
-- PostgreSQL database dump | |
-- | |
-- Dumped from database version 9.1.1 | |
-- Dumped by pg_dump version 9.1.1 | |
-- Started on 2011-11-02 16:53:36 EET | |
SET statement_timeout = 0; | |
SET client_encoding = 'UTF8'; | |
SET standard_conforming_strings = on; | |
SET check_function_bodies = false; | |
SET client_min_messages = warning; | |
-- | |
-- TOC entry 8 (class 2615 OID 28036) | |
-- Name: bayes; Type: SCHEMA; Schema: -; Owner: - | |
-- | |
CREATE SCHEMA bayes; | |
SET search_path = bayes, pg_catalog; | |
-- | |
-- TOC entry 223 (class 1255 OID 28111) | |
-- Dependencies: 607 8 | |
-- Name: classify(integer, integer[]); Type: FUNCTION; Schema: bayes; Owner: - | |
-- | |
CREATE FUNCTION classify(iset_id integer, ifeature_ids integer[]) RETURNS integer | |
LANGUAGE plpgsql COST 1 | |
AS $$ | |
DECLARE | |
features_count integer := 0; | |
r record; | |
p float := 0; | |
mult float := 0; | |
max_prob float := -1; | |
max_prob_cat integer := 0; | |
index integer := 1; | |
fcount integer := 0; | |
BEGIN | |
features_count = array_upper(ifeature_ids, 1); | |
FOR r IN | |
SELECT category_id, count(*) AS cntr | |
FROM bayes.feature_category | |
WHERE set_id = iset_id | |
GROUP BY category_id | |
LOOP | |
p = 1; | |
WHILE index <= features_count | |
LOOP | |
IF (r.cntr = 0) THEN | |
mult = 0.001; | |
ELSE | |
SELECT cnt | |
FROM bayes.feature_category | |
WHERE feature_id = ifeature_ids[index] | |
AND set_id = iset_id | |
AND category_id = r.category_id INTO fcount; | |
IF (fcount IS NULL) THEN | |
fcount = 0; | |
END IF; | |
mult = fcount / r.cntr; | |
END IF; | |
p = p * mult; | |
index = index + 1; | |
END LOOP; | |
--RAISE NOTICE 'cat % like %', r.category_id, p; | |
IF p > max_prob THEN | |
max_prob_cat = r.category_id; | |
max_prob = p; | |
END IF; | |
END LOOP; | |
RETURN max_prob_cat; | |
END; | |
$$; | |
-- | |
-- TOC entry 224 (class 1255 OID 28110) | |
-- Dependencies: 607 8 | |
-- Name: tokenize(character varying[], boolean); Type: FUNCTION; Schema: bayes; Owner: - | |
-- | |
CREATE FUNCTION tokenize(ifeatures character varying[], ido_insert boolean) RETURNS integer[] | |
LANGUAGE plpgsql COST 1 | |
AS $$ | |
DECLARE | |
w varchar; | |
fid integer := 0; | |
index integer := 1; | |
count integer := 0; | |
outa int[]; | |
BEGIN | |
count = array_upper(ifeatures, 1); | |
WHILE index <= count | |
LOOP | |
RAISE NOTICE '%', ifeatures[index]; | |
SELECT id FROM bayes.feature WHERE val = ifeatures[index] INTO fid; | |
IF (fid IS NOT NULL) THEN | |
outa = array_append(outa, fid); | |
ELSE | |
IF ido_insert THEN | |
INSERT INTO bayes.feature (val) VALUES (ifeatures[index]); | |
outa = array_append(outa, lastval()::int); | |
END IF; | |
END IF; | |
index := index + 1; | |
END LOOP; | |
RETURN outa; | |
END; | |
$$; | |
-- | |
-- TOC entry 222 (class 1255 OID 28092) | |
-- Dependencies: 607 8 | |
-- Name: train(integer, integer, integer[]); Type: FUNCTION; Schema: bayes; Owner: - | |
-- | |
CREATE FUNCTION train(iset_id integer, icategory_id integer, ifeature_ids integer[]) RETURNS void | |
LANGUAGE plpgsql COST 1 | |
AS $$ | |
DECLARE | |
w integer; | |
fid integer := 0; | |
index integer := 1; | |
count integer := 0; | |
BEGIN | |
count = array_upper(ifeature_ids, 1); | |
WHILE index <= count | |
LOOP | |
SELECT id | |
FROM bayes.feature_category | |
WHERE set_id = iset_id | |
AND category_id = icategory_id | |
AND feature_id = ifeature_ids[index] INTO fid; | |
IF (fid IS NOT NULL) THEN | |
UPDATE bayes.feature_category | |
SET cnt = cnt + 1 | |
WHERE id = fid; | |
ELSE | |
INSERT INTO bayes.feature_category | |
(set_id, category_id, feature_id) | |
VALUES | |
(iset_id, icategory_id, ifeature_ids[index]); | |
END IF; | |
index := index + 1; | |
END LOOP; | |
END; | |
$$; | |
SET default_tablespace = ''; | |
SET default_with_oids = false; | |
-- | |
-- TOC entry 206 (class 1259 OID 28114) | |
-- Dependencies: 8 | |
-- Name: feature; Type: TABLE; Schema: bayes; Owner: -; Tablespace: | |
-- | |
CREATE TABLE feature ( | |
id integer NOT NULL, | |
val character varying(50) | |
); | |
-- | |
-- TOC entry 208 (class 1259 OID 28124) | |
-- Dependencies: 1975 8 | |
-- Name: feature_category; Type: TABLE; Schema: bayes; Owner: -; Tablespace: | |
-- | |
CREATE TABLE feature_category ( | |
id integer NOT NULL, | |
set_id integer, | |
category_id integer, | |
feature_id integer, | |
cnt integer DEFAULT 1 | |
); | |
-- | |
-- TOC entry 207 (class 1259 OID 28122) | |
-- Dependencies: 208 8 | |
-- Name: feature_category_id_seq; Type: SEQUENCE; Schema: bayes; Owner: - | |
-- | |
CREATE SEQUENCE feature_category_id_seq | |
START WITH 1 | |
INCREMENT BY 1 | |
NO MINVALUE | |
NO MAXVALUE | |
CACHE 1; | |
-- | |
-- TOC entry 1986 (class 0 OID 0) | |
-- Dependencies: 207 | |
-- Name: feature_category_id_seq; Type: SEQUENCE OWNED BY; Schema: bayes; Owner: - | |
-- | |
ALTER SEQUENCE feature_category_id_seq OWNED BY feature_category.id; | |
-- | |
-- TOC entry 205 (class 1259 OID 28112) | |
-- Dependencies: 206 8 | |
-- Name: feature_id_seq; Type: SEQUENCE; Schema: bayes; Owner: - | |
-- | |
CREATE SEQUENCE feature_id_seq | |
START WITH 1 | |
INCREMENT BY 1 | |
NO MINVALUE | |
NO MAXVALUE | |
CACHE 1; | |
-- | |
-- TOC entry 1987 (class 0 OID 0) | |
-- Dependencies: 205 | |
-- Name: feature_id_seq; Type: SEQUENCE OWNED BY; Schema: bayes; Owner: - | |
-- | |
ALTER SEQUENCE feature_id_seq OWNED BY feature.id; | |
-- | |
-- TOC entry 1973 (class 2604 OID 28117) | |
-- Dependencies: 205 206 206 | |
-- Name: id; Type: DEFAULT; Schema: bayes; Owner: - | |
-- | |
ALTER TABLE feature ALTER COLUMN id SET DEFAULT nextval('feature_id_seq'::regclass); | |
-- | |
-- TOC entry 1974 (class 2604 OID 28127) | |
-- Dependencies: 207 208 208 | |
-- Name: id; Type: DEFAULT; Schema: bayes; Owner: - | |
-- | |
ALTER TABLE feature_category ALTER COLUMN id SET DEFAULT nextval('feature_category_id_seq'::regclass); | |
-- | |
-- TOC entry 1981 (class 2606 OID 28130) | |
-- Dependencies: 208 208 | |
-- Name: feature_category_pkey; Type: CONSTRAINT; Schema: bayes; Owner: -; Tablespace: | |
-- | |
ALTER TABLE ONLY feature_category | |
ADD CONSTRAINT feature_category_pkey PRIMARY KEY (id); | |
-- | |
-- TOC entry 1983 (class 2606 OID 28132) | |
-- Dependencies: 208 208 208 208 | |
-- Name: feature_category_set_id_category_id_feature_id_key; Type: CONSTRAINT; Schema: bayes; Owner: -; Tablespace: | |
-- | |
ALTER TABLE ONLY feature_category | |
ADD CONSTRAINT feature_category_set_id_category_id_feature_id_key UNIQUE (set_id, category_id, feature_id); | |
-- | |
-- TOC entry 1977 (class 2606 OID 28119) | |
-- Dependencies: 206 206 | |
-- Name: feature_pkey; Type: CONSTRAINT; Schema: bayes; Owner: -; Tablespace: | |
-- | |
ALTER TABLE ONLY feature | |
ADD CONSTRAINT feature_pkey PRIMARY KEY (id); | |
-- | |
-- TOC entry 1979 (class 2606 OID 28121) | |
-- Dependencies: 206 206 | |
-- Name: feature_val_key; Type: CONSTRAINT; Schema: bayes; Owner: -; Tablespace: | |
-- | |
ALTER TABLE ONLY feature | |
ADD CONSTRAINT feature_val_key UNIQUE (val); | |
-- Completed on 2011-11-02 16:53:37 EET | |
-- | |
-- PostgreSQL database dump complete | |
-- |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Could you give a tiny bit of instruction on how to use this?