Last active
January 10, 2018 19:38
-
-
Save soaxelbrooke/11c045855b129dbdabb05d5a4c8cfbde to your computer and use it in GitHub Desktop.
Phrase detection implemented in pure PostgreSQL
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WITH tokens AS ( | |
-- Just edit MY_TABLE, MY_TEXT_COL, and MY_PKEY_COL, and watch it go! | |
SELECT MY_PKEY_COL AS pkey, (unnest(to_tsvector(MY_TEXT_COL))).* FROM MY_TABLE | |
), token_stream AS ( | |
SELECT pkey, unnest(positions) AS token_idx, lexeme | |
FROM tokens ORDER BY pkey, token_idx | |
), token_counts AS ( | |
SELECT lexeme, sum(count) AS count | |
FROM ( | |
SELECT lexeme, array_length(positions, 1) AS count FROM tokens | |
WHERE char_length(lexeme) < 12 | |
) lexeme_counts | |
GROUP BY lexeme | |
), vocab_meta AS ( | |
SELECT count(distinct(lexeme)) FROM token_counts | |
), phrase_scores AS ( | |
SELECT | |
phrase, | |
phrase_count, | |
first_counts.count AS first_count, | |
second_counts.count AS last_count, | |
CAST(phrase_count - 5 AS DOUBLE PRECISION) / first_counts.count / second_counts.count * vocab_meta.count AS score | |
FROM vocab_meta, ( | |
SELECT | |
first.lexeme AS first, | |
second.lexeme AS second, | |
first.lexeme || ' ' || second.lexeme AS phrase, | |
count(DISTINCT (first.pkey)) AS phrase_count | |
FROM token_stream first | |
JOIN token_stream second | |
ON first.pkey = second.pkey | |
AND second.token_idx = first.token_idx + 1 | |
GROUP BY first.lexeme, second.lexeme | |
) phrase_counts | |
JOIN token_counts first_counts ON first = first_counts.lexeme | |
JOIN token_counts second_counts ON second = second_counts.lexeme | |
) | |
SELECT * FROM phrase_scores WHERE score > 5 ORDER BY score DESC; | |
-- BSD 0 Licensed - Copyright Stuart Axelbrooke 2017-2018 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment