Skip to content

Instantly share code, notes, and snippets.

@jrjames83
Created May 4, 2020 16:57
Show Gist options
  • Save jrjames83/8ceab19bdbaff7d9441d3b884a67a641 to your computer and use it in GitHub Desktop.
Save jrjames83/8ceab19bdbaff7d9441d3b884a67a641 to your computer and use it in GitHub Desktop.
-- WITH keyphrases AS (
-- SELECT '\\b' || name || '\\b' as keyword
-- FROM `bigquery-public-data.usa_names.usa_1910_2013`
-- ), docs AS (
-- SELECT title as doc
-- FROM `bigquery-samples.reddit.full`
-- )
-- SELECT DISTINCT d.doc, k.keyword
-- FROM keyphrases k
-- JOIN docs d ON REGEXP_CONTAINS(d.doc, k.keyword)
-- -- WHERE STRPOS(d.doc, k.keyword) > 0
-- AND char_length(k.keyword) > 5
-- Find reddit post titles that contain a given name from the USA names
-- dataset. Returns one row per distinct (id, word, title) match.
WITH ngrams AS (
    -- One row per reddit post, carrying the 1- to 4-token n-grams built from
    -- the title split on single spaces.
    -- Author's note: 2 267 098 rows.
    SELECT
        id,
        title,
        ML.NGRAMS(SPLIT(title, ' '), [1, 4]) AS doc
    FROM `bigquery-samples.reddit.full`
),

distinct_names AS (
    -- Author's note: the source table has 5 552 452 rows. It repeats each
    -- name (NOTE(review): presumably once per state/gender/year - confirm
    -- against the dataset schema), so collapse it to one row per name before
    -- joining. Otherwise every matching n-gram fans out once per repeat and
    -- the outer DISTINCT has to undo it.
    SELECT DISTINCT name
    FROM `bigquery-public-data.usa_names.usa_1910_2013`
)

-- DISTINCT is still required here: the same token can occur more than once
-- in a single title, which yields repeated n-grams for that post.
SELECT DISTINCT
    ngrams.id,
    word,
    ngrams.title
FROM ngrams
CROSS JOIN UNNEST(ngrams.doc) AS word
INNER JOIN distinct_names AS names
    ON names.name = word
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment