Skip to content

Instantly share code, notes, and snippets.

@myui
Created December 15, 2016 11:40
Show Gist options
  • Save myui/9449141c8e24d46defb102839be1f1aa to your computer and use it in GitHub Desktop.
Save myui/9449141c8e24d46defb102839be1f1aa to your computer and use it in GitHub Desktop.
-- Source table of documents: docid is the document identifier and contents
-- is the text that the tokenization step reads.
-- IF NOT EXISTS keeps this script rerunnable, in line with the
-- INSERT OVERWRITE statements that follow it.
CREATE TABLE IF NOT EXISTS page (
    docid INT,
    contents STRING
);
-- Tokenize the contents of every page and write one (docid, word) row per
-- token into page_exploded, replacing whatever the table held before.
-- The stopword filter compares the raw token as emitted by tokenize_ja, and
-- normalize_unicode is applied only to the tokens that pass the filter.
INSERT OVERWRITE TABLE page_exploded
SELECT
    p.docid,
    normalize_unicode(tok.word) AS word
FROM page p
LATERAL VIEW explode(tokenize_ja(p.contents)) tok AS word
WHERE tok.word NOT IN ('stopward1', 'stopward2')
;
-- Compute one tfidf score per (docid, word) pair and overwrite the tfidf table.
WITH term_frequency AS (
    -- tf(word) is aggregated per docid, and the result is exploded into
    -- one (word, freq) row per entry
    SELECT
        per_doc.docid,
        kv.word,
        kv.freq
    FROM (
        SELECT
            docid,
            tf(word) AS word2freq
        FROM page_exploded
        GROUP BY docid
    ) per_doc
    LATERAL VIEW explode(per_doc.word2freq) kv AS word, freq
),
document_frequency AS (
    -- number of distinct documents in which each word occurs
    SELECT
        word,
        COUNT(DISTINCT docid) AS docs
    FROM page_exploded
    GROUP BY word
),
stats AS (
    -- number of distinct documents, counted on page rather than page_exploded
    SELECT COUNT(DISTINCT docid) AS ndocs
    FROM page
)
INSERT OVERWRITE TABLE tfidf
SELECT
    term.docid,
    term.word,
    tfidf(term.freq, df.docs, s.ndocs) AS tfidf
FROM term_frequency term
INNER JOIN document_frequency df
    ON (term.word = df.word)
-- stats has a single row, so the cross join attaches ndocs to every row
CROSS JOIN stats s
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment