Created
July 24, 2024 22:03
-
-
Save zhibor/131459b24a720f622a86b141e468d731 to your computer and use it in GitHub Desktop.
Word frequency analysis in SQL
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
with cleaned_docs as ( | |
select | |
tweet_id, | |
regexp_replace(regexp_replace(text, '\\n', ' ', 'g'), 'https[^\\s]+', '', 'g') as cleaned_text | |
from tweets | |
where text is not null | |
), | |
tokenized_docs as ( | |
select | |
tweet_id, | |
unnest(string_split(cleaned_text, ' ')) as token | |
from cleaned_docs | |
), | |
stop_words as ( | |
select | |
unnest(en) as word | |
from 'https://raw.githubusercontent.com/stopwords-iso/stopwords-iso/master/python/stopwordsiso/stopwords-iso.json' | |
), | |
filtered_tokens as ( | |
select | |
tweet_id, | |
regexp_replace(lower(token), '[^a-z]', '', 'g') as token | |
from tokenized_docs | |
where | |
length(token) > 1 | |
and token not in (select word from stop_words) | |
and token is not null | |
), | |
word_frequencies as ( | |
select | |
token, | |
count(*) as freq | |
from filtered_tokens | |
where token is not null | |
group by token | |
order by freq desc | |
) | |
select * from word_frequencies; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment