zhibor · July 24, 2024 22:03
diff --git a/word_frequency.sql b/word_frequency.sql
 -- Word-frequency report over tweets.text (DuckDB dialect).
 -- Produces one row per distinct word (token, freq), most frequent first.
 -- Pipeline: clean text -> split on spaces -> normalise -> drop stop words -> count.
 with cleaned_docs as (
    -- Turn line breaks into separators and strip links before tokenising.
    -- DuckDB's plain '...' literals hand backslashes to the regex engine as-is, so:
    --   '\\n' matches the literal two-character sequence backslash + n
    --         (what the original pattern matched; kept for escaped exports),
    --   '\s'  matches real whitespace (newline, tab, ...), so words on
    --         adjacent lines are not glued together later,
    --   '\S+' consumes the rest of the link up to the next whitespace.
    -- The previous class [^\\s] excluded backslash and the letter "s" instead of
    -- whitespace, so it swallowed text after the link up to the next "s".
    select
        tweet_id,
        regexp_replace(
            regexp_replace(text, '\\n|\s', ' ', 'g'),
            'https\S+', '', 'g'
        ) as cleaned_text
    from tweets
    where text is not null
 ),

 tokenized_docs as (
    -- One row per space-separated token; consecutive spaces yield empty
    -- tokens, which the length filter below removes.
    select
        tweet_id,
        unnest(string_split(cleaned_text, ' ')) as token
    from cleaned_docs
 ),

 stop_words as (
    -- English list from the stopwords-iso JSON file (fetched over HTTP at query time).
    -- NOTE(review): entries are presumably lower-case and may contain
    -- apostrophes (e.g. "don't") -- confirm against the file.
    select
        unnest(en) as word
    from 'https://raw.githubusercontent.com/stopwords-iso/stopwords-iso/master/python/stopwordsiso/stopwords-iso.json'
 ),

 normalized_tokens as (
    -- Normalise BEFORE filtering so "The", "the," and "the" are treated alike.
    select
        tweet_id,
        -- lower-cased, non-letters trimmed from both ends only: keeps inner
        -- apostrophes so contractions can match the stop-word list verbatim
        regexp_replace(lower(token), '^[^a-z]+|[^a-z]+$', '', 'g') as trimmed_token,
        -- lower-cased, letters a-z only: the form that is counted
        regexp_replace(lower(token), '[^a-z]', '', 'g') as token
    from tokenized_docs
 ),

 filtered_tokens as (
    select
        nt.tweet_id,
        nt.token
    from normalized_tokens as nt
    where
        -- applied to the normalised token so empty strings and single letters
        -- left over after stripping punctuation/digits are dropped (also drops NULL)
        length(nt.token) > 1
        -- NOT EXISTS instead of NOT IN: immune to a NULL entry in the list
        and not exists (
            select 1 from stop_words as sw where sw.word = nt.trimmed_token
        )
        and not exists (
            select 1 from stop_words as sw where sw.word = nt.token
        )
 ),

 word_frequencies as (
    select
        token,
        count(*) as freq
    from filtered_tokens
    group by token
 )

 -- Sort in the outermost query (row order inside a CTE is not guaranteed to
 -- survive); token is a tie-breaker so the output is deterministic.
 select
    token,
    freq
 from word_frequencies
 order by
    freq desc,
    token;
	-- Word-frequency report over tweets.text (DuckDB dialect).
	-- Produces one row per distinct word (token, freq), most frequent first.
	-- Pipeline: clean text -> split on spaces -> normalise -> drop stop words -> count.
	with cleaned_docs as (
		-- Turn line breaks into separators and strip links before tokenising.
		-- DuckDB's plain '...' literals hand backslashes to the regex engine as-is, so:
		--   '\\n' matches the literal two-character sequence backslash + n
		--         (what the original pattern matched; kept for escaped exports),
		--   '\s'  matches real whitespace (newline, tab, ...), so words on
		--         adjacent lines are not glued together later,
		--   '\S+' consumes the rest of the link up to the next whitespace.
		-- The previous class [^\\s] excluded backslash and the letter "s" instead of
		-- whitespace, so it swallowed text after the link up to the next "s".
		select
			tweet_id,
			regexp_replace(
				regexp_replace(text, '\\n|\s', ' ', 'g'),
				'https\S+', '', 'g'
			) as cleaned_text
		from tweets
		where text is not null
	),

	tokenized_docs as (
		-- One row per space-separated token; consecutive spaces yield empty
		-- tokens, which the length filter below removes.
		select
			tweet_id,
			unnest(string_split(cleaned_text, ' ')) as token
		from cleaned_docs
	),

	stop_words as (
		-- English list from the stopwords-iso JSON file (fetched over HTTP at query time).
		-- NOTE(review): entries are presumably lower-case and may contain
		-- apostrophes (e.g. "don't") -- confirm against the file.
		select
			unnest(en) as word
		from 'https://raw.githubusercontent.com/stopwords-iso/stopwords-iso/master/python/stopwordsiso/stopwords-iso.json'
	),

	normalized_tokens as (
		-- Normalise BEFORE filtering so "The", "the," and "the" are treated alike.
		select
			tweet_id,
			-- lower-cased, non-letters trimmed from both ends only: keeps inner
			-- apostrophes so contractions can match the stop-word list verbatim
			regexp_replace(lower(token), '^[^a-z]+|[^a-z]+$', '', 'g') as trimmed_token,
			-- lower-cased, letters a-z only: the form that is counted
			regexp_replace(lower(token), '[^a-z]', '', 'g') as token
		from tokenized_docs
	),

	filtered_tokens as (
		select
			nt.tweet_id,
			nt.token
		from normalized_tokens as nt
		where
			-- applied to the normalised token so empty strings and single letters
			-- left over after stripping punctuation/digits are dropped (also drops NULL)
			length(nt.token) > 1
			-- NOT EXISTS instead of NOT IN: immune to a NULL entry in the list
			and not exists (
				select 1 from stop_words as sw where sw.word = nt.trimmed_token
			)
			and not exists (
				select 1 from stop_words as sw where sw.word = nt.token
			)
	),

	word_frequencies as (
		select
			token,
			count(*) as freq
		from filtered_tokens
		group by token
	)

	-- Sort in the outermost query (row order inside a CTE is not guaranteed to
	-- survive); token is a tie-breaker so the output is deterministic.
	select
		token,
		freq
	from word_frequencies
	order by
		freq desc,
		token;