Skip to content

Instantly share code, notes, and snippets.

@githoov
Last active October 17, 2016 17:36
Show Gist options
  • Save githoov/d282e8f96742ddb302d2aaac36e50904 to your computer and use it in GitHub Desktop.
Save githoov/d282e8f96742ddb302d2aaac36e50904 to your computer and use it in GitHub Desktop.
Tweet Similarity
/*
this is a basic first pass at tweet similarity
using a modified cosine-similarity approach. the
modification in question weights words based on their
rarity across the corpus. that means that more common
words that aren't quite stop words get a lower weight
than less common words—the method is called inverse
document frequency.
*/
-- used to split tweet into array
-- Snowflake JavaScript UDF: tokenizes a tweet into a lowercased word array.
-- Pipeline: strip HTML entities (&amp; etc.) -> strip @mentions -> strip
-- punctuation -> strip URLs -> lowercase -> trim -> split on whitespace.
-- NOTE: backslashes are doubled (e.g. \\w, \\S, \\s) because Snowflake
-- treats \ as an escape inside single-quoted strings, so the JS engine
-- receives the intended single-backslash regex classes.
-- NOTE(review): the first regex uses a single backslash in \& while the
-- others double theirs -- presumably intentional (escaping & for the SQL
-- layer), but confirm the entity pattern reaches JS as /&\w+;/.
create or replace function split_text(TEXT string)
returns variant
language javascript
as '
var words = TEXT
.replace(/\&\\w+;/g, " ")
.replace(/@[\\S]+/g, "")
.replace(/[\-\"\?\,\.\!\@\$\%\^\&\*]+|_/g, "")
.replace(/http\\S+/g, "")
.toLowerCase()
.trim()
.split(/\\s+/g);
return words;
';
-- used to get occurrences of each word in tweet
-- Snowflake JavaScript UDF: given an array of tokens, returns an object
-- mapping each token to the number of times it appears in the array.
create or replace function word_count(WORDS variant)
returns variant
language javascript
as '
var counts = {};
WORDS.forEach(function (word) {
counts[word] = (counts[word] || 0) + 1;
});
return counts;
';
-- used for rarity calculation later on
-- Total number of tweets in the corpus: the N in idf = ln(N / doc_freq).
-- `or replace` added so the script is rerunnable, consistent with every
-- other create statement in this file (bare `create table` fails on rerun).
create or replace table total_tweets
as
select count(*) as total_documents -- count(*) is the idiomatic row count
from twitter.data.tweets_1p;
-- build the corpus
-- Per-word statistics over English tweets: total occurrences, the number of
-- distinct tweets containing the word ("rarity" = document frequency), and
-- idf = ln(total_tweets / document_frequency). Words appearing only once
-- overall are dropped (having clause). The comma + `lateral flatten` form
-- is Snowflake's idiom for exploding the split_text array: each array
-- element surfaces as `value`.
-- NOTE(review): `not in (select this from stop_words)` returns zero rows if
-- stop_words.this can ever be NULL -- confirm the table is NOT NULL, or
-- prefer `not exists`.
create or replace table corpus
as
select value::string as word
, count(*) as occurrences
, count(distinct tweet:id) as rarity
, ln(((select total_documents from total_tweets)/count(distinct tweet:id))) as idf --inverse document frequency
from twitter.data.tweets_1p,
lateral flatten(split_text(tweet:text::string))
where tweet:lang::string = 'en'
and value::string not in (select this from stop_words)
group by 1
having count(*) > 1;
-- create sparse bag-of-words vector for each tweet
-- One row per (tweet, word): the word's in-tweet count plus its corpus-wide
-- idf weight. Flattening word_count's {word: count} object yields
-- key = word, value = count (unlike corpus, where flattening an ARRAY puts
-- the word itself in `value`).
create or replace table bags
as
with words_in_tweets as (
select tweet_id
, key     -- the word
, value   -- its occurrence count within the tweet
from labeled_tweets,
lateral flatten(word_count(tokens))
where key not in (select this from stop_words)
)
select words_in_tweets.*
, corpus.idf
from words_in_tweets
left join corpus
-- BUG FIX: the word lives in `key`; `value` is the count. The original
-- joined `value = corpus.word`, comparing counts to word text, which left
-- idf NULL on essentially every row.
on words_in_tweets.key = corpus.word;
-- NOTE(review): words filtered out of corpus (having count(*) > 1, or
-- non-English-only corpus) still get NULL idf here, and NULL propagates
-- through the cosine calculation -- consider coalesce(idf, 0) or an inner
-- join depending on the desired semantics.
-- calculate cosine similarity with inverse document frequency weighting
-- Each tweet vector is scaled componentwise by the word's idf before taking
-- dot(v1, v2) / (|v1| * |v2|). Because the join aligns rows on the same
-- word, v1.idf = v2.idf on every row; each side now uses its own idf so the
-- formula reads as written (the original used power(v1.idf, 2) in the v2
-- norm -- numerically identical, but misleading).
select v1.tweet_id as tweet_id_v1
, v2.tweet_id as tweet_id_v2
, sum(v1.value * v2.value * v1.idf * v2.idf)
/ (sqrt(sum(v1.value * v1.value * power(v1.idf, 2)))
* sqrt(sum(v2.value * v2.value * power(v2.idf, 2)))) as cosine
from bags as v1
inner join bags as v2
on v1.key = v2.key               -- only shared words contribute
and v1.tweet_id != v2.tweet_id   -- skip self-similarity
where v1.tweet_id = '<enter_some_tweet_id>' --rather than finding every combination, you should filter for a single tweet and find similar tweets
group by tweet_id_v1, tweet_id_v2; -- group by names, not positions
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment