Last active
October 17, 2016 17:36
-
-
Save githoov/d282e8f96742ddb302d2aaac36e50904 to your computer and use it in GitHub Desktop.
Tweet Similarity
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
this is a basic first pass at tweet similarity
using a modified cosine-similarity approach. the
modification in question weights words based on their
rarity across the corpus. that means that more common
words that aren't quite stop words get a lower weight
than less common words — the method is called inverse
document frequency (idf).
*/
-- split a raw tweet into an array of lowercase tokens
create or replace function split_text(TEXT string)
returns variant
language javascript
as '
    // Strip HTML entities, @mentions, punctuation, and URLs, then
    // lowercase and split on whitespace.
    var words = TEXT
        .replace(/\&\\w+;/g, " ")                       // HTML entities, e.g. &amp;
        .replace(/@[\\S]+/g, "")                        // @mentions
        .replace(/[\-\"\?\,\.\!\@\$\%\^\&\*]+|_/g, "")  // punctuation
        .replace(/http\\S+/g, "")                       // URLs
        .toLowerCase()
        .trim()
        .split(/\\s+/g)
        .filter(function (w) { return w.length > 0; }); // "".split(...) yields [""]; drop empty tokens
    return words;
';
-- build a {word: count} histogram from an array of tokens
create or replace function word_count(WORDS variant)
returns variant
language javascript
as '
    // Fold the token array into an object keyed by word, with
    // each value holding that word''s number of occurrences.
    var counts = {};
    WORDS.forEach(function (word) {
        counts[word] = (counts[word] || 0) + 1;
    });
    return counts;
';
-- total number of tweets; the denominator of the idf (rarity) calculation below
-- (create or replace keeps this idempotent, consistent with the other statements)
create or replace table total_tweets
as
select count(1) as total_documents
from twitter.data.tweets_1p;
-- build the corpus: one row per word with occurrence stats and idf weight
create or replace table corpus
as
select value::string as word
     , count(*) as occurrences                  -- total appearances across all tweets
     , count(distinct tweet:id) as rarity       -- number of distinct tweets containing the word
     , ln(((select total_documents from total_tweets)
           / count(distinct tweet:id))) as idf  -- inverse document frequency
from twitter.data.tweets_1p,
     lateral flatten(split_text(tweet:text::string))
where tweet:lang::string = 'en'
  -- guard the "not in + null" trap: a single null stop word would otherwise
  -- make the predicate unknown for every row and empty the corpus
  and value::string not in (select this from stop_words where this is not null)
group by 1
having count(*) > 1;  -- drop words that appear only once in the corpus
-- sparse bag-of-words vector for each tweet: one row per (tweet, word, count)
create or replace table bags
as
with words_in_tweets as (
    -- explode each tweet's {word: count} histogram; key = word, value = count
    select tweet_id
         , key
         , value
    from labeled_tweets,
         lateral flatten(word_count(tokens))
    -- null-safe stop-word filter (avoids the "not in + null" trap)
    where key not in (select this from stop_words where this is not null)
)
select words_in_tweets.*
     -- bug fix: the join must match the WORD (key), not the count (value);
     -- joining on value never matched, leaving idf null for every row.
     -- words missing from corpus (e.g. filtered by having count(*) > 1)
     -- get zero weight instead of null so downstream sums stay defined.
     , coalesce(corpus.idf, 0) as idf
from words_in_tweets
left join corpus
    on words_in_tweets.key = corpus.word;
-- idf-weighted cosine similarity between the anchor tweet and every other tweet.
-- numerator: dot product over shared words; denominator: product of vector norms.
-- NOTE(review): the norms are computed over shared words only (rows surviving the
-- inner join), not each tweet's full vector, so similarities are inflated relative
-- to a true cosine — acceptable for ranking, but confirm this is intended.
select v1.tweet_id as tweet_id_v1
     , v2.tweet_id as tweet_id_v2
     , sum(v1.value * v2.value * v1.idf * v2.idf)
       / (sqrt(sum(v1.value * v1.value * power(v1.idf, 2)))
          -- bug fix: v2's norm previously squared v1.idf; harmless while the join
          -- guarantees v1.idf = v2.idf per word, but wrong symbol nonetheless
          * sqrt(sum(v2.value * v2.value * power(v2.idf, 2)))) as cosine
from bags as v1
inner join bags as v2
    on v1.key = v2.key                       -- pair tweets on shared words
   and v1.tweet_id != v2.tweet_id            -- never compare a tweet to itself
where v1.tweet_id = '<enter_some_tweet_id>'  -- anchor on one tweet instead of computing all pairs
group by tweet_id_v1, tweet_id_v2;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment