Last active
October 17, 2016 17:36
-
-
Save githoov/d282e8f96742ddb302d2aaac36e50904 to your computer and use it in GitHub Desktop.
Tweet Similarity
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
this is a basic first pass at tweet similarity
using a modified cosine-similarity approach. the
modification in question weights words based on their
rarity across the corpus. that means that more common
words that aren't quite stop words get a lower weight
than less common words — the method is called inverse
document frequency (idf).
*/
-- split a raw tweet into an array of lowercase tokens
create or replace function split_text(TEXT string)
returns variant
language javascript
as '
    // Strip HTML entities, @mentions, punctuation, and URLs, then
    // lowercase and split on whitespace.
    var words = TEXT
        .replace(/\&\\w+;/g, " ")                       // HTML entities, e.g. &amp;
        .replace(/@[\\S]+/g, "")                        // @mentions
        .replace(/[\-\"\?\,\.\!\@\$\%\^\&\*]+|_/g, "")  // punctuation
        .replace(/http\\S+/g, "")                       // URLs
        .toLowerCase()
        .trim()
        .split(/\\s+/g)
        .filter(function (w) { return w.length > 0; }); // "".split(...) yields [""]; drop empty tokens
    return words;
';
-- build a {word: count} histogram from an array of tokens
create or replace function word_count(WORDS variant)
returns variant
language javascript
as '
    // Fold the token array into an object keyed by word, with
    // each value holding that word''s number of occurrences.
    var counts = {};
    WORDS.forEach(function (word) {
        counts[word] = (counts[word] || 0) + 1;
    });
    return counts;
';
-- total number of tweets; the denominator of the idf (rarity) calculation below
-- (create or replace keeps this idempotent, consistent with the other statements)
create or replace table total_tweets
as
select count(1) as total_documents
from twitter.data.tweets_1p;
-- build the corpus: one row per word with occurrence stats and idf weight
create or replace table corpus
as
select value::string as word
     , count(*) as occurrences                  -- total appearances across all tweets
     , count(distinct tweet:id) as rarity       -- number of distinct tweets containing the word
     , ln(((select total_documents from total_tweets)
           / count(distinct tweet:id))) as idf  -- inverse document frequency
from twitter.data.tweets_1p,
     lateral flatten(split_text(tweet:text::string))
where tweet:lang::string = 'en'
  -- guard the "not in + null" trap: a single null stop word would otherwise
  -- make the predicate unknown for every row and empty the corpus
  and value::string not in (select this from stop_words where this is not null)
group by 1
having count(*) > 1;  -- drop words that appear only once in the corpus
-- sparse bag-of-words vector for each tweet: one row per (tweet, word, count)
create or replace table bags
as
with words_in_tweets as (
    -- explode each tweet's {word: count} histogram; key = word, value = count
    select tweet_id
         , key
         , value
    from labeled_tweets,
         lateral flatten(word_count(tokens))
    -- null-safe stop-word filter (avoids the "not in + null" trap)
    where key not in (select this from stop_words where this is not null)
)
select words_in_tweets.*
     -- bug fix: the join must match the WORD (key), not the count (value);
     -- joining on value never matched, leaving idf null for every row.
     -- words missing from corpus (e.g. filtered by having count(*) > 1)
     -- get zero weight instead of null so downstream sums stay defined.
     , coalesce(corpus.idf, 0) as idf
from words_in_tweets
left join corpus
    on words_in_tweets.key = corpus.word;
-- idf-weighted cosine similarity between the anchor tweet and every other tweet.
-- numerator: dot product over shared words; denominator: product of vector norms.
-- NOTE(review): the norms are computed over shared words only (rows surviving the
-- inner join), not each tweet's full vector, so similarities are inflated relative
-- to a true cosine — acceptable for ranking, but confirm this is intended.
select v1.tweet_id as tweet_id_v1
     , v2.tweet_id as tweet_id_v2
     , sum(v1.value * v2.value * v1.idf * v2.idf)
       / (sqrt(sum(v1.value * v1.value * power(v1.idf, 2)))
          -- bug fix: v2's norm previously squared v1.idf; harmless while the join
          -- guarantees v1.idf = v2.idf per word, but wrong symbol nonetheless
          * sqrt(sum(v2.value * v2.value * power(v2.idf, 2)))) as cosine
from bags as v1
inner join bags as v2
    on v1.key = v2.key                       -- pair tweets on shared words
   and v1.tweet_id != v2.tweet_id            -- never compare a tweet to itself
where v1.tweet_id = '<enter_some_tweet_id>'  -- anchor on one tweet instead of computing all pairs
group by tweet_id_v1, tweet_id_v2;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment