Last active September 27, 2016 07:46
%%bigquery udf --module extract_tokens
* @param {{tweet_object: string, polarity: float, magnitude: float, syntax:string }} r
* @param function({{type: string, content:string}}) emitFn
function(r, emitFn) {
var tweet = JSON.parse(r.tweet_object);
var tokens = JSON.parse(r.syntax);
if(token.content !== "&amp" && token.content!== "#")
emitFn({type : token.partOfSpeech, content : token.content.toLowerCase()})
%%sql --module words
# Default for $total; a caller can override it with a `total` query argument.
total = 10;
SELECT
  content,
  COUNT(*) AS count
FROM extract_tokens(SELECT * FROM [in-full-gear:Dataset1.debates_tweets]
                    WHERE NOT REGEXP_MATCH(tweet_object, "GoogleJsonResponseException"))
WHERE type = $part_of_speech
GROUP BY content
ORDER BY count DESC
LIMIT $total
import datalab.bigquery as bq
# Bind the `words` query to nouns, overriding its default `total` of 10 with 12.
nouns = bq.Query(
    words,
    udf=extract_tokens,
    part_of_speech="NOUN",
    total=12,
)
%%chart bars --data nouns
# Display options for the bar chart drawn from the `nouns` data.
title: "Most common nouns used in tweets about the debate"
width: 900
height: 600
