Last active
October 4, 2016 01:20
-
-
Save isaacabraham/636df8e6a691df1c20d1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#I @"..\packages\" | |
#r @"FSharp.Data\lib\net40\FSharp.Data.dll" | |
#load @"FSPlot\FsPlotBootstrap.fsx" | |
#r @"Deedle\lib\net40\Deedle.dll" | |
// Register an F# Interactive printer for Deedle values: anything implementing
// IFsiFormattable is rendered through its Format() method, prefixed with a newline.
fsi.AddPrinter(fun (formattable : Deedle.Internal.IFsiFormattable) ->
    "\n" + formattable.Format())
open Deedle | |
open FSharp.Data | |
open System | |
/// Typed access to the tweet export, provided by FSharp.Data's CsvProvider
/// from the sample file at the given path.
type Twitter = CsvProvider<Sample = @"C:\Users\Isaac\Downloads\fsharp_2013-2014.csv">

/// The contents of the sample file, loaded through the provider.
let data = Twitter.GetSample()
/// Returns `text` with every occurrence of `removeIt` deleted.
let remove (removeIt : string) (text : string) =
    text.Replace(removeIt, String.Empty)
/// Cleans up a handle by deleting every ':' character from it
/// (for example the colon that follows the handle in "RT @user: ...").
let getName (handle : string) = handle |> remove ":"
/// True when `word` begins with "@", i.e. it looks like a Twitter mention.
let isMention (word : string) : bool =
    word.StartsWith("@")
/// Partial active pattern that recognises retweets.
///
/// Matches when the tweet text starts with "RT" and contains at least a second
/// space-separated token. On a match it yields the tweet together with that second
/// token with ':' characters removed (presumably the retweeted user's handle).
///
/// A text that starts with "RT" but has no second token (e.g. just "RT") does not
/// match; indexing element 1 unconditionally would raise IndexOutOfRangeException.
let (|Retweet|_|) (tweet : Twitter.Row) =
    if tweet.Text.StartsWith "RT" then
        match tweet.Text.Split ' ' with
        | tokens when tokens.Length > 1 -> Some(tweet, getName tokens.[1])
        | _ -> None
    else
        None
/// Collects every "@"-prefixed word of the tweet text (split on single spaces),
/// with ':' characters removed, as an array in order of appearance.
let getMentions (tweet : Twitter.Row) =
    [| for word in tweet.Text.Split ' ' do
         if isMention word then
             yield getName word |]
module Seq =
    /// Groups the items of a sequence by `counter`, counts each group and returns
    /// the (key, count) pairs as an array ordered by count, highest first.
    let summarise counter =
        Seq.countBy counter
        >> Seq.sortBy (fun (_, count) -> -count)
        >> Seq.toArray

    /// Builds a Deedle frame from `items`: the columns are named "Key" and "Count",
    /// rows are indexed by the "Key" column and ordered by "Count" descending.
    /// NOTE(review): presumably meant for the (key, count) pairs produced by
    /// `summarise`; "Count" is compared as int - confirm against callers.
    let toFrame (items : seq<_>) =
        items
        |> Frame.ofRecords
        |> Frame.indexColsWith [ "Key"; "Count" ]
        |> Frame.indexRowsString "Key"
        // sortRowsWith needs a genuine comparison of two values; comparing b with a
        // (instead of a with b) yields descending order.
        |> Frame.sortRowsWith "Count" (fun (a : int) (b : int) -> compare b a)
/// Who is mentioned the most:
/// every mention found in any row, counted per handle, most frequent first.
let mentions =
    let allMentions = data.Rows |> Seq.collect getMentions
    Seq.summarise id allMentions
/// Who is retweeted the most:
/// rows recognised by the Retweet pattern, counted by the name it yields
/// (the second component of the pair), most frequent first.
let retweets =
    data.Rows
    |> Seq.choose (|Retweet|_|)
    |> Seq.summarise snd
/// Who tweets the most:
/// rows counted by their FromUserScreenName column, most frequent first.
let tweeters =
    data.Rows
    |> Seq.summarise (fun (row : Twitter.Row) -> row.FromUserScreenName)
/// Words that are ignored when ranking popular words: the empty string, a few
/// Twitter/markup leftovers ("rt", "-", "&", "amp") and common English words.
let stopwords =
    set [
        ""; "rt"; "-"; "&"; "will"; "just"; "amp"
        "a"; "about"; "above"; "after"; "again"; "against"; "all"; "am"; "an"; "and"
        "any"; "are"; "aren't"; "as"; "at"
        "be"; "because"; "been"; "before"; "being"; "below"; "between"; "both"; "but"; "by"
        "can't"; "cannot"; "could"; "couldn't"
        "did"; "didn't"; "do"; "does"; "doesn't"; "doing"; "don't"; "down"; "during"
        "each"; "few"; "for"; "from"; "further"
        "had"; "hadn't"; "has"; "hasn't"; "have"; "haven't"; "having"
        "he"; "he'd"; "he'll"; "he's"; "her"; "here"; "here's"; "hers"; "herself"
        "him"; "himself"; "his"; "how"; "how's"
        "i"; "i'd"; "i'll"; "i'm"; "i've"; "if"; "in"; "into"; "is"; "isn't"
        "it"; "it's"; "its"; "itself"
        "let's"; "me"; "more"; "most"; "mustn't"; "my"; "myself"
        "no"; "nor"; "not"
        "of"; "off"; "on"; "once"; "only"; "or"; "other"; "ought"
        "our"; "ours"; "ourselves"; "out"; "over"; "own"
        "same"; "shan't"; "she"; "she'd"; "she'll"; "she's"; "should"; "shouldn't"
        "so"; "some"; "such"
        "than"; "that"; "that's"; "the"; "their"; "theirs"; "them"; "themselves"; "then"
        "there"; "there's"; "these"; "they"; "they'd"; "they'll"; "they're"; "they've"
        "this"; "those"; "through"; "to"; "too"
        "under"; "until"; "up"; "very"
        "was"; "wasn't"; "we"; "we'd"; "we'll"; "we're"; "we've"; "were"; "weren't"
        "what"; "what's"; "when"; "when's"; "where"; "where's"; "which"; "while"
        "who"; "who's"; "whom"; "why"; "why's"; "with"; "won't"; "would"; "wouldn't"
        "you"; "you'd"; "you'll"; "you're"; "you've"; "your"; "yours"; "yourself"; "yourselves"
    ]
/// True when `word` begins with "#", i.e. it looks like a hashtag.
let isHashtag (word : string) : bool =
    word.StartsWith("#")
/// True when `word` has strictly more than `length` characters.
let isLongerThan (length : int) (word : string) : bool =
    length < word.Length
/// Lower-cased words (text split on single spaces) of every row that is not
/// recognised as a retweet.
let words =
    data.Rows
    |> Seq.filter (function
        | Retweet _ -> false
        | _ -> true)
    |> Seq.collect (fun tweet -> tweet.Text.Split ' ')
    |> Seq.map (fun word -> word.ToLower())
/// Combines predicates: true when `word` satisfies every rule in `rules`
/// (and therefore true when there are no rules). Rules are tried in order and
/// evaluation stops at the first one that fails.
let composeRules rules word =
    rules
    |> Seq.exists (fun rule -> not (rule word))
    |> not
/// Most popular words:
/// words that are not stopwords, mentions or hashtags, consist of letters only and
/// have more than 3 characters, counted and ordered most frequent first.
let popularWords =
    let isAlphabetic (word : string) =
        word.ToCharArray() |> Array.forall Char.IsLetter
    let isPopularWord =
        composeRules
            [ (fun word -> not (stopwords.Contains word))
              isMention >> not
              isHashtag >> not
              isAlphabetic
              isLongerThan 3 ]
    words
    |> Seq.filter isPopularWord
    |> Seq.summarise id
/// Most popular hashtags:
/// each word is reduced to its letters and '#' characters; what remains is kept
/// when it starts with "#" and has more than 1 character, then counted and ordered
/// most frequent first.
let hashtags =
    // Drop everything except letters and '#' (punctuation, digits, ...).
    let keepTagChars (word : string) =
        let kept =
            word.ToCharArray()
            |> Array.filter (fun c -> c = '#' || Char.IsLetter c)
        String(kept)
    let isValidHashTag = composeRules [ isHashtag; isLongerThan 1 ]
    words
    |> Seq.map keepTagChars
    |> Seq.filter isValidHashTag
    |> Seq.summarise id
open FsPlot.Highcharts.Charting | |
// Plot the top 15 entries of each ranking. Seq.truncate is used instead of Seq.take
// so a ranking with fewer than 15 entries is plotted as-is rather than throwing.
mentions |> Seq.truncate 15 |> Chart.Column |> Chart.WithTitle "Mentions"
retweets |> Seq.truncate 15 |> Chart.Pie |> Chart.WithTitle "Retweets"
popularWords |> Seq.truncate 15 |> Chart.Column |> Chart.WithTitle "Words"
// The top-ranked hashtag is left out of this chart. Seq.skip throws when asked to
// skip more items than exist, so skip at most what the ranking actually contains.
hashtags
|> Seq.skip (min 1 (Seq.length hashtags))
|> Seq.truncate 15
|> Chart.Pie
|> Chart.WithTitle "Hashtags"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment