-
-
Save davidglassborow/5ee3fb8bd2233b668957 to your computer and use it in GitHub Desktop.
A simple F# script that extracts words from tweets, given a #tag and a date range. Use this script in order to extract words from tweets, given a #hashtag and a date range. Excellent for creating e.g. word clouds! NB You also have to get a copy of the stopwords.txt file (or create your own). Disclaimer: This is not a finished product. There's ro…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#I @"packages\FSharp.Data.Toolbox.Twitter.0.6\lib\net40" | |
#I @"packages\FSharp.Data.2.1.1\lib\net40" | |
#r @".\packages\FSharp.Data.Toolbox.Twitter.0.6\lib\net40\FSharp.Data.Toolbox.Twitter.dll" | |
#r @".\packages\FSharp.Data.2.1.1\lib\net40\FSharp.Data.dll" | |
open FSharp.Data.Toolbox.Twitter | |
// Twitter application credentials; see https://apps.twitter.com for how to obtain a key & secret.
let key = "" // Insert key here!
let secret = "" // Insert secret here!

// Authenticated Twitter context built from the credentials above; the search calls
// further down in this script go through it.
let twitter = Twitter.AuthenticateAppOnly(key, secret)
open System | |
open System.IO | |
/// Stop words, one per line, loaded from StopWords.txt located in the same directory as this script.
let stopWords =
    // Path.Combine uses the platform's directory separator instead of a hard-coded backslash.
    File.ReadAllLines(Path.Combine(__SOURCE_DIRECTORY__, "StopWords.txt"))
    // Tweet tokens are lower-cased before they are compared against this list, so
    // normalise the entries the same way; otherwise "The" or "the " could never match.
    |> Array.map (fun line -> line.Trim().ToLowerInvariant())
/// Splits `text` into tokens on a fixed set of punctuation and whitespace characters.
/// Empty tokens (produced by adjacent separators) are dropped.
let words (text: string) =
    let separators =
        [| '#'; '.'; ','; ';'; ':'; '!'; '?'; '`'; ' '; '\r'; '\n'; '"'; '\''; '“'; '”'; '('; ')'; '+'; '-' |]
    text.Split(separators, StringSplitOptions.RemoveEmptyEntries)
/// Collects the interesting words from every tweet returned by the hard-coded
/// "#ndcoslo since:2015-06-15" search, requesting one page of up to 100 tweets at a time.
///
/// `id`: `Some n` passes `n` as `maxId` on the first request; `None` sends the first
/// request without a `maxId`. Each following request uses the smallest tweet id of the
/// previous page minus one, and paging stops at the first page with no statuses.
///
/// Side effects per non-empty page: the page's words are written to
/// C:/temp/ndcoslo<ticks>.txt, and the file path plus the oldest tweet's creation time
/// and text are printed.
///
/// Returns the words of all pages as one list, in the order the pages were fetched.
let getWords id =
    // Search query shared by every page request.
    let query = "#ndcoslo since:2015-06-15"

    // Tokens that are never wanted: the hashtag itself plus fragments left over once
    // links, contractions and abbreviations have been split on punctuation.
    let noiseWords =
        set [ "ndcoslo"; "rt"; "ht"; "co"; "cc"; "yo"; "bit"; "didn"; "don"; "isn"; "htt"; "&" ]

    // Built once so each stop-word lookup no longer scans the whole array.
    let stopWordSet = Set.ofSeq stopWords

    let startsWithAny (prefixes: string list) (word: string) =
        prefixes |> List.exists (fun prefix -> word.StartsWith(prefix, StringComparison.Ordinal))

    let isNumber (word: string) =
        let parsed, _ = Int32.TryParse(word)
        parsed

    // A token is kept only if it passes every one of these checks.
    let isInteresting (word: string) =
        word.Length > 2
        && not (noiseWords.Contains word)
        // Link remnants ("http...", "co/...", "//t...") and @mentions.
        && not (startsWithAny [ "http"; "co/"; "//t"; "@" ] word)
        // Tokens ending in an ellipsis or a percent sign.
        && not (word.EndsWith("…", StringComparison.Ordinal))
        && not (word.EndsWith("%", StringComparison.Ordinal))
        && not (isNumber word)
        && not (stopWordSet.Contains word)

    let rec fetchFrom cursor =
        let ndc =
            match cursor with
            | Some before -> twitter.Search.Tweets(query, count=100, maxId=before)
            | None -> twitter.Search.Tweets(query, count=100)
        let statuses = ndc.Statuses
        if Seq.isEmpty statuses then
            []
        else
            let pageWords =
                statuses
                // Skip retweets, whether flagged by the API or written by hand as "RT ...".
                |> Seq.filter (fun tweet -> not tweet.RetweetedStatus.IsSome)
                |> Seq.filter (fun tweet -> not (tweet.Text.StartsWith("RT", StringComparison.Ordinal)))
                |> Seq.map (fun tweet -> tweet.Text)
                |> String.concat " "
                |> words
                // Invariant casing keeps the result independent of the machine's culture.
                |> Seq.map (fun token -> token.ToLowerInvariant())
                |> Seq.filter isInteresting
                |> Seq.toList
            let path = String.Format("C:/temp/ndcoslo{0}.txt", DateTime.Now.Ticks)
            printfn "%s" path
            // Make sure the target directory exists; this is a no-op when it already does.
            Directory.CreateDirectory(Path.GetDirectoryName path) |> ignore
            File.WriteAllText(path, pageWords |> String.concat " ")
            let oldestTweet = statuses |> Seq.minBy (fun status -> status.Id)
            printfn "%A" oldestTweet.CreatedAt
            printfn "%A" oldestTweet.Text
            // Continue strictly below the oldest id seen so no tweet is fetched twice.
            pageWords @ fetchFrom (Some (oldestTweet.Id - 1L))

    fetchFrom id
// Run the full download, starting with a request that carries no maxId.
let allWords = getWords None

// Every collected word joined into one space-separated string, saved as a single file.
let ndcoslo = String.concat " " allWords
File.WriteAllText(@"C:\temp\ndcoslo.txt", ndcoslo)
/// (word, count) pairs for the most frequent words in `allWords`, highest count first,
/// limited to at most 50 entries.
let myCounts =
    allWords
    |> Seq.countBy id
    |> Seq.sortBy (fun (_, count) -> -count)
    // Seq.truncate, unlike Seq.take, does not throw when fewer than 50 distinct words exist.
    |> Seq.truncate 50
    |> Seq.toArray
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment