Created
June 20, 2018 07:49
-
-
Save rolandleth/2dc971edc72f83a4eea6b2db523c529d to your computer and use it in GitHub Desktop.
Extracting and converting your Twitter archive into simpler objects
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Blog post: https://rolandleth.com/extracting-and-parsing-tweets-from-your-twitter-archive
import Foundation | |
/// A tweet reduced to the two fields this script produces for each archive row.
struct Tweet: Codable {
    /// The tweet's text after links and handles have been rewritten.
    let text: String
    /// A shortened timestamp string derived from the row's `timestamp` column.
    let timestamp: String
}
/// The markup flavour used when links and handles are rewritten:
/// Markdown links, HTML anchors, or bare URLs with no markup.
enum Syntax {
    case markdown, html, none
}
/// Markup flavour applied to links and handles in the converted tweets.
let syntax: Syntax = .markdown

/// Detects link-like substrings in a tweet's text.
let dataDetector = try! NSDataDetector(types: NSTextCheckingResult.CheckingType.link.rawValue)

/// Matches `@handle` mentions: an `@` followed by one or more characters that are
/// neither whitespace nor one of the listed punctuation marks.
///
/// The class excludes `\s` rather than only the space character, so a handle that is
/// immediately followed by a newline or tab does not swallow the start of the next line.
let handleRegex = try! NSRegularExpression(pattern: "@[^.,:;?!'\"\\-()\\[\\]{}\\s]+", options: .caseInsensitive)

/// Path of the `tweets.csv` resource in the main bundle.
/// Force-unwrapped: the script cannot do anything useful without the archive.
let file = Bundle.main.path(forResource: "tweets", ofType: "csv")!

/// The archive loaded through `CSV` (declared outside this file); its `rows` are
/// read below as dictionaries keyed by column name.
let csv = try! CSV(name: file)
/// Archive rows that survive filtering: replies, links back to the blog and
/// retweet-like rows are all dropped.
var rawTweets = csv.rows.filter { row in
    // A missing column behaves like an empty one for every check below.
    let body = row["text"] ?? ""
    let links = row["expanded_urls"] ?? ""

    // Replies: flagged by the archive, or the text opens with a mention.
    let repliesToSomeone = !(row["in_reply_to_status_id"] ?? "").isEmpty
        || body.hasPrefix("@")
    if repliesToSomeone { return false }

    // Tweets that link to the blog itself.
    if links.contains("rolandleth.com") { return false }

    // Retweet-like rows: flagged by the archive, linking to twitter.com or favd.net,
    // carrying a manual retweet/quote marker, or consisting of a lone ".".
    let quoteMarkers = ["via @", "RT @", "\"@", "“@"]
    let looksLikeRetweet = !(row["retweeted_status_user_id"] ?? "").isEmpty
        || links.contains("https://twitter.com")
        || links.contains("favd.net")
        || quoteMarkers.contains(where: { body.contains($0) })
        || body == "."

    return !looksLikeRetweet
}
/// The filtered archive rows converted into `Tweet` values.
///
/// For every row:
/// 1. In Markdown mode, newlines become hard line breaks.
/// 2. Each link found by `dataDetector` is replaced with markup for its expanded URL.
///    Links are paired with the `expanded_urls` column in order of appearance
///    (assumes the column lists URLs in the order they occur in the text).
/// 3. Each `@handle` found by `handleRegex` is replaced with a link to the Twitter profile.
/// 4. The timestamp is compacted and cut to its first 15 characters.
///
/// Links and handles are both located on the same, not yet rewritten text, and all
/// replacements are applied from the end of the string backwards so that earlier ranges
/// stay valid. Handles that overlap a detected link (an `@` inside a URL) are left
/// alone, so they cannot corrupt the link markup.
let tweets = rawTweets.map { rawTweet -> Tweet in
    guard var text = rawTweet["text"], let rawTimestamp = rawTweet["timestamp"] else {
        fatalError("Archive row is missing its `text` or `timestamp` column: \(rawTweet)")
    }

    if syntax == .markdown {
        // A Markdown hard line break needs at least two trailing spaces.
        text = text.replacingOccurrences(of: "\n", with: "  \n")
    }

    /// Renders `url` in the configured syntax, using `label` as the visible text.
    func link(_ label: String, to url: String) -> String {
        switch syntax {
        case .markdown: return "[\(label)](\(url))"
        case .html: return "<a href=\"\(url)\">\(label)</a>"
        case .none: return url
        }
    }

    let nsText = text as NSString
    let textRange = NSRange(location: 0, length: nsText.length)

    // An empty column would otherwise yield [""], i.e. one bogus, empty URL.
    let expandedURLs = (rawTweet["expanded_urls"] ?? "")
        .components(separatedBy: ",")
        .filter { !$0.isEmpty }

    var replacements: [(range: NSRange, markup: String)] = []

    // MARK: Links

    let linkMatches = dataDetector.matches(in: text, options: [], range: textRange)
    var nextExpandedURL = 0

    for match in linkMatches {
        var url = nsText.substring(with: match.range)

        if linkMatches.count > expandedURLs.count, !url.hasPrefix("http") {
            // More links in the text than expanded URLs, and this one has no scheme:
            // treat it as a link typed by hand that has no expanded counterpart.
            url = "http://" + url
        }
        else if nextExpandedURL < expandedURLs.count {
            url = expandedURLs[nextExpandedURL]
            nextExpandedURL += 1
        }
        // Otherwise the expanded URLs are exhausted: keep the detected URL as it is.

        let urlName = url
            .replacingOccurrences(of: "http://", with: "")
            .replacingOccurrences(of: "https://", with: "")

        replacements.append((range: match.range, markup: link(urlName, to: url)))
    }

    // MARK: Handles

    for match in handleRegex.matches(in: text, options: [], range: textRange) {
        let isInsideLink = linkMatches.contains(where: {
            NSIntersectionRange($0.range, match.range).length > 0
        })
        if isInsideLink { continue }

        // Drop the leading "@" to get the account name.
        let accountRange = NSRange(location: match.range.location + 1, length: match.range.length - 1)
        let account = nsText.substring(with: accountRange)

        replacements.append((
            range: match.range,
            markup: link("@\(account)", to: "https://twitter.com/\(account)")
        ))
    }

    // MARK: Apply

    // The ranges never overlap, so replacing back-to-front keeps every pending range valid.
    let rewritten = NSMutableString(string: text)
    for replacement in replacements.sorted(by: { $0.range.location > $1.range.location }) {
        rewritten.replaceCharacters(in: replacement.range, with: replacement.markup)
    }
    text = rewritten as String

    // MARK: Timestamp

    // Spaces become dashes, colons are removed, and only the first 15 characters are kept
    // (presumably yielding `yyyy-MM-dd-HHmm` for a `yyyy-MM-dd HH:mm:ss Z` input; the
    // archive's format is not visible here). `prefix` tolerates shorter input.
    let time = String(
        rawTimestamp
            .replacingOccurrences(of: " ", with: "-")
            .replacingOccurrences(of: ":", with: "")
            .prefix(15)
    )

    return Tweet(text: text, timestamp: time)
}
// Blog post: https://rolandleth.com/extracting-and-parsing-tweets-from-your-twitter-archive
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment