Last active
April 6, 2016 23:54
-
-
Save hassanselim0/e5e42dcffbb207d5babbd5af5d2e94e1 to your computer and use it in GitHub Desktop.
Convert FB Message Archive to XML or JSON
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#r Newtonsoft.Json.dll | |
/*
Steps:
- Go to https://www.facebook.com/settings
- Press on "Download a copy"
- You'll get an e-mail, open the link and download the zip file
- Extract "/html/messages.htm"
- Run scriptcs -install Newtonsoft.Json (will create a file and a folder)
- Then run this script

Thankfully it's actually XHTML so we can just parse it as XML
The XHTML looks like this:
<html>
  <head>IGNORED</head>
  <body>
    <div class="nav">IGNORED</div>
    <div class="contents">
      <h1>IGNORED</h1>
      <div> <!-- groups of threads -->
        <div class="thread">
          THREAD_TITLE
          <div class="message">
            <div class="message_header">
              <span class="user">USER_FULL_NAME</span>
              <span class="meta">DATE: dddd, MMMM d, yyyy at h:mmtt UTCzz</span>
            </div>
          </div>
          <p>CONTENT</p>
          <div class="message">
          </div>
          <p></p>
          <!-- more messages -->
        </div>
      </div>
      <div>
        <div class="thread">
        </div>
      </div>
      <!-- more groups of threads -->
    </div>
    <div class="footer">IGNORED</div>
  </body>
</html>
*/
using System.Text.RegularExpressions; | |
using System.Xml.Linq; | |
using Newtonsoft.Json; | |
// Converts Facebook's "messages.htm" archive (XHTML) into "Messages.json".
// Input : messages.htm in the current directory.
// Output: Messages.json - an array of { Title, Messages: [ { User, Date, Content } ] }.

// Read the file and strip the control characters that XDocument refuses to parse.
// TAB (U+0009), LF and CR are legal XML characters and are kept so that message
// text is not altered; U+007F, U+0080 and U+009F are still removed as before.
var xhtmlStr = File.ReadAllText("messages.htm");
var xhtmlStrClean = Regex.Replace(xhtmlStr, @"[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080\u009F]", "");

// Parse the XHTML
var xhtml = XDocument.Parse(xhtmlStrClean);

// Get the groups of threads: take the second child of <body> and skip its first child (the h1)
var partEls = xhtml.Root.Elements("body").Elements().ElementAt(1).Elements().Skip(1);

// Get all the thread divs and flatten them.
// The (string) cast yields null for an element without a class attribute instead of throwing.
var threadEls = partEls.SelectMany(p => p.Elements()).Where(el => (string)el.Attribute("class") == "thread");

// Extract the data into objects (lazily - the work happens when Threads is enumerated)
var Threads = threadEls.Select(threadEl =>
{
    // The thread title is the first bare text node inside the thread div; empty when there is none
    var titleNode = threadEl.Nodes().OfType<XText>().FirstOrDefault();
    var title = titleNode != null ? titleNode.Value : "";

    var messageEls = threadEl.Elements().Where(el => (string)el.Attribute("class") == "message");
    var messages = messageEls.Select(messageEl =>
    {
        var headerEl = messageEl.Element("div");
        var user = headerEl.Elements().First(el => (string)el.Attribute("class") == "user").Value;
        var date = headerEl.Elements().First(el => (string)el.Attribute("class") == "meta").Value;

        // Parse with the invariant culture so the English day/month names and am/pm
        // designator are recognised regardless of the machine's regional settings.
        // NOTE(review): assumes the archive was exported in English, as the " at" handling already does.
        var dateUtc = DateTime.ParseExact(
            date.Replace(" at", ""),
            "dddd, MMMM d, yyyy h:mmtt UTCzz",
            System.Globalization.CultureInfo.InvariantCulture).ToUniversalTime();

        // The message text lives in the element that directly follows the message div;
        // fall back to an empty string when the next node is missing or is not an element.
        var contentEl = messageEl.NextNode as XElement;
        var content = contentEl != null ? contentEl.Value : "";

        return new
        {
            User = user,
            Date = dateUtc,
            Content = content,
        };
    });

    return new
    {
        Title = title,
        Messages = messages,
    };
});

// Serialize to JSON
var jsonStr = JsonConvert.SerializeObject(Threads, Formatting.Indented);

// Save JSON to file
File.WriteAllText("Messages.json", jsonStr);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Steps:
- Go to https://www.facebook.com/settings
- Press on "Download a copy"
- You'll get an e-mail, open the link and download the zip file
- Extract "/html/messages.htm" and run this script

Thankfully it's actually XHTML so we can just parse it as XML
The XHTML looks like this:
<html>
  <head>IGNORED</head>
  <body>
    <div class="nav">IGNORED</div>
    <div class="contents">
      <h1>IGNORED</h1>
      <div> <!-- groups of threads -->
        <div class="thread">
          THREAD_TITLE
          <div class="message">
            <div class="message_header">
              <span class="user">USER_FULL_NAME</span>
              <span class="meta">DATE: dddd, MMMM d, yyyy at h:mmtt UTCzz</span>
            </div>
          </div>
          <p>CONTENT</p>
          <div class="message">
          </div>
          <p></p>
          <!-- more messages -->
        </div>
      </div>
      <div>
        <div class="thread">
        </div>
      </div>
      <!-- more groups of threads -->
    </div>
    <div class="footer">IGNORED</div>
  </body>
</html>
*/
using System.Text.RegularExpressions; | |
using System.Xml.Linq; | |
// Converts Facebook's "messages.htm" archive (XHTML) into "Messages.xml".
// Input : messages.htm in the current directory.
// Output: Messages.xml - <Threads><Thread><Title/><Messages><Message><User/><Date/><Content/>...

// Read the file and strip the control characters that XDocument refuses to parse.
// TAB (U+0009), LF and CR are legal XML characters and are kept so that message
// text is not altered; U+007F, U+0080 and U+009F are still removed as before.
var xhtmlStr = File.ReadAllText("messages.htm");
var xhtmlStrClean = Regex.Replace(xhtmlStr, @"[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080\u009F]", "");

// Parse the XHTML
var xhtml = XDocument.Parse(xhtmlStrClean);

// Get the groups of threads: take the second child of <body> and skip its first child (the h1)
var partEls = xhtml.Root.Elements("body").Elements().ElementAt(1).Elements().Skip(1);

// Get all the thread divs and flatten them.
// The (string) cast yields null for an element without a class attribute instead of throwing.
var threadEls = partEls.SelectMany(p => p.Elements()).Where(el => (string)el.Attribute("class") == "thread");

// Extract the data into objects (lazily - the work happens when Threads is enumerated)
var Threads = threadEls.Select(threadEl =>
{
    // The thread title is the first bare text node inside the thread div; empty when there is none
    var titleNode = threadEl.Nodes().OfType<XText>().FirstOrDefault();
    var title = titleNode != null ? titleNode.Value : "";

    var messageEls = threadEl.Elements().Where(el => (string)el.Attribute("class") == "message");
    var messages = messageEls.Select(messageEl =>
    {
        var headerEl = messageEl.Element("div");
        var user = headerEl.Elements().First(el => (string)el.Attribute("class") == "user").Value;
        var date = headerEl.Elements().First(el => (string)el.Attribute("class") == "meta").Value;

        // Parse with the invariant culture so the English day/month names and am/pm
        // designator are recognised regardless of the machine's regional settings.
        // NOTE(review): assumes the archive was exported in English, as the " at" handling already does.
        var dateUtc = DateTime.ParseExact(
            date.Replace(" at", ""),
            "dddd, MMMM d, yyyy h:mmtt UTCzz",
            System.Globalization.CultureInfo.InvariantCulture).ToUniversalTime();

        // The message text lives in the element that directly follows the message div;
        // fall back to an empty string when the next node is missing or is not an element.
        var contentEl = messageEl.NextNode as XElement;
        var content = contentEl != null ? contentEl.Value : "";

        return new
        {
            User = user,
            Date = dateUtc,
            Content = content,
        };
    });

    return new
    {
        Title = title,
        Messages = messages,
    };
});

// Generate the resulting XML
var xml = new XDocument(
    new XElement("Threads", Threads.Select(t =>
        new XElement("Thread",
            new XElement("Title", t.Title),
            new XElement("Messages", t.Messages.Select(m =>
                new XElement("Message",
                    new XElement("User", m.User),
                    new XElement("Date", m.Date),
                    new XElement("Content", m.Content)
                )
            ))
        )
    ))
);

// Save the XML File
xml.Save("Messages.xml", SaveOptions.None);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment