Skip to content

Instantly share code, notes, and snippets.

@hassanselim0
Last active April 6, 2016 23:54
Show Gist options
  • Save hassanselim0/e5e42dcffbb207d5babbd5af5d2e94e1 to your computer and use it in GitHub Desktop.
Save hassanselim0/e5e42dcffbb207d5babbd5af5d2e94e1 to your computer and use it in GitHub Desktop.
Convert FB Message Archive to XML or JSON
#r Newtonsoft.Json.dll
/*
Steps:
- Go to https://www.facebook.com/settings
- Press on "Download a copy"
- You'll get an e-mail, open the link and download the zip file
- Extract "/html/messages.htm"
- Run scriptcs -install Newtonsoft.Json (will create a file and a folder)
- Then run this script
Thankfully it's actually XHTML so we can just parse it as XML
The XHTML looks like this:
<html>
<head>IGNORED</head>
<body>
<div class="nav">IGNORED</div>
<div class="contents">
<h1>IGNORED</h1>
<div> <!-- groups of threads -->
<div class="thread">
THREAD_TITLE
<div class="message">
<div class="message_header">
<span class="user">USER_FULL_NAME</span>
<span class="meta">DATE: dddd, MMMM d, yyyy at h:mmtt UTCzz</span>
</div>
</div>
<p>CONTENT</p>
<div class="message">
</div>
<p></p>
<!-- more messages -->
</div>
</div>
<div>
<div class="thread">
</div>
</div>
<!-- more groups of threads -->
</div>
<div class="footer">IGNORED</div>
</body>
</html>
*/
using System.Text.RegularExpressions;
using System.Xml.Linq;
using Newtonsoft.Json;
// Converts the Facebook message archive ("messages.htm") into "Messages.json".
//
// Read the file and strip the control characters that are illegal in XML 1.0, because
// XDocument refuses to parse them. TAB (U+0009), LF (U+000A) and CR (U+000D) are legal
// XML characters and are kept so that message text is not altered more than necessary.
// NOTE(review): U+007F, U+0080 and U+009F are still removed exactly as before; possibly the
// range U+007F-U+009F was intended - confirm.
var xhtmlStr = File.ReadAllText("messages.htm");
var xhtmlStrClean = Regex.Replace(xhtmlStr, @"[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080\u009F]", "");

// Parse the XHTML
var xhtml = XDocument.Parse(xhtmlStrClean);

// Get the groups of threads: second child of <body>, skipping its leading h1
var partEls = xhtml.Root.Elements("body").Elements().ElementAt(1).Elements().Skip(1);

// Get all the thread divs and flatten them.
// The (string) conversion yields null instead of throwing when an element has no class attribute.
var threadEls = partEls.SelectMany(p => p.Elements()).Where(el => (string)el.Attribute("class") == "thread");

// Extract the data into objects
var Threads = threadEls.Select(threadEl =>
{
    // The title is the first bare text node of the thread div; fall back to an empty title
    // rather than throwing when a thread has no text node.
    var titleNode = threadEl.Nodes().OfType<XText>().FirstOrDefault();
    var title = titleNode != null ? titleNode.Value : "";

    var messageEls = threadEl.Elements().Where(el => (string)el.Attribute("class") == "message");
    var messages = messageEls.Select(messageEl =>
    {
        var headerEl = messageEl.Element("div");
        var user = headerEl.Elements().First(el => (string)el.Attribute("class") == "user").Value;
        var date = headerEl.Elements().First(el => (string)el.Attribute("class") == "meta").Value;

        // The archive writes English day/month names, so parse with the invariant culture;
        // passing null would use the machine's current culture and fail on non-English systems.
        var dateUtc = DateTime.ParseExact(
            date.Replace(" at", ""),
            "dddd, MMMM d, yyyy h:mmtt UTCzz",
            System.Globalization.CultureInfo.InvariantCulture).ToUniversalTime();

        // The message text is in the element right after the message div.
        // Converting a null XElement to string gives null, so a missing element becomes "".
        var content = (string)(messageEl.NextNode as XElement) ?? "";

        return new
        {
            User = user,
            Date = dateUtc,
            Content = content,
        };
    });

    return new
    {
        Title = title,
        Messages = messages,
    };
});

// Serialize to JSON
var jsonStr = JsonConvert.SerializeObject(Threads, Formatting.Indented);

// Save JSON to file
File.WriteAllText("Messages.json", jsonStr);
/*
Steps:
- Go to https://www.facebook.com/settings
- Press on "Download a copy"
- You'll get an e-mail, open the link and download the zip file
- Extract "/html/messages.htm" and run this script
Thankfully it's actually XHTML so we can just parse it as XML
The XHTML looks like this:
<html>
<head>IGNORED</head>
<body>
<div class="nav">IGNORED</div>
<div class="contents">
<h1>IGNORED</h1>
<div> <!-- groups of threads -->
<div class="thread">
THREAD_TITLE
<div class="message">
<div class="message_header">
<span class="user">USER_FULL_NAME</span>
<span class="meta">DATE: dddd, MMMM d, yyyy at h:mmtt UTCzz</span>
</div>
</div>
<p>CONTENT</p>
<div class="message">
</div>
<p></p>
<!-- more messages -->
</div>
</div>
<div>
<div class="thread">
</div>
</div>
<!-- more groups of threads -->
</div>
<div class="footer">IGNORED</div>
</body>
</html>
*/
using System.Text.RegularExpressions;
using System.Xml.Linq;
// Converts the Facebook message archive ("messages.htm") into "Messages.xml".
//
// Read the file and strip the control characters that are illegal in XML 1.0, because
// XDocument refuses to parse them. TAB (U+0009), LF (U+000A) and CR (U+000D) are legal
// XML characters and are kept so that message text is not altered more than necessary.
// NOTE(review): U+007F, U+0080 and U+009F are still removed exactly as before; possibly the
// range U+007F-U+009F was intended - confirm.
var xhtmlStr = File.ReadAllText("messages.htm");
var xhtmlStrClean = Regex.Replace(xhtmlStr, @"[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080\u009F]", "");

// Parse the XHTML
var xhtml = XDocument.Parse(xhtmlStrClean);

// Get the groups of threads: second child of <body>, skipping its leading h1
var partEls = xhtml.Root.Elements("body").Elements().ElementAt(1).Elements().Skip(1);

// Get all the thread divs and flatten them.
// The (string) conversion yields null instead of throwing when an element has no class attribute.
var threadEls = partEls.SelectMany(p => p.Elements()).Where(el => (string)el.Attribute("class") == "thread");

// Extract the data into objects
var Threads = threadEls.Select(threadEl =>
{
    // The title is the first bare text node of the thread div; fall back to an empty title
    // rather than throwing when a thread has no text node.
    var titleNode = threadEl.Nodes().OfType<XText>().FirstOrDefault();
    var title = titleNode != null ? titleNode.Value : "";

    var messageEls = threadEl.Elements().Where(el => (string)el.Attribute("class") == "message");
    var messages = messageEls.Select(messageEl =>
    {
        var headerEl = messageEl.Element("div");
        var user = headerEl.Elements().First(el => (string)el.Attribute("class") == "user").Value;
        var date = headerEl.Elements().First(el => (string)el.Attribute("class") == "meta").Value;

        // The archive writes English day/month names, so parse with the invariant culture;
        // passing null would use the machine's current culture and fail on non-English systems.
        var dateUtc = DateTime.ParseExact(
            date.Replace(" at", ""),
            "dddd, MMMM d, yyyy h:mmtt UTCzz",
            System.Globalization.CultureInfo.InvariantCulture).ToUniversalTime();

        // The message text is in the element right after the message div.
        // Converting a null XElement to string gives null, so a missing element becomes "".
        var content = (string)(messageEl.NextNode as XElement) ?? "";

        return new
        {
            User = user,
            Date = dateUtc,
            Content = content,
        };
    });

    return new
    {
        Title = title,
        Messages = messages,
    };
});

// Generate the resulting XML: <Threads>/<Thread>/(<Title>, <Messages>/<Message>/(<User>, <Date>, <Content>))
var xml = new XDocument(
    new XElement("Threads", Threads.Select(t =>
        new XElement("Thread",
            new XElement("Title", t.Title),
            new XElement("Messages", t.Messages.Select(m =>
                new XElement("Message",
                    new XElement("User", m.User),
                    new XElement("Date", m.Date),
                    new XElement("Content", m.Content)
                )
            ))
        )
    ))
);

// Save the XML File
xml.Save("Messages.xml", SaveOptions.None);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment