hassanselim0 · April 6, 2016 23:54
diff --git a/FbMsgArchiveToJson.csx b/FbMsgArchiveToJson.csx
 #r Newtonsoft.Json.dll

 /*
    Steps:
    - Go to https://www.facebook.com/settings
    - Press on "Download a copy"
    - You'll get an e-mail, open the link and download the zip file
    - Extract "/html/messages.htm"
    - Run scriptcs -install Newtonsoft.Json (will create a file and a folder)
    - Then run this script

    Thankfully it's actually XHTML so we can just parse it as XML
    The XHTML looks like this:
    <html>
        <head>IGNORED</head>
        <body>
            <div class="nav">IGNORED</div>
            <div class="contents">
                <h1>IGNORED</h1>
                <div> <!-- groups of threads -->
                    <div class="thread">
                        THREAD_TITLE
                        <div class="message">
                            <div class="message_header">
                                <span class="user">USER_FULL_NAME</span>
                                <span class="meta">DATE: dddd, MMMM d, yyyy at h:mmtt UTCzz</span>
                            </div>
                        </div>
                        <p>CONTENT</p>
                        
                        <div class="message">
                        </div>
                        <p></p>

                        <!-- more messages -->
                    </div>
                </div>

                <div>
                    <div class="thread">
                    </div>
                </div>

                <!-- more groups of threads -->
            </div>
            <div class="footer">IGNORED</div>
        </body>
    </html>
 */

 using System.Text.RegularExpressions;
 using System.Xml.Linq;
 using Newtonsoft.Json;

 // Read the archive and strip the C0 control characters that XML 1.0 forbids,
 // because XDocument.Parse rejects them. Tab (U+0009), line feed (U+000A) and
 // carriage return (U+000D) are legal XML whitespace and are kept: removing a
 // tab that separates a tag name from its attributes would corrupt the markup.
 // NOTE(review): the trailing "\u007F\u0080\u009F" removes exactly those three
 // code points; if the whole U+0080-U+009F range was intended, confirm and change.
 var xhtmlStr = File.ReadAllText("messages.htm");
 var xhtmlStrClean = Regex.Replace(xhtmlStr, @"[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080\u009F]", "");

 // Parse the XHTML
 var xhtml = XDocument.Parse(xhtmlStrClean);

 // Get the groups of threads: children of the second element under <body>, skipping the leading h1
 var partEls = xhtml.Root.Elements("body").Elements().ElementAt(1).Elements().Skip(1);

 // Get all the thread divs and flatten them. The (string) conversion yields null for an
 // element without a class attribute, so such elements are skipped instead of throwing.
 var threadEls = partEls.SelectMany(p => p.Elements()).Where(el => (string)el.Attribute("class") == "thread");

 // Extract the data into objects.
 // The query is lazy: nothing is read from the document until Threads is enumerated.
 var Threads = threadEls.Select(threadEl =>
 {
    // The title is the first text node directly inside the thread div;
    // a thread without any text node gets an empty title instead of throwing.
    var titleNode = threadEl.Nodes().OfType<XText>().FirstOrDefault();
    var title = titleNode != null ? titleNode.Value : string.Empty;
    var messageEls = threadEl.Elements().Where(el => (string)el.Attribute("class") == "message");

    var messages = messageEls.Select(messageEl =>
    {
        var headerEl = messageEl.Element("div");

        // Null-safe class comparison: children without a class attribute are skipped.
        // A header lacking the "user" or "meta" span still throws, since both are required.
        var user = headerEl.Elements().First(el => (string)el.Attribute("class") == "user").Value;
        var date = headerEl.Elements().First(el => (string)el.Attribute("class") == "meta").Value;

        // Parse with the invariant culture so the (presumably English) day and month names
        // are recognised on any machine; a null provider would use the current culture.
        var dateUtc = DateTime.ParseExact(
            date.Replace(" at", ""),
            "dddd, MMMM d, yyyy h:mmtt UTCzz",
            System.Globalization.CultureInfo.InvariantCulture).ToUniversalTime();

        // The message body is the element immediately following the message div;
        // when the next node is missing or is not an element, use empty content.
        var contentEl = messageEl.NextNode as XElement;
        var content = contentEl != null ? contentEl.Value : string.Empty;

        return new
        {
            User = user,
            Date = dateUtc,
            Content = content,
        };
    });

    return new
    {
        Title = title,
        Messages = messages,
    };
 });

 // Render every thread (and its messages) as indented JSON text
 var jsonStr = JsonConvert.SerializeObject(
    value: Threads,
    formatting: Formatting.Indented);

 // Write the JSON text to Messages.json, relative to the current directory
 File.WriteAllText(path: "Messages.json", contents: jsonStr);
diff --git a/FbMsgArchiveToXml.csx b/FbMsgArchiveToXml.csx
 /*
    Steps:
    - Go to https://www.facebook.com/settings
    - Press on "Download a copy"
    - You'll get an e-mail, open the link and download the zip file
    - Extract "/html/messages.htm" and run this script

    Thankfully it's actually XHTML so we can just parse it as XML
    The XHTML looks like this:
    <html>
        <head>IGNORED</head>
        <body>
            <div class="nav">IGNORED</div>
            <div class="contents">
                <h1>IGNORED</h1>
                <div> <!-- groups of threads -->
                    <div class="thread">
                        THREAD_TITLE
                        <div class="message">
                            <div class="message_header">
                                <span class="user">USER_FULL_NAME</span>
                                <span class="meta">DATE: dddd, MMMM d, yyyy at h:mmtt UTCzz</span>
                            </div>
                        </div>
                        <p>CONTENT</p>
                        
                        <div class="message">
                        </div>
                        <p></p>

                        <!-- more messages -->
                    </div>
                </div>

                <div>
                    <div class="thread">
                    </div>
                </div>

                <!-- more groups of threads -->
            </div>
            <div class="footer">IGNORED</div>
        </body>
    </html>
 */

 using System.Text.RegularExpressions;
 using System.Xml.Linq;

 // Read the archive and strip the C0 control characters that XML 1.0 forbids,
 // because XDocument.Parse rejects them. Tab (U+0009), line feed (U+000A) and
 // carriage return (U+000D) are legal XML whitespace and are kept: removing a
 // tab that separates a tag name from its attributes would corrupt the markup.
 // NOTE(review): the trailing "\u007F\u0080\u009F" removes exactly those three
 // code points; if the whole U+0080-U+009F range was intended, confirm and change.
 var xhtmlStr = File.ReadAllText("messages.htm");
 var xhtmlStrClean = Regex.Replace(xhtmlStr, @"[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080\u009F]", "");

 // Parse the XHTML
 var xhtml = XDocument.Parse(xhtmlStrClean);

 // Get the groups of threads: children of the second element under <body>, skipping the leading h1
 var partEls = xhtml.Root.Elements("body").Elements().ElementAt(1).Elements().Skip(1);

 // Get all the thread divs and flatten them. The (string) conversion yields null for an
 // element without a class attribute, so such elements are skipped instead of throwing.
 var threadEls = partEls.SelectMany(p => p.Elements()).Where(el => (string)el.Attribute("class") == "thread");

 // Extract the data into objects.
 // The query is lazy: nothing is read from the document until Threads is enumerated.
 var Threads = threadEls.Select(threadEl =>
 {
    // The title is the first text node directly inside the thread div;
    // a thread without any text node gets an empty title instead of throwing.
    var titleNode = threadEl.Nodes().OfType<XText>().FirstOrDefault();
    var title = titleNode != null ? titleNode.Value : string.Empty;
    var messageEls = threadEl.Elements().Where(el => (string)el.Attribute("class") == "message");

    var messages = messageEls.Select(messageEl =>
    {
        var headerEl = messageEl.Element("div");

        // Null-safe class comparison: children without a class attribute are skipped.
        // A header lacking the "user" or "meta" span still throws, since both are required.
        var user = headerEl.Elements().First(el => (string)el.Attribute("class") == "user").Value;
        var date = headerEl.Elements().First(el => (string)el.Attribute("class") == "meta").Value;

        // Parse with the invariant culture so the (presumably English) day and month names
        // are recognised on any machine; a null provider would use the current culture.
        var dateUtc = DateTime.ParseExact(
            date.Replace(" at", ""),
            "dddd, MMMM d, yyyy h:mmtt UTCzz",
            System.Globalization.CultureInfo.InvariantCulture).ToUniversalTime();

        // The message body is the element immediately following the message div;
        // when the next node is missing or is not an element, use empty content.
        var contentEl = messageEl.NextNode as XElement;
        var content = contentEl != null ? contentEl.Value : string.Empty;

        return new
        {
            User = user,
            Date = dateUtc,
            Content = content,
        };
    });

    return new
    {
        Title = title,
        Messages = messages,
    };
 });

 // Build the output tree: Threads > Thread > (Title, Messages > Message > (User, Date, Content))
 var threadNodes = Threads.Select(thread =>
 {
    // One <Message> element per message of this thread
    var messageNodes = thread.Messages.Select(message =>
        new XElement("Message",
            new XElement("User", message.User),
            new XElement("Date", message.Date),
            new XElement("Content", message.Content)));

    return new XElement("Thread",
        new XElement("Title", thread.Title),
        new XElement("Messages", messageNodes));
 });
 var xml = new XDocument(new XElement("Threads", threadNodes));

 // Write the document to Messages.xml with default formatting
 xml.Save("Messages.xml", SaveOptions.None);
	#r Newtonsoft.Json.dll

	/*
	Steps:
	- Go to https://www.facebook.com/settings
	- Press on "Download a copy"
	- You'll get an e-mail, open the link and download the zip file
	- Extract "/html/messages.htm"
	- Run scriptcs -install Newtonsoft.Json (will create a file and a folder)
	- Then run this script

	Thankfully it's actually XHTML so we can just parse it as XML
	The XHTML looks like this:
	<html>
	<head>IGNORED</head>
	<body>
	<div class="nav">IGNORED</div>
	<div class="contents">
	<h1>IGNORED</h1>
	<div> <!-- groups of threads -->
	<div class="thread">
	THREAD_TITLE
	<div class="message">
	<div class="message_header">
	<span class="user">USER_FULL_NAME</span>
	<span class="meta">DATE: dddd, MMMM d, yyyy at h:mmtt UTCzz</span>
	</div>
	</div>
	<p>CONTENT</p>

	<div class="message">
	</div>
	<p></p>

	<!-- more messages -->
	</div>
	</div>

	<div>
	<div class="thread">
	</div>
	</div>

	<!-- more groups of threads -->
	</div>
	<div class="footer">IGNORED</div>
	</body>
	</html>
	*/

	using System.Text.RegularExpressions;
	using System.Xml.Linq;
	using Newtonsoft.Json;

	// Read the archive and strip the C0 control characters that XML 1.0 forbids,
	// because XDocument.Parse rejects them. Tab (U+0009), line feed (U+000A) and
	// carriage return (U+000D) are legal XML whitespace and are kept: removing a
	// tab that separates a tag name from its attributes would corrupt the markup.
	// NOTE(review): the trailing "\u007F\u0080\u009F" removes exactly those three
	// code points; if the whole U+0080-U+009F range was intended, confirm and change.
	var xhtmlStr = File.ReadAllText("messages.htm");
	var xhtmlStrClean = Regex.Replace(xhtmlStr, @"[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F\u0080\u009F]", "");

	// Parse the XHTML
	var xhtml = XDocument.Parse(xhtmlStrClean);

	// Get the groups of threads: children of the second element under <body>, skipping the leading h1
	var partEls = xhtml.Root.Elements("body").Elements().ElementAt(1).Elements().Skip(1);

	// Get all the thread divs and flatten them. The (string) conversion yields null for an
	// element without a class attribute, so such elements are skipped instead of throwing.
	var threadEls = partEls.SelectMany(p => p.Elements()).Where(el => (string)el.Attribute("class") == "thread");

	// Extract the data into objects.
	// The query is lazy: nothing is read from the document until Threads is enumerated.
	var Threads = threadEls.Select(threadEl =>
	{
	    // The title is the first text node directly inside the thread div;
	    // a thread without any text node gets an empty title instead of throwing.
	    var titleNode = threadEl.Nodes().OfType<XText>().FirstOrDefault();
	    var title = titleNode != null ? titleNode.Value : string.Empty;
	    var messageEls = threadEl.Elements().Where(el => (string)el.Attribute("class") == "message");

	    var messages = messageEls.Select(messageEl =>
	    {
	        var headerEl = messageEl.Element("div");

	        // Null-safe class comparison: children without a class attribute are skipped.
	        // A header lacking the "user" or "meta" span still throws, since both are required.
	        var user = headerEl.Elements().First(el => (string)el.Attribute("class") == "user").Value;
	        var date = headerEl.Elements().First(el => (string)el.Attribute("class") == "meta").Value;

	        // Parse with the invariant culture so the (presumably English) day and month names
	        // are recognised on any machine; a null provider would use the current culture.
	        var dateUtc = DateTime.ParseExact(
	            date.Replace(" at", ""),
	            "dddd, MMMM d, yyyy h:mmtt UTCzz",
	            System.Globalization.CultureInfo.InvariantCulture).ToUniversalTime();

	        // The message body is the element immediately following the message div;
	        // when the next node is missing or is not an element, use empty content.
	        var contentEl = messageEl.NextNode as XElement;
	        var content = contentEl != null ? contentEl.Value : string.Empty;

	        return new
	        {
	            User = user,
	            Date = dateUtc,
	            Content = content,
	        };
	    });

	    return new
	    {
	        Title = title,
	        Messages = messages,
	    };
	});

	// Render every thread (and its messages) as indented JSON text
	var jsonStr = JsonConvert.SerializeObject(
	    value: Threads,
	    formatting: Formatting.Indented);

	// Write the JSON text to Messages.json, relative to the current directory
	File.WriteAllText(path: "Messages.json", contents: jsonStr);