Skip to content

Instantly share code, notes, and snippets.

@tetrillard
Created November 24, 2014 22:57
Show Gist options
  • Save tetrillard/759bf2d165b440e4915c to your computer and use it in GitHub Desktop.
Save tetrillard/759bf2d165b440e4915c to your computer and use it in GitHub Desktop.
SMS Backup & Restore : Extract images and videos from a backup file
#!/usr/bin/env python
# -*- coding: utf8 -*-
# SMSBackupRestore extractor
#
# smsbackuprestore-extractor.py
# 24/11/2014
#
# This script will extract all images and videos retrieved
# from an XML backup of the Android application "SMS Backup & Restore".
# For each contact, it will create a folder inside the output folder
# with all received images and videos.
#
# Make sure the destination folder is empty; otherwise it will create duplicates.
#
# Links :
# https://play.google.com/store/apps/details?id=com.riteshsahu.SMSBackupRestore
#
# example: python smsbackuprestore-extractor.py sms-20141122183844.xml medias/
from lxml import etree
import os
import sys
# Imported here because the decoding below uses base64.b64decode, which
# behaves the same on Python 2 and Python 3 (str.decode("base64") is
# Python 2 only).
import base64

# Both the backup file and the output folder are required: sys.argv[2] is
# read unconditionally below, so fewer than three entries is a usage error.
if len(sys.argv) < 3:
    print("usage: %s [sms-backup.xml] [output-folder]" % sys.argv[0])
    sys.exit(-1)

INPUT_FILE = sys.argv[1]
OUTPUT_FOLDER = sys.argv[2]

if not os.path.isfile(INPUT_FILE):
    print("File %s not found" % INPUT_FILE)
    # Stop here instead of letting etree.parse() fail on the missing file.
    sys.exit(-1)

print("[*] Parsing : %s" % INPUT_FILE)
tree = etree.parse(INPUT_FILE)
mms_list = tree.xpath(".//mms")
total = 0

for mms in mms_list:
    address = mms.get("address")
    contact = mms.get("contact_name")

    # One sub-folder per contact. When the contact name is missing or is the
    # "(Unknown)" placeholder, fall back to the address, then to "_Unknown".
    if contact is None or contact == "(Unknown)":
        folder = address if address is not None else "_Unknown"
    else:
        folder = contact
    output = OUTPUT_FOLDER + "/" + folder

    # Only parts whose content type starts with "image" or "video" are kept.
    media_list = mms.xpath(".//part[starts-with(@ct, 'image') or starts-with(@ct, 'video')]")

    for media in media_list:
        total = total + 1

        # Create the contact folder lazily, on the first media written to it.
        if not os.path.exists(output):
            os.makedirs(output)
            # On Python 2 a unicode path is encoded before printing so that
            # non-ASCII contact names do not raise on an ASCII stdout.
            label = output if isinstance(output, str) else output.encode("utf-8")
            print("[+] New folder created : %s" % label)

        filename = media.get("cl")
        rawdata = base64.b64decode(media.get("data"))
        outfile = output + "/" + filename

        # Duplicates handling: insert an increasing counter before the last
        # dot-separated component until the name is free.
        i = 1
        while os.path.isfile(outfile):
            dname = filename.split('.')
            dname.insert(-1, str(i))
            outfile = output + "/" + '.'.join(dname)
            i = i + 1

        # Binary mode: the payload is decoded image/video data, and text mode
        # would translate newline bytes on Windows. The with-block closes the
        # file even if the write fails.
        with open(outfile, 'wb') as f:
            f.write(rawdata)

print("[*] Job done (%d files created)" % total)
print("[*] Output folder : %s" % OUTPUT_FOLDER)
@matttarsi
Copy link

Any luck running on a large file? My backup file is 2.1GB. I tried to add:

from lxml.etree import XMLParser, parse

and then change

tree = etree.parse(INPUT_FILE) 

to

p = XMLParser(huge_tree=True)
tree = etree.parse(INPUT_FILE, parser=p) 

After cleaning the file as described above, I get the following output:

$ python3 extract-python3.py input/sms-20190505085947-clean.xml output/PixelXLMMS/
[*] Parsing : input/sms-20190505085947-clean.xml
[*] Job done (0 files created)
[*] Output folder : output/PixelXLMMS/

@hajis2019
Copy link

I'm also struggling with this on python3 / windows 10.

Any luck @nicksears?

@bumpaneer
Copy link

Thanks for the code! I've modified it for my use to update "null" file names to the timestamp instead. I also set the file modification time to the MMS date timestamp. The other change I made was to parse messages individually instead of parsing the whole file at once. This allowed me to get through a very large backup.

#!/usr/bin/env python
# -*- coding: utf8 -*-

# SMSBackupRestore extractor
#
# smsbackuprestore-extractor.py
# 24/11/2014
#
# This script will extract all images and videos retrieved
# from an XML backup of the Android application "SMS Backup & Restore".
# For each contact, it will create a folder inside the output folder
# with all received images and videos.
# 
# Make sure the destination folder is empty; otherwise it will create duplicates.
#
# Links :
#   https://play.google.com/store/apps/details?id=com.riteshsahu.SMSBackupRestore
#
#  example: python smsbackuprestore-extractor.py sms-20141122183844.xml medias/
#
# 2019-02-09 @stefan-schiffer 
# Ported to Python 3
# You might first have to fix malformed XML with entityfixer.py
# https://gist.github.com/Calvin-L/5232f876b8acf48a216941b8904632bb

from lxml import etree
from win32_setctime import setctime
import os
import sys
import base64
import datetime

# Both the backup file and the output folder are required: sys.argv[2] is
# read unconditionally below, so fewer than three entries is a usage error.
if len(sys.argv) < 3:
    print("usage: %s [sms-backup.xml] [output-folder]" % sys.argv[0])
    sys.exit(-1)

INPUT_FILE = sys.argv[1]
OUTPUT_FOLDER = sys.argv[2]

if not os.path.isfile(INPUT_FILE):
    print("File %s not found" % INPUT_FILE)
    # Stop here instead of letting the parser fail on the missing file.
    sys.exit(-1)

print("[*] Parsing : %s" % INPUT_FILE)
total = 0

# Stream the backup one <mms> element at a time so that very large files do
# not have to be loaded into memory as a whole tree.
for _, mms in etree.iterparse(INPUT_FILE, tag='mms', huge_tree=True):
    address = mms.get("address")
    contact = mms.get("contact_name")

    # One sub-folder per contact. When the contact name is the "(Unknown)"
    # placeholder, fall back to the address, then to "_Unknown".
    if contact == "(Unknown)":
        folder = address
        if address is None:
            folder = "_Unknown"
    else:
        folder = contact

    # Only parts whose content type starts with "image" or "video" are kept.
    media_list = mms.xpath(".//part[starts-with(@ct, 'image') or starts-with(@ct, 'video')]")

    for media in media_list:
        total = total + 1
        output = OUTPUT_FOLDER + "/" + folder

        # Create the contact folder lazily, on the first media written to it.
        if not os.path.exists(output):
            os.makedirs(output)
            print("[+] New folder created: %s" % folder)

        # The "date" attribute is divided by 1000 to obtain epoch seconds.
        # Keep that value for the file times, and a local-time datetime for
        # building readable file names.
        epoch_seconds = float(mms.get("date")) / 1000.0
        timestamp = datetime.datetime.fromtimestamp(epoch_seconds)
        stamp = timestamp.strftime("%Y%m%d_%H%M%S")

        filename = media.get("cl")
        rawdata = base64.b64decode(media.get("data"))

        # Parts without a usable name ("null" placeholder or missing
        # attribute) are named after the message timestamp, with an
        # extension derived from the content type.
        if filename is None or filename == "null":
            name = media.get("ct")
            ext = name.split('/')[1]
            if ext == "jpeg":
                ext = "jpg"
            elif name == "image/*":
                print("Unknown image type * for MMS content; guessing .jpg " + output + "/" + stamp)
                ext = "jpg"
            elif name == "video/*":
                print("Unknown video type * for MMS content; guessing .3gpp " + output + "/" + stamp)
                ext = "3gpp"
            filename = stamp + '.' + ext

        outfile = output + "/" + filename

        # Duplicates handling: insert an increasing counter before the last
        # dot-separated component until the name is free.
        i = 1
        while os.path.isfile(outfile):
            dname = filename.split('.')
            dname.insert(-1, str(i))
            outfile = output + "/" + '.'.join(dname)
            i = i + 1

        # Binary mode: the payload is decoded image/video data. The
        # with-block closes the file even if the write fails.
        with open(outfile, 'wb') as f:
            f.write(rawdata)

        # Use the epoch value directly. Subtracting datetime(1970, 1, 1) from
        # the naive local-time datetime would shift the file times by the
        # local UTC offset.
        filetime = epoch_seconds
        setctime(outfile, filetime)
        os.utime(outfile, (filetime, filetime))

    # Release the element's content once processed to keep memory usage low.
    mms.clear()

print("[*] Job done (%d files created)" % total)
print("[*] Output folder : %s" % OUTPUT_FOLDER)

@FuturefryGuy
Copy link

This was a lifesaver! many thanks for porting to python 3 !

@bitsmack
Copy link

bitsmack commented Dec 5, 2020

This is great! I just ran the newest version (posted by @bumpaneer) on a 600MB xml file and it worked wonderfully. I was about to start writing something to do the same thing but I wasn't looking forward to it :) Thank you.

Do you mind if I link to this gist from a Stack Exchange question?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment