-
-
Save tetrillard/759bf2d165b440e4915c to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf8 -*-
# SMSBackupRestore extractor
#
# smsbackuprestore-extractor.py
# 24/11/2014
#
# This script extracts all images and videos found in
# an XML backup made by the Android application "SMS Backup & Restore".
# For each contact, it creates a folder inside the output folder
# with all received images and videos.
#
# Make sure the destination folder is empty, otherwise it will create duplicates.
#
# Links :
# https://play.google.com/store/apps/details?id=com.riteshsahu.SMSBackupRestore
#
# example: python smsbackuprestore-extractor.py sms-20141122183844.xml medias/
from lxml import etree | |
import os | |
import sys | |
if len(sys.argv) < 2: | |
print "usage: %s [sms-backup.xml] [output-folder]" % sys.argv[0] | |
sys.exit(-1) | |
INPUT_FILE = sys.argv[1] | |
OUTPUT_FOLDER = sys.argv[2] | |
if not os.path.isfile(INPUT_FILE): | |
print "File %s not found" % INPUT_FILE | |
print "[*] Parsing : %s" % INPUT_FILE | |
tree = etree.parse(INPUT_FILE) | |
mms_list = tree.xpath(".//mms") | |
total = 0 | |
for mms in mms_list: | |
address = mms.get("address") | |
contact = mms.get("contact_name") | |
if contact == "(Unknown)": | |
folder = address | |
if address == None: | |
folder = "_Unknown" | |
else: | |
folder = contact | |
media_list = mms.xpath(".//part[starts-with(@ct, 'image') or starts-with(@ct, 'video')]") | |
# Create the folders | |
for media in media_list: | |
total = total + 1 | |
output = OUTPUT_FOLDER + "/" + folder | |
if os.path.exists(output) == False: | |
os.makedirs(OUTPUT_FOLDER + "/" + folder) | |
print "[+] New folder created : %s" % output.encode("utf-8") | |
filename = media.get("cl") | |
rawdata = media.get("data").decode("base64") | |
outfile = output + "/" + filename | |
# Duplicates handling | |
i = 1 | |
while os.path.isfile(outfile): | |
dname = filename.split('.') | |
dname.insert(-1, str(i)) | |
outfile = output + "/" + '.'.join(dname) | |
i = i+1 | |
f = open(outfile, 'w') | |
f.write(rawdata) | |
f.close() | |
print "[*] Job done (%d files created)" % total | |
print "[*] Output folder : %s" % OUTPUT_FOLDER |
Any luck running on a large file? My backup file is 2.1GB. I tried to add:
from lxml.etree import XMLParser, parse
and then change
tree = etree.parse(INPUT_FILE)
to
p = XMLParser(huge_tree=True)
tree = etree.parse(INPUT_FILE, parser=p)
After cleaning the file as described above, I get the following output:
$ python3 extract-python3.py input/sms-20190505085947-clean.xml output/PixelXLMMS/
[*] Parsing : input/sms-20190505085947-clean.xml
[*] Job done (0 files created)
[*] Output folder : output/PixelXLMMS/
I'm also struggling with this on Python 3 / Windows 10.
Any luck @nicksears?
Thanks for the code! I've modified it for my use to update "null" file names to the timestamp instead. I also set the file modification time to the MMS date timestamp. The other change I made was to parse messages individually instead of parsing the whole file at once. This allowed me to get through a very large backup.
#!/usr/bin/env python
# -*- coding: utf8 -*-
# SMSBackupRestore extractor
#
# smsbackuprestore-extractor.py
# 24/11/2014
#
# This script will extract all images and videos retrieved
# from an XML backup of the Android application "SMS Backup & Restore".
# For each contact, it will create a folder inside the output folder
# with all received images and videos.
#
# Make sure the destination folder is empty otherwise it will create duplicates.
#
# Links :
# https://play.google.com/store/apps/details?id=com.riteshsahu.SMSBackupRestore
#
# example: python smsbackuprestore-extractor.py sms-20141122183844.xml medias/
#
# 2019-02-09 @stefan-schiffer
# Ported to Python 3
# You might first have to fix malformed XML with entityfixer.py
# https://gist.github.com/Calvin-L/5232f876b8acf48a216941b8904632bb
from lxml import etree
from win32_setctime import setctime
import os
import sys
import base64
import datetime
# Script body: stream the backup one <mms> element at a time and write every
# image/video part into a per-contact folder below the output folder.

# Both the backup file and the output folder are required; sys.argv[2] is
# read below, so anything shorter than three entries must be rejected here.
if len(sys.argv) < 3:
    print("usage: %s [sms-backup.xml] [output-folder]" % sys.argv[0])
    sys.exit(-1)

INPUT_FILE = sys.argv[1]
OUTPUT_FOLDER = sys.argv[2]

if not os.path.isfile(INPUT_FILE):
    print("File %s not found" % INPUT_FILE)
    # Stop here instead of letting the parser fail on a missing file.
    sys.exit(-1)

print("[*] Parsing : %s" % INPUT_FILE)

total = 0
for _, mms in etree.iterparse(INPUT_FILE, tag='mms', huge_tree=True):
    address = mms.get("address")
    contact = mms.get("contact_name")

    # Fall back to the address (or "_Unknown") when there is no usable
    # contact name, including when the attribute is absent altogether.
    if contact is None or contact == "(Unknown)":
        folder = address if address is not None else "_Unknown"
    else:
        folder = contact
    output = os.path.join(OUTPUT_FOLDER, folder)

    media_list = mms.xpath(".//part[starts-with(@ct, 'image') or starts-with(@ct, 'video')]")
    for media in media_list:
        data = media.get("data")
        if data is None:
            # Nothing to decode for this part; do not count it as a file.
            continue

        # Create the folder lazily, only once there is something to store.
        if not os.path.exists(output):
            os.makedirs(output)
            print("[+] New folder created: %s" % folder)

        # The "date" attribute is divided by 1000 to obtain epoch seconds.
        # Keep that value for the file times: deriving it back from the
        # local-time datetime would shift it by the local UTC offset.
        epoch_seconds = float(mms.get("date")) / 1000.0
        timestamp = datetime.datetime.fromtimestamp(epoch_seconds)
        stamp = timestamp.strftime("%Y%m%d_%H%M%S")

        # Keep only the last path component so the "cl" attribute cannot
        # point outside the contact folder.
        filename = os.path.basename(media.get("cl") or "")
        rawdata = base64.b64decode(data)

        if not filename or filename == "null":
            # No usable name in the backup: name the file after the MMS
            # timestamp and derive the extension from the content type.
            name = media.get("ct")
            ext = name.partition('/')[2]
            if ext == "jpeg":
                ext = "jpg"
            elif name == "image/*":
                print("Unknown image type * for MMS content; guessing .jpg " + os.path.join(output, stamp))
                ext = "jpg"
            elif name == "video/*":
                print("Unknown video type * for MMS content; guessing .3gpp " + os.path.join(output, stamp))
                ext = "3gpp"
            filename = stamp + '.' + ext

        # Duplicates handling: insert a counter before the extension.
        stem, suffix = os.path.splitext(filename)
        outfile = os.path.join(output, filename)
        i = 1
        while os.path.isfile(outfile):
            outfile = os.path.join(output, "%s.%d%s" % (stem, i, suffix))
            i = i + 1

        # Binary mode: the payload is raw image/video bytes.
        with open(outfile, 'wb') as f:
            f.write(rawdata)

        # Stamp the file with the MMS date (creation time via setctime,
        # access/modification time via os.utime).
        setctime(outfile, epoch_seconds)
        os.utime(outfile, (epoch_seconds, epoch_seconds))
        total = total + 1

    # Release the processed element, then drop every earlier sibling still
    # attached to the parent. iterparse only yields <mms> elements, so other
    # siblings would otherwise pile up in memory on very large backups.
    mms.clear()
    while mms.getprevious() is not None:
        del mms.getparent()[0]

print("[*] Job done (%d files created)" % total)
print("[*] Output folder : %s" % OUTPUT_FOLDER)
This was a lifesaver! Many thanks for porting to Python 3!
This is great! I just ran the newest version (posted by @bumpaneer) on a 600MB xml file and it worked wonderfully. I was about to start writing something to do the same thing but I wasn't looking forward to it :) Thank you.
Do you mind if I link to this gist from a Stack Exchange question?
Ported to Python 3. It works now!