Skip to content

Instantly share code, notes, and snippets.

@vdcrim
Last active December 12, 2015 09:18
Show Gist options
  • Save vdcrim/4750006 to your computer and use it in GitHub Desktop.
Save vdcrim/4750006 to your computer and use it in GitHub Desktop.
Parse SegmentUID and Duration on Matroska files, for ordered chapters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Parse SegmentUID and Duration on Matroska files, for ordered chapters
Usage (Python 2 or 3):
$python mkv_suid_duration.py file1 [file2...]
$python mkv_suid_duration.py directory
Copyright (C) 2013 Diego Fernández Gosende (dfgosende [at] gmail [dot] com)
GPL v3 or later <http://www.gnu.org/licenses/gpl-3.0.html>
"""
from __future__ import print_function
import os.path
import binascii
import struct
def _payload_after(window, marker, size):
    """Return the `size` bytes that follow the first `marker` in `window`.

    Returns None when the marker is absent or when fewer than `size` bytes
    are available after it (payload cut off by the end of the window).
    """
    start = window.find(marker)
    if start == -1:
        return None
    start += len(marker)
    payload = window[start:start + size]
    if len(payload) < size:
        return None
    return payload


def parse_files(paths):
    """Parse SegmentUID and Duration on Matroska files

    paths: iterable of file paths.  Entries that are not regular files, or
    that do not start with the EBML magic number, are skipped.

    Returns a dict keyed by file basename (files sharing a basename
    overwrite each other).  Each value is a dict with two keys:
      'suid'     -- SegmentUID as a 32-character hex string, or None
      'duration' -- 'hh:mm:ss.mmm' string, or None when either the
                    duration or the timecode scale was not found

    The file is scanned chunk by chunk for raw ID+size byte patterns until
    all three values are found, a Cluster ID is seen, or EOF is reached.
    Values are read from the in-memory window (never by seeking), and the
    end of each window is carried over to the next one so that an element
    straddling a chunk boundary is still matched.
    """
    mkv_dict = {}
    chunk_size = 100000  # 100 kB
    # Longest span needed from the start of a match: 3 marker bytes plus
    # 16 bytes of SegmentUID.  Carrying one byte less than that guarantees
    # any match cut off by the window end is complete in the next window.
    overlap = 3 + 16 - 1
    for path in (path for path in paths if os.path.isfile(path)):
        with open(path, 'rb') as stream:
            if stream.read(4) != b'\x1A\x45\xDF\xA3':  # not a Matroska file
                continue
            basename = os.path.basename(path)
            mkv_dict[basename] = {'suid': None, 'duration': None}
            suid = tcscale = duration = False
            tail = b''
            while not (suid and tcscale and duration):
                chunk = stream.read(chunk_size)
                if not chunk:
                    break
                window = tail + chunk
                if not suid:
                    # \x90 -> 16 bytes of data
                    raw = _payload_after(window, b'\x73\xA4\x90', 16)
                    if raw is not None:
                        suid = binascii.hexlify(raw).decode()
                        mkv_dict[basename]['suid'] = suid
                if not tcscale:
                    pos = window.find(b'\x2A\xD7\xB1')
                    if pos != -1:
                        size_byte = window[pos + 3:pos + 4]
                        # A zero size byte is not a valid one-byte length;
                        # treat it as a false match.
                        if size_byte and size_byte != b'\x00':
                            length = get_data_len(size_byte)
                            value = window[pos + 4:pos + 4 + length]
                            # Unsigned integers are at most 8 bytes long;
                            # anything else is a false match.
                            if 0 < length <= 8 and len(value) == length:
                                tcscale = int(binascii.hexlify(value), 16)
                if not duration:
                    # float (4 bytes)
                    raw = _payload_after(window, b'\x44\x89\x84', 4)
                    if raw is not None:
                        duration = struct.unpack('>f', raw)[0]
                if not duration:
                    # double (8 bytes)
                    raw = _payload_after(window, b'\x44\x89\x88', 8)
                    if raw is not None:
                        duration = struct.unpack('>d', raw)[0]
                if window.find(b'\x1F\x43\xB6\x75') != -1:
                    # segment info should be before the clusters
                    break
                tail = window[-overlap:]
            if tcscale and duration:
                mkv_dict[basename]['duration'] = ms2str(
                    duration * tcscale / 1000000)
    return mkv_dict
def get_data_len(byte):
    """Get the length (bytes) of the element data

    byte: a single byte (length-1 bytes/str object) holding the element's
    size field.

    The highest set bit is treated as the length marker and cleared; the
    remaining bits of this byte are returned.  Only this one byte is
    examined, so the result is the full data length only when the size
    field is a single byte wide.

    Raises ValueError for a zero byte, which has no marker bit (the search
    for it would otherwise never terminate).
    """
    n = ord(byte)
    if not n:
        raise ValueError('size byte has no length marker bit: 0x00')
    mask = 0b10000000
    # Walk down from the top bit until the marker bit is found.
    while not n & mask:
        mask >>= 1
    return n & ~mask
def ms2str(ms):
    """Format a millisecond count as a 'hh:mm:ss.mmm' string.

    Each field is converted with int(), so a fractional millisecond part
    is truncated rather than rounded.  The hours field grows beyond two
    digits when needed.
    """
    seconds, millis = divmod(ms, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    fields = (int(hours), int(minutes), int(seconds), int(millis))
    return '%02d:%02d:%02d.%03d' % fields
if __name__ == '__main__':
    import os
    import sys
    import glob

    def decode_arg(arg):
        """Return a command-line argument as text.

        On Python 3 sys.argv already holds text, so the argument is
        returned unchanged; re-encoding it and decoding with the stdin
        encoding can corrupt or reject it when the two encodings differ.
        On Python 2 the byte string is decoded with the stdin encoding,
        falling back to the filesystem encoding (then UTF-8) when stdin
        has no encoding, e.g. when input is piped.
        """
        if sys.version_info[0] >= 3:
            return arg
        encoding = getattr(sys.stdin, 'encoding', None)
        return arg.decode(
            encoding or sys.getfilesystemencoding() or 'utf-8')

    if len(sys.argv) > 1:
        arg1 = decode_arg(sys.argv[1])
        if os.path.isdir(arg1):
            # A directory argument means: every *.mkv directly inside it.
            paths = glob.iglob(os.path.join(arg1, '*.mkv'))
        else:
            paths = (decode_arg(path) for path in sys.argv[1:])
        mkv_dict = parse_files(paths)
        for name in mkv_dict:
            # File names are cut to 30 characters to keep lines short.
            print(mkv_dict[name]['suid'], mkv_dict[name]['duration'],
                  name[:30])
    else:
        print('A file(s) or directory is needed!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment