Created
May 5, 2024 19:54
-
-
Save sveetch/0973050c97b2df4dc34d416cfe3b96ef to your computer and use it in GitHub Desktop.
Collecting video meta information with MediaInfo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Proof of concept script that uses MediaInfo to get metadata from a video file.
Although this script has only been written for videos, MediaInfo can also read information from audio and image files.
It was developed with Python 3.10 but should probably work with Python 3.8.
The MediaInfo library must be installed on your system, see:
    https://github.com/MediaArea/MediaInfo
On Ubuntu you would get it with::
    sudo apt-get install mediainfo
Then install the Python wrapper library:
    https://github.com/sbraz/pymediainfo
You would install it with::
    pip install pymediainfo
Be aware that a recent pymediainfo version could be incompatible with a very old MediaInfo.
You may install pymediainfo directly on your system with a package manager like apt, but
it may then be difficult to use in a virtual Python environment, as is recommended.
""" | |
from pymediainfo import MediaInfo | |
class VideoMetaParser:
    """
    Collect a selection of metadata from a media file through MediaInfo.

    For each track kind a list of attribute names is read from the track
    object. When a method named ``format_<kind>_<attribute>`` exists on the
    parser, the raw attribute value is passed through it before being stored.
    """
    # Selected attribute names to get informations from track types
    GENERAL_FIELDS = [
        "format",
        "duration",
        "file_last_modification_date",
    ]
    VIDEO_FIELDS = [
        "format",
        "codec_id",
        "width",
        "height",
        "bit_rate",
        "frame_rate",
        "pixel_aspect_ratio",
        "display_aspect_ratio",
    ]
    AUDIO_FIELDS = [
        "title",
        "language",
        "format",
        "codec_id",
        "bit_rate",
        "sampling_rate",
    ]
    SUBTITLE_FIELDS = [
        "title",
        "language",
        "format",
        "codec_id",
    ]

    @staticmethod
    def _str_to_int(value):
        """
        Convert a numeric string to an integer.

        Plain integer notation is tried first; decimal notation such as
        ``"1234.5"`` is then accepted and truncated. A non numeric string still
        raises ``ValueError``.
        """
        try:
            return int(value)
        except ValueError:
            # ``int()`` refuses decimal strings, go through float instead
            return int(float(value))

    def format_general_duration(self, value):
        """
        Ensure duration is always an integer

        Non empty strings and non zero floats are converted (floats are
        truncated), any other value is returned unchanged.
        """
        if value and isinstance(value, str):
            return self._str_to_int(value)
        if value and isinstance(value, float):
            return int(value)
        return value

    def format_general_file_last_modification_date(self, value):
        """
        Return the date as an ISO like string that always carries a timezone.

        A leading ``UTC `` prefix is removed, ``+00:00`` is appended when the
        value has no ``+`` offset and spaces are replaced by ``T``. An offset
        already present is kept as it is, no conversion is performed. Empty
        values are returned unchanged.
        """
        if value:
            # Remove possible UTC prefix
            value = value[len("UTC "):] if value.startswith("UTC ") else value
            # Add UTC timezone if there is not any
            value = value + "+00:00" if "+" not in value else value
            # Finish the ISO format
            return value.replace(" ", "T")
        return value

    def format_video_frame_rate(self, value):
        """
        Ensure frame rate is always an integer

        A string is parsed as a float first, then a non zero float is truncated
        to an integer.
        """
        if value and isinstance(value, str):
            value = float(value)
        if value and isinstance(value, float):
            value = int(value)
        return value

    def format_video_bit_rate(self, value):
        """
        Ensure bit rate is always an integer

        Only non empty strings are converted, other values are returned
        unchanged.
        """
        if value and isinstance(value, str):
            value = self._str_to_int(value)
        return value

    def format_video_pixel_aspect_ratio(self, value):
        """
        Ensure pixel ratio is always a float

        Only non empty strings are converted.
        """
        if value and isinstance(value, str):
            value = float(value)
        return value

    def format_video_display_aspect_ratio(self, value):
        """
        Ensure display ratio is always a float

        Only non empty strings are converted.
        """
        if value and isinstance(value, str):
            value = float(value)
        return value

    def format_audio_bit_rate(self, value):
        """
        Ensure bit rate is always an integer

        Only non empty strings are converted, other values are returned
        unchanged.
        """
        if value and isinstance(value, str):
            value = self._str_to_int(value)
        return value

    def format_audio_sampling_rate(self, value):
        """
        Ensure sampling rate is always an integer

        Only non empty strings are converted, other values are returned
        unchanged.
        """
        if value and isinstance(value, str):
            value = self._str_to_int(value)
        return value

    def formatted_attr_value(self, kind, data, name):
        """
        Read attribute ``name`` from ``data`` and format it if a formatter exists.

        The formatter is looked up on the parser as ``format_<kind>_<name>``,
        when there is none the raw attribute value is returned. Usage sample::

            formatted_attr_value("video", DATA, "frame_rate")
        """
        value = getattr(data, name)
        formatter = "format_{}_{}".format(kind, name)
        if hasattr(self, formatter):
            return getattr(self, formatter)(value)
        return value

    def _collect_fields(self, kind, track, fieldnames):
        """
        Build a dictionary of formatted values for given field names of a track.
        """
        return {
            fieldname: self.formatted_attr_value(kind, track, fieldname)
            for fieldname in fieldnames
        }

    def scan(self, filepath):
        """
        Parse a media file and return its selected metadata.

        Arguments:
            filepath: Path to the media file, given as it is to
                ``MediaInfo.parse``.

        Returns:
            dict: Item ``general`` is a dictionary of the general track fields
            (left empty when the file has no general track). Items ``video``,
            ``audio`` and ``subtitle`` are lists with one dictionary per track.
        """
        media_info = MediaInfo.parse(filepath)
        data = {
            "general": {},
            "video": [],
            "audio": [],
            "subtitle": [],
        }
        # We only care about a single one general track, there should not be more
        general_tracks = media_info.general_tracks
        if general_tracks:
            data["general"] = self._collect_fields(
                "general",
                general_tracks[0],
                self.GENERAL_FIELDS
            )
        data["video"] = [
            self._collect_fields("video", track, self.VIDEO_FIELDS)
            for track in media_info.video_tracks
        ]
        # Audio and subtitle values go through the formatters like the other
        # kinds so the "format_audio_*" methods are effectively applied
        data["audio"] = [
            self._collect_fields("audio", track, self.AUDIO_FIELDS)
            for track in media_info.audio_tracks
        ]
        data["subtitle"] = [
            self._collect_fields("subtitle", track, self.SUBTITLE_FIELDS)
            for track in media_info.text_tracks
        ]
        return data
if __name__ == "__main__":
    # Demonstration: scan each sample file and write the collected metadata as
    # a JSON file named after the sample.
    import json
    from pathlib import Path

    # Directory where the JSON files are written. It was referenced below
    # without ever being defined, which raised a NameError.
    OUTPUT_DIR = Path(".")

    SAMPLES = [
        Path("some_videos.mkv"),
        Path("another_videos.mp4"),
    ]

    parser = VideoMetaParser()
    for path in SAMPLES:
        output_filedst = "{}.json".format(path.name)
        print(path, output_filedst, path.exists())
        data = parser.scan(path)
        (OUTPUT_DIR / output_filedst).write_text(json.dumps(data, indent=4))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment