sveetch · May 5, 2024 19:54
diff --git a/video_infos.py b/video_infos.py
 """
 Proof of concept script to use MediaInfo to get metadata from a video file.

 Although this script has been written only for videos, MediaInfo also allows reading information from audio and image files.

 This was first written with Python 3.10 but it should probably work with Python 3.8.

 MediaInfo library is required to be installed on your system, see:

 https://github.com/MediaArea/MediaInfo

 On Ubuntu you would get it with::
    sudo apt-get install mediainfo

 Then the Python wrapper library:

 https://github.com/sbraz/pymediainfo

 You would install it with::
    pip install pymediainfo

 Be aware that a recent pymediainfo version could be incompatible with a very old MediaInfo.
 You may install pymediainfo directly on your system with a package manager like apt, but
 it may then be difficult to use in a virtual Python environment, which is the recommended setup.

 """
 from pymediainfo import MediaInfo


 class VideoMetaParser:
    """
    Collect a selection of metadata from the tracks reported by MediaInfo.

    Each ``*_FIELDS`` list names the track attributes collected for one kind of
    track. When a method named ``format_<kind>_<field>`` exists, the attribute
    value goes through it before being stored; otherwise it is stored as read.
    """
    # Selected attribute names to get information from each track type
    GENERAL_FIELDS = [
        "format",
        "duration",
        "file_last_modification_date",
    ]
    VIDEO_FIELDS = [
        "format",
        "codec_id",
        "width",
        "height",
        "bit_rate",
        "frame_rate",
        "pixel_aspect_ratio",
        "display_aspect_ratio",
    ]
    AUDIO_FIELDS = [
        "title",
        "language",
        "format",
        "codec_id",
        "bit_rate",
        "sampling_rate",
    ]
    SUBTITLE_FIELDS = [
        "title",
        "language",
        "format",
        "codec_id",
    ]

    @staticmethod
    def _to_int(value):
        """
        Return ``value`` as an integer when it is a non empty string or a float.

        A string is parsed as an integer first and as a float second, so a
        decimal string such as ``"1234.56"`` is truncated instead of raising.
        A string that is not numeric at all still raises ``ValueError``. Falsy
        values (``None``, ``0``, ``0.0``, ``""``) and values of any other type
        are returned unchanged.
        """
        if not value:
            return value

        if isinstance(value, str):
            try:
                return int(value)
            except ValueError:
                # Not a plain integer literal, accept a decimal notation
                return int(float(value))

        if isinstance(value, float):
            return int(value)

        return value

    @staticmethod
    def _to_float(value):
        """
        Return ``value`` as a float when it is a non empty string.

        Any other value is returned unchanged. A non numeric string raises
        ``ValueError``.
        """
        if value and isinstance(value, str):
            return float(value)

        return value

    def format_general_duration(self, value):
        """
        Ensure duration is always an integer (decimals are truncated).
        """
        return self._to_int(value)

    def format_general_file_last_modification_date(self, value):
        """
        Return the date as an ISO 8601 like string carrying a timezone offset.

        A leading ``"UTC "`` prefix is removed, ``"+00:00"`` is appended when the
        string contains no ``+`` and the remaining spaces become ``T``. Only a
        ``+`` offset is detected. Empty or non string values are returned
        unchanged.
        """
        if not value or not isinstance(value, str):
            return value

        prefix = "UTC "
        # Remove possible UTC prefix
        if value.startswith(prefix):
            value = value[len(prefix):]
        # Add UTC timezone if there is not any
        if "+" not in value:
            value = value + "+00:00"
        # Finish the ISO format
        return value.replace(" ", "T")

    def format_video_frame_rate(self, value):
        """
        Ensure frame rate is always an integer (``"23.976"`` becomes ``23``).
        """
        return self._to_int(value)

    def format_video_bit_rate(self, value):
        """
        Ensure bit rate is always an integer
        """
        return self._to_int(value)

    def format_video_pixel_aspect_ratio(self, value):
        """
        Ensure pixel ratio given as a string is turned into a float
        """
        return self._to_float(value)

    def format_video_display_aspect_ratio(self, value):
        """
        Ensure display ratio given as a string is turned into a float
        """
        return self._to_float(value)

    def format_audio_bit_rate(self, value):
        """
        Ensure bit rate is always an integer
        """
        return self._to_int(value)

    def format_audio_sampling_rate(self, value):
        """
        Ensure sampling rate is always an integer
        """
        return self._to_int(value)

    def formatted_attr_value(self, kind, data, name):
        """
        Read attribute ``name`` from ``data`` and apply its formatter if any.

        The formatter is the method named ``format_<kind>_<name>``, for example
        ``formatted_attr_value("video", DATA, "frame_rate")`` goes through
        ``format_video_frame_rate``. A missing attribute is read as ``None``.
        """
        value = getattr(data, name, None)
        formatter = getattr(self, "format_{}_{}".format(kind, name), None)

        if formatter is None:
            return value

        return formatter(value)

    def _track_fields(self, kind, track, fieldnames):
        """
        Build the ``{fieldname: formatted value}`` dictionary of a single track.
        """
        return {
            fieldname: self.formatted_attr_value(kind, track, fieldname)
            for fieldname in fieldnames
        }

    def _extract(self, media_info):
        """
        Build the metadata dictionary from an already parsed media object.

        ``media_info`` only needs the ``general_tracks``, ``video_tracks``,
        ``audio_tracks`` and ``text_tracks`` attributes, each one being a
        sequence of track objects.
        """
        data = {
            "general": {},
            "video": [],
            "audio": [],
            "subtitle": [],
        }

        # We only care about a single one general track, there should not be
        # more. When there is none the "general" item is left empty.
        general_tracks = media_info.general_tracks
        if general_tracks:
            data["general"] = self._track_fields(
                "general",
                general_tracks[0],
                self.GENERAL_FIELDS
            )

        # Every kind goes through the formatters so the "format_audio_*"
        # methods are applied to audio tracks too
        data["video"] = [
            self._track_fields("video", track, self.VIDEO_FIELDS)
            for track in media_info.video_tracks
        ]
        data["audio"] = [
            self._track_fields("audio", track, self.AUDIO_FIELDS)
            for track in media_info.audio_tracks
        ]
        data["subtitle"] = [
            self._track_fields("subtitle", track, self.SUBTITLE_FIELDS)
            for track in media_info.text_tracks
        ]

        return data

    def scan(self, filepath):
        """
        Parse ``filepath`` with MediaInfo and return the selected metadata.

        The returned dictionary has a ``general`` item (a dictionary of fields)
        and ``video``, ``audio`` and ``subtitle`` items (lists with one
        dictionary of fields per track).
        """
        return self._extract(MediaInfo.parse(filepath))


 if __name__ == "__main__":
    import json
    from pathlib import Path

    # Directory where the JSON reports are written (current working directory)
    OUTPUT_DIR = Path(".")

    SAMPLES = [
        Path("some_videos.mkv"),
        Path("another_videos.mp4"),
    ]

    parser = VideoMetaParser()
    for path in SAMPLES:
        output_filedst = "{}.json".format(path.name)
        print(path, output_filedst, path.exists())
        # Skip samples that are not on disk, there is nothing to scan
        if not path.exists():
            continue
        data = parser.scan(path)
        (OUTPUT_DIR / output_filedst).write_text(json.dumps(data, indent=4))
	"""
	Proof of concept script to use MediaInfo to get metadata from a video file.

	Although this script has been written only for videos, MediaInfo also allows reading information from audio and image files.

	This was first written with Python 3.10 but it should probably work with Python 3.8.

	MediaInfo library is required to be installed on your system, see:

	https://github.com/MediaArea/MediaInfo

	On Ubuntu you would get it with::
	    sudo apt-get install mediainfo

	Then the Python wrapper library:

	https://github.com/sbraz/pymediainfo

	You would install it with::
	    pip install pymediainfo

	Be aware that a recent pymediainfo version could be incompatible with a very old MediaInfo.
	You may install pymediainfo directly on your system with a package manager like apt, but
	it may then be difficult to use in a virtual Python environment, which is the recommended setup.

	"""
	from pymediainfo import MediaInfo


	class VideoMetaParser:
	    """
	    Collect a selection of metadata from the tracks reported by MediaInfo.

	    Each ``*_FIELDS`` list names the track attributes collected for one kind of
	    track. When a method named ``format_<kind>_<field>`` exists, the attribute
	    value goes through it before being stored; otherwise it is stored as read.
	    """
	    # Selected attribute names to get information from each track type
	    GENERAL_FIELDS = [
	        "format",
	        "duration",
	        "file_last_modification_date",
	    ]
	    VIDEO_FIELDS = [
	        "format",
	        "codec_id",
	        "width",
	        "height",
	        "bit_rate",
	        "frame_rate",
	        "pixel_aspect_ratio",
	        "display_aspect_ratio",
	    ]
	    AUDIO_FIELDS = [
	        "title",
	        "language",
	        "format",
	        "codec_id",
	        "bit_rate",
	        "sampling_rate",
	    ]
	    SUBTITLE_FIELDS = [
	        "title",
	        "language",
	        "format",
	        "codec_id",
	    ]

	    @staticmethod
	    def _to_int(value):
	        """
	        Return ``value`` as an integer when it is a non empty string or a float.

	        A string is parsed as an integer first and as a float second, so a
	        decimal string such as ``"1234.56"`` is truncated instead of raising.
	        A string that is not numeric at all still raises ``ValueError``. Falsy
	        values (``None``, ``0``, ``0.0``, ``""``) and values of any other type
	        are returned unchanged.
	        """
	        if not value:
	            return value

	        if isinstance(value, str):
	            try:
	                return int(value)
	            except ValueError:
	                # Not a plain integer literal, accept a decimal notation
	                return int(float(value))

	        if isinstance(value, float):
	            return int(value)

	        return value

	    @staticmethod
	    def _to_float(value):
	        """
	        Return ``value`` as a float when it is a non empty string.

	        Any other value is returned unchanged. A non numeric string raises
	        ``ValueError``.
	        """
	        if value and isinstance(value, str):
	            return float(value)

	        return value

	    def format_general_duration(self, value):
	        """
	        Ensure duration is always an integer (decimals are truncated).
	        """
	        return self._to_int(value)

	    def format_general_file_last_modification_date(self, value):
	        """
	        Return the date as an ISO 8601 like string carrying a timezone offset.

	        A leading ``"UTC "`` prefix is removed, ``"+00:00"`` is appended when the
	        string contains no ``+`` and the remaining spaces become ``T``. Only a
	        ``+`` offset is detected. Empty or non string values are returned
	        unchanged.
	        """
	        if not value or not isinstance(value, str):
	            return value

	        prefix = "UTC "
	        # Remove possible UTC prefix
	        if value.startswith(prefix):
	            value = value[len(prefix):]
	        # Add UTC timezone if there is not any
	        if "+" not in value:
	            value = value + "+00:00"
	        # Finish the ISO format
	        return value.replace(" ", "T")

	    def format_video_frame_rate(self, value):
	        """
	        Ensure frame rate is always an integer (``"23.976"`` becomes ``23``).
	        """
	        return self._to_int(value)

	    def format_video_bit_rate(self, value):
	        """
	        Ensure bit rate is always an integer
	        """
	        return self._to_int(value)

	    def format_video_pixel_aspect_ratio(self, value):
	        """
	        Ensure pixel ratio given as a string is turned into a float
	        """
	        return self._to_float(value)

	    def format_video_display_aspect_ratio(self, value):
	        """
	        Ensure display ratio given as a string is turned into a float
	        """
	        return self._to_float(value)

	    def format_audio_bit_rate(self, value):
	        """
	        Ensure bit rate is always an integer
	        """
	        return self._to_int(value)

	    def format_audio_sampling_rate(self, value):
	        """
	        Ensure sampling rate is always an integer
	        """
	        return self._to_int(value)

	    def formatted_attr_value(self, kind, data, name):
	        """
	        Read attribute ``name`` from ``data`` and apply its formatter if any.

	        The formatter is the method named ``format_<kind>_<name>``, for example
	        ``formatted_attr_value("video", DATA, "frame_rate")`` goes through
	        ``format_video_frame_rate``. A missing attribute is read as ``None``.
	        """
	        value = getattr(data, name, None)
	        formatter = getattr(self, "format_{}_{}".format(kind, name), None)

	        if formatter is None:
	            return value

	        return formatter(value)

	    def _track_fields(self, kind, track, fieldnames):
	        """
	        Build the ``{fieldname: formatted value}`` dictionary of a single track.
	        """
	        return {
	            fieldname: self.formatted_attr_value(kind, track, fieldname)
	            for fieldname in fieldnames
	        }

	    def _extract(self, media_info):
	        """
	        Build the metadata dictionary from an already parsed media object.

	        ``media_info`` only needs the ``general_tracks``, ``video_tracks``,
	        ``audio_tracks`` and ``text_tracks`` attributes, each one being a
	        sequence of track objects.
	        """
	        data = {
	            "general": {},
	            "video": [],
	            "audio": [],
	            "subtitle": [],
	        }

	        # We only care about a single one general track, there should not be
	        # more. When there is none the "general" item is left empty.
	        general_tracks = media_info.general_tracks
	        if general_tracks:
	            data["general"] = self._track_fields(
	                "general",
	                general_tracks[0],
	                self.GENERAL_FIELDS
	            )

	        # Every kind goes through the formatters so the "format_audio_*"
	        # methods are applied to audio tracks too
	        data["video"] = [
	            self._track_fields("video", track, self.VIDEO_FIELDS)
	            for track in media_info.video_tracks
	        ]
	        data["audio"] = [
	            self._track_fields("audio", track, self.AUDIO_FIELDS)
	            for track in media_info.audio_tracks
	        ]
	        data["subtitle"] = [
	            self._track_fields("subtitle", track, self.SUBTITLE_FIELDS)
	            for track in media_info.text_tracks
	        ]

	        return data

	    def scan(self, filepath):
	        """
	        Parse ``filepath`` with MediaInfo and return the selected metadata.

	        The returned dictionary has a ``general`` item (a dictionary of fields)
	        and ``video``, ``audio`` and ``subtitle`` items (lists with one
	        dictionary of fields per track).
	        """
	        return self._extract(MediaInfo.parse(filepath))


	if __name__ == "__main__":
	    import json
	    from pathlib import Path

	    # Directory where the JSON reports are written (current working directory)
	    OUTPUT_DIR = Path(".")

	    SAMPLES = [
	        Path("some_videos.mkv"),
	        Path("another_videos.mp4"),
	    ]

	    parser = VideoMetaParser()
	    for path in SAMPLES:
	        output_filedst = "{}.json".format(path.name)
	        print(path, output_filedst, path.exists())
	        # Skip samples that are not on disk, there is nothing to scan
	        if not path.exists():
	            continue
	        data = parser.scan(path)
	        (OUTPUT_DIR / output_filedst).write_text(json.dumps(data, indent=4))