# download-telegram-links-from-browser.md

import functools
import os.path
from pathlib import Path
from urllib.parse import urlparse

import bookmarks_parser
import ffmpeg
import piexif
from PIL import Image
from pymediainfo import MediaInfo
from ruamel.yaml import YAML
from ruamel.yaml.comments import \
    CommentedMap as OrderedDict
from ruamel.yaml.main import \
    round_trip_dump as yaml_dump
from telethon.sync import TelegramClient
from telethon.tl.types import InputMessagesFilterPhotoVideo

# Telegram API credentials; both are passed to TelegramClient in read_messages.
# NOTE(review): these look like placeholder values - presumably they must be
# replaced with real credentials before running; verify and avoid committing
# real secrets to the repository.
api_id = 12345678
api_hash = '87643576fea98735864576bcde786583'


def check_image_with_pil(path):
    """Report whether Pillow can open ``path`` as an image.

    Args:
        path: Path of the file to probe.

    Returns:
        True if ``Image.open`` succeeds, False if it raises ``IOError``.
    """
    try:
        # Open the image as a context manager so the file handle kept by
        # Image.open is released right away instead of leaking.
        with Image.open(path):
            return True
    except IOError:
        return False


def check_video(path):
    """Tell whether MediaInfo finds at least one "Video" track in ``path``.

    Args:
        path: Path of the file to inspect.

    Returns:
        True if any parsed track has ``track_type == "Video"``, else False.
    """
    parsed_tracks = MediaInfo.parse(path).tracks
    return any(item.track_type == "Video" for item in parsed_tracks)


def write_file_info(file_path, comments, date):
    """Embed a comment and a timestamp into a media file, rewriting it in place.

    Files that Pillow can open get the comment in the EXIF ``XPComment`` tag
    and the date in ``DateTimeOriginal``. Files in which ``check_video`` finds
    a video track are re-written by ffmpeg (streams copied, not re-encoded)
    with ``comment`` and ``creation_time`` metadata. Any other file is left
    untouched.

    Args:
        file_path: Path of the media file to update.
        comments: Text to store as the comment.
        date: Object with ``strftime`` (the caller passes ``message.date``)
            stored as the original / creation time.
    """
    if check_image_with_pil(file_path):
        # The context manager guarantees the source file handle is released.
        with Image.open(file_path) as img:
            exif_dict = {"0th": {},
                         "Exif": {},
                         "GPS": {},
                         "Interop": {},
                         "1st": {},
                         "thumbnail": None}
            # Start from the existing EXIF block when the image has one so
            # other tags are preserved.
            if img.info and 'exif' in img.info:
                exif_dict = piexif.load(img.info['exif'])
            # NOTE(review): 'utf-16' prepends a BOM; XP* tags are commonly
            # written as UTF-16LE - confirm how the target viewers render it.
            exif_dict['0th'][piexif.ImageIFD.XPComment] = comments.encode('utf-16')
            exif_dict['Exif'][piexif.ExifIFD.DateTimeOriginal] = date.strftime("%Y:%m:%d %H:%M:%S")
            exif_bytes = piexif.dump(exif_dict)
            img.save(file_path, exif=exif_bytes)
    elif check_video(file_path):
        directory, filename = os.path.split(file_path)
        # splitext keeps the leading dot in the extension, so it must not be
        # added again (the old code produced names like "123_tmp..mp4").
        stem, extension = os.path.splitext(filename)
        tmp_file_path = os.path.join(directory, f'{stem}_tmp{extension}')
        stream = ffmpeg.input(file_path)
        # todo see exiftool https://stackoverflow.com/questions/9981599/how-to-set-the-media-created-date-on-an-mpeg-4-file
        # set comments https://stackoverflow.com/questions/16082729/ffmpeg-multiline-text-in-metadata-comment-tag ???
        # creation date:  https://stackoverflow.com/questions/40354172/change-avi-creation-date-with-ffmpeg
        # creation_time + powershell: https://stackoverflow.com/questions/66595557/how-to-modify-media-created-field-in-file-properties-via-powershell
        # Distinct 'metadata:g:N' keys let two metadata entries coexist in one
        # kwargs dict; y=None presumably emits a bare "-y" (overwrite) flag.
        stream = ffmpeg.output(stream, tmp_file_path, vcodec="copy", acodec="copy", loglevel='quiet',
                               **{'metadata:g:0': "comment=" + comments, 'metadata:g:1': "creation_time=" + str(date)},
                               y=None)
        ffmpeg.run(stream)
        # Swap the tagged copy into place in one step instead of
        # remove + rename, which briefly leaves no file at file_path.
        os.replace(tmp_file_path, file_path)


def read_messages(user, name, min_id, ids_set, message_id_to_title, limit=None):
    """Download the wanted photo/video messages of one channel and index them.

    Working files live under ``./{name}``: ``index.yaml`` (one entry per
    downloaded message), ``offset.txt`` (highest message id seen so far, used
    as the starting point of the next run) and ``data/`` (downloaded media).
    Each downloaded file is tagged through ``write_file_info`` with the
    bookmark title, the message link and the message date.

    Args:
        user: Channel reference handed to ``client.get_messages`` (the caller
            passes ``'https://t.me/' + channel_name``).
        name: Directory name, relative to the current directory, for output.
        min_id: Lowest message id of interest; fetching starts just below it
            unless ``offset.txt`` holds a larger value.
        ids_set: Ids of the messages to download. Not modified; a copy is used.
        message_id_to_title: Mapping of message id to bookmark title.
        limit: Passed through to ``client.get_messages``.
    """
    ids_set = ids_set.copy()  # ids are removed as they are handled
    client = TelegramClient('test_session', api_id, api_hash)
    client.start()
    try:
        os.makedirs(name, exist_ok=True)

        index_file_path = f'./{name}/index.yaml'.replace("\\", "/", )
        if not os.path.exists(index_file_path):
            open(index_file_path, 'a').close()

        # None when the index file is still empty.
        current_index = YAML(typ='safe').load(Path(index_file_path))

        offset_id = 0
        offset_file = f'./{name}/offset.txt'.replace("\\", "/", )
        if os.path.exists(offset_file):
            with open(offset_file) as f:
                stored_offset = f.read().strip()
            # An interrupted run may leave the file empty; treat that as 0.
            offset_id = int(stored_offset) if stored_offset else 0
        offset_id = max(offset_id, min_id - 1)  # exclusive
        print(f'offset is {offset_id}')

        max_id = -1
        with open(index_file_path, 'a', encoding='utf-8') as index_file:
            print('Read messages for channel={}, min_id={}, limit={}'.format(user, offset_id, limit))
            messages = client.get_messages(user, min_id=offset_id, limit=limit, reverse=True,
                                           filter=InputMessagesFilterPhotoVideo)
            for message in messages:
                max_id = max(message.id, max_id)
                # NOTE(review): the offset is persisted before the message is
                # processed, so a failure below skips this message next run.
                with open(offset_file, 'w') as f:
                    f.write(str(max_id))
                    f.flush()
                if message.id not in ids_set:
                    continue
                ids_set.remove(message.id)
                message_id_str = f'{message.id}'
                if current_index and current_index.get(message_id_str):
                    # Fixed: the old '{message_id_str}'.format(positional)
                    # call raised KeyError instead of printing.
                    print(f'Message {message_id_str} has already loaded.')
                    continue
                out_folder = f'./{name}/data'
                out_file = f'{out_folder}/{message_id_str}'
                print("download media " + out_file)
                media_path = message.download_media(file=out_file)
                if media_path is None:
                    # Nothing was saved; there is no file to index or tag.
                    print("no media downloaded for message " + message_id_str)
                    continue
                media_path = media_path.replace("\\", "/", )
                entry = OrderedDict({
                    message_id_str: OrderedDict({
                        "media": media_path,
                        "text": message.text
                    })
                })
                entry_str = yaml_dump(entry)
                index_file.write("\n\n" + entry_str)
                index_file.flush()
                # A bookmark may lack a title; fall back to an empty string.
                bookmark_title = message_id_to_title.get(message.id) or ''
                write_file_info(media_path, bookmark_title + "\n\n" + user + "/" + message_id_str, message.date)
                if not ids_set:
                    # Fixed: this message used to be a bare string expression.
                    print('All ids were considered. Stopping process for channel {}...'.format(user))
                    break
    finally:
        # Always release the Telegram connection, even on failure.
        client.disconnect()


if __name__ == '__main__':
    # Entry point: read exported browser bookmarks, collect the t.me message
    # links found in one bookmark folder, and download them channel by channel.
    bookmarks = bookmarks_parser.parse("./bookmarks_10.04.2022.html")
    # Titles of the nested folders to descend through, outermost first.
    bookmarks_folder_path = ['Панель закладок', '!', 'вн']
    # At each level pick the first item with the matching title and continue
    # with its children (raises IndexError if a folder is missing).
    bookmarks = functools.reduce(
        lambda items, title: list(filter(lambda it: it.get('title') == title, items))[0].get('children'),
        bookmarks_folder_path,
        bookmarks
    )


    def index_bookmark(index, bookmark):
        """Add one bookmark to the per-channel index and return the index.

        ``index`` maps a channel name to
        ``{'ids': [...], 'message_id_to_title': {...}}``. Bookmarks that have
        no URL, do not mention ``t.me``, or do not end in a numeric message id
        under a channel path are ignored.
        """
        url = bookmark.get('url')
        title = bookmark.get('title')
        # Sub-folders carry no URL; skip them instead of failing on None.
        if not url or 't.me' not in url:
            return index
        segments = urlparse(url).path.split('/')
        ch_name = '/'.join(segments[1:-1])
        try:
            msg_id = int(segments[-1])
        except ValueError:
            # Not a message link (e.g. a bare channel link).
            return index
        if not ch_name:
            # No channel part in the path; it would yield an empty directory.
            return index

        ch_info = index.setdefault(ch_name, {'ids': [], 'message_id_to_title': {}})
        ch_info['ids'].append(msg_id)
        ch_info['message_id_to_title'][msg_id] = title
        return index


    channel_infos = functools.reduce(
        index_bookmark,
        bookmarks,
        {}
    )
    for channel_name, info in channel_infos.items():
        ids = info.get('ids')
        message_id_to_title = info.get('message_id_to_title')
        read_messages('https://t.me/' + channel_name, channel_name, min(ids), set(ids), message_id_to_title, None)
# gorshkov-leonid/download-telegram-links-from-browser.md