Skip to content

Instantly share code, notes, and snippets.

@kai2nenobu
Last active February 14, 2025 09:53
Show Gist options
  • Save kai2nenobu/df593179632c67128cb7ec453e1f3b18 to your computer and use it in GitHub Desktop.
Save kai2nenobu/df593179632c67128cb7ec453e1f3b18 to your computer and use it in GitHub Desktop.
Convert a mht file for Microsoft Teams Wiki into markdown format
import argparse
import logging
import sys
from bs4 import Tag, NavigableString, BeautifulSoup
# Module-level logger named after this module; the converter logs every visited node through it.
logger = logging.getLogger(__name__)
class Context:
    """Traversal state kept while an HTML tree is converted to Markdown.

    Attributes:
        tags: Stack of tag names; ``down_tag`` pushes and ``up_tag`` pops.
        indent_level: List nesting counter changed by ``inc_indent`` and
            ``dec_indent`` (0 outside any list).
        table_shape: ``(rows, columns)`` pair, initially ``(0, 0)``.
        table_index: ``(row, column)`` counters advanced by
            ``next_row_index`` / ``next_column_index``.
    """

    # Four spaces per nesting level: enough for a nested item to be parsed as
    # a child of both "-" and "1." items (a single space would make it a
    # sibling of its parent instead).
    _INDENT_UNIT = '    '

    def __init__(self):
        self.tags = []
        self.indent_level = 0
        self.table_shape = (0, 0)
        self.table_index = (0, 0)

    @property
    def tag_depth(self):
        """Number of tag names currently on the stack."""
        return len(self.tags)

    def down_tag(self, tag: str):
        """Push ``tag`` onto the tag stack."""
        self.tags.append(tag)

    def up_tag(self):
        """Pop the most recently pushed tag name and return it."""
        return self.tags.pop()

    def inc_indent(self):
        """Increase the list nesting level by one and return the new level."""
        self.indent_level += 1
        return self.indent_level

    def dec_indent(self):
        """Decrease the list nesting level by one and return the new level."""
        self.indent_level -= 1
        return self.indent_level

    def bullet_prefix(self):
        """
        Return a bullet sign for listing.

        The sign is ``-`` or ``1.`` depending on the innermost ``ul``/``ol``
        on the tag stack, preceded by one indent unit per nesting level above
        the first.

        Raises:
            ValueError: If neither ``ul`` nor ``ol`` is on the tag stack.
        """
        indent = self._INDENT_UNIT * (self.indent_level - 1)
        # Walk from the innermost tag outwards so the nearest list decides.
        for tag in reversed(self.tags):
            if tag == 'ul':
                return indent + '-'
            if tag == 'ol':
                return indent + '1.'
        raise ValueError('Not inside "ul" or "ol".')

    def inside_table(self):
        """Return True when a ``table`` tag is anywhere on the tag stack."""
        return 'table' in self.tags

    def is_second_row(self):
        """Return True when the row counter equals 2."""
        row, _ = self.table_index
        return row == 2

    def next_row_index(self):
        """Advance the row counter by one; the column counter is kept."""
        row, col = self.table_index
        self.table_index = (row + 1, col)

    def next_column_index(self):
        """Advance the column counter by one; the row counter is kept."""
        row, col = self.table_index
        self.table_index = (row, col + 1)

    def clear_table_index(self):
        """Reset both table counters to zero."""
        self.table_index = (0, 0)
class MarkdownConverter:
    """Convert a parsed Teams Wiki HTML tree into Markdown text.

    Usage: ``MarkdownConverter().convert(soup.html)``.  Tags without a
    conversion rule are rendered as ``<<<CANNOT CONVERT name>>>`` so they are
    easy to spot in the output.
    """

    # Tags rendered as "<prefix><converted children><suffix>".
    _WRAPPED_TAGS = {
        'h1': ('# ', '\n\n'),
        'p': ('', '\n'),
        'blockquote': ('>', '\n'),
        'code': ('`', '`'),
        'span': ('', ''),
        'u': ('', ''),
        'b': ('**', '**'),
        'em': ('*', '*'),
        'strong': ('__', '__'),
        'tbody': ('', '\n'),
    }

    # (label, attribute name) pairs copied from <html> into the metadata
    # comment, in output order.
    _METADATA_FIELDS = (
        ('page', 'data-page'),
        ('canvas', 'data-canvas'),
        ('site', 'data-site'),
        ('list', 'data-list'),
        ('tabId', 'data-tabid'),
        ('slug', 'data-slug'),
        ('threadId', 'data-threadid'),
    )

    def __init__(self):
        # Replaced by a fresh Context on every convert() call.
        self._context: Context = None

    def convert(self, content: Tag):
        """Convert ``content`` and everything below it; return the Markdown."""
        self._context = Context()
        return self._convert(content)

    def _convert(self, content):
        """Convert one node: a tag (recursively) or a text node."""
        if isinstance(content, Tag):
            logger.info('%s => %s', type(content), content.name)
            self._context.down_tag(content.name)
            try:
                return self._convert_tag(content)
            finally:
                # Always pop, so the tag stack stays balanced on errors too.
                self._context.up_tag()
        if isinstance(content, NavigableString):
            logger.info('%s => %s', type(content), content)
            return str(content).strip()
        # Anything else contributes no text.  Returning None here would make
        # the ''.join() in _convert_contents() raise TypeError.
        return ''

    def _convert_tag(self, tag: Tag):
        """Apply the conversion rule that matches ``tag.name``."""
        name = tag.name
        wrapper = self._WRAPPED_TAGS.get(name)
        if wrapper is not None:
            prefix, suffix = wrapper
            return f'{prefix}{self._convert_contents(tag)}{suffix}'
        if name == 'html':
            return self._convert_html(tag)
        if name == 'h3':
            return self._convert_h3(tag)
        if name == 'div':
            return self._convert_div(tag)
        if name == 'pre':
            # Tags below <pre> are not converted; its text is taken verbatim.
            return f'```\n{tag.text.strip()}\n```\n'
        if name == 'a':
            return self._convert_a(tag)
        if name == 'br':
            return self._convert_br()
        if name in ('ul', 'ol'):
            return self._convert_list(tag)
        if name == 'li':
            return self._convert_li(tag)
        if name == 'table':
            return self._convert_table(tag)
        if name == 'colgroup':
            return ''  # ignore
        if name == 'tr':
            # The row counter must advance before the row is rendered,
            # because _convert_tr() checks it to place the separator.
            self._context.next_row_index()
            return self._convert_tr(tag)
        if name == 'td':
            self._context.next_column_index()
            return f'{self._convert_contents(tag).strip()} | '
        if name == 'img':
            return self._convert_img(tag)
        return f'<<<CANNOT CONVERT {name}>>>'

    def _convert_contents(self, tag: Tag):
        """Convert every child of ``tag`` and concatenate the results."""
        return ''.join(self._convert(child) for child in tag.children)

    def _convert_a(self, a: Tag):
        """Render a link; an anchor without ``href`` yields only its text."""
        text = self._convert_contents(a)
        href = a.attrs.get('href')
        if not href:
            return text
        if href == text:
            # When the visible text equals the URL, output the URL alone.
            return href
        return f'[{text}]({href})'

    def _convert_list(self, tag: Tag):
        """Render a ``ul``/``ol`` one nesting level deeper than its parent."""
        self._context.inc_indent()
        try:
            return f'\n{self._convert_contents(tag)}'
        finally:
            self._context.dec_indent()

    def _convert_li(self, li: Tag):
        """Render one list item as a single bullet line."""
        inner_text = self._convert_contents(li)
        prefix = self._context.bullet_prefix()
        return f'{prefix} {inner_text}\n'

    def _convert_html(self, html: Tag):
        """Render the document, preceded by its metadata as an HTML comment.

        Attributes that are missing from ``<html>`` are written as empty
        values instead of raising ``KeyError``.
        """
        lines = ['<!--']
        lines.extend(
            f'{label}: {html.attrs.get(attr, "")}'
            for label, attr in self._METADATA_FIELDS
        )
        lines.append('-->')
        metadata = '\n'.join(lines) + '\n'
        return metadata + self._convert_contents(html)

    def _convert_h3(self, tag: Tag):
        """Render a level-3 heading, skipping Teams' hidden note headings."""
        if 'wiki-mht-note' in tag.attrs.get('class', []):
            # "wiki-mht-note" is a Teams-specific hidden class, so it is
            # special-cased and not converted to Markdown.
            return ''
        return f'### {self._convert_contents(tag)}\n\n'

    def _convert_br(self):
        """Render a line break: ``<br />`` inside tables, a newline elsewhere."""
        if self._context.inside_table():
            return '<br />'
        return ' \n'

    def _convert_div(self, tag):
        """Render a ``div`` as a line; inside tables it ends with ``<br />``."""
        terminator = '<br />' if self._context.inside_table() else '\n'
        return f'{self._convert_contents(tag)}{terminator}'

    def _convert_table(self, table: Tag):
        """Render a table and reset the table counters afterwards."""
        try:
            rows = table.find_all('tr')
            cols = len(table.find_all('col'))
            if cols == 0 and rows:
                # No <col> elements: size the header separator from the
                # cells of the first row, otherwise it would be just "|".
                cols = len(rows[0].find_all('td', recursive=False))
            self._context.table_shape = (len(rows), cols)
            return f'\n{self._convert_contents(table)}\n'
        finally:
            self._context.clear_table_index()

    def _convert_tr(self, tr: Tag):
        """Render one table row, with the header separator before row two."""
        if self._context.is_second_row():
            # The separator between header and body goes before the second row.
            separator = '|' + (' --- |' * self._context.table_shape[1]) + '\n'
        else:
            separator = ''
        return separator + '| ' + self._convert_contents(tr) + '\n'

    def _convert_img(self, img: Tag):
        """Render an image; returns '' when no image location is available."""
        preview = img.attrs.get('data-preview-src')
        if preview is not None:
            # "data-preview-src" attribute is a string like below.
            # "/sites/msteams_1b3d5f/Teams Wiki Data/General/img-123-fe3028d38dc34d1e94b6cd350d0f9941.png"
            # The last entry is the image file stored in the "Teams Wiki Data"
            # folder, so link it as an embedded image.
            url = preview.split('/')[-1]
        else:
            # Preserve the original "src" attribute.
            url = img.attrs.get('src', '')
        if not url:
            return ''
        return f'![]({url})'
def cli_main():
    """Command-line entry point: convert one mhtml file and print the Markdown.

    Reads the file named on the command line as UTF-8, parses it with
    BeautifulSoup and prints the conversion of its ``<html>`` element to
    standard output.  Log messages go to standard error; ``--verbose`` lowers
    the log level to INFO.  Exits through ``parser.error`` when the file
    contains no ``<html>`` element.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--verbose", action="store_true", help="output logging message")
    parser.add_argument("file", type=str, help="mhtml file to convert into markdown")
    args = parser.parse_args(sys.argv[1:])

    logging.basicConfig(level=logging.INFO if args.verbose else logging.WARNING, stream=sys.stderr)
    with open(args.file, mode='r', encoding='utf-8') as f:
        html_text = f.read()
    soup = BeautifulSoup(html_text, "html.parser")  # parse the HTML

    html: Tag = soup.html
    if html is None:
        # Without this check the converter result would be printed as "None".
        parser.error(f'no <html> element found in {args.file}')
    converter = MarkdownConverter()
    markdown = converter.convert(html)
    print(markdown)


if __name__ == '__main__':
    cli_main()
# Packaging metadata so the script can be installed as the "convert-teams-wiki" command.
[project]
name = "convert-teams-wiki"
version = "0.1.1"
description = "Convert a mht file for Microsoft Teams Wiki into markdown format"
requires-python = ">=3.6"
classifiers = [
"Programming Language :: Python :: 3",
]
dependencies = [
"beautifulsoup4>=4.5",
]
[project.scripts]
# Command name -> "module:function" entry point.
convert-teams-wiki = "convert_teams_wiki:cli_main"
[build-system]
# NOTE(review): reading the [project] table presumably needs a much newer setuptools than 40.8.0 — confirm this lower bound.
requires = ["setuptools>=40.8.0", "wheel"]
build-backend = "setuptools.build_meta:__legacy__"
@minusInfinite
Copy link

minusInfinite commented Feb 8, 2022

Thanks for the script, this will assist in moving my MS Teams wiki into another project.

While not valid for Markdown, it is possible to get images by adding the following _convert function

def _convert_tag(self, tag: Tag):
...
elif tag.name == 'img':
            return self._convert_img(tag)
...
...
def _convert_img(self, img: Tag):
        """Render an image as a Markdown image link to its bare file name.

        The file name is the last path segment of the ``data-preview-src``
        attribute.  Returns an empty string when the attribute is absent.
        """
        src = img.attrs.get('data-preview-src')
        if src is None:
            return ''
        # Take the last entry of the split list instead of a fixed index, so
        # the result does not depend on how deep the path is.
        path = src.split("/")[-1]
        return f'![](/{path})\n'
...

@nbonnec
Copy link

nbonnec commented Feb 18, 2022

Sometimes the attribute is missing, so:

    def _convert_img(self, img: Tag):
        """Render an image as a Markdown image link to its bare file name.

        The file name is the last path segment of the ``data-preview-src``
        attribute.  Returns an empty string when the attribute is absent.
        """
        src = img.attrs.get('data-preview-src')
        if src is None:
            return ''
        # Take the last entry of the split list instead of a fixed index, so
        # the result does not depend on how deep the path is.
        path = src.split("/")[-1]
        return f'![](/{path})\n'

@kai2nenobu
Copy link
Author

Added a file for packaging. You can now install the convert-teams-wiki command with pip or pipx.

pip install git+https://gist.github.com/df593179632c67128cb7ec453e1f3b18.git
# or
pipx install git+https://gist.github.com/df593179632c67128cb7ec453e1f3b18.git

@kai2nenobu
Copy link
Author

Change in 0.1.1

  • add --verbose option to output logging message

@matnav
Copy link

matnav commented Mar 11, 2024

Proposed Updates

Changes:

  • Added <h2> Tag Handling: Introduced conversion support for <h2> tags to ensure that they are properly converted to Markdown headers, enhancing the semantic structuring of the converted content.
  • Introduced <s> Tag Ignoring: Implemented logic to ignore <s> tags during conversion. This ensures that strikethrough text, which may not be relevant or correctly supported in the target Markdown context, is excluded, maintaining content clarity.
import argparse
import logging
import sys

from bs4 import Tag, NavigableString, BeautifulSoup

# Module-level logger named after this module; the converter logs every visited node through it.
logger = logging.getLogger(__name__)


class Context:
    """Bookkeeping for one Markdown conversion pass.

    Holds the stack of entered tag names, the list nesting level, and the
    shape and position counters used while a table is rendered.
    """

    def __init__(self):
        self.tags = []             # names of the entered tags, innermost last
        self.indent_level = 0      # list nesting level
        self.table_shape = (0, 0)  # (rows, columns)
        self.table_index = (0, 0)  # (row, column)

    @property
    def tag_depth(self):
        """Number of tag names currently on the stack."""
        return len(self.tags)

    def down_tag(self, tag: str):
        """Push ``tag`` onto the tag stack."""
        self.tags += [tag]

    def up_tag(self):
        """Pop the innermost tag name and return it."""
        return self.tags.pop()

    def inc_indent(self):
        """Go one list level deeper; return the new level."""
        level = self.indent_level + 1
        self.indent_level = level
        return level

    def dec_indent(self):
        """Go one list level up; return the new level."""
        level = self.indent_level - 1
        self.indent_level = level
        return level

    def bullet_prefix(self):
        """
        Return the indented bullet sign for the current list item.

        The innermost ``ul``/``ol`` on the tag stack selects ``-`` or ``1.``;
        four spaces are prepended per nesting level above the first.
        Raises ValueError when no list tag is on the stack.
        """
        markers = {'ul': '-', 'ol': '1.'}
        padding = '    ' * (self.indent_level - 1)
        nearest = next((name for name in self.tags[::-1] if name in markers), None)
        if nearest is None:
            raise ValueError('Not inside "ul" or "ol".')
        return padding + markers[nearest]

    def inside_table(self):
        """Tell whether a ``table`` tag is anywhere on the tag stack."""
        return any(name == 'table' for name in self.tags)

    def is_second_row(self):
        """Tell whether the row counter equals 2."""
        current_row, _unused = self.table_index
        return current_row == 2

    def next_row_index(self):
        """Advance the row counter by one, keeping the column counter."""
        current_row, current_col = self.table_index
        self.table_index = (current_row + 1, current_col)

    def next_column_index(self):
        """Advance the column counter by one, keeping the row counter."""
        current_row, current_col = self.table_index
        self.table_index = (current_row, current_col + 1)

    def clear_table_index(self):
        """Reset both table counters to zero."""
        self.table_index = (0, 0)


class MarkdownConverter:
    """Convert a parsed Teams Wiki HTML tree into Markdown text.

    Usage: ``MarkdownConverter().convert(soup.html)``.  Tags without a
    conversion rule are rendered as ``<<<CANNOT CONVERT name>>>`` so they are
    easy to spot in the output.
    """

    # Tags rendered as "<prefix><converted children><suffix>".
    _WRAPPED_TAGS = {
        'h1': ('# ', '\n\n'),
        'p': ('', '\n'),
        'blockquote': ('>', '\n'),
        'code': ('`', '`'),
        'span': ('', ''),
        'u': ('', ''),
        'b': ('**', '**'),
        'em': ('*', '*'),
        'strong': ('__', '__'),
        'tbody': ('', '\n'),
    }

    # (label, attribute name) pairs copied from <html> into the metadata
    # comment, in output order.
    _METADATA_FIELDS = (
        ('page', 'data-page'),
        ('canvas', 'data-canvas'),
        ('site', 'data-site'),
        ('list', 'data-list'),
        ('tabId', 'data-tabid'),
        ('slug', 'data-slug'),
        ('threadId', 'data-threadid'),
    )

    def __init__(self):
        # Replaced by a fresh Context on every convert() call.
        self._context: Context = None

    def convert(self, content: Tag):
        """Convert ``content`` and everything below it; return the Markdown."""
        self._context = Context()
        return self._convert(content)

    def _convert(self, content):
        """Convert one node: a tag (recursively) or a text node."""
        if isinstance(content, Tag):
            logger.info('%s => %s', type(content), content.name)
            self._context.down_tag(content.name)
            try:
                return self._convert_tag(content)
            finally:
                # Always pop, so the tag stack stays balanced on errors too.
                self._context.up_tag()
        if isinstance(content, NavigableString):
            logger.info('%s => %s', type(content), content)
            return str(content).strip()
        # Anything else contributes no text.  Returning None here would make
        # the ''.join() in _convert_contents() raise TypeError.
        return ''

    def _convert_tag(self, tag: Tag):
        """Apply the conversion rule that matches ``tag.name``."""
        name = tag.name
        wrapper = self._WRAPPED_TAGS.get(name)
        if wrapper is not None:
            prefix, suffix = wrapper
            return f'{prefix}{self._convert_contents(tag)}{suffix}'
        if name == 'html':
            return self._convert_html(tag)
        if name == 's':
            return ''  # struck-through text is left out of the output
        if name == 'h2':
            return self._convert_h2(tag)
        if name == 'h3':
            return self._convert_h3(tag)
        if name == 'div':
            return self._convert_div(tag)
        if name == 'pre':
            # Tags below <pre> are not converted; its text is taken verbatim.
            return f'```\n{tag.text.strip()}\n```\n'
        if name == 'a':
            return self._convert_a(tag)
        if name == 'br':
            return self._convert_br()
        if name in ('ul', 'ol'):
            return self._convert_list(tag)
        if name == 'li':
            return self._convert_li(tag)
        if name == 'table':
            return self._convert_table(tag)
        if name == 'colgroup':
            return ''  # ignore
        if name == 'tr':
            # The row counter must advance before the row is rendered,
            # because _convert_tr() checks it to place the separator.
            self._context.next_row_index()
            return self._convert_tr(tag)
        if name == 'td':
            self._context.next_column_index()
            return f'{self._convert_contents(tag).strip()} | '
        if name == 'img':
            return self._convert_img(tag)
        return f'<<<CANNOT CONVERT {name}>>>'

    def _convert_contents(self, tag: Tag):
        """Convert every child of ``tag`` and concatenate the results."""
        return ''.join(self._convert(child) for child in tag.children)

    def _convert_a(self, a: Tag):
        """Render a link; an anchor without ``href`` yields only its text."""
        text = self._convert_contents(a)
        href = a.attrs.get('href')
        if not href:
            return text
        if href == text:
            # When the visible text equals the URL, output the URL alone.
            return href
        return f'[{text}]({href})'

    def _convert_list(self, tag: Tag):
        """Render a ``ul``/``ol`` one nesting level deeper than its parent."""
        self._context.inc_indent()
        try:
            return f'\n{self._convert_contents(tag)}'
        finally:
            self._context.dec_indent()

    def _convert_li(self, li: Tag):
        """Render one list item as a single bullet line."""
        inner_text = self._convert_contents(li)
        prefix = self._context.bullet_prefix()
        return f'{prefix} {inner_text}\n'

    def _convert_html(self, html: Tag):
        """Render the document, preceded by its metadata as an HTML comment.

        Attributes that are missing from ``<html>`` are written as empty
        values instead of raising ``KeyError``.
        """
        lines = ['<!--']
        lines.extend(
            f'{label}: {html.attrs.get(attr, "")}'
            for label, attr in self._METADATA_FIELDS
        )
        lines.append('-->')
        # A blank line separates the comment from the converted content.
        metadata = '\n'.join(lines) + '\n\n'
        return metadata + self._convert_contents(html)

    def _convert_heading(self, tag: Tag, level: int):
        """Render a heading with ``level`` leading ``#`` characters."""
        if 'wiki-mht-note' in tag.attrs.get('class', []):
            # "wiki-mht-note" is a Teams-specific hidden class, so it is
            # special-cased and not converted to Markdown.
            return ''
        return f'{"#" * level} {self._convert_contents(tag)}\n\n'

    def _convert_h3(self, tag: Tag):
        """Render a level-3 heading."""
        return self._convert_heading(tag, 3)

    def _convert_h2(self, tag: Tag):
        """Render a level-2 heading."""
        return self._convert_heading(tag, 2)

    def _convert_br(self):
        """Render a line break: ``<br />`` inside tables, a newline elsewhere."""
        if self._context.inside_table():
            return '<br />'
        return ' \n'

    def _convert_div(self, tag):
        """Render a ``div`` as a line; inside tables it ends with ``<br />``."""
        terminator = '<br />' if self._context.inside_table() else '\n'
        return f'{self._convert_contents(tag)}{terminator}'

    def _convert_table(self, table: Tag):
        """Render a table and reset the table counters afterwards."""
        try:
            rows = table.find_all('tr')
            cols = len(table.find_all('col'))
            if cols == 0 and rows:
                # No <col> elements: size the header separator from the
                # cells of the first row, otherwise it would be just "|".
                cols = len(rows[0].find_all('td', recursive=False))
            self._context.table_shape = (len(rows), cols)
            return f'\n{self._convert_contents(table)}\n'
        finally:
            self._context.clear_table_index()

    def _convert_tr(self, tr: Tag):
        """Render one table row, with the header separator before row two."""
        if self._context.is_second_row():
            # The separator between header and body goes before the second row.
            separator = '|' + (' --- |' * self._context.table_shape[1]) + '\n'
        else:
            separator = ''
        return separator + '| ' + self._convert_contents(tr) + '\n'

    def _convert_img(self, img: Tag):
        """Render an image; returns '' when no image location is available."""
        preview = img.attrs.get('data-preview-src')
        if preview is not None:
            # "data-preview-src" attribute is a string like below.
            # "/sites/msteams_1b3d5f/Teams Wiki Data/General/img-123-fe3028d38dc34d1e94b6cd350d0f9941.png"
            # The last entry is the image file stored in the "Teams Wiki Data"
            # folder, so link it as an embedded image.
            url = preview.split('/')[-1]
        else:
            # Preserve the original "src" attribute.
            url = img.attrs.get('src', '')
        if not url:
            return ''
        return f'![]({url})'

def cli_main():
    """Command-line entry point: convert one mhtml file and print the Markdown.

    Reads the file named on the command line as UTF-8, parses it with
    BeautifulSoup and prints the conversion of its ``<html>`` element to
    standard output.  Log messages go to standard error; ``--verbose`` lowers
    the log level to INFO.  Exits through ``parser.error`` when the file
    contains no ``<html>`` element.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--verbose", action="store_true", help="output logging message")
    parser.add_argument("file", type=str, help="mhtml file to convert into markdown")
    args = parser.parse_args(sys.argv[1:])

    logging.basicConfig(level=logging.INFO if args.verbose else logging.WARNING, stream=sys.stderr)
    with open(args.file, mode='r', encoding='utf-8') as f:
        html_text = f.read()
    soup = BeautifulSoup(html_text, "html.parser")  # parse the HTML

    html: Tag = soup.html
    if html is None:
        # Without this check the converter result would be printed as "None".
        parser.error(f'no <html> element found in {args.file}')
    converter = MarkdownConverter()
    markdown = converter.convert(html)
    print(markdown)


# Run the command-line interface only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    cli_main()

@schyen
Copy link

schyen commented Feb 14, 2025

Thank you for this! This was super helpful and worked beautifully for me.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment