Last active
August 9, 2024 17:41
-
-
Save pukkandan/ee737fec64822f2552caf3ca4cbf5db7 to your computer and use it in GitHub Desktop.
Prettify and nest comments from yt-dlp's info.json file and write it to a new html/json file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
SPDX-License-Identifier: MIT https://opensource.org/licenses/MIT | |
Copyright © 2021 [email protected] | |
* Input file is an info.json (with comments) that yt-dlp (https://github.com/yt-dlp/yt-dlp) wrote | |
* Change FIELDS according to your needs | |
The output file will be in the format: | |
[{ | |
'text': 'comment 1', | |
... | |
'replies': [{ | |
'text': 'reply 1', | |
... | |
'replies': [...], | |
}, ...], | |
}, ...] | |
""" | |
import os.path | |
import json | |
import argparse | |
from datetime import datetime | |
def get_fields(dct): | |
for name, fn in FIELDS.items(): | |
val = fn(dct, name) | |
if val is not None: | |
yield name, val | |
def filter_func(comments): | |
return [dict(get_fields(c)) for c in comments] | |
FIELDS = { | |
'text': dict.get, | |
'author': dict.get, | |
'timestamp': lambda dct, name: dct.get(name) and datetime.strftime( | |
datetime.utcfromtimestamp(dct.get(name)), '%Y/%m/%d'), | |
# Add more fields here | |
'replies': lambda dct, name: filter_func(dct.get(name, [])) or None | |
} | |
parser = argparse.ArgumentParser() | |
parser.add_argument( | |
'--input-file', '-i', | |
dest='inputfile', metavar='FILE', required=True, | |
help='File to read video metadata from (info.json)') | |
parser.add_argument( | |
'--output-file', '-o', | |
dest='outputfile', metavar='FILE', required=True, | |
help='File to write comments to (json / html)') | |
args = parser.parse_args() | |
ext = os.path.splitext(args.outputfile)[1][1:] | |
if ext == 'html': # Error early | |
try: | |
from json2html import json2html | |
except ImportError: | |
raise SystemExit('ERROR: json2html is required for html output. Install it with pip install json2html') | |
elif ext != 'json': | |
raise SystemExit(f'ERROR: Only json and html formats are supported, not {ext}') | |
print('Reading file') | |
with open(args.inputfile, encoding='utf-8') as f: | |
info_dict = json.load(f) | |
comment_data = {c['id']: c for c in sorted( | |
info_dict['comments'], key=lambda c: c.get('timestamp') or 0)} | |
count = len(info_dict['comments']) | |
del info_dict | |
nested_comments = [] | |
for i, (cid, c) in enumerate(comment_data.items(), 1): | |
print(f'Processing comment {i}/{count}', end='\r') | |
parent = nested_comments if c['parent'] == 'root' else comment_data[c['parent']].setdefault('replies', []) | |
parent.append(c) | |
del parent | |
print('') | |
nested_comments = filter_func(nested_comments) | |
if ext == 'json': | |
print('Converting to json') | |
out = json.dumps(nested_comments, indent=4, ensure_ascii=False) | |
elif ext == 'html': | |
print('Converting to html') | |
out = json2html.convert(nested_comments) | |
del nested_comments | |
print('Writing file') | |
with open(args.outputfile, 'w', encoding='utf-8') as f: | |
f.write(out) | |
print('Done') |
@m3jorri, Try replacing:
html_content += f'<p><strong>{author}:</strong> <pre>{text}</pre></p>' # Wrap text in <pre> to preserve spaces and newlines
with:
html_content += f'<p><strong>{author}:</strong> <div class="comment-text">{text}</div></p>' # Wrap text in div with a class for styling
and add this just above the closing style
tag:
.comment-text {
white-space: pre-wrap; /* Preserve whitespace and line breaks */
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you! This indeed fixes it, but perhaps creates another issue. If the user's original comment is a long single line without breaks/newlines, then it does not wrap around within the box in html output. It overflows, and creates a long single line while displaying in browser. Any solution for that?