-
-
Save pukkandan/ee737fec64822f2552caf3ca4cbf5db7 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
SPDX-License-Identifier: MIT https://opensource.org/licenses/MIT
Copyright © 2021 [email protected]

* Input file is an info.json (with comments) that yt-dlp (https://github.com/yt-dlp/yt-dlp) wrote
* Change FIELDS according to your needs

The output file will be in the format:
[{
    'text': 'comment 1',
    ...
    'replies': [{
        'text': 'reply 1',
        ...
        'replies': [...],
    }, ...],
}, ...]
"""
import argparse
import json
import os.path
from datetime import datetime, timezone
def get_fields(dct, fields=None):
    """Yield ``(name, value)`` pairs for the configured fields of one comment.

    Each extractor is called as ``fn(dct, name)``. Fields whose extractor
    returns ``None`` are left out, so they never appear in the output.

    Args:
        dct: A single comment dictionary.
        fields: Optional mapping of output field name to extractor callable.
            Defaults to the module-level ``FIELDS``.

    Yields:
        ``(field name, extracted value)`` tuples, in mapping order.
    """
    # Resolve the default at call time: FIELDS is defined further down the file.
    mapping = FIELDS if fields is None else fields
    for name, fn in mapping.items():
        val = fn(dct, name)
        if val is not None:
            yield name, val
def filter_func(comments):
    """Return a new list with every comment reduced to its configured fields.

    Args:
        comments: Iterable of comment dictionaries.

    Returns:
        A list of plain dicts, one per comment, built from the
        ``(name, value)`` pairs that ``get_fields`` yields for it.
    """
    return [dict(get_fields(comment)) for comment in comments]
def _format_timestamp(dct, name):
    """Format the Unix timestamp stored under *name* as ``YYYY/MM/DD`` in UTC.

    Returns ``None`` when the field is absent or null so that it is omitted
    from the output. A timestamp of ``0`` is a valid value and is formatted.
    """
    timestamp = dct.get(name)
    if timestamp is None:
        return None
    # Timezone-aware equivalent of the deprecated datetime.utcfromtimestamp().
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%Y/%m/%d')


def _nested_replies(dct, name):
    """Filter the replies stored under *name*; ``None`` when there are none."""
    # `or []` also covers an explicit null, which dict.get's default would not.
    return filter_func(dct.get(name) or []) or None


# Maps each output field to an extractor that is called as
# fn(comment_dict, field_name). An extractor returning None drops the field.
FIELDS = {
    'text': dict.get,
    'author': dict.get,
    'timestamp': _format_timestamp,
    # Add more fields here
    'replies': _nested_replies,
}
# Command-line interface: both the input and the output path are mandatory.
parser = argparse.ArgumentParser(
    description='Convert the comments of a yt-dlp info.json into a nested json / html file')
parser.add_argument(
    '--input-file', '-i',
    dest='inputfile', metavar='FILE', required=True,
    help='File to read video metadata from (info.json)')
parser.add_argument(
    '--output-file', '-o',
    dest='outputfile', metavar='FILE', required=True,
    help='File to write comments to (json / html)')
args = parser.parse_args()
# The output format is selected by the output file's extension. It is
# lowercased so that e.g. `comments.HTML` is accepted as well.
ext = os.path.splitext(args.outputfile)[1][1:].lower()
if ext == 'html':  # Error early
    # json2html is only needed for html output, so it is imported lazily and
    # its absence is reported before any work is done.
    try:
        from json2html import json2html
    except ImportError:
        raise SystemExit('ERROR: json2html is required for html output. Install it with pip install json2html')
elif ext != 'json':
    raise SystemExit(f'ERROR: Only json and html formats are supported, not {ext}')
print('Reading file')
with open(args.inputfile, encoding='utf-8') as f:
    info_dict = json.load(f)

# Fail with a readable message instead of a KeyError/TypeError traceback when
# the info.json carries no comment list at all.
comments = info_dict.get('comments')
if comments is None:
    raise SystemExit('ERROR: No comments found in the input file. Was it written with --write-comments?')

# Index the comments by id, oldest first; comments without a timestamp sort
# to the front.
comment_data = {c['id']: c for c in sorted(
    comments, key=lambda c: c.get('timestamp') or 0)}
# Count the indexed comments (duplicate ids collapse into one entry) so the
# total matches what is iterated afterwards.
count = len(comment_data)
del info_dict, comments
# Build the reply tree: every comment is appended either to the top-level
# list or to the 'replies' list of its parent comment.
nested_comments = []
for i, c in enumerate(comment_data.values(), 1):
    print(f'Processing comment {i}/{count}', end='\r')
    parent_id = c.get('parent') or 'root'
    if parent_id == 'root' or parent_id not in comment_data:
        # Top-level comment. Replies whose parent is not present in the file
        # are kept here too rather than aborting with a KeyError.
        nested_comments.append(c)
    else:
        comment_data[parent_id].setdefault('replies', []).append(c)
print('')  # Move past the '\r' progress line
nested_comments = filter_func(nested_comments)
# Serialize the nested comments in the format selected by the extension.
# `ext` has already been validated to be either 'json' or 'html'.
if ext == 'json':
    print('Converting to json')
    out = json.dumps(nested_comments, indent=4, ensure_ascii=False)
elif ext == 'html':
    print('Converting to html')
    out = json2html.convert(nested_comments)
del nested_comments  # The serialized string is all that is needed from here on

print('Writing file')
with open(args.outputfile, 'w', encoding='utf-8') as f:
    f.write(out)
print('Done')
Thanks for catching that, @m3jorri. Please try replacing:
text = html.escape(comment["text"])
with:
text = html.escape(comment["text"]).replace('\n', '<br>') # Convert newlines to <br>
and:
html_content += f'<p><strong>{author}:</strong> {text}</p>'
with:
html_content += f'<p><strong>{author}:</strong> <pre>{text}</pre></p>' # Wrap text in <pre> to preserve spaces and newlines
Thank you! This indeed fixes it, but perhaps creates another issue. If the user's original comment is a long single line without breaks/newlines, then it does not wrap around within the box in html output. It overflows, and creates a long single line while displaying in browser. Any solution for that?
@m3jorri, Try replacing:
html_content += f'<p><strong>{author}:</strong> <pre>{text}</pre></p>' # Wrap text in <pre> to preserve spaces and newlines
with:
html_content += f'<p><strong>{author}:</strong> <div class="comment-text">{text}</div></p>' # Wrap text in div with a class for styling
and add this just above the closing `</style>` tag:
.comment-text {
white-space: pre-wrap; /* Preserve whitespace and line breaks */
}
@tinyapps when the html output is viewed in the browser, empty lines and indentations are lost. It shows up as a single very long line. It seems to be a display issue only. When the file is opened in a text editor, the line breaks and indentations are there. Any way to fix this?