-
-
Save pukkandan/ee737fec64822f2552caf3ca4cbf5db7 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3 | |
""" | |
SPDX-License-Identifier: MIT https://opensource.org/licenses/MIT | |
Copyright © 2021 [email protected] | |
* Input file is an info.json (with comments) that yt-dlp (https://github.com/yt-dlp/yt-dlp) wrote | |
* Change FIELDS according to your needs | |
The output file will be in the format: | |
[{ | |
'text': 'comment 1', | |
... | |
'replies': [{ | |
'text': 'reply 1', | |
... | |
'replies': [...], | |
}, ...], | |
}, ...] | |
""" | |
import os.path | |
import json | |
import argparse | |
from datetime import datetime | |
def get_fields(dct):
    """Yield ``(name, value)`` pairs for every configured field of a comment.

    Each extractor in ``FIELDS`` is called as ``extractor(dct, name)``.
    Fields whose extractor returns ``None`` are left out entirely.
    """
    for key in FIELDS:
        extracted = FIELDS[key](dct, key)
        if extracted is None:
            # Absent field: omit it rather than emitting a null
            continue
        yield key, extracted
def filter_func(comments):
    """Return a new list with each comment reduced to its configured fields.

    Every comment dict is rebuilt from the pairs produced by ``get_fields``.
    """
    filtered = []
    for comment in comments:
        filtered.append(dict(get_fields(comment)))
    return filtered
def _format_timestamp(dct, name):
    """Return the Unix timestamp stored at ``dct[name]`` as ``YYYY/MM/DD`` (UTC).

    Returns ``None`` when the field is missing or null, so the field is
    omitted from the output. A timestamp of ``0`` is a valid date and is
    formatted like any other value.
    """
    # Local import: the file only imports ``datetime`` from the datetime module
    from datetime import timezone

    ts = dct.get(name)
    if ts is None:
        return None
    # ``datetime.utcfromtimestamp`` is deprecated since Python 3.12;
    # an aware UTC datetime yields the same calendar date.
    return datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y/%m/%d')


# Maps each output field name to an extractor called as ``fn(comment, name)``.
# An extractor returning ``None`` means "leave this field out".
FIELDS = {
    'text': dict.get,
    'author': dict.get,
    'timestamp': _format_timestamp,
    # Add more fields here
    # ``or []`` guards against an explicit null; an empty reply list maps to
    # ``None`` so that comments without replies carry no 'replies' key.
    'replies': lambda dct, name: filter_func(dct.get(name) or []) or None,
}
# ---- Command line -----------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument(
    '--input-file', '-i',
    dest='inputfile', metavar='FILE', required=True,
    help='File to read video metadata from (info.json)')
parser.add_argument(
    '--output-file', '-o',
    dest='outputfile', metavar='FILE', required=True,
    help='File to write comments to (json / html)')
args = parser.parse_args()

# The output format is chosen from the output file's extension
ext = os.path.splitext(args.outputfile)[1][1:]
if ext == 'html':  # Error early
    try:
        from json2html import json2html
    except ImportError:
        # ``from None`` hides the ImportError traceback behind the message
        raise SystemExit(
            'ERROR: json2html is required for html output. Install it with pip install json2html') from None
elif ext != 'json':
    raise SystemExit(f'ERROR: Only json and html formats are supported, not {ext}')

# ---- Load -------------------------------------------------------------------
print('Reading file')
with open(args.inputfile, encoding='utf-8') as f:
    info_dict = json.load(f)

comments = info_dict.get('comments')
if comments is None:
    # Without this check a missing key surfaces as a bare KeyError traceback
    raise SystemExit('ERROR: The input file does not contain any comments')

# Index comments by id, ordered by timestamp (missing timestamps sort first)
comment_data = {c['id']: c for c in sorted(
    comments, key=lambda c: c.get('timestamp') or 0)}
count = len(comment_data)
del info_dict, comments

# ---- Nest replies under their parents ---------------------------------------
nested_comments = []
for i, c in enumerate(comment_data.values(), 1):
    print(f'Processing comment {i}/{count}', end='\r')
    parent_id = c.get('parent', 'root')
    if parent_id == 'root' or parent_id not in comment_data:
        # Top-level comment, or a reply whose parent is not in the file:
        # keep it at the top level instead of failing with a KeyError
        siblings = nested_comments
    else:
        siblings = comment_data[parent_id].setdefault('replies', [])
    siblings.append(c)
print('')

# Reduce every comment (recursively, via the 'replies' field) to FIELDS
nested_comments = filter_func(nested_comments)

# ---- Convert and write ------------------------------------------------------
if ext == 'json':
    print('Converting to json')
    out = json.dumps(nested_comments, indent=4, ensure_ascii=False)
elif ext == 'html':
    print('Converting to html')
    out = json2html.convert(nested_comments)

del nested_comments
print('Writing file')
with open(args.outputfile, 'w', encoding='utf-8') as f:
    f.write(out)
print('Done')
I'm just confirming that @tinyapps' suggested changes really do improve the HTML output. One very nice thing about this change is that your script would no longer require the json2html module.
I am no longer maintaining this script. Feel free to fork and improve it.
PS: From a quick look, @tinyapps' script is not escaping the fields. This could cause the html to break, or worse, can cause arbitrary JS injection.
Thank you for your kind response, @pukkandan. By adding one more import statement (`import html`) to the top of the script, we can escape the fields like so:
def wrap_html(data, top_level=True):
    """Render a list of nested comment dicts as an HTML ``<ul>`` tree.

    Args:
        data: Iterable of comment dicts. The keys read are ``author``,
            ``text``, ``timestamp`` and ``replies`` (a list of the same
            shape); all of them are optional.
        top_level: When true, the list is wrapped in a
            ``<div class="comments">`` preceded by a charset ``<meta>`` tag
            and an inline stylesheet. Recursive calls pass ``False``.

    Returns:
        The HTML markup as a string. All field values are HTML-escaped.
    """
    # Local import keeps the function usable without touching the script's
    # top-of-file imports, which do not include ``html``
    import html

    parts = ['<ul>']
    for comment in data:
        # ``or`` covers both a missing key and a null/empty value;
        # ``str()`` keeps html.escape from failing on non-string fields
        author = html.escape(str(comment.get("author") or "Anonymous"))
        text = html.escape(str(comment.get("text") or ""))
        timestamp = html.escape(str(comment.get("timestamp") or ""))
        parts.append('<li><div class="comment-box">')
        parts.append(f'<p><strong>{author}:</strong> {text}</p>')
        if timestamp:
            parts.append(f'<p><small>{timestamp}</small></p>')
        replies = comment.get('replies')
        if replies:
            parts.append(wrap_html(replies, top_level=False))
        parts.append('</div></li>')
    parts.append('</ul>')
    # Join once instead of growing a string with repeated ``+=``
    html_content = ''.join(parts)

    if not top_level:
        return html_content

    style = '''
    <style>
        .comment-box {
            border: 1px solid #ccc;
            padding: 10px;
        }
        .comments ul {
            list-style-type: none;
            padding-left: 20px;
        }
    </style>
    '''
    meta = '<meta charset="UTF-8">'
    return f'{meta}{style}<div class="comments">{html_content}</div>'
# Build the HTML output string from the nested comment list
out = wrap_html(nested_comments)
@tinyapps when the html output is viewed in the browser, empty lines and indentations are lost. It shows up as a single very long line. It seems to be a display issue only. When the file is opened in a text editor, the line breaks and indentations are there. Any way to fix this?
Thanks for catching that, @m3jorri. Please try replacing:
text = html.escape(comment["text"])
with:
text = html.escape(comment["text"]).replace('\n', '<br>') # Convert newlines to <br>
and:
html_content += f'<p><strong>{author}:</strong> {text}</p>'
with:
html_content += f'<p><strong>{author}:</strong> <pre>{text}</pre></p>' # Wrap text in <pre> to preserve spaces and newlines
Thank you! This indeed fixes it, but perhaps creates another issue. If the user's original comment is a long single line without breaks/newlines, then it does not wrap around within the box in html output. It overflows, and creates a long single line while displaying in browser. Any solution for that?
@m3jorri, Try replacing:
html_content += f'<p><strong>{author}:</strong> <pre>{text}</pre></p>' # Wrap text in <pre> to preserve spaces and newlines
with:
html_content += f'<p><strong>{author}:</strong> <div class="comment-text">{text}</div></p>' # Wrap text in div with a class for styling
and add this just above the closing `</style>` tag:
.comment-text {
white-space: pre-wrap; /* Preserve whitespace and line breaks */
}
EDIT: See newer version below based on @pukkandan's feedback.
Thanks very much for sharing this; makes reading downloaded YT comments much more pleasant.
The HTML output can be made a bit easier to read by replacing:
with: