Last active
May 20, 2024 22:01
-
-
Save carlosrobles/839e57f5c312f1f582e2374be8c97459 to your computer and use it in GitHub Desktop.
Create a table of content for Substack using Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import html
import sys
import time
from urllib.parse import urlparse, urlunparse, urlencode, parse_qs

import requests
#you can install lxml with `sudo pip3 install lxml`
from lxml.html import fromstring
# The page URL is mandatory; the output file name is optional.
if len(sys.argv) < 2:
    # sys.exit() with a string writes it to stderr and exits with status 1,
    # so a missing argument is reported as a failure. quit() is an interactive
    # helper injected by the site module and exits with status 0.
    sys.exit("No arguments were given. Use URL [output_file_name]")
# To avoid cached responses we add a parameter with the current timestamp.
def add_timestamp_to_url(url):
    """Return *url* with a ``timestamp`` query parameter set to the current time.

    The value is the current Unix time in whole seconds, so the URL changes
    from one second to the next. All other parts of the URL are kept as they
    are. Existing query parameters are preserved, including repeated keys and
    keys with an empty value; an existing ``timestamp`` parameter is replaced.

    Args:
        url: The URL to decorate, as a string.

    Returns:
        The rebuilt URL string carrying the ``timestamp`` parameter.
    """
    parsed_url = urlparse(url)
    # parse_qs drops "key=" pairs unless keep_blank_values is set, which
    # would silently remove parameters from the URL being fetched.
    query_params = parse_qs(parsed_url.query, keep_blank_values=True)
    query_params["timestamp"] = [str(int(time.time()))]
    # doseq=True expands each list of values back into repeated key=value pairs.
    return urlunparse(parsed_url._replace(query=urlencode(query_params, doseq=True)))
def _heading_level(element):
    """Return the level (1-6) of an ``h1``-``h6`` element, or ``None`` for any other tag."""
    tag = element.tag
    if isinstance(tag, str) and len(tag) == 2 and tag[0] == "h" and tag[1] in "123456":
        return int(tag[1])
    return None


def _anchor_id(header):
    """Return the id to link to: first non-empty descendant id, else the header's own id."""
    for value in header.xpath(".//*/@id"):
        if value:
            return str(value)
    # May be None when neither the heading nor anything inside it has an id.
    return header.get("id")


def _build_toc(headers, base_url):
    """Render *headers* as a nested ``<ul>`` of links and return it as an HTML string.

    A heading deeper than the previous one opens a nested list inside the
    previous item. A shallower heading closes only as many nested lists as
    needed to get back to a list it fits in, so stepping from H3 up to H2
    under an H1 keeps the H2 inside the H1's list. Elements that are not
    ``h1``-``h6`` are skipped. Heading text and link targets are HTML-escaped.

    Args:
        headers: Iterable of lxml elements, in document order.
        base_url: Page URL that each link's ``#id`` fragment is appended to.

    Returns:
        The complete ``<ul>...</ul>`` markup (``<ul></ul>`` when nothing matched).
    """
    parts = ["<ul>"]
    # open_levels[i] is the heading level of the newest item in the i-th
    # currently open list, outermost first. The newest <li> is left open
    # until the next heading decides whether it gets a nested list.
    open_levels = []
    for header in headers:
        level = _heading_level(header)
        if level is None:
            continue
        # text_content() also picks up text inside inline markup; collapse
        # runs of whitespace so the title is a single line.
        title = " ".join(header.text_content().split())
        # Progress trace goes to stderr so stdout carries only the HTML.
        print(f"H{level} - {title}", file=sys.stderr)

        if not open_levels:
            open_levels.append(level)
        elif level > open_levels[-1]:
            # Deeper heading: start a list inside the still-open previous item.
            parts.append("<ul>")
            open_levels.append(level)
        else:
            parts.append("</li>")
            # Close nested lists only while the enclosing list can hold this level.
            while len(open_levels) > 1 and level <= open_levels[-2]:
                parts.append("</ul></li>")
                open_levels.pop()
            open_levels[-1] = level

        anchor = _anchor_id(header)
        target = f"{base_url}#{anchor}" if anchor else base_url
        parts.append(
            f"<li><a href='{html.escape(target, quote=True)}'>{html.escape(title)}</a>"
        )

    if open_levels:
        # Close the last item, then every nested list that is still open.
        parts.append("</li>")
        parts.extend("</ul></li>" for _ in open_levels[1:])
    parts.append("</ul>")
    return "".join(parts)


url = sys.argv[1]
filename = sys.argv[2] if len(sys.argv) > 2 else None

# Fetch the HTML. The timeout keeps the script from hanging on a stalled
# connection, and raise_for_status() stops on an HTTP error instead of
# building a table of contents from an error page.
response = requests.get(add_timestamp_to_url(url), timeout=30)
response.raise_for_status()
tree = fromstring(response.content)

# Select every element whose class attribute is exactly this value
# (presumably the class Substack puts on linkable headings - confirm against a live page).
path = "//*[@class='header-with-anchor-widget']"
output = _build_toc(tree.xpath(path), url)

if filename:
    # Explicit UTF-8 so non-ASCII titles do not depend on the platform's locale encoding.
    with open(filename, "w", encoding="utf-8") as file:
        file.write(output)
    print("\nSaved to ./" + filename)
else:
    print(output)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment