Created
December 28, 2020 19:11
-
-
Save Jonty/2376f46818462345fdc81e029b62ce57 to your computer and use it in GitHub Desktop.
Extract all code from a set of Doxygen generated documentation, for use when recovering code that has otherwise been lost
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This extracts all the code from a set of Doxygen generated documentation
# where the code is embedded and highlighted. You really only need to use this
# when attempting to recover lost code and you still have the docs.
# Writes all code out into the original directory structure relative to where
# the script is executed.
# Run: `python extract_code_from_doxygen.py URL_TO_DOXYGEN_FILES_PAGE`
# e.g. `python extract_code_from_doxygen.py http://swf2svg.sourceforge.net/azar/doc/files.html`
import os
import re
import sys
from urllib.parse import urljoin, urlparse

import lxml.html
import requests
# Leading "<digits><space>" that the Doxygen source listing puts on each line.
LINE_NUMBER_RE = re.compile(r"^[0-9]+ ", flags=re.MULTILINE)
# Optional drive letter(s) followed by the leading slash(es) of an absolute
# path, e.g. "C:/" or "/" or "//".
ABSOLUTE_PREFIX_RE = re.compile(r"^(?:[a-zA-Z]:)*/+")
# Seconds to wait on each HTTP request before giving up.
REQUEST_TIMEOUT = 30


def split_source_path(title):
    """Split a page heading holding a source path into (directory, filename).

    The directory is made relative by removing any drive letter and leading
    slashes, so the file is recreated below the current working directory.
    The directory is an empty string when the heading is a bare filename.
    """
    base_path, filename = os.path.split(title.strip())
    base_path = ABSOLUTE_PREFIX_RE.sub("", base_path)
    return base_path, filename


def strip_line_numbers(listing_text):
    """Return the listing text with the leading line number of each line removed."""
    return LINE_NUMBER_RE.sub("", listing_text)


def write_source_file(base_path, filename, code):
    """Write code to base_path/filename, creating directories as needed.

    Returns the path that was written.
    """
    # NOTE(review): base_path comes from remote HTML and is used as-is; a
    # ".." component would place the file outside the working directory.
    if base_path:
        # os.makedirs("") raises, so only create a directory when there is one.
        os.makedirs(base_path, exist_ok=True)
    # os.path.join avoids producing "/filename" when base_path is empty.
    out_path = os.path.join(base_path, filename)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(code)
    return out_path


def main():
    """Download every file linked from a Doxygen files.html page and save its code."""
    if len(sys.argv) < 2:
        sys.exit("Usage: python extract_code_from_doxygen.py URL_TO_DOXYGEN_FILES_PAGE")
    listing = sys.argv[1]  # The files.html doxygen url

    response = requests.get(listing, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    root = lxml.html.fromstring(response.content)

    # Second link in the first column of each table row is the code page link.
    for node in root.xpath("//table/tr/td[1]/a[2]"):
        href = node.get("href")
        if not href:
            continue
        # Resolve the link against the listing URL (handles relative and
        # absolute hrefs alike).
        code_url = urljoin(listing, href)

        response = requests.get(code_url, timeout=REQUEST_TIMEOUT)
        if not response.ok:
            print(
                "Skipping %s: HTTP %s" % (code_url, response.status_code),
                file=sys.stderr,
            )
            continue
        code_root = lxml.html.fromstring(response.content)

        headings = code_root.xpath("//h1")
        listings = code_root.xpath("//pre")
        if not headings or not headings[0].text or not listings:
            print(
                "Skipping %s: no <h1> path or <pre> listing found" % code_url,
                file=sys.stderr,
            )
            continue

        base_path, filename = split_source_path(headings[0].text)
        if not filename:
            print("Skipping %s: heading has no filename" % code_url, file=sys.stderr)
            continue

        code = strip_line_numbers(listings[0].text_content())
        print("Writing %s" % os.path.join(base_path, filename))
        write_source_file(base_path, filename, code)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for sharing this.
Your script didn't work for me. Here is a modified version that recursively downloads all files, even deeply nested ones.
The entrypoint is a bit different though: