Created
August 21, 2024 15:35
-
-
Save jmassardo/6425318146d16fa1f4491444d679ff3c to your computer and use it in GitHub Desktop.
Python script to parse HTML files in a folder and convert them to Markdown for use with Jekyll.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import logging | |
from bs4 import BeautifulSoup | |
from markdownify import markdownify as md | |
def convert_html_to_markdown(folder):
    """Convert each ``.html`` file in *folder* to a Markdown file beside it.

    Every directory entry whose name ends with ``.html`` is parsed, the first
    ``<main>`` element with class ``main page-content`` is converted to
    Markdown, and the result is written to the same folder under the same
    base name with an ``.md`` extension. Successes, skipped files and
    per-file errors are logged to ``conversion.log`` (a relative path, so it
    is created in the current working directory).

    Args:
        folder: Path of the directory to scan. Sub-directories are not
            descended into.

    Returns:
        None. If *folder* is not an existing directory, an error is logged
        and nothing is converted.
    """
    html_suffix = ".html"

    # Set up logging
    logging.basicConfig(filename='conversion.log', level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s')

    # Check if the folder exists
    if not os.path.isdir(folder):
        logging.error(f"Folder {folder} does not exist.")
        return

    # Loop through all files in the folder
    for filename in os.listdir(folder):
        if not filename.endswith(html_suffix):
            continue

        try:
            # Read the HTML file
            filepath = os.path.join(folder, filename)
            with open(filepath, 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file, 'html.parser')

            # Find the <main> element with class "main page-content"
            container = soup.find('main', class_='main page-content')
            if container is None:
                logging.warning(
                    f"No <main class='main page-content'> element found in {filename}")
                continue

            # Convert the contents to markdown
            markdown_content = md(str(container))

            # Swap only the trailing ".html" for ".md"; str.replace would
            # also rewrite any ".html" occurring earlier in the name.
            new_filename = filename[:-len(html_suffix)] + '.md'
            new_filepath = os.path.join(folder, new_filename)
            with open(new_filepath, 'w', encoding='utf-8') as md_file:
                md_file.write(markdown_content)

            # Log the successful conversion
            logging.info(f"Converted {filename} to {new_filename}")
        except Exception as e:
            # One bad file must not abort the batch: log it and move on.
            logging.error(f"Error converting {filename}: {e}")
if __name__ == "__main__":
    # The script takes exactly one positional argument: the folder to scan.
    if len(sys.argv) == 2:
        convert_html_to_markdown(sys.argv[1])
    else:
        print("Usage: python convert.py <folder>")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment