Skip to content

Instantly share code, notes, and snippets.

@sh78
Last active October 3, 2021 09:44
Show Gist options
  • Save sh78/40ac5fa27a16d5550d3222137b570391 to your computer and use it in GitHub Desktop.
Save sh78/40ac5fa27a16d5550d3222137b570391 to your computer and use it in GitHub Desktop.
# extract html comments from files in current working directory recursively
from bs4 import BeautifulSoup, Comment
import os
# run the following commands to install dependencies (originally written for OS X):
# pip install beautifulsoup4
# pip install lxml
##########
# output all comments to a file organized by source file name
##########
# Walk the current directory tree and append every HTML comment found in each
# *.html file to out.txt, grouped under a header naming the source file.
#
# Plain file.write() is used instead of the Python 2-only `print >> f`
# statement so the script behaves the same on Python 2 and Python 3; the
# trailing "\n" that `print` used to append is written explicitly.
with open('out.txt', 'w') as report:
    for root, directories, filenames in os.walk('.'):
        for filename in filenames:
            if not filename.endswith('.html'):
                continue
            path = os.path.join(root, filename)
            # Read raw bytes so BeautifulSoup can detect the document's own
            # encoding; the `with` block closes the handle right after parsing.
            with open(path, 'rb') as source:
                soup = BeautifulSoup(source.read(), 'lxml')
            # os.walk yields each path exactly once, so the header is written
            # unconditionally (no "previous path" bookkeeping is needed).
            report.write("\n\n\n" + path + "\n------------------------------\n\n")
            for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
                entry = "<!--\n" + comment.strip() + "\n-->\n\n"
                # Drop non-ASCII characters (as the original did), then decode
                # back to text so write() accepts it on both Python versions.
                report.write(entry.encode('ascii', 'ignore').decode('ascii'))
##########
# create a separate <name>.html-comments.txt file next to each .html file, holding that file's comments
##########
# Walk the current directory tree and, for every *.html file, write its HTML
# comments to a sibling file named "<original path>-comments.txt".
#
# Plain file.write() is used instead of the Python 2-only `print >> f`
# statement so the script behaves the same on Python 2 and Python 3; the
# trailing "\n" that `print` used to append is written explicitly.
for root, directories, filenames in os.walk('.'):
    for filename in filenames:
        if not filename.endswith('.html'):
            continue
        path = os.path.join(root, filename)
        # Read raw bytes so BeautifulSoup can detect the document's own
        # encoding; the `with` block closes the handle right after parsing.
        with open(path, 'rb') as source:
            soup = BeautifulSoup(source.read(), 'lxml')
        out = path + '-comments' + '.txt'
        # The output file is created even when the page has no comments,
        # matching the original behaviour; `with` guarantees it is closed.
        with open(out, 'w') as comments_file:
            for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
                entry = "<!--\n" + comment.strip() + "\n-->\n\n"
                # Drop non-ASCII characters (as the original did), then decode
                # back to text so write() accepts it on both Python versions.
                comments_file.write(entry.encode('ascii', 'ignore').decode('ascii'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment