sh78 · October 3, 2021 09:44
diff --git a/extract html comments.py b/extract html comments.py
 # extract html comments from files in current working directory recursively

 from bs4 import BeautifulSoup, Comment
 import os

 # run the following commands to install dependencies (OS X):

 # easy_install beautifulsoup4
 # easy_install lxml


 ##########
 # output all comments to a file organized by source file name
 ##########

 # Gather the comments of every .html file below the current directory into
 # out.txt, each file's comments listed under a header line naming its path.
 # The `with` blocks close the report and every input file even when reading
 # or parsing raises; previously the opened HTML files were never closed.
 with open('out.txt', 'w') as f:
    for root, directories, filenames in os.walk('.'):
        for filename in filenames:
            if not filename.endswith('.html'):
                continue
            path = os.path.join(root, filename)
            with open(path) as source:
                soup = BeautifulSoup(source.read(), 'lxml')
            # os.walk reports each file exactly once, so every file gets its
            # own header; no "previous path" bookkeeping is needed.
            # (`print >> f` is the Python 2 form of printing to a file.)
            print >> f, "\n\n\n" + path + "\n------------------------------\n"
            for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
                # Characters outside ASCII are dropped by encode(..., 'ignore').
                print >> f, ("<!--\n" + comment.strip() + "\n-->\n").encode('ascii', 'ignore')

 ##########
 # create a separate <name>.html-comments.txt file for each html file's comments
 ##########
 # For every .html file below the current directory, write its comments to a
 # sibling file named "<path>-comments.txt". Both the input and the output
 # file are managed by `with`, so they are closed even when reading or
 # parsing raises; previously the opened HTML files were never closed.
 for root, directories, filenames in os.walk('.'):
    for filename in filenames:
        if not filename.endswith('.html'):
            continue
        path = os.path.join(root, filename)
        with open(path) as source:
            soup = BeautifulSoup(source.read(), 'lxml')
        out = path + '-comments' + '.txt'
        # The output file is created even when the page has no comments.
        with open(out, 'w') as f:
            for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
                # Characters outside ASCII are dropped by encode(..., 'ignore').
                # (`print >> f` is the Python 2 form of printing to a file.)
                print >> f, ("<!--\n" + comment.strip() + "\n-->\n").encode('ascii', 'ignore')
	# extract html comments from files in current working directory recursively

	from bs4 import BeautifulSoup, Comment
	import os

	# run the following commands to install dependencies (OS X):

	# easy_install beautifulsoup4
	# easy_install lxml


	##########
	# output all comments to a file organized by source file name
	##########

	# Gather the comments of every .html file below the current directory into
	# out.txt, each file's comments listed under a header line naming its path.
	# The nesting lost in this copy is restored, and the `with` blocks close
	# the report and every input file even when reading or parsing raises;
	# previously the opened HTML files were never closed.
	with open('out.txt', 'w') as f:
	    for root, directories, filenames in os.walk('.'):
	        for filename in filenames:
	            if not filename.endswith('.html'):
	                continue
	            path = os.path.join(root, filename)
	            with open(path) as source:
	                soup = BeautifulSoup(source.read(), 'lxml')
	            # os.walk reports each file exactly once, so every file gets its
	            # own header; no "previous path" bookkeeping is needed.
	            # (`print >> f` is the Python 2 form of printing to a file.)
	            print >> f, "\n\n\n" + path + "\n------------------------------\n"
	            for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
	                # Characters outside ASCII are dropped by encode(..., 'ignore').
	                print >> f, ("<!--\n" + comment.strip() + "\n-->\n").encode('ascii', 'ignore')

	##########
	# create a separate <name>.html-comments.txt file for each html file's comments
	##########
	# For every .html file below the current directory, write its comments to a
	# sibling file named "<path>-comments.txt". The nesting lost in this copy
	# is restored, and both the input and the output file are managed by
	# `with`, so they are closed even when reading or parsing raises;
	# previously the opened HTML files were never closed.
	for root, directories, filenames in os.walk('.'):
	    for filename in filenames:
	        if not filename.endswith('.html'):
	            continue
	        path = os.path.join(root, filename)
	        with open(path) as source:
	            soup = BeautifulSoup(source.read(), 'lxml')
	        out = path + '-comments' + '.txt'
	        # The output file is created even when the page has no comments.
	        with open(out, 'w') as f:
	            for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
	                # Characters outside ASCII are dropped by encode(..., 'ignore').
	                # (`print >> f` is the Python 2 form of printing to a file.)
	                print >> f, ("<!--\n" + comment.strip() + "\n-->\n").encode('ascii', 'ignore')