Skip to content

Instantly share code, notes, and snippets.

@pcote
Last active August 29, 2015 14:11
Show Gist options
  • Save pcote/71c0f6faf1ffc450a9f0 to your computer and use it in GitHub Desktop.
Save pcote/71c0f6faf1ffc450a9f0 to your computer and use it in GitHub Desktop.
Snippet for soup_line
def soup_line(dir_name, *exclusions):
    """
    Lazily pair up soups with the html files they were parsed from.

    Looks at the direct entries of ``dir_name`` (no recursion). Every regular
    file whose name ends in ``.html`` and that is not excluded is read as
    UTF-8, stripped of surrounding whitespace and handed to ``BeautifulSoup``.

    :param dir_name: Directory with the html files needed.
    :param exclusions: Don't include these files. Each entry may be a bare
        file name (``"index.html"``) or a path to the file inside
        ``dir_name`` (``"pages/index.html"``); paths are compared after
        ``os.path.normpath`` so redundant separators do not matter.
    :return: A generator of ``(soup, file_name)`` named tuples, where
        ``file_name`` is the bare name as returned by ``os.listdir``.
    """
    import os
    from collections import namedtuple

    # The typename string is kept as in the original so that the repr of the
    # yielded pairs does not change for existing callers.
    SoupFilePair = namedtuple("SoupFileName", ["soup", "file_name"])

    # Normalize once, outside the loop, so each membership test is O(1).
    excluded = {os.path.normpath(entry) for entry in exclusions}

    for fname in os.listdir(dir_name):
        if not fname.endswith(".html"):
            continue
        file_path = os.path.join(dir_name, fname)
        # Directories (or other non-regular entries) named "*.html" are skipped.
        if not os.path.isfile(file_path):
            continue
        # Accept an exclusion given either as a bare file name or as a path.
        if fname in excluded or os.path.normpath(file_path) in excluded:
            continue
        with open(file_path, "rt", encoding="utf-8") as file_ob:
            # BeautifulSoup is expected to be imported at module level.
            # NOTE(review): no parser is named, so bs4 picks the best one
            # installed — consider passing one explicitly.
            soup = BeautifulSoup(file_ob.read().strip())
        # Yield after the file is closed so a paused generator holds no handle.
        yield SoupFilePair(soup, fname)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment