Created
January 20, 2018 20:06
-
-
Save mikicz/0a4c39cf07fbda4ebd2bd20d68487d92 to your computer and use it in GitHub Desktop.
A script that collects the names of all HTML elements used in naucse.python.cz course pages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
from bs4 import BeautifulSoup | |
# Module-level accumulator: process() adds the tag name of every element it
# finds here, and search() rebinds it to a fresh empty set before each scan.
used_elements = set()
def process(fl):
    """Record the tag names used inside the main container of one HTML file.

    The file is parsed with the ``html5lib`` parser, the first element
    matching ``body > .page > .container`` is located, and the name of every
    tag nested anywhere below it is added to the module-level
    ``used_elements`` set.

    Args:
        fl: A ``pathlib.Path``-like object pointing at the HTML file to
            inspect (only ``read_text()`` is called on it).

    Returns:
        None. Results are accumulated in ``used_elements``.
    """
    soup = BeautifulSoup(fl.read_text(), "html5lib")
    # select_one() yields the first match or None, so a page without the
    # expected layout is skipped instead of aborting the whole scan with an
    # IndexError.
    container = soup.select_one("body > .page > .container")
    if container is None:
        return
    # find_all() with no filter returns every descendant tag; update() mutates
    # the shared set in place, so no ``global`` declaration is needed.
    used_elements.update(child.name for child in container.find_all())
def walk(path=Path("naucse/_build/lessons")):
    """Recursively visit *path* and hand every ``*.html`` file to process().

    Args:
        path: Directory to scan. Defaults to ``naucse/_build/lessons``,
            resolved relative to the current working directory.

    Returns:
        None. Each matching file is passed to ``process``.
    """
    for entry in path.iterdir():
        if entry.is_dir():
            # Descend first; directories are never treated as HTML files,
            # whatever their name ends with.
            walk(entry)
            continue
        if entry.name.endswith(".html"):
            process(entry)
def search():
    """Scan the default lessons directory and return the element names found.

    The module-level ``used_elements`` set is rebound to a fresh empty set
    before walking, so results from an earlier call are discarded rather than
    merged.

    Returns:
        The ``used_elements`` set populated by this scan.
    """
    global used_elements
    # Rebind (not clear()) so a set handed out by a previous call is left
    # untouched for whoever still holds it.
    used_elements = set()
    walk()
    return used_elements
if __name__ == "__main__":
    search()
    # Without this the script would compute the set and exit silently; print
    # the names one per line, sorted so the output is stable between runs.
    print("\n".join(sorted(used_elements)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment