jmrobles · August 13, 2020 15:51
diff --git a/html2ScrapWeb.py b/html2ScrapWeb.py
 def html2ScrapedWeb(url: str, html: str) -> ScrapedWebVitamined:
    """
    Parse an HTML document and extract its content-bearing parts.

    The markup is parsed with BeautifulSoup's 'html5lib' parser. The title
    and description are read first; afterwards every <script>, <head>, <nav>
    and <footer> element is removed from the tree, and headings and text
    snippets are collected from what remains.

    Args:
        url: Passed through unchanged to the result object.
        html: Raw HTML markup to parse.

    Returns:
        ScrapedWebVitamined(url, title, desc, headings, contents, dom) where
          * title is the <title> string, or None when there is no <title>
            (BeautifulSoup's .string may itself be None);
          * desc is whatever getDescription(dom) returns;
          * headings holds the text of every <h1>..<h6>, grouped by level
            (all h1 first, then all h2, ...), not stripped;
          * contents holds the distinct stripped texts of <p>, <td>, <li>
            and <a> elements, in order of first appearance;
          * dom is the parsed tree after the pruning step.
    """
    dom = BeautifulSoup(html, 'html5lib')
    # 1. Get title -- read before pruning, because step 3 removes <head>,
    #    which is where <title> normally lives.
    title = dom.title.string if dom.title else None
    # 2. Get description -- also taken from the unpruned tree.
    #    NOTE(review): presumably getDescription reads <meta> tags; confirm.
    desc = getDescription(dom)
    # 3. Remove undesired elements (one fresh search per tag name, so nodes
    #    already destroyed by an earlier pass are not returned again).
    dropTags = ["script", "head", "nav", "footer"]
    for tag in dropTags:
        for elem in dom.find_all(tag):
            elem.decompose()
    # 4. Headings, level by level. No truthiness filter is needed: find_all
    #    only yields Tag objects, which are always truthy.
    headings = [
        heading.getText()
        for level in range(1, 7)
        for heading in dom.find_all(f'h{level}')
    ]
    # 5. Content -- de-duplicate with dict.fromkeys instead of a set so the
    #    result order is deterministic (first appearance) rather than
    #    dependent on string-hash randomization.
    texts = (
        node.getText().strip()
        for node in dom.find_all(["p", "td", "li", "a"])
    )
    contents = list(dict.fromkeys(texts))
    return ScrapedWebVitamined(url, title, desc, headings, contents, dom)
	def html2ScrapedWeb(url: str, html: str) -> ScrapedWebVitamined:
		"""
		Parse an HTML document and extract its content-bearing parts.

		The markup is parsed with BeautifulSoup's 'html5lib' parser. The title
		and description are read first; afterwards every <script>, <head>,
		<nav> and <footer> element is removed from the tree, and headings and
		text snippets are collected from what remains.

		Args:
			url: Passed through unchanged to the result object.
			html: Raw HTML markup to parse.

		Returns:
			ScrapedWebVitamined(url, title, desc, headings, contents, dom) where
			  * title is the <title> string, or None when there is no <title>
			    (BeautifulSoup's .string may itself be None);
			  * desc is whatever getDescription(dom) returns;
			  * headings holds the text of every <h1>..<h6>, grouped by level
			    (all h1 first, then all h2, ...), not stripped;
			  * contents holds the distinct stripped texts of <p>, <td>, <li>
			    and <a> elements, in order of first appearance;
			  * dom is the parsed tree after the pruning step.
		"""
		dom = BeautifulSoup(html, 'html5lib')
		# 1. Get title -- read before pruning, because step 3 removes <head>,
		#    which is where <title> normally lives.
		title = dom.title.string if dom.title else None
		# 2. Get description -- also taken from the unpruned tree.
		#    NOTE(review): presumably getDescription reads <meta> tags; confirm.
		desc = getDescription(dom)
		# 3. Remove undesired elements (one fresh search per tag name, so nodes
		#    already destroyed by an earlier pass are not returned again).
		dropTags = ["script", "head", "nav", "footer"]
		for tag in dropTags:
			for elem in dom.find_all(tag):
				elem.decompose()
		# 4. Headings, level by level. No truthiness filter is needed: find_all
		#    only yields Tag objects, which are always truthy.
		headings = [
			heading.getText()
			for level in range(1, 7)
			for heading in dom.find_all(f'h{level}')
		]
		# 5. Content -- de-duplicate with dict.fromkeys instead of a set so the
		#    result order is deterministic (first appearance) rather than
		#    dependent on string-hash randomization.
		texts = (
			node.getText().strip()
			for node in dom.find_all(["p", "td", "li", "a"])
		)
		contents = list(dict.fromkeys(texts))
		return ScrapedWebVitamined(url, title, desc, headings, contents, dom)