Skip to content

Instantly share code, notes, and snippets.

@PandaWhoCodes
Created January 27, 2020 18:27
Show Gist options
  • Save PandaWhoCodes/60eddf515167237b845d1af1fbae27b6 to your computer and use it in GitHub Desktop.
Save PandaWhoCodes/60eddf515167237b845d1af1fbae27b6 to your computer and use it in GitHub Desktop.
Get the visible text from a web page
from bs4 import BeautifulSoup
from bs4.element import Comment
import requests
def tag_visible(element):
    """Return True when a parsed text node counts as visible page text.

    A node is treated as hidden when its parent tag's name is one of the
    non-rendered containers listed below, or when the node itself is an
    HTML comment.

    Args:
        element: A text node exposing ``parent.name`` (get_text passes the
            strings returned by BeautifulSoup).

    Returns:
        False for nodes under a non-rendered parent or for comments,
        True otherwise.
    """
    hidden_parents = ('script', 'style', 'head', 'title', 'meta', '[document]')
    if element.parent.name in hidden_parents:
        return False
    # Anything that is not a comment at this point is visible text.
    return not isinstance(element, Comment)
def get_text(url, timeout=10):
    """Fetch *url* and return its visible text as a single string.

    The page is downloaded with ``requests``, parsed with BeautifulSoup's
    "lxml" parser, and every text node accepted by ``tag_visible`` is
    stripped of surrounding whitespace and joined with single spaces.
    The HTTP status code is not checked, so the body of an error response
    is processed like any other page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Defaults to 10.

    Returns:
        The visible text nodes of the page, each stripped and joined by
        a single space.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # Without an explicit timeout, requests can block indefinitely on an
    # unresponsive server.
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "lxml")
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); it yields every text node in the document.
    texts = soup.find_all(string=True)
    return " ".join(t.strip() for t in filter(tag_visible, texts))
# Demo run: download the sample page and write its visible text to stdout.
print(get_text(url="http://doraithodla.com"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment