Created
March 26, 2019 23:05
-
-
Save paddy74/72eb882b20df6ca91abf9c21f2efc6e1 to your computer and use it in GitHub Desktop.
Grab the visible text from a webpage
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Grab the visible text from a webpage | |
| Based on the stackoverflow post https://stackoverflow.com/questions/1936466/beautifulsoup-grab-visible-webpage-text | |
| """ | |
| from bs4 import BeautifulSoup | |
| from bs4.element import Comment | |
| import requests | |
def tag_visible(element):
    """Tell whether a parsed text node should count as visible page text.

    A node is rejected when its parent tag is one of ``style``, ``script``,
    ``head``, ``title``, ``meta`` or the ``[document]`` root, or when the
    node itself is a ``Comment``. Everything else is accepted.

    Args:
        element: A text node exposing ``.parent.name``.

    Returns:
        ``True`` if the node is kept, ``False`` otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    parent_is_hidden = element.parent.name in hidden_parents
    # The parent check comes first so the Comment test is skipped for hidden parents.
    return not (parent_is_hidden or isinstance(element, Comment))
def text_from_html(body):
    """Return the visible text of an HTML document as a single string.

    The markup is parsed with Beautiful Soup's ``html.parser`` backend, every
    text node is collected, nodes rejected by ``tag_visible`` are dropped,
    and the remaining nodes are stripped and joined with single spaces.

    Whitespace-only nodes are stripped to empty strings but still joined, so
    the result may contain runs of consecutive spaces.

    Args:
        body: The HTML markup to parse.

    Returns:
        The space-joined visible text.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # ``find_all(string=True)`` is the supported spelling of the deprecated
    # ``findAll(text=True)``; it selects the same set of text nodes.
    text_nodes = soup.find_all(string=True)
    return u" ".join(node.strip() for node in text_nodes if tag_visible(node))
def visible_webpage_text(uri, timeout=10):
    """Download a web page and return its visible text.

    The HTTP status code is not checked: the body of an error response is
    processed like any other page.

    Args:
        uri: Address of the page to fetch with an HTTP GET.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            argument. The finite default keeps an unresponsive server from
            blocking the caller forever; pass ``None`` to wait indefinitely.

    Returns:
        The visible text extracted from the response body by
        ``text_from_html``.
    """
    response = requests.get(uri, timeout=timeout)
    return text_from_html(response.text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment