Build a wordcloud for your website - Python code.
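"""
Crawl a site (scrapinghub.com by default), clean the visible text and render it
as a word cloud.

Rough dependency sketch (assumed package names, not pinned by the gist):
    pip install requests beautifulsoup4 lxml nltk regex wordcloud
    python -c "import nltk; nltk.download('punkt')"   # sent_tokenize needs the punkt data

Outputs: scrapinghub.txt (the collected text) and scrapinghub_wordcloud.png.
Press Ctrl-C to stop crawling early; the word cloud is then built from whatever
was collected so far.
"""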
import re
import time
import regex
from nltk import sent_tokenize
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS
def simple_get(url):
    """
    Borrowed from https://realpython.com/python-web-scraping-practical-introduction/
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of the response is some kind of HTML/XML, return the
    raw (bytes) content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None
    except RequestException as e:
        print('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
def is_good_response(resp):
    """
    Borrowed from https://realpython.com/python-web-scraping-practical-introduction/
    Returns True if the response seems to be HTML, False otherwise.
    """
    # Use .get() so a missing Content-Type header does not raise a KeyError.
    content_type = resp.headers.get('Content-Type', '').lower()
    return (resp.status_code == 200
            and 'html' in content_type)
def get_urls(htmlsoup):
    """Collect links that point back to scrapinghub.com (relative links are skipped)."""
    urls = []
    links = htmlsoup.find_all('a')
    for link in links:
        url = link.get('href', '')
        if 'scrapinghub.com' in url:
            urls.append(url)
    return urls
def clean_line(line,
               eol='\n',
               minlen=1,
               url_re=re.compile(r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))""", re.IGNORECASE | re.UNICODE)
               ):
    """ The url regex is borrowed from https://github.com/rcompton/ryancompton.net/blob/master/assets/praw_drugs/urlmarker.py#L23 """
    if not line.strip():
        return line
    #preprocess
    ##remove all URLs
    line = url_re.sub('', line)
    ##clean from html tags
    line = BeautifulSoup(line, "lxml").get_text()
    ##remove everything other than Latin chars, numbers, punctuation and spaces
    line = regex.sub(r'[^\p{Latin}\p{Number}\p{Punctuation} ]', '', line)
    ##split into sentences
    sentences = sent_tokenize(line)
    ##replace the curly double quotation marks with plain ones, i.e. “ or ” --> "
    sentences = [re.sub('(“|”)', '"', sentence) for sentence in sentences]
    ##replace the curly single quotation marks with plain ones, i.e. ‘ or ’ --> '
    sentences = [re.sub('(‘|’)', "'", sentence) for sentence in sentences]
    ##collapse runs of whitespace into a single space
    sentences = [re.sub(r'(\s)+', ' ', sentence) for sentence in sentences]
    ##strip, lowercase and keep only sentences longer than `minlen` words
    sentences = [sentence.strip().lower() + eol for sentence in sentences if len(sentence.split(" ")) > minlen]
    return sentences
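# Illustrative example (hypothetical input):
#   clean_line('Visit <b>https://scrapinghub.com</b> NOW!')
# strips the URL and the tags and returns something like ['visit now!\n'].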
def extract_text(htmlsoup):
    # clean_line returns a list of sentences (or the original blank line),
    # so `lines` ends up nested and is flattened later.
    lines = []
    for line in htmlsoup.text.split('\n'):
        lines.append(clean_line(line))
    return lines
def draw_word_cloud(text):
    """Generate a word cloud from `text` and save it to scrapinghub_wordcloud.png."""
    wordcloud = WordCloud(max_words=1000, stopwords=set(STOPWORDS), random_state=1).generate(text)
    wordcloud.to_file("scrapinghub_wordcloud.png")
def flatten(l, a=None):
    """ Borrowed from https://stackoverflow.com/a/40252152/2558856 with a tiny modification """
    # Avoid a mutable default argument so repeated calls don't accumulate results.
    if a is None:
        a = []
    for i in l:
        if isinstance(i, list):
            flatten(i, a)
        else:
            a.append(i)
    return a
if __name__ == '__main__':
    start_time = time.time()
    urls = ['https://scrapinghub.com']
    scraped_urls = set()
    scraped_count = 0
    text_lines = []
    try:
        while urls:
            url = urls.pop(0)
            if url in scraped_urls:
                continue
            scraped_urls.add(url)
            html_content = simple_get(url)
            if html_content is None:
                continue
            htmlsoup = BeautifulSoup(html_content, 'lxml')
            urls.extend(get_urls(htmlsoup))
            textual_content = extract_text(htmlsoup)
            text_lines.extend(textual_content)
            scraped_count += 1
            print(f"Done scraping {url} -- count: {scraped_count}")
    except KeyboardInterrupt:
        print("\nInterrupted!\nProducing WordCloud..")
    text_lines = flatten(text_lines)
    text = '\n'.join(text_lines)
    if not text:
        text = 'empty!'
    draw_word_cloud(text)
    with open('scrapinghub.txt', 'w') as outfile:
        outfile.write(text)
    elapsed_minutes = (time.time() - start_time) / 60
    redcolor = '\033[01;31;47m'
    blackcolor = '\033[01;30;47m'
    nativecolor = '\033[m'
    print("Scraped {} pages from {}scraping{}hub{} within {:5.2f} minutes.".
          format(scraped_count, blackcolor, redcolor, nativecolor, elapsed_minutes)
          )