Build a wordcloud for your website - Python code.
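"""
Crawl a site (scrapinghub.com by default), clean the visible text and render it
as a word cloud.

Rough dependency sketch (assumed package names, not pinned by the gist):
    pip install requests beautifulsoup4 lxml nltk regex wordcloud
    python -c "import nltk; nltk.download('punkt')"   # sent_tokenize needs the punkt data

Outputs: scrapinghub.txt (the collected text) and scrapinghub_wordcloud.png.
Press Ctrl-C to stop crawling early; the word cloud is then built from whatever
was collected so far.
"""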
import re
import time
import regex
from nltk import sent_tokenize
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS
def simple_get(url):
    """
    Borrowed from https://realpython.com/python-web-scraping-practical-introduction/
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of the response is some kind of HTML/XML, return the
    raw (bytes) content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None
    except RequestException as e:
        print('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
def is_good_response(resp):
    """
    Borrowed from https://realpython.com/python-web-scraping-practical-introduction/
    Returns True if the response seems to be HTML, False otherwise.
    """
    # Use .get() so a missing Content-Type header does not raise a KeyError.
    content_type = resp.headers.get('Content-Type', '').lower()
    return (resp.status_code == 200
            and 'html' in content_type)
def get_urls(htmlsoup):
    """Collect links that point back to scrapinghub.com (relative links are skipped)."""
    urls = []
    links = htmlsoup.find_all('a')
    for link in links:
        url = link.get('href', '')
        if 'scrapinghub.com' in url:
            urls.append(url)
    return urls
def clean_line(line,
               eol='\n',
               minlen=1,
               url_re=re.compile(r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))""", re.IGNORECASE | re.UNICODE)
               ):
    """ The url regex is borrowed from https://github.com/rcompton/ryancompton.net/blob/master/assets/praw_drugs/urlmarker.py#L23 """
    if not line.strip():
        return line
    #preprocess
    ##remove all URLs
    line = url_re.sub('', line)
    ##clean from html tags
    line = BeautifulSoup(line, "lxml").get_text()
    ##remove everything other than Latin chars, numbers, punctuation and spaces
    line = regex.sub(r'[^\p{Latin}\p{Number}\p{Punctuation} ]', '', line)
    ##split into sentences
    sentences = sent_tokenize(line)
    ##replace the curly double quotation marks with plain ones, i.e. “ or ” --> "
    sentences = [re.sub('(“|”)', '"', sentence) for sentence in sentences]
    ##replace the curly single quotation marks with plain ones, i.e. ‘ or ’ --> '
    sentences = [re.sub('(‘|’)', "'", sentence) for sentence in sentences]
    ##collapse runs of whitespace into a single space
    sentences = [re.sub(r'(\s)+', ' ', sentence) for sentence in sentences]
    ##strip, lowercase and keep only sentences longer than `minlen` words
    sentences = [sentence.strip().lower() + eol for sentence in sentences if len(sentence.split(" ")) > minlen]
    return sentences
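# Illustrative example (hypothetical input):
#   clean_line('Visit <b>https://scrapinghub.com</b> NOW!')
# strips the URL and the tags and returns something like ['visit now!\n'].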
def extract_text(htmlsoup):
    # clean_line returns a list of sentences (or the original blank line),
    # so `lines` ends up nested and is flattened later.
    lines = []
    for line in htmlsoup.text.split('\n'):
        lines.append(clean_line(line))
    return lines
def draw_word_cloud(text):
    """Generate a word cloud from `text` and save it to scrapinghub_wordcloud.png."""
    wordcloud = WordCloud(max_words=1000, stopwords=set(STOPWORDS), random_state=1).generate(text)
    wordcloud.to_file("scrapinghub_wordcloud.png")
def flatten(l, a=None):
    """ Borrowed from https://stackoverflow.com/a/40252152/2558856 with a tiny modification """
    # Avoid a mutable default argument so repeated calls don't accumulate results.
    if a is None:
        a = []
    for i in l:
        if isinstance(i, list):
            flatten(i, a)
        else:
            a.append(i)
    return a
if __name__ == '__main__':
    start_time = time.time()
    urls = ['https://scrapinghub.com']
    scraped_urls = set()
    scraped_count = 0
    text_lines = []
    try:
        while urls:
            url = urls.pop(0)
            if url in scraped_urls:
                continue
            scraped_urls.add(url)
            html_content = simple_get(url)
            if html_content is None:
                continue
            htmlsoup = BeautifulSoup(html_content, 'lxml')
            urls.extend(get_urls(htmlsoup))
            textual_content = extract_text(htmlsoup)
            text_lines.extend(textual_content)
            scraped_count += 1
            print(f"Done scraping {url} -- count: {scraped_count}")
    except KeyboardInterrupt:
        print("\nInterrupted!\nProducing WordCloud..")
    text_lines = flatten(text_lines)
    text = '\n'.join(text_lines)
    if not text:
        text = 'empty!'
    draw_word_cloud(text)
    with open('scrapinghub.txt', 'w') as outfile:
        outfile.write(text)
    elapsed_minutes = (time.time() - start_time) / 60
    redcolor = '\033[01;31;47m'
    blackcolor = '\033[01;30;47m'
    nativecolor = '\033[m'
    print("Scraped {} pages from {}scraping{}hub{} within {:5.2f} minutes.".
          format(scraped_count, blackcolor, redcolor, nativecolor, elapsed_minutes)
          )