lloesche · August 22, 2019 13:33
diff --git a/process.py b/process.py
 #!/usr/bin/env python3
 from pprint import pprint

 # Remote document (an https URI) passed to process() by the call at the end of this file.
 document_uri = 'https://gist.githubusercontent.com/demersdesigns/4442cd84c1cc6c5ccda9b19eac1ba52b/raw/cf06109a805b661dd12133f9aa4473435e478569/craft-popular-urls'

 def process(document, num_threads=10):
    """Find every http(s) URL in a document and report its content type and title.

    The document is read from a remote location when ``document`` starts with
    http:// or https://, and from the local disk otherwise. All http and https
    URLs found in its text are then fetched, at most ``num_threads`` at a time.
    Duplicate URLs are fetched only once.

    For each URL the value of the ``Content-Type`` response header is recorded.
    If the response is an HTML document, the text of its ``<title>`` element is
    recorded as well.

    Parameters
    ----------
    document : str
        Location of a document containing http(s) URLs. A string starting with
        http:// or https:// is treated as a remote URI, anything else as the
        path of a local file.
    num_threads : int, optional
        The number of concurrent URL processing threads (default is 10).

    Returns
    -------
    Dict[str, Dict[str, str]]
        A dictionary mapping each distinct URL to one of
        ``{'content-type': content_type, 'title': title}`` (HTML document with
        a non-empty title), ``{'content-type': content_type}`` (any other
        response carrying a Content-Type header) or ``{}`` (the fetch failed
        or the response had no Content-Type header).

    Raises
    ------
    ValueError
        If ``num_threads`` is smaller than 1.
    OSError
        If the document itself cannot be read (missing file, network error).
    """
    # Function-scope imports keep this block self-contained.
    import http.client
    import re
    import urllib.request
    from concurrent.futures import ThreadPoolExecutor
    from html.parser import HTMLParser

    if num_threads < 1:
        raise ValueError('num_threads must be at least 1')

    fetch_timeout = 10  # seconds to wait on any single network operation
    max_html_bytes = 1 << 20  # <title> sits near the top; cap the download at 1 MiB
    # Some servers reject the default "Python-urllib" agent string.
    request_headers = {'User-Agent': 'Mozilla/5.0 (compatible; process.py)'}
    html_types = ('text/html', 'application/xhtml+xml')
    fetch_errors = (OSError, ValueError, http.client.HTTPException)
    url_pattern = re.compile(r'https?://[^\s<>"\']+')

    class _TitleParser(HTMLParser):
        """Collect the text of the first <title> element fed to the parser."""

        def __init__(self):
            super().__init__()
            self.title = None
            self._inside = False
            self._chunks = []

        def handle_starttag(self, tag, attrs):
            # Only the first <title> counts; later ones (e.g. inside <svg>) are ignored.
            if tag == 'title' and self.title is None:
                self._inside = True

        def handle_endtag(self, tag):
            if tag == 'title' and self._inside:
                self._inside = False
                # Collapse line breaks and runs of whitespace inside the title.
                self.title = ' '.join(''.join(self._chunks).split())

        def handle_data(self, data):
            if self._inside:
                self._chunks.append(data)

    def _open(url):
        """Open ``url`` with the shared headers and timeout."""
        request = urllib.request.Request(url, headers=request_headers)
        return urllib.request.urlopen(request, timeout=fetch_timeout)

    def _decode(raw, charset):
        """Decode ``raw`` bytes, falling back to UTF-8 for missing/unknown charsets."""
        try:
            return raw.decode(charset or 'utf-8', errors='replace')
        except LookupError:
            # The server announced a charset Python does not know.
            return raw.decode('utf-8', errors='replace')

    def _read_document(source):
        """Return the text of ``source``, a remote http(s) URI or a local path."""
        if source.startswith(('http://', 'https://')):
            with _open(source) as response:
                return _decode(response.read(), response.headers.get_content_charset())
        with open(source, encoding='utf-8', errors='replace') as handle:
            return handle.read()

    def _fetch(url):
        """Return the result dict for one URL; never raises for network errors."""
        info = {}
        try:
            with _open(url) as response:
                headers = response.headers
                content_type = headers.get('Content-Type')
                if content_type is not None:
                    info['content-type'] = content_type.strip()
                    if headers.get_content_type() in html_types:
                        parser = _TitleParser()
                        parser.feed(_decode(response.read(max_html_bytes),
                                            headers.get_content_charset()))
                        if parser.title:
                            info['title'] = parser.title
        except fetch_errors:
            # Best effort: an unreachable or malformed URL is reported with
            # whatever was gathered before the failure (usually nothing).
            pass
        return info

    text = _read_document(document)
    # Trailing sentence punctuation is not part of the URL; dict.fromkeys
    # removes duplicates while keeping first-seen order.
    urls = list(dict.fromkeys(found.rstrip('.,;:!?') for found in url_pattern.findall(text)))
    if not urls:
        return {}

    # Never start more threads than there is work for.
    with ThreadPoolExecutor(max_workers=min(num_threads, len(urls))) as pool:
        return dict(zip(urls, pool.map(_fetch, urls)))


 if __name__ == '__main__':
    # Run the demo only when this file is executed as a script, so that
    # importing the module does not trigger any processing.
    pprint(process(document_uri))
	#!/usr/bin/env python3
	from pprint import pprint

	# Remote document (an https URI) passed to process() by the call at the end of this file.
	document_uri = 'https://gist.githubusercontent.com/demersdesigns/4442cd84c1cc6c5ccda9b19eac1ba52b/raw/cf06109a805b661dd12133f9aa4473435e478569/craft-popular-urls'

	def process(document, num_threads=10):
	    """Find every http(s) URL in a document and report its content type and title.

	    The document is read from a remote location when ``document`` starts with
	    http:// or https://, and from the local disk otherwise. All http and https
	    URLs found in its text are then fetched, at most ``num_threads`` at a time.
	    Duplicate URLs are fetched only once.

	    For each URL the value of the ``Content-Type`` response header is recorded.
	    If the response is an HTML document, the text of its ``<title>`` element is
	    recorded as well.

	    Parameters
	    ----------
	    document : str
	        Location of a document containing http(s) URLs. A string starting with
	        http:// or https:// is treated as a remote URI, anything else as the
	        path of a local file.
	    num_threads : int, optional
	        The number of concurrent URL processing threads (default is 10).

	    Returns
	    -------
	    Dict[str, Dict[str, str]]
	        A dictionary mapping each distinct URL to one of
	        ``{'content-type': content_type, 'title': title}`` (HTML document with
	        a non-empty title), ``{'content-type': content_type}`` (any other
	        response carrying a Content-Type header) or ``{}`` (the fetch failed
	        or the response had no Content-Type header).

	    Raises
	    ------
	    ValueError
	        If ``num_threads`` is smaller than 1.
	    OSError
	        If the document itself cannot be read (missing file, network error).
	    """
	    # Function-scope imports keep this block self-contained.
	    import http.client
	    import re
	    import urllib.request
	    from concurrent.futures import ThreadPoolExecutor
	    from html.parser import HTMLParser

	    if num_threads < 1:
	        raise ValueError('num_threads must be at least 1')

	    fetch_timeout = 10  # seconds to wait on any single network operation
	    max_html_bytes = 1 << 20  # <title> sits near the top; cap the download at 1 MiB
	    # Some servers reject the default "Python-urllib" agent string.
	    request_headers = {'User-Agent': 'Mozilla/5.0 (compatible; process.py)'}
	    html_types = ('text/html', 'application/xhtml+xml')
	    fetch_errors = (OSError, ValueError, http.client.HTTPException)
	    url_pattern = re.compile(r'https?://[^\s<>"\']+')

	    class _TitleParser(HTMLParser):
	        """Collect the text of the first <title> element fed to the parser."""

	        def __init__(self):
	            super().__init__()
	            self.title = None
	            self._inside = False
	            self._chunks = []

	        def handle_starttag(self, tag, attrs):
	            # Only the first <title> counts; later ones (e.g. inside <svg>) are ignored.
	            if tag == 'title' and self.title is None:
	                self._inside = True

	        def handle_endtag(self, tag):
	            if tag == 'title' and self._inside:
	                self._inside = False
	                # Collapse line breaks and runs of whitespace inside the title.
	                self.title = ' '.join(''.join(self._chunks).split())

	        def handle_data(self, data):
	            if self._inside:
	                self._chunks.append(data)

	    def _open(url):
	        """Open ``url`` with the shared headers and timeout."""
	        request = urllib.request.Request(url, headers=request_headers)
	        return urllib.request.urlopen(request, timeout=fetch_timeout)

	    def _decode(raw, charset):
	        """Decode ``raw`` bytes, falling back to UTF-8 for missing/unknown charsets."""
	        try:
	            return raw.decode(charset or 'utf-8', errors='replace')
	        except LookupError:
	            # The server announced a charset Python does not know.
	            return raw.decode('utf-8', errors='replace')

	    def _read_document(source):
	        """Return the text of ``source``, a remote http(s) URI or a local path."""
	        if source.startswith(('http://', 'https://')):
	            with _open(source) as response:
	                return _decode(response.read(), response.headers.get_content_charset())
	        with open(source, encoding='utf-8', errors='replace') as handle:
	            return handle.read()

	    def _fetch(url):
	        """Return the result dict for one URL; never raises for network errors."""
	        info = {}
	        try:
	            with _open(url) as response:
	                headers = response.headers
	                content_type = headers.get('Content-Type')
	                if content_type is not None:
	                    info['content-type'] = content_type.strip()
	                    if headers.get_content_type() in html_types:
	                        parser = _TitleParser()
	                        parser.feed(_decode(response.read(max_html_bytes),
	                                            headers.get_content_charset()))
	                        if parser.title:
	                            info['title'] = parser.title
	        except fetch_errors:
	            # Best effort: an unreachable or malformed URL is reported with
	            # whatever was gathered before the failure (usually nothing).
	            pass
	        return info

	    text = _read_document(document)
	    # Trailing sentence punctuation is not part of the URL; dict.fromkeys
	    # removes duplicates while keeping first-seen order.
	    urls = list(dict.fromkeys(found.rstrip('.,;:!?') for found in url_pattern.findall(text)))
	    if not urls:
	        return {}

	    # Never start more threads than there is work for.
	    with ThreadPoolExecutor(max_workers=min(num_threads, len(urls))) as pool:
	        return dict(zip(urls, pool.map(_fetch, urls)))


	if __name__ == '__main__':
	    # Run the demo only when this file is executed as a script, so that
	    # importing the module does not trigger any processing.
	    pprint(process(document_uri))