Skip to content

Instantly share code, notes, and snippets.

@bbengfort
Created June 19, 2014 11:16
Show Gist options
  • Save bbengfort/eb91ed2dcda9d8bb238b to your computer and use it in GitHub Desktop.
Save bbengfort/eb91ed2dcda9d8bb238b to your computer and use it in GitHub Desktop.
Count Anchor Tags is a small program that counts the number of <a> (anchor) tags in an HTML document that is fetched from the web. Also provides the option to write out that fetched document or the list of anchor tags on command.
#!/usr/bin/env python
# cats
# Count Anchor Tags
#
# Author: Benjamin Bengfort <[email protected]>
# Created: Thu Jun 19 06:34:50 2014 -0400
#
# Copyright (C) 2014 Bengfort.com
# For license information, see LICENSE.txt
#
# ID: cats [] [email protected] $
"""
Count Anchor Tags is a small program that counts the number of a tags in
an HTML document that is fetched from the web. Also provides the option to
write out that fetched document or the list of anchor tags on command.
"""
##########################################################################
## Imports
##########################################################################
import os
import sys
import json
import argparse
import requests
from datetime import datetime
from bs4 import BeautifulSoup, SoupStrainer
##########################################################################
## CATs Class
##########################################################################
class CountAnchorTags(object):
    """
    Fetches an HTML document from a URL and counts the anchor (``<a>``)
    tags it contains. The document is fetched and parsed eagerly at
    construction time; results can be serialized with :meth:`dump`.
    """

    def __init__(self, url):
        """
        :param url: the URL of the HTML document to fetch and parse.
        """
        self.url = url
        self.response = None   # requests.Response once fetched
        self.soup = None       # BeautifulSoup of the anchor tags only
        self.elapsed = None    # timedelta covering fetch + parse
        # Fetch the URL and parse it immediately on construction.
        self.fetch()

    def fetch(self):
        """
        Fetches the URL and parses it with BeautifulSoup, recording the
        elapsed fetch+parse time in ``self.elapsed``.

        Raises:
            requests.HTTPError: if the server returns an error status.
        """
        start = datetime.now()

        # Fetch
        self.response = requests.get(self.url)
        self.response.raise_for_status()

        # Parse — restrict the soup to anchor tags only, so parsing and
        # later find_all('a') calls stay cheap on large documents.
        strainer = SoupStrainer('a')
        self.soup = BeautifulSoup(self.response.content, 'lxml',
                                  parse_only=strainer)

        finit = datetime.now()
        self.elapsed = finit - start

    def links(self):
        """
        Generator yielding the ``href`` of each anchor tag.

        Anchors without an ``href`` attribute (e.g. named anchors,
        ``<a name="...">``) are skipped; the original ``tag['href']``
        lookup raised KeyError on such tags.
        """
        if self.soup is not None:
            for tag in self.soup.find_all('a'):
                href = tag.get('href')
                if href is not None:
                    yield href

    def dump(self, stream, **kwargs):
        """
        Dumps a JSON object describing the fetch (URL, status, timing,
        redirect count) plus the anchor-tag count and list of hrefs.

        :param stream: a writable file-like object.
        :param kwargs: forwarded to :func:`json.dump` (e.g. ``indent``).
        """
        data = {
            "url": self.response.url,
            "status_code": self.response.status_code,
            "elapsed": self.elapsed.total_seconds(),
            "redirects": len(self.response.history),
            "count": len(self),
            "links": list(self.links()),
        }
        json.dump(data, stream, **kwargs)

    def __len__(self):
        """
        Number of anchor tags in the document; 0 before a successful
        fetch (the original dereferenced a None soup here).
        """
        if self.soup is None:
            return 0
        return len(self.soup.find_all('a'))
##########################################################################
## Main
##########################################################################
def main(*argv):
    """
    Entry point: parse command line arguments, count the anchor tags at
    the given URL, and dump the results as JSON.

    :param argv: full command line including the program name, as in
        ``main(*sys.argv)``; when empty, argparse falls back to
        ``sys.argv`` itself.
    """
    parser = argparse.ArgumentParser(
        description="Counts the anchor tags on a web page",
        epilog="For help, please see Ben",
    )
    # The ArgumentParser(version=...) keyword was removed in Python 3
    # (it raises TypeError); the supported form is a --version action.
    parser.add_argument('-v', '--version', action='version',
                        version='1.0.0')
    parser.add_argument('url', nargs=1, type=str,
                        help='The URL to download and count tags on')
    parser.add_argument('-o', '--outpath', metavar='PATH',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help='Dump the results to a JSON file.')

    # Honor the argv actually passed to main() (dropping the program
    # name); the original called parse_args() with no arguments, which
    # silently ignored main()'s parameters.
    args = parser.parse_args(list(argv[1:]) if argv else None)

    cats = CountAnchorTags(args.url[0])
    cats.dump(args.outpath, indent=4)


if __name__ == '__main__':
    main(*sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment