Skip to content

Instantly share code, notes, and snippets.

@bbengfort
Created June 19, 2014 11:16
Show Gist options
  • Save bbengfort/eb91ed2dcda9d8bb238b to your computer and use it in GitHub Desktop.
Save bbengfort/eb91ed2dcda9d8bb238b to your computer and use it in GitHub Desktop.
Count Anchor Tags is a small program that counts the number of <a> (anchor) tags in an HTML document that is fetched from the web. Also provides the option to write out that fetched document or the list of anchor tags on command.
#!/usr/bin/env python
# cats
# Count Anchor Tags
#
# Author: Benjamin Bengfort <[email protected]>
# Created: Thu Jun 19 06:34:50 2014 -0400
#
# Copyright (C) 2014 Bengfort.com
# For license information, see LICENSE.txt
#
# ID: cats [] [email protected] $
"""
Count Anchor Tags is a small program that counts the number of a tags in
an HTML document that is fetched from the web. Also provides the option to
write out that fetched document or the list of anchor tags on command.
"""
##########################################################################
## Imports
##########################################################################
import os
import sys
import json
import argparse
import requests
from datetime import datetime
from bs4 import BeautifulSoup, SoupStrainer
##########################################################################
## CATs Class
##########################################################################
class CountAnchorTags(object):
    """
    Fetches an HTML document from a URL and counts the anchor (``<a>``)
    tags it contains. The document is fetched and parsed eagerly at
    construction time; results can be serialized with :meth:`dump`.
    """

    def __init__(self, url):
        """
        :param url: the URL of the HTML document to fetch and parse.
        """
        self.url = url
        self.response = None   # requests.Response once fetched
        self.soup = None       # BeautifulSoup of the anchor tags only
        self.elapsed = None    # timedelta covering fetch + parse
        # Fetch the URL and parse it immediately on construction.
        self.fetch()

    def fetch(self):
        """
        Fetches the URL and parses it with BeautifulSoup, recording the
        elapsed fetch+parse time in ``self.elapsed``.

        Raises:
            requests.HTTPError: if the server returns an error status.
        """
        start = datetime.now()

        # Fetch
        self.response = requests.get(self.url)
        self.response.raise_for_status()

        # Parse — restrict the soup to anchor tags only, so parsing and
        # later find_all('a') calls stay cheap on large documents.
        strainer = SoupStrainer('a')
        self.soup = BeautifulSoup(self.response.content, 'lxml',
                                  parse_only=strainer)

        finit = datetime.now()
        self.elapsed = finit - start

    def links(self):
        """
        Generator yielding the ``href`` of each anchor tag.

        Anchors without an ``href`` attribute (e.g. named anchors,
        ``<a name="...">``) are skipped; the original ``tag['href']``
        lookup raised KeyError on such tags.
        """
        if self.soup is not None:
            for tag in self.soup.find_all('a'):
                href = tag.get('href')
                if href is not None:
                    yield href

    def dump(self, stream, **kwargs):
        """
        Dumps a JSON object describing the fetch (URL, status, timing,
        redirect count) plus the anchor-tag count and list of hrefs.

        :param stream: a writable file-like object.
        :param kwargs: forwarded to :func:`json.dump` (e.g. ``indent``).
        """
        data = {
            "url": self.response.url,
            "status_code": self.response.status_code,
            "elapsed": self.elapsed.total_seconds(),
            "redirects": len(self.response.history),
            "count": len(self),
            "links": list(self.links()),
        }
        json.dump(data, stream, **kwargs)

    def __len__(self):
        """
        Number of anchor tags in the document; 0 before a successful
        fetch (the original dereferenced a None soup here).
        """
        if self.soup is None:
            return 0
        return len(self.soup.find_all('a'))
##########################################################################
## Main
##########################################################################
def main(*argv):
    """
    Entry point: parse command line arguments, count the anchor tags at
    the given URL, and dump the results as JSON.

    :param argv: full command line including the program name, as in
        ``main(*sys.argv)``; when empty, argparse falls back to
        ``sys.argv`` itself.
    """
    parser = argparse.ArgumentParser(
        description="Counts the anchor tags on a web page",
        epilog="For help, please see Ben",
    )
    # The ArgumentParser(version=...) keyword was removed in Python 3
    # (it raises TypeError); the supported form is a --version action.
    parser.add_argument('-v', '--version', action='version',
                        version='1.0.0')
    parser.add_argument('url', nargs=1, type=str,
                        help='The URL to download and count tags on')
    parser.add_argument('-o', '--outpath', metavar='PATH',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help='Dump the results to a JSON file.')

    # Honor the argv actually passed to main() (dropping the program
    # name); the original called parse_args() with no arguments, which
    # silently ignored main()'s parameters.
    args = parser.parse_args(list(argv[1:]) if argv else None)

    cats = CountAnchorTags(args.url[0])
    cats.dump(args.outpath, indent=4)


if __name__ == '__main__':
    main(*sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment