-
-
Save ixtel/05d917e13e16214113c9 to your computer and use it in GitHub Desktop.
Count Anchor Tags is a small program that counts the number of a tags in an HTML document that is fetched from the web. Also provides the option to write out that fetched document or the list of anchor tags on command.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# cats | |
# Count Anchor Tags | |
# | |
# Author: Benjamin Bengfort <[email protected]> | |
# Created: Thu Jun 19 06:34:50 2014 -0400 | |
# | |
# Copyright (C) 2014 Bengfort.com | |
# For license information, see LICENSE.txt | |
# | |
# ID: cats [] [email protected] $ | |
""" | |
Count Anchor Tags is a small program that counts the number of anchor (<a>)
tags in an HTML document that is fetched from the web. It also provides the
option to write out that fetched document or the list of anchor tags on command.
""" | |
########################################################################## | |
## Imports | |
########################################################################## | |
import os | |
import sys | |
import json | |
import argparse | |
import requests | |
from datetime import datetime | |
from bs4 import BeautifulSoup, SoupStrainer | |
########################################################################## | |
## CATs Class | |
########################################################################## | |
class CountAnchorTags(object):
    """
    Fetches a web page and counts the anchor (``<a>``) tags found in it.

    The page is downloaded and parsed as soon as the object is constructed,
    so creating an instance performs network I/O and may raise whatever
    ``requests`` raises (including ``requests.HTTPError`` for 4xx/5xx
    responses, via ``raise_for_status``).

    Attributes:
        url: the URL that was requested.
        timeout: value forwarded to ``requests.get`` (``None`` = no timeout).
        response: the ``requests`` response object, or None before fetching.
        soup: the parsed document restricted to anchor tags, or None.
        elapsed: ``timedelta`` covering both the download and the parse.
    """

    def __init__(self, url, timeout=None):
        """
        Stores the URL and immediately fetches and parses it.

        :param url: address of the HTML document to inspect.
        :param timeout: optional timeout (seconds, or a connect/read tuple)
            handed to ``requests.get``; the default of None keeps the
            library's behaviour of waiting indefinitely.
        """
        self.url = url
        self.timeout = timeout
        self.response = None
        self.soup = None
        self.elapsed = None

        # Fetch the URL and parse it
        self.fetch()

    def fetch(self):
        """
        Fetches the URL and parses it with BeautifulSoup.

        Only anchor tags are kept in the parse tree (via ``SoupStrainer``).
        ``elapsed`` is set to the wall-clock time spent downloading and
        parsing together.
        """
        start = datetime.now()

        # Fetch
        self.response = requests.get(self.url, timeout=self.timeout)
        self.response.raise_for_status()

        # Parse
        strainer = SoupStrainer('a')
        self.soup = BeautifulSoup(
            self.response.content, 'lxml', parse_only=strainer
        )

        self.elapsed = datetime.now() - start

    def links(self):
        """
        Generates the ``href`` value of every anchor tag that has one.

        Anchors without an ``href`` attribute (for example named anchors
        such as ``<a name="top">``) are skipped rather than raising a
        KeyError, so the number of links yielded can be smaller than
        ``len(self)``. Yields nothing if no document has been parsed.
        """
        if self.soup is None:
            return

        for tag in self.soup.find_all('a'):
            href = tag.get('href')
            # An empty href ("") is still a real attribute value; only a
            # missing attribute (None) is skipped.
            if href is not None:
                yield href

    def dump(self, stream, **kwargs):
        """
        Writes a JSON object describing the fetch to ``stream``.

        The object contains the final URL, the HTTP status code, the elapsed
        time in seconds, the number of redirects followed, the count of all
        anchor tags and the list of their ``href`` values.

        :param stream: writable text file-like object.
        :param kwargs: passed straight through to ``json.dump``.
        """
        data = {
            "url": self.response.url,
            "status_code": self.response.status_code,
            "elapsed": self.elapsed.total_seconds(),
            "redirects": len(self.response.history),
            "count": len(self),
            "links": list(self.links()),
        }
        json.dump(data, stream, **kwargs)

    def __len__(self):
        """
        Returns the number of anchor tags in the parsed document, counting
        those without an ``href``; 0 if nothing has been parsed yet.
        """
        if self.soup is None:
            return 0
        return len(self.soup.find_all('a'))
########################################################################## | |
## Main | |
########################################################################## | |
def main(*argv):
    """
    Command line entry point: counts the anchor tags on a web page and
    writes a JSON report.

    :param argv: the full argument vector, program name first (as in
        ``main(*sys.argv)``). Everything after the first element is parsed.
        When called with no arguments at all, ``sys.argv`` is used instead.

    Exits through ``SystemExit`` on ``--help``, ``--version`` or a usage
    error, as is usual for argparse.
    """
    parser = argparse.ArgumentParser(
        description="Counts the anchor tags on a web page",
        epilog="For help, please see Ben",
    )

    # ArgumentParser no longer accepts a ``version`` keyword on Python 3;
    # the 'version' action provides the same -v/--version flags.
    parser.add_argument('-v', '--version', action='version', version='1.0.0')
    parser.add_argument(
        'url', nargs=1, type=str,
        help='The URL to download and count tags on',
    )
    parser.add_argument(
        '-o', '--outpath', metavar='PATH', type=argparse.FileType('w'),
        default=sys.stdout, help='Dump the results to a JSON file.',
    )

    # Skip the program name; fall back to sys.argv when nothing was passed.
    args = parser.parse_args(list(argv[1:]) if argv else None)

    try:
        cats = CountAnchorTags(args.url[0])
        cats.dump(args.outpath, indent=4)
    finally:
        # FileType opened the file for us; close it, but never close stdout.
        if args.outpath is not sys.stdout:
            args.outpath.close()


if __name__ == '__main__':
    main(*sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment