Last active
July 3, 2020 17:17
-
-
Save tuck1s/f77b5d7ebcfaf7acec09fc41ec730251 to your computer and use it in GitHub Desktop.
Simple tool to count the distinct email domains found in one or more files, or on stdin
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse, csv, sys | |
import dns.resolver | |
def domainpart(n):
    """Return the normalised domain part of email address *n*, or None if invalid.

    An address is accepted only when it contains exactly one '@' and has a
    non-empty domain after it. The domain is stripped of surrounding
    whitespace and lower-cased, because DNS names are case-insensitive;
    without this, 'Example.COM' and 'example.com ' would be reported as
    distinct domains.

    :param n: candidate email address (str)
    :return: the domain as a lower-case str, or None when *n* is not usable
    """
    parts = n.split('@')
    if len(parts) != 2:
        return None
    domain = parts[1].strip().lower()
    # 'user@' has exactly one '@' but no domain: report it as invalid instead
    # of handing back an empty string.
    return domain or None
class DomainList:
    """Tally of how many times each email domain has been seen.

    ``domains`` maps a domain name (as returned by ``domainpart``) to the
    number of addresses that were added for it.
    """

    def __init__(self):
        self.domains = {}

    def add(self, name):
        """Count the domain of email address *name*.

        :param name: email address whose domain should be counted
        :return: True if the address was counted, False if ``domainpart``
                 rejected it (it is then ignored rather than being tallied
                 under a bogus None/empty key)
        """
        dn = domainpart(name)
        if not dn:
            return False
        self.domains[dn] = self.domains.get(dn, 0) + 1
        return True

    def dump(self, mx):
        """Print one ``domain,count[,mx hosts]`` line per distinct domain to stdout.

        :param mx: when true, look up each domain's MX records and append the
                   exchange host names (space-separated) as a third column.
                   A failed lookup is reported on stderr and leaves that
                   column empty instead of aborting the whole report.
        """
        if mx:
            # dnspython 2.x deprecates query() in favour of resolve(); use
            # whichever the installed version provides.
            lookup = getattr(dns.resolver, 'resolve', None) or dns.resolver.query
        for k, v in self.domains.items():
            if mx:
                try:
                    result = lookup(k, 'MX')
                    hosts = ' '.join(exdata.exchange.to_text() for exdata in result)
                except dns.exception.DNSException as err:
                    # NXDOMAIN, NoAnswer, Timeout etc. all derive from DNSException.
                    print('MX lookup failed for {}: {}'.format(k, type(err).__name__), file=sys.stderr)
                    hosts = ''
                provider = ',' + hosts
            else:
                provider = ''
            print('{},{}{}'.format(k, v, provider))
# -----------------------------------------------------------------------------------------
# Main code
# -----------------------------------------------------------------------------------------
# Read and validate command-line arguments. Several input files may be given on the
# command line; when none is given, addresses are read from stdin.
parser = argparse.ArgumentParser(
    description='List the distinct domains (and their frequency) from file(s) of email addresses')
parser.add_argument('-mx', action='store_true', help='show MX record lookup')
parser.add_argument('files', metavar='file', type=argparse.FileType('r'), default=[sys.stdin], nargs='*',
                    help='input filename. If omitted, will read from stdin')
args = parser.parse_args()

dl = DomainList()
for fh in args.files:
    # Each input is parsed as CSV, so one line may carry several comma-separated addresses.
    for line in csv.reader(fh):
        for addr in line:
            if domainpart(addr):
                dl.add(addr)
            else:
                # Diagnostics go to stderr so they never get mixed into the
                # domain,count report that dump() writes to stdout.
                print('File {}:\t{} Not an email address - skipping'.format(fh.name, addr), file=sys.stderr)
dl.dump(args.mx)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment