Created
September 11, 2016 22:53
-
-
Save konklone/62a2a2a1f67812c8439867345c8863a3 to your computer and use it in GitHub Desktop.
old script for cleaning up subdomain exports, used to be https://github.com/GSA/data/blob/subdomains/subdomains/clean.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""Filter a raw subdomain CSV export into three output files.

Usage: python clean.py <input.csv>

Each input row is expected to have at least these columns (0-indexed):
  1: second-level domain, 2: agency type, 3: agency,
  4: subdomain, 5: status.
The leading and trailing columns (row[0] and row[-1]) are dropped on output.

Outputs:
  subdomains-filtered.csv — rows that pass every filter
  investigate.csv         — rows whose host looks missing (worth a second look)
  rejected.csv            — rows removed by a filter
"""
from sys import argv
import csv
import re

script, arg = argv

# Dotted/dashed quads of 1-3 digits are almost certainly raw IP addresses.
# Compiled once here instead of re-scanning the pattern on every row.
IP_PATTERN = re.compile(r"\d{1,3}[.\-]\d{1,3}[.\-]\d{1,3}[.\-]\d{1,3}")

# Subdomain markers that indicate a mail server rather than a web host.
MAIL_PREFIXES = ("mail.", "pop.", "mx.")
MAIL_SUBSTRINGS = (".mail.", "smtp", ".pop.", ".mx.")

mail_domains = 0
ip_domains = 0

# Bug fix: the original opened the input with mode 'rb', which raises under
# Python 3's csv.reader (it requires text mode). The documented way to feed
# csv readers/writers is text mode with newline=''. Context managers also
# guarantee the handles close even if a row raises mid-loop.
with open(arg, newline='') as inputfile, \
     open('subdomains-filtered.csv', 'w', newline='') as cleaned, \
     open('investigate.csv', 'w', newline='') as missing, \
     open('rejected.csv', 'w', newline='') as rejected:

    inputreader = csv.reader(inputfile, lineterminator='\n')
    cleaned_writer = csv.writer(cleaned, lineterminator='\n')
    missing_writer = csv.writer(missing, lineterminator='\n')
    rejected_writer = csv.writer(rejected, lineterminator='\n')

    for row in inputreader:
        domain = row[1].lower()
        agencyType = row[2].lower()
        agency = row[3].lower()
        subdomain = row[4].lower()
        status = row[5].lower()
        # NOTE(review): the original also read row[6] into an unused `ip`
        # variable; dropped here since nothing consumed it.

        # Every writer emits the same slice; compute it once per row.
        output_row = row[1:-1]

        # Reject rows missing either required name field.
        if (not domain) or (not subdomain):
            print("data missing")
            rejected_writer.writerow(output_row)
            continue

        # Header row: echo it into all three outputs so each file is
        # self-describing, then skip further filtering.
        if domain.startswith("second level"):
            cleaned_writer.writerow(output_row)
            missing_writer.writerow(output_row)
            rejected_writer.writerow(output_row)
            continue

        # Reject "ic" subdomains (leading label or any interior label).
        if (".ic." in subdomain) or subdomain.startswith("ic."):
            rejected_writer.writerow(output_row)
            continue

        # Keep only federal-agency rows.
        if not agencyType.startswith("federal agency"):
            rejected_writer.writerow(output_row)
            continue
        if agency.startswith("non-federal"):
            rejected_writer.writerow(output_row)
            continue

        # Reject subdomains that are probably bare IP addresses.
        if IP_PATTERN.search(subdomain):
            rejected_writer.writerow(output_row)
            ip_domains += 1
            continue

        # Reject likely mail servers. str.startswith accepts a tuple of
        # prefixes, replacing the original long `or` chain.
        if subdomain.startswith(MAIL_PREFIXES) or any(
                marker in subdomain for marker in MAIL_SUBSTRINGS):
            mail_domains += 1
            rejected_writer.writerow(output_row)
            continue

        # Known-missing hosts go to the "investigate" file for a second look.
        if status.startswith(("does_not_exist", "no_webserver_record_found")):
            missing_writer.writerow(output_row)
            continue

        # Everything that survived the filters is clean.
        cleaned_writer.writerow(output_row)

print("Mail domains: %i" % mail_domains)
print("IP domains: %i" % ip_domains)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment