-
-
Save rfong/7220527 to your computer and use it in GitHub Desktop.
Fork of boyers's domain name n-gram generator that lets the user specify an initial prefix and fall back to other domain extensions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python -O | |
from optparse import OptionParser | |
import random | |
import string | |
import subprocess | |
import threading | |
import time | |
import re | |
# get a list of words with only ASCII characters, and surround them with ^ and $
# to demarcate the word boundaries
# (the file is read inside a `with` block so the handle is closed promptly)
with open("/usr/share/dict/words") as wordlist:
    words = [line.strip().lower() for line in wordlist]
words = [
    "^" + w + "$"
    for w in words
    if w != "" and all(c in string.ascii_lowercase for c in w)
]

# construct a discrete-time markov chain of n-grams
n = 5  # this is the "n" in n-grams, try adjusting this for different results
transitions = {}  # keys are n-grams, values are dicts mapping subsequent n-grams to probabilities
frequencies = {}  # keys are n-grams, values are normalized frequencies [0, 1] of occurrence in the wordlist

# first pass: raw counts
for word in words:
    # every n-character window of the word, left to right
    # (words shorter than n yield no windows at all)
    grams = [word[i:i + n] for i in range(len(word) + 1 - n)]
    for gram in grams:
        frequencies[gram] = frequencies.get(gram, 0) + 1
    # each window transitions to the window one character to its right
    for gram, successor in zip(grams, grams[1:]):
        followers = transitions.setdefault(gram, {})
        followers[successor] = followers.get(successor, 0) + 1

# second pass: turn counts into probabilities
# Divide by the total number of n-gram occurrences so the values really are
# relative frequencies that sum to 1.  (Dividing by the number of *distinct*
# n-grams, as was done previously, only rescales every value by a constant;
# conditional() renormalizes, so prefix_frequencies is unaffected.)
total_occurrences = float(sum(frequencies.values()))
for gram in frequencies:
    frequencies[gram] /= total_occurrences

# each gram's outgoing transition counts are normalized to sum to 1
for followers in transitions.values():
    outgoing = float(sum(followers.values()))
    for successor in followers:
        followers[successor] /= outgoing
# sample a probability mass function
# pmf: dict mapping elements to probabilities
def sample(pmf):
    """Draw one key of *pmf* at random, weighted by its probability.

    The keys are walked in dict iteration order while their probabilities
    are accumulated; the first key whose cumulative mass reaches a uniform
    random threshold is returned.

    If the masses add up to less than the threshold (rounding error, or a
    pmf that does not sum to 1), a key is chosen uniformly instead.

    Raises IndexError if *pmf* is empty.
    """
    threshold = random.random()
    cumulative = 0.0
    for element in pmf:
        cumulative += pmf[element]
        if cumulative >= threshold:
            return element
    # list(...) because random.choice needs a sequence; a dict key view
    # is not indexable
    return random.choice(list(pmf))
# compute a conditional probability mass function
# pmf: dict mapping elements to probabilities
# condition: boolean-valued function to condition on
def conditional(pmf, condition):
    """Return the pmf restricted to elements satisfying *condition*.

    The result is a new dict holding only the elements for which
    ``condition(element)`` is true, with their probabilities rescaled so
    they sum to 1.  *pmf* itself is not modified.

    If no element matches, the result is empty.  If the matching elements
    carry no mass at all, they are returned with their zero masses
    unchanged, because there is nothing to rescale by.
    """
    cond = {e: pmf[e] for e in pmf if condition(e)}
    total_mass = float(sum(cond.values()))
    if total_mass == 0.0:
        # avoid ZeroDivisionError: nothing to normalize
        return cond
    for e in cond:
        cond[e] /= total_mass
    return cond
# compute the prefix frequencies
# prefixes are n-grams that appear at the beginning of words
def _opens_word(gram):
    """Return True when *gram* begins with the "^" start-of-word marker."""
    return gram[0] == "^"


prefix_frequencies = conditional(frequencies, _opens_word)
# generate a new letter according to the markov chain (make sure len(word) >= n)
def evolve(word):
    """Return the next character to append to *word*.

    The trailing n characters of *word* are looked up in the transition
    table and one successor n-gram is sampled; its last character is the
    result.  When the trailing n-gram is not in the table, a random
    lowercase letter or the "$" terminator is returned instead so that
    generation keeps moving.
    """
    tail = word[-n:]
    successors = transitions.get(tail)
    if successors is None:
        # uh oh, unseen n-gram: just return a random letter (or terminator)
        return random.choice(string.ascii_lowercase + "$")
    # sample the n-grams that we can transition to and keep the new letter
    following_gram = sample(successors)
    return following_gram[-1:]
# generate a word according to the markov chain
def gen_word(word=None):
    """Generate one word, optionally starting from the given *word* prefix.

    With no prefix, a starting n-gram is sampled from prefix_frequencies.
    Characters are then appended one at a time until the "$" terminator
    shows up.  Occasionally a short finished word gets a fresh prefix
    glued on, which produces names made of several word-like pieces.
    The "^" and "$" boundary markers are stripped from the result.
    """
    # start with a prefix
    text = sample(prefix_frequencies) if word is None else word

    # keep going until the markov chain adds a terminator to the word
    while text[-1] != "$":
        # generate a new letter and append it to the word
        text += evolve(text)

        # optional: sometimes domains are multiple word-like lexemes
        # concatenated together
        just_terminated = text[-1] == "$"
        if just_terminated and random.random() > 0.7 and len(text) < 8:
            text += sample(prefix_frequencies)

    # remove the boundary markers and return the word
    without_starts = text.replace("^", "")
    return without_starts.replace("$", "")
# collapse whitespace runs and lowercase so matching ignores layout and case
def _normalize_whois(text):
    return re.sub(r'\s+', ' ', text.lower())


# phrases that whois servers print when a domain is NOT registered
# (trailing comments name the registries each phrase is meant for)
_WHOIS_AVAILABLE_MARKERS = tuple(_normalize_whois(s) for s in [
    "No match",
    "Not found",
    "no entries found",
    "not registered",
    "status: available",
    "status: free",
    "is available for",  # io,ac
    "query_status: 200 Available",  # co.nz
    "No Data Found",  # ae,com.au
    "no data was found",  # il
    "was not found",  # im
    "nothing found",  # at
    "no domain entries were found to match",  # coop
    "This query returned 0 objects",  # int
    "incorrect domain name",  # al
    "no records matching",  # ax
    "does not exist in database!",  # bg
    "invalid query or domain name not known",  # cf,ga
    "we do not have an entry in our database",  # ch
    "no existe",  # cl
    "no matching record",  # cn
    "domain has not been registered",  # hk
])


# check whether a domain is taken (e.g., "example.com")
# returns True if the domain is taken, False if it is available, or None if the request timed out
def check(domain, timeout=4.0):
    """Ask the "whois" command whether *domain* is registered.

    domain:  full domain name, e.g. "example.com"
    timeout: seconds of wall-clock time to wait for whois before giving up

    Returns True if the domain is taken (none of the known "not
    registered" phrases appear in the output), False if it is available,
    or None if whois did not finish within *timeout*.

    Raises OSError if the whois executable cannot be started.
    """
    process = subprocess.Popen(
        ["whois", domain], stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )

    # Wall-clock time is required here: time.clock() counts CPU time on Unix,
    # which barely advances while this loop sleeps, and it no longer exists
    # in Python 3.8+.  Prefer the monotonic clock where available.
    now = getattr(time, "monotonic", time.time)
    deadline = now() + timeout

    while process.poll() is None:
        if now() >= deadline:
            try:
                process.kill()
            except OSError:
                pass  # whois exited between poll() and kill()
            # reap the child and close its pipes
            process.communicate()
            return None
        time.sleep(0.1)

    stdout, _ = process.communicate()
    if not isinstance(stdout, str):
        # bytes on Python 3; whois output encodings vary, so never fail here
        stdout = stdout.decode("utf-8", "replace")

    report = _normalize_whois(stdout)
    return all(marker not in report for marker in _WHOIS_AVAILABLE_MARKERS)
# extensions that are filtered out because whois does not support them
_UNSUPPORTED_EXTENSIONS = ("hn", "bo")


# mutate/slice the user specified word into a starting string for gen_word
def _prepare_prefix(raw_word):
    """Return the "^"-marked prefix to seed generation with, or None.

    None means "no usable prefix"; gen_word then samples its own start.
    A missing or empty -w value yields None.
    """
    if not raw_word:
        return None
    word = "^" + raw_word
    if len(word) > n:
        # NOTE(review): a long prefix is cut to "^" plus n letters and is
        # not checked against prefix_frequencies -- kept as in the original;
        # confirm whether validation was meant to apply here too.
        return word[:n + 1]
    if len(word) < n:
        print("Prefix too short; ignoring")
        return None
    if word not in prefix_frequencies:
        print("Prefix invalid; ignoring")
        return None
    return word


# turn the comma-delimited -x value into a list like [".io", ".net"]
def _parse_extensions(raw_extensions):
    """Split, drop blanks and unsupported entries, and add the leading dot."""
    return [
        "." + ext
        for ext in (raw_extensions or "").split(",")
        if ext and ext not in _UNSUPPORTED_EXTENSIONS
    ]


def main():
    """Generate candidate domain names forever and report availability.

    Command-line options:
      -w  initial prefix for every generated name
      -x  comma-delimited extra extensions to try when ".com" is not
          reported as available
    """
    # allow user to specify initial prefix
    parser = OptionParser()
    parser.add_option("-w", dest="word",
                      help="specify initial prefix")
    parser.add_option("-x", dest="extensions",
                      help="comma-delimited list of allowed domain extensions, no spaces or leading dots")
    options, _ = parser.parse_args()

    word = _prepare_prefix(options.word)
    # a real list (not a one-shot iterator): it is re-walked for every domain
    extensions = _parse_extensions(options.extensions)

    # remember previously generated names
    visited = set()

    # generate domain names forever
    while True:
        # generate a few domains and pick the smallest (first one wins ties)
        domain = min((gen_word(word) for _ in range(3)), key=len)

        # avoid redundancy if prefix is specified
        if word:
            if domain in visited:
                continue
            visited.add(domain)

        # report whether the domain is available
        # check() could return True, False, or None (timed out)
        if check(domain + ".com") is False:
            print(domain + ".com <-- Available!")
        else:
            print(domain + ".com")
            # fall back to other extensions
            for ext in extensions:
                if check(domain + ext) is False:
                    print(domain + ext + " <-- Available!")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment