rfong · December 3, 2019 23:45
diff --git a/domain_finder.py b/domain_finder.py
 #!/usr/bin/python -O
 from optparse import OptionParser
 import random
 import string
 import subprocess
 import threading
 import time
 import re

 # Load the system dictionary and keep only the words made purely of ASCII
 # lowercase letters. Each kept word is wrapped in "^" and "$" so that the
 # n-gram model can see where words begin and end.
 # The file is read inside a `with` block so the handle is closed right away
 # instead of being left open for the lifetime of the program.
 with open("/usr/share/dict/words") as word_file:
     words = [w.strip().lower() for w in word_file]
 words = ["^" + w + "$" for w in words if w != "" and all(c in string.ascii_lowercase for c in w)]

 # construct a discrete-time markov chain of n-grams
 n = 5 # this is the "n" in n-grams, try adjusting this for different results
 transitions = {} # keys are n-grams, values are dicts mapping subsequent n-grams to probabilities
 frequencies = {} # keys are n-grams, values are each n-gram's share of all n-gram occurrences in the wordlist
 for word in words:
     # count every n-gram in the word (words shorter than n contribute nothing)
     for i in range(len(word) + 1 - n):
         gram = word[i : i + n]
         frequencies[gram] = frequencies.get(gram, 0) + 1
     # count each step from an n-gram to the n-gram one character to its right
     for i in range(len(word) - n):
         gram = word[i : i + n]
         successor = word[i + 1 : i + n + 1]
         followers = transitions.setdefault(gram, {})
         followers[successor] = followers.get(successor, 0) + 1
 # Turn the raw counts into frequencies. The divisor is the total number of
 # n-gram occurrences (not the number of distinct n-grams), so every value
 # lies in [0, 1] and the values sum to 1.
 total_occurrences = float(sum(frequencies.values()))
 for gram in frequencies:
     frequencies[gram] /= total_occurrences
 # Turn each n-gram's successor counts into transition probabilities.
 for gram in transitions:
     followers = transitions[gram]
     total = float(sum(followers.values()))
     for successor in followers:
         followers[successor] /= total

 # sample a probability mass function
 #   pmf: dict mapping elements to probabilities
 # Draws a uniform random number and walks the running total of the masses,
 # returning the first element whose running total reaches the drawn number.
 # If no element reaches it (e.g. the masses add up to less than the draw),
 # an element is picked uniformly at random instead.
 def sample(pmf):
     threshold = random.random()
     cdf = 0.0
     for e in pmf:
         cdf += pmf[e]
         if cdf >= threshold:
             return e
     # list(pmf) is indexable on both Python 2 and Python 3, whereas
     # pmf.keys() is a non-indexable view on Python 3
     return random.choice(list(pmf))

 # restrict a probability mass function to the elements satisfying a predicate
 #   pmf:       dict mapping elements to probabilities
 #   condition: boolean-valued function deciding which elements are kept
 # Returns a new dict containing only the kept elements, each mass divided by
 # the combined mass of the kept elements.
 def conditional(pmf, condition):
     kept = [(elem, pmf[elem]) for elem in pmf if condition(elem)]
     mass_sum = 0.0
     for _, mass in kept:
         mass_sum += mass
     return dict((elem, mass / mass_sum) for elem, mass in kept)

 # distribution over word-initial n-grams: every n-gram whose first character
 # is the "^" start marker, renormalized among themselves
 prefix_frequencies = conditional(frequencies, lambda gram: gram[0] == "^")

 # pick the next character for a partially built word (expects len(word) >= n)
 #   word: the text generated so far
 # Returns a single character: the last character of a sampled successor
 # n-gram, or a random letter / "$" when the current n-gram is unknown.
 def evolve(word):
     tail = word[-n:]
     followers = transitions.get(tail)
     if followers is None:
         # the trailing n-gram is not in the model; emit something random so
         # generation can keep going
         return random.choice(string.ascii_lowercase + "$")

     # the successor n-gram overlaps the tail, so only its last character is new
     return sample(followers)[-1:]

 # produce one word from the markov chain
 #   word: optional starting text; when None a word-initial n-gram is sampled
 # Returns the generated text with the "^" and "$" boundary markers removed.
 def gen_word(word=None):
     if word is None:
         word = sample(prefix_frequencies)

     # keep extending until the text ends with the "$" terminator
     while word[-1] != "$":
         word += evolve(word)

         # optional: a domain may be several word-like lexemes glued together,
         # so a short, just-terminated word sometimes gets another prefix appended
         if word[-1] == "$":
             if random.random() > 0.7 and len(word) < 8:
                 word += sample(prefix_frequencies)

     # drop the boundary markers before handing the word back
     return "".join(ch for ch in word if ch not in "^$")

 # check whether a domain is taken (e.g., "example.com")
 #   domain: fully qualified name passed to the "whois" command
 # returns True if the domain is taken, False if it is available, or None if the request timed out
 def check(domain):
     # lower-case the text and collapse whitespace runs so that matching is
     # insensitive to case and layout
     def formatter(s):
         return re.sub(r'\s+', ' ', s.lower())

     # phrases that whois servers print for a domain that is not registered
     available_markers = [
         "No match",
         "Not found",
         "no entries found",
         "not registered",
         "status: available",
         "status: free",
         "is available for", #io,ac
         "query_status: 200 Available", #co.nz
         "No Data Found",    #ae,com.au
         "no data was found", #il
         "was not found",    #im
         "nothing found",    #at
         "no domain entries were found to match", #coop
         "This query returned 0 objects", #int
         "incorrect domain name", #al
         "no records matching", #ax
         "does not exist in database!", #bg
         "invalid query or domain name not known", #cf,ga
         "we do not have an entry in our database", #ch
         "no existe",        #cl
         "no matching record", #cn
         "domain has not been registered", #hk
     ]

     # use the "whois" command to determine availability, and timeout after a few seconds in case it hangs
     process = subprocess.Popen(["whois", domain], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     # The deadline uses wall-clock time: time.clock() counted CPU time on Unix
     # (which barely advances while this loop sleeps) and was removed in
     # Python 3.8.
     end_time = time.time() + 4.0
     while time.time() < end_time:
         if process.poll() is not None:
             # communicate() drains and closes both pipes
             stdout = process.communicate()[0]
             if not isinstance(stdout, str):
                 # Python 3 returns bytes; the markers are text
                 stdout = stdout.decode("utf-8", "replace")
             response = formatter(stdout)
             # taken only if none of the "not registered" phrases appear
             return all(formatter(s) not in response for s in available_markers)
         time.sleep(0.1)

     # timed out: stop the child, then reap it and close its pipes so repeated
     # calls do not accumulate zombie processes or open file descriptors
     try:
         process.kill()
     except OSError:
         # the process exited between the last poll and the kill
         pass
     process.communicate()
     return None

 # Command-line entry point: parse the options, then generate candidate names
 # forever, printing each one and flagging those whose domain looks available.
 #   -w WORD  initial prefix for every generated name
 #   -x LIST  comma-delimited extra extensions to try when ".com" is not available
 def main():
     # allow user to specify initial prefix
     parser = OptionParser()
     parser.add_option("-w", dest="word",
                       help="specify initial prefix")
     parser.add_option("-x", dest="extensions",
                       help="comma-delimited list of allowed domain extensions, no spaces or leading dots")
     (options, _args) = parser.parse_args()

     # mutate/slice user specified word to get valid prefix
     word = options.word
     if options.word:
         word = "^" + word
         if len(word) > n:
             # NOTE(review): this keeps n+1 characters and bypasses the
             # membership check below -- confirm that is intended
             word = word[:n+1]
         elif len(word) < n:
             print("Prefix too short; ignoring")
             word = None
         elif word not in prefix_frequencies:
             print("Prefix invalid; ignoring")
             word = None

     # format domain extensions: drop empty entries and the ones not supported
     # by whois, then add the leading dot. A real list is built because it is
     # iterated once per generated domain; a one-shot iterator (what
     # filter/map return on Python 3) would be exhausted after the first one.
     unsupported = ('hn', 'bo')
     requested = (options.extensions or "").split(',')
     extensions = ['.' + ext for ext in requested if ext and ext not in unsupported]

     # remember previously generated names (a set gives constant-time lookups)
     visited = set()

     # generate domain names forever
     while True:
         # generate a few domains and pick the smallest (first one wins on ties)
         domain = min([gen_word(word) for i in range(3)], key=len)
         # avoid redundancy if prefix is specified
         if word:
             if domain in visited:
                 continue
             visited.add(domain)

         # report whether the domain is available
         if check(domain + ".com") is False: # could be True, False, or None
             print(domain + ".com <-- Available!")
         else:
             print(domain + ".com")
             # fall back to other extensions
             for ext in extensions:
                 if check(domain + ext) is False:
                     print(domain + ext + " <-- Available!")

 if __name__=="__main__":
     main()
	#!/usr/bin/python -O
	from optparse import OptionParser
	import random
	import string
	import subprocess
	import threading
	import time
	import re

	# Load the system dictionary and keep only the words made purely of ASCII
	# lowercase letters. Each kept word is wrapped in "^" and "$" so that the
	# n-gram model can see where words begin and end.
	# The file is read inside a `with` block so the handle is closed right away
	# instead of being left open for the lifetime of the program.
	with open("/usr/share/dict/words") as word_file:
		words = [w.strip().lower() for w in word_file]
	words = ["^" + w + "$" for w in words if w != "" and all(c in string.ascii_lowercase for c in w)]

	# construct a discrete-time markov chain of n-grams
	# (the nesting of the loops below had been flattened; it is restored here)
	n = 5 # this is the "n" in n-grams, try adjusting this for different results
	transitions = {} # keys are n-grams, values are dicts mapping subsequent n-grams to probabilities
	frequencies = {} # keys are n-grams, values are each n-gram's share of all n-gram occurrences in the wordlist
	for word in words:
		# count every n-gram in the word (words shorter than n contribute nothing)
		for i in range(len(word) + 1 - n):
			gram = word[i : i + n]
			frequencies[gram] = frequencies.get(gram, 0) + 1
		# count each step from an n-gram to the n-gram one character to its right
		for i in range(len(word) - n):
			gram = word[i : i + n]
			successor = word[i + 1 : i + n + 1]
			followers = transitions.setdefault(gram, {})
			followers[successor] = followers.get(successor, 0) + 1
	# Turn the raw counts into frequencies. The divisor is the total number of
	# n-gram occurrences (not the number of distinct n-grams), so every value
	# lies in [0, 1] and the values sum to 1.
	total_occurrences = float(sum(frequencies.values()))
	for gram in frequencies:
		frequencies[gram] /= total_occurrences
	# Turn each n-gram's successor counts into transition probabilities.
	for gram in transitions:
		followers = transitions[gram]
		total = float(sum(followers.values()))
		for successor in followers:
			followers[successor] /= total

	# sample a probability mass function
	# pmf: dict mapping elements to probabilities
	# Draws a uniform random number and walks the running total of the masses,
	# returning the first element whose running total reaches the drawn number.
	# If no element reaches it (e.g. the masses add up to less than the draw),
	# an element is picked uniformly at random instead.
	def sample(pmf):
		threshold = random.random()
		cdf = 0.0
		for e in pmf:
			cdf += pmf[e]
			if cdf >= threshold:
				return e
		# list(pmf) is indexable on both Python 2 and Python 3, whereas
		# pmf.keys() is a non-indexable view on Python 3
		return random.choice(list(pmf))

	# compute a conditional probability mass function
	# pmf: dict mapping elements to probabilities
	# condition: boolean-valued function to condition on
	# Returns a new dict containing only the elements for which condition is
	# true, each mass divided by the combined mass of those elements.
	# (the body's indentation had been stripped; the structure is restored here)
	def conditional(pmf, condition):
		kept = [(elem, pmf[elem]) for elem in pmf if condition(elem)]
		mass_sum = 0.0
		for _, mass in kept:
			mass_sum += mass
		return dict((elem, mass / mass_sum) for elem, mass in kept)

	# compute the prefix frequencies
	# prefixes are n-grams that appear at the beginning of words, i.e. those
	# whose first character is the "^" start marker; their masses are
	# renormalized among themselves
	prefix_frequencies = conditional(frequencies, lambda gram: gram[0] == "^")

	# generate a new letter according to the markov chain (make sure len(word) >= n)
	# word: the text generated so far
	# Returns a single character: the last character of a sampled successor
	# n-gram, or a random letter / "$" when the current n-gram is unknown.
	# (the body's indentation had been stripped; the structure is restored here)
	def evolve(word):
		# grab the last n characters and make sure the n-gram is in our model
		tail = word[-n:]
		followers = transitions.get(tail)
		if followers is None:
			# uh oh, just return a random letter to keep things moving
			return random.choice(string.ascii_lowercase + "$")

		# the successor n-gram overlaps the tail, so only its last character is new
		return sample(followers)[-1:]

	# generate a word according to the markov chain
	# word: optional starting text; when None a word-initial n-gram is sampled
	# Returns the generated text with the "^" and "$" boundary markers removed.
	# (the body's indentation had been stripped; the structure is restored here)
	def gen_word(word=None):
		# start with a prefix
		if word is None:
			word = sample(prefix_frequencies)

		# wait until the markov chain adds a terminator to the word
		while word[-1] != "$":
			# generate a new letter and append it to the word
			word += evolve(word)

			# optional: sometimes domains are multiple word-like lexemes
			# concatenated together, so a short, just-terminated word
			# sometimes gets another prefix appended
			if word[-1] == "$" and random.random() > 0.7 and len(word) < 8:
				word += sample(prefix_frequencies)

		# remove the boundary markers and return the word
		return word.replace("^", "").replace("$", "")

	# check whether a domain is taken (e.g., "example.com")
	# domain: fully qualified name passed to the "whois" command
	# returns True if the domain is taken, False if it is available, or None if the request timed out
	# (the body's indentation had been stripped; the structure is restored here)
	def check(domain):
		# lower-case the text and collapse whitespace runs so that matching is
		# insensitive to case and layout
		def formatter(s):
			return re.sub(r'\s+', ' ', s.lower())

		# phrases that whois servers print for a domain that is not registered
		available_markers = [
			"No match",
			"Not found",
			"no entries found",
			"not registered",
			"status: available",
			"status: free",
			"is available for", #io,ac
			"query_status: 200 Available", #co.nz
			"No Data Found", #ae,com.au
			"no data was found", #il
			"was not found", #im
			"nothing found", #at
			"no domain entries were found to match", #coop
			"This query returned 0 objects", #int
			"incorrect domain name", #al
			"no records matching", #ax
			"does not exist in database!", #bg
			"invalid query or domain name not known", #cf,ga
			"we do not have an entry in our database", #ch
			"no existe", #cl
			"no matching record", #cn
			"domain has not been registered", #hk
		]

		# use the "whois" command to determine availability, and timeout after a few seconds in case it hangs
		process = subprocess.Popen(["whois", domain], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
		# The deadline uses wall-clock time: time.clock() counted CPU time on
		# Unix (which barely advances while this loop sleeps) and was removed
		# in Python 3.8.
		end_time = time.time() + 4.0
		while time.time() < end_time:
			if process.poll() is not None:
				# communicate() drains and closes both pipes
				stdout = process.communicate()[0]
				if not isinstance(stdout, str):
					# Python 3 returns bytes; the markers are text
					stdout = stdout.decode("utf-8", "replace")
				response = formatter(stdout)
				# taken only if none of the "not registered" phrases appear
				return all(formatter(s) not in response for s in available_markers)
			time.sleep(0.1)

		# timed out: stop the child, then reap it and close its pipes so
		# repeated calls do not accumulate zombie processes or open descriptors
		try:
			process.kill()
		except OSError:
			# the process exited between the last poll and the kill
			pass
		process.communicate()
		return None

	# Command-line entry point: parse the options, then generate candidate names
	# forever, printing each one and flagging those whose domain looks available.
	# -w WORD  initial prefix for every generated name
	# -x LIST  comma-delimited extra extensions to try when ".com" is not available
	# (the body's indentation had been stripped; the structure is restored here)
	def main():
		# allow user to specify initial prefix
		parser = OptionParser()
		parser.add_option("-w", dest="word",
			help="specify initial prefix")
		parser.add_option("-x", dest="extensions",
			help="comma-delimited list of allowed domain extensions, no spaces or leading dots")
		(options, _args) = parser.parse_args()

		# mutate/slice user specified word to get valid prefix
		word = options.word
		if options.word:
			word = "^" + word
			if len(word) > n:
				# NOTE(review): this keeps n+1 characters and bypasses the
				# membership check below -- confirm that is intended
				word = word[:n+1]
			elif len(word) < n:
				print("Prefix too short; ignoring")
				word = None
			elif word not in prefix_frequencies:
				print("Prefix invalid; ignoring")
				word = None

		# format domain extensions: drop empty entries and the ones not
		# supported by whois, then add the leading dot. A real list is built
		# because it is iterated once per generated domain; a one-shot iterator
		# (what filter/map return on Python 3) would be exhausted after the
		# first one.
		unsupported = ('hn', 'bo')
		requested = (options.extensions or "").split(',')
		extensions = ['.' + ext for ext in requested if ext and ext not in unsupported]

		# remember previously generated names (a set gives constant-time lookups)
		visited = set()

		# generate domain names forever
		while True:
			# generate a few domains and pick the smallest (first one wins on ties)
			domain = min([gen_word(word) for i in range(3)], key=len)
			# avoid redundancy if prefix is specified
			if word:
				if domain in visited:
					continue
				visited.add(domain)

			# report whether the domain is available
			if check(domain + ".com") is False: # could be True, False, or None
				print(domain + ".com <-- Available!")
			else:
				print(domain + ".com")
				# fall back to other extensions
				for ext in extensions:
					if check(domain + ext) is False:
						print(domain + ext + " <-- Available!")

	if __name__=="__main__":
		main()