Created
November 14, 2018 18:08
-
-
Save backupbrain/17a97ff87981afa655f6f9fafee935c2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import nltk
import requests
from lxml import html
from nltk.corpus import stopwords

# English stop words, used to discard meaningless candidates (e.g. "the").
stop_words = set(stopwords.words("english"))

# A Company Name
company_name = "The Boring Company"
search_terms = ["flamethrower"]  # The Boring Company sells flamethrowers

# Split the company name into lowercase word tokens.
company_words = nltk.word_tokenize(company_name.lower())

# Blend every contiguous run of company words into one candidate label,
# shortest span of words first within each ending position.
blended_words = [
    "".join(company_words[first:last])
    for last in range(1, len(company_words) + 1)
    for first in range(last)
]

# Remove stop words
blended_words = [
    candidate
    for candidate in blended_words
    if candidate not in stop_words
]

# Order by longest to shortest (sort is stable, ties keep their order).
blended_words.sort(key=len, reverse=True)
# Domain parts
# Candidate top-level-domain suffixes, roughly ordered by desirability.
domain_suffixes = [
    'com', 'co', 'io', 'ca', 'net', 'org'
]
# Common add-on prefixes used when the bare name is taken.
popular_prefixes = [
    '', 'get'
]

# Cross every suffix, blended word, and prefix into a candidate domain,
# preserving the suffix-major ordering so better TLDs are tried first.
possible_domains = [
    "{}{}.{}".format(prefix, blended_word, domain_suffix)
    for domain_suffix in domain_suffixes
    for blended_word in blended_words
    for prefix in popular_prefixes
]
# Test domain connectivity
# Present browser-like headers so simple bot filters don't reject the probe.
headers = requests.utils.default_headers()
headers.update({
    "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/69.0.3497.100 Safari/537.36"
})
# Maps candidate domain -> probe result (url, liveness flag, raw response).
domains = {}
for possible_domain in possible_domains:
    is_live = False
    url = "http://{}".format(possible_domain)
    response = None
    try:
        print("Testing {}".format(url))
        response = requests.get(
            url,
            headers=headers,
            timeout=3.0
        )
        # Any non-error status (< 400) counts as a live site.
        if response.status_code < 400:
            is_live = True
    except requests.exceptions.RequestException:
        # DNS failures, refused connections and timeouts all mean the
        # domain is not reachable. A bare `except:` here would also have
        # swallowed KeyboardInterrupt/SystemExit; catch only request errors.
        pass
    domains[possible_domain] = {
        "url": url,
        "is_live": is_live,
        "response": response
    }
# Test if <title> tag matches company name
for domain, data in domains.items():
    do_titles_match = False
    dom = None
    if data["is_live"] is True:
        response = data["response"]
        # Parse the fetched HTML once; the DOM is reused by later checks.
        dom = html.fromstring(response.text)
        titles = dom.xpath("/html/head/title")
        if len(titles) > 0:
            title = titles[0].text
            # An empty <title/> yields .text of None, and `company_name in
            # None` raises TypeError — guard before the substring test.
            if title is not None and company_name in title:
                do_titles_match = True
    domains[domain]["dom"] = dom
    domains[domain]["do_titles_match"] = do_titles_match
# Remove unwanted elements from HTML DOM
def strip_elements(dom_tree, xpaths):
    """Detach every element matching any of *xpaths* from *dom_tree*.

    The tree is mutated in place (each match is removed via its parent)
    and then returned so calls can be chained.
    """
    for expression in xpaths:
        for match in dom_tree.xpath(expression):
            match.getparent().remove(match)
    return dom_tree
# Search for context-sensitive words in website text
for domain, data in domains.items():
    does_content_match = False
    if data["is_live"] is True:
        dom = data["dom"]
        # Drop markup that never renders as visible text before extracting it.
        dom_tree_stripped = strip_elements(
            dom,
            ["//head", "//script", "//style", "//link"]
        )
        text_content = dom_tree_stripped.text_content().lower()
        # Normalize whitespace; raw strings make the regex escapes explicit
        # (the compiled patterns are identical to the non-raw originals).
        text_content = re.sub(r"[\r\n]+", "\n", text_content)
        text_content = re.sub(r"[\t ]+", " ", text_content)
        text_content = re.sub(r"(\n )+", "\n", text_content)
        # One matching search term is enough; any() stops at the first hit.
        does_content_match = any(
            search_term in text_content for search_term in search_terms
        )
    domains[domain]["does_content_match"] = does_content_match
# Settle on company URL
# Candidates were inserted best-first (preferred TLD, then longest name), and
# dicts preserve insertion order (Python 3.7+), so take the first domain that
# passes all three checks.
url = ""
for domain, data in domains.items():
    if (data["is_live"] is True
            and data["do_titles_match"] is True
            and data["does_content_match"] is True):
        url = domain
        break
print(url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment