|
import requests |
|
import re |
|
|
|
from bs4 import BeautifulSoup |
|
from urllib.parse import urlparse |
|
|
|
|
|
from ml_three_point_o import all_companies, tech_stack |
|
|
|
def flat_list_print( json ):
    """Recursively print every leaf element of a nested dict/list structure.

    List elements are printed one per line; dict values are descended into
    recursively.  Anything that is neither a list nor a dict is ignored.
    """
    if isinstance(json, dict):
        for value in json.values():
            flat_list_print(value)
    elif isinstance(json, list):
        for item in json:
            print(item)
|
|
|
def hierachical_list_print( json, parent ):
    """Recursively print a nested dict/list structure with breadcrumb headers.

    For each dict key, prints an upper-cased header showing the path from
    the root (ancestor keys joined with ' - '), a separator line, and then
    the key's contents.  List elements are printed one per line.

    Args:
        json: nested structure of dicts and lists.
        parent: list of ancestor keys; pass [] at the top level.
    """
    if isinstance( json, list ):
        for elem in json:
            print(elem)
    elif isinstance( json, dict ):
        for key in json:
            # Proper if/else instead of the original conditional
            # expression used for its side effect.
            if parent:
                print( ' - '.join(parent).upper() + ' - ' + key.upper())
            else:
                print(key.upper())
            print("------------------")
            hierachical_list_print( json[key], parent + [key] )
|
|
|
|
|
def hierachical_list_print_markdown( json, parent ):
    """Recursively render a nested dict/list structure as markdown.

    Dict keys become headings ('###' at the root, '####' prefixed with the
    ancestor path below), each followed by a separator line; list elements
    become bullet points followed by one blank line.
    """
    if isinstance( json, dict ):
        for key in json:
            if parent:
                crumb = ' - '.join(parent).upper() + ' - ' + key.upper()
                print("#### " + crumb)
            else:
                # NOTE: "### " plus print's default separator yields a
                # double space before the heading, matching the original
                # output byte-for-byte.
                print("### ", key.upper())
            print("------------------")
            hierachical_list_print_markdown( json[key], parent + [key] )
    elif isinstance( json, list ):
        for item in json:
            print("* " + item)
        print()
|
|
|
def levenshteinDistance(s1, s2): |
|
""" |
|
Check the 'edit' distance between two words, s1 and s2. |
|
Code from : http://stackoverflow.com/a/32558749 |
|
""" |
|
if len(s1) > len(s2): |
|
s1, s2 = s2, s1 |
|
|
|
distances = range(len(s1) + 1) |
|
for i2, c2 in enumerate(s2): |
|
distances_ = [i2+1] |
|
for i1, c1 in enumerate(s1): |
|
if c1 == c2: |
|
distances_.append(distances[i1]) |
|
else: |
|
distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1]))) |
|
distances = distances_ |
|
return distances[-1] |
|
|
|
def get_search_result_URLs( company_name, timeout=10 ):
    """Google the company name and return the domains of the result links.

    Fetches the first Google (AU) results page, extracts the redirect
    targets from the result anchors, drops Google's "webcache" copies,
    and returns just the network locations (domains).

    Args:
        company_name: search query; URL-encoded automatically.
        timeout: seconds to wait for the HTTP response before raising
            requests.exceptions.Timeout.

    Returns:
        list of domain strings, one per result link found.
    """
    google_base = "https://www.google.com.au/search"
    # params= lets requests URL-encode the query (spaces, unicode, ...);
    # timeout= fails fast instead of hanging forever on a dead connection.
    page = requests.get(google_base, params={"q": company_name}, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    # Result anchors look like /url?q=<target>; raw string avoids the
    # invalid "\?" escape warning of the original pattern.
    links = soup.find_all("a", href=re.compile(r"(?<=/url\?q=)(htt.*://.*)"))
    cleaned_links = []
    for link in links:
        # Google sometimes concatenates several URLs into one href;
        # split them apart at each ":http" boundary.
        cleaned_links.extend(re.split(r":(?=http)", link["href"].replace("/url?q=", "")))
    cleaned_links = [l for l in cleaned_links if "webcache" not in l]
    base_links = [urlparse(url).netloc for url in cleaned_links]

    return base_links
|
|
|
# Keyword lists used to classify a company into a sub-category: text that
# mentions one of the words on the right maps to the sub-category key on
# the left.  Several keywords deliberately appear under more than one
# sub-category (e.g. 'healthcare', 'logistics').
subcategory_words = {
    'visual' : ['visual', 'image'],
    'audio' : ['audio', 'sound'],
    'sensor' : ['sensor', 'iot'],
    'internal data' : ['analytics', 'internal', 'data'],
    'market' : ['market'],
    'customer support' : ['customer', 'support'],
    'sales' : ['sales'],
    'marketing' : ['marketing'],
    'security' : ['security', 'protection', 'guard'],
    'recruiting' : ['recruiting', 'target', 'hr', 'talent', 'human', 'resources'],
    'ground navigation' : ['ground', 'navigation', 'truck', 'transport' ,'logistics'],
    'aerial' : ['aerial', 'freight', 'plane'],
    'industrial' : ['industrial'],
    'personal' : ['personal', 'agents', 'assistant'],
    # Fixed typo: was 'assitant', which could never match anything.
    'professional' : ['professional', 'agent', 'assistant'],
    'agriculture' : ['agriculture', 'farming', 'crop'],
    'education' : ['education', 'school', 'learning'],
    'investment' : ['investment', 'growth', 'returns'],
    'legal' : ['legal', 'law'],
    'logistics' : ['logistics'],
    'materials' : ['materials'],
    'retail finance' : ['retail', 'finance', 'customer'],
    'patient' : ['patient', 'healthcare', 'health'],
    'image' : ['image', 'healthcare', 'diagnostics'],
    'biological' : ['biological', 'healthcare']
}
|
|
|
# Phrases whose presence in a company description signals that the company
# works with machine intelligence / ML.
machine_intelligence_phrases = [

    'machine intelligence',

    'machine learning',

    'artificial intelligence',

    'AI', 'A.I', 'A I',

    'data science',

    'intelligent',

    'algorithms', 'algorithm'

]
|
|
|
if __name__ == '__main__':
    # Dump the full company taxonomy as a markdown outline.
    # (Swap in hierachical_list_print(tech_stack, []) for plain-text output.)
    hierachical_list_print_markdown(all_companies, [])