|
import requests |
|
import re |
|
|
|
from bs4 import BeautifulSoup |
|
from urllib.parse import urlparse |
|
|
|
|
|
from ml_three_point_o import all_companies, tech_stack |
|
|
|
def flat_list_print( json ):
    """Recursively print every leaf element of a nested dict/list structure.

    List elements are printed one per line; dict values are descended into
    recursively.  Anything that is neither a list nor a dict is ignored.
    """
    if isinstance(json, dict):
        for value in json.values():
            flat_list_print(value)
    elif isinstance(json, list):
        for item in json:
            print(item)
|
|
|
def hierachical_list_print( json, parent ):
    """Recursively print a nested dict/list structure with breadcrumb headers.

    For each dict key, prints an upper-cased header showing the path from
    the root (ancestor keys joined with ' - '), a separator line, and then
    the key's contents.  List elements are printed one per line.

    Args:
        json: nested structure of dicts and lists.
        parent: list of ancestor keys; pass [] at the top level.
    """
    if isinstance( json, list ):
        for elem in json:
            print(elem)
    elif isinstance( json, dict ):
        for key in json:
            # Proper if/else instead of the original conditional
            # expression used for its side effect.
            if parent:
                print( ' - '.join(parent).upper() + ' - ' + key.upper())
            else:
                print(key.upper())
            print("------------------")
            hierachical_list_print( json[key], parent + [key] )
|
|
|
|
|
def hierachical_list_print_markdown( json, parent ):
    """Recursively render a nested dict/list structure as markdown.

    Dict keys become headings ('###' at the root, '####' prefixed with the
    ancestor path below), each followed by a separator line; list elements
    become bullet points followed by one blank line.
    """
    if isinstance( json, dict ):
        for key in json:
            if parent:
                crumb = ' - '.join(parent).upper() + ' - ' + key.upper()
                print("#### " + crumb)
            else:
                # NOTE: "### " plus print's default separator yields a
                # double space before the heading, matching the original
                # output byte-for-byte.
                print("### ", key.upper())
            print("------------------")
            hierachical_list_print_markdown( json[key], parent + [key] )
    elif isinstance( json, list ):
        for item in json:
            print("* " + item)
        print()
|
|
|
def levenshteinDistance(s1, s2): |
|
""" |
|
Check the 'edit' distance between two words, s1 and s2. |
|
Code from : http://stackoverflow.com/a/32558749 |
|
""" |
|
if len(s1) > len(s2): |
|
s1, s2 = s2, s1 |
|
|
|
distances = range(len(s1) + 1) |
|
for i2, c2 in enumerate(s2): |
|
distances_ = [i2+1] |
|
for i1, c1 in enumerate(s1): |
|
if c1 == c2: |
|
distances_.append(distances[i1]) |
|
else: |
|
distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1]))) |
|
distances = distances_ |
|
return distances[-1] |
|
|
|
def get_search_result_URLs( company_name, timeout=10 ):
    """Google the company name and return the domains of the result links.

    Fetches the first Google (AU) results page, extracts the redirect
    targets from the result anchors, drops Google's "webcache" copies,
    and returns just the network locations (domains).

    Args:
        company_name: search query; URL-encoded automatically.
        timeout: seconds to wait for the HTTP response before raising
            requests.exceptions.Timeout.

    Returns:
        list of domain strings, one per result link found.
    """
    google_base = "https://www.google.com.au/search"
    # params= lets requests URL-encode the query (spaces, unicode, ...);
    # timeout= fails fast instead of hanging forever on a dead connection.
    page = requests.get(google_base, params={"q": company_name}, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    # Result anchors look like /url?q=<target>; raw string avoids the
    # invalid "\?" escape warning of the original pattern.
    links = soup.find_all("a", href=re.compile(r"(?<=/url\?q=)(htt.*://.*)"))
    cleaned_links = []
    for link in links:
        # Google sometimes concatenates several URLs into one href;
        # split them apart at each ":http" boundary.
        cleaned_links.extend(re.split(r":(?=http)", link["href"].replace("/url?q=", "")))
    cleaned_links = [l for l in cleaned_links if "webcache" not in l]
    base_links = [urlparse(url).netloc for url in cleaned_links]

    return base_links
|
|
|
# Keyword lists used to classify a company into a sub-category: text that
# mentions one of the words on the right maps to the sub-category key on
# the left.  Several keywords deliberately appear under more than one
# sub-category (e.g. 'healthcare', 'logistics').
subcategory_words = {
    'visual' : ['visual', 'image'],
    'audio' : ['audio', 'sound'],
    'sensor' : ['sensor', 'iot'],
    'internal data' : ['analytics', 'internal', 'data'],
    'market' : ['market'],
    'customer support' : ['customer', 'support'],
    'sales' : ['sales'],
    'marketing' : ['marketing'],
    'security' : ['security', 'protection', 'guard'],
    'recruiting' : ['recruiting', 'target', 'hr', 'talent', 'human', 'resources'],
    'ground navigation' : ['ground', 'navigation', 'truck', 'transport' ,'logistics'],
    'aerial' : ['aerial', 'freight', 'plane'],
    'industrial' : ['industrial'],
    'personal' : ['personal', 'agents', 'assistant'],
    # Fixed typo: was 'assitant', which could never match anything.
    'professional' : ['professional', 'agent', 'assistant'],
    'agriculture' : ['agriculture', 'farming', 'crop'],
    'education' : ['education', 'school', 'learning'],
    'investment' : ['investment', 'growth', 'returns'],
    'legal' : ['legal', 'law'],
    'logistics' : ['logistics'],
    'materials' : ['materials'],
    'retail finance' : ['retail', 'finance', 'customer'],
    'patient' : ['patient', 'healthcare', 'health'],
    'image' : ['image', 'healthcare', 'diagnostics'],
    'biological' : ['biological', 'healthcare']
}
|
|
|
# Phrases whose presence in a company description signals that the company
# works with machine intelligence / ML.
machine_intelligence_phrases = [

    'machine intelligence',

    'machine learning',

    'artificial intelligence',

    'AI', 'A.I', 'A I',

    'data science',

    'intelligent',

    'algorithms', 'algorithm'

]
|
|
|
if __name__ == '__main__':
    # Dump the full company taxonomy as a markdown outline.
    # (Swap in hierachical_list_print(tech_stack, []) for plain-text output.)
    hierachical_list_print_markdown(all_companies, [])