Created
July 27, 2016 16:46
-
-
Save soeffing/730a0c850b148409800a6ad04f0dec2f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
from sklearn.cluster import KMeans | |
import gensim | |
import sys | |
from pprint import pprint | |
import numpy as np | |
import collections | |
from sklearn.cluster import DBSCAN | |
from sklearn import metrics | |
from sklearn.datasets.samples_generator import make_blobs | |
from sklearn.preprocessing import StandardScaler | |
# Load both pretrained word2vec models once, up front: these binary files
# take a long time to read, so avoid reloading them for every experiment.
load_vectors = gensim.models.word2vec.Word2Vec.load_word2vec_format
freebase = load_vectors(
    '/home/ubuntu/esb_volume/models/downloads/freebase-vectors-skipgram1000-en.bin',
    binary=True)
google = load_vectors(
    '/home/ubuntu/esb_volume/models/downloads/GoogleNews-vectors-negative300.bin',
    binary=True)
all_keywords = ['black_plastic_gloves', 'nitro_gloves', 'Colored_Disposable_Gloves', 'glove_supplier', 'gloves_distributor', 'vinyl_nitrile', 'disposable_glove_manufacturers', 'glove_distributors', 'www', 'disposable_exam_gloves', 'black_latex_texture', 'superior_quality_gloves', 'glove_importers', 'glove_plus', 'medical_glove_suppliers', 'glove_manufacturers_in_china', 'glove_manufacturer', 'nitrile_gloves_canada', 'nitrile_glove_manufacturers', 'latex_gloves_distributor', 'glove_nitrile', 'nitrile_glove', 'gloves_importer', 'nitrile_glove_manufacturer', 'gloves_distributors', 'vinyl_glove_suppliers', 'Industrial_Disposable_Gloves', 'personal_care_needs', 'twitter_contest_rules', 'general_gloves', 'latex_glove', 'select_gloves', 'microfiber_rags', 'nitrile_latex', 'poly_glove', 'string_gloves', 'vinyl_gloves_safety', 'premium_gloves', 'general_purpose_application', 'latex_gloves', 'glove_warmers', 'polyethylene_gloves', 'workers_gloves', 'glove_vinyl', 'purple_latex_gloves', 'powdered_latex_gloves', 'protective_coverings', 'textured_nitrile_gloves', 'latex_vs_vinyl', 'Powdered_Disposable_Gloves', 'work_glove', 'medical_grade_gloves', 'jersey_work_gloves', 'Food_Service_Poly_Gloves', 'polyethylene_glove', 'poly_gloves', 'glove_hand_warmers', 'disposable_latex_gloves', 'personal_protective_coverings', 'string_knit_work_gloves', 'heat_works', 'grip_work_gloves', 'vinyl_glove', 'cotton_work_gloves', 'industrial_work_glove', 'line_gloves', 'premium_hd', 'micro_fiber_towels', 'latex_glove_size_chart', 'hair_nets', 'sign_distributor', 'poly_sleeves', 'microfiber_towels', 'hair_net_for_cooking', 'hairnets', 'duty_gloves', 'booties_for_shoes_covers', 'white_hair_net', 'shoe_covers_booties', 'shoe_coverings', 'shoe_booties_covers', 'white_hair_nets', 'hair_nets_for_cooking', 'distributor_today', 'glove_safety_tips', 'restaurant_gloves', 'bouffant_caps', 'hand_glove_size', 'ecommerce_trend', 'north_american_distribution', 'beard_covers', 'booties_shoe_covers', 
'sign_distributors', 'medical_exam_glove', 'lacquer_thinners', 'latex_material', 'toughest_gloves', 'digital_marketing_sales_pitch', 'earloop', 'Clean_Hand_Poly_Glove_Medium', 'vinyl_nitrile_gloves', 'nbr_nitrile', 'waiter_gloves', 'gloves_for_lacquer_thinner', 'synthetic_vinyl_gloves', 'disposable_chemical_resistant_gloves', 'ingredients_in_paint_thinner', 'paint_thinners', 'don_glove', 'stretch_vinyl', 'nitrile_allergy', 'hand_exam', 'polyvinyl_chloride_nitrile_butadiene_rubber', 'ems_disposable_gloves', 'ear_loop_face_mask', 'exam_gloves_for_dentists', 'X3_Nitrile_Gloves', 'right_glove', 'gloves_nylon', 'ems_workers', 'glove_wholesaler', 'nitrate_gloves', 'nitrile_blue', 'glove_manufacturers', 'gp_glove', 'nitrile_lab_gloves', 'Small_Powdered_Vinyl_Gloves', 'gloves_material', 'natural_rubber_production_process', 'aql_testing', 'gloves_hand', 'black_police_gloves', 'earloop_masks', 'examination_latex_gloves', 'selling_philosophy', 'gauntlet_cuffs', 'nbr_chemical', 'acetone_resistant_gloves', 'white_microfiber_towels', 'pvc_nitrile', 'baby_wipes_refills', 'why_are_gloves_powdered', 'best_gloves', 'Vinyl_Gloves_vs_Latex_Gloves', 'gloves_police', 'nitrile_material', 'rubber_production_process', 'sales_tactics_examples', 'vinyl_stretch', 'nitrile_glove_allergy', 'glove_suppliers', 'pet_gloves', 'white_inspection_gloves', 'latex_sap', 'Hand_Specific_Latex_Gloves', 'black_tattoo_gloves', 'split_cowhide', 'white_waiter_gloves', 'gauntlet_cuff', 'gp_exam', 'resistance_gloves', 'refills_wipes', 'booties_images', 'all_industrial_safety', 'X3_Gloves', 'b2b_consumers', 'latex_gloves_factory', 'food_processing_industry_trends', 'auto_gloves', 'dish_washing_gloves', 'cotton_glove', 'lacquer_paint', 'nylon_inspection_gloves', 'sales_tactics', 'gloves_blue', 'glove_testing', 'string_knit', 'latex_free', 'why_safety', 'Vinyl_Exam_Glove_Small', 'lacquer_thinner_paint_thinner', 'automotive_gloves', 'donning_gloves', 'latex_manufacturing', 'ear_loop_masks', 
'Brown_Jersey_Knit_Glove', 'nitrile_gloves_allergy', 'fish_cleaning_gloves', 'Best_Quality_Disposable_Gloves', 'diaper_changing_gloves', 'paint_thinner_chemical', 'marketing_sales_pitch', 'gp_gloves', 'fireworks_equipment', 'sale_tactics', 'rubber_glove_manufacturer', 'earloop_face_mask', 'gloves_best', 'specialty_gloves', 'synthetic_glove', 'Antimicrobial_Vinyl_Gloves', 'glove_materials', 'dishwashing_gloves', 'vinyl_exam_glove', 'adult_washcloths', 'nbr_polymer', 'household_glove', 'microfiber_suppliers', 'xtreme_nitrile', 'social_media_marketing_benefits', 'exam_gloves', 'black_nitrile_glove', 'nylon_gloves', 'unlined_leather', 'paint_thinner_on_skin', 'inspection_glove', 'industrial_glove', 'micro_gloves', 'ems_gloves', 'best_automotive_in_Tex.', 'exam_glove', 'lacquer_thinner', 'barrier_protection', 'glove_material', 'b2b_consumer', 'gloves_with_string', 'used_gloves', 'vinyl_industrial', 'knit_glove', 'gloves_hair', 'make_latex', 'string_knit_gloves', 'dishwashing_glove', 'Latex_Exam_Glove_Small', 'gloves_for_acetone', 'natural_rubber_manufacturing_process', 'baby_wipe_tub', 'chemicals_in_paint_thinner', 'dentist_industry', 'antimicrobial_gloves', 'donned_gloves', 'how_is_vinyl_made', 'vinyl_examination_gloves', 'industry_trend', 'nitrile_chemistry', 'latex_exam', 'Powder_Free_Vinyl_Gloves', 'dipping_rubber', 'paint_thinner_side_effects', 'latex_string', 'Glove_Nitrile_Short_Sleeve', 'instant_cold', 'rubber_glove_manufacturers', 'best_mechanics_gloves', 'glove_industrial', 'nitrile_exam', 'nitrile_pvc', 'latex_manufacturers', 'Industrial_Vinyl_Gloves', 'beauty_gloves', 'aql_test', 'synthetic_vinyl', 'hand_warmer_reviews', 'black_chemical_gloves', 'black_nitrile', 'ems_glove', 'fireworks_manufacturing', 'puncture_resistant_disposable_gloves', 'baby_wipe_refills', 'surgical_gloves_history', 'flock_lined_gloves', 'ear_loop', 'knit_gloves', 'latex_exam_glove', 'hair_gloves', 'disposable_surgical_gloves', 'changing_table_paper_rolls', 'latex_hd', 
'best_in_Tex._automotive', 'long_dishwashing_gloves', 'are_nitrile_gloves_chemical_resistant', 'cotton_string_knit_gloves', 'allergy_free_gloves', 'lacquer_thinner_ingredients', 'gripping_gloves', 'best_auto_IND_IN', 'Powder_Free_Nitrile_Gloves', 'vinyl_allergy', 'industrial_safety_gloves', 'glove_string', 'LX3_Gloves', 'trends_in_food', 'painting_gloves', 'paint_lacquer', 'latex_examination_gloves', 'latex_dipped_gloves', 'common_chemicals', 'synthetic_gloves', 'Xtreme_Gloves', 'changing_table_paper', 'acetone_gloves', 'rubber_dipping_process', 'food_safety_scores', 'national_food_safety_month', 'baby_care_market', 'Nitrile_Glove_Chemical_Resistant', 'naphtha_paint_thinner', 'latex_production', 'rubber_making_process', 'latex_rubber', 'chemical_resistant_disposable_gloves', 'safety_gloves_cost_savings', 'lacquer_paint_thinner', 'glove_manufacturing_process', 'food_safety_month', 'glove_manufacturing', 'synthetic_exam_gloves', 'texture_grip', 'acetone_nitrile_gloves', 'vinyl_exam_gloves', 'nitrile_exam_glove', 'gloves_manufacturing', 'extreme_green_products', 'smart_sales', 'child_care_products', 'cowhide_split', 'in_disposable', 'flock_lined', 'latex_barrier_protection', 'Glove_Green_Medium', 'paint_gloves', 'latex_hand', 'cotton_gloves_disposable', 'auto_mechanics_gloves', 'industrial_latex', 'medical_glove_manufacturers', 'Sized_Gloves', 'copolymer_gloves', 'xtreme_green_products', 'baby_wipes_refill', 'rubber_glove', 'earloop_face_masks', 'string_knit_glove', 'Large_Powdered_Latex_Glove', 'plastic_glove', 'nitrile_polymer', 'glove_chemical_resistance', 'recent_trends_in_food_processing', 'industry_glove', 'exam_latex', 'elastic_gloves', 'sells_gloves', 'nitrile', 'knit_with', 'heavy_duty_glove', 'glove_weights', 'janitorial_images', 'glove_sales', 'center_gloves', 'gwon', 'osha_products_we', 'glove_weight', 'weight_glove', 'open_hand_images', 'Extra_Long_Disposable_Gloves', 'disposable_medical_grade_latex_gloves', 'dental_exam_gloves', 
'Safety_Glove_Prep_Guard_Small', 'diamond_grip_nitrile_gloves', 'heavy_gloves', 'jan_san_industry', 'winter_work_safety_tips', 'jan_san', 'disposable_nitrile', 'glove_works', 'corona_virus_in_horses', 'gloves_supply', '7_mil_nitrile_gloves', 'disposable_gloves', 'stretching_vinyl', 'dental_gloves', 'Glove_Latex_Orange_Large', 'long_exam_gloves', 'powdered_exam_gloves', 'works_gloves', 'winter_safety_tips_at_work', 'FDA_Approved_Medical_Gloves', 'food_glove', 'gram_weights', 'sell_gloves', 'medical_grade_latex', 'Rubber_Glove_Orange_Large', 'gloves_uses', 'extra_long_gloves', 'duty_glove', 'Glove_Latex_Orange_Medium', 'hd_heavy_duty', 'Food_Service_Disposable_Gloves', 'Glove_Orange_Rubber_Medium', 'plumbing_gloves', 'hd_blue', 'gloves_tattoo', 'chain_gloves', 'Latex_Glove_Orange_Small', 'latex_watermark', 'working_in_cold_conditions', 'wrist_covers', 'vinyl_latex', 'thick_gloves', 'Vinyl_Food_Service_Gloves', 'nitrile_gloves', 'working_in_cold_temperatures', 'gloves_orange', 'kitchen_gloves', 'gloves_works', 'medical_gloves_for_sale', 'gloves_sells', 'is_vinyl_latex', 'Nitrile_Glove_Puncture_Resistant', 'working_in_the_cold', 'tattoo_artist_gloves', 'orange_sales_team', 'wrist_images', 'orange_glove', 'gloves_dental', 'sales_now', 'football_players_gloves', 'janitorial_gloves', 'glove_industry', 'orange_hd', 'orange_gloves', 'hand_holding_gloves', 'auto_holding'] | |
# De-duplicate the keyword list, keeping the first occurrence of each keyword
# in its original position. `seen` records what has already been emitted.
seen = set()
result = []
for candidate in all_keywords:
    if candidate in seen:
        continue
    seen.add(candidate)
    result.append(candidate)
# Build one embedding per de-duplicated keyword from the GoogleNews model.
#   hits    - keywords that received a vector (parallel to `vectors`)
#   vectors - the embedding for each entry of `hits`, in the same order
#   fails   - keywords for which no embedding could be built
# Vocabulary membership is tested explicitly instead of wrapping the lookups
# in bare `except:` clauses, so unrelated errors (and Ctrl-C) are no longer
# swallowed and misreported as a missing keyword.
# To experiment with the freebase model instead, look tokens up as
# '/en/' + token in `freebase`.
fails = []
hits = []
vectors = []
for keyword in result:
    # Preferred: the whole underscore-joined phrase is a vocabulary entry.
    if keyword in google.vocab:
        vectors.append(google[keyword])
        hits.append(keyword)
        continue

    # Fallback: average the vectors of the individual words, but only when
    # every word is known; otherwise the keyword is recorded as a failure.
    unigrams = keyword.split('_')
    if all(uni in google.vocab for uni in unigrams):
        uni_vectors = [google[uni] for uni in unigrams]
        vectors.append(sum(uni_vectors) / len(unigrams))
        hits.append(keyword)
    else:
        fails.append(keyword)
# Standardize the embeddings before clustering (StandardScaler with its
# default settings, fitted on the same vectors it transforms).
scaler = StandardScaler()
new_vectors = scaler.fit_transform(vectors)
# Sweep a range of eps values and, for each one, print how many points fall
# under every DBSCAN label so the cluster distribution can be compared.
# Adjust the bounds and the step size of the sweep to the data at hand.
eps_candidates = np.arange(13, 14.5, 0.1)
for eps in eps_candidates:
    print('Eps: ')
    print(eps)
    new_db = DBSCAN(eps=eps, min_samples=10)
    new_db.fit(new_vectors)
    labels = new_db.labels_
    counter = collections.Counter(labels)
    print(counter)
    # Number of distinct labels produced at this eps (DBSCAN reports noise
    # points under the label -1, which is counted here when present).
    print(len(counter))
# Final run: once the sweep has revealed a decent eps, plug it in here.
new_db = DBSCAN(eps=13.9, min_samples=10)
new_db.fit(new_vectors)
labels = new_db.labels_
counter = collections.Counter(labels)

# Group the embedded keywords by the label assigned to them. `hits` is in the
# same order as the rows handed to DBSCAN, so it pairs up with `labels`.
clusters = {label: [] for label in counter}
for keyword, label in zip(hits, labels):
    clusters[label].append(keyword)
pprint(clusters)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment