Last active
August 17, 2021 20:46
-
-
Save joaomdmoura/0e916fb3b0aeb1fa7201f26ea7a3a91d to your computer and use it in GitHub Desktop.
Ruby script to enrich company data based on a email column in a CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Before running this make sure you have Ruby installed | |
# preference to version > 2.4 | |
# | |
# Also install the clearbit library, after the ruby | |
# installation, to do it you just run: | |
# $ gem install clearbit | |
# | |
# Then, you run a ruby console on the same directory of this file | |
# and the following commands: | |
# | |
# $ irb | |
# > require_relative "name_of_this_file_without_extension" | |
# > data = EnrichCSVClearbit.new("your_csv_on_the_same_directory.csv", "you_secret_key_on_clearbit") | |
# > data.enrich!("name_of_your_column_with_the_email_address") | |
# | |
# The secret clearbit key should be obtained by creating your | |
# account at http://clearbit.com | |
require 'clearbit' | |
require 'csv' | |
class EnrichCSVClearbit | |
attr_accessor :csv, :data, :non_company_domains | |
def initialize(csv, clearbit_key) | |
@csv = csv | |
@data = CSV.read(csv) | |
@non_company_domains = [ | |
'gmail.com', 'yahoo.com', 'hotmail.com', 'aol.com', 'icloud.com', | |
'me.com', 'outlook.com', 'live.com', 'mac.com', 'comcast.net', | |
'ymail.com', 'msn.com', 'cox.net', 'att.net', 'verizon.net', | |
'mail.com', 'yahoo.co.uk', 'example.com', 'packlane.com' | |
] | |
Clearbit.key = clearbit_key | |
end | |
def enrich!(email_field) | |
enrich_company(email_field) | |
:ok | |
end | |
private | |
def enrich_company(email_field) | |
header = @data.delete_at(0) | |
email_field_index = header.index(email_field) | |
header << ["name", "bio", "founded", "alexaUsRank", "alexaGlobalRank", | |
"employees", "employeesRange", "marketCap", "raised", "annualRevenue", | |
"estimatedAnnualRevenue", "fiscalYearEnd", "streetNumber", "streetName", | |
"subPremise", "city", "postalCode", "state", "stateCode", "country", | |
"countryCode", "lat", "lng", "sector", "industryGroup", "industry", | |
"subIndustry", "sicCode", "naicsCode", "domain", "domain_aliases", "tags", | |
"timezone", "fb_likes", "tw_followers", "tw_following"] | |
data_to_append = [] | |
data_to_append << header.flatten | |
data.each_with_index do |row, index| | |
domain = row[email_field_index].split("@")[1] | |
row_to_append = [] | |
row_to_append << row | |
begin | |
unless @non_company_domains.include?(domain.downcase) | |
result = Clearbit::Enrichment.find(domain: domain, stream: true) | |
if parsed_result = parse_company_result(result) | |
row_to_append = include_parsed_result(row_to_append, parsed_result) | |
end | |
end | |
rescue | |
puts "Error on row #{index+1} " | |
puts "Row populated with pre-existing data and moving on" | |
puts "==============================" | |
end | |
data_to_append << row_to_append.flatten | |
$stdout.flush | |
text = "row #{index+1} from #{data.size} " + "\r" | |
print text | |
end | |
overwrite_csv(data_to_append) | |
end | |
def overwrite_csv(rows) | |
File.open(@csv, "w") {|f| f.write(rows.inject([]) { |csv, row| csv << CSV.generate_line(row) }.join(""))} | |
end | |
def parse_company_result(result) | |
if result.company | |
{ | |
name: result.company["name"], | |
bio: result.company["description"], | |
founded: result.company["foundedYear"], | |
metrics: result.company["metrics"], | |
locale: result.company["geo"], | |
category: result.company["category"], | |
legal_name: result.company["legalName"], | |
domain: result.company["domain"], | |
domain_aliases: result.company["domainAliases"].join(", "), | |
tags: result.company["tags"].join(", "), | |
timezone: result.company["timeZone"], | |
fb_likes: result.company["facebook"]["likes"], | |
tw_followers: result.company["twitter"]["followers"], | |
tw_following: result.company["twitter"]["following"] | |
} | |
end | |
end | |
def include_parsed_result(data, result) | |
data << result[:name] | |
data << result[:bio] | |
data << result[:founded] | |
data << result[:metrics]["alexaUsRank"] | |
data << result[:metrics]["alexaGlobalRank"] | |
data << result[:metrics]["employees"] | |
data << result[:metrics]["employeesRange"] | |
data << result[:metrics]["marketCap"] | |
data << result[:metrics]["raised"] | |
data << result[:metrics]["annualRevenue"] | |
data << result[:metrics]["estimatedAnnualRevenue"] | |
data << result[:metrics]["fiscalYearEnd"] | |
data << result[:locale]["streetNumber"] | |
data << result[:locale]["streetName"] | |
data << result[:locale]["subPremise"] | |
data << result[:locale]["city"] | |
data << result[:locale]["postalCode"] | |
data << result[:locale]["state"] | |
data << result[:locale]["stateCode"] | |
data << result[:locale]["country"] | |
data << result[:locale]["countryCode"] | |
data << result[:locale]["lat"] | |
data << result[:locale]["lng"] | |
data << result[:category]["sector"] | |
data << result[:category]["industryGroup"] | |
data << result[:category]["industry"] | |
data << result[:category]["subIndustry"] | |
data << result[:category]["sicCode"] | |
data << result[:category]["naicsCode"] | |
data << result[:domain] | |
data << result[:domain_aliases] | |
data << result[:tags] | |
data << result[:timezone] | |
data << result[:fb_likes] | |
data << result[:tw_followers] | |
data << result[:tw_following] | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment