Skip to content

Instantly share code, notes, and snippets.

@joaomdmoura
Last active August 17, 2021 20:46
Show Gist options
  • Save joaomdmoura/0e916fb3b0aeb1fa7201f26ea7a3a91d to your computer and use it in GitHub Desktop.
Save joaomdmoura/0e916fb3b0aeb1fa7201f26ea7a3a91d to your computer and use it in GitHub Desktop.
Ruby script to enrich company data based on a email column in a CSV
# Before running this make sure you have Ruby installed
# preference to version > 2.4
#
# Also install the clearbit library, after the ruby
# installation, to do it you just run:
# $ gem install clearbit
#
# Then, you run a ruby console on the same directory of this file
# and the following commands:
#
# $ irb
# > require_relative "name_of_this_file_without_extension"
# > data = EnrichCSVClearbit.new("your_csv_on_the_same_directory.csv", "you_secret_key_on_clearbit")
# > data.enrich!("name_of_your_column_with_the_email_address")
#
# The secret clearbit key should be obtained by creating your
# account at http://clearbit.com
require 'clearbit'
require 'csv'
class EnrichCSVClearbit
attr_accessor :csv, :data, :non_company_domains
def initialize(csv, clearbit_key)
@csv = csv
@data = CSV.read(csv)
@non_company_domains = [
'gmail.com', 'yahoo.com', 'hotmail.com', 'aol.com', 'icloud.com',
'me.com', 'outlook.com', 'live.com', 'mac.com', 'comcast.net',
'ymail.com', 'msn.com', 'cox.net', 'att.net', 'verizon.net',
'mail.com', 'yahoo.co.uk', 'example.com', 'packlane.com'
]
Clearbit.key = clearbit_key
end
def enrich!(email_field)
enrich_company(email_field)
:ok
end
private
def enrich_company(email_field)
header = @data.delete_at(0)
email_field_index = header.index(email_field)
header << ["name", "bio", "founded", "alexaUsRank", "alexaGlobalRank",
"employees", "employeesRange", "marketCap", "raised", "annualRevenue",
"estimatedAnnualRevenue", "fiscalYearEnd", "streetNumber", "streetName",
"subPremise", "city", "postalCode", "state", "stateCode", "country",
"countryCode", "lat", "lng", "sector", "industryGroup", "industry",
"subIndustry", "sicCode", "naicsCode", "domain", "domain_aliases", "tags",
"timezone", "fb_likes", "tw_followers", "tw_following"]
data_to_append = []
data_to_append << header.flatten
data.each_with_index do |row, index|
domain = row[email_field_index].split("@")[1]
row_to_append = []
row_to_append << row
begin
unless @non_company_domains.include?(domain.downcase)
result = Clearbit::Enrichment.find(domain: domain, stream: true)
if parsed_result = parse_company_result(result)
row_to_append = include_parsed_result(row_to_append, parsed_result)
end
end
rescue
puts "Error on row #{index+1} "
puts "Row populated with pre-existing data and moving on"
puts "=============================="
end
data_to_append << row_to_append.flatten
$stdout.flush
text = "row #{index+1} from #{data.size} " + "\r"
print text
end
overwrite_csv(data_to_append)
end
def overwrite_csv(rows)
File.open(@csv, "w") {|f| f.write(rows.inject([]) { |csv, row| csv << CSV.generate_line(row) }.join(""))}
end
def parse_company_result(result)
if result.company
{
name: result.company["name"],
bio: result.company["description"],
founded: result.company["foundedYear"],
metrics: result.company["metrics"],
locale: result.company["geo"],
category: result.company["category"],
legal_name: result.company["legalName"],
domain: result.company["domain"],
domain_aliases: result.company["domainAliases"].join(", "),
tags: result.company["tags"].join(", "),
timezone: result.company["timeZone"],
fb_likes: result.company["facebook"]["likes"],
tw_followers: result.company["twitter"]["followers"],
tw_following: result.company["twitter"]["following"]
}
end
end
def include_parsed_result(data, result)
data << result[:name]
data << result[:bio]
data << result[:founded]
data << result[:metrics]["alexaUsRank"]
data << result[:metrics]["alexaGlobalRank"]
data << result[:metrics]["employees"]
data << result[:metrics]["employeesRange"]
data << result[:metrics]["marketCap"]
data << result[:metrics]["raised"]
data << result[:metrics]["annualRevenue"]
data << result[:metrics]["estimatedAnnualRevenue"]
data << result[:metrics]["fiscalYearEnd"]
data << result[:locale]["streetNumber"]
data << result[:locale]["streetName"]
data << result[:locale]["subPremise"]
data << result[:locale]["city"]
data << result[:locale]["postalCode"]
data << result[:locale]["state"]
data << result[:locale]["stateCode"]
data << result[:locale]["country"]
data << result[:locale]["countryCode"]
data << result[:locale]["lat"]
data << result[:locale]["lng"]
data << result[:category]["sector"]
data << result[:category]["industryGroup"]
data << result[:category]["industry"]
data << result[:category]["subIndustry"]
data << result[:category]["sicCode"]
data << result[:category]["naicsCode"]
data << result[:domain]
data << result[:domain_aliases]
data << result[:tags]
data << result[:timezone]
data << result[:fb_likes]
data << result[:tw_followers]
data << result[:tw_following]
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment