Created
March 8, 2012 12:53
-
-
Save mmriis/2000871 to your computer and use it in GitHub Desktop.
A class that screen-scrapes cvr.dk to return company information from either a VAT (CVR) number or a company name.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri'
require 'nokogiri'

# Looks up company details by screen-scraping cvr.dk.
#
# Both public lookups fetch a cvr.dk page over HTTP, parse it with Nokogiri
# and return a CvrInformation::Company filled from the fields on that page.
class CvrInformation
  # Raised when the fetched page contains no company title.
  # Subclasses StandardError (not Exception) so that a plain `rescue` catches it.
  class CompanyNotFoundInCvr < StandardError; end

  # Plain holder for the scraped fields. `country` and `ean` are declared
  # here but are never assigned by the scraper in this class.
  class Company
    attr_accessor :name, :address, :zip, :city, :country, :vat_number, :ean, :telephone, :email
  end

  DISPLAY_COMPANY_URL = 'http://cvr.dk/Site/Forms/PublicService/DisplayCompany.aspx'.freeze
  COMPANY_SEARCH_URL = 'http://cvr.dk/Site/Forms/CompanySearch/CompanySearch.aspx'.freeze

  # Captures everything before the "<br ... NNNN city" tail of the address cell.
  ADDRESS_PATTERN = /^(.+)<br\W+\d{4}\W+(.+)$/
  # Captures a four-digit run (zip) and the text following it (city).
  # NOTE(review): this matches the FIRST four-digit run in the cell, which
  # could be a house number rather than the zip code - confirm against real pages.
  ZIP_CITY_PATTERN = /(\d{4})\W+(.+)$/

  private_constant :DISPLAY_COMPANY_URL, :COMPANY_SEARCH_URL, :ADDRESS_PATTERN, :ZIP_CITY_PATTERN

  class << self
    # Fetches the company page for a VAT (CVR) number.
    #
    # @param cvr [String, Integer] the VAT number to look up
    # @return [CvrInformation::Company]
    # @raise [CompanyNotFoundInCvr] if the page has no company title
    def get_company_from_vat_number(cvr)
      # Encode the value so stray characters cannot alter the query string.
      extract_company_from_cvr("#{DISPLAY_COMPANY_URL}?cvrnr=#{URI.encode_www_form_component(cvr)}")
    end

    # Fetches the company page found by searching for a company name.
    #
    # @param name [String] the company name to search for
    # @return [CvrInformation::Company]
    # @raise [CompanyNotFoundInCvr] if the page has no company title
    def get_company_from_name(name)
      # URI.escape was removed in Ruby 3.0 and never escaped "&", "=" or "#";
      # form-encoding only the value keeps names such as "A&B" intact.
      extract_company_from_cvr("#{COMPANY_SEARCH_URL}?navn=#{URI.encode_www_form_component(name)}")
    end

    private

    # Downloads and parses +uri+, then builds a Company from the page.
    def extract_company_from_cvr(uri)
      # URI.open replaces the Kernel#open patch that no longer handles URLs
      # on Ruby 3.0+; the block form closes the response body when parsing is done.
      doc = URI.open(uri) { |io| Nokogiri::HTML(io) }

      name = extract_name(doc)
      raise CompanyNotFoundInCvr if name.empty?

      company = Company.new
      company.name = name
      company.address = extract_address(doc)
      company.zip = extract_zip(doc)
      company.city = extract_city(doc)
      company.vat_number = extract_vat_number(doc)
      company.telephone = extract_telephone(doc)
      company.email = extract_email(doc)
      company
    end

    # Inner HTML of the value cell in the table row containing +label+.
    # Returns an empty string when no such row exists.
    def field_html(doc, label)
      doc.search("tr:contains('#{label}')/td.fieldvalue").inner_html
    end

    # Company title as shown on the page, stripped; empty when absent.
    def extract_name(doc)
      doc.search('div.titletext').inner_html.strip
    end

    # Street part of the address with inner line breaks turned into ", ".
    # Returns nil when the address cell does not have the expected layout.
    def extract_address(doc)
      match = ADDRESS_PATTERN.match(field_html(doc, 'Adresse'))
      match && match[1].strip.gsub('<br />', ', ')
    end

    # Four-digit zip code from the address cell, or nil when none is found.
    def extract_zip(doc)
      match = ZIP_CITY_PATTERN.match(field_html(doc, 'Adresse'))
      match && match[1].strip
    end

    # Text following the zip code in the address cell, or nil when none is found.
    def extract_city(doc)
      match = ZIP_CITY_PATTERN.match(field_html(doc, 'Adresse'))
      match && match[2].strip
    end

    # Stripped contents of the "Cvr-nr" row.
    def extract_vat_number(doc)
      field_html(doc, 'Cvr-nr').strip
    end

    # Stripped contents of the "Email" row.
    def extract_email(doc)
      field_html(doc, 'Email').strip
    end

    # Stripped contents of the "Telefon" row.
    def extract_telephone(doc)
      field_html(doc, 'Telefon').strip
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment