Created
December 30, 2016 15:10
-
-
Save demogar/3413b1bdf2148649052ecc1db827ebd6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'rubygems' | |
require 'levenshtein' | |
require 'csv' | |
# Config
# Running tallies; both start at zero and are incremented while the output
# file is written.
counter = 0
total = 0
# Largest distance for which a row is still counted in `counter`.
maxError = 3
# Paths of the CSV file that is read and of the CSV file that is written.
inFile = 'entrada.csv'
outFile = 'salida.csv'
# Extending String class for blank? method
class String
  # Returns true when nothing is left of the string after `strip`,
  # i.e. it is empty or consists only of characters that `strip` removes.
  def blank?
    strip.empty?
  end
end
# Formatting strings
#
# Normalizes a raw cell value: upper-cases it and removes surrounding
# whitespace.
#
# str - the raw value; may be nil (CSV parsing yields nil for an empty,
#       unquoted field), in which case an empty string is returned instead
#       of raising NoMethodError.
#
# Returns a new String.
def format_string(str)
  str.to_s.upcase.strip
end
# Does the name have a "DE" inside?
#
# Returns true when str contains the upper-case word "DE" with a space on
# both sides. The match is case-sensitive, and a "DE" at the very start or
# very end of the string does not count because one of the spaces is missing.
def has_de?(str)
  str.include?(' DE ')
end
# Calculate Distance between two strings (te, ifarhu)
#
# te     - the name from the TE source.
# ifarhu - the name from the IFARHU source.
#
# Returns the Levenshtein distance between `te` and a reduced form of
# `ifarhu`, or the sentinel 99 when either name is nil or blank.
def calculate_distance(te, ifarhu)
  # A missing name can never be a match. This guard must run before the
  # part-count branches: otherwise a blank TE name compared against a short
  # 3- or 4-part IFARHU name produces a small distance and the row is
  # reported as matched.
  return 99 if te.to_s.strip.empty? || ifarhu.to_s.strip.empty?

  ifarhu_parts = ifarhu.split(' ')

  if has_de?(ifarhu)
    # The name has a "DE" somewhere: compare only the first word of each name.
    Levenshtein.distance(te.split(' ')[0], ifarhu_parts[0])
  elsif ifarhu_parts.length == 4
    # The IFARHU name has 4 parts and no "DE": keep the 1st and 3rd parts.
    Levenshtein.distance(te, "#{ifarhu_parts[0]} #{ifarhu_parts[2]}")
  elsif ifarhu_parts.length == 3
    # The IFARHU name has 3 parts and no "DE": keep the 1st and 2nd parts.
    Levenshtein.distance(te, "#{ifarhu_parts[0]} #{ifarhu_parts[1]}")
  else
    # Everything else: compare the full names.
    Levenshtein.distance(te, ifarhu)
  end
end
# In
# Read the whole input file, normalize the first three columns of every row
# in place (id, TE name, IFARHU name) and append the computed distance.
# Rows are expected to have exactly three columns: the output step reads the
# distance back from index 3.
lines = CSV.read(inFile)
lines.each do |line|
  line[0, 3] = line.values_at(0, 1, 2).map { |field| format_string(field) }
  line << calculate_distance(line[1], line[2])
end
# Out
# Write every row to the output file while tallying how many rows were seen
# and how many have a distance (column index 3) within maxError.
CSV.open(outFile, 'w') do |csv|
  lines.each do |row|
    total += 1
    counter += 1 if row[3] <= maxError
    csv << row
  end
end

# Summary. `p` prints each message in its inspected (quoted) form.
p "Total: #{total}"
p "Corregidos: #{counter}"
p "Con error: #{total - counter}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment