Skip to content

Instantly share code, notes, and snippets.

@jazzido
Created August 10, 2014 00:42
Show Gist options
  • Select an option

  • Save jazzido/f6673936ffa36760ff2d to your computer and use it in GitHub Desktop.

Select an option

Save jazzido/f6673936ffa36760ff2d to your computer and use it in GitHub Desktop.
# coding: utf-8
module IndecScraper
##
# Base class for extractors that pull a single table out of a PDF using
# Tabula. It opens the file, caches extracted pages, and offers helpers to
# look up text lines and page numbers. Subclasses implement +extract+.
class TableExtractor
  attr_reader :extractor

  # Page extracted when +page+ is called without a page number; subclasses
  # use it as the main index (table of contents) page.
  DEFAULT_INDEX_PAGE = 3

  # Captures the run of digits that ends a line of text.
  PAGE_NUMBER_RE = /.*?(\d+)$/

  ##
  # Opens +pdf_file+ with a Tabula object extractor and sets up a logger
  # that writes to STDERR.
  def initialize(pdf_file)
    @pdf_file = pdf_file
    @page_cache = {}
    @logger = Logger.new(STDERR)
    @logger.progname = self.class.name
    log("Opening file: %s" % pdf_file)
    @extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file, :all)
  end

  ##
  # Logs +msg+ with severity +level+ (defaults to INFO).
  def log(msg, level = Logger::INFO)
    @logger.add(level, msg)
  end

  ##
  # Closes the underlying Tabula extractor.
  def close!
    @extractor.close!
  end

  ##
  # Returns the extracted page +page_number+ (DEFAULT_INDEX_PAGE when
  # omitted), memoizing the result.
  def page(page_number = nil)
    # Resolve the default BEFORE touching the cache, so that +page+ and
    # +page(DEFAULT_INDEX_PAGE)+ share one entry instead of extracting the
    # same page twice.
    page_number ||= DEFAULT_INDEX_PAGE
    @page_cache[page_number] ||= extractor.extract_page(page_number)
  end

  ##
  # Extracts the desired table and returns a list of lists (list of rows).
  # Must be implemented by descendant classes. The optional argument is
  # accepted (and ignored) only so the signature agrees with the
  # subclasses' +extract(page=nil)+.
  def extract(_page = nil)
    raise "abstract - not implemented"
  end

  ##
  # Returns every +Tabula::Line+ of page +page_number+.
  def lines_from_page(page_number)
    words = Tabula::TextElement.merge_words(page(page_number).texts.sort)
    Tabula::TextChunk.group_by_lines(words)
  end

  ##
  # Returns the first line (+Tabula::Line+) of page +page_number+ whose
  # text matches +re+, or +nil+ when no line matches.
  def matching_line_from_page(page_number, re)
    lines_from_page(page_number).find do |line|
      line.text_elements.map(&:text).join =~ re
    end
  end

  ##
  # Looks for +re+ in an 'index' page of the PDF and returns the page number
  # written at the end of the first matching line.
  # Raises RuntimeError ("Not found") when no line matches or the matching
  # line does not end in digits.
  def matching_page_number_from_index(page_number, re)
    line = matching_line_from_page(page_number, re)
    captures = self.class.match_text_in_line(line, PAGE_NUMBER_RE)
    raise "Not found" if captures.empty?
    captures.first.to_i
  end

  ##
  # 'static' class methods
  class << self
    ##
    # Matches the regexp +re+ against the stripped, concatenated text of
    # +line+ and returns the array of captures. Returns an empty array
    # when +line+ is nil or the text does not match, so callers can rely
    # on +empty?+ instead of hitting a NoMethodError on nil.
    def match_text_in_line(line, re)
      return [] if line.nil?
      match = line.text_elements.map(&:text).join.strip.match(re)
      match ? match.captures : []
    end

    ##
    # Converts a Tabula::Table instance into an array of arrays (list of
    # rows) holding the stripped text of each cell.
    def table_to_array(table)
      table.rows.map do |row|
        row.map { |te| te.text.strip }
      end
    end
  end
end
##
# Extractor for the table referenced as "5.2" in the "Pobreza e ingresos"
# section index of the PDF.
class PobrezaHogaresPersonasExtractor < TableExtractor
  MAIN_INDEX_RE = /Pobreza e ingresos/i
  SECTION_INDEX_RE = /5\.2/i # we are looking for "cuadro 5.2"

  ##
  # Extracts the table found in +page+. When +page+ is nil the page number
  # is computed by scraping the indexes (tables of contents).
  #
  # Returns a list of rows. Each row is
  # [row index, canonicalized first cell, raw first cell, *floats],
  # where the floats are the remaining cells with "," replaced by "."
  # before conversion.
  #
  # Raises RuntimeError when the table cannot be located through the
  # indexes.
  def extract(page = nil)
    cuadro_page_number = page.nil? ? find_cuadro_page_number : page
    log("%s - CUADRO PAGE: %d" % [File.basename(@pdf_file), cuadro_page_number])

    bounds = table_bounds(cuadro_page_number)
    # +self.+ is required here: the +page+ parameter shadows the method.
    table = self.page(cuadro_page_number).get_area(bounds).get_table
    rows = self.class.table_to_array(table)

    rv = rows.map.with_index do |row, i|
      [i, IndecScraper.canonicalize_string(row.first), row.first] +
        (row[1..-1] || []).map { |cell| cell.gsub(",", ".").to_f }
    end
    log("%s - extracted table in page %d" % [File.basename(@pdf_file), cuadro_page_number])
    rv
  end

  ##
  # Computes the bounds [top, left, bottom, right] of the table in page
  # +cuadro_page_number+. Of the three widest horizontal ruling lines,
  # sorted from top to bottom, the second and third delimit the table.
  #
  # Raises RuntimeError when the page has too few ruling lines or the
  # delimited area contains no text.
  def table_bounds(cuadro_page_number)
    cuadro_page = self.page(cuadro_page_number)
    top_line, bottom_line = cuadro_page
                            .horizontal_ruling_lines
                            .sort_by(&:width)
                            .reverse[0..2].sort_by(&:top)[1..3]
    if top_line.nil? || bottom_line.nil?
      raise "Cuadro Pobreza e Indigencia bounds not found in page %d" % cuadro_page_number
    end

    # We want to drop the first line of text, hence the intermediate area.
    # The area must be taken from the table's own page: a bare +page+ here
    # would silently fall back to the default index page.
    area = cuadro_page.get_area([top_line.top,
                                 top_line.left,
                                 bottom_line.top,
                                 bottom_line.right])
    lines = Tabula::TextChunk.group_by_lines(Tabula::TextElement.merge_words(area.texts.sort))
    if lines.empty?
      raise "Cuadro Pobreza e Indigencia has no text in page %d" % cuadro_page_number
    end
    [lines.first.top, top_line.left, bottom_line.top, bottom_line.right]
  end

  private

  # Scrapes the main index and then the section index to find the page
  # number of the table; the number is read from the line that FOLLOWS the
  # first line matching SECTION_INDEX_RE.
  def find_cuadro_page_number
    section_page_number = matching_page_number_from_index(DEFAULT_INDEX_PAGE, MAIN_INDEX_RE)
    log("%s - section Page: %d" % [File.basename(@pdf_file), section_page_number])

    section_page_lines = lines_from_page(section_page_number)
    # Search the list we are going to index into, instead of comparing
    # against a line object taken from a second, separately built list.
    entry_index = section_page_lines.index do |line|
      line.text_elements.map(&:text).join =~ SECTION_INDEX_RE
    end
    next_line = entry_index && section_page_lines[entry_index + 1]
    raise "Cuadro Pobreza e Indigencia not found" if next_line.nil?

    m = self.class.match_text_in_line(next_line, PAGE_NUMBER_RE)
    raise "Cuadro Pobreza e Indigencia not found" if m.empty?
    m.first.to_i
  end
end
##
# Extractor for the IPC tables "por principales aperturas y variaciones
# porcentuales para distintos períodos", base 2008-04.
class IPC2008Extractor < TableExtractor
  MAIN_INDEX_RE = /Precios al consumidor/i
  SECTION_INDEX_RE = /por principales aperturas y variaciones porcentuales/
  TABLE_TOP_BOUND_RE = /NIVEL GENERAL/i
  TABLE_BOTTOM_BOUND_RE = /informa/i

  ##
  # Extracts the table found in +page+. When +page+ is nil the page number
  # is computed by scraping the indexes (tables of contents).
  #
  # Returns a list of rows. Each row is
  # [row index, canonicalized first cell, *cells as strings],
  # after word-wrapped rows have been joined and "Regiones" title rows
  # dropped.
  #
  # Raises RuntimeError when the table cannot be located.
  def extract(page = nil)
    cuadro_page_number = page.nil? ? find_cuadro_page_number : page

    bounds = table_bounds(cuadro_page_number)
    # +self.+ is required here: the +page+ parameter shadows the method.
    table = self.page(cuadro_page_number).get_area(bounds).get_table
    rows = flatten_wrapped_rows(self.class.table_to_array(table))

    rv = rows.map.with_index do |row, i|
      [i, IndecScraper.canonicalize_string(row.first)] + row
    end
    log("%s - extracted table in page %d" % [File.basename(@pdf_file), cuadro_page_number])
    rv
  end

  ##
  # Computes the bounds [top, left, bottom, right] of the IPC table in page
  # +cuadro_page_number+, using the first lines of text that match
  # TABLE_TOP_BOUND_RE and TABLE_BOTTOM_BOUND_RE.
  #
  # Raises RuntimeError when either line is missing from the page.
  def table_bounds(cuadro_page_number)
    top_line = matching_line_from_page(cuadro_page_number, TABLE_TOP_BOUND_RE)
    bottom_line = matching_line_from_page(cuadro_page_number, TABLE_BOTTOM_BOUND_RE)
    if top_line.nil? || bottom_line.nil?
      raise "Cuadro IPC bounds not found in page %d" % cuadro_page_number
    end
    [top_line.top, top_line.left, bottom_line.top, bottom_line.right]
  end

  private

  # Scrapes the main index and then the section index to find the page
  # number of the table; the number is read from the line that FOLLOWS the
  # first line matching SECTION_INDEX_RE.
  def find_cuadro_page_number
    section_page_number = matching_page_number_from_index(DEFAULT_INDEX_PAGE, MAIN_INDEX_RE)
    log("%s - section Page: %d" % [File.basename(@pdf_file), section_page_number])

    section_page_lines = lines_from_page(section_page_number)
    # Search the list we are going to index into, instead of comparing
    # against a line object taken from a second, separately built list.
    entry_index = section_page_lines.index do |line|
      line.text_elements.map(&:text).join =~ SECTION_INDEX_RE
    end
    next_line = entry_index && section_page_lines[entry_index + 1]
    raise "Cuadro IPC not found" if next_line.nil?

    m = self.class.match_text_in_line(next_line, PAGE_NUMBER_RE)
    raise "Cuadro IPC not found" if m.empty?
    m.first.to_i
  end

  # 'Flattens' word-wrapped rows: when every column from the second onwards
  # is empty in the previously kept row, that row is combined cell by cell
  # with the current one. Rows whose first cell is "Regiones" are skipped
  # because they are titles.
  def flatten_wrapped_rows(rows)
    rows.each_with_object([]) do |row, memo|
      next if row[0].to_s.strip == "Regiones"

      previous = memo.last
      if previous && previous[1..-1].all? { |cell| cell == '' }
        # Strip each joined cell: a bare "x + ' ' + y" turns empty cells
        # into " ", which defeats the emptiness test above for labels
        # wrapped over three or more lines and leaves a leading space in
        # every value cell.
        memo[-1] = previous.zip(row).map { |x, y| "#{x} #{y}".strip }
      else
        memo << row
      end
    end
  end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment