Created
August 10, 2014 00:42
-
-
Save jazzido/f6673936ffa36760ff2d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # coding: utf-8 | |
| module IndecScraper | |
##
# Base class for extractors that pull a single table out of a PDF through
# Tabula. Subclasses implement +extract+; this class supplies page loading
# (with a per-instance cache), text-line helpers and logging to STDERR.
class TableExtractor
  attr_reader :extractor

  # Page loaded by +page+ when no page number is given.
  DEFAULT_INDEX_PAGE = 3
  # Captures the run of digits that ends a line's text.
  PAGE_NUMBER_RE = /.*?(\d+)$/

  ##
  # Sets up the logger and opens +pdf_file+ with a
  # +Tabula::Extraction::ObjectExtractor+.
  def initialize(pdf_file)
    @pdf_file = pdf_file
    @page_cache = {}
    @logger = Logger.new(STDERR)
    @logger.progname = self.class.name
    log("Opening file: %s" % pdf_file)
    @extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file, :all)
  end

  ##
  # Logs +msg+ at +level+ (defaults to INFO).
  def log(msg, level=Logger::INFO)
    @logger.add(level, msg)
  end

  ##
  # Closes the underlying Tabula extractor.
  def close!
    @extractor.close!
  end

  ##
  # Returns the extracted page +page_number+ (+DEFAULT_INDEX_PAGE+ when nil).
  # Pages are cached under their resolved number, so +page+ and
  # +page(DEFAULT_INDEX_PAGE)+ share one extraction.
  def page(page_number=nil)
    number = page_number || DEFAULT_INDEX_PAGE
    @page_cache[number] ||= extractor.extract_page(number)
  end

  ##
  # Extracts the wanted table. Returns a list of lists (a list of rows).
  # Must be implemented by subclasses.
  def extract
    raise "abstract - not implemented"
  end

  ##
  # Returns every +Tabula::Line+ of page +page_number+.
  def lines_from_page(page_number)
    source_page = page(page_number)
    words = Tabula::TextElement.merge_words(source_page.texts.sort)
    Tabula::TextChunk.group_by_lines(words)
  end

  ##
  # Returns the first line (+Tabula::Line+) of page +page_number+ whose
  # text matches +re+, or nil when no line matches.
  def matching_line_from_page(page_number, re)
    lines_from_page(page_number).find do |line|
      line.text_elements.map(&:text).join =~ re
    end
  end

  ##
  # Looks up +re+ in the 'index' found on page +page_number+ and returns
  # the page number printed at the end of the matching line.
  # Raises RuntimeError ("Not found") when no line matches +re+ or the
  # matching line does not end in digits.
  def matching_page_number_from_index(page_number, re)
    line = matching_line_from_page(page_number, re)
    captures = self.class.match_text_in_line(line, PAGE_NUMBER_RE)
    raise "Not found" if captures.empty?
    captures.first.to_i
  end

  ##
  # 'static' class methods
  class << self
    ##
    # Matches the regexp +re+ against the stripped text of +line+.
    # Returns the captures, or an empty array when +line+ is nil or the
    # regexp does not match.
    def match_text_in_line(line, re)
      return [] if line.nil?
      text = line.text_elements.map(&:text).join.strip
      match = text.match(re)
      # String#match returns nil on failure, so guard before asking for captures
      match ? match.captures : []
    end

    ##
    # Converts a Tabula::Table instance into an array of arrays (list of
    # rows), stripping the text of every cell.
    def table_to_array(table)
      table.rows.map do |row|
        row.map { |cell| cell.text.strip }
      end
    end
  end
end
##
# Extractor for "cuadro 5.2", reached through the "Pobreza e ingresos"
# entry of the PDF's index.
class PobrezaHogaresPersonasExtractor < TableExtractor
  MAIN_INDEX_RE = /Pobreza e ingresos/i
  SECTION_INDEX_RE = /5\.2/i # we are looking for "cuadro 5.2"

  ##
  # Extracts the table found on +page+. When +page+ is nil the page number
  # is worked out by scraping the indices (tables of contents).
  #
  # Returns a list of rows shaped as
  # <tt>[row_index, canonicalized_label, label, *values]</tt>, where each
  # value is the cell text with "," replaced by "." and converted with +to_f+.
  # Raises RuntimeError when the table cannot be located.
  def extract(page=nil)
    cuadro_page_number = page.nil? ? find_cuadro_page_number : page
    log("%s - CUADRO PAGE: %d" % [File.basename(@pdf_file), cuadro_page_number])

    bounds = table_bounds(cuadro_page_number)
    # +page+ is shadowed by the parameter here, hence the explicit receiver
    table = self.page(cuadro_page_number).get_area(bounds).get_table
    rows = self.class.table_to_array(table)

    rv = rows.each_with_index.map do |row, i|
      label = row.first
      [i, IndecScraper.canonicalize_string(label), label] +
        row[1..-1].map { |cell| cell.gsub(",", ".").to_f }
    end
    log("%s - extracted table in page %d" % [File.basename(@pdf_file), cuadro_page_number])
    rv
  end

  ##
  # Computes the <tt>[top, left, bottom, right]</tt> bounds of the table on
  # page +cuadro_page_number+ from its horizontal ruling lines.
  # Raises RuntimeError when the rulings or the text between them are missing.
  def table_bounds(cuadro_page_number)
    cuadro_page = self.page(cuadro_page_number)
    # take the three widest rulings, order them top to bottom and drop the
    # topmost one; the remaining two delimit the area we read
    top_line, bottom_line = cuadro_page
      .horizontal_ruling_lines
      .sort_by(&:width)
      .last(3)
      .sort_by(&:top)
      .drop(1)
    if top_line.nil? || bottom_line.nil?
      raise "Cuadro Pobreza e Indigencia: table rulings not found in page %d" % cuadro_page_number
    end

    # The returned top bound is the top of the first text line in this area.
    # The area must come from the table's own page: a bare +page+ call would
    # load the default index page instead.
    area = cuadro_page.get_area([top_line.top,
                                 top_line.left,
                                 bottom_line.top,
                                 bottom_line.right])
    lines = Tabula::TextChunk.group_by_lines(Tabula::TextElement.merge_words(area.texts.sort))
    if lines.empty?
      raise "Cuadro Pobreza e Indigencia: no text found inside the table rulings in page %d" % cuadro_page_number
    end
    [lines.first.top, top_line.left, bottom_line.top, bottom_line.right]
  end

  private

  ##
  # Follows the main index and then the section index to find the page that
  # holds the table. The page number is read from the line that follows the
  # one matching +SECTION_INDEX_RE+.
  def find_cuadro_page_number
    section_page_number = matching_page_number_from_index(DEFAULT_INDEX_PAGE, MAIN_INDEX_RE)
    log("%s - section Page: %d" % [File.basename(@pdf_file), section_page_number])

    section_lines = lines_from_page(section_page_number)
    # search the list we already hold, so we do not depend on equality
    # between line objects built by two separate calls
    title_index = section_lines.index do |line|
      line.text_elements.map(&:text).join =~ SECTION_INDEX_RE
    end
    raise "Cuadro Pobreza e Indigencia not found" if title_index.nil?

    number_line = section_lines[title_index + 1]
    raise "Cuadro Pobreza e Indigencia not found" if number_line.nil?

    captures = self.class.match_text_in_line(number_line, PAGE_NUMBER_RE)
    raise "Cuadro Pobreza e Indigencia not found" if captures.empty?
    captures.first.to_i
  end
end
##
# Extractor for the IPC tables "por principales aperturas y variaciones
# porcentuales para distintos períodos", base 2008-04.
class IPC2008Extractor < TableExtractor
  MAIN_INDEX_RE = /Precios al consumidor/i
  SECTION_INDEX_RE = /por principales aperturas y variaciones porcentuales/
  TABLE_TOP_BOUND_RE = /NIVEL GENERAL/i
  TABLE_BOTTOM_BOUND_RE = /informa/i

  ##
  # Extracts the table found on +page+. When +page+ is nil the page number
  # is worked out by scraping the indices (tables of contents).
  #
  # Returns a list of rows shaped as
  # <tt>[row_index, canonicalized_label, *cells]</tt>. Word-wrapped rows are
  # flattened into one row and the "Regiones" title row is skipped.
  # Raises RuntimeError when the table cannot be located.
  def extract(page=nil)
    cuadro_page_number = page.nil? ? find_cuadro_page_number : page

    bounds = table_bounds(cuadro_page_number)
    # +page+ is shadowed by the parameter here, hence the explicit receiver
    table = self.page(cuadro_page_number).get_area(bounds).get_table
    rows = flatten_wrapped_rows(self.class.table_to_array(table))

    rv = rows.each_with_index.map do |row, i|
      [i, IndecScraper.canonicalize_string(row.first)] + row
    end
    log("%s - extracted table in page %d" % [File.basename(@pdf_file), cuadro_page_number])
    rv
  end

  ##
  # Computes the <tt>[top, left, bottom, right]</tt> bounds of the IPC table
  # on page +cuadro_page_number+, using the first lines that match
  # +TABLE_TOP_BOUND_RE+ and +TABLE_BOTTOM_BOUND_RE+.
  # Raises RuntimeError when either line is missing.
  def table_bounds(cuadro_page_number)
    top_line = matching_line_from_page(cuadro_page_number, TABLE_TOP_BOUND_RE)
    bottom_line = matching_line_from_page(cuadro_page_number, TABLE_BOTTOM_BOUND_RE)
    if top_line.nil? || bottom_line.nil?
      raise "Cuadro IPC: table bounds not found in page %d" % cuadro_page_number
    end
    [top_line.top, top_line.left, bottom_line.top, bottom_line.right]
  end

  private

  ##
  # Follows the main index and then the section index to find the page that
  # holds the table. The page number is read from the line that follows the
  # one matching +SECTION_INDEX_RE+.
  def find_cuadro_page_number
    section_page_number = matching_page_number_from_index(DEFAULT_INDEX_PAGE, MAIN_INDEX_RE)
    log("%s - section Page: %d" % [File.basename(@pdf_file), section_page_number])

    section_lines = lines_from_page(section_page_number)
    # search the list we already hold, so we do not depend on equality
    # between line objects built by two separate calls
    title_index = section_lines.index do |line|
      line.text_elements.map(&:text).join =~ SECTION_INDEX_RE
    end
    raise "Cuadro IPC not found" if title_index.nil?

    number_line = section_lines[title_index + 1]
    raise "Cuadro IPC not found" if number_line.nil?

    captures = self.class.match_text_in_line(number_line, PAGE_NUMBER_RE)
    raise "Cuadro IPC not found" if captures.empty?
    captures.first.to_i
  end

  ##
  # 'Flattens' word-wrapped rows: when every column from the second onwards
  # is empty in the previous row, the current row is merged into it.
  # The "Regiones" row is skipped because it is a title.
  def flatten_wrapped_rows(rows)
    rows.each_with_object([]) do |row, merged|
      next if row[0].to_s.strip == "Regiones"

      previous = merged.last
      if previous && Array(previous[1..-1]).all? { |cell| cell == '' }
        merged[-1] = merge_rows(previous, row)
      else
        merged << row
      end
    end
  end

  ##
  # Joins two rows cell by cell with a single space. Missing cells count as
  # empty and the result is stripped, so empty cells stay '' and a label
  # wrapped over three or more lines keeps merging.
  def merge_rows(upper, lower)
    width = [upper.size, lower.size].max
    (0...width).map { |i| "#{upper[i]} #{lower[i]}".strip }
  end
end
| end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment