Last active
July 18, 2024 22:12
-
-
Save cpjeanpaul/1387c1a28d78f15ef070e3e0f90590a8 to your computer and use it in GitHub Desktop.
Procesar padrón reducido de SUNAT para importarlo a una tabla de postgres usando COPY
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// iconv -f iso-8859-1 -t utf-8 padron_reducido_ruc.txt > utf8_padron_reducido_ruc.txt | |
echo "Limpiar caracteres especiales\n"; | |
$data = file_get_contents('utf8_padron_reducido_ruc.txt'); | |
$data = str_replace(array('\'', '"', '-'), '', $data); | |
file_put_contents('utf8_padron_reducido_ruc.txt', $data); | |
echo "Limpiar caracteres especiales: LISTO\n"; | |
$replace = array('Š' => 'S', 'š' => 's', 'Ž' => 'Z', 'ž' => 'z', 'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A', 'Æ' => 'A', 'Ç' => 'C', 'È' => 'E', 'É' => 'E', | |
'Ê' => 'E', 'Ë' => 'E', 'Ì' => 'I', 'Í' => 'I', 'Î' => 'I', 'Ï' => 'I', 'Ñ' => 'N', 'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' => 'O', 'Ø' => 'O', 'Ù' => 'U', | |
'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U', 'Ý' => 'Y', 'Þ' => 'B', 'ß' => 'Ss', 'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a', 'å' => 'a', 'æ' => 'a', 'ç' => 'c', | |
'è' => 'e', 'é' => 'e', 'ê' => 'e', 'ë' => 'e', 'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i', 'ð' => 'o', 'ñ' => 'n', 'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', | |
'ö' => 'o', 'ø' => 'o', 'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ý' => 'y', 'þ' => 'b', 'ÿ' => 'y'); | |
echo "Generando output.csv\n"; | |
$i = 0; | |
if (($handle1 = fopen("utf8_padron_reducido_ruc.txt", "r")) !== FALSE) { | |
if (($handle2 = fopen("output.csv", "w")) !== FALSE) { | |
while (($data = fgetcsv($handle1, 000, "|")) !== FALSE) { | |
$i++; | |
if ($i == 1) { | |
continue; | |
} | |
// mostrar mensaje cada 100K | |
if (($i % 100000) == 0) { | |
echo $i . "\n"; | |
} | |
// Solo mantener RUCs activos | |
if (trim($data['2']) != 'ACTIVO') { | |
continue; | |
} | |
$data = array_pad($data, 15, ''); | |
$data[0] = strtr($data[0], $replace); | |
$data[1] = strtr($data[1], $replace); | |
$data[2] = strtr($data[2], $replace); | |
$data[3] = strtr($data[3], $replace); | |
$data[4] = strtr($data[4], $replace); | |
$data[5] = strtr($data[5], $replace); | |
$data[6] = strtr($data[6], $replace); | |
$data[7] = strtr($data[7], $replace); | |
$data[8] = strtr($data[8], $replace); | |
$data[9] = strtr($data[9], $replace); | |
$data[10] = strtr($data[10], $replace); | |
$data[11] = strtr($data[11], $replace); | |
$data[12] = strtr($data[12], $replace); | |
$data[13] = strtr($data[13], $replace); | |
$data[14] = strtr($data[14], $replace); | |
$data = array_slice($data, 0, 15); | |
fputcsv($handle2, $data, '|'); | |
} | |
fclose($handle2); | |
} | |
fclose($handle1); | |
} | |
echo "Generando output.csv: LISTO\n"; | |
$data = file_get_contents('output.csv'); | |
echo "Limpiando output.csv\n"; | |
$data = str_replace(array('\'', '"'), '', $data); | |
file_put_contents('output.csv', $data); | |
echo "Limpiando output.csv: LISTO\n"; | |
/** | |
* DROP TABLE padron; | |
* TRUNCATE TABLE padron; | |
* | |
* CREATE TABLE public.padron | |
* ( | |
* ruc character varying(15) NOT NULL, | |
* razon character varying(255), | |
* estado character varying(25), | |
* domicilio character varying(25), | |
* ubigeo character varying(6), | |
* via character varying(150), | |
* nombre_via character varying(150), | |
* codigo_zona character varying(150), | |
* tipo_zona character varying(150), | |
* numero character varying(100), | |
* interior character varying(50), | |
* lote character varying(50), | |
* departamento character varying(50), | |
* manzana character varying(50), | |
* kilometro character varying(50) | |
* ) | |
* WITH ( | |
* OIDS=FALSE | |
* ); | |
* ALTER TABLE public.padron | |
* OWNER TO postgres; | |
* | |
* COPY padron FROM '../output.csv' DELIMITER '|' CSV | |
* | |
* CREATE INDEX CONCURRENTLY padron_ruc ON padron(ruc); | |
* CREATE INDEX CONCURRENTLY padron_razon ON padron(razon); | |
* | |
* REINDEX INDEX padron_ruc; | |
* REINDEX INDEX padron_razon; | |
* | |
* select * from padron where ruc like '%%'; | |
* select * from padron where (razon like '%%') or (ruc like '%%') ; | |
* -- via nombre_via 'km.' kilometro 'Nro' numero 'Mza.' manzana 'Lote.' lote 'Dpto' departamento codigo_zona tipo_zona | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment