Created
June 7, 2013 13:16
-
-
Save geoom/5729168 to your computer and use it in GitHub Desktop.
CRUCES DE FICHEROS (JOIN MATCH + UNMATCH). Se generan tres ficheros de salida: (1) “000_A_and_B.txt”: registros que han cruzado; (2) “000_A_and_notB.txt”: registros del fichero primario que no se encuentran en el secundario; (3) “000_notA_and_B.txt”: registros del fichero secundario que no se encuentran en el primario.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Apr 21 70 74 514 | |
Apr 31 52 63 420 | |
Aug 15 34 47 316 | |
Feb 15 32 24 226 | |
Feb 26 58 80 652 | |
Jan 13 25 15 115 | |
Jan 21 36 64 620 | |
Jul 24 34 67 436 | |
Jun 31 42 75 492 | |
Mar 15 24 34 228 | |
Mar 24 75 70 495 | |
May 16 34 29 208 | |
Nov 20 87 82 577 | |
Oct 29 54 68 525 | |
Sep 13 55 37 277 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Apr Abril | |
Aug Agosto | |
Dec Diciembre | |
Feb Febrero | |
Jul Julio | |
Jun Junio | |
Mar Marzo | |
May Mayo | |
Nov Noviembre | |
Oct Octubre |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/awk -f
#
# Sorted-merge join of two text files on their first whitespace-separated field.
#
# Inputs (both must be sorted by the key field, in the same order awk's
# comparison operators use; the secondary file must not contain duplicate keys):
#   primary_file   - read as awk's main input
#   secondary_file - read record by record with getline
#
# Outputs:
#   match_file             - primary records whose key exists in the secondary
#                            file, followed by the secondary record's 2nd field
#   primary_unmatch_file   - primary records whose key is not in the secondary
#   secondary_unmatch_file - secondary records whose key is not in the primary
#
# NOTE(review): awk compares two fields numerically when both look like
# numbers, and as strings otherwise; the sort order of the input files must
# agree with that for the merge to be correct.

BEGIN {
    # Input files
    primary_file   = "fichero_A.sort.txt"
    secondary_file = "fichero_B.sort.txt"

    # Output files
    match_file             = "000_A_and_B.txt"
    primary_unmatch_file   = "000_A_and_notB.txt"
    secondary_unmatch_file = "000_notA_and_B.txt"

    # Force awk to read only the primary file as main input.
    ARGC = 1 + 1    # one more than the number of input files we list
    ARGV[1] = primary_file

    # Prime the merge with the first secondary record.
    have_secondary    = 0
    secondary_matched = 0
    next_secondary()
}

# Advance to the next secondary record.
# Before moving on, the record being left behind is written to
# secondary_unmatch_file if no primary record ever matched it.
# Sets: reg (raw record), r[] (its fields), secondary_pkey (its key),
#       have_secondary (1 while a record is loaded, 0 at EOF or on error).
function next_secondary() {
    if (have_secondary && !secondary_matched) {
        print reg > secondary_unmatch_file
    }

    status = (getline reg < secondary_file)
    if (status == 1) {
        split(reg, r, " ")      # fields of the secondary record in r[1..n]
        secondary_pkey    = r[1]
        secondary_matched = 0
        have_secondary    = 1
    } else {
        if (status < 0) {
            # Read error (e.g. file missing): warn instead of silently
            # behaving as if the secondary file were empty.
            printf "warning: cannot read secondary file \"%s\"\n", secondary_file | "cat 1>&2"
            close("cat 1>&2")
        }
        have_secondary = 0
    }
}

# One primary record per invocation.
{
    primary_pkey = $1

    # Skip every secondary record whose key is smaller than the current
    # primary key (a loop, not a single step: several secondary keys may lie
    # between two consecutive primary keys).
    while (have_secondary && primary_pkey > secondary_pkey) {
        next_secondary()
    }

    if (have_secondary && primary_pkey == secondary_pkey) {
        # Keys match. The secondary record stays loaded so that further
        # primary records with the same key can match it as well.
        print $0, r[2] > match_file
        secondary_matched = 1
    } else {
        # Either the secondary file is exhausted or its current key is
        # already greater: this primary record has no partner.
        print $0 > primary_unmatch_file
    }
}

END {
    # Flush whatever is left in the secondary file: the current record (if it
    # never matched) and every record after it are absent from the primary.
    while (have_secondary) {
        next_secondary()
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment