Created
April 18, 2016 17:19
-
-
Save coccoinomane/83e16fb27073edddb0d551e6368b2e9f to your computer and use it in GitHub Desktop.
PHP script to find duplicates & unique entries among two separate files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * PHP script to find duplicates & unique entries among two
 * separate files.
 *
 * Usage:
 *
 *   php <script> <input_file_1> <input_file_2> [<duplicates_file> [<uniques_file>]]
 *
 * The script outputs one file with the lines that occur more than once
 * in the two inputs taken together (default: DUPLICATES.csv) and one
 * with the lines that occur exactly once (default: UNIQUES.csv).
 *
 * Note that a line repeated inside a single input file is counted as a
 * duplicate too, even if the other file does not contain it.
 *
 * Lines are compared without their line terminators, so a final line
 * lacking a trailing newline, or a CRLF file compared with an LF file,
 * still matches. When $case_insensitive is true the lines are lowercased
 * before being compared, and the lowercased form is what gets written
 * to the output files.
 *
 * Example:
 *
 * File 1:
 *   banana
 *   apple
 *   pear
 *   pineapple
 *   apple
 *
 * File 2:
 *   pineapple
 *   strawberry
 *   apple
 *   mango
 *   kiwi
 *
 * Output 1 (duplicates):
 *   apple
 *   pineapple
 *
 * Output 2 (entries occurring exactly once):
 *   banana
 *   pear
 *   strawberry
 *   mango
 *   kiwi
 *
 * TODO: Generalize the script to accept N input files. N could also
 * be 1, in which case the script just removes duplicates from the
 * input file.
 */

// ======================================================================================
// =                                      Helpers                                       =
// ======================================================================================

/**
 * Print an error message to standard error and stop with exit status 1.
 *
 * @param string $message Text to print (a newline is appended).
 */
function fail($message)
{
    fwrite(STDERR, $message . PHP_EOL);
    exit(1);
}

/**
 * Lowercase one line.
 *
 * Uses mb_strtolower() when the mbstring extension is loaded and the line
 * is valid UTF-8, so that non-ASCII letters are folded as well. Any other
 * input falls back to the byte-wise strtolower(), which leaves bytes it
 * does not understand untouched instead of replacing them.
 *
 * @param string $line
 * @return string
 */
function lowercase_line($line)
{
    if (function_exists('mb_strtolower') && mb_check_encoding($line, 'UTF-8')) {
        return mb_strtolower($line, 'UTF-8');
    }

    return strtolower($line);
}

/**
 * Strip trailing line terminators from every line and optionally
 * lowercase it, so that lines can be compared reliably.
 *
 * @param array $lines            Raw lines, e.g. as returned by file().
 * @param bool  $case_insensitive Whether to lowercase each line.
 * @return array Normalized lines, in the same order as the input.
 */
function normalize_lines(array $lines, $case_insensitive)
{
    $normalized = [];
    foreach ($lines as $line) {
        // Without this, "apple\n" and a final "apple" would never match.
        $line = rtrim($line, "\r\n");
        $normalized[] = $case_insensitive ? lowercase_line($line) : $line;
    }

    return $normalized;
}

/**
 * Split the lines of two inputs into those occurring more than once in
 * the merged list and those occurring exactly once.
 *
 * @param array $lines_1 Normalized lines of the first input.
 * @param array $lines_2 Normalized lines of the second input.
 * @return array Map with keys 'in_common' and 'unique', each a list of
 *               strings in order of first appearance.
 */
function classify_lines(array $lines_1, array $lines_2)
{
    $in_common = [];
    $unique    = [];

    foreach (array_count_values(array_merge($lines_1, $lines_2)) as $value => $count) {
        // Integer-like lines come back as int array keys; restore the string.
        $value = (string) $value;
        if ($count > 1) {
            $in_common[] = $value;
        } else {
            $unique[] = $value;
        }
    }

    return ['in_common' => $in_common, 'unique' => $unique];
}

/**
 * Read a file into an array of raw lines, stopping the script with an
 * error message if the file cannot be read.
 *
 * @param string $path
 * @return array Lines including their line terminators.
 */
function read_lines($path)
{
    if (is_dir($path) || !is_readable($path)) {
        fail("Cannot read input file: $path");
    }

    $lines = file($path);
    if ($lines === false) {
        fail("Cannot read input file: $path");
    }

    return $lines;
}

/**
 * Write one entry per line to a file, stopping the script with an error
 * message if the file cannot be written. An empty list produces an
 * empty file.
 *
 * @param string $path
 * @param array  $lines Lines without line terminators.
 */
function write_lines($path, array $lines)
{
    $contents = count($lines) > 0 ? implode(PHP_EOL, $lines) . PHP_EOL : '';
    if (file_put_contents($path, $contents) === false) {
        fail("Cannot write output file: $path");
    }
}

// Run the script only when this file is the one being executed, so that
// the helpers above can be included elsewhere without side effects.
if (get_included_files()[0] === __FILE__) {

    // ==================================================================================
    // =                                   User input                                   =
    // ==================================================================================

    if (count($argv) < 3) {
        fail(
            'Usage: php ' . basename($argv[0])
            . ' <input_file_1> <input_file_2> [<duplicates_file> [<uniques_file>]]'
        );
    }

    /* Input files */
    $input_file_1 = $argv[1];
    $input_file_2 = $argv[2];

    /* Output files */
    $output_in_common = isset($argv[3]) ? $argv[3] : "DUPLICATES.csv";
    $output_unique = isset($argv[4]) ? $argv[4] : "UNIQUES.csv";

    /* Should the match ignore case differences? */
    $case_insensitive = true;

    // ==================================================================================
    // =                                 Actual script                                  =
    // ==================================================================================

    /* Extract data from files */
    $array_1 = normalize_lines(read_lines($input_file_1), $case_insensitive);
    $array_2 = normalize_lines(read_lines($input_file_2), $case_insensitive);

    /* Count the distinct entries in each file */
    $n_unique_1 = count(array_unique($array_1));
    $n_unique_2 = count(array_unique($array_2));

    /* Find repeated and single lines across the two files */
    $classified = classify_lines($array_1, $array_2);
    $in_common = $classified['in_common'];
    $unique = $classified['unique'];
    $n_in_common = count($in_common);

    /* Print some information to screen */
    echo "$input_file_1 has ".count($array_1)." elements ($n_unique_1 unique)" . PHP_EOL;
    echo "$input_file_2 has ".count($array_2)." elements ($n_unique_2 unique)" . PHP_EOL;
    echo "They have $n_in_common lines in common" . PHP_EOL;

    /* Create and fill output files */
    write_lines($output_in_common, $in_common);
    write_lines($output_unique, $unique);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment