-
-
Save lavoiesl/9a08e399fc9832d12794 to your computer and use it in GitHub Desktop.
// gcc -O2 -Wall -pedantic process-mysqldump.c -o process-mysqldump | |
// Usage: cat dump.sql | process-mysqldump | |
// Or : process-mysqldump dump.sql | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdbool.h> | |
#include <string.h> | |
#define BUFFER 100000 | |
bool is_escaped(char* string, int offset) { | |
if (offset == 0) { | |
return false; | |
} else if (string[offset - 1] == '\\') { | |
return !is_escaped(string, offset - 1); | |
} else { | |
return false; | |
} | |
} | |
bool is_commented(char* string) { | |
char buffer[4]; | |
sprintf(buffer, "%.3s", string); | |
return strcmp(buffer, "-- ") == 0; | |
} | |
int main(int argc, char *argv[]) | |
{ | |
FILE* file = argc > 1 ? fopen(argv[1], "r") : stdin; | |
char buffer[BUFFER]; | |
char* line; | |
int pos; | |
int parenthesis = 0; | |
bool quote = false; | |
bool escape = false; | |
bool comment = false; | |
while (fgets(buffer, BUFFER, file) != NULL) { | |
line = buffer; | |
// skip commented | |
if (comment || is_commented(line)) { | |
comment = line[strlen(line) - 1] != '\n'; | |
fputs(line, stdout); | |
} else { | |
pos = 0; | |
nullchar: | |
while (line[pos] != '\0') { | |
// if we are still in escape state, we need to check first char. | |
if (!escape) { | |
// find any character in ()' | |
pos = strcspn(line, "()'\\"); | |
} | |
if (pos > 0) { | |
// print before match | |
printf("%.*s", pos, line); | |
} | |
switch (line[pos]) { | |
case '(': | |
if (!quote) { | |
if (parenthesis == 0) { | |
putchar('\n'); | |
} | |
parenthesis++; | |
} | |
if (escape) { | |
escape = false; | |
} | |
break; | |
case ')': | |
if (!quote) { | |
if (parenthesis > 0) { | |
parenthesis--; | |
} else { | |
// whoops | |
puts("\n"); | |
fputs(line, stdout); | |
fputs("Found closing parenthesis without opening one.\n", stderr); | |
exit(1); | |
} | |
} | |
if (escape) { | |
escape = false; | |
} | |
break; | |
case '\\': | |
escape = !escape; | |
break; | |
case '\'': | |
if (escape) { | |
escape = false; | |
} else { | |
quote = !quote; | |
} | |
break; | |
case '\0': | |
goto nullchar; | |
default: | |
if (escape) { | |
escape = false; | |
} | |
break; | |
} | |
// print char then skip it (to make sure we don’t double match) | |
putchar(line[pos]); | |
line = line + pos + 1; | |
pos = 0; | |
} | |
} | |
} | |
return 0; | |
} |
<?php | |
// Usage: cat dump.sql | php process-mysqldump.php | |
$input = fopen('php://stdin', 'r'); | |
while(!feof($input)) { | |
$line = fgets($input); | |
if (substr($line, 0, 6) == 'INSERT') { | |
process_line($line); | |
} else { | |
echo $line; | |
} | |
} | |
function process_line($line) { | |
$length = strlen($line); | |
$pos = strpos($line, ' VALUES ') + 8; | |
echo substr($line, 0, $pos); | |
$parenthesis = false; | |
$quote = false; | |
$escape = false; | |
for ($i = $pos; $i < $length; $i++) { | |
switch($line[$i]) { | |
case '(': | |
if (!$quote) { | |
if ($parenthesis) { | |
throw new Exception('double open parenthesis'); | |
} else { | |
echo PHP_EOL; | |
$parenthesis = true; | |
} | |
} | |
$escape = false; | |
break; | |
case ')': | |
if (!$quote) { | |
if ($parenthesis) { | |
$parenthesis = false; | |
} else { | |
throw new Exception('closing parenthesis without open'); | |
} | |
} | |
$escape = false; | |
break; | |
case '\\': | |
$escape = !$escape; | |
break; | |
case "'": | |
if ($escape) { | |
$escape = false; | |
} else { | |
$quote = !$quote; | |
} | |
break; | |
default: | |
$escape = false; | |
break; | |
} | |
echo $line[$i]; | |
} | |
} | |
fclose($input); |
good job, thanks! couple of notes:
change the open mode from "r" to "rb", otherwise you might suffer silent data corruption when exporting on a windows system (values with \n might suddenly become \r\n when exported on windows - and linux is unaffected by this change)
also, the PHP version would probably be faster if you dropped $input and used STDIN instead (magic constant, i think fopen(php://stdin) use some ugly stream_copy_to_stream stuff behind the scene, because it actually creates a new file descriptor when inspecting /proc/ , at least on linux...)
also, in PHP, pre-increment (++$i) is faster than post-increment ($i++) - this is also true for C, but gcc auto-optimizes that on -O1 and above.
also, replace this
while(!feof($input)) {
$line = fgets($input);
with this
while(false!==($line = fgets($input)) ) {
and it will be functionally equivalent, but faster (saving a syscall at least - keep in mind that STDIN is in blocking mode by default, and fgets() returns bool(false) on EOF, which is the only thing you were using feof() for)
- also, the PHP code could probably get a lot faster by doing ob_start() ob_end_flush() between process_line, because stupid php use write() instead of fwrite() internally, which is completely unbuffered, php basically does the equivalent of printf(...); fflush(stdout) on every echo.. which, in this code, is every byte (
echo $line[$i];
prints 1 byte at a time).. doing
if (substr($line, 0, 6) == 'INSERT') {
ob_start();
process_line($line);
ob_end_flush();
} else {
would probably make the script much faster. (and modifying process_line to return a string instead of printing, and do echo process_line($line);
would probably be even faster than ob_start/end_flush)
(also something could be said about adding 100000 as the 2nd argument to fgets, it would probably get faster, but i haven't done benchmarks, and im not 100% positive - also it would introduce the 100000 quote C-only bug in PHP)
here is an optimized php version: https://gist.github.com/divinity76/494fef3d2ea1b800bb98e13db9c3ec3a#file-process-mysqldump-php
here are some benchmarks, comparing the C version with the original PHP version with the optimized PHP version:
./bench.php 'cat dump.sql | ./process-mysqldump' >/dev/null
0.268805
0.275636
0.277831
0.274316
0.308055
./bench.php 'cat dump.sql | php original.php' >/dev/null
11.146122
11.119936
10.938375
11.308797
10.875135
./bench.php 'cat dump.sql | php optimized.php' >/dev/null
2.930665
2.974056
2.947326
2.916559
2.994324
- the optimized PHP version is now about 3.7 times faster than the original, while still being about 10.7 times slower than the C version..
¯\_(ツ)_/¯
du -h dump.sql
1.1M dump.sql
bench.php: https://gist.github.com/divinity76/b0471cfa6596c92dd6aba2078ea0fde8
(also, i can optimize the PHP version even further, but i won't bother, i wouldn't be able to gain much speed at this point anyway - specifically, i can optimize away a few function call overheads)
did some further optimizations to the PHP version, now it's about 8.4 times faster than the original PHP version,
and about 4.8 times slower than the C version.
1.323755
1.353491
1.351569
1.332137
1.386095
- could speed it up a lot by doing
$ret .= $line[$i];
$to = strcspn($line, '()\\\'', $i+1);
if ($to !== 0) {
$ret .= substr($line, $i+1, $to);
$i += ($to);
}
What license is this code?
Hello.
Thanks for your great work, but there is a bug. If someone writes in a varchar "-- " at the 100000th character, your code will think it's a comment. In the C version.
You should replace the "if(comment ..." with "if (!quote && (comment || is_commented(line))) {"
It took me a long time to figure that bug out... Thanks a lot
Thanks a lot, very useful!