Created
December 6, 2021 17:47
-
-
Save agarie/31c39bd3436d7c5916454e3547ee796f to your computer and use it in GitHub Desktop.
A script to convert the posts from a WordPress SQL backup into a CSV for easier handling with other tools. Important comment: "Typical reminder to not write a parser half drunk at 3AM".
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Parse a SQL file exported from a WordPress site containing the posts backup, | |
# generally named `wphf_posts.sql`, and create a file `wphf_posts.sql.csv` | |
# containing the data in CSV format. | |
# | |
# I could've used a proper sql parser but was in the mood to write some shitty code lol | |
require 'csv' | |
WP_POSTS_SQL_FILE = ARGV[0] | |
WP_POSTS_CSV_FILE = WP_POSTS_SQL_FILE + ".csv" | |
# Get a line of the form "INSERT INTO (...) VALUES (...)" | |
# and parse its values into an array. | |
def parse(raw_values) | |
characters = raw_values.chars | |
# Iterate on characters, accumulating values into an array. | |
numeric = false | |
string = false | |
outside_value = true | |
current_value = [] | |
# Typical reminder to not write parsers half drunk at 3AM | |
([nil] + characters + [nil]).each_cons(2).reduce([]) do |acc, cs| | |
# Start parsing a new value | |
if outside_value | |
# String | |
if cs[1] == "'" | |
outside_value = false | |
string = true | |
end | |
# Number | |
if cs[1] =~ /\d/ | |
outside_value = false | |
numeric = true | |
end | |
else | |
# Parse a number | |
if numeric | |
current_value << cs[0] | |
# Finished parsing a number | |
if cs[1] == "," || cs[1].nil? | |
acc << current_value.join | |
current_value = [] | |
numeric = false | |
outside_value = true | |
end | |
end | |
if string | |
# check if it's end of string | |
if cs[1] == "'" | |
# It's an escaped single quote, alright | |
if cs[0] == "\\" | |
current_value << cs[1] | |
else | |
# An unescaped single quote signals end of string | |
acc << current_value.join | |
current_value = [] | |
string = false | |
outside_value = true | |
end | |
else | |
# Next character isn't a single quote, so just accept it | |
current_value << cs[1] | |
end | |
end | |
end | |
acc | |
end | |
end | |
if !File.exists? WP_POSTS_SQL_FILE | |
STDERR.puts "File #{WP_POSTS_SQL_FILE} should be in the same directory as this script." | |
STDERR.puts "USAGE:" | |
STDERR.puts " ruby #{$0}" | |
exit 1 | |
end | |
# results = [] | |
CSV.open(WP_POSTS_CSV_FILE, "w") do |output_file| | |
create_header = true | |
File.open(WP_POSTS_SQL_FILE, "r") do |f| | |
f.each_line do |line| | |
if line =~ /^INSERT INTO `wphf_posts`/ | |
# atrocious | |
if create_header | |
raw_values = /INSERT INTO `wphf_posts` \((.*)\) VALUES/.match(line)[1].gsub("`", "'") | |
output_file << parse(raw_values) | |
create_header = false | |
end | |
raw_values = /VALUES \((.*)\)/.match(line)[1] | |
output_file << parse(raw_values) | |
end | |
end | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment