Created
February 26, 2014 19:45
-
-
Save meeuw/9236957 to your computer and use it in GitHub Desktop.
script to fix a messed up mysql database with invalid collation / charset and invalid use of utf8encode / utf8decode in latin1 tables.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import MySQLdb | |
import unicodedata | |
mysql_cred = {'host':'localhost', 'user':'root', 'passwd':'new-password', 'db':'test'} | |
conn_latin1 = MySQLdb.connect(**mysql_cred) | |
c_latin1 = conn_latin1.cursor(MySQLdb.cursors.DictCursor) | |
c_latin1.execute("SELECT id FROM texts") | |
ids = [] | |
while 1: | |
row = c_latin1.fetchone() | |
if not row: break | |
ids.append(row['id']) | |
mysql_cred['charset'] = 'utf8' | |
conn_utf8 = MySQLdb.connect(**mysql_cred) | |
c_utf8 = conn_utf8.cursor(MySQLdb.cursors.DictCursor) | |
c_utf8.execute("SELECT id FROM texts") | |
f = open('output.sql', 'wb') | |
for id in ids: | |
c_utf8.execute("SELECT * FROM texts WHERE id = %s", id) | |
row_utf8 = c_utf8.fetchone() | |
c_latin1.execute("SELECT * FROM texts WHERE id = %s", id) | |
row_latin1 = c_latin1.fetchone() | |
if row_latin1['thevalue'] != '': | |
thevalues = [] | |
try: | |
thevalues.append(row_latin1['thevalue'].decode('utf8', 'strict')) | |
except UnicodeDecodeError: pass | |
thevalues.append(row_utf8['thevalue']) | |
best = '' | |
bestscore = -99999 | |
for thevalue in thevalues: | |
score = -len(thevalue) | |
for c in thevalue: | |
if c == u'?': score -= 1 | |
if bestscore < score: | |
best = thevalue | |
bestscore = score | |
c_utf8.execute("UPDATE texts SET thevalue = %s WHERE id = %s", (best, id)) | |
f.write(c_utf8._last_executed+';\n') | |
f.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment