Created
October 27, 2011 03:26
-
-
Save Apreche/1318716 to your computer and use it in GitHub Desktop.
Reformats comments from Vanilla 1 to Vanilla 2 style
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Reformat comments for the transition from Vanilla 1 to Vanilla 2
import re | |
import MySQLdb | |
from BeautifulSoup import BeautifulSoup | |
# Open a single connection to the forum database; one cursor issues the
# SELECTs and a second one issues the UPDATEs.
connection = MySQLdb.connect(
    user='root',
    db='newforum',
    use_unicode=True,
)
select_cursor, update_cursor = connection.cursor(), connection.cursor()
# IDs of comments whose body could not be parsed; reported when the script ends.
bad_comments = set()
# Reformat embedded videos: a <video type="..."> tag whose type has an entry
# in video_formats is replaced by the matching URL text, built from the tag's
# contents.
video_formats = {
    'youtube': "http://www.youtube.com/watch?v=%s ",
    'vimeo': "http://vimeo.com/%s ",
    'google': "http://video.google.com/videoplay?docid=%s ",
}
select_query = """
SELECT CommentID, Body
FROM GDN_Comment
WHERE Body LIKE '%video%'
"""
select_cursor.execute(select_query)
comments = select_cursor.fetchall()
# Tally of every video type seen, whether or not a URL format exists for it.
video_types = {}
for comment_id, body in comments:
    try:
        soup = BeautifulSoup(''.join(body))
    except Exception:
        # Unparseable body: remember the comment and move on.
        bad_comments.add(comment_id)
        continue
    video_tags = soup.findAll('video')
    if not video_tags:
        continue
    for tag in video_tags:
        if not tag.has_key('type'):
            continue
        video_type = tag['type'].lower()
        video_id = tag.decodeContents()
        video_types[video_type] = video_types.get(video_type, 0) + 1
        if video_type in video_formats:
            tag.replaceWith(video_formats[video_type] % video_id)
    # NOTE(review): nesting was reconstructed from a whitespace-stripped copy;
    # the UPDATE is assumed to run once per comment that has <video> tags.
    update_cursor.execute(
        "UPDATE GDN_Comment SET Body = %s WHERE CommentID = %s",
        (soup, comment_id),
    )
print("VIDEO TYPE COUNT: %s" % video_types)
# Remove width attribute from all img tags
select_query = """
SELECT CommentID, Body
FROM GDN_Comment
WHERE Body LIKE '%img%'
"""
select_cursor.execute(select_query)
comments = select_cursor.fetchall()
for comment_id, body in comments:
    try:
        soup = BeautifulSoup(''.join(body))
    except Exception:
        # Unparseable body: remember the comment and move on.
        bad_comments.add(comment_id)
        continue
    for img_tag in soup.findAll('img'):
        del img_tag['width']
    # Write the re-serialized body back for every comment that parsed.
    update_cursor.execute(
        "UPDATE GDN_Comment SET Body = %s WHERE CommentID = %s",
        (soup, comment_id),
    )
# Rewrite all quote citations to the new format: the <cite> element found in a
# <blockquote> is removed and its text is stored in the blockquote's rel
# attribute instead.
select_query = """
SELECT CommentID, Body
FROM GDN_Comment
WHERE Body LIKE '%blockquote%'
"""
select_cursor.execute(select_query)
comments = select_cursor.fetchall()
# Compiled once, outside the per-comment loop. Captures whatever follows a
# leading "Posted By: " prefix.
pattern = re.compile(r'^Posted By: (.*)')
for comment_id, body in comments:
    try:
        soup = BeautifulSoup(''.join(body))
    except Exception:
        # Unparseable body: remember the comment and move on.
        bad_comments.add(comment_id)
        continue
    for tag in soup.findAll('blockquote'):
        if not tag.cite:
            continue
        citation = tag.cite.decodeContents()
        match = pattern.match(citation)
        if match:
            # Keep only the name; without the prefix the full text is used.
            citation = match.group(1)
        tag.cite.extract()
        tag['rel'] = citation
    # Write the re-serialized body back for every comment that parsed.
    update_cursor.execute(
        "UPDATE GDN_Comment SET Body = %s WHERE CommentID = %s",
        (soup, comment_id),
    )
# Persist the UPDATEs before releasing the database resources. DB-API
# connections do not autocommit by default, so on transactional tables the
# changes would be rolled back when the connection closes; on
# non-transactional tables commit() does nothing.
connection.commit()
update_cursor.close()
select_cursor.close()
connection.close()
# Report the comments that were skipped because their body failed to parse.
print("BAD COMMENTS: %s" % bad_comments)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment