MySQL script to import stackexchange data dumps (as retrieved from here).
The accompanying Ruby script will un-escape various XML-escaped characters that the MySQL import process fails to do.
# Copyright (c) 2013 Georgios Gousios | |
# MIT-licensed | |
create database stackoverflow DEFAULT CHARACTER SET utf8 DEFAULT COLLATE utf8_general_ci; | |
use stackoverflow; | |
create table badges ( | |
Id INT NOT NULL PRIMARY KEY, | |
UserId INT, | |
Name VARCHAR(50), | |
CreationDate DATETIME | |
); | |
CREATE TABLE comments ( | |
Id INT NOT NULL PRIMARY KEY, | |
PostId INT NOT NULL, | |
Score INT NOT NULL DEFAULT 0, | |
Text TEXT, | |
CreationDate DATETIME, | |
UserId INT NOT NULL | |
); | |
CREATE TABLE post_history ( | |
Id INT NOT NULL PRIMARY KEY, | |
PostHistoryTypeId SMALLINT NOT NULL, | |
PostId INT NOT NULL, | |
RevisionGUID VARCHAR(36), | |
CreationDate DATETIME, | |
UserId INT NOT NULL, | |
Text TEXT | |
); | |
CREATE TABLE posts ( | |
Id INT NOT NULL PRIMARY KEY, | |
PostTypeId SMALLINT, | |
AcceptedAnswerId INT, | |
ParentId INT, | |
Score INT NULL, | |
ViewCount INT NULL, | |
Body text NULL, | |
OwnerUserId INT NOT NULL, | |
LastEditorUserId INT, | |
LastEditDate DATETIME, | |
LastActivityDate DATETIME, | |
Title varchar(256) NOT NULL, | |
Tags VARCHAR(256), | |
AnswerCount INT NOT NULL DEFAULT 0, | |
CommentCount INT NOT NULL DEFAULT 0, | |
FavoriteCount INT NOT NULL DEFAULT 0, | |
CreationDate DATETIME | |
); | |
CREATE TABLE users ( | |
Id INT NOT NULL PRIMARY KEY, | |
Reputation INT NOT NULL, | |
CreationDate DATETIME, | |
DisplayName VARCHAR(50) NULL, | |
LastAccessDate DATETIME, | |
Views INT DEFAULT 0, | |
WebsiteUrl VARCHAR(256) NULL, | |
Location VARCHAR(256) NULL, | |
AboutMe TEXT NULL, | |
Age INT, | |
UpVotes INT, | |
DownVotes INT, | |
EmailHash VARCHAR(32) | |
); | |
CREATE TABLE votes ( | |
Id INT NOT NULL PRIMARY KEY, | |
PostId INT NOT NULL, | |
VoteTypeId SMALLINT, | |
CreationDate DATETIME | |
); | |
load xml infile 'Badges.xml' | |
into table badges | |
rows identified by '<row>'; | |
load xml infile 'Comments.xml' | |
into table comments | |
rows identified by '<row>'; | |
load xml infile 'PostHistory.xml' | |
into table post_history | |
rows identified by '<row>'; | |
load xml infile 'Posts.xml' | |
into table posts | |
rows identified by '<row>'; | |
load xml infile 'Users.xml' | |
into table users | |
rows identified by '<row>'; | |
load xml infile 'Votes.xml' | |
into table votes | |
rows identified by '<row>'; | |
create index badges_idx_1 on badges(UserId); | |
create index comments_idx_1 on comments(PostId); | |
create index comments_idx_2 on comments(UserId); | |
create index post_history_idx_1 on post_history(PostId); | |
create index post_history_idx_2 on post_history(UserId); | |
create index posts_idx_1 on posts(AcceptedAnswerId); | |
create index posts_idx_2 on posts(ParentId); | |
create index posts_idx_3 on posts(OwnerUserId); | |
create index posts_idx_4 on posts(LastEditorUserId); | |
create index votes_idx_1 on votes(PostId); |
#!/usr/bin/env ruby | |
# Copyright (c) 2013 Georgios Gousios | |
# MIT-licensed | |
require 'sequel' | |
require 'mysql2' | |
require 'cgi' | |
Sequel::Database.extension :pagination | |
DB = Sequel.connect("mysql2://#{ARGV[1]}:#{ARGV[2]}@#{ARGV[0]}/#{ARGV[3]}") | |
table = ARGV[4].to_sym | |
field = ARGV[5].to_sym | |
lines = 0 | |
lines_with_escapes = 0 | |
DB.from(table.to_sym).select(:Id, field).each_page(1000) do |page| | |
page.each do |row| | |
lines += 1 | |
text = row[field] | |
next if text.nil? | |
text_unescaped = CGI.unescapeHTML(text) | |
if text_unescaped != text | |
lines_with_escapes += 1 | |
DB[table].filter(:Id => row[:Id]).update(field => text_unescaped) | |
end | |
print "\r #{lines} lines, #{lines_with_escapes} with escapes" | |
end | |
end |
MySQL script to import stackexchange data dumps (as retrieved from here).
The accompanying Ruby script will un-escape various XML-escaped characters that the MySQL import process fails to do.