MySQL script to import Stack Exchange data dumps (see the Archive.org link below).

The 2014 data is available on Archive.org here: https://archive.org/details/stackexchange

The accompanying Ruby script un-escapes various XML-escaped characters that the MySQL import process fails to decode.

The accompanying Python script creates a Post-Tag association text file, mimicking the post-tag table available on the Stack Exchange Data Explorer. The resulting file must then be loaded with stackoverflow1.sql.
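As a rough illustration of that mapping (the post Id and tag Ids below are made up, not taken from the dump), each post's Tags string becomes one PostId/TagId pair per tag:

import re

# Illustration only: hypothetical Ids, not real Stack Overflow data.
sample_encoder = {"python": 16, "mysql": 87}   # TagName -> tags.Id
sample_post = (4217109, "<python><mysql>")     # (posts.Id, posts.Tags)

pairs = [(sample_post[0], sample_encoder[t])
         for t in re.findall(r"<([^>]+)>", sample_post[1])]
print(pairs)  # [(4217109, 16), (4217109, 87)]
# Each pair is written tab-separated to post_tags.txt and bulk-loaded by stackoverflow1.sql.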

from sqlalchemy import create_engine
from collections import namedtuple
import re
from tqdm.auto import tqdm

# Note: this script uses Engine.execute(), which requires SQLAlchemy < 2.0.
Tag = namedtuple('Tag', 'id name count regex')

# Connection settings for the stackoverflow database created by stackoverflow.sql
USERNAME = ''
PASSWORD = ''
URL = 'localhost'
PORT = '3306'

# Matches the contents of each tag in posts.Tags, e.g. "<python><mysql>"
FIND_TAGS = re.compile(r"<([^>]+)>")


def make_tag(id, name, count):
    r = "<{}>".format(name)
    return Tag(id, name, count, r)


def get_tags(engine):
    """Fetch all tags, most used first."""
    get_tags_stmt = """
    SELECT tags.Id, tags.TagName, tags.Count FROM tags
    ORDER BY tags.Count DESC
    """
    result = engine.execute(get_tags_stmt)
    fetch_result = result.fetchall()
    rtags = [make_tag(*r) for r in fetch_result]
    return rtags


def make_tag_encoder(engine):
    """Map TagName -> tags.Id."""
    tag2id = {t.name: t.id for t in get_tags(engine=engine)}
    return tag2id


def encode_tags(tags, encoder, tag_pattern=FIND_TAGS):
    """Turn a raw Tags string like '<python><mysql>' into a list of tag Ids."""
    if not tags:
        return None
    tags = tag_pattern.findall(tags)
    if not tags:
        return None
    encoded_tags = [encoder.get(t, None) for t in tags]
    encoded_tags = list(filter(lambda x: x is not None, encoded_tags))
    if not encoded_tags:
        return None
    return encoded_tags


def get_posts(last_post_id, limit, engine):
    """Fetch the next `limit` tagged posts after `last_post_id` (keyset pagination)."""
    limit_statement = "LIMIT {};".format(limit)
    if not last_post_id:
        where_statement = "WHERE Tags IS NOT NULL"
    else:
        where_statement = "WHERE Tags IS NOT NULL AND Id > {}".format(last_post_id)
    stmt = """SELECT Id, Tags FROM posts
    {where}
    ORDER BY Id
    {limit}
    """.format(where=where_statement, limit=limit_statement)
    result = engine.execute(stmt)
    fetch_result = result.fetchall()
    return fetch_result


def export_tag_posts(f, engine, step_size=10_000):
    """Write one 'PostId<TAB>TagId' line per post/tag pair to the file `f`."""
    n_posts = engine.execute(
        """
        SELECT Count(Id) FROM posts
        WHERE posts.Tags IS NOT NULL
        """).first()[0]
    print("Number of Posts : {}".format(n_posts))
    tag2id = make_tag_encoder(engine)
    # Round up so the final partial batch is not skipped
    n_steps = -(-n_posts // step_size)
    pb = tqdm(total=n_steps, desc='Getting Post Tags')
    last_post_id = None
    with open(f, 'w+', encoding='utf-8') as fp:
        for _ in range(n_steps):
            step_posts = get_posts(last_post_id=last_post_id, limit=step_size, engine=engine)
            if not step_posts:
                break
            last_post_id = step_posts[-1][0]
            for post_id, tags in step_posts:
                tag_ids = encode_tags(tags, encoder=tag2id)
                if not tag_ids:
                    continue
                for tag_id in tag_ids:
                    fp.write("\t".join([str(post_id), str(tag_id)]) + "\n")
            pb.update(1)
    pb.close()


if __name__ == '__main__':
    engine = create_engine(
        'mysql://{username}:{password}@{url}:{port}/stackoverflow?charset=utf8mb4'.format(
            username=USERNAME, password=PASSWORD, url=URL, port=PORT
        )
    )
    export_tag_posts("post_tags.txt", engine=engine)
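Before loading the file into MySQL, it can be spot-checked; a minimal sketch, assuming the script above has already written post_tags.txt to the working directory:

# Print the first five PostId/TagId pairs written by export_tag_posts.
with open("post_tags.txt", encoding="utf-8") as fp:
    for _, line in zip(range(5), fp):
        post_id, tag_id = line.rstrip("\n").split("\t")
        print(post_id, tag_id)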
# Copyright (c) 2013 Georgios Gousios
# MIT-licensed
# [megansquire] Edited to handle new column names (Date instead of CreationDate)
# [estasney] Added suggestions from RoelVdP; improved performance by disabling checks during import
SET @@SESSION.SQL_MODE='';
create database stackoverflow DEFAULT CHARACTER SET utf8mb4 DEFAULT COLLATE utf8mb4_bin;
use stackoverflow;
SET unique_checks=0;
SET foreign_key_checks=0;
create table badges (
Id INT NOT NULL PRIMARY KEY,
UserId INT,
Name VARCHAR(50),
Date DATETIME
);
CREATE TABLE comments (
Id INT NOT NULL PRIMARY KEY,
PostId INT NOT NULL,
Score INT NOT NULL DEFAULT 0,
Text TEXT,
CreationDate DATETIME,
UserId INT NOT NULL
);
CREATE TABLE post_history (
Id INT NOT NULL PRIMARY KEY,
PostHistoryTypeId SMALLINT NOT NULL,
PostId INT NOT NULL,
RevisionGUID VARCHAR(36),
CreationDate DATETIME,
UserId INT NOT NULL,
Text TEXT
);
CREATE TABLE post_links (
Id INT NOT NULL PRIMARY KEY,
CreationDate DATETIME DEFAULT NULL,
PostId INT NOT NULL,
RelatedPostId INT NOT NULL,
LinkTypeId INT DEFAULT NULL
);
CREATE TABLE posts (
Id INT NOT NULL PRIMARY KEY,
PostTypeId SMALLINT,
AcceptedAnswerId INT,
ParentId INT,
Score INT NULL,
ViewCount INT NULL,
Body text NULL,
OwnerUserId INT NOT NULL,
OwnerDisplayName varchar(256),
LastEditorUserId INT,
LastEditDate DATETIME,
LastActivityDate DATETIME,
Title varchar(256) NOT NULL,
Tags VARCHAR(256),
AnswerCount INT NOT NULL DEFAULT 0,
CommentCount INT NOT NULL DEFAULT 0,
FavoriteCount INT NOT NULL DEFAULT 0,
CreationDate DATETIME,
DeletionDate DATETIME NULL
);
CREATE TABLE tags (
Id INT NOT NULL PRIMARY KEY,
TagName VARCHAR(50) CHARACTER SET latin1 DEFAULT NULL,
Count INT DEFAULT NULL,
ExcerptPostId INT DEFAULT NULL,
WikiPostId INT DEFAULT NULL
);
CREATE TABLE users (
Id INT NOT NULL PRIMARY KEY,
Reputation INT NOT NULL,
CreationDate DATETIME,
DisplayName VARCHAR(50) NULL,
LastAccessDate DATETIME,
Views INT DEFAULT 0,
WebsiteUrl VARCHAR(256) NULL,
Location VARCHAR(256) NULL,
AboutMe TEXT NULL,
Age INT,
UpVotes INT,
DownVotes INT,
EmailHash VARCHAR(32)
);
CREATE TABLE votes (
Id INT NOT NULL PRIMARY KEY,
PostId INT NOT NULL,
VoteTypeId SMALLINT,
CreationDate DATETIME
);
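# Without LOCAL, MySQL reads the XML files on the server host (subject to secure_file_priv);
# the LOCAL variants below read them from the client instead.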
load xml infile 'Badges.xml'
into table badges
rows identified by '<row>';
load xml infile 'Comments.xml'
into table comments
rows identified by '<row>';
load xml infile 'PostHistory.xml'
into table post_history
rows identified by '<row>';
LOAD XML LOCAL INFILE 'PostLinks.xml'
INTO TABLE post_links
ROWS IDENTIFIED BY '<row>';
load xml infile 'Posts.xml'
into table posts
rows identified by '<row>';
LOAD XML LOCAL INFILE 'Tags.xml'
INTO TABLE tags
ROWS IDENTIFIED BY '<row>';
load xml infile 'Users.xml'
into table users
rows identified by '<row>';
load xml infile 'Votes.xml'
into table votes
rows identified by '<row>';
create index badges_idx_1 on badges(UserId);
create index comments_idx_1 on comments(PostId);
create index comments_idx_2 on comments(UserId);
create index post_history_idx_1 on post_history(PostId);
create index post_history_idx_2 on post_history(UserId);
create index posts_idx_1 on posts(AcceptedAnswerId);
create index posts_idx_2 on posts(ParentId);
create index posts_idx_3 on posts(OwnerUserId);
create index posts_idx_4 on posts(LastEditorUserId);
create index votes_idx_1 on votes(PostId);
SET unique_checks=1;
SET foreign_key_checks=1;
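A quick way to sanity-check the import before generating post_tags.txt; a minimal sketch that reuses the connection settings from the Python script above (hypothetical credentials; Engine.execute assumes SQLAlchemy < 2.0):

from sqlalchemy import create_engine

# Substitute the same USERNAME/PASSWORD configured in the Python script.
engine = create_engine('mysql://user:password@localhost:3306/stackoverflow?charset=utf8mb4')
for table in ('badges', 'comments', 'post_history', 'post_links',
              'posts', 'tags', 'users', 'votes'):
    count = engine.execute("SELECT COUNT(*) FROM {}".format(table)).scalar()
    print("{:<12} {}".format(table, count))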
SET @@SESSION.SQL_MODE='';
use stackoverflow;
SET unique_checks=0;
SET foreign_key_checks=0;
create table post_tags (
PostId INT NOT NULL,
TagId INT NOT NULL,
PRIMARY KEY (PostId, TagId)
);
# Ensure post_tags.txt is an absolute path
load data local infile 'post_tags.txt'
into table post_tags
fields terminated by '\t'
lines terminated by '\n';
SET unique_checks=1;
SET foreign_key_checks=1;
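Once post_tags is loaded, the association table can be queried directly; a minimal sketch of a top-tags query (again with hypothetical credentials and SQLAlchemy < 2.0):

from sqlalchemy import create_engine

engine = create_engine('mysql://user:password@localhost:3306/stackoverflow?charset=utf8mb4')

# Ten most-used tags by number of associated posts, via the new post_tags table.
top_tags = engine.execute("""
    SELECT t.TagName, COUNT(*) AS n_posts
    FROM post_tags pt
    JOIN tags t ON t.Id = pt.TagId
    GROUP BY t.TagName
    ORDER BY n_posts DESC
    LIMIT 10
""").fetchall()
for name, n in top_tags:
    print(name, n)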
#!/usr/bin/env ruby
# Copyright (c) 2013 Georgios Gousios
# MIT-licensed
#
# Un-escapes XML/HTML entities left behind by the MySQL XML import.
# Arguments: host, username, password, database, table, column
require 'sequel'
require 'mysql2'
require 'cgi'

Sequel::Database.extension :pagination

DB = Sequel.connect("mysql2://#{ARGV[1]}:#{ARGV[2]}@#{ARGV[0]}/#{ARGV[3]}")
table = ARGV[4].to_sym
field = ARGV[5].to_sym

lines = 0
lines_with_escapes = 0

DB.from(table).select(:Id, field).each_page(1000) do |page|
  page.each do |row|
    lines += 1
    text = row[field]
    next if text.nil?

    text_unescaped = CGI.unescapeHTML(text)
    if text_unescaped != text
      lines_with_escapes += 1
      DB[table].filter(:Id => row[:Id]).update(field => text_unescaped)
    end
    print "\r #{lines} lines, #{lines_with_escapes} with escapes"
  end
end
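For those without a Ruby environment, roughly the same pass can be done in Python with html.unescape and SQLAlchemy. This is a sketch of an alternative, not the original script: credentials are hypothetical, it is hard-coded to posts.Body rather than taking table/column arguments, and it assumes SQLAlchemy < 2.0.

import html
from sqlalchemy import create_engine, text

engine = create_engine('mysql://user:password@localhost:3306/stackoverflow?charset=utf8mb4')

# Un-escape leftover HTML entities in posts.Body, 1000 rows at a time (keyset pagination on Id).
last_id = 0
while True:
    rows = engine.execute(
        "SELECT Id, Body FROM posts WHERE Id > {} ORDER BY Id LIMIT 1000".format(last_id)
    ).fetchall()
    if not rows:
        break
    last_id = rows[-1][0]
    for row_id, body in rows:
        if body is None:
            continue
        unescaped = html.unescape(body)
        if unescaped != body:
            engine.execute(text("UPDATE posts SET Body = :body WHERE Id = :id"),
                           body=unescaped, id=row_id)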