Created January 7, 2015 15:49
Disambiguation prototype
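The gist contains three files. The first is the clustering prototype itself: it loads pickled records, signatures and a claim-based similarity matrix, groups signatures into blocks by author name, bootstraps pairwise same-author constraints with a few hand-written rules, and cuts an agglomerative clustering at the distance threshold that maximizes the chosen score on a validation split of claimed signatures. A hypothetical invocation is sketched below; the script file name is a placeholder, and it assumes all signatures fit into a single signature dump produced by the extraction script further down:

python disambiguation_prototype.py dump-beard-records.pickle dump-beard-signatures-0000000000.pickle dump-beard-similarity.pickle --blockize last_name_first_initial --bootstrap best --verbose 1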
import numpy as np
import argparse
import cPickle

import scipy.cluster.hierarchy as hac

from itertools import groupby
from itertools import product
from scipy.sparse import lil_matrix
from scipy.sparse import issparse
from scipy.spatial.distance import squareform
from sklearn.cross_validation import train_test_split
from sklearn.metrics.cluster import v_measure_score
from sklearn.utils import check_random_state

# To be ported into beard
from beard.metrics import paired_f_score, paired_precision_score, paired_recall_score

from beard_utils import translate_to_ascii
from beard_utils import initials
from beard_utils import are_different
from beard_utils import n_cluster_diff

def load_data(file_records, file_signatures, file_similarity):
    records = cPickle.load(open(file_records, "r"))
    signatures = cPickle.load(open(file_signatures, "r"))
    similarity = cPickle.load(open(file_similarity, "r")).astype(np.int8)

    signatures = {s["signature_id"]: s for s in signatures}
    records = {r["publication_id"]: r for r in records}

    for v in records.values():
        v["references"] = set(v["references"])
        v["signatures"] = set()

    for i, s in signatures.items():
        records[s["publication_id"]]["signatures"].add(i)

    return records, signatures, similarity

def _block_last_name(name):
    return translate_to_ascii(name.split(",", 1)[0])


def _block_last_name_first_initial(name):
    last_name = translate_to_ascii(name.split(",", 1)[0])

    try:
        first_initial = translate_to_ascii(name.split(",", 1)[1]).strip()[0]
    except IndexError:  # no first name available
        return last_name

    return "%s %s" % (last_name, first_initial)

def blockize(signatures, func=_block_last_name, min_size=50):
    groups = {}

    for n, s in signatures.items():
        prefix = func(s["author_name"])

        if prefix not in groups:
            groups[prefix] = []

        groups[prefix].append(n)

    blocks = []
    smalls = []

    for k, v in sorted(groups.items()):  # todo: evaluate the effect of min_size
        if len(v) >= min_size:
            blocks.append(v)
        else:
            smalls.append((k, v))

    smalls = sorted(smalls)

    for start in range(0, len(smalls), min_size):
        b = []

        for _, v in smalls[start:start + min_size]:
            b.extend(v)

        blocks.append(b)

    return blocks

def similarity_to_clusters(S):
    clusters = np.arange(S.shape[0])

    if issparse(S):
        for i, j in sorted(zip(*S.nonzero())):
            if S[i, j] == 1:
                clusters[j] = clusters[i]

    else:
        for i in range(S.shape[0]):
            for j in range(S.shape[1]):
                if S[i, j] == 1:
                    clusters[j] = clusters[i]

    return clusters
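
# Bootstrapping strategies: "best" applies the positive/negative rules below,
# "rabbit" only links exact full-name matches, "random" fills pairs with
# values drawn uniformly in [-1, 1], "nothing" keeps only the pairs already
# copied from the claimed training data, and "perfect" (handled directly in
# clusterize) uses the gold similarity matrix itself.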

def bootstrap(S, block, records, signatures, bootstrap_algorithm, random_state=None):
    if bootstrap_algorithm == "nothing":
        np.fill_diagonal(S, 1)
        return S

    random_state = check_random_state(random_state)
    size = len(block)

    for i in range(size):  # This can easily be parallelized / cythonized
        s_i = signatures[block[i]]
        name_i = s_i["author_name"]
        aff_i = s_i["author_affiliation"]
        initials_i = initials(name_i)

        for j in range(i + 1, size):
            if S[i, j] != 0:
                continue

            s_j = signatures[block[j]]
            name_j = s_j["author_name"]
            aff_j = s_j["author_affiliation"]
            initials_j = initials(name_j)

            if bootstrap_algorithm == "best":
                # Positive rules
                if ((name_i == name_j and aff_i == aff_j) or  # Exact name & affiliation matching
                    (len(initials_i | initials_j) == max(len(initials_i), len(initials_j)) and aff_i == aff_j) or
                    (s_i["publication_id"] in records[s_j["publication_id"]]["references"]) or  # Self-citation
                    (s_j["publication_id"] in records[s_i["publication_id"]]["references"])):   # Self-citation
                    S[i, j] = 1
                    S[j, i] = 1

                # Negative rules
                elif ((s_i["publication_id"] == s_j["publication_id"]) or  # Authors are co-authors, hence distinct persons
                      (are_different(name_i, name_j))):  # Obviously different names
                    S[i, j] = -1
                    S[j, i] = -1

            elif bootstrap_algorithm == "rabbit":
                if (name_i == name_j):
                    S[i, j] = 1
                    S[j, i] = 1

            elif bootstrap_algorithm == "random":
                S[i, j] = 2 * random_state.rand() - 1

    np.fill_diagonal(S, 1)

    return S

def clusterize(block, records, signatures, similarity,
               bootstrap_algorithm="best", score=paired_f_score,
               random_state=0, verbose=0):
    # Train / test split
    size = len(block)

    train, test = train_test_split(np.arange(size),
                                   train_size=0.1,
                                   random_state=random_state)  # About 10% of signatures are claimed in prod
    train, valid = train_test_split(train,
                                    train_size=0.5,
                                    random_state=random_state)

    # Select similarities
    S = (similarity[block, :])[:, block].todense()

    # For gold data, assume that S[i, j] == -1 for all unknown values
    S[S == 0] = -1
    truth = similarity_to_clusters(S)

    if verbose > 0:
        print "Size =", size
        print "Positive pairs =", (S == 1).sum()
        print "Negative pairs =", (S == -1).sum()
        print "Clusters =", len(np.unique(truth))

        for c, elements in groupby(sorted(truth)):
            print "Cluster %d, counting %d signatures" % (c,
                                                          len(list(elements)))

    # Bootstrap a similarity matrix
    if bootstrap_algorithm != "perfect":
        S_guessed = np.zeros(S.shape, dtype=np.int8)

        for i, j in product(train, train):  # Copy all known values from train
            S_guessed[i, j] = S[i, j]

        S_guessed = bootstrap(S_guessed,
                              block,
                              records,
                              signatures,
                              bootstrap_algorithm,
                              random_state=random_state)

    else:
        S_guessed = S

    if verbose > 0:
        print "True pairs = %d %d" % ((S == 1).sum(), (S == -1).sum())
        print "Guessed pairs = %d %d" % ((S_guessed == 1).sum(),
                                         (S_guessed == -1).sum())

    # Transform signatures into vectors
    # TODO
    # (For now we directly use the guessed similarity matrix, without
    #  trying to infer the missing/unknown values.)

    # Hierarchical clustering
    # Map similarity values in [-1, 1] to distances in [0, 1] (1 -> 0, -1 -> 1)
    distances = squareform((1.0 - S_guessed) / 2.0)
    linkage_matrix = hac.linkage(distances, method="complete")

    f_best = -np.inf
    t_best = 0
    best_clusters = None

    # Evaluate cut-offs half-way between consecutive merge distances and keep
    # the one that maximizes the score on the validation signatures
    for t1, t2 in zip(np.concatenate(([0], np.array(linkage_matrix[:-1, 2])),
                                     axis=0),
                      np.array(linkage_matrix[:, 2])):
        t = (t1 + t2) / 2.0
        clusters = hac.fcluster(linkage_matrix, t, criterion="distance")
        f = score(truth[valid], clusters[valid])

        if f >= f_best:  # if >, more clusters; if >=, fewer clusters
            f_best = f
            t_best = t
            best_clusters = clusters.copy()

    return truth, best_clusters, t_best, train, valid, test

def overall_clusters(blocks, all_clusters):
    offset = 0
    clusters = np.zeros(sum(len(b) for b in blocks))

    for i, cluster in enumerate(all_clusters):
        for j, b in enumerate(blocks[i]):
            clusters[b] = cluster[j] + offset

        offset += np.max(cluster)

    clusters = clusters.astype(np.int)

    return clusters
if __name__ == "__main__": | |
# Parse command line arugments | |
parser = argparse.ArgumentParser() | |
parser.add_argument("file_records", type=str) | |
parser.add_argument("file_signatures", type=str) | |
parser.add_argument("file_similarity", type=str) | |
parser.add_argument("--blockize", default="last_name", type=str, choices=["last_name", "last_name_first_initial"]) | |
parser.add_argument("--block_min_size", default=50, type=int) | |
parser.add_argument("--bootstrap", default="best", type=str, choices=["best", "random", "rabbit", "nothing", "perfect"]) | |
parser.add_argument("--score", default="paired_f_score", type=str, choices=["paired_f_score", "v_measure_score"]) | |
parser.add_argument("--n_jobs", default=1, type=int) | |
parser.add_argument("--random_state", default=42, type=int) | |
parser.add_argument("--verbose", default=0, type=int) | |
args = parser.parse_args() | |
if args.score == "paired_f_score": | |
score = paired_f_score | |
elif args.score == "v_measure_score": | |
score = v_measure_score | |
scorers = [paired_f_score, | |
paired_precision_score, | |
paired_recall_score, | |
v_measure_score, | |
n_cluster_diff] | |
# Load data | |
records, signatures, similarity = load_data(args.file_records, | |
args.file_signatures, | |
args.file_similarity) | |
# Group signatures in blocks | |
if args.blockize == "last_name": | |
blocks = blockize(signatures, | |
func=_block_last_name, | |
min_size=args.block_min_size) | |
elif args.blockize == "last_name_first_initial": | |
blocks = blockize(signatures, | |
func=_block_last_name_first_initial, | |
min_size=args.block_min_size) | |
# Cluster signatures by block | |
all_scores_train = {} | |
all_scores_valid = {} | |
all_scores_test = {} | |
for s in scorers: | |
all_scores_train[s] = [] | |
all_scores_valid[s] = [] | |
all_scores_test[s] = [] | |
all_clusters = [] | |
for block in blocks: | |
(truth, | |
best_clusters, | |
best_threshold, | |
train, | |
valid, | |
test) = clusterize(block, records, signatures, similarity, | |
bootstrap_algorithm=args.bootstrap, | |
score=score, | |
random_state=args.random_state, | |
verbose=args.verbose) | |
if args.verbose > 0: | |
print "Best cut-off =", best_threshold | |
for s in scorers: | |
train_score = s(truth[train], best_clusters[train]) | |
valid_score = s(truth[valid], best_clusters[valid]) | |
test_score = s(truth[test], best_clusters[test]) | |
all_scores_train[s].append(train_score) | |
all_scores_valid[s].append(valid_score) | |
all_scores_test[s].append(test_score) | |
if args.verbose > 0: | |
print "Training score (%s) = %f" % (s.__name__, train_score) | |
print "Valid score (%s) = %f" % (s.__name__, valid_score) | |
print "Test score (%s) = %f" % (s.__name__, test_score) | |
if args.verbose > 0: | |
all_clusters.append(best_clusters) | |
# Average scores | |
print "Params =", args | |
weights = [len(block) for block in blocks] | |
for s in scorers: | |
print "%s, average_train_score, %f" % (s.__name__, np.average(all_scores_train[s], weights=weights)) | |
print "%s, average_valid_score, %f" % (s.__name__, np.average(all_scores_valid[s], weights=weights)) | |
print "%s, average_test_score, %f" % (s.__name__, np.average(all_scores_test[s], weights=weights)) | |
truth = similarity_to_clusters(similarity) | |
clusters = overall_clusters(blocks, all_clusters) | |
for s in scorers: | |
print "%s, overall_score, %f" % (s.__name__, s(truth, clusters)) | |
import IPython | |
IPython.embed() |
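The second file is the extraction script. It runs inside an Invenio instance and dumps the inputs consumed by the prototype above: per-signature metadata (dump-beard-signatures-*.pickle), the claim/disclaim-based similarity matrix (dump-beard-similarity.pickle) and record metadata (dump-beard-records.pickle).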
import numpy as np
import argparse
import cPickle

from itertools import groupby
from joblib import Parallel, delayed
from scipy.sparse import lil_matrix, csr_matrix

from invenio.dbquery import run_sql
from invenio.bibauthorid_dbinterface import get_title_of_paper
from invenio.bibrank_citation_searcher import get_refers_to

def get_affiliation(table, bibref_value, bibrec):
    """Returns institution name and field number of a signature."""
    table_name = str(table)[0:2] + 'x'

    q = run_sql("""SELECT f2.value, r.field_number
                   FROM bibrec AS b
                   INNER JOIN bibrec_bib%s AS r ON (r.id_bibrec = b.id)
                   INNER JOIN bib%s AS f ON (r.id_bibxxx = f.id)
                   INNER JOIN bibrec_bib%s AS r2 ON (r2.id_bibrec = b.id AND
                                                     r.field_number = r2.field_number)
                   INNER JOIN bib%s AS f2 ON (r2.id_bibxxx = f2.id)
                   WHERE b.id = %d AND
                         f.id = %d AND
                         f2.tag = '%s__u'
                """ % (table_name, table_name, table_name, table_name,
                       bibrec, bibref_value, table))

    if len(q) > 0:
        return q[0]

    else:
        q = run_sql("""SELECT field_number
                       FROM bib%s, bibrec_bib%s
                       WHERE bib%s.id = bibrec_bib%s.id_bibxxx AND
                             bib%s.id = %s AND bibrec_bib%s.id_bibrec = %s
                    """ % (table_name, table_name, table_name, table_name,
                           table_name, bibref_value, table_name, bibrec))

        if len(q) > 0:
            return None, q[0][0]

    return None, None

def _getter_sig(i, signature):
    affiliation, position = get_affiliation(signature[1],
                                            signature[2],
                                            signature[3])

    return {'signature_id': i,
            'author_name': signature[4],
            'publication_id': signature[3],
            'author_affiliation': affiliation,
            'signature_position': position}


def extract_signature_data(signatures, n_jobs=1):
    return Parallel(n_jobs=n_jobs, verbose=3)(delayed(_getter_sig)(i, signature)
                                              for i, signature
                                              in enumerate(signatures))
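
# Signatures claimed to the same person (flag == 2) are marked as similar by
# taking the outer product of the per-person claim indicator vector; the
# resulting block is written at the person's offset in the global matrix.
# Disclaimed pairs are filled in with -1 separately below.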

def populate_signature_similarity(positive_signatures):
    n_signatures = len(positive_signatures)
    similarity = lil_matrix((n_signatures, n_signatures), dtype=np.int8)
    counter = 0

    for n, (personid, signatures) in enumerate(groupby(positive_signatures,
                                                       lambda x: x[0])):
        if n % 1000 == 0:
            print n

        signatures = list(signatures)
        vector = np.zeros(len(signatures), dtype=np.int8)

        for i, s in enumerate(signatures):
            if s[-1] == 2:
                vector[i] = 1

        vector = csr_matrix(vector)
        block = vector.T * vector
        block = block.astype(np.int8)

        similarity[counter:counter + len(signatures),
                   counter:counter + len(signatures)] = block

        counter += len(signatures)

    similarity.setdiag(1)

    return similarity

def populate_signature_similarity_from_disclaims(matrix,
                                                 rejected_signatures,
                                                 signature_id_mapping,
                                                 personid_signature_mapping):
    for rejected_sig in rejected_signatures:
        if rejected_sig[1:4] not in signature_id_mapping:
            continue

        if rejected_sig[0] not in personid_signature_mapping:
            continue

        id = signature_id_mapping[rejected_sig[1:4]]
        claimed_sigs_of_person = personid_signature_mapping[rejected_sig[0]]

        for sig in claimed_sigs_of_person:
            matrix[id, signature_id_mapping[sig[1:4]]] = -1
            matrix[signature_id_mapping[sig[1:4]], id] = -1

    return matrix

if __name__ == '__main__':
    # Parse command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--prefix', default=None, type=str)
    parser.add_argument('--min_personid', default=0, type=int)
    parser.add_argument('--max_personid', default=None, type=int)
    parser.add_argument('--min_bibrec', default=0, type=int)
    parser.add_argument('--max_bibrec', default=None, type=int)
    parser.add_argument('--limit', default=None, type=int)
    parser.add_argument('--step', default=100000, type=int)
    parser.add_argument('--n_jobs', default=1, type=int)
    parser.add_argument('--gold_only', default=False, type=bool)
    args = parser.parse_args()

    # Retrieve signatures
    query = """SELECT personid, bibref_table, bibref_value,
                      bibrec, name, flag
               FROM aidPERSONIDPAPERS """

    if args.prefix or args.min_personid or args.max_personid or args.min_bibrec or args.max_bibrec or args.gold_only:
        query += "WHERE "
        clauses = []

        if args.prefix:
            clauses.append("name LIKE '%s%%'" % args.prefix)

        if args.min_personid:
            clauses.append("personid >= %d" % args.min_personid)

        if args.max_personid:
            clauses.append("personid < %d" % args.max_personid)

        if args.min_bibrec:
            clauses.append("bibrec >= %d" % args.min_bibrec)

        if args.max_bibrec:
            clauses.append("bibrec < %d" % args.max_bibrec)

        if args.gold_only:
            clauses.append("(flag = 2 OR flag = -2)")

        query += " AND ".join(clauses) + " "

    if args.limit:
        query += "LIMIT %d " % args.limit

    print query

    all_signatures = set(run_sql(query))
    positive_signatures = list(filter(lambda x: x[-1] >= 0, all_signatures))
    positive_signatures = sorted(positive_signatures,
                                 key=lambda x: (x[0], x[3]))

    print "Found %d signatures" % len(all_signatures)
    print "- %d positives" % len(positive_signatures)
    print "- %d claimed" % len(filter(lambda x: x[-1] == 2, all_signatures))
    print "- %d rejected" % len(filter(lambda x: x[-1] == -2, all_signatures))

    # Extract data from signatures
    step = args.step
    filename_sig = "dump-beard-signatures-%010d.pickle"

    print 'Step 1: Extracting data from signatures...'

    for start in range(0, len(positive_signatures), step):
        sig_data = extract_signature_data(
            positive_signatures[start:min(start + step,
                                          len(positive_signatures))],
            n_jobs=args.n_jobs)

        cPickle.dump(sig_data,
                     open(filename_sig % start, "w"),
                     protocol=cPickle.HIGHEST_PROTOCOL)

    print 'Done!'

    # Build similarity matrix
    print 'Step 2: Populating similarity matrix...'

    matrix = populate_signature_similarity(positive_signatures)

    signature_id_mapping = {}
    for i, sig in enumerate(positive_signatures):
        signature_id_mapping[sig[1:4]] = i

    personid_signature_mapping = {}
    for pid, sigs in groupby(filter(lambda x: x[-1] == 2,
                                    positive_signatures), lambda x: x[0]):
        personid_signature_mapping[pid] = list(sigs)

    matrix = populate_signature_similarity_from_disclaims(matrix,
                                                          all_signatures - set(positive_signatures),
                                                          signature_id_mapping,
                                                          personid_signature_mapping)

    print 'Shape of matrix is %s.' % str(matrix.shape)
    print 'The number of non-zero values is %s' % matrix.nnz

    cPickle.dump(matrix,
                 open("dump-beard-similarity.pickle", "w"),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    print 'Done!'

    # Extract data from records
    filename_rec = "dump-beard-records.pickle"

    print 'Step 3: Extracting record metadata.'

    records = {}
    for i, sig in enumerate(positive_signatures):
        if i % 1000 == 0:
            print i

        if sig[3] not in records:
            records[sig[3]] = {'publication_id': sig[3],
                               'title': get_title_of_paper(sig[3]),
                               'references': get_refers_to(sig[3])}

    cPickle.dump(records.values(),
                 open(filename_rec, "w"),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    print "Found %d records" % len(records)
    print 'Done!'
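The last file is the helper module imported above as beard_utils: ASCII transliteration with memoization, name normalization, initials and prefix comparison, and the n_cluster_diff metric.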
import chardet
import re
import numpy as np

from functools import wraps
from unidecode import unidecode

def memoize(func):
    cache = {}

    @wraps(func)
    def wrap(*args):
        if args not in cache:
            cache[args] = func(*args)
        return cache[args]

    return wrap

def guess_minimum_encoding(text, charsets=('ascii', 'latin1', 'utf8')):
    text_in_unicode = text.decode('utf8', 'replace')

    for charset in charsets:
        try:
            return (text_in_unicode.encode(charset), charset)
        except (UnicodeEncodeError, UnicodeDecodeError):
            pass

    return (text_in_unicode.encode('utf8'), 'utf8')


def decode_to_unicode(text, default_encoding='utf-8'):
    if not text:
        return ""

    try:
        return text.decode(default_encoding)
    except (UnicodeError, LookupError):
        pass

    detected_encoding = None
    res = chardet.detect(text)

    if res['confidence'] >= 0.8:
        detected_encoding = res['encoding']

    if not detected_encoding:
        dummy, detected_encoding = guess_minimum_encoding(text)

    return text.decode(detected_encoding, 'ignore')

@memoize
def translate_to_ascii(value):
    unicode_text = decode_to_unicode(value)

    if u"[?]" in unicode_text:
        decoded_text = []

        for unicode_char in unicode_text:
            decoded_char = unidecode(unicode_char)

            # Skip unrecognized characters
            if decoded_char != "[?]":
                decoded_text.append(decoded_char)

        ascii_text = ''.join(decoded_text).encode('ascii')

    else:
        ascii_text = unidecode(unicode_text).replace(u"[?]", u"").encode('ascii')

    return ascii_text

RE_NORMALIZE_LAST_NAME = re.compile(r"\s+|\-")
RE_NORMALIZE = re.compile(r"(,\s(i|ii|iii|iv|v|vi|jr))|[\.'\-,]|\s+")


@memoize
def normalize(name):
    name = translate_to_ascii(name).lower()

    try:
        names = name.split(",", 1)
        name = "%s %s" % (RE_NORMALIZE_LAST_NAME.sub("", names[0]), names[1])
    except IndexError:  # no comma in the name, keep it as-is
        pass

    name = RE_NORMALIZE.sub(" ", name)
    name = name.strip()

    return name

@memoize
def initials(name):
    return set([w[0] for w in name.split()])


@memoize
def prefixes(name, min_length):
    return set(w[:i] for w in name.split() for i in range(len(w) + 1) if len(w) >= min_length)

def cmp(a, b):
    a = normalize(a)
    b = normalize(b)

    # If initials are not a subset of each other
    p_a = initials(a)
    p_b = initials(b)
    diff = len(p_a | p_b) != max(len(p_a), len(p_b))

    if diff:  # break early
        return diff

    # If prefixes of size >= 3 are not a subset of each other
    p_a = prefixes(a, 3)
    p_b = prefixes(b, 3)
    diff = len(p_a | p_b) != max(len(p_a), len(p_b))

    return diff


def are_different(a, b):
    return (cmp(a, b) and
            cmp(re.sub(r"\-", "", a), re.sub(r"\-", "", b)) and
            cmp(re.sub(r"\-", " ", a), re.sub(r"\-", " ", b)))

def n_cluster_diff(labels_true, labels_pred):
    return abs(len(np.unique(labels_true)) - len(np.unique(labels_pred)))