@glouppe
Created January 7, 2015 15:49
Disambiguation prototype
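A prototype pipeline for author name disambiguation. The gist bundles three Python 2 scripts, separated below by their import blocks: the disambiguation prototype itself, an Invenio dump script that produces its input pickles, and the beard_utils helper module it imports.
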
import numpy as np
import argparse
import cPickle
import scipy.cluster.hierarchy as hac
from itertools import groupby
from itertools import product
from scipy.sparse import lil_matrix
from scipy.sparse import issparse
from scipy.spatial.distance import squareform
from sklearn.cross_validation import train_test_split
from sklearn.metrics.cluster import v_measure_score
from sklearn.utils import check_random_state
# To be ported into beard
from beard.metrics import paired_f_score, paired_precision_score, paired_recall_score
from beard_utils import translate_to_ascii
from beard_utils import initials
from beard_utils import are_different
from beard_utils import n_cluster_diff
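
# Overview of the pipeline implemented below:
#   1. load_data()  -- read the record, signature and claim/disclaim similarity
#      pickles produced by the Invenio dump script further down in this gist;
#   2. blockize()   -- group signatures into blocks by last name (optionally
#      with the first initial);
#   3. bootstrap()  -- guess a partial signature-to-signature similarity matrix
#      inside each block from simple rules (or randomly / trivially);
#   4. clusterize() -- run complete-linkage hierarchical clustering on the
#      guessed similarities and pick the cut-off that maximizes the chosen
#      score on a validation split;
#   5. score the resulting clusters per block and overall.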


def load_data(file_records, file_signatures, file_similarity):
    records = cPickle.load(open(file_records, "r"))
    signatures = cPickle.load(open(file_signatures, "r"))
    similarity = cPickle.load(open(file_similarity, "r")).astype(np.int8)

    signatures = {s["signature_id"]: s for s in signatures}
    records = {r["publication_id"]: r for r in records}

    for v in records.values():
        v["references"] = set(v["references"])
        v["signatures"] = set()

    for i, s in signatures.items():
        records[s["publication_id"]]["signatures"].add(i)

    return records, signatures, similarity


def _block_last_name(name):
    return translate_to_ascii(name.split(",", 1)[0])


def _block_last_name_first_initial(name):
    last_name = translate_to_ascii(name.split(",", 1)[0])

    try:
        first_initial = translate_to_ascii(name.split(",", 1)[1]).strip()[0]
    except IndexError:  # no first name part
        return last_name

    return "%s %s" % (last_name, first_initial)


def blockize(signatures, func=_block_last_name, min_size=50):
    """Group signatures into blocks sharing the same blocking key.

    Signatures whose key yields a group smaller than ``min_size`` are pooled,
    sorted by key, and re-chunked ``min_size`` keys at a time.
    """
    groups = {}

    for n, s in signatures.items():
        prefix = func(s["author_name"])

        if prefix not in groups:
            groups[prefix] = []

        groups[prefix].append(n)

    blocks = []
    smalls = []

    for k, v in sorted(groups.items()):  # todo: evaluate the effect of min_size
        if len(v) >= min_size:
            blocks.append(v)
        else:
            smalls.append((k, v))

    smalls = sorted(smalls)

    for start in range(0, len(smalls), min_size):
        b = []

        for _, v in smalls[start:start + min_size]:
            b.extend(v)

        blocks.append(b)

    return blocks


def similarity_to_clusters(S):
    """Derive cluster labels by propagating positive similarity links."""
    clusters = np.arange(S.shape[0])

    if issparse(S):
        for i, j in sorted(zip(*S.nonzero())):
            if S[i, j] == 1:
                clusters[j] = clusters[i]
    else:
        for i in range(S.shape[0]):
            for j in range(S.shape[1]):
                if S[i, j] == 1:
                    clusters[j] = clusters[i]

    return clusters


def bootstrap(S, block, records, signatures, bootstrap_algorithm, random_state=None):
    """Fill the unknown entries of S with rule-based (or random) guesses."""
    if bootstrap_algorithm == "nothing":
        np.fill_diagonal(S, 1)
        return S

    random_state = check_random_state(random_state)
    size = len(block)

    for i in range(size):  # This can easily be parallelized / cythonized
        s_i = signatures[block[i]]
        name_i = s_i["author_name"]
        aff_i = s_i["author_affiliation"]
        initials_i = initials(name_i)

        for j in range(i + 1, size):
            if S[i, j] != 0:
                continue

            s_j = signatures[block[j]]
            name_j = s_j["author_name"]
            aff_j = s_j["author_affiliation"]
            initials_j = initials(name_j)

            if bootstrap_algorithm == "best":
                # Positive rules
                if ((name_i == name_j and aff_i == aff_j) or  # Exact name & affiliation match
                        (len(initials_i | initials_j) == max(len(initials_i), len(initials_j)) and aff_i == aff_j) or
                        (s_i["publication_id"] in records[s_j["publication_id"]]["references"]) or  # Self-citation
                        (s_j["publication_id"] in records[s_i["publication_id"]]["references"])):  # Self-citation
                    S[i, j] = 1
                    S[j, i] = 1

                # Negative rules
                elif ((s_i["publication_id"] == s_j["publication_id"]) or  # Co-authors of the same paper, hence distinct persons
                        (are_different(name_i, name_j))):  # Obviously different names
                    S[i, j] = -1
                    S[j, i] = -1

            elif bootstrap_algorithm == "rabbit":
                if name_i == name_j:
                    S[i, j] = 1
                    S[j, i] = 1

            elif bootstrap_algorithm == "random":
                S[i, j] = 2 * random_state.rand() - 1

    np.fill_diagonal(S, 1)

    return S
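

# clusterize() below cuts the complete-linkage dendrogram at every midpoint
# between two consecutive merge heights, scores the resulting flat clustering
# on the validation split, and keeps the cut-off with the best score.
# Distances are derived from the guessed similarities via d = (1 - s) / 2,
# so s = 1 -> d = 0, s = 0 -> d = 0.5 and s = -1 -> d = 1.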
def clusterize(block, records, signatures, similarity,
               bootstrap_algorithm="best", score=paired_f_score,
               random_state=0, verbose=0):
    # Train / test split
    size = len(block)
    train, test = train_test_split(np.arange(size),
                                   train_size=0.1,
                                   random_state=random_state)  # About 10% of signatures are claimed in prod
    train, valid = train_test_split(train,
                                    train_size=0.5,
                                    random_state=random_state)

    # Select similarities
    S = (similarity[block, :])[:, block].todense()
    # For gold data, assume that S[i, j] == -1 for all unknown values
    S[S == 0] = -1
    truth = similarity_to_clusters(S)

    if verbose > 0:
        print "Size =", size
        print "Positive pairs =", (S == 1).sum()
        print "Negative pairs =", (S == -1).sum()
        print "Clusters =", len(np.unique(truth))

        for c, elements in groupby(sorted(truth)):
            print "Cluster %d, counting %d signatures" % (c,
                                                          len(list(elements)))

    # Bootstrap a similarity matrix
    if bootstrap_algorithm != "perfect":
        S_guessed = np.zeros(S.shape, dtype=np.int8)

        for i, j in product(train, train):  # Copy all known values from train
            S_guessed[i, j] = S[i, j]

        S_guessed = bootstrap(S_guessed,
                              block,
                              records,
                              signatures,
                              bootstrap_algorithm,
                              random_state=random_state)
    else:
        S_guessed = S

    if verbose > 0:
        print "True pairs = %d %d" % ((S == 1).sum(), (S == -1).sum())
        print "Guessed pairs = %d %d" % ((S_guessed == 1).sum(),
                                         (S_guessed == -1).sum())

    # Transform signatures into vectors
    # TODO
    # (For now we directly use the guessed similarity matrix, without
    # trying to infer the missing/unknown values.)

    # Hierarchical clustering
    distances = squareform((1.0 - S_guessed) / 2.0)
    linkage_matrix = hac.linkage(distances, method="complete")

    f_best = -np.inf
    t_best = 0
    best_clusters = None

    for t1, t2 in zip(np.concatenate(([0], np.array(linkage_matrix[:-1, 2])),
                                     axis=0),
                      np.array(linkage_matrix[:, 2])):
        t = (t1 + t2) / 2.0
        clusters = hac.fcluster(linkage_matrix, t, criterion="distance")
        f = score(truth[valid], clusters[valid])

        if f >= f_best:  # with >, favor more clusters; with >=, favor fewer clusters
            f_best = f
            t_best = t
            best_clusters = clusters.copy()

    return truth, best_clusters, t_best, train, valid, test


def overall_clusters(blocks, all_clusters):
    offset = 0
    clusters = np.zeros(sum(len(b) for b in blocks))

    for i, cluster in enumerate(all_clusters):
        for j, b in enumerate(blocks[i]):
            clusters[b] = cluster[j] + offset

        offset += np.max(cluster)

    clusters = clusters.astype(np.int)

    return clusters
if __name__ == "__main__":
# Parse command line arugments
parser = argparse.ArgumentParser()
parser.add_argument("file_records", type=str)
parser.add_argument("file_signatures", type=str)
parser.add_argument("file_similarity", type=str)
parser.add_argument("--blockize", default="last_name", type=str, choices=["last_name", "last_name_first_initial"])
parser.add_argument("--block_min_size", default=50, type=int)
parser.add_argument("--bootstrap", default="best", type=str, choices=["best", "random", "rabbit", "nothing", "perfect"])
parser.add_argument("--score", default="paired_f_score", type=str, choices=["paired_f_score", "v_measure_score"])
parser.add_argument("--n_jobs", default=1, type=int)
parser.add_argument("--random_state", default=42, type=int)
parser.add_argument("--verbose", default=0, type=int)
args = parser.parse_args()
if args.score == "paired_f_score":
score = paired_f_score
elif args.score == "v_measure_score":
score = v_measure_score
scorers = [paired_f_score,
paired_precision_score,
paired_recall_score,
v_measure_score,
n_cluster_diff]
# Load data
records, signatures, similarity = load_data(args.file_records,
args.file_signatures,
args.file_similarity)
# Group signatures in blocks
if args.blockize == "last_name":
blocks = blockize(signatures,
func=_block_last_name,
min_size=args.block_min_size)
elif args.blockize == "last_name_first_initial":
blocks = blockize(signatures,
func=_block_last_name_first_initial,
min_size=args.block_min_size)
# Cluster signatures by block
all_scores_train = {}
all_scores_valid = {}
all_scores_test = {}
for s in scorers:
all_scores_train[s] = []
all_scores_valid[s] = []
all_scores_test[s] = []
all_clusters = []
for block in blocks:
(truth,
best_clusters,
best_threshold,
train,
valid,
test) = clusterize(block, records, signatures, similarity,
bootstrap_algorithm=args.bootstrap,
score=score,
random_state=args.random_state,
verbose=args.verbose)
if args.verbose > 0:
print "Best cut-off =", best_threshold
for s in scorers:
train_score = s(truth[train], best_clusters[train])
valid_score = s(truth[valid], best_clusters[valid])
test_score = s(truth[test], best_clusters[test])
all_scores_train[s].append(train_score)
all_scores_valid[s].append(valid_score)
all_scores_test[s].append(test_score)
if args.verbose > 0:
print "Training score (%s) = %f" % (s.__name__, train_score)
print "Valid score (%s) = %f" % (s.__name__, valid_score)
print "Test score (%s) = %f" % (s.__name__, test_score)
if args.verbose > 0:
print
all_clusters.append(best_clusters)
# Average scores
print "Params =", args
print
weights = [len(block) for block in blocks]
for s in scorers:
print "%s, average_train_score, %f" % (s.__name__, np.average(all_scores_train[s], weights=weights))
print "%s, average_valid_score, %f" % (s.__name__, np.average(all_scores_valid[s], weights=weights))
print "%s, average_test_score, %f" % (s.__name__, np.average(all_scores_test[s], weights=weights))
print
truth = similarity_to_clusters(similarity)
clusters = overall_clusters(blocks, all_clusters)
for s in scorers:
print "%s, overall_score, %f" % (s.__name__, s(truth, clusters))
print
import IPython
IPython.embed()
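

# ---------------------------------------------------------------------------
# Invenio dump script (second file of the gist): extracts signatures and their
# affiliations from aidPERSONIDPAPERS, builds the claim/disclaim similarity
# matrix, and dumps record metadata -- i.e. it produces the pickles that the
# prototype above consumes via load_data().
# ---------------------------------------------------------------------------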
import numpy as np
import argparse
import cPickle
from itertools import groupby
from joblib import Parallel, delayed
from scipy.sparse import lil_matrix, csr_matrix
from invenio.dbquery import run_sql
from invenio.bibauthorid_dbinterface import get_title_of_paper
from invenio.bibrank_citation_searcher import get_refers_to


def get_affiliation(table, bibref_value, bibrec):
    """Returns institution name and field number of a signature."""
    table_name = str(table)[0:2] + 'x'

    q = run_sql("""SELECT f2.value, r.field_number
                   FROM bibrec AS b
                   INNER JOIN bibrec_bib%s AS r ON (r.id_bibrec = b.id)
                   INNER JOIN bib%s AS f ON (r.id_bibxxx = f.id)
                   INNER JOIN bibrec_bib%s AS r2 ON (r2.id_bibrec = b.id AND
                                                     r.field_number = r2.field_number)
                   INNER JOIN bib%s AS f2 ON (r2.id_bibxxx = f2.id)
                   WHERE b.id = %d AND
                         f.id = %d AND
                         f2.tag = '%s__u'
                """ % (table_name, table_name, table_name, table_name,
                       bibrec, bibref_value, table))

    if len(q) > 0:
        return q[0]
    else:
        q = run_sql("""SELECT field_number
                       FROM bib%s, bibrec_bib%s
                       WHERE bib%s.id = bibrec_bib%s.id_bibxxx AND
                             bib%s.id = %s AND bibrec_bib%s.id_bibrec = %s
                    """ % (table_name, table_name, table_name, table_name,
                           table_name, bibref_value, table_name, bibrec))

        if len(q) > 0:
            return None, q[0][0]

    return None, None


def _getter_sig(i, signature):
    affiliation, position = get_affiliation(signature[1],
                                            signature[2],
                                            signature[3])

    return {'signature_id': i,
            'author_name': signature[4],
            'publication_id': signature[3],
            'author_affiliation': affiliation,
            'signature_position': position}


def extract_signature_data(signatures, n_jobs=1):
    return Parallel(n_jobs=n_jobs, verbose=3)(delayed(_getter_sig)(i, signature)
                                              for i, signature
                                              in enumerate(signatures))


def populate_signature_similarity(positive_signatures):
    """Build the sparse similarity matrix from claimed signatures.

    Signatures are grouped by person id; within a group, every pair of
    claimed signatures (flag == 2) is marked as similar.
    """
    n_signatures = len(positive_signatures)
    similarity = lil_matrix((n_signatures, n_signatures), dtype=np.int8)
    counter = 0

    for n, (personid, signatures) in enumerate(groupby(positive_signatures,
                                                       lambda x: x[0])):
        if n % 1000 == 0:
            print n

        signatures = list(signatures)
        vector = np.zeros((len(signatures),), dtype=np.int8)

        for i, s in enumerate(signatures):
            if s[-1] == 2:
                vector[i] = 1

        # Outer product: 1 wherever both signatures of a pair are claimed
        vector = csr_matrix(vector)
        block = vector.T * vector
        block = block.astype(np.int8)

        similarity[counter:counter + len(signatures),
                   counter:counter + len(signatures)] = block
        counter += len(signatures)

    similarity.setdiag(1)

    return similarity


def populate_signature_similarity_from_disclaims(matrix,
                                                 rejected_signatures,
                                                 signature_id_mapping,
                                                 personid_signature_mapping):
    for rejected_sig in rejected_signatures:
        if rejected_sig[1:4] not in signature_id_mapping:
            continue
        if rejected_sig[0] not in personid_signature_mapping:
            continue

        id = signature_id_mapping[rejected_sig[1:4]]
        claimed_sigs_of_person = personid_signature_mapping[rejected_sig[0]]

        for sig in claimed_sigs_of_person:
            matrix[id, signature_id_mapping[sig[1:4]]] = -1
            matrix[signature_id_mapping[sig[1:4]], id] = -1

    return matrix


if __name__ == '__main__':
    # Parse command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--prefix', default=None, type=str)
    parser.add_argument('--min_personid', default=0, type=int)
    parser.add_argument('--max_personid', default=None, type=int)
    parser.add_argument('--min_bibrec', default=0, type=int)
    parser.add_argument('--max_bibrec', default=None, type=int)
    parser.add_argument('--limit', default=None, type=int)
    parser.add_argument('--step', default=100000, type=int)
    parser.add_argument('--n_jobs', default=1, type=int)
    # Note: argparse's type=bool treats any non-empty string (even "False")
    # as True, so expose this as a flag instead.
    parser.add_argument('--gold_only', default=False, action='store_true')
    args = parser.parse_args()

    # Retrieve signatures
    query = """SELECT personid, bibref_table, bibref_value,
                      bibrec, name, flag
               FROM aidPERSONIDPAPERS """

    if (args.prefix or args.min_personid or args.max_personid or
            args.min_bibrec or args.max_bibrec or args.gold_only):
        query += "WHERE "
        clauses = []

        if args.prefix:
            clauses.append("name LIKE '%s%%'" % args.prefix)
        if args.min_personid:
            clauses.append("personid >= %d" % args.min_personid)
        if args.max_personid:
            clauses.append("personid < %d" % args.max_personid)
        if args.min_bibrec:
            clauses.append("bibrec >= %d" % args.min_bibrec)
        if args.max_bibrec:
            clauses.append("bibrec < %d" % args.max_bibrec)
        if args.gold_only:
            clauses.append("(flag = 2 OR flag = -2)")

        query += " AND ".join(clauses) + " "

    if args.limit:
        query += "LIMIT %d " % args.limit

    print query

    all_signatures = set(run_sql(query))
    positive_signatures = list(filter(lambda x: x[-1] >= 0, all_signatures))
    positive_signatures = sorted(positive_signatures,
                                 key=lambda x: (x[0], x[3]))

    print "Found %d signatures" % len(all_signatures)
    print "- %d positives" % len(positive_signatures)
    print "- %d claimed" % len(filter(lambda x: x[-1] == 2, all_signatures))
    print "- %d rejected" % len(filter(lambda x: x[-1] == -2, all_signatures))

    # Extract data from signatures
    step = args.step
    filename_sig = "dump-beard-signatures-%010d.pickle"

    print 'Step 1: Extracting data from signatures...'

    for start in range(0, len(positive_signatures), step):
        sig_data = extract_signature_data(
            positive_signatures[start:min(start + step,
                                          len(positive_signatures))],
            n_jobs=args.n_jobs)

        cPickle.dump(sig_data,
                     open(filename_sig % start, "w"),
                     protocol=cPickle.HIGHEST_PROTOCOL)

    print 'Done!'

    # Build similarity matrix
    print 'Step 2: Populating similarity matrix...'

    matrix = populate_signature_similarity(positive_signatures)

    signature_id_mapping = {}
    for i, sig in enumerate(positive_signatures):
        signature_id_mapping[sig[1:4]] = i

    personid_signature_mapping = {}
    for pid, sigs in groupby(filter(lambda x: x[-1] == 2,
                                    positive_signatures), lambda x: x[0]):
        personid_signature_mapping[pid] = list(sigs)

    matrix = populate_signature_similarity_from_disclaims(
        matrix,
        all_signatures - set(positive_signatures),
        signature_id_mapping,
        personid_signature_mapping)

    print 'Shape of matrix is %s.' % str(matrix.shape)
    print 'The number of non-zero values is %s' % matrix.nnz

    cPickle.dump(matrix,
                 open("dump-beard-similarity.pickle", "w"),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    print 'Done!'

    # Extract data from records
    filename_rec = "dump-beard-records.pickle"

    print 'Step 3: Extracting record metadata.'

    records = {}
    for i, sig in enumerate(positive_signatures):
        if i % 1000 == 0:
            print i

        if sig[3] not in records:
            records[sig[3]] = {'publication_id': sig[3],
                               'title': get_title_of_paper(sig[3]),
                               'references': get_refers_to(sig[3])}

    cPickle.dump(records.values(),
                 open(filename_rec, "w"),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    print "Found %d records" % len(records)
    print 'Done!'
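

# ---------------------------------------------------------------------------
# beard_utils (third file of the gist): the name normalization, comparison and
# scoring helpers imported by the prototype above (translate_to_ascii,
# initials, are_different, n_cluster_diff).
# ---------------------------------------------------------------------------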
import chardet
import re
import numpy as np
from functools import wraps
from unidecode import unidecode


def memoize(func):
    cache = {}

    @wraps(func)
    def wrap(*args):
        if args not in cache:
            cache[args] = func(*args)
        return cache[args]

    return wrap


def guess_minimum_encoding(text, charsets=('ascii', 'latin1', 'utf8')):
    text_in_unicode = text.decode('utf8', 'replace')

    for charset in charsets:
        try:
            return (text_in_unicode.encode(charset), charset)
        except (UnicodeEncodeError, UnicodeDecodeError):
            pass

    return (text_in_unicode.encode('utf8'), 'utf8')


def decode_to_unicode(text, default_encoding='utf-8'):
    if not text:
        return ""

    try:
        return text.decode(default_encoding)
    except (UnicodeError, LookupError):
        pass

    detected_encoding = None
    res = chardet.detect(text)

    if res['confidence'] >= 0.8:
        detected_encoding = res['encoding']

    if not detected_encoding:
        dummy, detected_encoding = guess_minimum_encoding(text)

    return text.decode(detected_encoding, 'ignore')


@memoize
def translate_to_ascii(value):
    unicode_text = decode_to_unicode(value)

    if u"[?]" in unicode_text:
        decoded_text = []

        for unicode_char in unicode_text:
            decoded_char = unidecode(unicode_char)

            # Skip unrecognized characters
            if decoded_char != "[?]":
                decoded_text.append(decoded_char)

        ascii_text = ''.join(decoded_text).encode('ascii')
    else:
        ascii_text = unidecode(unicode_text).replace(u"[?]", u"").encode('ascii')

    return ascii_text


RE_NORMALIZE_LAST_NAME = re.compile(r"\s+|\-")
RE_NORMALIZE = re.compile(r"(,\s(i|ii|iii|iv|v|vi|jr))|[\.'\-,]|\s+")


@memoize
def normalize(name):
    name = translate_to_ascii(name).lower()

    try:
        names = name.split(",", 1)
        name = "%s %s" % (RE_NORMALIZE_LAST_NAME.sub("", names[0]), names[1])
    except IndexError:  # no comma, hence no separate first name part
        pass

    name = RE_NORMALIZE.sub(" ", name)
    name = name.strip()

    return name


@memoize
def initials(name):
    return set([w[0] for w in name.split()])


@memoize
def prefixes(name, min_length):
    return set(w[:i] for w in name.split()
               for i in range(len(w) + 1) if len(w) >= min_length)


def cmp(a, b):
    a = normalize(a)
    b = normalize(b)

    # If initials are not a subset of each other
    p_a = initials(a)
    p_b = initials(b)
    diff = len(p_a | p_b) != max(len(p_a), len(p_b))

    if diff:  # break early
        return diff

    # If prefixes of size >= 3 are not a subset of each other
    p_a = prefixes(a, 3)
    p_b = prefixes(b, 3)
    diff = len(p_a | p_b) != max(len(p_a), len(p_b))

    return diff


def are_different(a, b):
    return (cmp(a, b) and
            cmp(re.sub(r"\-", "", a), re.sub(r"\-", "", b)) and
            cmp(re.sub(r"\-", " ", a), re.sub(r"\-", " ", b)))


def n_cluster_diff(labels_true, labels_pred):
    return abs(len(np.unique(labels_true)) - len(np.unique(labels_pred)))