Created
November 14, 2017 15:10
-
-
Save Yomguithereal/9809c66afa2437f38f7bd56fdbe3a503 to your computer and use it in GitHub Desktop.
FNAC Artists Sequences Clustering Script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Points to improve: | |
# 1) Distance metrics | |
# 2) Clustering scheme | |
import csv | |
import re | |
from collections import defaultdict | |
# Parameters

# CSV file holding one artwork per row.
SOURCE_CSV_PATH = './uniq_artworks.csv'

# Splits an authors list on a comma followed by whitespace, but only when the
# next character is an uppercase letter (plain or accented); a comma followed
# by a lowercase word therefore stays inside the artist name.
# The pattern is a raw string because '\s' is not a valid string escape.
ARTISTS_SEPARATOR = re.compile(r',\s+(?=[A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ])')

# Tokens that are discarded when they show up as an entry of the authors list.
TOKEN_BLACK_LIST = {
    'M/M',
}

# Minimum length, applied both to the raw number of acquisitions of an artist
# and to the squeezed sequence derived from them.
SEQUENCE_LENGTH_THRESHOLD = 3

# Two artists are linked when the distance between their sequences is
# strictly below this value. With the unit-cost edit distance used in this
# script, 1 means that only identical sequences are linked.
DISTANCE_THRESHOLD = 1

# Indices

# Artist name -> list of (acquisition year, mode letter) tuples.
ARTISTS_ACQUISITION_INDEX = defaultdict(list)

# Artist name -> squeezed sequence of mode letters.
ARTISTS_SEQUENCES = {}
# Helper functions | |
def substitution_cost(mode1, mode2, artist1, artist2):
    """Return the cost of substituting one acquisition mode for another.

    Every substitution currently has the same cost: none of the four
    arguments is consulted.
    """
    # Flat unit cost, identical to an insertion or a deletion.
    unit_cost = 1
    return unit_cost
def levenshtein(str1, str2, artist1, artist2):
    """Return the edit distance between two acquisition sequences.

    Insertions and deletions cost 1. The cost of replacing one mode by a
    different one is delegated to ``substitution_cost``, which receives the
    two modes actually being substituted together with both artists.

    Args:
        str1: First sequence of acquisition modes.
        str2: Second sequence of acquisition modes.
        artist1: Artist owning ``str1``; only forwarded to
            ``substitution_cost``.
        artist2: Artist owning ``str2``; only forwarded to
            ``substitution_cost``.

    Returns:
        The minimal total cost of the edits turning ``str1`` into ``str2``.
    """
    # Distances between the empty prefix of str1 and every prefix of str2.
    previous_row = list(range(len(str2) + 1))

    # Only two rows of the dynamic-programming table are needed at a time.
    for i, mode1 in enumerate(str1, 1):
        # Turning the first i items of str1 into nothing takes i deletions.
        current_row = [i]

        for j, mode2 in enumerate(str2, 1):
            if mode1 == mode2:
                # Matching modes: carry the diagonal value over unchanged.
                current_row.append(previous_row[j - 1])
            else:
                current_row.append(min(
                    previous_row[j] + 1,  # Deletion cost
                    current_row[j - 1] + 1,  # Insertion cost
                    # Substitution cost
                    previous_row[j - 1]
                    + substitution_cost(mode1, mode2, artist1, artist2),
                ))

        previous_row = current_row

    return previous_row[-1]
# 1) Read the source CSV file
# newline='' leaves newline handling to the csv module, as its documentation
# asks for file objects handed to a reader.
# NOTE(review): no encoding is given, so the platform default applies --
# confirm that it matches the encoding of the CSV file.
with open(SOURCE_CSV_PATH, 'r', newline='') as sf:
    reader = csv.DictReader(sf)

    for row in reader:
        year = row['acquisition_year'].strip()

        # Skipping no year
        if not year:
            continue

        year = int(year)

        # Skipping before period
        if year < 1945:
            continue

        raw_mode = row['acquisition_mode'].strip().lower()

        # The first matching keyword wins, in this order.
        mode = None

        if 'commande' in raw_mode:
            mode = 'C'
        elif 'achat' in raw_mode:
            mode = 'A'
        elif 'don' in raw_mode:
            mode = 'D'

        # Skipping useless acquisition mode
        if not mode:
            continue

        # Authors are only parsed for the rows that are kept. Blacklisted
        # tokens are dropped, and so is the empty name that splitting an
        # empty authors list produces.
        artists = ARTISTS_SEPARATOR.split(row['authors_list'].strip())

        # Adding the acquisition
        for artist in artists:
            if not artist or artist in TOKEN_BLACK_LIST:
                continue

            ARTISTS_ACQUISITION_INDEX[artist].append((year, mode))
# 2) Compiling sequences
for artist, acquisitions in ARTISTS_ACQUISITION_INDEX.items():

    # An artist with too few acquisitions can never reach the minimum length
    if len(acquisitions) < SEQUENCE_LENGTH_THRESHOLD:
        continue

    # Walking the (year, mode) tuples in sorted order and keeping a mode only
    # when it differs from the one kept just before it
    squeezed = []

    for _, mode in sorted(acquisitions):
        if not squeezed or squeezed[-1] != mode:
            squeezed.append(mode)

    # The squeezed sequence must itself be long enough to be kept
    if len(squeezed) >= SEQUENCE_LENGTH_THRESHOLD:
        ARTISTS_SEQUENCES[artist] = ''.join(squeezed)
# 3) Processing the eta-NN graph
GRAPH = defaultdict(list)
ARTISTS = list(ARTISTS_SEQUENCES)

# Each unordered pair of artists is compared exactly once
for i, artist_source in enumerate(ARTISTS):
    sequence_source = ARTISTS_SEQUENCES[artist_source]

    for artist_target in ARTISTS[i + 1:]:
        distance = levenshtein(
            sequence_source,
            ARTISTS_SEQUENCES[artist_target],
            artist_source,
            artist_target
        )

        # Pairs that are not close enough produce no edge
        if distance >= DISTANCE_THRESHOLD:
            continue

        # The edge is recorded in both directions
        GRAPH[artist_source].append(artist_target)
        GRAPH[artist_target].append(artist_source)
# 4) Deriving clusters
CLUSTERS = []
ALREADY_IN_CLUSTER = set()

# A cluster is an artist not yet assigned plus all of its direct neighbors.
# Neighbors are not expanded further and are not checked against the artists
# already assigned.
for artist, neighbors in GRAPH.items():
    if artist not in ALREADY_IN_CLUSTER:
        cluster = [artist, *neighbors]

        ALREADY_IN_CLUSTER.update(cluster)
        CLUSTERS.append(cluster)
# 5) Dumping the clusters
for number, cluster in enumerate(CLUSTERS, 1):
    print('Cluster n°%i containing:' % number)

    # One line per member, with its squeezed sequence between parentheses
    for artist in cluster:
        print(' - %s (%s)' % (artist, ARTISTS_SEQUENCES[artist]))

    # Blank line between two clusters
    print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment