Yomguithereal · November 14, 2017 15:10
diff --git a/cluster.py b/cluster.py
 #!/usr/bin/env python3
 # Points to improve:
 #   1) Distance metrics
 #   2) Clustering scheme
 import csv
 import re
 from collections import defaultdict

 # Parameters
 SOURCE_CSV_PATH = './uniq_artworks.csv'

 # Splits an authors list on a comma followed by whitespace and an uppercase
 # (possibly accented) letter; the letter itself is kept thanks to the
 # lookahead. Written as a raw string because '\s' inside a plain literal is
 # an invalid escape sequence that recent Python versions warn about.
 ARTISTS_SEPARATOR = re.compile(r',\s+(?=[A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ])')

 # Tokens removed from the split authors list.
 TOKEN_BLACK_LIST = {
    'M/M',
 }

 # Minimum length required of both the raw acquisition list and the
 # squeezed sequence of an artist.
 SEQUENCE_LENGTH_THRESHOLD = 3

 # Two artists are linked when their distance is strictly below this value.
 DISTANCE_THRESHOLD = 1

 # Indices
 # artist -> list of (year, mode) tuples
 ARTISTS_ACQUISITION_INDEX = defaultdict(list)
 # artist -> squeezed sequence string
 ARTISTS_SEQUENCES = {}

 # Helper functions
 def substitution_cost(mode1, mode2, artist1, artist2):
    """Return the cost of replacing symbol ``mode1`` with ``mode2``.

    Every argument is currently ignored and the cost is a flat 1.
    Presumably the parameters are the hook for a finer-grained metric.
    """
    return 1

 def levenshtein(str1, str2, artist1, artist2):
    """Return the edit distance between ``str1`` and ``str2``.

    Insertions and deletions cost 1; a substitution costs whatever
    ``substitution_cost`` returns for the two differing symbols.

    Args:
        str1: First sequence of symbols.
        str2: Second sequence of symbols.
        artist1: Passed through unchanged to ``substitution_cost``.
        artist2: Passed through unchanged to ``substitution_cost``.

    Returns:
        The minimal total cost of turning ``str1`` into ``str2``.
    """
    # Only the previous row of the dynamic-programming table is ever read,
    # so two rows are enough.
    previous = list(range(len(str2) + 1))

    for i, symbol1 in enumerate(str1, 1):
        current = [i]

        for j, symbol2 in enumerate(str2, 1):
            if symbol1 == symbol2:
                current.append(previous[j - 1])
            else:
                current.append(min(
                    previous[j] + 1,      # Deletion cost
                    current[j - 1] + 1,   # Insertion cost
                    # Substitution cost: hand over the two differing
                    # symbols rather than the whole sequences.
                    previous[j - 1] + substitution_cost(
                        symbol1, symbol2, artist1, artist2)))

        previous = current

    return previous[-1]

 # 1) Read the source CSV file
 # newline='' leaves newline handling to the csv module, as its documentation
 # asks for file objects (it matters for quoted fields containing line breaks).
 # NOTE(review): no encoding is given, so the platform default applies;
 # confirm it matches the file, which presumably contains accented names.
 with open(SOURCE_CSV_PATH, 'r', newline='') as sf:
    reader = csv.DictReader(sf)

    for row in reader:
        # Split the authors cell into individual artists, minus the
        # blacklisted tokens.
        artists = [
            artist
            for artist in ARTISTS_SEPARATOR.split(row['authors_list'].strip())
            if artist not in TOKEN_BLACK_LIST
        ]

        year = row['acquisition_year'].strip()

        # Skipping no year
        if not year:
            continue

        year = int(year)

        # Skipping before period
        if year < 1945:
            continue

        raw_mode = row['acquisition_mode'].strip().lower()

        # Map the free-text mode onto a one-letter code; the first keyword
        # found wins, in this order.
        mode = None

        if 'commande' in raw_mode:
            mode = 'C'
        elif 'achat' in raw_mode:
            mode = 'A'
        elif 'don' in raw_mode:
            mode = 'D'

        # Skipping useless acquisition mode
        if not mode:
            continue

        # Adding the acquisition to every artist of the row
        for artist in artists:
            ARTISTS_ACQUISITION_INDEX[artist].append((year, mode))

 # 2) Compiling sequences
 # Each artist's acquisitions are sorted, then runs of the same mode are
 # collapsed into a single symbol. Artists whose raw list or squeezed
 # sequence is shorter than the threshold are left out.
 for artist, acquisitions in ARTISTS_ACQUISITION_INDEX.items():
    if len(acquisitions) >= SEQUENCE_LENGTH_THRESHOLD:
        squeezed = []

        # Tuples sort by year first, then by mode.
        for _, mode in sorted(acquisitions):
            if not squeezed or squeezed[-1] != mode:
                squeezed.append(mode)

        sequence = ''.join(squeezed)

        if len(sequence) >= SEQUENCE_LENGTH_THRESHOLD:
            ARTISTS_SEQUENCES[artist] = sequence

 # 3) Processing the eta-NN graph
 # Adjacency lists: artist -> artists whose sequence lies within the
 # distance threshold.
 GRAPH = defaultdict(list)
 ARTISTS = list(ARTISTS_SEQUENCES)

 # Every unordered pair of artists is compared exactly once.
 for i, artist_source in enumerate(ARTISTS):
    sequence_source = ARTISTS_SEQUENCES[artist_source]

    for artist_target in ARTISTS[i + 1:]:
        distance = levenshtein(
            sequence_source,
            ARTISTS_SEQUENCES[artist_target],
            artist_source,
            artist_target
        )

        if distance >= DISTANCE_THRESHOLD:
            continue

        # Undirected edge: record it on both endpoints.
        GRAPH[artist_source].append(artist_target)
        GRAPH[artist_target].append(artist_source)

 # 4) Deriving clusters
 # A cluster is an artist not yet clustered plus its direct neighbours only;
 # neighbours of neighbours are not followed.
 CLUSTERS = []
 ALREADY_IN_CLUSTER = set()

 for artist, neighbors in GRAPH.items():
    if artist not in ALREADY_IN_CLUSTER:
        cluster = [artist]
        cluster.extend(neighbors)

        CLUSTERS.append(cluster)
        ALREADY_IN_CLUSTER.update(cluster)

 # 5) Dumping the clusters
 # One numbered heading per cluster (starting at 1), one line per member with
 # its sequence, then a blank separator line.
 for number, cluster in enumerate(CLUSTERS, 1):
    print('Cluster n°%i containing:' % number)

    for artist in cluster:
        print('  - %s (%s)' % (artist, ARTISTS_SEQUENCES[artist]))

    print()
	#!/usr/bin/env python3
	# Points to improve:
	# 1) Distance metrics
	# 2) Clustering scheme
	import csv
	import re
	from collections import defaultdict

	# Parameters
	SOURCE_CSV_PATH = './uniq_artworks.csv'

	# Splits an authors list on a comma followed by whitespace and an uppercase
	# (possibly accented) letter; the letter itself is kept thanks to the
	# lookahead. Written as a raw string because '\s' inside a plain literal is
	# an invalid escape sequence that recent Python versions warn about.
	ARTISTS_SEPARATOR = re.compile(r',\s+(?=[A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ])')

	# Tokens removed from the split authors list.
	TOKEN_BLACK_LIST = {
	    'M/M',
	}

	# Minimum length required of both the raw acquisition list and the
	# squeezed sequence of an artist.
	SEQUENCE_LENGTH_THRESHOLD = 3

	# Two artists are linked when their distance is strictly below this value.
	DISTANCE_THRESHOLD = 1

	# Indices
	# artist -> list of (year, mode) tuples
	ARTISTS_ACQUISITION_INDEX = defaultdict(list)
	# artist -> squeezed sequence string
	ARTISTS_SEQUENCES = {}

	# Helper functions
	def substitution_cost(mode1, mode2, artist1, artist2):
	    """Return the cost of replacing symbol ``mode1`` with ``mode2``.

	    Every argument is currently ignored and the cost is a flat 1.
	    Presumably the parameters are the hook for a finer-grained metric.
	    """
	    return 1

	def levenshtein(str1, str2, artist1, artist2):
	    """Return the edit distance between ``str1`` and ``str2``.

	    Insertions and deletions cost 1; a substitution costs whatever
	    ``substitution_cost`` returns for the two differing symbols.

	    Args:
	        str1: First sequence of symbols.
	        str2: Second sequence of symbols.
	        artist1: Passed through unchanged to ``substitution_cost``.
	        artist2: Passed through unchanged to ``substitution_cost``.

	    Returns:
	        The minimal total cost of turning ``str1`` into ``str2``.
	    """
	    # Only the previous row of the dynamic-programming table is ever read,
	    # so two rows are enough.
	    previous = list(range(len(str2) + 1))

	    for i, symbol1 in enumerate(str1, 1):
	        current = [i]

	        for j, symbol2 in enumerate(str2, 1):
	            if symbol1 == symbol2:
	                current.append(previous[j - 1])
	            else:
	                current.append(min(
	                    previous[j] + 1,      # Deletion cost
	                    current[j - 1] + 1,   # Insertion cost
	                    # Substitution cost: hand over the two differing
	                    # symbols rather than the whole sequences.
	                    previous[j - 1] + substitution_cost(
	                        symbol1, symbol2, artist1, artist2)))

	        previous = current

	    return previous[-1]

	# 1) Read the source CSV file
	# newline='' leaves newline handling to the csv module, as its documentation
	# asks for file objects (it matters for quoted fields containing line breaks).
	# NOTE(review): no encoding is given, so the platform default applies;
	# confirm it matches the file, which presumably contains accented names.
	with open(SOURCE_CSV_PATH, 'r', newline='') as sf:
	    reader = csv.DictReader(sf)

	    for row in reader:
	        # Split the authors cell into individual artists, minus the
	        # blacklisted tokens.
	        artists = [
	            artist
	            for artist in ARTISTS_SEPARATOR.split(row['authors_list'].strip())
	            if artist not in TOKEN_BLACK_LIST
	        ]

	        year = row['acquisition_year'].strip()

	        # Skipping no year
	        if not year:
	            continue

	        year = int(year)

	        # Skipping before period
	        if year < 1945:
	            continue

	        raw_mode = row['acquisition_mode'].strip().lower()

	        # Map the free-text mode onto a one-letter code; the first keyword
	        # found wins, in this order.
	        mode = None

	        if 'commande' in raw_mode:
	            mode = 'C'
	        elif 'achat' in raw_mode:
	            mode = 'A'
	        elif 'don' in raw_mode:
	            mode = 'D'

	        # Skipping useless acquisition mode
	        if not mode:
	            continue

	        # Adding the acquisition to every artist of the row
	        for artist in artists:
	            ARTISTS_ACQUISITION_INDEX[artist].append((year, mode))

	# 2) Compiling sequences
	# Each artist's acquisitions are sorted, then runs of the same mode are
	# collapsed into a single symbol. Artists whose raw list or squeezed
	# sequence is shorter than the threshold are left out.
	for artist, acquisitions in ARTISTS_ACQUISITION_INDEX.items():

	    # Filtering tiny sequences
	    if len(acquisitions) < SEQUENCE_LENGTH_THRESHOLD:
	        continue

	    # Ordering acquisitions (tuples sort by year first, then by mode)
	    acquisitions = sorted(acquisitions)

	    # Squeezing the sequence
	    sequence = ''
	    last_mode = None

	    for _, mode in acquisitions:
	        if mode != last_mode:
	            sequence += mode
	            last_mode = mode

	    # Filtering a second time
	    if len(sequence) < SEQUENCE_LENGTH_THRESHOLD:
	        continue

	    ARTISTS_SEQUENCES[artist] = sequence

	# 3) Processing the eta-NN graph
	# Adjacency lists: artist -> artists whose sequence lies within the
	# distance threshold.
	GRAPH = defaultdict(list)
	ARTISTS = list(ARTISTS_SEQUENCES.keys())

	# Every unordered pair of artists is compared exactly once.
	for i, artist_source in enumerate(ARTISTS):
	    sequence_source = ARTISTS_SEQUENCES[artist_source]

	    for j in range(i + 1, len(ARTISTS)):
	        artist_target = ARTISTS[j]
	        sequence_target = ARTISTS_SEQUENCES[artist_target]

	        distance = levenshtein(
	            sequence_source,
	            sequence_target,
	            artist_source,
	            artist_target
	        )

	        # Undirected edge: record it on both endpoints.
	        if distance < DISTANCE_THRESHOLD:
	            GRAPH[artist_source].append(artist_target)
	            GRAPH[artist_target].append(artist_source)

	# 4) Deriving clusters
	# A cluster is an artist not yet clustered plus its direct neighbours only;
	# neighbours of neighbours are not followed.
	CLUSTERS = []
	ALREADY_IN_CLUSTER = set()

	for artist, neighbors in GRAPH.items():
	    if artist in ALREADY_IN_CLUSTER:
	        continue

	    cluster = [artist] + neighbors
	    CLUSTERS.append(cluster)
	    ALREADY_IN_CLUSTER.update(cluster)

	# 5) Dumping the clusters
	# One numbered heading per cluster (starting at 1), one line per member with
	# its sequence, then a blank separator line.
	for i, cluster in enumerate(CLUSTERS):
	    print('Cluster n°%i containing:' % (i + 1))

	    for artist in cluster:
	        sequence = ARTISTS_SEQUENCES[artist]
	        print(' - %s (%s)' % (artist, sequence))

	    print()
No results found