Skip to content

Instantly share code, notes, and snippets.

@subuk
Last active August 29, 2015 14:22
Show Gist options
  • Save subuk/72740e021cb16c54a3bb to your computer and use it in GitHub Desktop.
Save subuk/72740e021cb16c54a3bb to your computer and use it in GitHub Desktop.
Распределение элементов по группам без пересечений.
#!/usr/bin/env python
from __future__ import print_function
import collections
GROUPS_COUNT = 7
cleaned_words = set()
#
# Prepare input
#
for word in open('/tmp/doc.txt', 'rb').read().decode('utf-8').strip().split():
if len(word) < 5:
continue
if not word[0].isalpha():
continue
cleaned_words.add(word.lower())
words_grouped_by_first_letter = collections.defaultdict(list)
for word in cleaned_words:
words_grouped_by_first_letter[word[0]].append(word)
wordscount_grouped_by_first_letter = collections.defaultdict(list)
for letter, words in words_grouped_by_first_letter.items():
wordscount_grouped_by_first_letter[letter] = len(words)
result = []
sorted_letters = sorted(wordscount_grouped_by_first_letter.keys())
result = [[letter] for letter in sorted_letters]
print("==> Source ======> ")
for letter in sorted_letters:
print(letter, wordscount_grouped_by_first_letter[letter])
#
# Process
#
print("==> Merging =====> ")
while len(result) > GROUPS_COUNT:
min_val = float('inf')
min_idx = None
for index in range(len(result)-1):
cur_el_weight = sum(
wordscount_grouped_by_first_letter[key]
for key in result[index]
)
next_el_weight = sum(
wordscount_grouped_by_first_letter[key]
for key in result[index+1]
)
if (cur_el_weight + next_el_weight) < min_val:
min_idx = index
min_val = cur_el_weight + next_el_weight
if min_idx is not None:
result[min_idx] += result[min_idx+1]
del result[min_idx+1]
#
# Print result
#
print("==> Result ======> ")
for keys in result:
print(
keys[0], "-", keys[-1],
sum([wordscount_grouped_by_first_letter[key] for key in keys]),
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment