Last active
August 29, 2015 14:22
-
-
Save subuk/72740e021cb16c54a3bb to your computer and use it in GitHub Desktop.
Распределение элементов по группам без пересечений.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import print_function | |
import collections | |
GROUPS_COUNT = 7 | |
cleaned_words = set() | |
# | |
# Prepare input | |
# | |
for word in open('/tmp/doc.txt', 'rb').read().decode('utf-8').strip().split(): | |
if len(word) < 5: | |
continue | |
if not word[0].isalpha(): | |
continue | |
cleaned_words.add(word.lower()) | |
words_grouped_by_first_letter = collections.defaultdict(list) | |
for word in cleaned_words: | |
words_grouped_by_first_letter[word[0]].append(word) | |
wordscount_grouped_by_first_letter = collections.defaultdict(list) | |
for letter, words in words_grouped_by_first_letter.items(): | |
wordscount_grouped_by_first_letter[letter] = len(words) | |
result = [] | |
sorted_letters = sorted(wordscount_grouped_by_first_letter.keys()) | |
result = [[letter] for letter in sorted_letters] | |
print("==> Source ======> ") | |
for letter in sorted_letters: | |
print(letter, wordscount_grouped_by_first_letter[letter]) | |
# | |
# Process | |
# | |
print("==> Merging =====> ") | |
while len(result) > GROUPS_COUNT: | |
min_val = float('inf') | |
min_idx = None | |
for index in range(len(result)-1): | |
cur_el_weight = sum( | |
wordscount_grouped_by_first_letter[key] | |
for key in result[index] | |
) | |
next_el_weight = sum( | |
wordscount_grouped_by_first_letter[key] | |
for key in result[index+1] | |
) | |
if (cur_el_weight + next_el_weight) < min_val: | |
min_idx = index | |
min_val = cur_el_weight + next_el_weight | |
if min_idx is not None: | |
result[min_idx] += result[min_idx+1] | |
del result[min_idx+1] | |
# | |
# Print result | |
# | |
print("==> Result ======> ") | |
for keys in result: | |
print( | |
keys[0], "-", keys[-1], | |
sum([wordscount_grouped_by_first_letter[key] for key in keys]), | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment