Created
January 15, 2014 11:16
-
-
Save SegFaultAX/8434500 to your computer and use it in GitHub Desktop.
Frequency sorting #python jkbbwr
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import re | |
def powerset(l):
    """Return every subset of ``l`` as a list of lists, shortest first.

    Subsets are built incrementally: each element is appended to a copy of
    every subset collected so far.  The final sort is stable, so subsets of
    equal length keep the order in which they were generated.

    Args:
        l: Iterable of elements (iterated exactly once).

    Returns:
        A list of lists containing every subset of the elements, starting
        with the empty subset ``[]``.
    """
    # An explicit loop replaces the builtin ``reduce`` used previously, which
    # only exists as a builtin on Python 2; this form runs on Python 2 and 3.
    subsets = [[]]
    for e in l:
        # The comprehension is fully evaluated before ``+=`` extends the list,
        # so only the subsets that existed before this element are extended.
        subsets += [x + [e] for x in subsets]
    return sorted(subsets, key=len)
def words(s):
    """Return the set of distinct lower-cased words found in ``s``.

    A word is a maximal run of regex word characters; everything else
    (spaces, punctuation) acts as a separator.

    Args:
        s: The text to split.

    Returns:
        A set of lower-cased word strings (empty when ``s`` has no words).
    """
    # Raw string: a plain "\w" is an invalid string escape that newer Python
    # versions warn about; the pattern itself is unchanged.
    return set(re.findall(r"\w+", s.lower()))
def ngrams(words):
    """Return every non-empty combination of ``words`` as a tuple.

    Args:
        words: Iterable of sortable items (``process`` passes a set of
            word strings).

    Returns:
        A list of tuples, one per non-empty subset of ``words``, ordered
        from shortest to longest.  Items inside each tuple are in sorted
        order.
    """
    # Sort first so that a given group of words always yields the same tuple.
    # Iterating a set directly gives an arbitrary order, which could turn the
    # same word group into different dictionary keys for different queries
    # and prevent their counts from being merged.
    # [1:] drops the empty subset that powerset() always returns first.
    return [tuple(e) for e in powerset(sorted(words))[1:]]
def process(s, count):
    """Map every word combination found in ``s`` to ``count``.

    Args:
        s: The query text.
        count: Value stored for each combination.

    Returns:
        A dict whose keys are the tuples produced by ``ngrams(words(s))``
        and whose values are all ``count``.
    """
    combos = ngrams(words(s))
    return dict.fromkeys(combos, count)
def combine(h1, h2):
    """Merge the values of ``h2`` into ``h1`` in place.

    For keys present in both dicts the value from ``h2`` is added to the
    value in ``h1``; keys only in ``h2`` are copied over.

    Args:
        h1: Dict that is updated in place.
        h2: Dict whose items are merged in; it is not modified.

    Returns:
        ``h1`` itself (the same object that was passed in).
    """
    # ``items()`` works on Python 2 and 3; ``iteritems()`` is Python 2 only.
    for k, v in h2.items():
        if k in h1:
            h1[k] += v
        else:
            h1[k] = v
    return h1
def process_many(l):
    """Accumulate word-combination counts over a sequence of query rows.

    Args:
        l: Iterable of rows; index 0 of each row is the query text and
            index 1 is its count, in any form accepted by ``int()``.

    Returns:
        A dict mapping each word-combination tuple to the sum of the counts
        of the rows it came from.
    """
    totals = {}
    for row in l:
        text = row[0]
        weight = int(row[1])
        # combine() updates ``totals`` in place, so its return value is unused
        combine(totals, process(text, weight))
    return totals
# Sample search queries as (query text, count) pairs.  Counts are kept as
# strings here and converted with int() when the rows are processed.
SAMPLE = [
    ('how to move to sweden from the uk', '1'),
    ('how to move to poland from uk', '1'),
    ('furniture removals from spain', '1'),
    ('aubrey mc carthy am removels punchestown nass co kildare ireland', '1'),
    ('12 upper gower street darwin moved into 1839', '1'),
    ('groupage shipping malta', '1'),
    ('malta shipping agents', '1'),
    ('hello kitty stuff free shipping from china', '1'),
    ('emigration of jews from israel', '1'),
    ('moving to iran', '1'),
    ('emigrate to uk from india', '3'),
    ('emigrating to melbourne australia', '1'),
    ('emigrate to canada', '2'),
    ('pakistan move', '1'),
    ('emmigrating to cape town', '1'),
    ('cheapest surface shipping from usa to uk', '1'),
    ('emigrating to america from uk', '1'),
    ('emigrating to philippines from uk', '1'),
    ('ship furniture from usa to uk', '1'),
    ('shippers for mongolia in singapore', '1'),
    ('nomads removels', '1'),
    ('emigrate to gibraltar from the uk', '1'),
    ('bangladesh move', '1'),
    ('moving to madrid', '3'),
    ('moving to south korea', '1'),
    ('moving to colombia', '3'),
    ('move to america', '8'),
    ('removals from spain', '8'),
    ('removals lisbon', '1'),
    ('moving to india', '3'),
    ('moving to indonesia', '1'),
    ('removals dublin', '3'),
    ('moving to malta', '45'),
    ('move to brazil', '2'),
    ('moving chicago', '1'),
    ('moving to california', '29'),
    ('moving to brussels', '4'),
    ('moving to bangladesh', '1'),
    ('removals california', '2'),
    ('moving from spain', '4'),
    ('moving to russia', '3'),
    ('move to los angeles', '2'),
    ('move to germany', '1'),
    ('moving to poalnd', '1'),
    ('removals stockholm', '1'),
    ('removal to poland', '1'),
    ('moves uk', '7'),
    ('moving hamburger', '1'),
    ('move to malta', '8'),
    ('move to london', '1'),
    ('moving from cyprus', '1'),
    ('move to japan', '5'),
    ('removals poland', '1'),
    ('move to cyprus', '3'),
    ('removals to india', '3'),
    ('move london', '1'),
    ('moving to los angeles', '8'),
    ('move to uk', '1'),
    ('move to venezuela', '3'),
    ('move to malta', '1'),
    ('moves to philippines', '1'),
    ('moving to serbia', '1'),
    ('worldwide relocation services', '2'),
    ('moving to sicily', '1'),
    ('moving to brazil', '27'),
    ('movers san diego', '1'),
    ('moving to vancouver', '2'),
    ('move to finland', '1'),
    ('moving to america', '23'),
    ('move thailand', '1'),
    ('move to greece', '2'),
    ('mover to hungary', '1'),
    ('removals to macedonia', '1'),
    ('moving to uruguay', '1'),
    ('move to vietnam', '1'),
    ('moving to taiwan', '2'),
    ('move to san diego', '2'),
    ('moving to estonia', '2'),
    ('international removal surrey', '1'),
    ('removals to spain', '84'),
    ('removals to berlin', '3'),
    ('move mexico', '1'),
    ('moving to krakow', '1'),
    ('removals usa', '2'),
    ('move to canada', '3'),
    ('move to japan', '2'),
    ('moving to italy', '1'),
    ('move to copenhagen', '1'),
    ('move india', '2'),
    ('moving to peru', '2'),
    ('moving to florida', '1'),
    ('international removals edinburgh', '2'),
    ('moving to copenhagen', '1'),
    ('moving to edmonton', '3'),
    ('moving to ukraine', '2'),
    ('moving to us', '6'),
    ('removals to cyprus', '4'),
    ('move to south korea', '2'),
    ('move uk', '24'),
    ('move germany', '1'),
]
def main():
    """Print the word combinations in SAMPLE whose total count exceeds 1.

    Combinations are printed one per line, highest total first, in the form
    ``<count>: '<word>, <word>, ...'``.
    """
    x = sorted(process_many(SAMPLE).items(), key=lambda e: e[1], reverse=True)
    for item in x:
        if item[1] > 1:
            # A single parenthesised argument prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print("{0}: '{1}'".format(item[1], ", ".join(item[0])))


# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment