Created
December 1, 2013 10:12
-
-
Save cheekybastard/7730902 to your computer and use it in GitHub Desktop.
headline generator
https://github.com/wilg/headlines
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
from random import randint | |
from random import random | |
import os | |
import itertools | |
import sys | |
# Settings | |
max_corpus_size = 20000 | |
# Accept args for different dictionaries | |
dicts = sys.argv | |
dicts.pop(0) | |
# import multiple dictionaries | |
dir = os.path.dirname(__file__) | |
imported_titles = [] | |
per_dictionary_limit = max_corpus_size / len(dicts) | |
for dictionary_name in dicts: | |
filename = os.path.join(dir, "dictionaries/" + dictionary_name + ".txt") | |
archive = open(filename) | |
dict_titles = archive.read().split("\n") | |
archive.close() | |
if len(dict_titles) > per_dictionary_limit: | |
window_start = randint(0,len(dict_titles) - per_dictionary_limit) | |
dict_titles = dict_titles[window_start:window_start+per_dictionary_limit] | |
imported_titles = imported_titles + dict_titles | |
titles = [] | |
# lowercase everything, remove quotes (to prevent dangling quotes) | |
# we're also going to create a mapping back to the original case if it's a weird word (like iPad) | |
titlecase_mappings = {} | |
for title in imported_titles: | |
titles.append(title.lower().replace("\"", "")) | |
for original_word in title.replace("\"", "").split(" "): | |
titlecase_mappings[original_word.lower()] = original_word | |
markov_map = defaultdict(lambda:defaultdict(int)) | |
lookback = 2 | |
#Generate map in the form word1 -> word2 -> occurences of word2 after word1 | |
for title in titles[:-1]: | |
title = title.split() | |
if len(title) > lookback: | |
for i in xrange(len(title)+1): | |
markov_map[' '.join(title[max(0,i-lookback):i])][' '.join(title[i:i+1])] += 1 | |
#Convert map to the word1 -> word2 -> probability of word2 after word1 | |
for word, following in markov_map.items(): | |
total = float(sum(following.values())) | |
for key in following: | |
following[key] /= total | |
#Typical sampling from a categorical distribution | |
def sample(items): | |
next_word = None | |
t = 0.0 | |
for k, v in items: | |
t += v | |
if t and random() < v/t: | |
next_word = k | |
return next_word | |
def get_sentence(length_max=140): | |
while True: | |
sentence = [] | |
next_word = sample(markov_map[''].items()) | |
while next_word != '': | |
sentence.append(next_word) | |
next_word = sample(markov_map[' '.join(sentence[-lookback:])].items()) | |
sentence = ' '.join(sentence) | |
if any(sentence in title for title in titles): | |
continue #Prune titles that are substrings of actual titles | |
if len(sentence) > length_max: | |
continue | |
return sentence | |
def retitlize_sentence(sentence): | |
lowercase_sentence = sentence.split(" ") | |
uppercase_sentence = [] | |
for word in lowercase_sentence: | |
uppercase_sentence.append(titlecase_mappings.get(word, word)) | |
return " ".join(uppercase_sentence) | |
for _ in itertools.repeat(None, 10): | |
print retitlize_sentence(get_sentence()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment