-
-
Save KarimJedda/42359052d4a166791b035ae15e7b4a75 to your computer and use it in GitHub Desktop.
A program that uses Markov chains to generate probabilistic Hacker News titles.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf8
import re
import sys
import time
import urllib.request
from collections import defaultdict
from datetime import date, timedelta
from random import random
# No need to run this: it only rebuilds archive.txt by scraping the
# "Hacker News Daily" archive pages. The original note pointed to a
# ready-made list instead (the link is missing from this copy).
# The function is defined but deliberately never called; run get_titles()
# by hand once if you want to regenerate the archive.
def get_titles(start_year=2017, end_year=2021, archive_path="archive.txt",
               delay=1.0):
    """Scrape Hacker News Daily pages and write one title per line.

    Every calendar day from January 1st of ``start_year`` through
    December 31st of ``end_year`` is requested. Pages that fail to download
    or decode are reported and skipped, so the crawl is best-effort.

    Args:
        start_year: First four-digit year to fetch (inclusive).
        end_year: Last four-digit year to fetch (inclusive).
        archive_path: File that is overwritten with the scraped titles.
        delay: Seconds to wait after each successfully processed page.
    """
    title_pattern = re.compile(r'ylink"><[^>]*>([^<]*)')
    day = date(start_year, 1, 1)
    last_day = date(end_year, 12, 31)
    with open(archive_path, "w") as archive:
        # Walk real calendar dates rather than 1..31 for every month, which
        # avoids requesting pages for dates that do not exist.
        while day <= last_day:
            url = "https://www.daemonology.net/hn-daily/%s.html" % day.isoformat()
            day += timedelta(days=1)
            print(url)
            try:
                with urllib.request.urlopen(url) as response:
                    html = response.read().decode('utf-8')
                for title in title_pattern.findall(html):
                    archive.write(title + "\n")
                time.sleep(delay)
            except Exception as exc:
                # Best-effort: skip the page but say why. Unlike a bare
                # ``except:``, this lets Ctrl-C (KeyboardInterrupt) stop
                # the crawl.
                print("woopsie", exc)
# Load the scraped titles, one per line. The context manager guarantees the
# file is closed even if reading fails. Splitting on "\n" (rather than
# splitlines()) is kept on purpose: it leaves a trailing empty string when the
# file ends with a newline, which later code tolerates.
with open("archive.txt") as archive:
    titles = archive.read().split("\n")
markov_map = defaultdict(lambda: defaultdict(int))
lookback = 2

# Generate map in the form state -> next word -> occurrences of that word
# after the state. A state is the (up to) `lookback` preceding words joined by
# a space; '' is the start-of-title state and the next word '' marks the end
# of a title.
# Every entry of `titles` is visited: blank lines (including the trailing ''
# left by split("\n")) and titles of at most `lookback` words are skipped by
# the length check, so no real title is dropped when the archive lacks a
# final newline.
for title in titles:
    words = title.split()
    if len(words) <= lookback:
        continue
    for i in range(len(words) + 1):
        state = ' '.join(words[max(0, i - lookback):i])
        following_word = ' '.join(words[i:i + 1])  # '' once past the last word
        markov_map[state][following_word] += 1

# Convert map to the form state -> next word -> probability of that word
# after the state.
for following in markov_map.values():
    total = float(sum(following.values()))
    for key in following:
        following[key] /= total
def sample(items):
    """Draw one key from weighted ``(key, weight)`` pairs in a single pass.

    Each pair replaces the current pick with probability
    ``weight / running_total``, so a key ends up chosen in proportion to its
    weight. Returns None when ``items`` is empty or the accumulated weight
    stays at zero.
    """
    chosen = None
    running_total = 0.0
    for candidate, weight in items:
        running_total += weight
        if not running_total:
            # Nothing accumulated yet: avoid dividing by zero.
            continue
        if random() < weight / running_total:
            chosen = candidate
    return chosen
# Generate novel titles by walking the chain from the start state ('') until
# the end-of-title marker ('') is drawn.
if not markov_map['']:
    # Without any start word, sample() would return None and the walk below
    # would fail with an obscure TypeError; stop with a clear message instead.
    sys.exit("No usable titles in archive.txt: nothing to generate from.")

target_count = 100
# Safety cap so a corpus that cannot yield enough novel titles does not make
# the script loop forever.
max_attempts = 1000 * target_count

sentences = []
attempts = 0
while len(sentences) < target_count and attempts < max_attempts:
    attempts += 1
    words = []
    next_word = sample(markov_map[''].items())
    while next_word != '':
        words.append(next_word)
        next_word = sample(markov_map[' '.join(words[-lookback:])].items())
    sentence = ' '.join(words)
    # Prune sentences that are substrings of actual titles.
    if not any(sentence in title for title in titles):
        sentences.append(sentence)

for sentence in sentences:
    print(sentence)

if len(sentences) < target_count:
    print("Only %d novel titles found after %d attempts."
          % (len(sentences), attempts), file=sys.stderr)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment