Created
July 30, 2020 16:29
-
-
Save javaarchive/3308a3d8fa6f73e74e50954a10d93b5b to your computer and use it in GitHub Desktop.
Grams.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Grams.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/javaarchive/3308a3d8fa6f73e74e50954a10d93b5b/grams.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "PDrSA4TuIB3a", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 139 | |
}, | |
"outputId": "b46d9e4e-53de-4f3a-f638-2a3bdbba3e1c" | |
}, | |
"source": [ | |
"import re, random, numpy\n", | |
"from numpy.random import choice, multinomial\n", | |
"random.seed(94753857)\n", | |
"#random.seed(1)\n", | |
"# UTILS\n", | |
"class WeightedRandomizer:\n", | |
" def __init__ (self, weights):\n", | |
" self.__max = .0\n", | |
" self.__weights = []\n", | |
" for value, weight in weights.items ():\n", | |
" self.__max += weight\n", | |
" self.__weights.append ( (self.__max, value) )\n", | |
"\n", | |
" def random (self):\n", | |
" r = random.random () * self.__max\n", | |
" for ceil, value in self.__weights:\n", | |
" if ceil > r: return value\n", | |
"# TYSM to https://stackoverflow.com/questions/14992521/python-weighted-random\n", | |
"# Data\n", | |
"from collections import defaultdict\n", | |
"text_data = \"\"\"\n", | |
"I’m gonna be coding this a lot to make the sentences make sense. I could use some sort of API, but I’m gonna try doing myself!\n", | |
"Send any feedback, hate or love for Twaffle :slight_smile:\n", | |
"Eddie\n", | |
"I am currently working on twaffle and I have got it to a state I think is OK to show. The app generates tech waffle or ‘twaffle’ for people who don’t know much about tech to try and impress their techy friends or family members, something I get sometimes in my family :joy: Right now it generates a sentence using a series of arrays with different words like nouns, verbs, prepositions, articles etc. Sometimes the sentence is total nonsense, but sometimes it spits out something good like below:\n", | |
"\"\"\"\n", | |
"regex = '|'.join(map(re.escape, [\"!\",\".\",\"?\"]))\n", | |
"def split_sents(full):\n", | |
" return re.split(regex,full)\n", | |
"def clean_data(data):\n", | |
" new_data = data.replace(\"-\",\"\")\n", | |
" new_data = new_data.replace(\"\\\"\",\"\")\n", | |
" return new_data\n", | |
"def format_data(data):\n", | |
" new_data = split_sents(data)\n", | |
" for i in range(len(new_data)):\n", | |
" new_data[i] = new_data[i].split(\" \")\n", | |
" for i in range(len(new_data)):\n", | |
" new_data[i] = list(filter(lambda x: x != \"\",new_data[i]))\n", | |
" return new_data\n", | |
"N = 3\n", | |
"M = 1\n", | |
"words = []\n", | |
"for line in text_data.splitlines():\n", | |
" if len(line) < 1:\n", | |
" print(\"Empty line\")\n", | |
" continue\n", | |
" else:\n", | |
" words += format_data(clean_data(line))\n", | |
" #print(words)\n", | |
"model = defaultdict(lambda: defaultdict(int)) \n", | |
"for sent in words:\n", | |
" for i in range(len(sent)-N):\n", | |
" #print(sent[i:(i+N)])\n", | |
" section = sent[i:(i+N)]\n", | |
" ending = section[M:]\n", | |
" starting = section[:M]\n", | |
" #print(ending)\n", | |
" #print(section[:M])\n", | |
" model[\" \".join(section[:M])][\" \".join(ending)] += 1\n", | |
"print(\"Generate Model: \")\n", | |
"#print(model)\n", | |
"def generate(approxlen, start = []):\n", | |
" print(\"Generating approximatley \"+str(approxlen)+\" words\")\n", | |
" output = start\n", | |
" if output == []:\n", | |
" output += random.choice(list(model.keys())).split(\" \")\n", | |
" while len(output) < approxlen:\n", | |
" #print(output)\n", | |
" lookup = \" \".join(output[-(M):])\n", | |
" #print(lookup)\n", | |
" #print(list(model[lookup].keys()))\n", | |
" #print(\"Choice!\")\n", | |
" freqsum = sum(model[lookup].values())\n", | |
" #print(\"KEYVALS\",list(model[lookup].keys()), list(map(lambda key: key/freqsum, list(model[lookup].values()))))\n", | |
" nextPortion = WeightedRandomizer(model[lookup]).random()\n", | |
" if nextPortion == None:\n", | |
" nextPortion = random.choice(list(model.keys()))\n", | |
" #print(\"Next Portion\",nextPortion)\n", | |
" output += nextPortion.split(\" \")\n", | |
" return \" \".join(output)\n", | |
"print(generate(50))\n", | |
"print(generate(200))\n" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Empty line\n", | |
"Generate Model: \n", | |
"Generating approximatley 50 words\n", | |
"on twaffle and I have got it to a series of API, but sometimes it to a lot to try and impress their techy friends or family :joy: Right now it spits out something good tech waffle or love for people who don’t know much about tech to a state\n", | |
"Generating approximatley 200 words\n", | |
"on twaffle and I have got it to a series of API, but sometimes it to a lot to try and impress their techy friends or family :joy: Right now it spits out something good tech waffle or love for people who don’t know much about tech to a state I think is OK try and impress their techy friends or ‘twaffle’ for people who don’t know much about tech to a state I think is OK app generates tech to try and impress their techy friends or ‘twaffle’ for people who don’t know much about tech to try and I have got it to a state I am currently working on twaffle and I have got it generates a state I am currently working on twaffle and I have got it generates a state I get sometimes in my family :joy: Right now it generates tech waffle or ‘twaffle’ for people who don’t know much about tech waffle or love for people who don’t know much about tech to make the sentences make the sentences make the sentences nouns, verbs, prepositions, waffle or ‘twaffle’ for people who don’t know much about tech waffle or love for people who don’t know\n" | |
], | |
"name": "stdout" | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment