Created
July 7, 2018 08:05
-
-
Save BlogBlocks/328178acdff1ee78827ab35069a4a363 to your computer and use it in GitHub Desktop.
Notebook that creates ngrams and stores them in a sqlite3 database
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Playing with the ngram modules" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%writefile NgramBD.py\n", | |
"import sqlite3\n", | |
"conn = sqlite3.connect('ngram.db')\n", | |
"c = conn.cursor()\n", | |
"def ngram(find):\n", | |
" for row in c.execute(\"SELECT ROWID,* from ngram where src == ? order by random() limit 1\", (find,)):\n", | |
" result = row[0],row[1],row[2],row[3]\n", | |
" result = str(result)\n", | |
" return result" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import string\n", | |
"from NgramBD import ngram\n", | |
"sentence = raw_input(\"FIND: \")\n", | |
"sentence = sentence.lower()\n", | |
"words = sentence.split()\n", | |
"find = words[-1]\n", | |
"\n", | |
"line0 = ngram(find)\n", | |
"line0 = line0.split(\", u'\")\n", | |
"M1 = line0[2][:-1]\n", | |
"\n", | |
"line1 = ngram(M1)\n", | |
"line1 = line1.split(\", u'\")\n", | |
"M2 = line1[2][:-1]\n", | |
"\n", | |
"line2 = ngram(M2)\n", | |
"line2 = line2.split(\", u'\")\n", | |
"M3 = line2[2][:-1]\n", | |
"\n", | |
"line3 = ngram(M3)\n", | |
"line3 = line3.split(\", u'\")\n", | |
"M4 = line3[2][:-1]\n", | |
"\n", | |
"line4 = ngram(M4)\n", | |
"line4 = line4.split(\", u'\")\n", | |
"M5 = line4[2][:-1]\n", | |
"\n", | |
"sentence = sentence.capitalize()\n", | |
"print sentence, M1, M2, M3, M4, M5\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Overwriting JNGRAM.py\n" | |
] | |
} | |
], | |
"source": [ | |
"%%writefile JNGRAM.py\n", | |
"import string\n", | |
"\"\"\"\n", | |
"USAGE:\n", | |
"from JNGRAM import NG\n", | |
"ng()\n", | |
"\n", | |
"\"\"\"\n", | |
"from NgramBD import ngram\n", | |
"def NG():\n", | |
" sentence = raw_input(\"Complete this line: \")\n", | |
" sentence = sentence.lower()\n", | |
" words = sentence.split()\n", | |
" find = words[-1]\n", | |
"\n", | |
" line0 = ngram(find)\n", | |
" line0 = line0.split(\", u'\")\n", | |
" M1 = line0[2][:-1]\n", | |
"\n", | |
" line1 = ngram(M1)\n", | |
" line1 = line1.split(\", u'\")\n", | |
" M2 = line1[2][:-1]\n", | |
"\n", | |
" line2 = ngram(M2)\n", | |
" line2 = line2.split(\", u'\")\n", | |
" M3 = line2[2][:-1]\n", | |
"\n", | |
" line3 = ngram(M3)\n", | |
" line3 = line3.split(\", u'\")\n", | |
" M4 = line3[2][:-1]\n", | |
"\n", | |
" line4 = ngram(M4)\n", | |
" line4 = line4.split(\", u'\")\n", | |
" M5 = line4[2][:-1]\n", | |
"\n", | |
" sentence = sentence.capitalize()\n", | |
" print sentence, M1, M2, M3, M4, M5,\".\"\n", | |
" return\n", | |
"\n", | |
"NG()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Complete this line: I think that\n", | |
"I think that concept central economic contribution did .\n" | |
] | |
} | |
], | |
"source": [ | |
"from JNGRAM import NG\n", | |
"# You may even enter phrases:\n", | |
"# e.g.: I think that\n", | |
"NG()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from itertools import tee, islice\n", | |
"from collections import Counter\n", | |
"out =\"short-test-ngram5.txt\"\n", | |
"infile = open(out, \"a\")\n", | |
"from time import sleep\n", | |
"def ngrams(lst, n):\n", | |
" tlst = lst\n", | |
" while True:\n", | |
" a, b = tee(tlst)\n", | |
" l = tuple(islice(a, n))\n", | |
" if len(l) == n:\n", | |
" yield l\n", | |
" next(b)\n", | |
" tlst = b\n", | |
" else:\n", | |
" break\n", | |
"out =\"short-list5.txt\"\n", | |
"view = open(out, \"r\").read()\n", | |
"view = view.split()\n", | |
"text = Counter(ngrams(view, 2))\n", | |
"text = str(text)\n", | |
"text = text.replace(\",'):\",\" \")\n", | |
"text = text.replace(\",', '\",\" \")\n", | |
"text = text.replace(\" ('\",\" \")\n", | |
"text = text.replace('\"',' ')\n", | |
"text = text.replace(\"})\",\"\")\n", | |
"text = text.replace(\"Counter({('\",\"\")\n", | |
"text = text.replace(\" \",\" \")\n", | |
"text = text.split(\", \")\n", | |
"for line in text:\n", | |
" INPUT = line+\"\\n\"\n", | |
" infile.write(INPUT)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"count = 0\n", | |
"fileout = \"short-test-ngram5.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"for line in view:\n", | |
" count = count +1\n", | |
" if count > 1000 and count < 1300:\n", | |
" line = line.replace(\"\\n\",\"\") \n", | |
" print (line)\n", | |
"print (count) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"count = 0\n", | |
"fileout = \"short-test-ngram5.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"find = raw_input(\"Find bigram: \")\n", | |
"for line in view:\n", | |
" line = line.replace(\"\\n\", \"\")\n", | |
" line = line.split(\" \")\n", | |
" if find == line[1]:\n", | |
" \n", | |
" print line\n", | |
"print (count) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"count = 0\n", | |
"fileout = \"short-test-ngram5.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"sentence = raw_input(\"Find bigram: \")\n", | |
"words = sentence.split()\n", | |
"find = words[-1]\n", | |
"print find" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Experimenting with a text database works better." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Find bigram: live with \n", | |
"live with 'of', 'the', ----- 61792\n", | |
"live with 'in', 'the', ----- 28150\n", | |
"live with 'and', 'the', ----- 1874\n", | |
"live with 'to', 'the', ----- 1937\n", | |
"live with 'the', 'lord', ----- 2032\n", | |
"live with 'on', 'the', ----- 8718\n", | |
"live with 'for', 'the', ----- 8538\n", | |
"live with 'with', 'the', ----- 7959\n", | |
"live with 'by', 'the', ----- 7676\n", | |
"live with 'from', 'the', ----- 6795\n", | |
"live with 'is', 'a', ----- 6708\n", | |
"live with 'to', 'be', ----- 6683\n", | |
"live with 'it', 'is', ----- 6674\n", | |
"live with 'is', 'the', ----- 6508\n", | |
"live with 'of', 'a', ----- 5617\n", | |
"live with 'all', 'the', ----- 5287\n", | |
"live with 'of', 'and', ----- 5136\n", | |
"live with 'i', 'i', ----- 5049\n", | |
"live with 'as', 'the', ----- 4613\n", | |
"live with 'at', 'the', ----- 4606\n", | |
"live with 'in', 'a', ----- 4585\n", | |
"live with 'that', 'the', ----- 4487\n", | |
"live with 'and', 'he', ----- 4448\n", | |
"live with 'as', 'a', ----- 4167\n", | |
"live with 'of', 'his', ----- 4042\n", | |
"live with 'and', 'they', ----- 3952\n", | |
"live with 'will', 'be', ----- 3892\n", | |
"live with 'i', 'will', ----- 3697\n", | |
"live with 'out', 'of', ----- 3695\n", | |
"live with 'machine', 'learning', ----- 3665\n", | |
"live with 'into', 'the', ----- 3661\n" | |
] | |
} | |
], | |
"source": [ | |
"import random\n", | |
"from time import sleep\n", | |
"count = 0\n", | |
"fileout = \"short-test-ngram5.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"sentence = raw_input(\"Find bigram: \")\n", | |
"words = sentence.split()\n", | |
"find = words[-1]\n", | |
"for line in view:\n", | |
" line = line.replace(\"\\n\", \"\")\n", | |
" linea = line.split(\" \")\n", | |
" if find == linea[0]:lines = \" \".join(linea)\n", | |
" linea = str(linea) \n", | |
" line0 = linea.split(\" \")\n", | |
" line0[2] = line0[2].replace(\"'\",\"\")\n", | |
" line0[2] = line0[2].replace(\"]\",\"\")\n", | |
" line0[0] = line0[0].replace(\"[\",\"\")\n", | |
" num = int(line0[2])\n", | |
" if num>50:\n", | |
" print sentence,line0[0],line0[1],\"-----\",line0[2]\n", | |
" sleep(.1)\n", | |
" count = count +1\n", | |
" if count>30:break" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"reset -f" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from __future__ import division\n", | |
"n = 295/523671\n", | |
"print \"%.8f\" % n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Prints bigram and frequency" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Find bigram: test\n", | |
"test flight 0.00018523\n" | |
] | |
} | |
], | |
"source": [ | |
"from __future__ import division\n", | |
"import random\n", | |
"count = 0\n", | |
"fileout = \"short-test-ngram5.txt\"\n", | |
"\n", | |
"def gentext(fileout,sentence):\n", | |
" view = open(fileout, \"r\").readlines()\n", | |
" num_lines = sum(1 for line in open(fileout))\n", | |
" words = sentence.split()\n", | |
" find = words[-1]\n", | |
" hist = \"\"\n", | |
" lst1 = []\n", | |
" for line in view:\n", | |
" line = line.replace(\"\\n\", \"\")\n", | |
" linea = line.split(\" \")\n", | |
" if find == linea[0]:\n", | |
" lines = \" \".join(linea)\n", | |
" line0 = lines.split(\" \")\n", | |
" num = int(line0[2])\n", | |
" nu = \"%.8f\" % float(num/num_lines)\n", | |
" if nu>0.00009739:\n", | |
" #nu = \"%.8f\" % float(num/num_lines)\n", | |
" nu = str(nu)\n", | |
" jo = \"\".join(sentence+\" \"+line0[1]+\" \"+nu)\n", | |
" return jo\n", | |
" \n", | |
"sentence = raw_input(\"Find bigram: \") \n", | |
"fileout = \"short-test-ngram5.txt\" \n", | |
"print gentext(fileout, sentence) " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# creating a list of possibilities\n", | |
"\n", | |
"lis = []" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Find bigram: dog\n", | |
"dog fell 0.00009166\n", | |
"dog liked 0.00008020\n", | |
"dog and 0.00008020\n", | |
"dog thats 0.00003055\n", | |
"dog for 0.00003055\n", | |
"dog fighting 0.00003055\n", | |
"dog who 0.00003055\n", | |
"dog is 0.00002673\n", | |
"dog that 0.00001910\n", | |
"dog in 0.00001528\n", | |
"dog was 0.00001528\n", | |
"dog as 0.00001146\n", | |
"dog i 0.00000764\n", | |
"dog after 0.00000764\n", | |
"dog growls 0.00000764\n", | |
"dog near 0.00000764\n", | |
"dog with 0.00000764\n", | |
"dog curse 0.00000764\n", | |
"dog came 0.00000764\n", | |
"dog has 0.00000764\n", | |
"dog by 0.00000764\n", | |
"dog him 0.00000764\n", | |
"dog families 0.00000764\n", | |
"dog barking 0.00000382\n", | |
"dog into 0.00000382\n", | |
"dog does 0.00000382\n", | |
"dog family 0.00000382\n", | |
"dog before 0.00000382\n", | |
"dog out 0.00000382\n", | |
"dog really 0.00000382\n", | |
"dog upon 0.00000382\n", | |
"dog had 0.00000382\n", | |
"dog she 0.00000382\n", | |
"dog poop 0.00000382\n", | |
"dog so 0.00000382\n", | |
"dog knowing 0.00000382\n", | |
"dog but 0.00000382\n", | |
"dog move 0.00000382\n", | |
"dog barked 0.00000382\n", | |
"dog bit 0.00000382\n", | |
"dog of 0.00000382\n", | |
"dog may 0.00000382\n", | |
"dog can 0.00000382\n", | |
"dog his 0.00000382\n", | |
"dog while 0.00000382\n", | |
"dog carrying 0.00000382\n", | |
"dog howled 0.00000382\n", | |
"dog nine 0.00000382\n", | |
"None\n" | |
] | |
} | |
], | |
"source": [ | |
"from __future__ import division\n", | |
"import random\n", | |
"count = 0\n", | |
"fileout = \"short-test-ngram5.txt\"\n", | |
"lis = []\n", | |
"def gentext(fileout,sentence):\n", | |
" view = open(fileout, \"r\").readlines()\n", | |
" num_lines = (sum(1 for line in open(fileout)))/2\n", | |
" words = sentence.split()\n", | |
" find = words[-1]\n", | |
" hist = \"\"\n", | |
" lst1 = []\n", | |
" for line in view:\n", | |
" line = line.replace(\"\\n\", \"\")\n", | |
" linea = line.split(\" \")\n", | |
" if find == linea[0]:\n", | |
" lines = \" \".join(linea)\n", | |
" line0 = lines.split(\" \")\n", | |
" num = int(line0[2])\n", | |
" nu = \"%.8f\" % float(num/num_lines)\n", | |
" if nu>0.00009739:\n", | |
" #nu = \"%.8f\" % float(num/num_lines)\n", | |
" print sentence,line0[1], nu\n", | |
" lis.append(sentence+\" \"+line0[1]+\" \"+nu)\n", | |
"sentence = raw_input(\"Find bigram: \") \n", | |
"fileout = \"short-test-ngram5.txt\" \n", | |
"print gentext(fileout, sentence) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"they said unto 0.00444745\n", | |
"they said to 0.00169954\n", | |
"they said the 0.00103882\n", | |
"they said i 0.00087650\n", | |
"they said that 0.00062253\n", | |
"they said he 0.00028071\n", | |
"they said this 0.00025016\n", | |
"they said what 0.00023488\n", | |
"they said in 0.00021769\n", | |
"they said you 0.00021197\n", | |
"they said let 0.00021006\n", | |
"they said it 0.00019478\n", | |
"they said behold 0.00017568\n", | |
"they said and 0.00017186\n", | |
"they said they 0.00017186\n", | |
"they said we 0.00016041\n", | |
"they said if 0.00014513\n", | |
"they said there 0.00013940\n", | |
"they said o 0.00012412\n", | |
"they said my 0.00012412\n", | |
"they said who 0.00012221\n", | |
"they said is 0.00011076\n", | |
"they said go 0.00010885\n", | |
"they said no 0.00010312\n", | |
"they said thou 0.00009548\n", | |
"they said for 0.00008975\n", | |
"they said before 0.00008593\n", | |
"they said as 0.00008211\n", | |
"they said she 0.00007638\n", | |
"they said take 0.00007447\n", | |
"they said about 0.00007066\n", | |
"they said a 0.00006875\n", | |
"they said thus 0.00006875\n", | |
"they said because 0.00006875\n", | |
"they said blessed 0.00006875\n", | |
"they said these 0.00006684\n", | |
"they said from 0.00006493\n", | |
"they said lord 0.00006111\n", | |
"they said one 0.00005920\n", | |
"they said thy 0.00005920\n", | |
"they said ye 0.00005538\n", | |
"they said god 0.00005347\n", | |
"they said of 0.00005347\n", | |
"they said how 0.00005347\n", | |
"they said nothing 0.00005156\n", | |
"they said now 0.00005156\n", | |
"they said but 0.00004774\n", | |
"they said never 0.00004774\n", | |
"they said quite 0.00004583\n", | |
"they said by 0.00004583\n", | |
"they said an 0.00004583\n", | |
"they said thats 0.00004392\n", | |
"they said therefore 0.00004201\n", | |
"they said have 0.00004201\n", | |
"they said wherefore 0.00004201\n", | |
"they said come 0.00004010\n", | |
"they said why 0.00004010\n", | |
"they said with 0.00003819\n", | |
"they said do 0.00003628\n", | |
"they said nay 0.00003628\n", | |
"they said calm 0.00003437\n", | |
"they said ultra 0.00003437\n", | |
"they said educate 0.00003437\n", | |
"they said on 0.00003246\n", | |
"they said moreover 0.00003246\n", | |
"they said be 0.00003246\n", | |
"they said surely 0.00003246\n", | |
"they said hear 0.00003246\n", | |
"they said also 0.00003055\n", | |
"they said not 0.00003055\n", | |
"they said will 0.00003055\n", | |
"they said bring 0.00002864\n", | |
"they said so 0.00002864\n", | |
"they said shall 0.00002673\n", | |
"they said some 0.00002673\n", | |
"they said h 0.00002482\n", | |
"they said verily 0.00002482\n", | |
"they said t 0.00002482\n", | |
"they said here 0.00002482\n", | |
"they said heresy 0.00002482\n", | |
"they said at 0.00002292\n", | |
"they said among 0.00002292\n", | |
"they said like 0.00002292\n", | |
"they said which 0.00002292\n", | |
"they said than 0.00002292\n", | |
"they said john 0.00002292\n", | |
"they said all 0.00002292\n", | |
"they said see 0.00002101\n", | |
"they said arise 0.00002101\n", | |
"they said or 0.00002101\n", | |
"they said during 0.00002101\n", | |
"they said father 0.00002101\n", | |
"they said seek 0.00001910\n", | |
"they said oh 0.00001910\n", | |
"they said policies 0.00001910\n", | |
"they said strong 0.00001910\n", | |
"they said noted 0.00001910\n", | |
"they said just 0.00001719\n", | |
"they said master 0.00001719\n", | |
"they said lest 0.00001719\n", | |
"they said alas 0.00001719\n", | |
"they said when 0.00001719\n", | |
"they said give 0.00001719\n", | |
"they said must 0.00001719\n", | |
"they said concerning 0.00001719\n", | |
"they said above 0.00001719\n", | |
"they said make 0.00001719\n", | |
"they said shed 0.00001528\n", | |
"they said today 0.00001528\n", | |
"they said loudly 0.00001528\n", | |
"they said almost 0.00001528\n", | |
"they said something 0.00001528\n", | |
"they said did 0.00001528\n", | |
"they said even 0.00001528\n", | |
"they said first 0.00001528\n", | |
"they said lo 0.00001528\n", | |
"they said great 0.00001528\n", | |
"they said china 0.00001528\n", | |
"they said horrible 0.00001528\n", | |
"they said actions 0.00001528\n", | |
"they said technical 0.00001528\n", | |
"they said wed 0.00001528\n", | |
"they said agencies 0.00001528\n", | |
"they said hath 0.00001528\n", | |
"they said bad 0.00001528\n", | |
"they said send 0.00001528\n", | |
"they said again 0.00001528\n", | |
"they said house 0.00001528\n", | |
"they said amen 0.00001528\n", | |
"they said judge 0.00001528\n", | |
"they said very 0.00001528\n", | |
"they said last 0.00001528\n", | |
"they said call 0.00001337\n", | |
"they said his 0.00001337\n", | |
"they said where 0.00001337\n", | |
"they said whether 0.00001337\n", | |
"they said am 0.00001337\n", | |
"they said while 0.00001337\n", | |
"they said should 0.00001337\n", | |
"they said smite 0.00001337\n", | |
"they said rise 0.00001146\n", | |
"they said your 0.00001146\n", | |
"they said after 0.00001146\n", | |
"they said was 0.00001146\n", | |
"they said hearken 0.00001146\n", | |
"they said then 0.00001146\n", | |
"they said get 0.00001146\n", | |
"they said well 0.00000955\n", | |
"they said furthermore 0.00000955\n", | |
"they said whose 0.00000955\n", | |
"they said little 0.00000955\n", | |
"they said tell 0.00000955\n", | |
"they said whither 0.00000955\n", | |
"they said peace 0.00000955\n", | |
"they said without 0.00000955\n", | |
"they said men 0.00000955\n", | |
"they said learned 0.00000955\n", | |
"they said against 0.00000764\n", | |
"they said yet 0.00000764\n", | |
"they said put 0.00000764\n", | |
"they said exodus 0.00000764\n", | |
"they said turning 0.00000764\n", | |
"they said comest 0.00000764\n", | |
"they said woe 0.00000764\n", | |
"they said appeal 0.00000764\n", | |
"they said errors 0.00000764\n", | |
"they said none 0.00000764\n", | |
"they said wilt 0.00000764\n", | |
"they said truly 0.00000764\n", | |
"they said yea 0.00000764\n", | |
"they said bishop 0.00000764\n", | |
"they said hereby 0.00000764\n", | |
"they said aloud 0.00000764\n", | |
"they said write 0.00000764\n", | |
"they said sirs 0.00000764\n", | |
"they said heresies 0.00000764\n", | |
"they said follow 0.00000764\n", | |
"they said every 0.00000764\n", | |
"they said ah 0.00000764\n", | |
"they said are 0.00000764\n", | |
"they said appellant 0.00000764\n", | |
"they said our 0.00000764\n", | |
"they said neither 0.00000573\n", | |
"they said boa 0.00000573\n", | |
"they said goodbye 0.00000573\n", | |
"they said peradventure 0.00000573\n", | |
"they said whosoever 0.00000573\n", | |
"they said three 0.00000573\n", | |
"they said daughter 0.00000573\n", | |
"they said rather 0.00000573\n", | |
"they said under 0.00000573\n", | |
"they said art 0.00000573\n", | |
"they said out 0.00000573\n", | |
"they said doth 0.00000573\n", | |
"they said say 0.00000573\n", | |
"they said stand 0.00000573\n", | |
"they said enough 0.00000573\n", | |
"they said whence 0.00000573\n", | |
"they said speak 0.00000573\n", | |
"they said peaceably 0.00000573\n", | |
"they said seven 0.00000573\n", | |
"they said abjuration 0.00000573\n", | |
"they said secular 0.00000573\n", | |
"they said whom 0.00000573\n", | |
"they said swear 0.00000573\n", | |
"they said within 0.00000573\n", | |
"they said cause 0.00000573\n", | |
"they said except 0.00000573\n", | |
"they said upon 0.00000573\n", | |
"they said open 0.00000573\n", | |
"they said entreat 0.00000573\n", | |
"they said thine 0.00000573\n", | |
"they said turn 0.00000573\n", | |
"they said ask 0.00000573\n", | |
"they said son 0.00000573\n", | |
"they said lay 0.00000573\n", | |
"they said tarry 0.00000573\n", | |
"they said whoever 0.00000382\n", | |
"they said trouble 0.00000382\n", | |
"they said shes 0.00000382\n", | |
"they said king 0.00000382\n", | |
"they said vengeance 0.00000382\n", | |
"they said tempting 0.00000382\n", | |
"they said anxiously 0.00000382\n", | |
"they said consecrate 0.00000382\n", | |
"they said desperately 0.00000382\n", | |
"they said leave 0.00000382\n", | |
"they said tossing 0.00000382\n", | |
"they said its 0.00000382\n", | |
"they said prisoner 0.00000382\n", | |
"they said disperse 0.00000382\n", | |
"they said remember 0.00000382\n", | |
"they said everybody 0.00000382\n", | |
"they said hail 0.00000382\n", | |
"they said me 0.00000382\n", | |
"they said pray 0.00000382\n", | |
"they said hast 0.00000382\n", | |
"they said looking 0.00000382\n", | |
"they said yes 0.00000382\n", | |
"they said draw 0.00000382\n", | |
"they said feeling 0.00000382\n", | |
"they said according 0.00000382\n", | |
"they said came 0.00000382\n", | |
"they said separate 0.00000382\n", | |
"they said pig 0.00000382\n", | |
"they said numbers 0.00000382\n", | |
"they said saith 0.00000382\n", | |
"they said ho 0.00000382\n", | |
"they said figs 0.00000382\n", | |
"they said canst 0.00000382\n", | |
"they said plainly 0.00000382\n", | |
"they said fetch 0.00000382\n", | |
"they said keep 0.00000382\n", | |
"they said wherein 0.00000382\n", | |
"they said her 0.00000382\n", | |
"they said crimes 0.00000382\n", | |
"they said could 0.00000382\n", | |
"they said indignantly 0.00000382\n", | |
"they said only 0.00000382\n", | |
"they said daughters 0.00000382\n", | |
"they said sacrifice 0.00000382\n", | |
"they said garment 0.00000382\n", | |
"they said up 0.00000382\n", | |
"they said genesis 0.00000382\n", | |
"they said comfort 0.00000382\n", | |
"they said poor 0.00000382\n", | |
"they said farewell 0.00000382\n", | |
"they said friend 0.00000382\n", | |
"they said two 0.00000382\n", | |
"they said cast 0.00000382\n", | |
"they said gather 0.00000382\n", | |
"they said fill 0.00000382\n", | |
"they said moses 0.00000382\n", | |
"they said seriously 0.00000382\n", | |
"they said always 0.00000382\n", | |
"they said four 0.00000382\n", | |
"they said maid 0.00000382\n", | |
"they said hastily 0.00000382\n", | |
"they said indeed 0.00000382\n", | |
"they said waving 0.00000382\n", | |
"they said suffer 0.00000382\n", | |
"they said help 0.00000382\n", | |
"they said cry 0.00000382\n", | |
"they said any 0.00000382\n", | |
"they said theres 0.00000382\n", | |
"they said hes 0.00000382\n", | |
"they said brother 0.00000382\n", | |
"they said slay 0.00000382\n", | |
"they said hang 0.00000382\n", | |
"they said treason 0.00000382\n", | |
"they said either 0.00000382\n", | |
"they said drink 0.00000382\n", | |
"they said severely 0.00000382\n", | |
"they said escape 0.00000382\n", | |
"they said pour 0.00000382\n", | |
"they said lie 0.00000382\n", | |
"they said five 0.00000382\n", | |
"they said court 0.00000382\n", | |
"they said away 0.00000191\n", | |
"they said wretch 0.00000191\n", | |
"they said killed 0.00000191\n", | |
"they said kings 0.00000191\n", | |
"they said wait 0.00000191\n", | |
"they said ineffective 0.00000191\n", | |
"they said angrily 0.00000191\n", | |
"they said influence 0.00000191\n", | |
"they said depart 0.00000191\n", | |
"they said despicable 0.00000191\n", | |
"they said more 0.00000191\n", | |
"they said bull 0.00000191\n", | |
"they said far 0.00000191\n", | |
"they said nor 0.00000191\n", | |
"they said aught 0.00000191\n", | |
"they said panting 0.00000191\n", | |
"they said b 0.00000191\n", | |
"they said become 0.00000191\n", | |
"they said regardless 0.00000191\n", | |
"they said would 0.00000191\n", | |
"they said admitted 0.00000191\n", | |
"they said anyone 0.00000191\n", | |
"they said curse 0.00000191\n", | |
"they said thee 0.00000191\n", | |
"they said diocese 0.00000191\n", | |
"they said evil 0.00000191\n", | |
"they said sovereign 0.00000191\n", | |
"they said together 0.00000191\n", | |
"they said since 0.00000191\n", | |
"they said look 0.00000191\n", | |
"they said however 0.00000191\n", | |
"they said present 0.00000191\n", | |
"they said giving 0.00000191\n", | |
"they said cathedral 0.00000191\n", | |
"they said throw 0.00000191\n", | |
"they said privately 0.00000191\n", | |
"they said discern 0.00000191\n", | |
"they said render 0.00000191\n", | |
"they said pope 0.00000191\n", | |
"they said peter 0.00000191\n", | |
"they said resist 0.00000191\n", | |
"they said roll 0.00000191\n", | |
"they said charges 0.00000191\n", | |
"they said shoot 0.00000191\n", | |
"they said remove 0.00000191\n", | |
"they said second 0.00000191\n", | |
"they said though 0.00000191\n", | |
"they said grave 0.00000191\n", | |
"they said foul 0.00000191\n", | |
"they said procedure 0.00000191\n", | |
"they said adding 0.00000191\n", | |
"they said holy 0.00000191\n", | |
"they said tat 0.00000191\n", | |
"they said such 0.00000191\n", | |
"they said awake 0.00000191\n", | |
"they said accused 0.00000191\n", | |
"they said accusation 0.00000191\n", | |
"they said penance 0.00000191\n", | |
"they said divide 0.00000191\n", | |
"they said judges 0.00000191\n", | |
"they said mark 0.00000191\n", | |
"they said penalty 0.00000191\n", | |
"they said threescore 0.00000191\n", | |
"they said nevertheless 0.00000191\n", | |
"they said man 0.00000191\n", | |
"they said increase 0.00000191\n", | |
"they said does 0.00000191\n", | |
"they said mercy 0.00000191\n", | |
"they said confraternity 0.00000191\n", | |
"they said skin 0.00000191\n", | |
"they said deliver 0.00000191\n", | |
"they said defamation 0.00000191\n", | |
"they said colleague 0.00000191\n", | |
"they said certainly 0.00000191\n", | |
"they said sentence 0.00000191\n", | |
"they said their 0.00000191\n", | |
"they said chronicles 0.00000191\n", | |
"they said think 0.00000191\n", | |
"they said return 0.00000191\n", | |
"they said provinces 0.00000191\n", | |
"they said lets 0.00000191\n", | |
"they said forbid 0.00000191\n", | |
"they said good 0.00000191\n", | |
"they said immediately 0.00000191\n", | |
"they said forty 0.00000191\n", | |
"they said silver 0.00000191\n", | |
"they said disdainful 0.00000191\n", | |
"they said direct 0.00000191\n", | |
"they said inquisitors 0.00000191\n", | |
"they said temporal 0.00000191\n", | |
"they said naked 0.00000191\n", | |
"they said church 0.00000191\n", | |
"they said lather 0.00000191\n", | |
"they said walk 0.00000191\n", | |
"they said differ 0.00000191\n", | |
"they said presumably 0.00000191\n", | |
"they said hew 0.00000191\n", | |
"they said those 0.00000191\n", | |
"they said provides 0.00000191\n", | |
"they said through 0.00000191\n", | |
"they said hello 0.00000191\n", | |
"they said tyrants 0.00000191\n", | |
"they said saw 0.00000191\n", | |
"they said early 0.00000191\n", | |
"they said candle 0.00000191\n", | |
"they said had 0.00000191\n", | |
"they said wicked 0.00000191\n", | |
"they said secretly 0.00000191\n", | |
"they said thirdly 0.00000191\n", | |
"they said lift 0.00000191\n", | |
"they said tidings 0.00000191\n", | |
"they said young 0.00000191\n", | |
"they said legion 0.00000191\n", | |
"they said having 0.00000191\n", | |
"they said salvation 0.00000191\n", | |
"they said loose 0.00000191\n", | |
"they said eat 0.00000191\n", | |
"they said sight 0.00000191\n", | |
"they said happy 0.00000191\n", | |
"they said places 0.00000191\n" | |
] | |
} | |
], | |
"source": [ | |
"for line in lis:\n", | |
" print line" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"------------------------------------USING ________________________" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from __future__ import division\n", | |
"import random\n", | |
"count = 0\n", | |
"fileout = \"short-test-ngram5.txt\"\n", | |
"lis1 = []\n", | |
"lis2 = []\n", | |
"lis3 = []\n", | |
"def gentext(n, fileout,sentence):\n", | |
" view = open(fileout, \"r\").readlines()\n", | |
" num_lines = sum(1 for line in open(fileout))\n", | |
" words = sentence.split()\n", | |
" find = words[-1]\n", | |
" hist = \"\"\n", | |
" lst1 = []\n", | |
" for line in view:\n", | |
" line = line.replace(\"\\n\", \"\")\n", | |
" linea = line.split(\" \")\n", | |
" if find == linea[0]:\n", | |
" lines = \" \".join(linea)\n", | |
" line0 = lines.split(\" \")\n", | |
" num = int(line0[2])\n", | |
" nu = \"%.8f\" % float(num/num_lines)\n", | |
" if n==1:lis1.append(sentence+\" \"+line0[1]+\" \")\n", | |
" if n==2:lis2.append(sentence+\" \"+line0[1]+\" \")\n", | |
" if n==1:lis3.append(sentence+\" \"+line0[1]+\" \")\n", | |
" if n==2:lis3.append(sentence+\" \"+line0[1]+\" \") \n", | |
" \n", | |
"sent = raw_input(\"Find bigram: \") \n", | |
"fileout = \"short-test-ngram5.txt\"\n", | |
"n = 0\n", | |
"while n<3:\n", | |
" n = n+1\n", | |
" gentext(n,fileout,sent)\n", | |
" X1 = random.choice(lis1)\n", | |
" X1 = str(X1).split()\n", | |
" x1 = X1[1]\n", | |
" #print x1\n", | |
" gentext(n,fileout,x1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import random\n", | |
"print random.choice(lis1)\n", | |
"print random.choice(lis2)\n", | |
"print random.choice(lis3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"for line in lis3:\n", | |
" print line" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import random\n", | |
"import bisect\n", | |
"import matplotlib.pyplot as plt\n", | |
"import math\n", | |
"import time\n", | |
"\n", | |
"def method_in(a,b,c):\n", | |
" start_time = time.time()\n", | |
" for i,x in enumerate(a):\n", | |
" if x in b:\n", | |
" c[i] = 1\n", | |
" return(time.time()-start_time) \n", | |
"\n", | |
"def method_set_in(a,b,c):\n", | |
" start_time = time.time()\n", | |
" s = set(b)\n", | |
" for i,x in enumerate(a):\n", | |
" if x in s:\n", | |
" c[i] = 1\n", | |
" return(time.time()-start_time)\n", | |
"\n", | |
"def method_bisect(a,b,c):\n", | |
" start_time = time.time()\n", | |
" b.sort()\n", | |
" for i,x in enumerate(a):\n", | |
" index = bisect.bisect_left(b,x)\n", | |
" if index < len(a):\n", | |
" if x == b[index]:\n", | |
" c[i] = 1\n", | |
" return(time.time()-start_time)\n", | |
"\n", | |
"def profile():\n", | |
" time_method_in = []\n", | |
" time_method_set_in = []\n", | |
" time_method_bisect = []\n", | |
" \n", | |
" Nls = [x for x in range(1000,20000,1000)]\n", | |
" for N in Nls:\n", | |
" a = [x for x in range(0,N)]\n", | |
" random.shuffle(a)\n", | |
" b = [x for x in range(0,N)]\n", | |
" random.shuffle(b)\n", | |
" c = [0 for x in range(0,N)]\n", | |
"\n", | |
" time_method_in.append(math.log(method_in(a,b,c)))\n", | |
" time_method_set_in.append(math.log(method_set_in(a,b,c)))\n", | |
" time_method_bisect.append(math.log(method_bisect(a,b,c)))\n", | |
"\n", | |
" plt.plot(Nls,time_method_in,marker='o',color='r',linestyle='-',label='in')\n", | |
" plt.plot(Nls,time_method_set_in,marker='o',color='b',linestyle='-',label='set')\n", | |
" plt.plot(Nls,time_method_bisect,marker='o',color='g',linestyle='-',label='bisect')\n", | |
" plt.xlabel('list size', fontsize=18)\n", | |
" plt.ylabel('log(time)', fontsize=18)\n", | |
" plt.legend(loc = 'upper left')\n", | |
" plt.show() " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"print random.choice(lis3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import random\n", | |
"X1 = random.choice(lis1)\n", | |
"X1 = str(X1).split()\n", | |
"x1 = X1[1]\n", | |
"print x1" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# ------------------------------------USING __ ABOVE ______________________" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"reset -f" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"print lis" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import random\n", | |
"nl = lis\n", | |
"print random.choice(nl)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"nl = lis\n", | |
"for line in nl:\n", | |
" print line" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from __future__ import division\n", | |
"import random\n", | |
"count = 0\n", | |
"fileout = \"short-test-ngram5.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"num_lines = sum(1 for line in open(fileout))\n", | |
"sentence = raw_input(\"Find bigram: \")\n", | |
"words = sentence.split()\n", | |
"find = words[-1]\n", | |
"hist = \"\"\n", | |
"lst1 = []\n", | |
"for line in view:\n", | |
" line = line.replace(\"\\n\", \"\")\n", | |
" linea = line.split(\" \")\n", | |
" if find == linea[0]:\n", | |
" lines = \" \".join(linea)\n", | |
" line0 = lines.split(\" \")\n", | |
" num = int(line0[2])\n", | |
" nu = \"%.8f\" % float(num/num_lines)\n", | |
"        if float(nu)>0.00009739:\n", | |
" #nu = \"%.8f\" % float(num/num_lines)\n", | |
" \n", | |
" print sentence,line0[1], nu\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Find bigram: running\n", | |
". : running on the lord on time\n", | |
": running on the lord on are\n", | |
": running on the lord on it\n", | |
": running on the lord on work\n", | |
": running on the lord on to\n", | |
": running on the lord on in\n", | |
": running on the lord on later\n", | |
": running on the lord on him\n", | |
": running on the lord on now\n", | |
": running on the lord on talk\n", | |
": running on the lord on was\n", | |
": running on the lord on down\n", | |
": running on the lord on you\n", | |
": running on the lord on out\n", | |
": running on the lord on hands\n", | |
": running on the lord on at\n", | |
": running on the lord on were\n", | |
": running on the lord on rt\n", | |
": running on the lord on sea\n", | |
": running on the lord on that\n", | |
": running on the lord on database\n", | |
": running on the lord on not\n", | |
": running on the lord on because\n", | |
": running on the lord on see\n", | |
": running on the lord on book\n", | |
": running on the lord on vote\n", | |
": running on the lord on place\n", | |
": running on the lord on put\n", | |
": running on the lord on the\n", | |
": running on the lord on water\n", | |
": running on the lord on or\n", | |
": running on the lord on run\n", | |
": running on the lord on passion\n", | |
": running on the lord on come\n", | |
": running on the lord on one\n", | |
": running on the lord on lived\n", | |
": running on the lord on founded\n", | |
": running on the lord on but\n", | |
": running on the lord on dwell\n", | |
": running on the lord on war\n", | |
": running on the lord on fell\n", | |
": running on the lord on being\n", | |
": running on the lord on running\n", | |
": running on the lord on a\n", | |
": running on the lord on done\n", | |
": running on the lord on research\n", | |
": running on the lord on back\n", | |
": running on the lord on report\n", | |
": running on the lord on only\n", | |
": running on the lord on trump\n", | |
": running on the lord on sit\n", | |
": running on the lord on people\n", | |
": running on the lord on hold\n" | |
] | |
}, | |
{ | |
"ename": "KeyboardInterrupt", | |
"evalue": "", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-13-47183d9ef90c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mlineY\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mview\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0mlineY\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlineY\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\" \"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 45\u001b[0;31m \u001b[0mnum5\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlineY\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 46\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnum5\u001b[0m \u001b[0;34m>\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlineY\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mliney\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mnum\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;36m30\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mKeyboardInterrupt\u001b[0m: " | |
] | |
} | |
], | |
"source": [ | |
"import random\n", | |
"count = 0\n", | |
"fileout = \"short-test-ngram5.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"sentence = raw_input(\"Find bigram: \")\n", | |
"words = sentence.split()\n", | |
"find = words[-1]\n", | |
"hist = \"\"\n", | |
"lst1 = []\n", | |
"for line in view:\n", | |
" line = line.replace(\"\\n\", \"\")\n", | |
" linea = line.split(\" \")\n", | |
" if find == linea[0]:\n", | |
" lines = \" \".join(linea)\n", | |
" line0 = lines.split(\" \")\n", | |
" num = int(line0[2])\n", | |
" if num>50:\n", | |
" #print \":\",line0[0],line0[1]\n", | |
" for linez in view:\n", | |
" linez = linez.split(\" \")\n", | |
" num2 = int(linez[2])\n", | |
" if num2 >50:\n", | |
" if line0[1] == linez[0] and num>30:\n", | |
" #print linea[0],linea[1],linez[1]\n", | |
" for lineZ in view:\n", | |
" lineZ = lineZ.split(\" \")\n", | |
" num3 = int(lineZ[2])\n", | |
" if num3 >50:\n", | |
" if linez[1] == lineZ[0] and num>30:\n", | |
" #print linea[0],linea[1],linez[1],lineZ[1]\n", | |
" all = sentence,linea[1],linez[1],lineZ[1]\n", | |
" all = \" \".join(all)\n", | |
" #print \"All: \",all\n", | |
" print \".\",\n", | |
" for liney in view:\n", | |
" liney = liney.split(\" \")\n", | |
" num4 = int(liney[2])\n", | |
" if num4 >50:\n", | |
" if liney[1] == linez[0] and num>30:\n", | |
" #if count <20:break\n", | |
" #count =count +1\n", | |
" #print sentence,linea[1],linez[1],lineZ[1],liney[1]\n", | |
" for lineY in view:\n", | |
" lineY = lineY.split(\" \")\n", | |
" num5 = int(lineY[2])\n", | |
" if num5 >50:\n", | |
" if lineY[1] == liney[0] and num>30:\n", | |
" count =count +1\n", | |
" #print count,\n", | |
" if count <20:break\n", | |
" if lineY[1] != hist:\n", | |
" print \":\",sentence,linea[1],linez[1],lineZ[1],liney[1],lineY[1] \n", | |
" hist = lineY[1]\n", | |
" \n", | |
" \n", | |
" \n", | |
"print (count) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Find bigram: find\n", | |
": find the\n" | |
] | |
} | |
], | |
"source": [ | |
"import random\n", | |
"count = 0\n", | |
"longlist = []\n", | |
"fileout = \"short-test-ngram5.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"sentence = raw_input(\"Find bigram: \")\n", | |
"words = sentence.split()\n", | |
"find = words[-1]\n", | |
"for line in view:\n", | |
" line = line.replace(\"\\n\", \"\")\n", | |
" linea = line.split(\" \")\n", | |
" if find == linea[0]:\n", | |
" lines = \" \".join(linea)\n", | |
" line0 = lines.split(\" \")\n", | |
" num = int(line0[2])\n", | |
" if num>50:\n", | |
" print \":\",line0[0],line0[1]\n", | |
" for linez in view:\n", | |
" linez = linez.split(\" \")\n", | |
" num2 = int(linez[2])\n", | |
" if num2 >50:\n", | |
" if line0[1] == linez[0] and num>30:\n", | |
" part = linea[0],linea[1],linez[1]\n", | |
" longlist.append(part)\n", | |
" for lineZ in view:\n", | |
" lineZ = lineZ.split(\" \")\n", | |
" num3 = int(lineZ[2])\n", | |
" if num3 >50:\n", | |
" if linez[1] == lineZ[0] and num>30:\n", | |
" #print linea[0],linea[1],linez[1],lineZ[1]\n", | |
" all = sentence,linea[1],linez[1],lineZ[1]\n", | |
" all = \" \".join(all)\n", | |
" longlist.append(all)\n", | |
"print (count) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"count = 0\n", | |
"fileout = \"short-test-ngram5.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"find = raw_input(\"Find bigram: \")\n", | |
"for line in view:\n", | |
" line = line.replace(\"\\n\", \"\")\n", | |
" linea = line.split(\" \")\n", | |
" if find == linea[0]:\n", | |
" lines = \" \".join(linea)\n", | |
" line0 = lines.split(\" \")\n", | |
" for linez in view:\n", | |
" num = int(line0[2])\n", | |
" if line0[1] in linez and num>30:\n", | |
" print linea[0],linea[1],linez\n", | |
" \n", | |
" \n", | |
" #print line\n", | |
"print (count) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"count = 0\n", | |
"fileout = \"short-test-ngram5.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"find = raw_input(\"Find bigram: \")\n", | |
"for line in view:\n", | |
" line = line.replace(\"\\n\", \"\")\n", | |
" line = line.split(\" \")\n", | |
" if find == line[0]:\n", | |
" \n", | |
" print line\n", | |
"print (count) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"!ls www.gutenberg.lib/text/ALLfiles.txt" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%writefile testit.txt\n", | |
"leader brave 8\n", | |
"morning,'\n", | |
"trump's\n", | |
"): 8\n", | |
"co to 8\n", | |
"these advanced 8\n", | |
"beg your 8\n", | |
"issuing ht 8\n", | |
"out looks 8\n", | |
"the completely 8\n", | |
"science samples 8\n", | |
"night grows 8\n", | |
"slaves today 8\n", | |
"fast congress 8\n", | |
"these energies 8\n", | |
"less we 8\n", | |
"single chip 8\n", | |
"other journalists 8\n", | |
"for rocket 8\n", | |
"hearts courage 8\n", | |
"breakfast milk 8\n", | |
"of meddling 8\n", | |
"what energy 8\n", | |
"taxes,'\n", | |
"don't\n", | |
"): 8\n", | |
"democracy divided 8\n", | |
"no cigar 8\n", | |
"party thank 8\n", | |
"tomb general 8\n", | |
"scanner that 8\n", | |
"torn,'\n", | |
"we'll\n", | |
"): 8\n", | |
"central pm 8\n", | |
"love amp 8\n", | |
"unfortunately many 8\n", | |
"on international 8\n", | |
"continue visiting 8\n", | |
"faith,'\n", | |
"isn't\n", | |
"): 8\n", | |
"members local 8\n", | |
"cause economy 8\n", | |
"prices now 8\n", | |
"high commissioners 8\n", | |
"in anticipation 8" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"filein = \"testit.txt\"\n", | |
"#fileout =\"www.gutenberg.lib/text/d_parsed3.txt\"\n", | |
"IN = open(filein,\"r\").readlines()\n", | |
"for line in IN:\n", | |
" NL = []\n", | |
" line = line.replace(\"\\n\",\"\")\n", | |
" linez = line.split()\n", | |
" try:\n", | |
" if linez[1] == \"\":A = '1 no exist'\n", | |
" except:\n", | |
" print line,'1 no exist'\n", | |
" NL.append(linez[0])\n", | |
" pass\n", | |
" try:\n", | |
" if linez[1] == \"\":B = '2 no exist'\n", | |
" \n", | |
" except:\n", | |
" print line,'2 no exist' \n", | |
" NL.append(linez[0])\n", | |
" pass\n", | |
" \n", | |
" \n", | |
" print line,\"-\",NL\n", | |
" print \"------------\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"filein = \"testit.txt\"\n", | |
"#fileout =\"www.gutenberg.lib/text/d_parsed3.txt\"\n", | |
"IN = open(filein,\"r\").readlines()\n", | |
"for line in IN:\n", | |
" NL = []\n", | |
" line = line.replace(\"\\n\",\"\")\n", | |
" linez = line.split() \n", | |
" if \",'\" in linez[0]:\n", | |
" nline = line.replace(\",'\",\" \")\n", | |
" NL.append(nline)\n", | |
" if \"):\" in line:\n", | |
" n2line = line.replace(\"):\",\" \")\n", | |
" NL.append(n2line)\n", | |
" if \"):\" not in line and \",'\" not in line:\n", | |
"        if \"):\" in line:\n", | |
" linez = nline+n2line\n", | |
" \n", | |
" print \":::\",linez[0],linez[1], linez[2] \n", | |
" \n", | |
" print line,\"-\",NL" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#!/usr/bin/env python\n", | |
"from __future__ import print_function\n", | |
"import io\n", | |
"import string\n", | |
"filein = \"www.gutenberg.lib/text/ALLfiles.txt\"\n", | |
"fileout =\"www.gutenberg.lib/text/d_parsed3.txt\"\n", | |
"whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')\n", | |
"IN = open(filein,\"r\").readlines()\n", | |
"OUT = open(fileout,\"a\")\n", | |
"for line in IN:\n", | |
" #line = line.encode('ascii', 'ignore')\n", | |
" line = line.replace(\"u'\",\"\")\n", | |
" line = line.replace(\"_\",\" \")\n", | |
"    line = line.replace('\"',' ')\n", | |
" line = line.replace(\"-\",\" \")\n", | |
" line = line.replace(\":\",\" \")\n", | |
" line = line.replace(\".\",\"\")\n", | |
" line = line.replace(\";\",\" \")\n", | |
" line = line.replace(\"/\",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" printable = set(string.printable)\n", | |
" line = filter(lambda x: x in printable, line)\n", | |
" line = ''.join(filter(whitelist.__contains__, line)) \n", | |
" newline = line.lower()\n", | |
" newline = newline.replace(\"\\n\", \"\")\n", | |
" if len(newline)>100:\n", | |
" OUT.write(newline+\"\\n\")\n", | |
"OUT.close()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#!/usr/bin/env python\n", | |
"from __future__ import print_function\n", | |
"import io\n", | |
"import string\n", | |
"count = 0\n", | |
"filein = \"www.gutenberg.lib/text/ALLfiles.txt\"\n", | |
"fileout =\"short.txt\"\n", | |
"whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')\n", | |
"IN = open(filein,\"r\").readlines()\n", | |
"OUT = open(fileout,\"a\")\n", | |
"for line in IN:\n", | |
" #line = line.encode('ascii', 'ignore')\n", | |
" line = line.replace(\"u'\",\"\")\n", | |
" line = line.replace(\"_\",\" \")\n", | |
" line = line.replace('\"',' ')\n", | |
" line = line.replace(\"-\",\" \")\n", | |
" line = line.replace(\":\",\" \")\n", | |
" line = line.replace(\".\",\"\")\n", | |
" line = line.replace(\";\",\" \")\n", | |
" line = line.replace(\"/\",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" printable = set(string.printable)\n", | |
" line = filter(lambda x: x in printable, line)\n", | |
" line = ''.join(filter(whitelist.__contains__, line)) \n", | |
" newline = line.lower()\n", | |
" newline = newline.replace(\"\\n\", \"\")\n", | |
" if len(newline)>100:\n", | |
" count = count +1\n", | |
" if count<200: \n", | |
" OUT.write(newline+\"\\n\")\n", | |
"OUT.close()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#!/usr/bin/env python\n", | |
"#from __future__ import print_function\n", | |
"import io\n", | |
"import string\n", | |
"import enchant\n", | |
"d = enchant.Dict(\"en_US\")\n", | |
"filein = \"www.gutenberg.lib/text/ALLfiles.txt\"\n", | |
"fileout =\"www.gutenberg.lib/text/d_parsed3-oneline.txt\"\n", | |
"whitelist = set('abcdefghijklmnopqrstuvwxyz \\'ABCDEFGHIJKLMNOPQRSTUVWXYZ')\n", | |
"IN = open(filein,\"r\").readlines()\n", | |
"OUT = open(fileout,\"a\")\n", | |
"count = 0\n", | |
"for line in IN:\n", | |
" #line = line.encode('ascii', 'ignore')\n", | |
" line = line.replace(\"u'\",\"\")\n", | |
" line = line.replace(\"_\",\" \")\n", | |
" line = line.replace('\"',' ')\n", | |
" line = line.replace(\"-\",\" \")\n", | |
" line = line.replace(\":\",\" \")\n", | |
" line = line.replace(\".\",\"\")\n", | |
" line = line.replace(\";\",\" \")\n", | |
" line = line.replace(\"/\",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" printable = set(string.printable)\n", | |
" line = filter(lambda x: x in printable, line)\n", | |
" line = ''.join(filter(whitelist.__contains__, line)) \n", | |
" newline = line.lower()\n", | |
" newline = newline.replace(\"\\n\", \"\")\n", | |
" if len(newline)>100:\n", | |
" newline = newline.split(\" \")\n", | |
" for word in newline:\n", | |
" count = count +1\n", | |
" #if count<100:print word\n", | |
" #if count > 16 and count < 160 and len(word)>1:\n", | |
" if len(word)>0: \n", | |
" if d.check(word) == True: \n", | |
" OUT.write(word+\" \")\n", | |
" #count = count +1\n", | |
" if count %25 == 0:OUT.write(\"\\n\")\n", | |
" \n", | |
"OUT.close()\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# short-list.txt , delimiters" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#!/usr/bin/env python\n", | |
"#from __future__ import print_function\n", | |
"import io\n", | |
"import string\n", | |
"import sys\n", | |
"import enchant\n", | |
"d = enchant.Dict(\"en_US\")\n", | |
"filein = \"www.gutenberg.lib/text/ALLfiles.txt\"\n", | |
"fileout =\"short-list.txt\"\n", | |
"whitelist = set('abcdefghijklmnopqrstuvwxyz \\'ABCDEFGHIJKLMNOPQRSTUVWXYZ')\n", | |
"IN = open(filein,\"r\").readlines()\n", | |
"OUT = open(fileout,\"a\")\n", | |
"count = 0\n", | |
"for line in IN:\n", | |
" #line = line.encode('ascii', 'ignore')\n", | |
" line = line.replace(\"u'\",\"\")\n", | |
" line = line.replace(\"_\",\" \")\n", | |
" line = line.replace('\"',' ')\n", | |
" line = line.replace(\"-\",\" \")\n", | |
" line = line.replace(\":\",\" \")\n", | |
" line = line.replace(\".\",\"\")\n", | |
" line = line.replace(\";\",\" \")\n", | |
" line = line.replace(\"/\",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" printable = set(string.printable)\n", | |
" line = filter(lambda x: x in printable, line)\n", | |
" line = ''.join(filter(whitelist.__contains__, line)) \n", | |
" newline = line.lower()\n", | |
" newline = newline.replace(\"\\n\", \"\")\n", | |
" if len(newline)>100:\n", | |
" newline = newline.split(\" \")\n", | |
" for word in newline:\n", | |
" count = count +1\n", | |
" #if count<100:print word\n", | |
" #if count > 16 and count < 160 and len(word)>1:\n", | |
" if len(word)>0: \n", | |
" if d.check(word) == True:\n", | |
" if count > 2000:sys.exit()\n", | |
" OUT.write(word+\", \")\n", | |
" count = count +1\n", | |
" print (count)\n", | |
" #if count %25 == 0:OUT.write(\"\\n\")\n", | |
" \n", | |
"OUT.close()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"a.encode('ascii', 'ignore')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"!rm www.gutenberg.lib/text/d_parsed3.txt" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"x = 'There is a dog and fox fighting in the park and there is an apple falling down.'\n", | |
"\n", | |
"x = x.split(' ')\n", | |
"\n", | |
"for i,word in enumerate(x):\n", | |
" if i != 0 and i % 3 == 0:\n", | |
" x[i] = word + '\\n'\n", | |
"\n", | |
"print (' '.join(x))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from itertools import tee, islice\n", | |
"from collections import Counter\n", | |
"out =\"short-test-ngram4.txt\"\n", | |
"infile = open(out, \"a\")\n", | |
"from time import sleep\n", | |
"def ngrams(lst, n):\n", | |
" tlst = lst\n", | |
" while True:\n", | |
" a, b = tee(tlst)\n", | |
" l = tuple(islice(a, n))\n", | |
" if len(l) == n:\n", | |
" yield l\n", | |
" next(b)\n", | |
" tlst = b\n", | |
" else:\n", | |
" break\n", | |
"out =\"short-list4.txt\"\n", | |
"view = open(out, \"r\").read()\n", | |
"view = view.split()\n", | |
"text = Counter(ngrams(view, 2))\n", | |
"text = str(text)\n", | |
"text = text.replace(\",'):\",\" \")\n", | |
"text = text.replace(\",', '\",\" \")\n", | |
"text = text.replace(\" ('\",\" \")\n", | |
"text = text.replace('\"',' ')\n", | |
"text = text.replace(\"})\",\"\")\n", | |
"text = text.replace(\"Counter({('\",\"\")\n", | |
"text = text.replace(\" \",\" \")\n", | |
"text = text.split(\", \")\n", | |
"for line in text:\n", | |
" INPUT = line+\"\\n\"\n", | |
" infile.write(INPUT)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from itertools import tee, islice\n", | |
"from collections import Counter\n", | |
"#out =\"short.txt\"\n", | |
"#view = open(out, \"a\")\n", | |
"from time import sleep\n", | |
"def ngrams(lst, n):\n", | |
" tlst = lst\n", | |
" while True:\n", | |
" a, b = tee(tlst)\n", | |
" l = tuple(islice(a, n))\n", | |
" if len(l) == n:\n", | |
" yield l\n", | |
" next(b)\n", | |
" tlst = b\n", | |
" else:\n", | |
" break\n", | |
"#short-test-ngram5.txt \n", | |
"out =\"short-test-ngram5.txt\"\n", | |
"view = open(out, \"r\").read()\n", | |
"view = view.split()\n", | |
"text = Counter(ngrams(view, 2))\n", | |
"text = str(text)\n", | |
"text = text.replace(\",'):\",\" \")\n", | |
"text = text.replace(\",', '\",\" \")\n", | |
"text = text.replace(\" ('\",\" \")\n", | |
"\n", | |
"text = text.replace(\"Counter({('\",\"\")\n", | |
"text = text.replace(\" \",\" \")\n", | |
"text = text.split(\", \")\n", | |
"\n", | |
"for i,word in enumerate(text):\n", | |
" if i != 0 and i % 3 == 0:\n", | |
" text[i] = word + '\\n'\n", | |
"\n", | |
"print (' '.join(text))\n", | |
"\n", | |
"print (text)\n", | |
"sleep(1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import obo" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# --------------------------------------------------------------------" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#!/usr/bin/env python\n", | |
"from __future__ import print_function\n", | |
"import io\n", | |
"import string\n", | |
"count = 0\n", | |
"filein = \"www.gutenberg.lib/text/ALLfiles.txt\"\n", | |
"fileout =\"short.txt\"\n", | |
"whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')\n", | |
"IN = open(filein,\"r\").readlines()\n", | |
"OUT = open(fileout,\"a\")\n", | |
"for line in IN:\n", | |
" #line = line.encode('ascii', 'ignore')\n", | |
" line = line.replace(\"u'\",\"\")\n", | |
" line = line.replace(\"_\",\" \")\n", | |
" line = line.replace('\"',' ')\n", | |
" line = line.replace(\"-\",\" \")\n", | |
" line = line.replace(\":\",\" \")\n", | |
" line = line.replace(\".\",\"\")\n", | |
" line = line.replace(\";\",\" \")\n", | |
" line = line.replace(\"/\",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" printable = set(string.printable)\n", | |
" line = filter(lambda x: x in printable, line)\n", | |
" line = ''.join(filter(whitelist.__contains__, line)) \n", | |
" newline = line.lower()\n", | |
" newline = newline.replace(\"\\n\", \"\")\n", | |
" if len(newline)>100:\n", | |
" count = count +1\n", | |
" if count<200: \n", | |
" OUT.write(newline+\"\\n\")\n", | |
"OUT.close()\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Viewing ngrams of letters" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Creating short.text" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#!/usr/bin/env python\n", | |
"from __future__ import print_function\n", | |
"import io\n", | |
"import string\n", | |
"count = 0\n", | |
"filein = \"www.gutenberg.lib/text/ALLfiles.txt\"\n", | |
"fileout =\"short.txt\"\n", | |
"whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')\n", | |
"IN = open(filein,\"r\").readlines()\n", | |
"OUT = open(fileout,\"a\")\n", | |
"for line in IN:\n", | |
" #line = line.encode('ascii', 'ignore')\n", | |
" line = line.replace(\"u'\",\"\")\n", | |
" line = line.replace(\"_\",\" \")\n", | |
" line = line.replace('\"',' ')\n", | |
" line = line.replace(\"-\",\" \")\n", | |
" line = line.replace(\":\",\" \")\n", | |
" line = line.replace(\".\",\"\")\n", | |
" line = line.replace(\";\",\" \")\n", | |
" line = line.replace(\"/\",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" printable = set(string.printable)\n", | |
" line = filter(lambda x: x in printable, line)\n", | |
" line = ''.join(filter(whitelist.__contains__, line)) \n", | |
" newline = line.lower()\n", | |
" newline = newline.replace(\"\\n\", \"\")\n", | |
" if len(newline)>100:\n", | |
" count = count +1\n", | |
" if count<200: \n", | |
" OUT.write(newline+\"\\n\")\n", | |
"OUT.close()\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Trigrams of letters" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from itertools import tee, islice\n", | |
"from collections import Counter\n", | |
"#out =\"short.txt\"\n", | |
"#view = open(out, \"a\")\n", | |
"\n", | |
"def ngrams(lst, n):\n", | |
" tlst = lst\n", | |
" while True:\n", | |
" a, b = tee(tlst)\n", | |
" l = tuple(islice(a, n))\n", | |
" if len(l) == n:\n", | |
" yield l\n", | |
" next(b)\n", | |
" tlst = b\n", | |
" else:\n", | |
" break\n", | |
"out =\"short.txt\"\n", | |
"view = open(out, \"r\").read()\n", | |
"view = list(view)\n", | |
"Counter(ngrams(view, 3))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# --------------------------------------------------------------------" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import re\n", | |
"from itertools import islice, izip\n", | |
"from collections import Counter\n", | |
"out =\"www.gutenberg.lib/text/d_parsed3.txt\"\n", | |
"view = open(out, \"a\")\n", | |
"\n", | |
"\n", | |
"fileout =\"www.gutenberg.lib/text/d_parsed3.txt\"\n", | |
"view = open(fileout, \"r\").read() \n", | |
"words = re.findall(\"\\w+\", view)\n", | |
"tx = Counter(izip(words, islice(words, 1, None)))\n", | |
"view.write(tx)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"out =\"www.gutenberg.lib/text/d_parsed3s.txt\"\n", | |
"view = open(out, \"a\")\n", | |
"\n", | |
"for line in tx:\n", | |
" linez =str(line)\n", | |
" view.write(linez)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from itertools import tee, islice\n", | |
"\n", | |
"def ngrams(lst, n):\n", | |
" tlst = lst\n", | |
" while True:\n", | |
" a, b = tee(tlst)\n", | |
" l = tuple(islice(a, n))\n", | |
" if len(l) == n:\n", | |
" yield l\n", | |
" next(b)\n", | |
" tlst = b\n", | |
" else:\n", | |
" break\n", | |
"\n", | |
"fileout =\"www.gutenberg.lib/text/d_parsed3.txt\"\n", | |
"view = open(fileout, \"r\").read() \n", | |
" \n", | |
"print ngrams(view, 3)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from time import sleep\n", | |
"count = 0\n", | |
"fileout =\"short-list4.txt\"\n", | |
"view = open(fileout, \"r\").read()\n", | |
"for line in view.split(\",\"):\n", | |
" count = count +1\n", | |
" if '(' in line:\n", | |
" #\"\"count > 160800 and count < 160953'':\n", | |
" #if count < 16:\n", | |
" print (line)\n", | |
" sleep(1)\n", | |
"print (count) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from time import sleep\n", | |
"count = 0\n", | |
"fileout =\"short-test-ngram.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"for line in view:\n", | |
" count = count +1\n", | |
" if count > 1608 and count < 1700:\n", | |
" #if count < 16:\n", | |
" print (line)\n", | |
" sleep(1)\n", | |
"print (count) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"count = 0\n", | |
"fileout =\"www.gutenberg.lib/text/d_parsed3.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"for line in view:\n", | |
" count = count +1\n", | |
" if count > 160800 and count < 160953:\n", | |
" #if count < 16:\n", | |
" print (line)\n", | |
"print (count) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import enchant\n", | |
"d = enchant.Dict(\"en_US\")\n", | |
"\n", | |
"count = 0\n", | |
"fileout =\"www.gutenberg.lib/text/d_parsed3.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"for line in view:\n", | |
" line = line.split(\" \")\n", | |
" count = count +1\n", | |
" for word in line:\n", | |
" if count > 16 and count < 160 and len(word)>1:\n", | |
" \n", | |
" #print (\":\",word)\n", | |
" if d.check(word) == True:\n", | |
" print word,\n", | |
"print (count) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"reset -f" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"count = 0\n", | |
"fileout = \"www.gutenberg.lib/text/ALLfiles.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"for line in view:\n", | |
" count = count +1\n", | |
" if count > 2655580 and count < 2655588:\n", | |
" print (line)\n", | |
"print (count) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"count = 0\n", | |
"filename = 'www.gutenberg.lib/text/ALLfiles.txt'\n", | |
"file = open(filename, 'rt')\n", | |
"text = file.read()\n", | |
"printable = set(string.printable)\n", | |
"text = filter(lambda x: x in printable, text)\n", | |
"file.close()\n", | |
"# split into words\n", | |
"from nltk.tokenize import word_tokenize\n", | |
"tokens = word_tokenize(text)\n", | |
"# remove all tokens that are not alphabetic\n", | |
"words = [word for word in tokens if word.isalpha()]\n", | |
"print(words[:100])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import enchant\n", | |
"d = enchant.Dict(\"en_US\")\n", | |
"d.check(\"Hello\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import enchant\n", | |
"d = enchant.Dict(\"en_US\")\n", | |
"d.check(\"Hello\")\n", | |
"True\n", | |
">>> d.check(\"Helo\")\n", | |
"False\n", | |
">>> d.suggest(\"Helo\")\n", | |
"['He lo', 'He-lo', 'Hello', 'Helot', 'Help', 'Halo', 'Hell', 'Held', 'Helm', 'Hero', \"He'll\"]\n", | |
">>>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"count = 0\n", | |
"filename = 'www.gutenberg.lib/text/ALLfiles.txt'\n", | |
"file = open(filename, 'rt')\n", | |
"text = file.read()\n", | |
"printable = set(string.printable)\n", | |
"text = filter(lambda x: x in printable, text)\n", | |
"file.close()\n", | |
"# split into words\n", | |
"from nltk.tokenize import word_tokenize\n", | |
"tokens = word_tokenize(text)\n", | |
"# remove all tokens that are not alphabetic\n", | |
"words = [word for word in tokens if word.isalpha()]\n", | |
"for line in words:\n", | |
" count=count+1\n", | |
"print (count)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#!/usr/bin/env python\n", | |
"from __future__ import print_function\n", | |
"import io\n", | |
"import string\n", | |
"filein = \"www.gutenberg.lib/text/ALLfiles.txt\"\n", | |
"fileout =\"www.gutenberg.lib/text/d_parsed.txt\"\n", | |
"IN = open(filein,\"r\").readlines()\n", | |
"OUT = open(fileout,\"a\")\n", | |
"for line in IN:\n", | |
" printable = set(string.printable)\n", | |
" line = filter(lambda x: x in printable, line) \n", | |
" if len(line)>100:\n", | |
" OUT.write(line)\n", | |
"OUT.close()\n", | |
"# IN.close()  # bug: IN is the list returned by readlines(); lists have no close()"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#!/usr/bin/env python\n", | |
"from __future__ import print_function\n", | |
"import io\n", | |
"import string\n", | |
"filein = \"www.gutenberg.lib/text/ALLfiles.txt\"\n", | |
"fileout =\"www.gutenberg.lib/text/d_parsed3.txt\"\n", | |
"IN = open(filein,\"r\").readlines()\n", | |
"OUT = open(fileout,\"a\")\n", | |
"for line in IN:\n", | |
" line = line.encode(\"utf-8\")\n", | |
" printable = set(string.printable)\n", | |
" line = filter(lambda x: x in printable, line)\n", | |
" line = ''.join([i for i in line if not i.isdigit()])\n", | |
" #line = filter(str.isalpha, line)\n", | |
" line = line.lower()\n", | |
" line = line.replace(\"\\n\", \"\")\n", | |
" if len(line)>100:\n", | |
" OUT.write(line+\"\\n\")\n", | |
"OUT.close()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"count = 0\n", | |
"fileout =\"www.gutenberg.lib/text/d_parsed2.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"for line in view:\n", | |
" count = count +1\n", | |
" if count == 1200:\n", | |
" print (line)\n", | |
"print (count) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"count = 0\n", | |
"fileout =\"www.gutenberg.lib/text/d_parsed.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"for line in view:\n", | |
" count = count +1\n", | |
" if count == 1200:\n", | |
" print (line)\n", | |
"print (count) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"count = 0\n", | |
"fileout =\"www.gutenberg.lib/text/d_parsed.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"for line in view:\n", | |
" count = count +1\n", | |
" if count == 100:\n", | |
" print (line)\n", | |
"print (count) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import string\n", | |
"printable = set(string.printable)\n", | |
"filter(lambda x: x in printable, s)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from itertools import tee, islice\n", | |
"from collections import Counter\n", | |
"#out =\"short.txt\"\n", | |
"#view = open(out, \"a\")\n", | |
"\n", | |
"def ngrams(lst, n):\n", | |
" tlst = lst\n", | |
" while True:\n", | |
" a, b = tee(tlst)\n", | |
" l = tuple(islice(a, n))\n", | |
" if len(l) == n:\n", | |
" yield l\n", | |
" next(b)\n", | |
" tlst = b\n", | |
" else:\n", | |
" break\n", | |
"out =\"short-list.txt\"\n", | |
"view = open(out, \"r\").read()\n", | |
"view2 = list(view)\n", | |
"Counter(ngrams(view2, 3))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import random\n", | |
"import bisect\n", | |
"import matplotlib.pyplot as plt\n", | |
"import math\n", | |
"import time\n", | |
"\n", | |
"def method_in(a,b,c):\n", | |
" start_time = time.time()\n", | |
" for i,x in enumerate(a):\n", | |
" if x in b:\n", | |
" c[i] = 1\n", | |
" return(time.time()-start_time) \n", | |
"\n", | |
"def method_set_in(a,b,c):\n", | |
" start_time = time.time()\n", | |
" s = set(b)\n", | |
" for i,x in enumerate(a):\n", | |
" if x in s:\n", | |
" c[i] = 1\n", | |
" return(time.time()-start_time)\n", | |
"\n", | |
"def method_bisect(a,b,c):\n", | |
" start_time = time.time()\n", | |
" b.sort()\n", | |
" for i,x in enumerate(a):\n", | |
" index = bisect.bisect_left(b,x)\n", | |
"        if index < len(b):\n", | |
" if x == b[index]:\n", | |
" c[i] = 1\n", | |
" return(time.time()-start_time)\n", | |
"\n", | |
"def profile():\n", | |
" time_method_in = []\n", | |
" time_method_set_in = []\n", | |
" time_method_bisect = []\n", | |
"\n", | |
" Nls = [x for x in range(1000,20000,1000)]\n", | |
" for N in Nls:\n", | |
" a = [x for x in range(0,N)]\n", | |
" random.shuffle(a)\n", | |
" b = [x for x in range(0,N)]\n", | |
" random.shuffle(b)\n", | |
" c = [0 for x in range(0,N)]\n", | |
"\n", | |
" time_method_in.append(math.log(method_in(a,b,c)))\n", | |
" time_method_set_in.append(math.log(method_set_in(a,b,c)))\n", | |
" time_method_bisect.append(math.log(method_bisect(a,b,c)))\n", | |
"\n", | |
" plt.plot(Nls,time_method_in,marker='o',color='r',linestyle='-',label='in')\n", | |
" plt.plot(Nls,time_method_set_in,marker='o',color='b',linestyle='-',label='set')\n", | |
" plt.plot(Nls,time_method_bisect,marker='o',color='g',linestyle='-',label='bisect')\n", | |
" plt.xlabel('list size', fontsize=18)\n", | |
" plt.ylabel('log(time)', fontsize=18)\n", | |
" plt.legend(loc = 'upper left')\n", | |
" plt.show()\n", | |
" \n", | |
"profile() " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from time import sleep\n", | |
"count = 0\n", | |
"fileout =\"ngram-from Allfiles.txt\"\n", | |
"with open(fileout) as f:\n", | |
" lines = f.read()\n", | |
" line = str(lines)\n", | |
" elements = line.split(\", \")\n", | |
" for line in elements:\n", | |
" print line\n", | |
" count = count +1\n", | |
" sleep(1)\n", | |
"print len(line) \n", | |
"print count \n", | |
"#32102202\n", | |
"# 4910816" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Clean raw text files into word list" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#!/usr/bin/env python\n", | |
"#from __future__ import print_function\n", | |
"import io\n", | |
"import string\n", | |
"import enchant\n", | |
"d = enchant.Dict(\"en_US\")\n", | |
"filein = \"www.gutenberg.lib/text/ALLfiles.txt\"\n", | |
"fileout =\"ngram-from Allfiles.txt\"\n", | |
"whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')\n", | |
"IN = open(filein,\"r\").readlines()\n", | |
"OUT = open(fileout,\"a\")\n", | |
"count = 0\n", | |
"for line in IN:\n", | |
" #line = line.encode('ascii', 'ignore')\n", | |
" printable = set(string.printable)\n", | |
" line = filter(lambda x: x in printable, line)\n", | |
" line = ''.join(filter(whitelist.__contains__, line)) \n", | |
" line = line.replace(\"u'\",\"\")\n", | |
" line = line.replace(\"_\",\" \")\n", | |
" line = line.replace('\"',' ')\n", | |
" line = line.replace(\"-\",\" \")\n", | |
" line = line.replace(\":\",\" \")\n", | |
" line = line.replace(\")\",\"\")\n", | |
" line = line.replace(\"(\",\"\")\n", | |
" line = line.replace(\".\",\"\")\n", | |
" line = line.replace(\";\",\" \")\n", | |
" line = line.replace(\"/\",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" \n", | |
" newline = line.lower()\n", | |
" newline = newline.replace(\"\\n\", \"\")\n", | |
" if len(newline)>100:\n", | |
" newline = newline.split(\" \")\n", | |
" for word in newline:\n", | |
" count = count +1\n", | |
" #if count<100:print word\n", | |
" #if count > 16 and count < 160 and len(word)>1:\n", | |
" if len(word)>0: \n", | |
" if d.check(word) == True:\n", | |
" #if count > 2000:sys.exit()\n", | |
" OUT.write(word+\", \")\n", | |
" count = count +1\n", | |
" #print (count)\n", | |
" #if count %25 == 0:OUT.write(\"\\n\")\n", | |
" \n", | |
"OUT.close()\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Converts list ngram-fromAllfiles.txt\n", | |
"## to three columns: word-one, word-two, frequency-count\n", | |
"Example:\n", | |
"\n", | |
"of the 61792\n", | |
"in the 28150\n", | |
"and the 19274\n", | |
"to the 17937\n", | |
"the lord 9832\n", | |
"on the 8718\n", | |
"for the 8538\n", | |
"with the 7959\n", | |
"by the 7676\n", | |
"from the 6795\n", | |
"is a 6708\n", | |
"to be 6683" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from itertools import tee, islice\n", | |
"from collections import Counter\n", | |
"out =\"ngram-fromAllfilesgrams.txt\"\n", | |
"infile = open(out, \"a\")\n", | |
"from time import sleep\n", | |
"def ngrams(lst, n):\n", | |
" tlst = lst\n", | |
" while True:\n", | |
" a, b = tee(tlst)\n", | |
" l = tuple(islice(a, n))\n", | |
" if len(l) == n:\n", | |
" yield l\n", | |
" next(b)\n", | |
" tlst = b\n", | |
" else:\n", | |
" break\n", | |
"out =\"ngram-fromAllfiles.txt\"\n", | |
"view = open(out, \"r\").read()\n", | |
"view = view.split()\n", | |
"text = Counter(ngrams(view, 2))\n", | |
"text = str(text)\n", | |
"text = text.replace(\",'):\",\" \")\n", | |
"text = text.replace(\",', '\",\" \")\n", | |
"text = text.replace(\" ('\",\" \")\n", | |
"text = text.replace('\"',' ')\n", | |
"text = text.replace(\"})\",\"\")\n", | |
"text = text.replace(\"Counter({('\",\"\")\n", | |
"text = text.replace(\" \",\" \")\n", | |
"text = text.split(\", \")\n", | |
"for line in text:\n", | |
" INPUT = line+\"\\n\"\n", | |
" infile.write(INPUT)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Create ngram.db and the tables" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import sqlite3\n", | |
"conn = sqlite3.connect('ngram.db')\n", | |
"c = conn.cursor()\n", | |
"c.execute(\"CREATE TABLE ngram(src,res,val)\")\n", | |
"conn.commit()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Enters the three column ngram file into an sqlite3.db" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from time import sleep\n", | |
"import sqlite3\n", | |
"conn = sqlite3.connect('ngram.db')\n", | |
"c = conn.cursor()\n", | |
"fileout = \"short-test-ngram5.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"for rows in view:\n", | |
" rows = rows.replace(\"\\n\",\"\")\n", | |
" rowz= str(rows)\n", | |
" rowZ = rowz.split(\" \")\n", | |
" src = rowZ[0]\n", | |
" res = rowZ[1]\n", | |
" val = rowZ[2]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Insert the ngram list into the database"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import sqlite3\n", | |
"conn = sqlite3.connect('ngram.db')\n", | |
"conn.text_factory = lambda x: unicode(x, \"utf-8\", \"ignore\")\n", | |
"c = conn.cursor()\n", | |
"c.execute(\"CREATE TABLE IF NOT EXISTS ngram(src, res, val)\")\n", | |
"from time import sleep\n", | |
"import sqlite3\n", | |
"conn = sqlite3.connect('ngram.db')\n", | |
"c = conn.cursor()\n", | |
"fileout = \"short-test-ngram5.txt\"\n", | |
"view = open(fileout, \"r\").readlines()\n", | |
"for rows in view:\n", | |
" rows = rows.replace(\"\\n\",\"\")\n", | |
" rowz= str(rows)\n", | |
" rowZ = rowz.split(\" \")\n", | |
" src = rowZ[0]\n", | |
" res = rowZ[1]\n", | |
" val = rowZ[2]\n", | |
" c.execute(\"INSERT into ngram(src,res,val) values (?,?,?)\",(src,res,val))\n", | |
"conn.commit()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Test the database\n", | |
"## Select a random match" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Match: time\n", | |
"727891 time kept 3\n", | |
"464017 time using 1\n", | |
"197427 time delays 3\n", | |
"676903 time man 7\n" | |
] | |
} | |
], | |
"source": [ | |
"#%%writefile NgramBD.py\n", | |
"import sqlite3\n", | |
"conn = sqlite3.connect('ngram.db')\n", | |
"c = conn.cursor()\n", | |
"\n", | |
"def ngram(find):\n", | |
" for row in c.execute(\"SELECT ROWID,* from ngram where src == ? order by random() limit 4\", (find,)):\n", | |
" print row[0],row[1],row[2],row[3]\n", | |
"\n", | |
"find = raw_input(\"Match: \")\n", | |
"ngram(find) " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Turn into a module"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%writefile NgramBD.py\n", | |
"import sqlite3\n", | |
"conn = sqlite3.connect('ngram.db')\n", | |
"c = conn.cursor()\n", | |
"def ngram(find):\n", | |
" for row in c.execute(\"SELECT ROWID,* from ngram where src == ? order by random() limit 1\", (find,)):\n", | |
" XX = row[0],row[1],row[2],row[3]\n", | |
" return XX" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"FIND: time\n", | |
"(4456, u'time', u'they', u'119')\n" | |
] | |
} | |
], | |
"source": [ | |
"from NgramBD import ngram\n", | |
"find = raw_input(\"FIND: \")\n", | |
"print ngram(find)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import sqlite3\n", | |
"conn = sqlite3.connect('/home/jack/hubiC/Databases/SNIPPETS.db')\n", | |
"c = conn.cursor()\n", | |
"search = raw_input(\"SEARCH: \")\n", | |
"for row in c.execute(\"select * from snippets where snippets MATCH ?\", (search,)):\n", | |
" print row[0]\n", | |
"conn.close() " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"186 \n", | |
"#!/usr/bin/env python\n", | |
"#from __future__ import print_function\n", | |
"import io\n", | |
"import string\n", | |
"import enchant\n", | |
"d = enchant.Dict(\"en_US\")\n", | |
"filein = \"www.gutenberg.lib/text/ALLfiles.txt\"\n", | |
"fileout =\"ngram-from Allfiles.txt\"\n", | |
"whitelist = set('abcdefghijklmnopqrstuvwxy ABCDEFGHIJKLMNOPQRSTUVWXYZ')\n", | |
"IN = open(filein,\"r\").readlines()\n", | |
"OUT = open(fileout,\"a\")\n", | |
"count = 0\n", | |
"for line in IN:\n", | |
" #line = line.encode('ascii', 'ignore')\n", | |
" printable = set(string.printable)\n", | |
" line = filter(lambda x: x in printable, line)\n", | |
" line = ''.join(filter(whitelist.__contains__, line)) \n", | |
" line = line.replace(\"u'\",\"\")\n", | |
" line = line.replace(\"_\",\" \")\n", | |
" line = line.replace('\"',' ')\n", | |
" line = line.replace(\"-\",\" \")\n", | |
" line = line.replace(\":\",\" \")\n", | |
" line = line.replace(\")\",\"\")\n", | |
" line = line.replace(\"(\",\"\")\n", | |
" line = line.replace(\".\",\"\")\n", | |
" line = line.replace(\";\",\" \")\n", | |
" line = line.replace(\"/\",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" line = line.replace(\" \",\" \")\n", | |
" \n", | |
" newline = line.lower()\n", | |
" newline = newline.replace(\"\n", | |
"\", \"\")\n", | |
" if len(newline)>100:\n", | |
" newline = newline.split(\" \")\n", | |
" for word in newline:\n", | |
" count = count +1\n", | |
" #if count<100:print word\n", | |
" #if count > 16 and count < 160 and len(word)>1:\n", | |
" if len(word)>0: \n", | |
" if d.check(word) == True:\n", | |
" #if count > 2000:sys.exit()\n", | |
" OUT.write(word+\", \")\n", | |
" count = count +1\n", | |
" #print (count)\n", | |
" #if count %25 == 0:OUT.write(\"\n", | |
"\")\n", | |
" \n", | |
"OUT.close()\n", | |
"\n", | |
"\n", | |
"clean text, clean file, prepare data, prepare file, create data, creat datafile\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"import sqlite3\n", | |
"conn = sqlite3.connect('/home/jack/hubiC/Databases/SNIPPETS.db')\n", | |
"conn.text_factory = lambda x: unicode(x, \"utf-8\", \"ignore\")\n", | |
"c = conn.cursor()\n", | |
"text = \"\"\"\n", | |
"\"\"\"\n", | |
"Usage: python remove_output.py notebook.ipynb [ > without_output.ipynb ]\n", | |
"Modified from remove_output by Minrk\n", | |
"\"\"\"\n", | |
"import sys\n", | |
"import io\n", | |
"import os\n", | |
"from IPython.nbformat.current import read, write\n", | |
"\n", | |
"\n", | |
"def remove_outputs(nb):\n", | |
" \"\"remove the outputs from a notebook\"\"\n", | |
" for ws in nb.worksheets:\n", | |
" for cell in ws.cells:\n", | |
" if cell.cell_type == 'code':\n", | |
" cell.outputs = []\n", | |
"\n", | |
"if __name__ == '__main__':\n", | |
" fname = sys.argv[1]\n", | |
" with io.open(fname, 'r') as f:\n", | |
" nb = read(f, 'json')\n", | |
" remove_outputs(nb)\n", | |
" base, ext = os.path.splitext(fname)\n", | |
" new_ipynb = \"%s_removed%s\" % (base, ext)\n", | |
" with io.open(new_ipynb, 'w', encoding='utf8') as f:\n", | |
" write(nb, f, 'json')\n", | |
" print \"wrote %s\" % new_ipynb\n", | |
"\"\"\"\n", | |
"keywords = \"\"\"\n", | |
"ipynb recover, recover notebook, jupter recover, recover jupyter\n", | |
"\"\"\"\n", | |
"c.execute(\"INSERT into snippets values (?,?)\",(text, keywords,))\n", | |
"conn.commit()\n", | |
"\n", | |
"for row in c.execute(\"SELECT rowid,* FROM snippets order by ROWID DESC limit 1\"):\n", | |
" print row[0],row[1],row[2]\n", | |
" \n", | |
"conn.close() " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"184 \n", | |
"import fileinput\n", | |
"import glob\n", | |
"\n", | |
"def Join(PATH, newfile, N):\n", | |
" outfile = open(newfile, 'a')\n", | |
" for line in fileinput.input(glob.glob(PATH+\"*.txt\")):\n", | |
" outfile.write(line)\n", | |
" outfile.write(\"\n", | |
"\")\n", | |
" count = 0\n", | |
" outfile = open(newfile, 'r').readlines()\n", | |
" for line in outfile:\n", | |
" count = count +1\n", | |
" if count<N:\n", | |
" print line\n", | |
" LineCount = \"Line Count\",count \n", | |
" return LineCount \n", | |
"\n", | |
"newfile= 'kuran1.txt'\n", | |
"PATH = 'kuran/'\n", | |
"N = 6\n", | |
"Join(PATH, newfile, N)\n", | |
"--------------------------------------------\n", | |
"copy files, join text files, join files, search files, \n", | |
"search and join search join contanate files join file function\n", | |
"\n", | |
"copy files, join text files, join files, search files, \n", | |
"search and join search join contanate files join file function\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"import sqlite3\n", | |
"conn = sqlite3.connect('/home/jack/hubiC/Databases/SNIPPETS.db')\n", | |
"conn.text_factory = lambda x: unicode(x, \"utf-8\", \"ignore\")\n", | |
"c = conn.cursor()\n", | |
"for row in c.execute(\"SELECT rowid,* FROM snippets order by ROWID DESC limit 1\"):\n", | |
" print row[0],row[1],row[2]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "DefaultPY", | |
"language": "python", | |
"name": "default_py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment