Skip to content

Instantly share code, notes, and snippets.

@d2207197
Last active August 29, 2015 14:09
Show Gist options
  • Save d2207197/b05569bf118c9cad3bae to your computer and use it in GitHub Desktop.
Save d2207197/b05569bf118c9cad3bae to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:3ae96f2363badc84fab25cb948f8a096a628048de7aae0009c1715aff2c7c5a6"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import division\n",
"import nltk, random\n",
"names_with_gender = ([(name.lower(), 'male') for name in nltk.corpus.names.words('male.txt')] \n",
" + [(name.lower(), 'female') for name in nltk.corpus.names.words('female.txt')])\n",
"random.shuffle(names_with_gender)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": true,
"input": [
"def gender_features(word):\n",
" word = word.lower()\n",
" import string\n",
" features = {char: char in word for char in string.lowercase}\n",
" features.update({'count({})'.format(char): word.count(char) for char in string.lowercase})\n",
" features.update({'startswith': word[0], 'endswith': word[-1]})\n",
" return features\n",
"gender_features('joe')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"{'a': False,\n",
" 'b': False,\n",
" 'c': False,\n",
" 'count(a)': 0,\n",
" 'count(b)': 0,\n",
" 'count(c)': 0,\n",
" 'count(d)': 0,\n",
" 'count(e)': 1,\n",
" 'count(f)': 0,\n",
" 'count(g)': 0,\n",
" 'count(h)': 0,\n",
" 'count(i)': 0,\n",
" 'count(j)': 1,\n",
" 'count(k)': 0,\n",
" 'count(l)': 0,\n",
" 'count(m)': 0,\n",
" 'count(n)': 0,\n",
" 'count(o)': 1,\n",
" 'count(p)': 0,\n",
" 'count(q)': 0,\n",
" 'count(r)': 0,\n",
" 'count(s)': 0,\n",
" 'count(t)': 0,\n",
" 'count(u)': 0,\n",
" 'count(v)': 0,\n",
" 'count(w)': 0,\n",
" 'count(x)': 0,\n",
" 'count(y)': 0,\n",
" 'count(z)': 0,\n",
" 'd': False,\n",
" 'e': True,\n",
" 'endswith': 'e',\n",
" 'f': False,\n",
" 'g': False,\n",
" 'h': False,\n",
" 'i': False,\n",
" 'j': True,\n",
" 'k': False,\n",
" 'l': False,\n",
" 'm': False,\n",
" 'n': False,\n",
" 'o': True,\n",
" 'p': False,\n",
" 'q': False,\n",
" 'r': False,\n",
" 's': False,\n",
" 'startswith': 'j',\n",
" 't': False,\n",
" 'u': False,\n",
" 'v': False,\n",
" 'w': False,\n",
" 'x': False,\n",
" 'y': False,\n",
" 'z': False}"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"featuresets = [(gender_features(name), gender) for name, gender in names_with_gender]\n",
"split_point = len(featuresets)*9//10\n",
"train_set, test_set = featuresets[:split_point], featuresets[split_point:]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from nltk.classify import SklearnClassifier \n",
"from sklearn.linear_model import LogisticRegression\n",
"%time sklearn_classifier = SklearnClassifier(LogisticRegression(C=10e5)).train(train_set)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"CPU times: user 1.58 s, sys: 87.1 ms, total: 1.67 s\n",
"Wall time: 1.7 s\n"
]
}
],
"prompt_number": 20
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from nltk.classify import MaxentClassifier\n",
"%time nltk_classifier = MaxentClassifier.train(train_set, nltk.classify.MaxentClassifier.ALGORITHMS[0])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" ==> Training (100 iterations)\n",
"\n",
" Iteration Log Likelihood Accuracy\n",
" ---------------------------------------\n",
" 1 -0.69315 0.367"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 2 -0.60937 0.633"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 3 -0.59447 0.633"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 4 -0.58143 0.634"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 5 -0.56917 0.638"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 6 -0.55763 0.649"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 7 -0.54679 0.669"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 8 -0.53661 0.686"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 9 -0.52704 0.703"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 10 -0.51805 0.714"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 11 -0.50960 0.725"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 12 -0.50165 0.734"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 13 -0.49416 0.743"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 14 -0.48711 0.749"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 15 -0.48046 0.755"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 16 -0.47418 0.759"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 17 -0.46824 0.764"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 18 -0.46263 0.767"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 19 -0.45731 0.770"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 20 -0.45227 0.774"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 21 -0.44749 0.776"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 22 -0.44295 0.776"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 23 -0.43864 0.777"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 24 -0.43453 0.780"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 25 -0.43062 0.782"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 26 -0.42689 0.785"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 27 -0.42334 0.787"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 28 -0.41994 0.788"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 29 -0.41670 0.789"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 30 -0.41360 0.790"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 31 -0.41063 0.791"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 32 -0.40779 0.791"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 33 -0.40506 0.792"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 34 -0.40245 0.793"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 35 -0.39995 0.793"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 36 -0.39754 0.793"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 37 -0.39523 0.794"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 38 -0.39301 0.795"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 39 -0.39087 0.795"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 40 -0.38881 0.795"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 41 -0.38683 0.796"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 42 -0.38492 0.796"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 43 -0.38308 0.796"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 44 -0.38131 0.797"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 45 -0.37960 0.798"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 46 -0.37795 0.799"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 47 -0.37635 0.799"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 48 -0.37481 0.799"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 49 -0.37332 0.800"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 50 -0.37188 0.799"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 51 -0.37048 0.799"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 52 -0.36914 0.800"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 53 -0.36783 0.800"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 54 -0.36656 0.801"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 55 -0.36534 0.801"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 56 -0.36415 0.802"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 57 -0.36300 0.802"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 58 -0.36188 0.802"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 59 -0.36080 0.802"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 60 -0.35975 0.802"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 61 -0.35872 0.803"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 62 -0.35773 0.803"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 63 -0.35677 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 64 -0.35583 0.803"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 65 -0.35492 0.803"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 66 -0.35404 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 67 -0.35317 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 68 -0.35234 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 69 -0.35152 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 70 -0.35073 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 71 -0.34996 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 72 -0.34920 0.803"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 73 -0.34847 0.803"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 74 -0.34776 0.803"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 75 -0.34706 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 76 -0.34638 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 77 -0.34572 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 78 -0.34507 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 79 -0.34444 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 80 -0.34383 0.805"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 81 -0.34323 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 82 -0.34264 0.805"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 83 -0.34207 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 84 -0.34151 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 85 -0.34097 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 86 -0.34044 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 87 -0.33992 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 88 -0.33941 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 89 -0.33891 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 90 -0.33842 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 91 -0.33795 0.805"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 92 -0.33748 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 93 -0.33703 0.804"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 94 -0.33658 0.805"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 95 -0.33615 0.805"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 96 -0.33572 0.805"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 97 -0.33530 0.805"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 98 -0.33490 0.805"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" 99 -0.33450 0.805"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" Final -0.33410 0.805"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"CPU times: user 9min 6s, sys: 3.78 s, total: 9min 10s\n",
"Wall time: 9min 31s\n"
]
}
],
"prompt_number": 21
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print '== SkLearn MaxEnt =='\n",
"print sklearn_classifier.classify(gender_features('mark'))\n",
"print nltk.classify.accuracy(sklearn_classifier, test_set)\n",
"print \n",
"print '== NLTK MaxEnt =='\n",
"print nltk_classifier.classify(gender_features('mark'))\n",
"print nltk.classify.accuracy(nltk_classifier, test_set)\n",
"print nltk_classifier.show_most_informative_features(10)\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"== SkLearn MaxEnt ==\n",
"male\n",
"0.796226415094"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"\n",
"== NLTK MaxEnt ==\n",
"male\n",
"0.782389937107"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" -4.141 endswith==u'a' and label is 'male'\n",
" -3.658 endswith==u'k' and label is 'female'\n",
" -2.731 count(v)==2 and label is 'male'\n",
" 2.245 count(j)==2 and label is 'female'\n",
" -2.132 endswith==u'f' and label is 'female'\n",
" -1.903 endswith==u'v' and label is 'female'\n",
" -1.718 endswith==u'm' and label is 'female'\n",
" 1.671 count(h)==3 and label is 'female'\n",
" -1.573 endswith==u'p' and label is 'female'\n",
" -1.501 endswith==u'd' and label is 'female'\n",
"None\n"
]
}
],
"prompt_number": 24
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!gist logistic\\ regression.ipynb --update b05569bf118c9cad3bae"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"https://gist.github.com/b05569bf118c9cad3bae\r\n"
]
}
],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment