erichannell · March 13, 2015 16:26
diff --git a/notebook b/notebook
 {"nbformat_minor": 0, "cells": [{"execution_count": null, "cell_type": "code", "source": "print \"Hello, world!\" #ohai! this is a comment", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "name = \"Eric\" # creating a variable\nprint \"Hello\" + \" \" + name # adding strings", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "your_name = raw_input(\"Who are you? \")\nprint \"Hello\", your_name # , adds spaces", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "print your_name.lower() # string method\nprint your_name.upper()", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "i = 0\nprint i\ni = i + 1 # i += 1\nprint i", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "for i in range(10):\n    print i,", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "x = [1,2,3,4,5] # a list\nprint x\nprint len(x) # length of list", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "# lists start at the \"zeroeth element\"\nprint x[0] # first element\nprint x[-1] # last element\nprint x[1:3]", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "names = [\"Jim\", \"Jane\", \"Alice\"]\nfor name in names:\n    print name, len(name)", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "tweet = \"The spreadsheet is a tool, and it is also a world view - reality by the numbers.\"\ntweet_words = tweet.split()\nprint 
tweet_words", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "for word in tweet_words:\n    print word", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "with open(\"text.txt\") as inf: # \"text.txt\" is a text file on my computer & inf becomes a reference to it\n    text_in_file = inf.read()\n    print text_in_file", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "# function\ndef standardize_text(text):\n    text = text.replace('.', '') # remove periods\n    text = text.replace(',', '') # remove commas\n    text = text.replace('\"', '') # remove quotes\n    text = text.lower() # convert all characters to lower case\n    text = text.split() # convert string to a list of words\n    return text\n\nclean_text = standardize_text(text_in_file)\n\nfor i, word in enumerate(clean_text):\n    print i, word", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "from collections import Counter # module that counts things\ntop_5 = Counter(clean_text).most_common(5)\nprint \"top 5:\", top_5 # this is a list of tuples\nfor word, count in top_5:\n    print count, word", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "# let's put all this together:\n\nfrom collections import Counter # module that counts things\n\ndef standardize_text(text):\n    text = text.replace('.', '') # remove periods\n    text = text.replace(',', '') # remove commas\n    text = text.replace('\"', '') # remove quotes\n    text = text.lower() # convert all characters to lower case\n    text = text.split() # convert string to a list of words\n    return text\n\ndef get_text(text_file):\n    with open(text_file) as inf:\n        
text_in_file = inf.read()\n        clean_text = standardize_text(text_in_file)\n        return clean_text\n\ndef get_top_N(text, top_N):\n    return Counter(text).most_common(top_N)\n\ntext_file = \"quixote.txt\" # we also have ulysses, iliad and quixote\ntext = get_text(text_file)\nprint \"There are\", len(text), \"words in\", text_file\ntop_N = 10\nprint \"The top\", top_N, \"words are:\"\nfor word, count in get_top_N(text, top_N):\n    print count, word", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "# how can we save this data?\n\nimport csv\n\nwith open('test.csv', 'wb') as outf:\n    writer = csv.writer(outf)\n    for i in range(10):\n        writer.writerow([i])", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "# let's make a function to do that\n\ndef save_data_to_csv(data):\n    with open('testing_words.csv', 'wb') as outf:\n        writer = csv.writer(outf)\n        writer.writerow(['word','count','book'])\n        for row in data:\n            print row\n            writer.writerow(row)\n\n# let's test the function\nsave_data_to_csv([['bla', 7, 'bla text'], ['bleh', 4, 'bleh text']])\nprint \"write the test text\"", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "def standardize_text(text):\n    text = text.replace('.', '') # remove periods\n    text = text.replace(',', '') # remove commas\n    text = text.replace('\"', '') # remove quotes\n    text = text.lower() # convert all characters to lower case\n    text = text.split() # convert string to a list of words\n    return text\n\ndef get_text(text_file):\n    with open(text_file) as inf:\n        text_in_file = inf.read()\n        clean_text = standardize_text(text_in_file)\n        return clean_text\n\ndef get_top_N(text, top_N):\n    return 
Counter(text).most_common(top_N)\n\ndef read_books(books):\n    top_N = 100\n    result_data = []\n    for book in books:\n        text = get_text(book)\n        top_words = get_top_N(text, top_N)\n        for word, count in top_words:\n            result_data.append([word, count, book])\n    return result_data\n\ndef save_data_to_csv(data):\n    with open('words.csv', 'wb') as outf:\n        writer = csv.writer(outf)\n        writer.writerow(['word','count','book'])\n        for row in data:\n            writer.writerow(row)\n\nbooks = [\"ulysses.txt\", \"iliad.txt\", \"quixote.txt\"]\ndata = read_books(books)\nsave_data_to_csv(data)\nprint \"done!\"", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}], "nbformat": 4, "metadata": {"kernelspec": {"display_name": "Python 2", "name": "python2", "language": "python"}, "language_info": {"mimetype": "text/x-python", "nbconvert_exporter": "python", "version": "2.7.7", "name": "python", "file_extension": ".py", "pygments_lexer": "ipython2", "codemirror_mode": {"version": 2, "name": "ipython"}}}}