Created
March 13, 2015 16:26
-
-
Save erichannell/9bebba35694176332ad9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"nbformat_minor": 0, "cells": [{"execution_count": null, "cell_type": "code", "source": "print \"Hello, world!\" #ohai! this is a comment", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "name = \"Eric\" # creating a variable\nprint \"Hello\" + \" \" + name # adding strings", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "your_name = raw_input(\"Who are you? \")\nprint \"Hello\", your_name # , adds spaces", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "print your_name.lower() # string method\nprint your_name.upper()", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "i = 0\nprint i\ni = i + 1 # i += 1\nprint i", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "for i in range(10):\n print i,", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "x = [1,2,3,4,5] # a list\nprint x\nprint len(x) # length of list", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "# lists start at the \"zeroeth element\"\nprint x[0] # first element\nprint x[-1] # last element\nprint x[1:3]", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "names = [\"Jim\", \"Jane\", \"Alice\"]\nfor name in names:\n print name, len(name)", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "tweet = \"The spreadsheet is a tool, and it is also a world view - reality by the numbers.\"\ntweet_words = tweet.split()\nprint tweet_words", 
"outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "for word in tweet_words:\n print word", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "with open(\"text.txt\") as inf: # \"text.txt\" is a text file on my computer & inf becomes a reference to it\n text_in_file = inf.read()\n print text_in_file", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "# function\ndef standardize_text(text):\n text = text.replace('.', '') # remove periods\n text = text.replace(',', '') # remove commas\n text = text.replace('\"', '') # remove quotes\n text = text.lower() # convert all characters to lower case\n text = text.split() # convert string to a list of words\n return text\n\nclean_text = standardize_text(text_in_file)\n\nfor i, word in enumerate(clean_text):\n print i, word", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "from collections import Counter # module that counts things\ntop_5 = Counter(clean_text).most_common(5)\nprint \"top 5:\", top_5 # this is a list of tuples\nfor word, count in top_5:\n print count, word", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "# let's put all this together:\n\nfrom collections import Counter # module that counts things\n\ndef standardize_text(text):\n text = text.replace('.', '') # remove periods\n text = text.replace(',', '') # remove commas\n text = text.replace('\"', '') # remove quotes\n text = text.lower() # convert all characters to lower case\n text = text.split() # convert string to a list of words\n return text\n\ndef get_text(text_file):\n with open(text_file) as inf:\n text_in_file = inf.read()\n clean_text = standardize_text(text_in_file)\n return 
clean_text\n\ndef get_top_N(text, top_N):\n return Counter(text).most_common(top_N)\n\ntext_file = \"quixote.txt\" # we also have ulysses, iliad and quixote\ntext = get_text(text_file)\nprint \"There are\", len(text), \"words in\", text_file\ntop_N = 10\nprint \"The top\", top_N, \"words are:\"\nfor word, count in get_top_N(text, top_N):\n print count, word", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "# how can we save this data?\n\nimport csv\n\nwith open('test.csv', 'wb') as outf:\n writer = csv.writer(outf)\n for i in range(10):\n writer.writerow([i])", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "# let's make a function to do that\n\ndef save_data_to_csv(data):\n with open('testing_words.csv', 'wb') as outf:\n writer = csv.writer(outf)\n writer.writerow(['word','count','book'])\n for row in data:\n print row\n writer.writerow(row)\n\n# let's test the function\nsave_data_to_csv([['bla', 7, 'bla text'], ['bleh', 4, 'bleh text']])\nprint \"write the test text\"", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": null, "cell_type": "code", "source": "def standardize_text(text):\n text = text.replace('.', '') # remove periods\n text = text.replace(',', '') # remove commas\n text = text.replace('\"', '') # remove quotes\n text = text.lower() # convert all characters to lower case\n text = text.split() # convert string to a list of words\n return text\n\ndef get_text(text_file):\n with open(text_file) as inf:\n text_in_file = inf.read()\n clean_text = standardize_text(text_in_file)\n return clean_text\n\ndef get_top_N(text, top_N):\n return Counter(text).most_common(top_N)\n\ndef read_books(books):\n top_N = 100\n result_data = []\n for book in books:\n text = get_text(book)\n top_words = get_top_N(text, top_N)\n for word, count in top_words:\n result_data.append([word, 
count, book])\n return result_data\n\ndef save_data_to_csv(data):\n with open('words.csv', 'wb') as outf:\n writer = csv.writer(outf)\n writer.writerow(['word','count','book'])\n for row in data:\n writer.writerow(row)\n\nbooks = [\"ulysses.txt\", \"iliad.txt\", \"quixote.txt\"]\ndata = read_books(books)\nsave_data_to_csv(data)\nprint \"done!\"", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}], "nbformat": 4, "metadata": {"kernelspec": {"display_name": "Python 2", "name": "python2", "language": "python"}, "language_info": {"mimetype": "text/x-python", "nbconvert_exporter": "python", "version": "2.7.7", "name": "python", "file_extension": ".py", "pygments_lexer": "ipython2", "codemirror_mode": {"version": 2, "name": "ipython"}}}} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment