Created
May 8, 2013 00:56
-
-
Save eddieberklee/5537414 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "metadata": { | |
| "name": "Day_21_EdwardLee" | |
| }, | |
| "nbformat": 3, | |
| "nbformat_minor": 0, | |
| "worksheets": [ | |
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "KEY = 'AKIAJH2FD7572FCTVSSQ'\nSECRET = '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'", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 4 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "# List a single .arc.gz object of Common Crawl segment 1341690169105 with the s3cmd command-line tool.\n!s3cmd ls s3://aws-publicdatasets/common-crawl/parse-output/segment/1341690169105/1341826131693_45.arc.gz", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": "/bin/sh: 1: s3cmd: not found\r\n" | |
| } | |
| ], | |
| "prompt_number": 12 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "import boto\nfrom boto.s3.connection import S3Connection\n\n# this key, secret access to aws-publicdatasets only -- createdd for WwOD 13 student usage\nKEY = 'AKIAJH2FD7572FCTVSSQ'\nSECRET = '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'\n\nfrom itertools import islice\nfrom pandas import DataFrame\n\nconn= S3Connection(KEY, SECRET)\nbucket = conn.get_bucket('aws-publicdatasets')\n\n# you might find this conversion function between DataFrame and a list of a regular dict useful\n#https://gist.github.com/mikedewar/1486027#comment-804797\ndef df_to_dictlist(df):\n return [{k:df.values[i][v] for v,k in enumerate(df.columns)} for i in range(len(df))]\n\ndef cc_file_type(path):\n\n fname = path.split(\"/\")[-1]\n \n if fname[-7:] == '.arc.gz':\n return 'arc.gz'\n elif fname[:9] == 'textData-':\n return 'textData'\n elif fname[:9] == 'metadata-':\n return 'metadata'\n elif fname == '_SUCCESS':\n return 'success'\n else:\n return 'other'\n \n# a first pass, using DataFrame. Might not be so efficient considering we are returning only totals\ndef segment_stats(seg_id, stop=None):\n all_files = islice(bucket.list(prefix=\"common-crawl/parse-output/segment/{0}/\".format(seg_id), delimiter=\"/\"),stop)\n df = DataFrame([{'size': f.size if hasattr(f, 'size') else 0, 'name':f.name, 'type':cc_file_type(f.name)} for f in all_files])\n return {'count': df_to_dictlist(df[['size','type']].groupby('type').count()[['size']].T)[0],\n 'size': df_to_dictlist(df[['size', 'type']].groupby('type').sum().astype('int64').T)[0]}", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 7 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "# get valid_segments: download common-crawl/parse-output/valid_segments.txt and split it into a list\n# https://commoncrawl.atlassian.net/wiki/display/CRWL/About+the+Data+Set\n\nimport boto\nfrom boto.s3.connection import S3Connection\n\n# KEY and SECRET must already have been defined by an earlier cell\nconn = S3Connection(KEY, SECRET)\nbucket = conn.get_bucket('aws-publicdatasets')\n\nk = bucket.get_key(\"common-crawl/parse-output/valid_segments.txt\")\ns = k.get_contents_as_string()\n\n# one entry per line; filter(None, ...) drops empty strings such as the one left by a trailing newline\nvalid_segments = filter(None, s.split(\"\\n\"))\n\n# show how many segments were listed, and the first one\nprint len(valid_segments), valid_segments[0]", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": "177 1346823845675\n" | |
| } | |
| ], | |
| "prompt_number": 9 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "# Time a full (stop=None, i.e. no limit on listed entries) per-type summary of the first valid segment.\n%time segment_stats(valid_segments[0], None)", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": "CPU times: user 2.34 s, sys: 0.09 s, total: 2.43 s\nWall time: 11.95 s\n" | |
| }, | |
| { | |
| "output_type": "pyout", | |
| "prompt_number": 10, | |
| "text": "{'count': {'arc.gz': 11904, 'metadata': 4377, 'success': 1, 'textData': 4377},\n 'size': {'arc.gz': 967409519222,\n 'metadata': 187079951008,\n 'success': 0,\n 'textData': 129994977292}}" | |
| } | |
| ], | |
| "prompt_number": 10 | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": "", | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| } | |
| ], | |
| "metadata": {} | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment