Skip to content

Instantly share code, notes, and snippets.

@eddieberklee
Created May 8, 2013 00:56
Show Gist options
  • Save eddieberklee/5537414 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": "Day_21_EdwardLee"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": "KEY = 'AKIAJH2FD7572FCTVSSQ'\nSECRET = '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'",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": "!s3cmd ls s3://aws-publicdatasets/common-crawl/parse-output/segment/1341690169105/1341826131693_45.arc.gz",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "/bin/sh: 1: s3cmd: not found\r\n"
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": "import boto\nfrom boto.s3.connection import S3Connection\n\n# this key, secret access to aws-publicdatasets only -- created for WwOD 13 student usage\nKEY = 'AKIAJH2FD7572FCTVSSQ'\nSECRET = '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'\n\nfrom itertools import islice\nfrom pandas import DataFrame\n\nconn= S3Connection(KEY, SECRET)\nbucket = conn.get_bucket('aws-publicdatasets')\n\n# you might find this conversion function between DataFrame and a list of a regular dict useful\n#https://gist.github.com/mikedewar/1486027#comment-804797\ndef df_to_dictlist(df):\n return [{k:df.values[i][v] for v,k in enumerate(df.columns)} for i in range(len(df))]\n\ndef cc_file_type(path):\n\n fname = path.split(\"/\")[-1]\n \n if fname[-7:] == '.arc.gz':\n return 'arc.gz'\n elif fname[:9] == 'textData-':\n return 'textData'\n elif fname[:9] == 'metadata-':\n return 'metadata'\n elif fname == '_SUCCESS':\n return 'success'\n else:\n return 'other'\n \n# a first pass, using DataFrame. Might not be so efficient considering we are returning only totals\ndef segment_stats(seg_id, stop=None):\n all_files = islice(bucket.list(prefix=\"common-crawl/parse-output/segment/{0}/\".format(seg_id), delimiter=\"/\"),stop)\n df = DataFrame([{'size': f.size if hasattr(f, 'size') else 0, 'name':f.name, 'type':cc_file_type(f.name)} for f in all_files])\n return {'count': df_to_dictlist(df[['size','type']].groupby('type').count()[['size']].T)[0],\n 'size': df_to_dictlist(df[['size', 'type']].groupby('type').sum().astype('int64').T)[0]}",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": "# get valid_segments\n# https://commoncrawl.atlassian.net/wiki/display/CRWL/About+the+Data+Set\n\nimport boto\nfrom boto.s3.connection import S3Connection\n\nconn = S3Connection(KEY, SECRET)\nbucket = conn.get_bucket('aws-publicdatasets')\n\nk = bucket.get_key(\"common-crawl/parse-output/valid_segments.txt\")\ns = k.get_contents_as_string()\n\nvalid_segments = filter(None, s.split(\"\\n\"))\n\nprint len(valid_segments), valid_segments[0]",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "177 1346823845675\n"
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": "%time segment_stats(valid_segments[0], None)",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "CPU times: user 2.34 s, sys: 0.09 s, total: 2.43 s\nWall time: 11.95 s\n"
},
{
"output_type": "pyout",
"prompt_number": 10,
"text": "{'count': {'arc.gz': 11904, 'metadata': 4377, 'success': 1, 'textData': 4377},\n 'size': {'arc.gz': 967409519222,\n 'metadata': 187079951008,\n 'success': 0,\n 'textData': 129994977292}}"
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": "",
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment