Skip to content

Instantly share code, notes, and snippets.

@bicubic
Created April 11, 2013 12:17
Show Gist options
  • Select an option

  • Save bicubic/5362927 to your computer and use it in GitHub Desktop.

Select an option

Save bicubic/5362927 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": "500px scrape update"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": "import pandas\nimport numpy as np\nimport matplotlib\nimport matplotlib.pyplot as plt\nimport time\nfrom datetime import date",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Load the pickled scrape dump, keep one row per photo id and index the frame by id.\n# NOTE: DataFrame.load unpickles the file, so only point it at dumps you trust.\ndf = pandas.DataFrame.load(\"dump3\").drop_duplicates(\"id\")\ndf.index = df.id\ndf = df.dropna()\n\n# Disabled experiment, kept for reference: shift the upload times by -8 hours.\n#shift = datetime.timedelta(hours = -8)\n#df.uploaded = df.uploaded.map(lambda x: x + shift)\n\n# Upload hour of day: the integer hour, and the fractional hour including minutes.\ndf['hour'] = df.uploaded.map(lambda ts: ts.hour)\ndf['hourf'] = df.uploaded.map(lambda ts: ts.hour + ts.minute/60.0)\n\n# Keep only photos with fewer than 200 views; the cell displays the remaining row count.\ndf = df[df.views < 200]\nlen(df)",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 2,
"text": "51556"
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Peek at the first five rows of the cleaned frame.\ndf.head(5)",
"language": "python",
"metadata": {},
"outputs": [
{
"html": "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>author</th>\n <th>category</th>\n <th>favourites</th>\n <th>id</th>\n <th>pulse</th>\n <th>tags</th>\n <th>uploaded</th>\n <th>views</th>\n <th>votes</th>\n <th>hour</th>\n <th>hourf</th>\n </tr>\n <tr>\n <th>id</th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>13051629</th>\n <td> /dlbooth3232</td>\n <td> Family</td>\n <td> 0</td>\n <td> 13051629</td>\n <td> 0.0</td>\n <td> []</td>\n <td> 2012-09-03 01:02:00+00:00</td>\n <td> 37</td>\n <td> 0</td>\n <td> 1</td>\n <td> 1.033333</td>\n </tr>\n <tr>\n <th>27962561</th>\n <td> /i_nemer</td>\n <td> Nude</td>\n <td> 7</td>\n <td> 27962561</td>\n <td> 65.4</td>\n <td> []</td>\n <td> 2013-03-11 03:12:00+00:00</td>\n <td> 81</td>\n <td> 12</td>\n <td> 3</td>\n <td> 3.200000</td>\n </tr>\n <tr>\n <th>22766425</th>\n <td> /silecer</td>\n <td> Uncategorized</td>\n <td> 0</td>\n <td> 22766425</td>\n <td> 0.0</td>\n <td> []</td>\n <td> 2013-01-10 00:23:00+00:00</td>\n <td> 8</td>\n <td> 0</td>\n <td> 0</td>\n <td> 0.383333</td>\n </tr>\n <tr>\n <th>22690995</th>\n <td> /soulstodeny</td>\n <td> People</td>\n <td> 1</td>\n <td> 22690995</td>\n <td> 42.5</td>\n <td> [Old man, chinese, hong kong, smoker]</td>\n <td> 2013-01-09 01:18:00+00:00</td>\n <td> 13</td>\n <td> 1</td>\n <td> 1</td>\n <td> 1.300000</td>\n </tr>\n <tr>\n <th>18580023</th>\n <td> /AllanCaron</td>\n <td> Nature</td>\n <td> 0</td>\n <td> 18580023</td>\n <td> 0.0</td>\n <td> []</td>\n <td> 2012-11-19 21:54:00+00:00</td>\n <td> 15</td>\n <td> 0</td>\n <td> 21</td>\n <td> 21.900000</td>\n </tr>\n </tbody>\n</table>\n</div>",
"output_type": "pyout",
"prompt_number": 3,
"text": " author category favourites id pulse \\\nid \n13051629 /dlbooth3232 Family 0 13051629 0.0 \n27962561 /i_nemer Nude 7 27962561 65.4 \n22766425 /silecer Uncategorized 0 22766425 0.0 \n22690995 /soulstodeny People 1 22690995 42.5 \n18580023 /AllanCaron Nature 0 18580023 0.0 \n\n tags uploaded views \\\nid \n13051629 [] 2012-09-03 01:02:00+00:00 37 \n27962561 [] 2013-03-11 03:12:00+00:00 81 \n22766425 [] 2013-01-10 00:23:00+00:00 8 \n22690995 [Old man, chinese, hong kong, smoker] 2013-01-09 01:18:00+00:00 13 \n18580023 [] 2012-11-19 21:54:00+00:00 15 \n\n votes hour hourf \nid \n13051629 0 1 1.033333 \n27962561 12 3 3.200000 \n22766425 0 0 0.383333 \n22690995 1 1 1.300000 \n18580023 0 21 21.900000 "
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Photo ids taken from the frame's id column.\nids = df['id']",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": "from IPython.parallel import Client\n\n# c is the cluster client, d a DirectView over all of its engines.\nc = Client()\nd = c[:]\n\n# Number of engines the client knows about.\nlen(c)",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 36,
"text": "15"
}
],
"prompt_number": 36
},
{
"cell_type": "code",
"collapsed": false,
"input": "with d.sync_imports():\n import pandas\n import numpy as np\n import matplotlib\n import matplotlib.pyplot as plt\n import time\n from datetime import date\n from IPython.parallel.util import interactive\n import urllib2\n from BeautifulSoup import BeautifulSoup\n import libxml2 as lxml\n import datetime\n import dateutil.parser\n import re\n import pandas\n import scipy\n import matplotlib\n import dateutil\n import socket\n import json",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "importing pandas on engine(s)\nimporting numpy on engine(s)\nimporting matplotlib on engine(s)"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\nimporting matplotlib.pyplot on engine(s)\nimporting time on engine(s)\nimporting date from datetime on engine(s)"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\nimporting interactive from IPython.parallel.util on engine(s)\nimporting urllib2 on engine(s)\nimporting BeautifulSoup from BeautifulSoup on engine(s)\nimporting libxml2 on engine(s)"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\nimporting datetime on engine(s)\nimporting dateutil.parser on engine(s)\nimporting re on engine(s)\nimporting scipy on engine(s)"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\nimporting dateutil on engine(s)\nimporting socket on engine(s)\nimporting json on engine(s)\n"
}
],
"prompt_number": 37
},
{
"cell_type": "code",
"collapsed": false,
"input": "@d.parallel(block=True)\ndef getPhoto(ix):\n    \"\"\"Scrape the 500px page of one photo and return its stats.\n\n    ix is a single photo id. Run it over many ids with getPhoto.map(ids);\n    a direct getPhoto(ids) call hands each engine a whole slice of ids,\n    which does not form a valid photo URL.\n\n    Returns a dict with the keys id, pulse, views, votes, favourites,\n    uploaded, tags, author and category, or None when the page could not\n    be fetched or parsed.\n    \"\"\"\n    try:\n        # Per-request timeout; leaves the process-wide socket default untouched.\n        page = urllib2.urlopen('http://500px.com/photo/' + str(ix), timeout=5)\n        try:\n            soup = BeautifulSoup(page.read())\n        finally:\n            page.close()\n\n        # Look up the optional page sections once and reuse the result.\n        tag_divs = soup('div', {'id' : 'tags'})\n        category_items = soup('li', {'class' : 'category'})\n\n        photo = {}\n        photo['id'] = int(ix)\n        photo['pulse'] = float(soup('h1', {'title' : 'Pulse'})[0].text)\n        photo['views'] = int(soup('li', {'class' : 'views'})[0]('strong')[0].text)\n        photo['votes'] = int(soup('li', {'class' : 'votes'})[0]('strong')[0].text)\n        photo['favourites'] = int(soup('li', {'class' : 'favs'})[0]('strong')[0].text)\n        photo['uploaded'] = dateutil.parser.parse(soup('li', {'class' : 'created-at'})[0]('abbr')[0]['title'])\n\n        # A tag is whatever follows the first '=' in each tag link.\n        photo['tags'] = []\n        if len(tag_divs) > 0:\n            for a in tag_divs[0].findAll('a', href=True):\n                href = a['href']\n                photo['tags'].append(href[href.find('=') + 1:])\n\n        photo['author'] = soup('div', {'class' : 'avatar'})[0]('a')[0]['href']\n        # Photos without a category element keep the [] placeholder.\n        photo['category'] = [] if len(category_items) == 0 else category_items[0]('a')[0].text\n    except Exception:\n        # Best effort: a failed download or unexpected markup yields None.\n        # Unlike a bare except, KeyboardInterrupt and SystemExit still propagate.\n        return None\n    return photo",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 41
},
{
"cell_type": "code",
"collapsed": false,
"input": "# First 15 ids as a plain numpy array for a trial run\n# (15 presumably matches the number of engines - confirm).\nt = ids[:15].values\nt",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 42,
"text": "array([13051629, 27962561, 22766425, 22690995, 18580023, 23164823,\n 17088731, 5523252, 22551061, 15134417, 19424391, 19049713,\n 25842297, 28749255, 23263023], dtype=int64)"
}
],
"prompt_number": 42
},
{
"cell_type": "code",
"collapsed": false,
"input": "# getPhoto is a parallel function that lives on the client: a lambda shipped to the\n# engines cannot see that name (NameError). Its own map() applies it to each id in turn.\nparallel_result = getPhoto.map(t)",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 43
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Calling getPhoto(t) directly gives every engine a slice of t instead of a single id,\n# so every lookup failed and came back as None; map() feeds the ids one at a time.\ngetPhoto.map(t)",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 44
},
{
"cell_type": "code",
"collapsed": false,
"input": "",
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment