Created
May 17, 2019 01:39
-
-
Save igorbrigadir/648c76ed21b0d7266ee36504518fffd3 to your computer and use it in GitHub Desktop.
Where's My Elephant?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 132, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import os\n", | |
| "import requests\n", | |
| "import pandas as pd\n", | |
| "from bs4 import BeautifulSoup" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 133, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def cache_url(url, fname, overwrite=False):\n", | |
| "    \"\"\"Download `url` into the local file `fname`, reusing an existing copy.\n", | |
| "\n", | |
| "    Args:\n", | |
| "        url: Address fetched with `requests.get`.\n", | |
| "        fname: Destination path; missing parent directories are created.\n", | |
| "        overwrite: If False (default) an existing `fname` is kept and no\n", | |
| "            request is made.\n", | |
| "\n", | |
| "    Returns:\n", | |
| "        True when `fname` is already cached or was written by this call,\n", | |
| "        False when the server returned an error status and nothing was saved.\n", | |
| "    \"\"\"\n", | |
| "    if os.path.isfile(fname) and not overwrite:\n", | |
| "        return True\n", | |
| "    response = requests.get(url)\n", | |
| "    if not response.ok:\n", | |
| "        # Do not store an error body (e.g. a 404 page) under the target name.\n", | |
| "        return False\n", | |
| "    parent = os.path.dirname(fname)\n", | |
| "    if parent:\n", | |
| "        # open() does not create intermediate directories.\n", | |
| "        os.makedirs(parent, exist_ok=True)\n", | |
| "    with open(fname, 'wb') as f:\n", | |
| "        f.write(response.content)\n", | |
| "    return True" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 134, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Open the main page and parse it; later cells read its <map> element.\n", | |
| "base_url = \"https://www.uliwestphal.de/elephas-anthropogenus/\"\n", | |
| "r = requests.get(base_url)\n", | |
| "# Stop here on an HTTP error instead of parsing an error page.\n", | |
| "r.raise_for_status()\n", | |
| "page = BeautifulSoup(r.content, \"lxml\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 163, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Maps each elephant page URL to the URL of its line-drawing image.\n", | |
| "# NOTE(review): the notebook used this dict without ever creating it;\n", | |
| "# presumably an earlier revision filled it inside this loop - confirm.\n", | |
| "urls_image_map = {}\n", | |
| "\n", | |
| "# Visit every <area> of the image map that carries a link\n", | |
| "for area_tag in page.map.find_all(\"area\", href=True):\n", | |
| "    href = area_tag['href']\n", | |
| "\n", | |
| "    # Expected href shape: \"<folder>/<name>.html\"; the matching image is\n", | |
| "    # \"<folder>/jpgs/<name>.jpg\".\n", | |
| "    parts = href.split('/')\n", | |
| "    image_part_url = parts[0] + \"/jpgs/\" + parts[1].replace('html','jpg')\n", | |
| "\n", | |
| "    url = \"{}{}\".format(base_url, href)\n", | |
| "    fname = \"elephants_data/{}\".format(href)\n", | |
| "    image_url = \"{}{}\".format(base_url, image_part_url)\n", | |
| "    image_fname = \"elephants_data/{}\".format(image_part_url)\n", | |
| "\n", | |
| "    # Page\n", | |
| "    cache_url(url, fname)\n", | |
| "    # Image:\n", | |
| "    cache_url(image_url, image_fname)\n", | |
| "    urls_image_map[url] = image_url\n", | |
| "\n", | |
| "# Manually download ~1450b.html ... it's not in the map?\n", | |
| "extra_url = base_url + \"soloelephants/~1450b.html\"\n", | |
| "extra_image_url = base_url + \"soloelephants/jpgs/~1450b.jpg\"\n", | |
| "cache_url(extra_url, \"elephants_data/soloelephants/~1450b.html\")\n", | |
| "cache_url(extra_image_url, \"elephants_data/soloelephants/jpgs/~1450b.jpg\")\n", | |
| "urls_image_map[extra_url] = extra_image_url" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 164, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# There were a few images missing on the original site: 1025-1050, 1444, 1470-1482, 1508" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 165, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "163" | |
| ] | |
| }, | |
| "execution_count": 165, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Number of page URL -> image URL pairs held in urls_image_map\n", | |
| "len(urls_image_map)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 158, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Browser-like User-Agent passed along with each request made below.\n", | |
| "headers = {\n", | |
| "    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',\n", | |
| "}\n", | |
| "def try_external_site(url):\n", | |
| "    \"\"\"Report whether `url` still answers with a successful HTTP status.\n", | |
| "\n", | |
| "    Args:\n", | |
| "        url: Address to request with GET (redirects are followed).\n", | |
| "\n", | |
| "    Returns:\n", | |
| "        \"yes\" if the request completes and `response.ok` is true,\n", | |
| "        \"no\" for an error status or any failure while requesting.\n", | |
| "    \"\"\"\n", | |
| "    try:\n", | |
| "        # use smaller timeout to skip errors, but can result in failed downloads\n", | |
| "        response = requests.get(url, stream=False, timeout=10, allow_redirects=True, headers=headers)\n", | |
| "    except Exception:\n", | |
| "        # Best effort: any ordinary failure counts as \"no\". Unlike a bare\n", | |
| "        # except, this lets KeyboardInterrupt/SystemExit stop a long run.\n", | |
| "        return \"no\"\n", | |
| "    return \"yes\" if response.ok else \"no\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 159, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Load the spreadsheet, copied out of the Google Doc as a tab-separated file\n", | |
| "df = pd.read_csv(\"elephants_data/elephants.tsv\", sep='\\t')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 160, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>ID</th>\n", | |
| " <th>Year(s)</th>\n", | |
| " <th>Parent ID</th>\n", | |
| " <th>Modal URL</th>\n", | |
| " <th>line drawing URL</th>\n", | |
| " <th>digital facsimile of original source URL</th>\n", | |
| " <th>digital facsimile still around?</th>\n", | |
| " <th>image only URL</th>\n", | |
| " <th>image URL still around?</th>\n", | |
| " <th>source site URL</th>\n", | |
| " <th>source URL still around?</th>\n", | |
| " <th>last checked</th>\n", | |
| " <th>comments</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>30</th>\n", | |
| " <td>31</td>\n", | |
| " <td>1444</td>\n", | |
| " <td>30</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>53</th>\n", | |
| " <td>54</td>\n", | |
| " <td>~1250</td>\n", | |
| " <td>48</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>70</th>\n", | |
| " <td>71</td>\n", | |
| " <td>~1270-1290</td>\n", | |
| " <td>trunk</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>72</th>\n", | |
| " <td>73</td>\n", | |
| " <td>~1500</td>\n", | |
| " <td>72</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>29</th>\n", | |
| " <td>30</td>\n", | |
| " <td>~1350</td>\n", | |
| " <td>29</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td>10</td>\n", | |
| " <td>1050-1075</td>\n", | |
| " <td>trunk</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>http://visualiseur.bnf.fr/ConsulterElementNum?...</td>\n", | |
| " <td>yes</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>http://mandragore.bnf.fr/jsp/classementThema.jsp</td>\n", | |
| " <td>yes</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>135</th>\n", | |
| " <td>136</td>\n", | |
| " <td>1482</td>\n", | |
| " <td>135</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>nightmare elephant</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>4</td>\n", | |
| " <td>~1450</td>\n", | |
| " <td>3</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>http://visualiseur.bnf.fr/ConsulterElementNum?...</td>\n", | |
| " <td>yes</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>http://mandragore.bnf.fr/jsp/classementThema.jsp</td>\n", | |
| " <td>yes</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>158</th>\n", | |
| " <td>159</td>\n", | |
| " <td>1765</td>\n", | |
| " <td>158</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>156</th>\n", | |
| " <td>157</td>\n", | |
| " <td>1664</td>\n", | |
| " <td>155</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " ID Year(s) Parent ID \\\n", | |
| "30 31 1444 30 \n", | |
| "53 54 ~1250 48 \n", | |
| "70 71 ~1270-1290 trunk \n", | |
| "72 73 ~1500 72 \n", | |
| "29 30 ~1350 29 \n", | |
| "9 10 1050-1075 trunk \n", | |
| "135 136 1482 135 \n", | |
| "3 4 ~1450 3 \n", | |
| "158 159 1765 158 \n", | |
| "156 157 1664 155 \n", | |
| "\n", | |
| " Modal URL \\\n", | |
| "30 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "53 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "70 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "72 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "29 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "9 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "135 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "3 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "158 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "156 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "\n", | |
| " line drawing URL \\\n", | |
| "30 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "53 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "70 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "72 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "29 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "9 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "135 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "3 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "158 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "156 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "\n", | |
| " digital facsimile of original source URL \\\n", | |
| "30 NaN \n", | |
| "53 NaN \n", | |
| "70 NaN \n", | |
| "72 NaN \n", | |
| "29 NaN \n", | |
| "9 http://visualiseur.bnf.fr/ConsulterElementNum?... \n", | |
| "135 NaN \n", | |
| "3 http://visualiseur.bnf.fr/ConsulterElementNum?... \n", | |
| "158 NaN \n", | |
| "156 NaN \n", | |
| "\n", | |
| " digital facsimile still around? image only URL image URL still around? \\\n", | |
| "30 NaN NaN NaN \n", | |
| "53 NaN NaN NaN \n", | |
| "70 NaN NaN NaN \n", | |
| "72 NaN NaN NaN \n", | |
| "29 NaN NaN NaN \n", | |
| "9 yes NaN NaN \n", | |
| "135 NaN NaN NaN \n", | |
| "3 yes NaN NaN \n", | |
| "158 NaN NaN NaN \n", | |
| "156 NaN NaN NaN \n", | |
| "\n", | |
| " source site URL \\\n", | |
| "30 NaN \n", | |
| "53 NaN \n", | |
| "70 NaN \n", | |
| "72 NaN \n", | |
| "29 NaN \n", | |
| "9 http://mandragore.bnf.fr/jsp/classementThema.jsp \n", | |
| "135 NaN \n", | |
| "3 http://mandragore.bnf.fr/jsp/classementThema.jsp \n", | |
| "158 NaN \n", | |
| "156 NaN \n", | |
| "\n", | |
| " source URL still around? last checked comments \n", | |
| "30 NaN 08.09.08 NaN \n", | |
| "53 NaN 08.09.08 NaN \n", | |
| "70 NaN 08.09.08 NaN \n", | |
| "72 NaN 08.09.08 NaN \n", | |
| "29 NaN 08.09.08 NaN \n", | |
| "9 yes 08.09.08 NaN \n", | |
| "135 NaN 08.09.08 nightmare elephant \n", | |
| "3 yes 08.09.08 NaN \n", | |
| "158 NaN 08.09.08 NaN \n", | |
| "156 NaN 08.09.08 NaN " | |
| ] | |
| }, | |
| "execution_count": 160, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Spot-check 10 randomly chosen rows\n", | |
| "df.sample(10)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 168, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def process_row(row):\n", | |
| "    \"\"\"Fill in the link columns of one spreadsheet row from its cached page.\n", | |
| "\n", | |
| "    Args:\n", | |
| "        row: One DataFrame row (as passed by `df.apply(..., axis=1)`) whose\n", | |
| "            'Modal URL' starts with `base_url`; the page must already be\n", | |
| "            cached under \"elephants_data/\".\n", | |
| "\n", | |
| "    Returns:\n", | |
| "        A copy of `row` with 'line drawing URL' set and, for every link found\n", | |
| "        on the page, its URL column and \"still around?\" column filled in.\n", | |
| "    \"\"\"\n", | |
| "    # Work on a copy: pandas does not support apply functions that mutate\n", | |
| "    # the object they are handed.\n", | |
| "    row = row.copy()\n", | |
| "    modal_url = row['Modal URL']\n", | |
| "\n", | |
| "    fname = modal_url.replace(base_url, \"elephants_data/\")\n", | |
| "\n", | |
| "    # The page was cached as raw bytes, so read bytes and let BeautifulSoup\n", | |
| "    # detect the encoding rather than relying on the platform default.\n", | |
| "    with open(fname, 'rb') as f:\n", | |
| "        html_page = BeautifulSoup(f, \"lxml\")\n", | |
| "\n", | |
| "    url_parts = modal_url.split('/')\n", | |
| "    image_part_url = url_parts[-2] + \"/jpgs/\" + url_parts[-1].replace('html','jpg')\n", | |
| "    row['line drawing URL'] = \"{}{}\".format(base_url, image_part_url)\n", | |
| "\n", | |
| "    # (exact link text on the page, URL column, liveness column)\n", | |
| "    link_columns = [\n", | |
| "        (\"view digital facsimile of original source\", 'digital facsimile of original source URL', 'digital facsimile still around?'),\n", | |
| "        (\"view source-site\", 'source site URL', 'source URL still around?'),\n", | |
| "        (\"image only\", 'image only URL', 'image URL still around?'),\n", | |
| "    ]\n", | |
| "    for link_text, url_column, alive_column in link_columns:\n", | |
| "        anchor = html_page.find(\"a\", string=link_text)\n", | |
| "        if anchor is not None:\n", | |
| "            row[url_column] = anchor['href']\n", | |
| "            row[alive_column] = try_external_site(anchor['href'])\n", | |
| "\n", | |
| "    return row" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Enrich every row; process_row checks external links, so this cell needs network access\n", | |
| "df = df.apply(process_row, axis=1)\n", | |
| "df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Replace the remaining NaN cells with the literal string 'missing'\n", | |
| "df = df.fillna(\"missing\")\n", | |
| "df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 170, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>ID</th>\n", | |
| " <th>Year(s)</th>\n", | |
| " <th>Parent ID</th>\n", | |
| " <th>Modal URL</th>\n", | |
| " <th>line drawing URL</th>\n", | |
| " <th>digital facsimile of original source URL</th>\n", | |
| " <th>digital facsimile still around?</th>\n", | |
| " <th>image only URL</th>\n", | |
| " <th>image URL still around?</th>\n", | |
| " <th>source site URL</th>\n", | |
| " <th>source URL still around?</th>\n", | |
| " <th>last checked</th>\n", | |
| " <th>comments</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>28</th>\n", | |
| " <td>29</td>\n", | |
| " <td>1304-1321</td>\n", | |
| " <td>28</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>http://www.finns-books.com/petepic1.htm</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>http://www.finns-books.com/pictures/peterbo1.jpg</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>47</th>\n", | |
| " <td>48</td>\n", | |
| " <td>1236-1245</td>\n", | |
| " <td>trunk</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>http://prodigi.bl.uk/illcat/ILLUMIN.ASP?Size=m...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>http://prodigi.bl.uk/IllImages/iBase\\component...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>38</th>\n", | |
| " <td>39</td>\n", | |
| " <td>~1230</td>\n", | |
| " <td>38</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>http://bestiary.ca/etexts/druce1919-2/druce%20...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>72</th>\n", | |
| " <td>73</td>\n", | |
| " <td>~1500</td>\n", | |
| " <td>72</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>52</th>\n", | |
| " <td>53</td>\n", | |
| " <td>~1275</td>\n", | |
| " <td>50</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>http://expositions.bnf.fr/bestiaire/grand/11_0...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>http://expositions.bnf.fr/bestiaire/images/3/1...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>140</th>\n", | |
| " <td>141</td>\n", | |
| " <td>1480-1485</td>\n", | |
| " <td>139</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>http://visualiseur.bnf.fr/ConsulterElementNum?...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>54</th>\n", | |
| " <td>55</td>\n", | |
| " <td>1255-1259</td>\n", | |
| " <td>54</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>http://www.collectbritain.co.uk/personalisatio...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>http://www.collectbritain.co.uk/mediastore/011...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>90</th>\n", | |
| " <td>91</td>\n", | |
| " <td>1293</td>\n", | |
| " <td>trunk</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>http://nausikaa2.mpiwg-berlin.mpg.de/cgi-bin/t...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>http://nausikaa2.mpiwg-berlin.mpg.de/digitalli...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>kind of rhino-like I guess?</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>131</th>\n", | |
| " <td>132</td>\n", | |
| " <td>~1450</td>\n", | |
| " <td>131</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>http://visualiseur.bnf.fr/ConsulterElementNum?...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>http://mandragore.bnf.fr/jsp/classementThema.jsp</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>ok picasso</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>91</th>\n", | |
| " <td>92</td>\n", | |
| " <td>~1300</td>\n", | |
| " <td>trunk</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>https://www.uliwestphal.de/elephas-anthropogen...</td>\n", | |
| " <td>http://bestiary.ca/etexts/druce1919-2/druce%20...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>08.09.08</td>\n", | |
| " <td>contains the ocean</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " ID Year(s) Parent ID \\\n", | |
| "28 29 1304-1321 28 \n", | |
| "47 48 1236-1245 trunk \n", | |
| "38 39 ~1230 38 \n", | |
| "72 73 ~1500 72 \n", | |
| "52 53 ~1275 50 \n", | |
| "140 141 1480-1485 139 \n", | |
| "54 55 1255-1259 54 \n", | |
| "90 91 1293 trunk \n", | |
| "131 132 ~1450 131 \n", | |
| "91 92 ~1300 trunk \n", | |
| "\n", | |
| " Modal URL \\\n", | |
| "28 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "47 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "38 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "72 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "52 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "140 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "54 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "90 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "131 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "91 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "\n", | |
| " line drawing URL \\\n", | |
| "28 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "47 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "38 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "72 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "52 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "140 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "54 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "90 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "131 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "91 https://www.uliwestphal.de/elephas-anthropogen... \n", | |
| "\n", | |
| " digital facsimile of original source URL \\\n", | |
| "28 http://www.finns-books.com/petepic1.htm \n", | |
| "47 http://prodigi.bl.uk/illcat/ILLUMIN.ASP?Size=m... \n", | |
| "38 http://bestiary.ca/etexts/druce1919-2/druce%20... \n", | |
| "72 NaN \n", | |
| "52 http://expositions.bnf.fr/bestiaire/grand/11_0... \n", | |
| "140 http://visualiseur.bnf.fr/ConsulterElementNum?... \n", | |
| "54 http://www.collectbritain.co.uk/personalisatio... \n", | |
| "90 http://nausikaa2.mpiwg-berlin.mpg.de/cgi-bin/t... \n", | |
| "131 http://visualiseur.bnf.fr/ConsulterElementNum?... \n", | |
| "91 http://bestiary.ca/etexts/druce1919-2/druce%20... \n", | |
| "\n", | |
| " digital facsimile still around? \\\n", | |
| "28 NaN \n", | |
| "47 NaN \n", | |
| "38 NaN \n", | |
| "72 NaN \n", | |
| "52 NaN \n", | |
| "140 NaN \n", | |
| "54 NaN \n", | |
| "90 NaN \n", | |
| "131 NaN \n", | |
| "91 NaN \n", | |
| "\n", | |
| " image only URL \\\n", | |
| "28 http://www.finns-books.com/pictures/peterbo1.jpg \n", | |
| "47 http://prodigi.bl.uk/IllImages/iBase\\component... \n", | |
| "38 NaN \n", | |
| "72 NaN \n", | |
| "52 http://expositions.bnf.fr/bestiaire/images/3/1... \n", | |
| "140 NaN \n", | |
| "54 http://www.collectbritain.co.uk/mediastore/011... \n", | |
| "90 http://nausikaa2.mpiwg-berlin.mpg.de/digitalli... \n", | |
| "131 NaN \n", | |
| "91 NaN \n", | |
| "\n", | |
| " image URL still around? source site URL \\\n", | |
| "28 NaN NaN \n", | |
| "47 NaN NaN \n", | |
| "38 NaN NaN \n", | |
| "72 NaN NaN \n", | |
| "52 NaN NaN \n", | |
| "140 NaN NaN \n", | |
| "54 NaN NaN \n", | |
| "90 NaN NaN \n", | |
| "131 NaN http://mandragore.bnf.fr/jsp/classementThema.jsp \n", | |
| "91 NaN NaN \n", | |
| "\n", | |
| " source URL still around? last checked comments \n", | |
| "28 NaN 08.09.08 NaN \n", | |
| "47 NaN 08.09.08 NaN \n", | |
| "38 NaN 08.09.08 NaN \n", | |
| "72 NaN 08.09.08 NaN \n", | |
| "52 NaN 08.09.08 NaN \n", | |
| "140 NaN 08.09.08 NaN \n", | |
| "54 NaN 08.09.08 NaN \n", | |
| "90 NaN 08.09.08 kind of rhino-like I guess? \n", | |
| "131 NaN 08.09.08 ok picasso \n", | |
| "91 NaN 08.09.08 contains the ocean " | |
| ] | |
| }, | |
| "execution_count": 170, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Spot-check 10 randomly chosen rows\n", | |
| "df.sample(10)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 161, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Write the table as tab-separated text, without the index column\n", | |
| "df.to_csv(\"elephants_data/results.tsv\", sep='\\t', index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.8" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment