Last active
April 17, 2025 14:28
-
-
Save rwcitek/e21d2c3aa0e28753ac57bfedc7581a39 to your computer and use it in GitHub Desktop.
census.api.blocked.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"name": "census.api.blocked.ipynb", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/rwcitek/e21d2c3aa0e28753ac57bfedc7581a39/census-api-blocked.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Setup" | |
], | |
"metadata": { | |
"id": "IU5z5Pp7_vnh" | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Date" | |
], | |
"metadata": { | |
"id": "DyzfnrRmhQDE" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!date" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Kf9GEeGL8K-u", | |
"outputId": "0eba74d0-2b63-4c80-8b49-e0611746256e" | |
}, | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Thu Apr 17 02:24:49 PM UTC 2025\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## IP addresses: v4 and v6" | |
], | |
"metadata": { | |
"id": "iQWPg4cYhMBC" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!curl 'https://api.ipify.org?format=json'\n", | |
"!curl 'https://api64.ipify.org?format=json'\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "jLJi-K9Ng_Yo", | |
"outputId": "9bb8c103-26e3-43ee-85e9-28dbf18bf126" | |
}, | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"{\"ip\":\"34.23.11.87\"}{\"ip\":\"34.23.11.87\"}" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%%capture\n", | |
"%%bash\n", | |
"apt-get update\n", | |
"apt-get install -y elinks jq" | |
], | |
"metadata": { | |
"id": "jfRDq8XIw9uB" | |
}, | |
"execution_count": 3, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import requests\n", | |
"from bs4 import BeautifulSoup as BS4\n", | |
"import urllib\n" | |
], | |
"metadata": { | |
"id": "gGD41kzz9kmZ" | |
}, | |
"execution_count": 4, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## URL (i.e., the target webpage)" | |
], | |
"metadata": { | |
"id": "t5W9ZqK8hR-o" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"url = \"https://www2.census.gov/geo/tiger/TIGER2024/LINEARWATER/\"\n", | |
"url" | |
], | |
"metadata": { | |
"id": "5CFxC__2cI0j", | |
"outputId": "5868c22b-f41c-4e5b-d1cf-1b31cfcd1b4b", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 49 | |
} | |
}, | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"'https://www2.census.gov/geo/tiger/TIGER2024/LINEARWATER/'" | |
], | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "string" | |
} | |
}, | |
"metadata": {}, | |
"execution_count": 5 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## curl" | |
], | |
"metadata": { | |
"id": "dPdARxOq_3fX" | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Request headers sent by curl" | |
], | |
"metadata": { | |
"id": "-35w_BIWQtB7" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!curl -s -v -I {url} 2>&1 | grep '^> '\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Txqsj5AhE7hz", | |
"outputId": "8d83f3b9-cd5c-4756-a5ea-d242ed1a3896" | |
}, | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"> HEAD /geo/tiger/TIGER2024/LINEARWATER/ HTTP/2\r\n", | |
"> Host: www2.census.gov\r\n", | |
"> user-agent: curl/7.81.0\r\n", | |
"> accept: */*\r\n", | |
"> \r\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Response headers returned by the server" | |
], | |
"metadata": { | |
"id": "JxaqrTCdQwOI" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!curl -s -I {url}\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Em32AdUXQW_3", | |
"outputId": "4314d67e-0368-4837-b5a2-a333ad60b4ab" | |
}, | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"HTTP/2 200 \r\n", | |
"\u001b[1mdate\u001b[0m: Thu, 17 Apr 2025 14:25:22 GMT\r\n", | |
"\u001b[1mcontent-type\u001b[0m: text/html;charset=ISO-8859-1\r\n", | |
"\u001b[1mcontent-security-policy\u001b[0m: default-src 'self' 'unsafe-inline' 'unsafe-eval' census.gov *.census.gov http://www.census.gov assets.adobedtm.com www.googletagmanager.com https://dpm.demdex.net https://cm.everesttech.net https://uscensusbureau.demdex.net dap.digitalgov.gov www.google-analytics.com; frame-ancestors 'self';\r\n", | |
"\u001b[1mx-frame-options\u001b[0m: SAMEORIGIN\r\n", | |
"\u001b[1mx-content-type-options\u001b[0m: nosniff\r\n", | |
"\u001b[1mcache-control\u001b[0m: public, max-age=14400\r\n", | |
"\u001b[1mreferrer-policy\u001b[0m: strict-origin-when-cross-origin\r\n", | |
"\u001b[1mvary\u001b[0m: Accept-Encoding\r\n", | |
"\u001b[1mx-xss-protection\u001b[0m: 1; mode=block\r\n", | |
"\u001b[1mstrict-transport-security\u001b[0m: max-age=31536000\r\n", | |
"\u001b[1mcf-cache-status\u001b[0m: HIT\r\n", | |
"\u001b[1mage\u001b[0m: 145076\r\n", | |
"\u001b[1mlast-modified\u001b[0m: Tue, 15 Apr 2025 22:07:26 GMT\r\n", | |
"\u001b[1mexpires\u001b[0m: Thu, 17 Apr 2025 18:25:22 GMT\r\n", | |
"\u001b[1mset-cookie\u001b[0m: __cf_bm=U_vfFfir1RYtfYNO_ncQmXkLKvckRqxs54M5.r2d5Ds-1744899922-1.0.1.1-_U0P.SR1g44S8wlzVIKmXmf8.pP9XWCjBA2dz2P4kaC6YBRxRFWyQzmhLKEVP_kEIHvR.xAW1rwCbRDUaKuJ0mCVo3qt9F3BVXG.3n4aDfY; path=/; expires=Thu, 17-Apr-25 14:55:22 GMT; domain=.www2.census.gov; HttpOnly; Secure; SameSite=None\r\n", | |
"\u001b[1mserver\u001b[0m: cloudflare\r\n", | |
"\u001b[1mcf-ray\u001b[0m: 931c97617d687bb2-ATL\r\n", | |
"\r\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Response body rendered as text by elinks (first 50 lines)" | |
], | |
"metadata": { | |
"id": "4SHG-33eQyYa" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!curl -s {url} | elinks --dump --dump-width 100 | head -50\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "OFxpPpDAE7e6", | |
"outputId": "171476ca-48ef-4b4e-bf7f-25358e520a34" | |
}, | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
" [1]U.S. flag\n", | |
"\n", | |
" An official website of the United States government\n", | |
"\n", | |
" [2]Skip to main content\n", | |
" [3]United States Census Bureau\n", | |
"\n", | |
" ══════════════════════════════════════════════════════════════════════════════════════════════\n", | |
"\n", | |
" end of header\n", | |
"\n", | |
" [4][ICO] [5]Name [6]Last modified [7]Size [8]Description\n", | |
" ══════════════════════════════════════════════════════════════════════════════════════════\n", | |
" [9][PARENTDIR] [10]Parent Directory - \n", | |
" [11][ ] [12]tl_2024_01001_linearwater.zip 2024-09-24 09:24 1.7M \n", | |
" [13][ ] [14]tl_2024_01003_linearwater.zip 2024-09-24 09:24 1.7M \n", | |
" [15][ ] [16]tl_2024_01005_linearwater.zip 2024-09-24 09:24 321K \n", | |
" [17][ ] [18]tl_2024_01007_linearwater.zip 2024-09-24 09:24 1.5M \n", | |
" [19][ ] [20]tl_2024_01009_linearwater.zip 2024-09-24 09:24 473K \n", | |
" [21][ ] [22]tl_2024_01011_linearwater.zip 2024-09-24 09:24 1.5M \n", | |
" [23][ ] [24]tl_2024_01013_linearwater.zip 2024-09-24 09:24 323K \n", | |
" [25][ ] [26]tl_2024_01015_linearwater.zip 2024-09-24 09:24 1.1M \n", | |
" [27][ ] [28]tl_2024_01017_linearwater.zip 2024-09-24 09:24 546K \n", | |
" [29][ ] [30]tl_2024_01019_linearwater.zip 2024-09-24 09:25 268K \n", | |
" [31][ ] [32]tl_2024_01021_linearwater.zip 2024-09-24 09:25 1.3M \n", | |
" [33][ ] [34]tl_2024_01023_linearwater.zip 2024-09-24 09:25 551K \n", | |
" [35][ ] [36]tl_2024_01025_linearwater.zip 2024-09-24 09:25 577K \n", | |
" [37][ ] [38]tl_2024_01027_linearwater.zip 2024-09-24 09:25 908K \n", | |
" [39][ ] [40]tl_2024_01029_linearwater.zip 2024-09-24 09:25 389K \n", | |
" [41][ ] [42]tl_2024_01031_linearwater.zip 2024-09-24 09:25 264K \n", | |
" [43][ ] [44]tl_2024_01033_linearwater.zip 2024-09-24 09:25 965K \n", | |
" [45][ ] [46]tl_2024_01035_linearwater.zip 2024-09-24 09:25 1.5M \n", | |
" [47][ ] [48]tl_2024_01037_linearwater.zip 2024-09-24 09:25 1.0M \n", | |
" [49][ ] [50]tl_2024_01039_linearwater.zip 2024-09-24 09:25 1.5M \n", | |
" [51][ ] [52]tl_2024_01041_linearwater.zip 2024-09-24 09:25 587K \n", | |
" [53][ ] [54]tl_2024_01043_linearwater.zip 2024-09-24 09:25 1.5M \n", | |
" [55][ ] [56]tl_2024_01045_linearwater.zip 2024-09-24 09:25 200K \n", | |
" [57][ ] [58]tl_2024_01047_linearwater.zip 2024-09-24 09:25 1.8M \n", | |
" [59][ ] [60]tl_2024_01049_linearwater.zip 2024-09-24 09:26 479K \n", | |
" [61][ ] [62]tl_2024_01051_linearwater.zip 2024-09-24 09:26 1.5M \n", | |
" [63][ ] [64]tl_2024_01053_linearwater.zip 2024-09-24 09:26 1.6M \n", | |
" [65][ ] [66]tl_2024_01055_linearwater.zip 2024-09-24 09:26 374K \n", | |
" [67][ ] [68]tl_2024_01057_linearwater.zip 2024-09-24 09:26 184K \n", | |
" [69][ ] [70]tl_2024_01059_linearwater.zip 2024-09-24 09:26 929K \n", | |
" [71][ ] [72]tl_2024_01061_linearwater.zip 2024-09-24 09:26 408K \n", | |
" [73][ ] [74]tl_2024_01063_linearwater.zip 2024-09-24 09:26 670K \n", | |
" [75][ ] [76]tl_2024_01065_linearwater.zip 2024-09-24 09:26 1.7M \n", | |
" [77][ ] [78]tl_2024_01067_linearwater.zip 2024-09-24 09:26 169K \n", | |
" [79][ ] [80]tl_2024_01069_linearwater.zip 2024-09-24 09:26 405K \n", | |
" [81][ ] [82]tl_2024_01071_linearwater.zip 2024-09-24 09:26 653K \n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## elinks" | |
], | |
"metadata": { | |
"id": "sqxZFeq-_5mJ" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!elinks --dump 'https://httpbin.org/headers' | tr -s '\\n\\r' ' ' | jq .headers\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "u6brXeoVMnma", | |
"outputId": "6fb2983c-839a-49ca-92ed-0251b9ed7b9c" | |
}, | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"\u001b[1;39m{\n", | |
" \u001b[0m\u001b[34;1m\"Accept\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"*/*\"\u001b[0m\u001b[1;39m,\n", | |
" \u001b[0m\u001b[34;1m\"Accept-Language\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"en\"\u001b[0m\u001b[1;39m,\n", | |
" \u001b[0m\u001b[34;1m\"Host\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"httpbin.org\"\u001b[0m\u001b[1;39m,\n", | |
" \u001b[0m\u001b[34;1m\"User-Agent\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"ELinks/0.13.2 (textmode; Linux 6.1.85+ x86_64; -)\"\u001b[0m\u001b[1;39m,\n", | |
" \u001b[0m\u001b[34;1m\"X-Amzn-Trace-Id\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"Root=1-68010f52-059fcaac499fb39733a94046\"\u001b[0m\u001b[1;39m\n", | |
"\u001b[1;39m}\u001b[0m\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!elinks --dump --dump-width 100 {url} | head -50\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "_1uS6TRVE7cT", | |
"outputId": "b2275a27-631d-4fc6-e17c-2b1f37870448" | |
}, | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
" [1]U.S. flag\n", | |
"\n", | |
" An official website of the United States government\n", | |
"\n", | |
" [2]Skip to main content\n", | |
" [3]United States Census Bureau\n", | |
"\n", | |
" ══════════════════════════════════════════════════════════════════════════════════════════════\n", | |
"\n", | |
" end of header\n", | |
"\n", | |
" [4][ICO] [5]Name [6]Last modified [7]Size [8]Description\n", | |
" ══════════════════════════════════════════════════════════════════════════════════════════\n", | |
" [9][PARENTDIR] [10]Parent Directory - \n", | |
" [11][ ] [12]tl_2024_01001_linearwater.zip 2024-09-24 09:24 1.7M \n", | |
" [13][ ] [14]tl_2024_01003_linearwater.zip 2024-09-24 09:24 1.7M \n", | |
" [15][ ] [16]tl_2024_01005_linearwater.zip 2024-09-24 09:24 321K \n", | |
" [17][ ] [18]tl_2024_01007_linearwater.zip 2024-09-24 09:24 1.5M \n", | |
" [19][ ] [20]tl_2024_01009_linearwater.zip 2024-09-24 09:24 473K \n", | |
" [21][ ] [22]tl_2024_01011_linearwater.zip 2024-09-24 09:24 1.5M \n", | |
" [23][ ] [24]tl_2024_01013_linearwater.zip 2024-09-24 09:24 323K \n", | |
" [25][ ] [26]tl_2024_01015_linearwater.zip 2024-09-24 09:24 1.1M \n", | |
" [27][ ] [28]tl_2024_01017_linearwater.zip 2024-09-24 09:24 546K \n", | |
" [29][ ] [30]tl_2024_01019_linearwater.zip 2024-09-24 09:25 268K \n", | |
" [31][ ] [32]tl_2024_01021_linearwater.zip 2024-09-24 09:25 1.3M \n", | |
" [33][ ] [34]tl_2024_01023_linearwater.zip 2024-09-24 09:25 551K \n", | |
" [35][ ] [36]tl_2024_01025_linearwater.zip 2024-09-24 09:25 577K \n", | |
" [37][ ] [38]tl_2024_01027_linearwater.zip 2024-09-24 09:25 908K \n", | |
" [39][ ] [40]tl_2024_01029_linearwater.zip 2024-09-24 09:25 389K \n", | |
" [41][ ] [42]tl_2024_01031_linearwater.zip 2024-09-24 09:25 264K \n", | |
" [43][ ] [44]tl_2024_01033_linearwater.zip 2024-09-24 09:25 965K \n", | |
" [45][ ] [46]tl_2024_01035_linearwater.zip 2024-09-24 09:25 1.5M \n", | |
" [47][ ] [48]tl_2024_01037_linearwater.zip 2024-09-24 09:25 1.0M \n", | |
" [49][ ] [50]tl_2024_01039_linearwater.zip 2024-09-24 09:25 1.5M \n", | |
" [51][ ] [52]tl_2024_01041_linearwater.zip 2024-09-24 09:25 587K \n", | |
" [53][ ] [54]tl_2024_01043_linearwater.zip 2024-09-24 09:25 1.5M \n", | |
" [55][ ] [56]tl_2024_01045_linearwater.zip 2024-09-24 09:25 200K \n", | |
" [57][ ] [58]tl_2024_01047_linearwater.zip 2024-09-24 09:25 1.8M \n", | |
" [59][ ] [60]tl_2024_01049_linearwater.zip 2024-09-24 09:26 479K \n", | |
" [61][ ] [62]tl_2024_01051_linearwater.zip 2024-09-24 09:26 1.5M \n", | |
" [63][ ] [64]tl_2024_01053_linearwater.zip 2024-09-24 09:26 1.6M \n", | |
" [65][ ] [66]tl_2024_01055_linearwater.zip 2024-09-24 09:26 374K \n", | |
" [67][ ] [68]tl_2024_01057_linearwater.zip 2024-09-24 09:26 184K \n", | |
" [69][ ] [70]tl_2024_01059_linearwater.zip 2024-09-24 09:26 929K \n", | |
" [71][ ] [72]tl_2024_01061_linearwater.zip 2024-09-24 09:26 408K \n", | |
" [73][ ] [74]tl_2024_01063_linearwater.zip 2024-09-24 09:26 670K \n", | |
" [75][ ] [76]tl_2024_01065_linearwater.zip 2024-09-24 09:26 1.7M \n", | |
" [77][ ] [78]tl_2024_01067_linearwater.zip 2024-09-24 09:26 169K \n", | |
" [79][ ] [80]tl_2024_01069_linearwater.zip 2024-09-24 09:26 405K \n", | |
" [81][ ] [82]tl_2024_01071_linearwater.zip 2024-09-24 09:26 653K \n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Python requests library - v01 (default headers)" | |
], | |
"metadata": { | |
"id": "Zj1wVo60_7AB" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"response = requests.get( url )\n", | |
"response\n" | |
], | |
"metadata": { | |
"id": "luKGB5oD8l80", | |
"outputId": "8ba182a0-f688-46eb-b847-4133da0c79de", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<Response [200]>" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 11 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"{ k:v for k,v in sorted(response.request.headers.items()) }\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "EAIJYwtcHjKp", | |
"outputId": "6f25fe2d-fb2a-4c59-f96a-a23a9cc91eb5" | |
}, | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"{'Accept': '*/*',\n", | |
" 'Accept-Encoding': 'gzip, deflate, zstd',\n", | |
" 'Connection': 'keep-alive',\n", | |
" 'User-Agent': 'python-requests/2.32.3'}" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 12 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"html = response.text\n", | |
"lines = BS4(html, \"html.parser\").find(\"body\").text.split(\"\\n\")\n", | |
"top = [ x.strip() for x in lines[:100] if x != \"\" ][:50]\n", | |
"print(\"\\n\".join(top))\n" | |
], | |
"metadata": { | |
"id": "oBzvQN0G-nJA", | |
"outputId": "235d3501-cf1a-4006-ea56-40b7f4795610", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"execution_count": 13, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"An official website of the United States government\n", | |
"\n", | |
"Skip to main content\n", | |
"end of header\n", | |
"NameLast modifiedSizeDescription\n", | |
"Parent Directory -\n", | |
"tl_2024_01001_linearwater.zip2024-09-24 09:24 1.7M\n", | |
"tl_2024_01003_linearwater.zip2024-09-24 09:24 1.7M\n", | |
"tl_2024_01005_linearwater.zip2024-09-24 09:24 321K\n", | |
"tl_2024_01007_linearwater.zip2024-09-24 09:24 1.5M\n", | |
"tl_2024_01009_linearwater.zip2024-09-24 09:24 473K\n", | |
"tl_2024_01011_linearwater.zip2024-09-24 09:24 1.5M\n", | |
"tl_2024_01013_linearwater.zip2024-09-24 09:24 323K\n", | |
"tl_2024_01015_linearwater.zip2024-09-24 09:24 1.1M\n", | |
"tl_2024_01017_linearwater.zip2024-09-24 09:24 546K\n", | |
"tl_2024_01019_linearwater.zip2024-09-24 09:25 268K\n", | |
"tl_2024_01021_linearwater.zip2024-09-24 09:25 1.3M\n", | |
"tl_2024_01023_linearwater.zip2024-09-24 09:25 551K\n", | |
"tl_2024_01025_linearwater.zip2024-09-24 09:25 577K\n", | |
"tl_2024_01027_linearwater.zip2024-09-24 09:25 908K\n", | |
"tl_2024_01029_linearwater.zip2024-09-24 09:25 389K\n", | |
"tl_2024_01031_linearwater.zip2024-09-24 09:25 264K\n", | |
"tl_2024_01033_linearwater.zip2024-09-24 09:25 965K\n", | |
"tl_2024_01035_linearwater.zip2024-09-24 09:25 1.5M\n", | |
"tl_2024_01037_linearwater.zip2024-09-24 09:25 1.0M\n", | |
"tl_2024_01039_linearwater.zip2024-09-24 09:25 1.5M\n", | |
"tl_2024_01041_linearwater.zip2024-09-24 09:25 587K\n", | |
"tl_2024_01043_linearwater.zip2024-09-24 09:25 1.5M\n", | |
"tl_2024_01045_linearwater.zip2024-09-24 09:25 200K\n", | |
"tl_2024_01047_linearwater.zip2024-09-24 09:25 1.8M\n", | |
"tl_2024_01049_linearwater.zip2024-09-24 09:26 479K\n", | |
"tl_2024_01051_linearwater.zip2024-09-24 09:26 1.5M\n", | |
"tl_2024_01053_linearwater.zip2024-09-24 09:26 1.6M\n", | |
"tl_2024_01055_linearwater.zip2024-09-24 09:26 374K\n", | |
"tl_2024_01057_linearwater.zip2024-09-24 09:26 184K\n", | |
"tl_2024_01059_linearwater.zip2024-09-24 09:26 929K\n", | |
"tl_2024_01061_linearwater.zip2024-09-24 09:26 408K\n", | |
"tl_2024_01063_linearwater.zip2024-09-24 09:26 670K\n", | |
"tl_2024_01065_linearwater.zip2024-09-24 09:26 1.7M\n", | |
"tl_2024_01067_linearwater.zip2024-09-24 09:26 169K\n", | |
"tl_2024_01069_linearwater.zip2024-09-24 09:26 405K\n", | |
"tl_2024_01071_linearwater.zip2024-09-24 09:26 653K\n", | |
"tl_2024_01073_linearwater.zip2024-09-24 09:26 780K\n", | |
"tl_2024_01075_linearwater.zip2024-09-24 09:26 290K\n", | |
"tl_2024_01077_linearwater.zip2024-09-24 09:26 1.3M\n", | |
"tl_2024_01079_linearwater.zip2024-09-24 09:27 1.3M\n", | |
"tl_2024_01081_linearwater.zip2024-09-24 09:27 650K\n", | |
"tl_2024_01083_linearwater.zip2024-09-24 09:27 1.1M\n", | |
"tl_2024_01085_linearwater.zip2024-09-24 09:27 2.5M\n", | |
"tl_2024_01087_linearwater.zip2024-09-24 09:27 2.0M\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Python requests library - v02 (browser-like headers)" | |
], | |
"metadata": { | |
"id": "ioPWQWemcC3S" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"html = None\n", | |
"response = None\n", | |
"headers = {\n", | |
" \"Accept\": (\n", | |
" \"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,\"\n", | |
" \"image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7\"\n", | |
" ),\n", | |
" \"Accept-Language\": \"en-US,en;q=0.9\",\n", | |
" \"Cache-Control\": \"max-age=0\",\n", | |
" \"Priority\": \"u=0, i\",\n", | |
" # \"sec-ch-ua\": '\"Not(A:Brand\";v=\"99\", \"Google Chrome\";v=\"133\", \"Chromium\";v=\"133\"',\n", | |
" # \"sec-ch-ua-mobile\": \"?0\",\n", | |
" # \"sec-ch-ua-platform\": '\"macOS\"',\n", | |
" # \"sec-fetch-dest\": \"document\",\n", | |
" # \"sec-fetch-mode\": \"navigate\",\n", | |
" # \"sec-fetch-site\": \"none\",\n", | |
" # \"sec-fetch-user\": \"?1\",\n", | |
" \"Upgrade-Insecure-Requests\": \"1\",\n", | |
" \"User-Agent\": (\n", | |
" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) \"\n", | |
" \"Chrome/133.0.0.0 Safari/537.36\"\n", | |
" ),\n", | |
"}\n", | |
"{ k:v for k,v in sorted(headers.items()) }\n" | |
], | |
"metadata": { | |
"id": "nMu0vgcI-VtF", | |
"outputId": "90a71443-b030-44ae-b4dd-4bca8b1af46e", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"execution_count": 14, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"{'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',\n", | |
" 'Accept-Language': 'en-US,en;q=0.9',\n", | |
" 'Cache-Control': 'max-age=0',\n", | |
" 'Priority': 'u=0, i',\n", | |
" 'Upgrade-Insecure-Requests': '1',\n", | |
" 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36'}" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 14 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"response = requests.get( url, headers=headers)\n", | |
"response\n" | |
], | |
"metadata": { | |
"id": "5nMN9MiJcZ_I", | |
"outputId": "cb8cd256-16b1-4f88-dfb4-256b8a25054f", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"execution_count": 15, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<Response [200]>" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 15 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"{ k:v for k,v in sorted(response.request.headers.items()) }\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "C-gXaL0JHrPn", | |
"outputId": "48317365-ba88-4360-81ff-1e1d87d29dbd" | |
}, | |
"execution_count": 16, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"{'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',\n", | |
" 'Accept-Encoding': 'gzip, deflate, zstd',\n", | |
" 'Accept-Language': 'en-US,en;q=0.9',\n", | |
" 'Cache-Control': 'max-age=0',\n", | |
" 'Connection': 'keep-alive',\n", | |
" 'Priority': 'u=0, i',\n", | |
" 'Upgrade-Insecure-Requests': '1',\n", | |
" 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36'}" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 16 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"html = response.text\n", | |
"lines = BS4(html, \"html.parser\").find(\"body\").text.split(\"\\n\")\n", | |
"top = [ x.strip() for x in lines[:100] if x != \"\" ][:50]\n", | |
"print(\"\\n\".join(top))\n" | |
], | |
"metadata": { | |
"id": "24POxIuCX-0C", | |
"outputId": "a411b5b8-e622-439f-a295-926cd839c593", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"execution_count": 17, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"An official website of the United States government\n", | |
"\n", | |
"Skip to main content\n", | |
"end of header\n", | |
"NameLast modifiedSizeDescription\n", | |
"Parent Directory -\n", | |
"tl_2024_01001_linearwater.zip2024-09-24 09:24 1.7M\n", | |
"tl_2024_01003_linearwater.zip2024-09-24 09:24 1.7M\n", | |
"tl_2024_01005_linearwater.zip2024-09-24 09:24 321K\n", | |
"tl_2024_01007_linearwater.zip2024-09-24 09:24 1.5M\n", | |
"tl_2024_01009_linearwater.zip2024-09-24 09:24 473K\n", | |
"tl_2024_01011_linearwater.zip2024-09-24 09:24 1.5M\n", | |
"tl_2024_01013_linearwater.zip2024-09-24 09:24 323K\n", | |
"tl_2024_01015_linearwater.zip2024-09-24 09:24 1.1M\n", | |
"tl_2024_01017_linearwater.zip2024-09-24 09:24 546K\n", | |
"tl_2024_01019_linearwater.zip2024-09-24 09:25 268K\n", | |
"tl_2024_01021_linearwater.zip2024-09-24 09:25 1.3M\n", | |
"tl_2024_01023_linearwater.zip2024-09-24 09:25 551K\n", | |
"tl_2024_01025_linearwater.zip2024-09-24 09:25 577K\n", | |
"tl_2024_01027_linearwater.zip2024-09-24 09:25 908K\n", | |
"tl_2024_01029_linearwater.zip2024-09-24 09:25 389K\n", | |
"tl_2024_01031_linearwater.zip2024-09-24 09:25 264K\n", | |
"tl_2024_01033_linearwater.zip2024-09-24 09:25 965K\n", | |
"tl_2024_01035_linearwater.zip2024-09-24 09:25 1.5M\n", | |
"tl_2024_01037_linearwater.zip2024-09-24 09:25 1.0M\n", | |
"tl_2024_01039_linearwater.zip2024-09-24 09:25 1.5M\n", | |
"tl_2024_01041_linearwater.zip2024-09-24 09:25 587K\n", | |
"tl_2024_01043_linearwater.zip2024-09-24 09:25 1.5M\n", | |
"tl_2024_01045_linearwater.zip2024-09-24 09:25 200K\n", | |
"tl_2024_01047_linearwater.zip2024-09-24 09:25 1.8M\n", | |
"tl_2024_01049_linearwater.zip2024-09-24 09:26 479K\n", | |
"tl_2024_01051_linearwater.zip2024-09-24 09:26 1.5M\n", | |
"tl_2024_01053_linearwater.zip2024-09-24 09:26 1.6M\n", | |
"tl_2024_01055_linearwater.zip2024-09-24 09:26 374K\n", | |
"tl_2024_01057_linearwater.zip2024-09-24 09:26 184K\n", | |
"tl_2024_01059_linearwater.zip2024-09-24 09:26 929K\n", | |
"tl_2024_01061_linearwater.zip2024-09-24 09:26 408K\n", | |
"tl_2024_01063_linearwater.zip2024-09-24 09:26 670K\n", | |
"tl_2024_01065_linearwater.zip2024-09-24 09:26 1.7M\n", | |
"tl_2024_01067_linearwater.zip2024-09-24 09:26 169K\n", | |
"tl_2024_01069_linearwater.zip2024-09-24 09:26 405K\n", | |
"tl_2024_01071_linearwater.zip2024-09-24 09:26 653K\n", | |
"tl_2024_01073_linearwater.zip2024-09-24 09:26 780K\n", | |
"tl_2024_01075_linearwater.zip2024-09-24 09:26 290K\n", | |
"tl_2024_01077_linearwater.zip2024-09-24 09:26 1.3M\n", | |
"tl_2024_01079_linearwater.zip2024-09-24 09:27 1.3M\n", | |
"tl_2024_01081_linearwater.zip2024-09-24 09:27 650K\n", | |
"tl_2024_01083_linearwater.zip2024-09-24 09:27 1.1M\n", | |
"tl_2024_01085_linearwater.zip2024-09-24 09:27 2.5M\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Python urllib library" | |
], | |
"metadata": { | |
"id": "nnD3jGHZcgbf" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"html = None\n", | |
"response = None\n", | |
"request = urllib.request.Request(url)\n", | |
"for k, v in headers.items():\n", | |
" request.add_header(k, v)\n" | |
], | |
"metadata": { | |
"id": "uZ7G_6JjcqrC" | |
}, | |
"execution_count": 18, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"{ k:v for k,v in sorted(request.headers.items()) }\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "WE-NneWKIe2g", | |
"outputId": "8a4b0a57-3e22-4080-b34b-f013f3649dbd" | |
}, | |
"execution_count": 19, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"{'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',\n", | |
" 'Accept-language': 'en-US,en;q=0.9',\n", | |
" 'Cache-control': 'max-age=0',\n", | |
" 'Priority': 'u=0, i',\n", | |
" 'Upgrade-insecure-requests': '1',\n", | |
" 'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36'}" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 19 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"response = urllib.request.urlopen(request)\n", | |
"response.reason, response.status\n" | |
], | |
"metadata": { | |
"id": "RtdkB89Tctat", | |
"outputId": "1f72e67f-d02b-40c9-fcf7-bc2b76c28d80", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"execution_count": 20, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"('OK', 200)" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 20 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"html = response.read()\n" | |
], | |
"metadata": { | |
"id": "DsDjIBmrheLf" | |
}, | |
"execution_count": 21, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"lines = BS4(html, \"html.parser\").find(\"body\").text.split(\"\\n\")\n", | |
"top = [ x.strip() for x in lines[:100] if x != \"\" ][:50]\n", | |
"print(\"\\n\".join(top))\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "L84DtdsShx7m", | |
"outputId": "f047830f-03e6-455c-c56e-0c19159c083f" | |
}, | |
"execution_count": 22, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"An official website of the United States government\n", | |
"\n", | |
"Skip to main content\n", | |
"end of header\n", | |
"NameLast modifiedSizeDescription\n", | |
"Parent Directory -\n", | |
"tl_2024_01001_linearwater.zip2024-09-24 09:24 1.7M\n", | |
"tl_2024_01003_linearwater.zip2024-09-24 09:24 1.7M\n", | |
"tl_2024_01005_linearwater.zip2024-09-24 09:24 321K\n", | |
"tl_2024_01007_linearwater.zip2024-09-24 09:24 1.5M\n", | |
"tl_2024_01009_linearwater.zip2024-09-24 09:24 473K\n", | |
"tl_2024_01011_linearwater.zip2024-09-24 09:24 1.5M\n", | |
"tl_2024_01013_linearwater.zip2024-09-24 09:24 323K\n", | |
"tl_2024_01015_linearwater.zip2024-09-24 09:24 1.1M\n", | |
"tl_2024_01017_linearwater.zip2024-09-24 09:24 546K\n", | |
"tl_2024_01019_linearwater.zip2024-09-24 09:25 268K\n", | |
"tl_2024_01021_linearwater.zip2024-09-24 09:25 1.3M\n", | |
"tl_2024_01023_linearwater.zip2024-09-24 09:25 551K\n", | |
"tl_2024_01025_linearwater.zip2024-09-24 09:25 577K\n", | |
"tl_2024_01027_linearwater.zip2024-09-24 09:25 908K\n", | |
"tl_2024_01029_linearwater.zip2024-09-24 09:25 389K\n", | |
"tl_2024_01031_linearwater.zip2024-09-24 09:25 264K\n", | |
"tl_2024_01033_linearwater.zip2024-09-24 09:25 965K\n", | |
"tl_2024_01035_linearwater.zip2024-09-24 09:25 1.5M\n", | |
"tl_2024_01037_linearwater.zip2024-09-24 09:25 1.0M\n", | |
"tl_2024_01039_linearwater.zip2024-09-24 09:25 1.5M\n", | |
"tl_2024_01041_linearwater.zip2024-09-24 09:25 587K\n", | |
"tl_2024_01043_linearwater.zip2024-09-24 09:25 1.5M\n", | |
"tl_2024_01045_linearwater.zip2024-09-24 09:25 200K\n", | |
"tl_2024_01047_linearwater.zip2024-09-24 09:25 1.8M\n", | |
"tl_2024_01049_linearwater.zip2024-09-24 09:26 479K\n", | |
"tl_2024_01051_linearwater.zip2024-09-24 09:26 1.5M\n", | |
"tl_2024_01053_linearwater.zip2024-09-24 09:26 1.6M\n", | |
"tl_2024_01055_linearwater.zip2024-09-24 09:26 374K\n", | |
"tl_2024_01057_linearwater.zip2024-09-24 09:26 184K\n", | |
"tl_2024_01059_linearwater.zip2024-09-24 09:26 929K\n", | |
"tl_2024_01061_linearwater.zip2024-09-24 09:26 408K\n", | |
"tl_2024_01063_linearwater.zip2024-09-24 09:26 670K\n", | |
"tl_2024_01065_linearwater.zip2024-09-24 09:26 1.7M\n", | |
"tl_2024_01067_linearwater.zip2024-09-24 09:26 169K\n", | |
"tl_2024_01069_linearwater.zip2024-09-24 09:26 405K\n", | |
"tl_2024_01071_linearwater.zip2024-09-24 09:26 653K\n", | |
"tl_2024_01073_linearwater.zip2024-09-24 09:26 780K\n", | |
"tl_2024_01075_linearwater.zip2024-09-24 09:26 290K\n", | |
"tl_2024_01077_linearwater.zip2024-09-24 09:26 1.3M\n", | |
"tl_2024_01079_linearwater.zip2024-09-24 09:27 1.3M\n", | |
"tl_2024_01081_linearwater.zip2024-09-24 09:27 650K\n", | |
"tl_2024_01083_linearwater.zip2024-09-24 09:27 1.1M\n", | |
"tl_2024_01085_linearwater.zip2024-09-24 09:27 2.5M\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Python urllib library - Stack Overflow" | |
], | |
"metadata": { | |
"id": "BxmopvJvc6Nu" | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"Source: https://stackoverflow.com/a/74674276" | |
], | |
"metadata": { | |
"id": "992X7rTudHbI" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"html = None\n", | |
"response = None\n", | |
"req = urllib.request.Request( url )\n", | |
"req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')\n", | |
"req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')\n", | |
"req.add_header('Accept-Language', 'en-US,en;q=0.5')\n", | |
"\n", | |
"{ k:v for k,v in sorted(req.headers.items()) }\n" | |
], | |
"metadata": { | |
"id": "25R1LDHEdD9K", | |
"outputId": "e5e20919-ce73-484d-e559-d86f34b14590", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"execution_count": 23, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"{'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',\n", | |
" 'Accept-language': 'en-US,en;q=0.5',\n", | |
" 'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0'}" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 23 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"r = urllib.request.urlopen(req)\n", | |
"r.reason, r.status\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Vp0v6XK5RiPi", | |
"outputId": "c9f7c5aa-beb2-4fa0-d10b-b843b1f8ab15" | |
}, | |
"execution_count": 24, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"('OK', 200)" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 24 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "TirBac5_dYQ2" | |
}, | |
"execution_count": 24, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment