Last active
December 26, 2021 09:56
-
-
Save vol1ura/722bbe3cd9dadfcaac9e778cd2055a70 to your computer and use it in GitHub Desktop.
clubmates.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "clubmates.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyMSbfzztz2IMxYL6dDXQJeG", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/vol1ura/722bbe3cd9dadfcaac9e778cd2055a70/clubmates.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "zRg0J0r6ztxV" | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import random\n", | |
"import re\n", | |
"import requests\n", | |
"import time\n", | |
"from tqdm.notebook import tqdm\n", | |
"\n", | |
"pd.set_option('display.max_rows', None)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!pip install random_user_agent" | |
], | |
"metadata": { | |
"id": "Z61g9j-HFebY", | |
"outputId": "3c9ecf80-d679-4e10-b42c-67e33d268940", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Requirement already satisfied: random_user_agent in /usr/local/lib/python3.7/dist-packages (1.0.1)\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from random_user_agent.user_agent import UserAgent\n", | |
"\n", | |
"user_agent_rotator = UserAgent()" | |
], | |
"metadata": { | |
"id": "bZnw6iEPFy36" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"club_id = 23212 # Wake&Run\n", | |
"home_parkrun = 'Kuzminki'" | |
], | |
"metadata": { | |
"id": "clo0v2xzzyLU" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"headers = {\n", | |
" 'Host': 'www.parkrun.ru',\n", | |
" 'User-Agent': user_agent_rotator.get_random_user_agent(),\n", | |
" 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',\n", | |
" 'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',\n", | |
" 'Accept-Encoding': 'gzip, deflate, br',\n", | |
" 'Connection': 'keep-alive',\n", | |
" 'Upgrade-Insecure-Requests': '1',\n", | |
" 'Sec-GPC': '1',\n", | |
" 'TE': 'Trailers'\n", | |
" }" | |
], | |
"metadata": { | |
"id": "JMntqd19z4bw" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"page_all_results = requests.get('https://www.parkrun.ru/results/courserecords/', headers=headers)\n", | |
"data = pd.read_html(page_all_results.text)[0]\n", | |
"russian_parkruns = data[data.columns[0]]" | |
], | |
"metadata": { | |
"id": "ctIMuJuVz7T9", | |
"outputId": "29d2ffd0-250d-40e1-a1f9-a2edb2e98511", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 348 | |
} | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "error", | |
"ename": "ValueError", | |
"evalue": "ignored", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-58-7fb0c7ed9957>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mpage_all_results\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'https://www.parkrun.ru/results/courserecords/'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_html\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpage_all_results\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mrussian_parkruns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 294\u001b[0m )\n\u001b[1;32m 295\u001b[0m \u001b[0mwarnings\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwarn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mFutureWarning\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstacklevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstacklevel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 296\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 297\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 298\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/io/html.py\u001b[0m in \u001b[0;36mread_html\u001b[0;34m(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only)\u001b[0m\n\u001b[1;32m 1099\u001b[0m \u001b[0mna_values\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mna_values\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1100\u001b[0m \u001b[0mkeep_default_na\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mkeep_default_na\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1101\u001b[0;31m \u001b[0mdisplayed_only\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdisplayed_only\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1102\u001b[0m )\n", | |
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/io/html.py\u001b[0m in \u001b[0;36m_parse\u001b[0;34m(flavor, io, match, attrs, encoding, displayed_only, **kwargs)\u001b[0m\n\u001b[1;32m 915\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 916\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 917\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mretained\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 918\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 919\u001b[0m \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/io/html.py\u001b[0m in \u001b[0;36m_parse\u001b[0;34m(flavor, io, match, attrs, encoding, displayed_only, **kwargs)\u001b[0m\n\u001b[1;32m 896\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 897\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 898\u001b[0;31m \u001b[0mtables\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparse_tables\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 899\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mcaught\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 900\u001b[0m \u001b[0;31m# if `io` is an io-like object, check if it's seekable\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/io/html.py\u001b[0m in \u001b[0;36mparse_tables\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 215\u001b[0m \u001b[0mlist\u001b[0m \u001b[0mof\u001b[0m \u001b[0mparsed\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mheader\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfooter\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0mtuples\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mtables\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 216\u001b[0m \"\"\"\n\u001b[0;32m--> 217\u001b[0;31m \u001b[0mtables\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_tables\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_build_doc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmatch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattrs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 218\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_thead_tbody_tfoot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtable\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtable\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtables\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/io/html.py\u001b[0m in \u001b[0;36m_parse_tables\u001b[0;34m(self, doc, match, attrs)\u001b[0m\n\u001b[1;32m 545\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 546\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtables\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 547\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"No tables found\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 548\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mValueError\u001b[0m: No tables found" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"dfs = []\n", | |
"for parkrun in tqdm(russian_parkruns):\n", | |
" time.sleep(3 + 5*random.random())\n", | |
" parkrun_trim = re.sub(r'[\\s-]', '', parkrun)\n", | |
" url = f'https://www.parkrun.ru/{parkrun_trim}/results/clubhistory/?clubNum={club_id}'\n", | |
" headers['User-Agent'] = user_agent_rotator.get_random_user_agent()\n", | |
" club_results = requests.get(url, headers=headers)\n", | |
" try:\n", | |
" df = pd.read_html(club_results.text)[0]\n", | |
" dfs.append(df[df.columns[0]])\n", | |
" if parkrun == home_parkrun:\n", | |
" home_list = df[df.columns[0]]\n", | |
" except:\n", | |
" print('ОШИБКА - операция завершилась досрочно. Паркран временно заблокировал IP.')\n", | |
" break" | |
], | |
"metadata": { | |
"id": "pB86PSu40JiI" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"def last_name_first(full_name: str):\n", | |
" names = full_name.split()\n", | |
" last_name = names.pop(1).capitalize()\n", | |
" names.insert(0, last_name)\n", | |
" return ' '.join(names)" | |
], | |
"metadata": { | |
"id": "JGv4TY_5LOho" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"full_list = pd.concat(dfs).drop_duplicates(keep='last')\n", | |
"outer_home_list = pd.concat([full_list, home_list]).apply(last_name_first).sort_values().drop_duplicates(keep=False).reset_index(drop=True)" | |
], | |
"metadata": { | |
"id": "u-ij29bnAOaq" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Одноклубники, не бегавшие на домашнем паркране:" | |
], | |
"metadata": { | |
"id": "QmpqdvUI8Hsx" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"outer_home_list.shift(1, fill_value='________Фамилия_Имя___')" | |
], | |
"metadata": { | |
"id": "526myT8CFL74" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Полный список одноклубников:" | |
], | |
"metadata": { | |
"id": "5Ka2TdEp8a4_" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"full_list.apply(last_name_first).sort_values().reset_index(drop=True).shift(1, fill_value='________Фамилия_Имя___')" | |
], | |
"metadata": { | |
"id": "ImhNvi-aS6I_" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment