Created
September 30, 2016 21:58
-
-
Save ischurov/8b5a231255bd0fff08b860fabcdf69f0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import re\n", | |
"from time import sleep\n", | |
"\n", | |
"from bs4 import BeautifulSoup\n", | |
"from selenium import webdriver\n", | |
"from selenium.common.exceptions import NoSuchElementException\n", | |
"from selenium.webdriver.common.by import By" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Start a Chrome session driven by Selenium.\n", | |
"browser = webdriver.Chrome()\n", | |
"# Set the implicit wait for element lookups to 5 seconds.\n", | |
"browser.implicitly_wait(5)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Подготовка\n", | |
"Здесь собирается список районов и список улиц и строится словарь, позволяющий искать район по названию улицы." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Load the address lookup page; the cells below navigate it by clicking links.\n", | |
"browser.get(\"http://www.cikrf.ru/services/lk_address/?do=address\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Locate the city link by a fragment of its text.\n", | |
"# find_element(By..., value) is used because the find_element_by_* helpers\n", | |
"# are not available in current Selenium releases.\n", | |
"lnk = browser.find_element(By.PARTIAL_LINK_TEXT, \"Севастополь\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Click the city link located in the previous cell.\n", | |
"lnk.click()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Parse the current page. The parser is named explicitly so the result does not\n", | |
"# depend on which parsers happen to be installed (and no UserWarning is emitted).\n", | |
"bs = BeautifulSoup(browser.page_source, \"lxml\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Find the first text node in the parsed page that contains the city name.\n", | |
"sevastopol = bs.find(text=re.compile(\".*Севастополь.*\"))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Collect the text of every link under the grandparent element of the city text node.\n", | |
"# The first link is skipped - presumably it is the city link itself (TODO confirm).\n", | |
"rayons = [a.text for a in sevastopol.parent.parent.find_all(\"a\")[1:]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['Балаклавский район',\n", | |
" 'Гагаринский район',\n", | |
" 'Ленинский район',\n", | |
" 'Нахимовский район']" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"rayons" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Click every district link, then pause once before the page is parsed again.\n", | |
"# find_element(By..., value) is used because the find_element_by_* helpers\n", | |
"# are not available in current Selenium releases.\n", | |
"for rayon in rayons:\n", | |
"    browser.find_element(By.PARTIAL_LINK_TEXT, rayon).click()\n", | |
"sleep(2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Parse the page again now that the district links have been clicked.\n", | |
"bs = BeautifulSoup(browser.page_source, 'lxml')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# District name -> list of link texts found next to it on the parsed page.\n", | |
"rayon_to_street = {}\n", | |
"for rayon in rayons:\n", | |
"    rayon_node = bs.find(text=rayon)\n", | |
"    links = rayon_node.parent.parent.find_all(\"a\")\n", | |
"    # The first link is skipped - presumably the district link itself (TODO confirm).\n", | |
"    rayon_to_street[rayon] = [link.text for link in links[1:]]\n", | |
"\n", | |
"# Inverted mapping: street name -> district name.\n", | |
"# A street name listed under several districts keeps the district iterated last.\n", | |
"street_to_rayon = {\n", | |
"    street: rayon\n", | |
"    for rayon, streets in rayon_to_street.items()\n", | |
"    for street in streets\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Главная функция" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def traverse(browser, path, delay=0.5):\n", | |
"    \"\"\"\n", | |
"    Click through the chain of nested links described by ``path``.\n", | |
"\n", | |
"    Example:\n", | |
"        path = ['Город Севастополь', 'Балаклавский район']\n", | |
"    First the «Город Севастополь» link is clicked, which expands it; then\n", | |
"    «Балаклавский район» is looked up inside that city and clicked as well.\n", | |
"    The path may be arbitrarily detailed, down to a house (in that case the\n", | |
"    last click displays information about the precinct election commission\n", | |
"    (UIK) that serves this house).\n", | |
"\n", | |
"    Parameters:\n", | |
"        browser: Selenium driver (or any object with a compatible\n", | |
"            ``find_element(by, value)`` method) showing the address page.\n", | |
"        path: sequence of exact link texts, outermost level first.\n", | |
"        delay: seconds to sleep after each click; defaults to 0.5.\n", | |
"\n", | |
"    Returns None. An empty path performs no clicks. Lookup errors raised by\n", | |
"    Selenium are not caught here and propagate to the caller.\n", | |
"    \"\"\"\n", | |
"    for depth in range(1, len(path) + 1):\n", | |
"        # Every step walks the whole prefix again starting from the page root\n", | |
"        # (presumably because the page changes after each click - TODO confirm).\n", | |
"        scope = browser\n", | |
"        for link_text in path[:depth]:\n", | |
"            lnk = scope.find_element(By.LINK_TEXT, link_text)\n", | |
"            # The grandparent of a link is the scope searched for the next level.\n", | |
"            scope = lnk.find_element(By.XPATH, \"..\").find_element(By.XPATH, \"..\")\n", | |
"        # Only the last link of the current prefix is clicked.\n", | |
"        lnk.click()\n", | |
"        sleep(delay)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Тут начинается самое содержательное" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"addresses = [\"Аграрная, 6\", \"Балашова, 2\", \"Полевая, 2\", \"Гончарная, 4\"]\n", | |
"# Addresses must be in the format \"Street, house\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Look up the precinct election commission (UIK) for every address.\n", | |
"# Result: uik_data[address] = (UIK number, polling station address); a field that\n", | |
"# cannot be found in the page source is stored as an empty string.\n", | |
"uik_data = {}\n", | |
"for address in addresses:\n", | |
"    # \"Street, house\": the house must start with a digit but may carry a suffix\n", | |
"    # (e.g. \"1Б\"); whitespace around both parts is ignored.\n", | |
"    m = re.fullmatch(r\"\\s*(.+?)\\s*,\\s*(\\d\\S*)\\s*\", address)\n", | |
"    if m is None:\n", | |
"        print(\"Incorrect address string\", address)\n", | |
"        continue\n", | |
"    street, house = m.groups()\n", | |
"    rayon = street_to_rayon.get(street)\n", | |
"    if rayon is None:\n", | |
"        print(\"Cannot find rayon for street\", street)\n", | |
"        continue\n", | |
"    # Reload the page so that navigation starts from its initial state.\n", | |
"    browser.get(\"http://www.cikrf.ru/services/lk_address/?do=address\")\n", | |
"    try:\n", | |
"        traverse(browser, [\"Город Севастополь\", rayon, street, house])\n", | |
"    except NoSuchElementException:\n", | |
"        # Same policy as the checks above: report the address and keep going.\n", | |
"        print(\"Cannot find address on the page\", address)\n", | |
"        continue\n", | |
"    # Take one snapshot of the page so both searches see the same content.\n", | |
"    page = browser.page_source\n", | |
"    m = re.search(r\"Участковая избирательная комиссия №(\\d+)\", page)\n", | |
"    if m is None:\n", | |
"        print(\"Cannot find UIK number for address\", address)\n", | |
"        uik = \"\"\n", | |
"    else:\n", | |
"        uik = m.group(1)\n", | |
"    m = re.search(r\"Адрес помещения для голосования: ([^<]+)\", page)\n", | |
"    if m is None:\n", | |
"        print(\"Cannot find UIK address for address\", address)\n", | |
"        addr = \"\"\n", | |
"    else:\n", | |
"        addr = m.group(1)\n", | |
"    uik_data[address] = (uik, addr)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'Аграрная, 6': ('22',\n", | |
" 'Город Севастополь, Балаклавский район, Разъездная, 1Б'),\n", | |
" 'Балашова, 2': ('10',\n", | |
" 'Город Севастополь, Балаклавский район, Благодатная, 16'),\n", | |
" 'Гончарная, 4': ('8', 'Город Севастополь, Балаклавский район, Коммунаров, 2'),\n", | |
" 'Полевая, 2': ('14', 'Город Севастополь, Балаклавский район, Тимирязева, 23')}" | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"uik_data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.0" | |
}, | |
"toc": { | |
"toc_cell": false, | |
"toc_number_sections": true, | |
"toc_threshold": 6, | |
"toc_window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment