Created
February 4, 2017 15:45
-
-
Save ischurov/a2ccfabe6b99b83e1ea58d9a828a937d to your computer and use it in GitHub Desktop.
parse data from mkrf.ru
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "import requests", | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "from bs4 import BeautifulSoup", | |
"execution_count": 15, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "# Query the mkrf.ru registry search; the query string is sent as cp1251-encoded bytes.\nr = requests.get(\n    \"http://mkrf.ru/registr/\",\n    params={'q': 'Утомленные солнцем'.encode('cp1251')},\n    timeout=30,  # bound the wait so a stalled connection cannot hang the kernel\n)\n# Stop here on an HTTP error instead of parsing an error page in later cells.\nr.raise_for_status()", | |
"execution_count": 51, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "# Name the parser explicitly so the parse tree (and the table-index lookups that\n# depend on it) is the same on every machine. lxml is the parser bs4 picked by\n# default when this notebook was originally run.\nbs = BeautifulSoup(r.text, \"lxml\")", | |
"execution_count": 54, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "# Collect the href of every anchor inside the second <table> of the search page.\n[anchor['href'] for anchor in bs.find_all(\"table\")[1].find_all(\"a\")]", | |
"execution_count": 59, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": "['/registr/detail.php?ID=156113132&q=%D3%F2%EE%EC%EB%E5%ED%ED%FB%E5+%F1%EE%EB%ED%F6%E5%EC',\n '/registr/detail.php?ID=123780497&q=%D3%F2%EE%EC%EB%E5%ED%ED%FB%E5+%F1%EE%EB%ED%F6%E5%EC',\n '/registr/detail.php?ID=148779834&q=%D3%F2%EE%EC%EB%E5%ED%ED%FB%E5+%F1%EE%EB%ED%F6%E5%EC',\n '/registr/detail.php?ID=146484465&q=%D3%F2%EE%EC%EB%E5%ED%ED%FB%E5+%F1%EE%EB%ED%F6%E5%EC']" | |
}, | |
"metadata": {}, | |
"execution_count": 59 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "# Site root; the root-relative hrefs scraped from the search results are appended to it.\nbaseurl = 'http://mkrf.ru'", | |
"execution_count": 60, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "# Download and parse the detail page behind each search-result link.\n# NOTE(review): the [:1] slice restricts this to the first link only - presumably a\n# debugging leftover; confirm before widening it to all results.\ndatas = []\nfor link in bs.find_all(\"table\")[1].find_all(\"a\")[:1]:\n    url = baseurl + link['href']\n    # Separate name so the search response held in r is not overwritten; re-running\n    # the cell that builds bs from r.text would otherwise parse a detail page.\n    detail_response = requests.get(url, timeout=30)\n    # Fail on an HTTP error instead of storing a parsed error page.\n    detail_response.raise_for_status()\n    # Explicit parser: same tree on every machine, and no bs4 UserWarning.\n    datas.append(BeautifulSoup(detail_response.text, \"lxml\"))", | |
"execution_count": 61, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "# Show the second <table> of the first detail page; limit=2 stops the search\n# once that table has been found.\ndatas[0].find_all(\"table\", limit=2)[1]", | |
"execution_count": 71, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": "<table align=\"center\" border=\"0\" cellpadding=\"3\" cellspacing=\"0\" class=\"table\" style=\"border-collapse: collapse;\" width=\"100%\">\n<tr>\n<td><b>Номер прокатного удостоверения</b></td>\n<td>1103394</td>\n</tr>\n<tr>\n<td><b>Дата прокатного удостоверения</b></td>\n<td>04.04.1994</td>\n</tr>\n<tr>\n<td><b>Категория прав проката</b></td>\n<td>Все права</td>\n</tr>\n<tr>\n<td><b>Окончание прав проката</b></td>\n<td></td>\n</tr>\n<tr>\n<td><b>Дополнительное описание прав проката</b></td>\n<td>на срок действия авторского права на фильм</td>\n</tr>\n<tr>\n<td><b>Фирма-заявитель</b></td>\n<td>Студия \"ТриТэ\", Роскомкино, фирма \"Русский Клуб\". Права проката у Студия \"ТриТэ\" на срок действия авторского права на фильм</td>\n</tr>\n<tr>\n<td><b>Телефон фирмы-заявителя</b></td>\n<td></td>\n</tr>\n<tr>\n<td><b>Номер договора</b></td>\n<td></td>\n</tr>\n<tr>\n<td><b>Дата заключения договора</b></td>\n<td></td>\n</tr>\n<tr>\n<td><b>Дата заполнения</b></td>\n<td>04.04.1994</td>\n</tr>\n</table>" | |
}, | |
"metadata": {}, | |
"execution_count": 71 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"language_info": { | |
"version": "3.5.2", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"name": "python", | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"gist": { | |
"id": "", | |
"data": { | |
"description": "parse data from mkrf.ru", | |
"public": true | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment