Skip to content

Instantly share code, notes, and snippets.

@ischurov
Created February 4, 2017 15:45
Show Gist options
  • Save ischurov/a2ccfabe6b99b83e1ea58d9a828a937d to your computer and use it in GitHub Desktop.
Save ischurov/a2ccfabe6b99b83e1ea58d9a828a937d to your computer and use it in GitHub Desktop.
parse data from mkrf.ru
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "import requests",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "from bs4 import BeautifulSoup",
"execution_count": 15,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "# Query the registry search page. The search term is sent as cp1251 bytes, which matches\n# the percent-encoding that appears in the site's own result links.\nr = requests.get(\n    \"http://mkrf.ru/registr/\",\n    params={'q': 'Утомленные солнцем'.encode('cp1251')},\n    timeout=30,  # seconds; requests waits indefinitely when no timeout is given\n)\n# Fail here on an HTTP error instead of parsing an error page further down.\nr.raise_for_status()",
"execution_count": 51,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Parse the search results page. The parser is named explicitly so the parse tree (and the\n# table indexes used in later cells) does not depend on which parsers are installed.\nbs = BeautifulSoup(r.text, \"lxml\")",
"execution_count": 54,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Collect the href of every link inside the second table of the parsed search page.\n[anchor['href'] for anchor in bs.find_all(\"table\")[1].find_all(\"a\")]",
"execution_count": 59,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "['/registr/detail.php?ID=156113132&q=%D3%F2%EE%EC%EB%E5%ED%ED%FB%E5+%F1%EE%EB%ED%F6%E5%EC',\n '/registr/detail.php?ID=123780497&q=%D3%F2%EE%EC%EB%E5%ED%ED%FB%E5+%F1%EE%EB%ED%F6%E5%EC',\n '/registr/detail.php?ID=148779834&q=%D3%F2%EE%EC%EB%E5%ED%ED%FB%E5+%F1%EE%EB%ED%F6%E5%EC',\n '/registr/detail.php?ID=146484465&q=%D3%F2%EE%EC%EB%E5%ED%ED%FB%E5+%F1%EE%EB%ED%F6%E5%EC']"
},
"metadata": {},
"execution_count": 59
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "# Site root; the hrefs scraped from the results table are site-relative paths.\nbaseurl = \"http://mkrf.ru\"",
"execution_count": 60,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Fetch and parse the detail page behind each search hit.\n# NOTE(review): the [:1] slice restricts the run to the first hit only; it looks like a\n# debugging limit, so it is kept as-is -- drop it to fetch every result.\ndatas = []\nfor link in bs.find_all(\"table\")[1].find_all(\"a\")[:1]:\n    url = baseurl + link['href']\n    # Use a separate name so the search response `r` is not overwritten; re-running the\n    # cell that builds `bs` from `r` would otherwise parse a detail page instead.\n    detail_response = requests.get(url, timeout=30)\n    detail_response.raise_for_status()\n    # Explicit parser: same tree on every machine, and no UserWarning from bs4.\n    datas.append(BeautifulSoup(detail_response.text, \"lxml\"))",
"execution_count": 61,
"outputs": []
},
{
"metadata": {
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Show the second table of the first fetched detail page.\ndatas[0].find_all(\"table\")[1]",
"execution_count": 71,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "<table align=\"center\" border=\"0\" cellpadding=\"3\" cellspacing=\"0\" class=\"table\" style=\"border-collapse: collapse;\" width=\"100%\">\n<tr>\n<td><b>Номер прокатного удостоверения</b></td>\n<td>1103394</td>\n</tr>\n<tr>\n<td><b>Дата прокатного удостоверения</b></td>\n<td>04.04.1994</td>\n</tr>\n<tr>\n<td><b>Категория прав проката</b></td>\n<td>Все права</td>\n</tr>\n<tr>\n<td><b>Окончание прав проката</b></td>\n<td></td>\n</tr>\n<tr>\n<td><b>Дополнительное описание прав проката</b></td>\n<td>на срок действия авторского права на фильм</td>\n</tr>\n<tr>\n<td><b>Фирма-заявитель</b></td>\n<td>Студия \"ТриТэ\", Роскомкино, фирма \"Русский Клуб\". Права проката у Студия \"ТриТэ\" на срок действия авторского права на фильм</td>\n</tr>\n<tr>\n<td><b>Телефон фирмы-заявителя</b></td>\n<td></td>\n</tr>\n<tr>\n<td><b>Номер договора</b></td>\n<td></td>\n</tr>\n<tr>\n<td><b>Дата заключения договора</b></td>\n<td></td>\n</tr>\n<tr>\n<td><b>Дата заполнения</b></td>\n<td>04.04.1994</td>\n</tr>\n</table>"
},
"metadata": {},
"execution_count": 71
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"version": "3.5.2",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"name": "python",
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"gist": {
"id": "",
"data": {
"description": "parse data from mkrf.ru",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment