Created
November 11, 2017 12:36
-
-
Save kokes/6b9e7174231d46099f1dfca4c99beab2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 98, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"import json\n", | |
"import gzip\n", | |
"from urllib.parse import urljoin\n", | |
"import zlib\n", | |
"import lxml.html" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def data(site):\n", | |
"    \"\"\"Query the Common Crawl CC-MAIN-2017-43 index for captures under `site`.\n", | |
"\n", | |
"    Sends a prefix query (`{site}*`) with `output=json` and parses the\n", | |
"    response body as one JSON document per line.\n", | |
"\n", | |
"    Args:\n", | |
"        site: URL prefix to look up; interpolated into the query string as-is.\n", | |
"\n", | |
"    Returns:\n", | |
"        A list with one parsed JSON record per non-blank response line; an\n", | |
"        empty list when the response body is empty.\n", | |
"\n", | |
"    Raises:\n", | |
"        requests.HTTPError: if the index answers with a 4xx/5xx status.\n", | |
"    \"\"\"\n", | |
"    url = f'http://index.commoncrawl.org/CC-MAIN-2017-43-index?url={site}*&output=json'\n", | |
"\n", | |
"    r = requests.get(url)\n", | |
"    r.raise_for_status()\n", | |
"\n", | |
"    # Skip blank lines so an empty body yields [] instead of a JSONDecodeError.\n", | |
"    return [json.loads(ln) for ln in r.content.splitlines() if ln.strip()]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Base URL against which the WARC file paths from the index records are resolved.\n", | |
"burl = 'https://commoncrawl.s3.amazonaws.com/'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 179, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"9686" | |
] | |
}, | |
"execution_count": 179, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Fetch every index record whose URL starts with this site, then count them.\n", | |
"dt = data('parlamentnilisty.cz')\n", | |
"len(dt)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 180, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'digest': 'ODAU2MIXYTW2BQJWG6CPD5V2UXHKJL32',\n", | |
" 'filename': 'crawl-data/CC-MAIN-2017-43/segments/1508187824543.20/warc/CC-MAIN-20171021024136-20171021044136-00571.warc.gz',\n", | |
" 'length': '34289',\n", | |
" 'mime': 'text/html',\n", | |
" 'mime-detected': 'text/html',\n", | |
" 'offset': '537953258',\n", | |
" 'status': '200',\n", | |
" 'timestamp': '20171021030631',\n", | |
" 'url': 'http://www.parlamentnilisty.cz/zpravy/Zpoved-o-Klausove-amnestii-Nagyove-i-prevratu-ktery-zacne-1-ledna-Hovori-byvaly-hradni-pravnik-298509',\n", | |
" 'urlkey': 'cz,parlamentnilisty)/zpravy/zpoved-o-klausove-amnestii-nagyove-i-prevratu-ktery-zacne-1-ledna-hovori-byvaly-hradni-pravnik-298509'}" | |
] | |
}, | |
"execution_count": 180, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Inspect a single index record to see which fields are available.\n", | |
"dt[-2]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**We won't save much by batching requests per file: the records are spread across almost as many distinct WARC files as there are records.**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 181, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"8394" | |
] | |
}, | |
"execution_count": 181, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Number of distinct WARC files referenced by the index records.\n", | |
"len({el['filename'] for el in dt})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 182, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_range(bfrom, blen):\n", | |
"    \"\"\"Build the HTTP `Range` header covering `blen` bytes starting at `bfrom`.\n", | |
"\n", | |
"    Args:\n", | |
"        bfrom: offset of the first byte; anything `int()` accepts.\n", | |
"        blen: number of bytes wanted; anything `int()` accepts.\n", | |
"\n", | |
"    Returns:\n", | |
"        A dict holding a single 'Range' header, ready to pass as `headers=`.\n", | |
"    \"\"\"\n", | |
"    first = int(bfrom)\n", | |
"    # Both ends of an HTTP byte range are inclusive, so the last byte\n", | |
"    # position is first + length - 1.\n", | |
"    last = first + int(blen) - 1\n", | |
"    return {'Range': f'bytes={first}-{last}'}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 183, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def grab_article(el):\n", | |
"    \"\"\"Download one WARC record from Common Crawl and extract the article in it.\n", | |
"\n", | |
"    Args:\n", | |
"        el: index record with 'filename', 'offset' and 'length' keys.\n", | |
"\n", | |
"    Returns:\n", | |
"        A dict with 'url', 'title', 'lead' and 'text' keys, or None when the\n", | |
"        record contains no HTML doctype or no `article.detail h1` element.\n", | |
"        'lead' and 'text' are '' when their elements are missing.\n", | |
"\n", | |
"    Raises:\n", | |
"        requests.HTTPError: if the download answers with a 4xx/5xx status.\n", | |
"        IndexError: if the record has no WARC-Target-URI header line.\n", | |
"    \"\"\"\n", | |
"    url = urljoin(burl, el['filename'])\n", | |
"    r = requests.get(url, headers=get_range(el['offset'], el['length']))\n", | |
"    r.raise_for_status()\n", | |
"\n", | |
"    # wbits=15+16 tells zlib to expect a gzip header and trailer.\n", | |
"    record = zlib.decompress(r.content, 15+16)\n", | |
"\n", | |
"    # The HTML payload follows the record's header lines. Match the doctype\n", | |
"    # case-insensitively; bytes.lower() keeps the length, so offsets carry over.\n", | |
"    html_start = record.lower().find(b'<!doctype html')\n", | |
"    if html_start == -1:\n", | |
"        return None\n", | |
"    ht = lxml.html.fromstring(record[html_start:])\n", | |
"\n", | |
"    tu = b'WARC-Target-URI'\n", | |
"    # len(tu)+1 also skips the colon that follows the header name.\n", | |
"    art_url = [j for j in record.split(b'\\n') if j.startswith(tu)][0][len(tu)+1:].strip()\n", | |
"\n", | |
"    ttl = ht.cssselect('article.detail h1')\n", | |
"    if len(ttl) == 0:\n", | |
"        return None\n", | |
"\n", | |
"    def first_text(selector):\n", | |
"        # Stripped text of the first match, or '' when nothing matches.\n", | |
"        found = ht.cssselect(selector)\n", | |
"        return found[0].text_content().strip() if found else ''\n", | |
"\n", | |
"    return {\n", | |
"        # UTF-8 is a superset of ASCII, so non-ASCII target URIs no longer fail.\n", | |
"        'url': art_url.decode('utf-8'),\n", | |
"        'title': ttl[0].text_content().strip(),\n", | |
"        'lead': first_text('article.detail p.brief'),\n", | |
"        'text': first_text('section.article-content')\n", | |
"    }" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 184, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Try the first five index records and keep those that parsed as articles.\n", | |
"res = []\n", | |
"for el in dt[:5]:\n", | |
"    dd = grab_article(el)\n", | |
"    if dd is not None:\n", | |
"        res.append(dd)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 185, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['„Ahoj prostitutky, co to tady vaříte za ejakuláty.“ U Jílkové se řešila šikana na školách a došlo na mrazivá svědectví',\n", | |
" 'Ale ale. Babišovi se prý zadrhla jeho klíčová věc',\n", | |
" '„Asi je to hlupák. Kde on žil? Nesmysly!“ Historik z ÚSTR sepsal seznam „ušpiněných“ herců z majora Zemana a je zle. Ozvali se Obermaierová, Přeučil a další']" | |
] | |
}, | |
"execution_count": 185, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Titles of the articles that were extracted successfully.\n", | |
"[j['title'] for j in res]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 119, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# with open('pl.html', 'wb') as f:\n", | |
"# f.write(content)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment