Skip to content

Instantly share code, notes, and snippets.

@kokes
Created November 11, 2017 12:36
Show Gist options
  • Save kokes/6b9e7174231d46099f1dfca4c99beab2 to your computer and use it in GitHub Desktop.
Save kokes/6b9e7174231d46099f1dfca4c99beab2 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import requests\n",
"import json\n",
"import gzip\n",
"from urllib.parse import urljoin\n",
"import zlib\n",
"import lxml.html"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def data(site):\n",
"    \"\"\"Query the Common Crawl CC-MAIN-2017-43 index for captures of ``site``.\n",
"\n",
"    The query appends ``*`` to ``site``. Returns a list holding one parsed\n",
"    JSON object per non-blank line of the response. Raises\n",
"    ``requests.HTTPError`` when the index server answers with an error status.\n",
"    \"\"\"\n",
"    url = f'http://index.commoncrawl.org/CC-MAIN-2017-43-index?url={site}*&output=json'\n",
"\n",
"    r = requests.get(url)\n",
"    # Raise explicitly; an assert would be stripped under ``python -O``.\n",
"    r.raise_for_status()\n",
"\n",
"    # The body holds one JSON document per line; blank lines are skipped so\n",
"    # an empty body gives [] instead of a JSON decode error.\n",
"    return [json.loads(ln) for ln in r.content.splitlines() if ln.strip()]"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Base URL that the 'filename' paths of index records are joined onto.\n",
"# The legacy commoncrawl.s3.amazonaws.com host no longer serves anonymous\n",
"# HTTP requests, so the public data.commoncrawl.org endpoint is used.\n",
"burl = 'https://data.commoncrawl.org/'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 179,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9686"
]
},
"execution_count": 179,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dt = data('parlamentnilisty.cz')\n",
"len(dt)"
]
},
{
"cell_type": "code",
"execution_count": 180,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'digest': 'ODAU2MIXYTW2BQJWG6CPD5V2UXHKJL32',\n",
" 'filename': 'crawl-data/CC-MAIN-2017-43/segments/1508187824543.20/warc/CC-MAIN-20171021024136-20171021044136-00571.warc.gz',\n",
" 'length': '34289',\n",
" 'mime': 'text/html',\n",
" 'mime-detected': 'text/html',\n",
" 'offset': '537953258',\n",
" 'status': '200',\n",
" 'timestamp': '20171021030631',\n",
" 'url': 'http://www.parlamentnilisty.cz/zpravy/Zpoved-o-Klausove-amnestii-Nagyove-i-prevratu-ktery-zacne-1-ledna-Hovori-byvaly-hradni-pravnik-298509',\n",
" 'urlkey': 'cz,parlamentnilisty)/zpravy/zpoved-o-klausove-amnestii-nagyove-i-prevratu-ktery-zacne-1-ledna-hovori-byvaly-hradni-pravnik-298509'}"
]
},
"execution_count": 180,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dt[-2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**We won't save much by batching requests per WARC file: the 9,686 index records are spread across 8,394 distinct files.**"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8394"
]
},
"execution_count": 181,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of distinct archive files the index records point into.\n",
"len({el['filename'] for el in dt})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 182,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_range(bfrom, blen):\n",
"    \"\"\"Build the HTTP ``Range`` header for ``blen`` bytes starting at ``bfrom``.\n",
"\n",
"    Both arguments may be ints or numeric strings. Raises ``ValueError``\n",
"    when ``blen`` is smaller than one.\n",
"    \"\"\"\n",
"    start, length = int(bfrom), int(blen)\n",
"    if length < 1:\n",
"        raise ValueError(f'cannot request {length} bytes')\n",
"    # Byte ranges are inclusive at both ends, so the last byte wanted is\n",
"    # start + length - 1; start + length would fetch one byte too many.\n",
"    return {'Range': f'bytes={start}-{start + length - 1}'}"
]
},
{
"cell_type": "code",
"execution_count": 183,
"metadata": {},
"outputs": [],
"source": [
"def grab_article(el):\n",
"    \"\"\"Download one Common Crawl capture and extract the article it holds.\n",
"\n",
"    ``el`` is an index record whose ``filename``, ``offset`` and ``length``\n",
"    entries locate the gzip-compressed record to fetch.\n",
"\n",
"    Returns a dict with ``url``, ``title``, ``lead`` and ``text`` keys, or\n",
"    ``None`` when no HTML doctype or no ``article.detail h1`` is found.\n",
"    ``lead`` and ``text`` are ``None`` when their element is missing.\n",
"    Raises ``requests.HTTPError`` when the download fails.\n",
"    \"\"\"\n",
"    url = urljoin(burl, el['filename'])\n",
"    r = requests.get(url, headers=get_range(el['offset'], el['length']))\n",
"    # Raise explicitly; an assert would be stripped under ``python -O``.\n",
"    r.raise_for_status()\n",
"\n",
"    # wbits=15+16 makes zlib expect a gzip header and trailer.\n",
"    dd = zlib.decompress(r.content, 15 + 16)\n",
"\n",
"    # Record headers precede the page; locate the doctype in any casing or\n",
"    # variant instead of requiring the exact bytes '<!DOCTYPE html>'.\n",
"    start = dd.lower().find(b'<!doctype html')\n",
"    if start == -1:\n",
"        return None\n",
"    ht = lxml.html.fromstring(dd[start:])\n",
"\n",
"    ttl = ht.cssselect('article.detail h1')\n",
"    if not ttl:\n",
"        return None\n",
"\n",
"    def first_text(selector):\n",
"        # Stripped text of the first match, or None when nothing matches.\n",
"        found = ht.cssselect(selector)\n",
"        return found[0].text_content().strip() if found else None\n",
"\n",
"    # Scan only the bytes before the page so that page content can never\n",
"    # be mistaken for the header; fall back to the index record's URL.\n",
"    tu = b'WARC-Target-URI:'\n",
"    art_url = next(\n",
"        (ln[len(tu):].strip().decode('utf-8')\n",
"         for ln in dd[:start].split(b'\\n') if ln.startswith(tu)),\n",
"        el.get('url'),\n",
"    )\n",
"\n",
"    return {\n",
"        'url': art_url,\n",
"        'title': ttl[0].text_content().strip(),\n",
"        'lead': first_text('article.detail p.brief'),\n",
"        'text': first_text('section.article-content'),\n",
"    }"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {},
"outputs": [],
"source": [
"# Try the first five index records and keep every article that parsed.\n",
"res = []\n",
"for el in dt[:5]:\n",
"    dd = grab_article(el)\n",
"    # grab_article signals a page it could not use by returning None.\n",
"    if dd is not None:\n",
"        res.append(dd)"
]
},
{
"cell_type": "code",
"execution_count": 185,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['„Ahoj prostitutky, co to tady vaříte za ejakuláty.“ U Jílkové se řešila šikana na školách a došlo na mrazivá svědectví',\n",
" 'Ale ale. Babišovi se prý zadrhla jeho klíčová věc',\n",
" '„Asi je to hlupák. Kde on žil? Nesmysly!“ Historik z ÚSTR sepsal seznam „ušpiněných“ herců z majora Zemana a je zle. Ozvali se Obermaierová, Přeučil a další']"
]
},
"execution_count": 185,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[j['title'] for j in res]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# with open('pl.html', 'wb') as f:\n",
"# f.write(content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment