Created
March 28, 2018 19:35
-
-
Save kokes/3b8fcf06410cb9d051cf75401ce88ce5 to your computer and use it in GitHub Desktop.
Stahování titulků z iVysílání (downloading subtitles from the Czech TV iVysílání archive)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Stáhni seznam pořadů z webu iVysílání | |
""" | |
import json | |
from urllib.parse import urljoin | |
import lxml.html | |
burl = 'http://www.ceskatelevize.cz/ivysilani/podle-abecedy' | |
ht = lxml.html.parse(burl).getroot() | |
abc = ht.cssselect('ul#programmeAlphabet a') | |
urls = [(j.text_content().strip(), j.attrib['href']) for j in abc] | |
dt = dict() | |
for psm, url in urls: | |
print('Stahuju: %s ' % psm, end='\r') | |
ht = lxml.html.parse(urljoin(burl, url)).getroot() | |
seznam = ht.cssselect('div#programmeAlphabetContent ul li a') | |
porady = [(j.text_content().strip(), urljoin(burl, j.attrib['href'])) for j in seznam] | |
dt[psm] = porady | |
print('Staženo %d názvů pořadů' % sum([len(k) for j,k in dt.items()])) | |
with open('data/porady.json', 'w') as f: | |
json.dump(dt, f, ensure_ascii=False, indent=2) | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Download the episode list for every programme collected by the previous step."""
import json
import os
import unicodedata
import re
from urllib.parse import urljoin
import lxml.html
import csv

with open('data/porady.json') as f:
    dt = json.load(f)

# The whitelist holds programme ids that are the only ones we download.
# It does not have to exist.
# fix: the original opened the file twice (leaking a handle) and read the
# csv from an already-closed handle; a missing file now means "no filter".
wl = set()
try:
    with open('whitelist.txt') as f:
        wl = {row[0] for row in csv.reader(f) if row}
except FileNotFoundError:
    pass

# walk the letters
# TODO: cannot handle the old layout - http://www.ceskatelevize.cz/porady/1181831094-vaclav-belohradsky-nikdo-neposloucha/20456226263
# so go through all programmes and look at the ones where we have no episode at all
for p, pr in dt.items():
    tdr = 'data/dily/%s' % p  # target directory for this letter
    # ... and the programmes within a letter
    for por in pr:
        # ASCII-fold the title so it is safe to use in a filename
        nazev = unicodedata.normalize('NFD', por[0]).encode('ascii', 'ignore').decode().lower()
        mt = '/ivysilani/'
        # numeric programme id sits between '/ivysilani/' and the first '-'
        # NOTE(review): index('-') scans the whole URL; assumes no '-' occurs
        # earlier in it - holds for current ceskatelevize.cz URLs, verify
        idd = por[1][por[1].index(mt) + len(mt):por[1].index('-')]
        if not idd.isdigit():
            print('Přeskakuji %s (%s)' % (por[0], por[1]))
            continue  # fix: the skip message was printed but the item was never skipped
        # using a whitelist, but this id is not in it? skip
        if len(wl) > 0 and (idd not in wl):
            continue
        fn = os.path.join(tdr, '%s-%s.json' % ('-'.join(re.findall(r'\w+', nazev)), idd))
        if os.path.isfile(fn):
            continue  # no updates - re-download only after deleting raw data, TODO
        ht = lxml.html.parse(por[1]).getroot()
        urlp = urljoin(por[1], ht.cssselect('div#programmeInfo a')[0].attrib['href'])
        ret = []
        for pg in range(1, 10000):  # hard page cap so a broken site cannot loop forever
            print('Stahuju %s, stránka %d%s' % (por[0], pg, 10 * ' '), end='\r')
            urlstr = urlp + '/dily/%d' % pg  # page URL (urljoin deliberately not used)
            ht = lxml.html.parse(urlstr).getroot()
            els = ht.cssselect('div.episodes-broadcast-content a')
            if len(els) == 0:
                break  # past the last page
            for el in els:
                datum = el.find('time').text.replace('\xa0', ' ')
                nazev = el.find('h3').text
                popis = el.find('p').text if el.find('p') is not None else ''
                furl = urljoin(por[1], el.attrib['href'])  # full episode URL
                # episode id is the last path segment, minus an optional '-title' suffix
                porid = furl[:-1] if furl.endswith('/') else furl
                porid = porid[porid.rindex('/') + 1:]
                if '-' in porid:
                    porid = porid[:porid.index('-')]
                assert porid.isdigit(), furl
                porid = int(porid)
                # (a "go backwards in time" check used to live here; per the
                # original author it became obsolete with the new selector,
                # which breaks out of the loop above instead)
                ret.append({
                    'datum': datum,
                    'nazev': nazev,
                    'popis': popis.replace('\r', ''),
                    'url': furl,
                    'id': porid,
                })
        if len(ret) == 0:
            continue  # no episodes
        os.makedirs(tdr, exist_ok=True)
        with open(fn, 'w') as f:
            json.dump(ret, f, ensure_ascii=False, indent=2)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Download .vtt subtitle files for every episode collected earlier."""
from glob import glob
import json
import gzip
import os
import requests
from collections import defaultdict

fns = glob('data/dily/*/*.json')
sub_url = 'http://imgct.ceskatelevize.cz/cache/data/ivysilani/subtitles/%s/%s/sub.vtt'
fl = defaultdict(int)  # number of 404 failures per episode-list file
nfl = 10  # maximum number of failures before we give up on a programme
tdr = 'data/titulky/raw/'
# fix: make sure both output directories exist before the first write
os.makedirs(os.path.join(tdr, 'mame'), exist_ok=True)
os.makedirs(os.path.join(tdr, 'titulky'), exist_ok=True)

for fn in fns:
    print('Stahuju', fn)
    nm = os.path.split(fn)[-1]
    # ids we already downloaded
    mfn = os.path.join(tdr, 'mame', nm)
    mame = set()
    if os.path.isfile(mfn):
        with open(mfn) as f:
            mame = set(json.load(f))
    # previously downloaded subtitle data, keyed by episode id
    dfn = os.path.join(tdr, 'titulky', nm + '.gz')
    tt = dict()
    if os.path.isfile(dfn):
        with gzip.open(dfn, 'rt') as f:
            tt = json.load(f)
    # load the episode list
    with open(fn) as f:
        dt = json.load(f)
    # NOTE(review): hard cap of 2000 episodes per programme - confirm intent
    for j, el in enumerate(dt[:2000]):
        print('Stahuju: %d/%d' % (j, len(dt)), end='\r')
        porid = str(el['id'])
        if porid in mame:
            continue
        turl = sub_url % (porid[:3], porid)
        # fix: a timeout so one stuck request cannot hang the whole run
        r = requests.get(turl, timeout=30)
        st = r.status_code
        if st == 200:
            tt[porid] = r.text
            mame.add(porid)
        elif st == 404:
            # stop only once the miss count crosses the threshold
            fl[fn] += 1
            if fl[fn] > nfl:
                print('Selhalo na', el['datum'])
                break
        else:
            print('unexpected error with', turl)
    with gzip.open(dfn, 'wt') as f:
        json.dump(tt, f, ensure_ascii=False)
    with open(mfn, 'w') as f:
        json.dump(sorted(list(mame)), f, ensure_ascii=False)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import json | |
import gzip | |
from glob import glob | |
import re | |
from elasticsearch import Elasticsearch, helpers | |
def parsuj_titulky(dt):
    """Parse a WEBVTT subtitle payload into a list of cues.

    Returns a list of dicts with keys 'od' (start, HH:MM:SS), 'do' (end,
    HH:MM:SS) and 'text' (whitespace-normalised cue text). Milliseconds are
    dropped deliberately - the ES mapping uses the hour_minute_second format.
    """
    ws = re.compile(r'\s+')
    lines = dt.split('\n')  # payloads are small, no need to be lazy here
    assert lines[0].strip() == 'WEBVTT'
    lines = lines[2:]  # drop the header and the blank line after it
    # TODO: merge sentences spanning multiple cues
    res = []
    bf = []  # text lines of the cue currently being collected
    od = do = None  # timing of that cue; None until the first header is seen
    for rw in lines:
        if '-->' in rw:
            # a timing header starts a new cue - flush the previous one, if any
            # (the od guard also fixes a NameError when stray text precedes
            # the first timing header)
            if bf and od is not None:
                res.append({'od': od, 'do': do, 'text': ws.sub(' ', ' '.join(bf).strip())})
            bf = []
            # fixed-position slices of 'HH:MM:SS.mmm --> HH:MM:SS.mmm'
            od, do = rw[:8], rw[17:25]
        else:
            bf.append(rw)
    # fix: flush the trailing cue too - the original silently dropped the
    # last cue whenever the payload did not end with a newline
    if bf and od is not None:
        res.append({'od': od, 'do': do, 'text': ws.sub(' ', ' '.join(bf).strip())})
    return res
es = Elasticsearch() | |
ind = 'ctgrep' | |
if es.indices.exists(ind): | |
es.indices.delete(ind) | |
mp = { | |
"mappings": { | |
"titulky": { | |
"properties": { | |
"titulky": { | |
"type": "nested", | |
"properties": { | |
"od": { "type": "date", "format": "hour_minute_second" }, | |
"do": { "type": "date", "format": "hour_minute_second" }, | |
"text": { "type": "string" } | |
} | |
} | |
} | |
} | |
} | |
} | |
# create index | |
es.indices.create(index=ind, ignore=400, body=mp) | |
dfns = glob('data/dily/*/*.json') | |
actions = [] # es queue | |
# loopujem porady | |
for fn in dfns: | |
with open(fn) as f: | |
dily = json.load(f) | |
# nactem titulky | |
tfn = os.path.join('data/titulky/raw/titulky/', os.path.split(fn)[-1]+'.gz') | |
with gzip.open(tfn, 'rt') as f: | |
dt = json.load(f) | |
tk = set(dt.keys()) # co ve skutecnosti mame | |
# a loopnem díly | |
for dl in dily: | |
porid = str(dl['id']) | |
if porid not in tk: continue # titulky nemame, jdem dal | |
edt = dl | |
edt['titulky'] = parsuj_titulky(dt[porid]) | |
res = es.index(index=ind, doc_type='titulky', id=edt['id'], body=edt) | |
# actions.append({ | |
# '_index': ind, | |
# '_type': 'titulky', | |
# '_id': edt['id'], | |
# '_source': edt | |
# }) | |
# if len(actions) > 0: | |
# helpers.bulk(es, actions) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment