Skip to content

Instantly share code, notes, and snippets.

@jiridanek
Created April 5, 2011 17:36
Show Gist options
  • Save jiridanek/904075 to your computer and use it in GitHub Desktop.
Save jiridanek/904075 to your computer and use it in GitHub Desktop.
Druhý nástřel
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
"""
" Vysledek je na https://spreadsheets.google.com/ccc?key=0AveNRb1dJwZ5dHIwTUVzZlQ5TGdoZDh1d0RFeVpqT3c&hl=en#gid=0
"
" Osekává první velké písmeno - člověk musí sám poznat, že se jedná o vlastní jméno atd
"
" Způsob výpisu se volí natvrdo v předpředposlední funkci
"
" Už mě nic nenapadá
"""
class Titulek(object):
    """One subtitle entry: a position marker plus its text lines.

    Attributes:
        lineno: value supplied by the caller to locate the entry
            (``parse_srt`` passes its running line counter); ``None`` when
            not given.
        text: list of the entry's text lines.
    """

    def __init__(self, lineno=None, text=None):
        self.lineno = lineno
        # Build a fresh list per instance: a ``[]`` default in the signature
        # would be shared by every Titulek created without ``text``.
        self.text = [] if text is None else text
class Slovo(object):
    """One word together with the places where it occurred.

    Attributes:
        lineno: list of position markers of the subtitles containing the word.
        line: list of the subtitle text lines containing the word.
        ang: the word itself, as extracted from the subtitles.
        cze: presumably the Czech translation -- nothing in this file
            fills it in, so it normally stays empty.
    """

    def __init__(self, lineno=None, line=None, ang="", cze=""):
        # Fresh lists per instance: these lists are extended in place when
        # duplicates are merged, so a shared ``[]`` default would leak
        # occurrences between unrelated words.
        self.lineno = [] if lineno is None else lineno
        self.line = [] if line is None else line
        self.ang = ang
        self.cze = cze
def parse_srt(file):
    """Split an SRT subtitle stream into a list of ``Titulek`` entries.

    Entries are separated by blank lines.  The first two lines of every
    entry (the sequence number and the timing line) are dropped; the
    remaining lines are stored, unmodified, as the entry's text.

    Args:
        file: an open text file (or any iterable of lines).

    Returns:
        A list of ``Titulek`` objects.  ``lineno`` is the 1-based number of
        the blank line that terminates the entry; for a final entry that is
        not followed by a blank line it is one past the last line.
    """
    output = []
    blok = []
    cnt = 0
    for cnt, line in enumerate(file, 1):
        if line.strip():
            blok.append(line)
        else:
            # Blank separator (also matches "\r\n" from CRLF files):
            # close the entry, dropping the number and timing lines.
            output.append(Titulek(lineno=cnt, text=blok[2:]))
            blok = []
    if blok:
        # The file ended without a trailing blank line -- keep the last entry.
        output.append(Titulek(lineno=cnt + 1, text=blok[2:]))
    return output
def trim_balast(slovo):
    """Strip subtitle markup and surrounding punctuation from one token.

    Removes ``<i>`` / ``</i>`` tags, strips quotes, punctuation, dashes and
    whitespace (including ``\\r`` left over from CRLF line endings) from both
    ends, and lower-cases the first remaining character only.

    Args:
        slovo: the raw token.

    Returns:
        The cleaned token; an empty string when nothing but ballast was given.
    """
    output = slovo.replace('<i>', '').replace('</i>', '')
    output = output.strip('"?.,!:- \t\r\n')
    # Only the first letter is lowered so that the rest of the word keeps
    # its original capitalisation; slicing keeps this safe for "".
    return output[:1].lower() + output[1:]
def parse_titulek(titulek):
    """Turn one subtitle entry into a list of ``Slovo`` objects, one per word.

    Every text line is split on whitespace and each token is cleaned with
    ``trim_balast``.  Tokens that are empty after cleaning (a lone dash, a
    bare ``<i>`` tag, ...) are skipped.

    Args:
        titulek: object with ``text`` (list of lines) and ``lineno``.

    Returns:
        A list of ``Slovo`` objects; each records the entry's ``lineno`` and
        the stripped source line in one-element lists.
    """
    output = []
    for tline in titulek.text:
        source_line = tline.strip()
        # split() without arguments also handles tabs and repeated spaces.
        for token in tline.split():
            slovo = trim_balast(token)
            if not slovo:
                continue
            output.append(
                Slovo(lineno=[titulek.lineno], line=[source_line], ang=slovo)
            )
    return output
def merge_duplicities(slova):
    """Collapse words that differ only by letter case into single entries.

    The words are sorted by their lower-cased ``ang`` attribute.  For each
    run of equal keys the first word (in the stable sort order) is kept and
    the ``lineno`` and ``line`` lists of the others are appended to it.

    Note: the kept objects are modified in place.

    Args:
        slova: iterable of objects with ``ang``, ``lineno`` and ``line``.

    Returns:
        A list of the kept words, sorted case-insensitively; an empty list
        for empty input.
    """
    def klic(slovo):
        return slovo.ang.lower()

    output = []
    for slovo in sorted(slova, key=klic):
        if output and klic(output[-1]) == klic(slovo):
            # Same word as the previous one: fold its occurrences in.
            output[-1].lineno.extend(slovo.lineno)
            output[-1].line.extend(slovo.line)
        else:
            output.append(slovo)
    return output
def output_csv_l(slovo, mode=1):
    """Print one word to standard output.

    Args:
        slovo: object with ``lineno`` (list), ``line`` (list of str) and
            ``ang`` (str).
        mode: output format.  ``1`` (default) prints three tab-separated
            columns: the line numbers joined by spaces, the source lines
            joined by ``" | "``, and the word.  ``2`` prints the word only.

    Raises:
        ValueError: if ``mode`` is neither 1 nor 2.
    """
    if mode == 1:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('%(lineno)s\t%(line)s\t%(ang)s' % {
            'lineno': " ".join(map(str, slovo.lineno)),
            'line': " | ".join(slovo.line),
            'ang': slovo.ang,
        })
    elif mode == 2:
        print('%(ang)s' % {'ang': slovo.ang})
    else:
        raise ValueError("unknown output mode: %r" % (mode,))
def flatten(ple):
    """Return a flat list with the items of arbitrarily nested lists.

    Only lists (including list subclasses) are descended into; tuples,
    strings and all other values are kept as single items.

    Args:
        ple: iterable whose items may themselves be lists.

    Returns:
        A new list with every non-list item, in depth-first order.
    """
    output = []
    for item in ple:
        if isinstance(item, list):
            output.extend(flatten(item))
        else:
            output.append(item)
    return output
def main():
    """Read the SRT file named on the command line and print its word list.

    Expects exactly one command-line argument, the path of the subtitle
    file; otherwise prints a short reminder and returns.  Each distinct
    word is printed once via ``output_csv_l``.
    """
    if len(sys.argv) != 2:
        print("Argument!")
        return
    # ``with`` closes the file even if parsing raises.
    with open(sys.argv[1]) as srt_file:
        titulky = parse_srt(srt_file)
    slova = flatten([parse_titulek(titulek) for titulek in titulky])
    if not slova:
        # Nothing to report for an input without any words.
        return
    for slovo in merge_duplicities(slova):
        output_csv_l(slovo)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment