# -*- coding: utf-8 -*-
|
from gensim.summarization import summarize |
|
from topia.termextract import extract |
|
from topia.termextract import tag |
|
import pysrt |
|
import argparse |
|
import re |
|
import sys |
|
|
|
def tldr(text,rat,keys,ratk):
    """Print a summary of ``text`` and, optionally, its leading key terms.

    Every ``<...>`` span in ``text`` is replaced by a space, the cleaned
    text is summarised with ``summarize(..., word_count=rat)``, and the
    UTF-8 encoded summary is printed.

    Args:
        text: The text to summarise.
        rat: Value forwarded to ``summarize`` as ``word_count``.
        keys: ``None`` prints the summary only; any other value also
            prints a separator line followed by extracted terms.
        ratk: Maximum number of extracted terms to print. Fractional
            values are truncated and negative values print no terms.

    Having fewer extracted terms than ``ratk`` is not an error: the
    available terms are printed and the function returns normally.
    """
    text = re.sub(r"<[^>]*>", ' ', text)
    summarization = summarize(text, word_count=rat)
    print(summarization.encode('utf-8'))
    if keys is None:
        return

    print("======================================")
    tagger = tag.Tagger()
    tagger.initialize()
    # NOTE(review): the result of tokenize() is unused here; the call is
    # kept as-is in case the tagger relies on it -- confirm and drop if not.
    tagger.tokenize(text)
    extractor = extract.TermExtractor(tagger)
    words = extractor(text)

    # A bounded slice stops cleanly both when the limit is reached and
    # when fewer terms exist, instead of looping until an IndexError.
    limit = max(int(ratk), 0)
    for entry in words[:limit]:
        # The first element of each extracted entry is the term itself.
        print(entry[0].encode('utf-8'))
|
|
|
def handleFiles(file,ratio,keywords,krat):
    """Load a subtitle file and pass its combined text to ``tldr``.

    Args:
        file: Path of the subtitle file, opened with ``pysrt.open`` using
            UTF-8 encoding.
        ratio: Forwarded to ``tldr`` as its ``rat`` argument.
        keywords: Forwarded to ``tldr`` as its ``keys`` argument.
        krat: Forwarded to ``tldr`` as its ``ratk`` argument.
    """
    subtitles = pysrt.open(file, encoding='utf-8')
    # Join with a space so the last word of one subtitle is not fused to
    # the first word of the next one.
    tosend = u" ".join(sub.text for sub in subtitles)
    tldr(tosend, ratio, keywords, krat)
|
|
|
|
|
def main(argv=None):
    """Parse the command-line options and summarise the given subtitle file.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            lets argparse read the process command line.

    Invalid or missing options are reported by argparse as a usage error
    (``SystemExit``) before any file is opened.
    """
    parser = argparse.ArgumentParser(description='Python Program that returns a summarization from a subtitle file')
    # The input file is mandatory: without it there is nothing to open.
    parser.add_argument('-i', '--input-file', action="store", required=True, help="takes the input file", metavar="FILE")
    # type=float makes argparse reject non-numeric values with a usage
    # message instead of a ValueError traceback after parsing.
    parser.add_argument('-r', '--ratio', action="store", type=float, help="define the ratio of words", default=120.0)
    parser.add_argument('-k', '--keywords', action="store", help="if keywords should be on")
    parser.add_argument('-kr', '--keywords-ratio', action="store", type=float, help="how many keywords to display", default=5.0)
    args = parser.parse_args(argv)
    handleFiles(args.input_file, args.ratio, args.keywords, args.keywords_ratio)
|
|
|
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()