Created
February 9, 2020 10:20
-
-
Save peterk/6439de6d7ef5ea56fca44de237da32c2 to your computer and use it in GitHub Desktop.
A short script to test text summarization with the KB BERT model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from summarizer import Summarizer # see https://github.com/dmmiller612/bert-extractive-summarizer | |
import transformers | |
import os | |
import sys | |
def _read_document(path):
    """Return the entire contents of the text file at *path*, decoded as UTF-8."""
    # Explicit encoding: the platform default (e.g. a legacy code page on
    # Windows) can garble or reject non-ASCII characters such as å, ä and ö.
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def main(argv):
    """Summarize the text file named on the command line and print the result.

    Args:
        argv: Command-line argument list; ``argv[0]`` is the program name and
            ``argv[1]`` is the path of the text file to summarize.

    Returns:
        Process exit status: 0 after printing the summary, 2 when no filename
        argument was supplied.
    """
    # Fail with a usage message instead of an IndexError traceback.
    if len(argv) < 2:
        prog = argv[0] if argv else "summarize.py"
        print("Usage: %s <text-file>" % prog, file=sys.stderr)
        return 2

    # load text file to summarize
    filename = argv[1]
    print("Summarizing %s" % filename)
    body = _read_document(filename)

    # The same model identifier is used for the model weights, the tokenizer
    # and the Summarizer itself so that all three agree.
    bert_model = "KB/bert-base-swedish-cased"
    custom_model = transformers.BertModel.from_pretrained(bert_model, output_hidden_states=True)
    custom_tokenizer = transformers.BertTokenizer.from_pretrained(bert_model)
    model = Summarizer(model=bert_model, custom_model=custom_model, custom_tokenizer=custom_tokenizer)

    # experiment with these for variations in output length
    # NOTE(review): in bert-extractive-summarizer these appear to bound the
    # length of candidate sentences rather than of the summary — confirm
    # against the library documentation.
    result = model(body, max_length=80, min_length=50)

    # NOTE(review): presumably `result` is already a single string, in which
    # case this join is a no-op; kept to preserve the original behavior.
    full = ''.join(result)
    print(full)
    return 0


# Guard the entry point so importing this module does not trigger a model
# download and a summarization run.
if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment