Use the sumy summarizers to extract a one-sentence summary from HTML pages that can be used as a meta description.
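The script reads the URLs from the first column of the input CSV, and pd.read_csv treats the first line as a header row. As a minimal sketch, in.csv could look like this (the "url" header name is illustrative; the script reads the column by position, not by name):

url
https://example.com/page-1
https://example.com/page-2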
import sys
import pandas as pd
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Lsa
from sumy.summarizers.luhn import LuhnSummarizer as Luhn
from sumy.summarizers.text_rank import TextRankSummarizer as TxtRank
from sumy.summarizers.lex_rank import LexRankSummarizer as LexRank
from sumy.summarizers.sum_basic import SumBasicSummarizer as SumBasic
from sumy.summarizers.kl import KLSummarizer as KL
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
SENTENCES_COUNT = 1

# The script is called with up to 3 arguments:
# 1. the CSV with the list of URLs to analyze,
# 2. the new CSV where the generated meta descriptions will be stored, and
# 3. the language to be used ("english" is the default when missing),
# e.g. "generate-md.py in.csv out.csv english"
urlinput = sys.argv[1]
print("csv to analyze:", urlinput)
outputcsv = sys.argv[2]
print("output csv name:", outputcsv)
# Check whether the language argument has been set; default to English
def get_lan():
    try:
        return sys.argv[3]
    except IndexError:
        return 'english'

LANGUAGE = get_lan()
print("language set to:", LANGUAGE)
# Open the CSV file with the list of URLs to analyze
df = pd.read_csv(urlinput)
print("Number of rows in csv:", len(df))

# Create a list to store the generated meta descriptions
data_x = []
# Build the stemmer and the summarizers once; they are reused for every URL
stemmer = Stemmer(LANGUAGE)
lsaSummarizer = Lsa(stemmer)
lsaSummarizer.stop_words = get_stop_words(LANGUAGE)
luhnSummarizer = Luhn(stemmer)
luhnSummarizer.stop_words = get_stop_words(LANGUAGE)
lexrankSummarizer = LexRank(stemmer)
lexrankSummarizer.stop_words = get_stop_words(LANGUAGE)
textrankSummarizer = TxtRank(stemmer)
textrankSummarizer.stop_words = get_stop_words(LANGUAGE)
sumbasicSummarizer = SumBasic(stemmer)
sumbasicSummarizer.stop_words = get_stop_words(LANGUAGE)
klSummarizer = KL(stemmer)
klSummarizer.stop_words = get_stop_words(LANGUAGE)

# For each URL in the input CSV run the analysis and store the results in the list
for i in range(len(df)):
    # The URL to be analyzed sits in the first column
    line = df.iloc[i, 0]
    print(line)
    # Skip this URL if fetching or parsing fails, so one bad page does not stop the run
    try:
        parser = HtmlParser.from_url(line, Tokenizer(LANGUAGE))
    except Exception:
        print('error while fetching', line)
        continue
    # LSA
    print("Summarizing URL via LSA: " + line)
    for lsa_sentence in lsaSummarizer(parser.document, SENTENCES_COUNT):
        print(lsa_sentence)
    # Luhn
    print("Summarizing URL via Luhn: " + line)
    for luhn_sentence in luhnSummarizer(parser.document, SENTENCES_COUNT):
        print(luhn_sentence)
    # LexRank
    print("Summarizing URL via LexRank: " + line)
    for lex_sentence in lexrankSummarizer(parser.document, SENTENCES_COUNT):
        print(lex_sentence)
    # TextRank
    print("Summarizing URL via TextRank: " + line)
    for text_sentence in textrankSummarizer(parser.document, SENTENCES_COUNT):
        print(text_sentence)
    # SumBasic
    print("Summarizing URL via SumBasic: " + line)
    for sum_sentence in sumbasicSummarizer(parser.document, SENTENCES_COUNT):
        print(sum_sentence)
    # KL-Sum
    print("Summarizing URL via KL-Sum: " + line)
    for kl_sentence in klSummarizer(parser.document, SENTENCES_COUNT):
        print(kl_sentence)
    # Store all values for this URL in the list (each loop above ran once,
    # so every loop variable holds the single extracted sentence)
    data_x.append({"url": line, "LSA": lsa_sentence, "Luhn": luhn_sentence,
                   "LexRank": lex_sentence, "TextRank": text_sentence,
                   "SumBasic": sum_sentence, "KL-Sum": kl_sentence})
# Save results to the output CSV (df_out avoids shadowing the input DataFrame)
df_out = pd.DataFrame(data_x, columns=["url", "LSA", "Luhn", "LexRank", "TextRank", "SumBasic", "KL-Sum"])
df_out.to_csv(outputcsv, encoding='utf-8', index=False)
print("Results saved to", outputcsv)