Created
June 20, 2025 01:14
-
-
Save urielhdz/7ef19065633941e8f00b6bb2fb9202c1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Standard library
import os

# Third-party
import click
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI

# Pull OPENAI_API_KEY (and any other settings) from a local .env file so
# secrets stay out of the source.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# One shared OpenAI client for the whole script.
client = OpenAI(api_key=OPENAI_API_KEY)
@click.command()
@click.option('--url', default='https://news.ycombinator.com', help='URL to get a summary from')
@click.option('--about', default='', help='Description of the page to summarize, e.g., "This is a news website about technology.". It helps the AI to understand the context better.')
@click.option('--output-file', default='output.txt', help='Path to the output summary file')
def process_file(url, about, output_file):
    """Scrape a web page, summarize it with the OpenAI API, and save the summary.

    Pipeline: scrap_url(url) -> summarize_info(...) -> write to output_file.
    Progress is reported on stdout at each step.
    """
    print("Starting the process...")
    info = scrap_url(url)
    print(f"Scraped info from {url} successfully.")
    summary = summarize_info(info, url, about)
    # Plain string: no interpolation needed (original was an f-string
    # with no placeholders).
    print("Summary generated successfully.")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(summary)
    print(f"Summary written to {output_file} successfully.")
def summarize_info(info, url, about):
    """Ask the OpenAI Responses API for a summary of scraped page text.

    Args:
        info: Plain-text content scraped from the page.
        url: The page URL, interpolated into the prompt for context.
        about: Optional description of the page; appended to the prompt
            only when non-empty.

    Returns:
        The model's summary as a string.
    """
    # NOTE: the prompt is deliberately in Spanish — it is runtime data sent
    # to the model, not a comment, so it must not be translated.
    # Renamed from `input` to avoid shadowing the builtin.
    prompt = """
Por favor, resume las siguiente información de la página web:
{url}
Si la información es muy extensa, por favor, resume las líneas más importantes.
Si la información es corta, por favor, resume todo.
Aquí está el cuerpo de la información:
{info}
""".format(info=info, url=url)
    if about:
        prompt += f"\n\nContexto: {about}"
    response = client.responses.create(
        model="gpt-4.1",
        input=prompt,
    )
    return response.output_text
def scrap_url(url):
    """Fetch *url* and return its visible text, one non-blank line per line.

    Script and style tags are stripped before text extraction. On any
    failure (network error, HTTP error status, parse error) an error
    message string of the form "Error scraping URL: ..." is returned
    instead of raising — callers always receive a string.
    """
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        # Drop non-content markup so it doesn't pollute the extracted text.
        for tag in soup(["script", "style"]):
            tag.decompose()
        raw = soup.get_text(separator=' ', strip=True)
        # Keep only non-blank lines, trimmed of surrounding whitespace.
        return '\n'.join(ln.strip() for ln in raw.splitlines() if ln.strip())
    except Exception as exc:
        return f"Error scraping URL: {exc}"
if __name__ == '__main__':
    # Click parses command-line options and invokes the command.
    process_file()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment