Last active
January 31, 2023 16:17
-
-
Save luighifeodrippe/f1db27695b0c8b112e7857ab8ad459cc to your computer and use it in GitHub Desktop.
Neon
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import os | |
import re | |
from telebot import TeleBot | |
from PIL import Image | |
from io import BytesIO | |
import datetime | |
from xmlrpc import client as xmlrpc_client | |
import unidecode | |
from tqdm import tqdm | |
from colorama import Fore | |
# Cross-platform colored terminal text.
# Palette of colorama foreground colors (used for tqdm bar coloring below).
color_bars = [Fore.BLACK,
              Fore.RED,
              Fore.GREEN,
              Fore.YELLOW,
              Fore.BLUE,
              Fore.MAGENTA,
              Fore.CYAN,
              Fore.WHITE]
# WordPress XML-RPC credentials and target blog URL.
# NOTE(review): credentials and the Telegram bot token are hardcoded in
# source — move them to environment variables / a config file and rotate
# the exposed token, since this file has been shared publicly.
USERNAME = "luighi"
PASSWORD = "XXXXX"
URL = "https://fiquesabendo.org/"
# Telegram bot token and the chat id that receives status messages.
TOKEN = "5885311028:AAHUwE6aVS73fZm03DksQI52WZ2ctTS5Xkw"
CHAT_ID = -880447273
bot = TeleBot(TOKEN)
def remove_special_characters(string: str) -> str:
    r"""Strip punctuation and symbols from *string*.

    Keeps word characters (``\w``, which includes accented letters and the
    underscore) and whitespace (``\s``), removing everything else.

    The original pattern ``[^\w\s]|\?|\!`` listed ``?`` and ``!`` as extra
    alternatives, but both are already matched by ``[^\w\s]``; the redundant
    branches (and the invalid ``\!`` escape) are dropped.
    """
    return re.sub(r"[^\w\s]", "", string)
def page_url() -> str:
    """Ask the operator for a page URL on stdin and return it unchanged."""
    return input("Insira a URL da página: ")
@bot.message_handler(commands=['start', 'help'])
def send_welcome(message):
    """Reply to /start and /help with a short usage message (Portuguese).

    Tells the user that the bot downloads a page's title and content into a
    text file and that they should simply send the page URL.
    """
    bot.reply_to(message, "Olá! Eu sou um bot que pode baixar o título e o conteúdo de uma página da web e salvar em um arquivo de texto. Para usar, envie a URL da página para mim.")
@bot.message_handler(func=lambda message: True)
def download_page(message):
    """Scrape the URL sent by the user and publish it as a WordPress draft.

    Pipeline: fetch the page; save its <h1> title, <h3> topics and <p>
    paragraphs to ``page_content.txt``; download the article's JPEG/WEBP
    images into an ``imagens`` folder; create a draft post via the
    WordPress metaWeblog XML-RPC API; upload the images to it; then fill
    the draft's body with one <h4>+<img> section per topic/image pair.

    Side effects: changes the process working directory, writes files,
    performs network requests and sends Telegram status messages.
    """
    # Get URL from the message sent by the user and fetch the page.
    url = message.text
    response = requests.get(url, timeout=30)
    # Report a failed request up front. (Bug fix: the original printed
    # "Error: <status>" inside a try/else, i.e. on the SUCCESS path of the
    # directory-creation helper, never on an actual HTTP failure.)
    if response.status_code != 200:
        print("Error:", response.status_code)
    # Parse the HTML.
    soup = BeautifulSoup(response.text, 'html.parser')
    # The article headline lives in the first <h1>.
    title_element = soup.find('h1')
    if title_element is None:
        # Bug fix: the original crashed with AttributeError on pages
        # without an <h1>.
        print("Error: no <h1> title found on the page.")
        return
    title = title_element.text
    # Title with punctuation stripped, used for folder and file names.
    title_mod = remove_special_characters(title)

    def create_dir():
        """Create (or reuse) a folder named after the title and cd into it."""
        folder_path = os.path.join(os.path.dirname(__file__), title_mod)
        try:
            # Bug fix: os.mkdir raised an uncaught FileExistsError when the
            # folder already existed (e.g. on a re-run).
            os.makedirs(folder_path, exist_ok=True)
            os.chdir(folder_path)
        except FileNotFoundError:
            print("The file was not found.")
        except PermissionError:
            print("You do not have permission to access the file.")

    create_dir()

    # Collect every <h3> heading as a topic.
    topics = [h3.text for h3 in soup.find_all('h3')]
    # (The original also built an unused `imagens` list here and looped over
    # <div class="main"> discarding each find('img') result — dead code,
    # removed.)

    # Save title, topics and cleaned paragraph text to one file. The
    # original wrote in "w" mode, closed, then re-opened in "a"; a single
    # `with` block produces the identical file and cannot leak the handle.
    paragraph_texts = [p.text for p in soup.find_all('p')]
    page_content_mod = remove_special_characters('\n'.join(paragraph_texts))
    with open("page_content.txt", "w", encoding="utf-8") as text_file:
        text_file.write(title + "\n")
        for topic in topics:
            text_file.write(topic + "\n")
        text_file.write(page_content_mod)

    # Tell the user the text extraction succeeded.
    bot.send_message(
        CHAT_ID, "Acabei de extraír com sucesso o conteúdo da página e o salvei em um arquivo de texto.")

    # The article body (and its images) sit inside <div class="main">.
    article_content = soup.find_all("div", class_="main")
    if article_content:
        # Download the images into an "imagens" subfolder.
        os.makedirs("imagens", exist_ok=True)
        os.chdir("imagens")
        image_urls = []
        for img_tag in article_content[0].find_all("img"):
            image_url = img_tag['src']
            # Scheme-less (protocol-relative) URLs: prepend a scheme.
            if not image_url.startswith('http'):
                image_url = 'http:' + image_url
            # Only keep formats the uploader handles (JPEG or WEBP).
            if not image_url.endswith(('.jpg', '.jpeg', '.webp')):
                continue
            image_urls.append(image_url)

        # Files are named "<n>___<ascii title>.webp", lowercased; the
        # numeric prefix preserves the on-page order for later sorting.
        image_name_prefix = "___" + unidecode.unidecode(title_mod)
        image_name_extension = ".webp"
        image_counter = 1
        for image_url in tqdm(image_urls, desc="Downloading images", bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET)):
            # Strip accents from the URL before requesting it.
            remove_accentuation = unidecode.unidecode(image_url)
            response = requests.get(remove_accentuation, timeout=30)
            if response.status_code == 200:
                # Re-encode through PIL so every file is saved as WEBP
                # (PIL picks the format from the .webp extension).
                image = Image.open(BytesIO(response.content))
                image_name = f"{image_counter}{image_name_prefix}{image_name_extension}".lower()
                image.save(image_name)
                image_counter += 1
            else:
                print("Erro ao baixar imagem: ", response.status_code)
        bot.send_message(CHAT_ID, "As imagens foram baixadas com sucesso.")

    # Collect the image files just saved (cwd is now "imagens" when the
    # article div existed).
    images = [f for f in os.listdir()
              if f.endswith(('.jpg', '.jpeg', '.webp'))]

    # Create a draft post via the WordPress metaWeblog XML-RPC API.
    client = xmlrpc_client.ServerProxy("https://fiquesabendo.org/xmlrpc.php")
    post = client.metaWeblog.newPost(1, USERNAME, PASSWORD, {
        'title': title,
        'description': topics,
        'post_type': 'post',
        'post_status': 'draft',
        'post_category': ['entretenimento'],
    })
    # metaWeblog.newPost may return a dict or the bare post id string.
    post_id = post['post_id'] if isinstance(post, dict) else post

    # Upload every .webp in the current directory to the post.
    webp_files = [f for f in os.listdir() if f.endswith('.webp')]
    for image in tqdm(webp_files, desc="Uploading images", bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET)):
        with open(image, 'rb') as img:
            img_data = img.read()
        data = {
            'name': image,
            'type': 'image/webp',
            'bits': xmlrpc_client.Binary(img_data),
            'overwrite': True,
        }
        try:
            response = client.metaWeblog.newMediaObject(post_id, USERNAME, PASSWORD, data)
            print("Uploaded: %s" % str(response))
        except xmlrpc_client.Fault as e:
            print("Error uploading image: %s" % e)

    def extract_number(string):
        """Return the leading '<digits>___' number of *string*, else 0."""
        match = re.match(r'(\d+)___', string)
        return int(match.group(1)) if match else 0

    # Sort images back into on-page order by their numeric prefix.
    # NOTE(review): topics never carry a '<digits>___' prefix, so sorting
    # them with the same key is a stable no-op; kept for behavioral parity.
    sorted_images = sorted(images, key=extract_number)
    sorted_topics = sorted(topics, key=extract_number)

    # Build one <h4>+<img> section per topic/image pair, pointing at the
    # URL WordPress will give the uploaded media this month.
    today = datetime.datetime.now()
    month = today.month
    year = today.year
    output = []
    for topic, image in tqdm(zip(sorted_topics, sorted_images), desc="Making post", bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET)):
        # NOTE(review): the doubled .replace(' ', '-') is kept from the
        # original — the second may originally have targeted a non-breaking
        # space lost in transcription; confirm before simplifying.
        output.append(
            f"<h4> {topic} </h4>\n <img src='https://fiquesabendo.org/wp-content/uploads/{year}/{month}/{image.replace(' ', '-').replace(' ', '-')}' alt = {topic} width = '100 % ' height = 'auto'/>\n\n")
    formatted_output = "".join(output)

    # Replace the draft's body with the formatted HTML.
    client.metaWeblog.editPost(post_id, USERNAME, PASSWORD, {
        'title': title,
        'description': formatted_output,
        'post_type': 'post',
        'post_status': 'draft',
        'post_category': ['entretenimento'],
    })
    bot.send_message(CHAT_ID, f"A Postagem '{title}' foi criada com sucesso.")
    print("Postagem realizada com sucesso!")
# Start the bot
# NOTE(review): polling starts at import time and blocks forever; consider
# guarding with `if __name__ == "__main__":` so importing this module for
# reuse/testing doesn't hang — confirm nothing relies on the current behavior.
bot.polling()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment