# Last active January 31, 2023 16:17
import requests
from bs4 import BeautifulSoup
import os
import re
from telebot import TeleBot
from PIL import Image
from io import BytesIO
import datetime
from xmlrpc import client as xmlrpc_client
import unidecode
from tqdm import tqdm
from colorama import Fore
# Cross-platform colored terminal text.
# NOTE(review): this list literal is never closed in the visible source, so the
# file does not parse as-is; the remaining entries appear to be missing.
# color_bars is not referenced by any other visible line.
color_bars = [Fore.BLACK,
# Account name passed to the XML-RPC metaWeblog calls in download_page.
# NOTE(review): those calls also pass PASSWORD, which is not defined anywhere
# in the visible source.
USERNAME = "luighi"
# Empty in this copy of the file and not referenced by any visible code.
URL = ""
# SECURITY NOTE(review): a Telegram bot token is hard-coded in source. If this
# value is real it should be revoked and loaded from the environment or a
# secrets store instead of being committed.
TOKEN = "5885311028:AAHUwE6aVS73fZm03DksQI52WZ2ctTS5Xkw"
# Chat that receives the status messages sent with bot.send_message.
CHAT_ID = -880447273
# Shared TeleBot instance; the handlers below register themselves on it via
# @bot.message_handler.
bot = TeleBot(TOKEN)
def remove_special_characters(string: str) -> str:
    """Return *string* with every special character removed.

    A "special character" is anything that is neither a word character
    (letters, digits, underscore -- Unicode-aware) nor whitespace.

    Args:
        string: The text to clean.

    Returns:
        The cleaned text; whitespace and underscores are preserved.
    """
    # "?" and "!" are already covered by the negated class, so the explicit
    # alternatives the pattern used to carry were redundant.
    return re.sub(r"[^\w\s]", "", string)
def page_url() -> str:
    """Prompt the user on the console for a page URL.

    Returns:
        The text the user typed, with surrounding whitespace removed so a
        pasted URL with a stray space or newline is still usable.
    """
    return input("Insira a URL da página: ").strip()
@bot.message_handler(commands=['start', 'help'])
def send_welcome(message):
    """Reply to /start and /help with a short usage message.

    Args:
        message: The incoming Telegram message object handed over by the bot
            framework; the reply is sent in response to it.
    """
    # The literal is split only for line length; the adjacent pieces are
    # concatenated by the parser, so the text sent is unchanged.
    bot.reply_to(
        message,
        "Olá! Eu sou um bot que pode baixar o título e o conteúdo de uma "
        "página da web e salvar em um arquivo de texto. "
        "Para usar, envie a URL da página para mim.",
    )
@bot.message_handler(func=lambda message: True)
def download_page(message):
"""Download page title and content and save to text file"""
# NOTE(review): in this copy of the file the body has lost its indentation and
# a number of statements are cut off or missing (flagged individually below).
# The comments therefore describe only what each visible line does; the
# original control flow must be restored from an intact copy.
# Get URL from message sent by user
url = message.text
# Make a GET request to the site
# NOTE(review): no timeout is passed, so this call can block indefinitely.
response = requests.get(url)
# Extract HTML content from response
html = response.text
# Create a BeautifulSoup object from the HTML
soup = BeautifulSoup(html, 'html.parser')
# Find the first <h1> element in the HTML (its text is used as the title)
title_element = soup.find('h1')
# Get the text content of that <h1> element
# NOTE(review): if the page has no <h1>, find() presumably returns None and
# this attribute access fails; the "type: ignore" only silences the checker.
title = title_element.text # type: ignore
# Remove special characters from title
title_mod = remove_special_characters(title)
# Create a directory with the page title
def create_dir():
# Build the target folder path next to this script, named after the title
current_dir = os.path.dirname(__file__)
folder_path = os.path.join(current_dir, title_mod)
# Try to create the directory
# Change to directory
# If the directory already exists, print an error message
# NOTE(review): the "try:" and the statements the three comments above
# describe are not present in the visible source; only the handlers remain.
except FileNotFoundError:
print("The file was not found.")
except PermissionError:
print("You do not have permission to access the file.")
# If request is not successful, print response status code
# NOTE(review): no surrounding if/else is visible, so as written this print
# is unconditional.
print("Error:", response.status_code) #type: ignore
# Create an empty list to store topics
topics = []
# Loop through all <h3> elements and extract the text from each one
topics_element = soup.find_all('h3')
for topic_element in topics_element:
topic = topic_element.text
# NOTE(review): no visible statement adds `topic` to `topics`.
# Create the directory for the files
imagens = []
# Collect every <div class="main"> element
imagens_element = soup.find_all("div", class_="main")
# NOTE(review): the body of this loop is not present in the visible source.
for imagem_element in imagens_element:
# Open the file in write mode
text_file = open("page_content.txt", "w", encoding="utf-8")
# Write the title to the file
text_file.write(title + "\n")
# Loop through the topics and write each one to the file
for topic in topics:
text_file.write(topic + "\n")
# Close the file
# NOTE(review): the close call this comment refers to is not visible.
# Get all <p> elements from the HTML
paragraphs = soup.find_all('p')
# Create a list to store the paragraph texts
paragraph_texts = []
# Loop through the paragraphs and extract the text from each one
for paragraph in paragraphs:
paragraph_text = paragraph.text
# Append the text to the list
# NOTE(review): the append this comment refers to is not visible.
# Join the texts in the list into a single string
page_content = '\n'.join(paragraph_texts)
# Remove special characters from page content
page_content_mod = remove_special_characters(page_content)
# Open the file in append mode
text_file = open("page_content.txt", "a", encoding="utf-8")
# Write the page content to the file
# Close the file
# NOTE(review): neither the write nor the close is visible.
# Send message to user indicating success
# NOTE(review): the line below is the tail of a call whose opening part is
# missing (presumably a bot.send_message call; confirm against the original).
CHAT_ID, "Acabei de extraír com sucesso o conteúdo da página e o salvei em um arquivo de texto.")
# Search class from article content
article_content = soup.find_all("div", class_="main")
# If class exist, get img data from the article content
if article_content:
# NOTE(review): the body of this existence check is not visible.
if not os.path.exists("imagens"):
# Change into the images folder
# Create a list to store the image URLs
image_urls = []
# Find all <img> elements inside the first matching <div class="main">
images = article_content[0].find_all("img")
# Loop through the images and extract the URL from each one
for image in images:
# Get the 'src' attribute of the image
image_url = image['src']
# Check whether the image URL is missing its scheme
if not image_url.startswith('http'):
# Prepend the missing scheme to the image URL
# NOTE(review): this yields a usable URL only for protocol-relative sources
# ('//host/path'); a site-relative path such as '/img/a.jpg' would become
# 'http:/img/a.jpg'.
image_url = 'http:' + image_url
# Check whether the file is a valid image (JPEG or WEBP)
# NOTE(review): the body of this check is not visible.
if not (image_url.endswith('.jpg') or image_url.endswith('.jpeg') or image_url.endswith('.webp')):
# Add the URL to the list
# NOTE(review): the append this comment refers to is not visible.
# Set the image file name prefix
image_name_prefix = "___"+unidecode.unidecode(title_mod)
# Set the image file name extension
image_name_extension = ".webp"
# Set the initial value of the image counter
image_counter = 1
# Loop through the image URLs, showing a tqdm progress bar
for image_url in tqdm(image_urls, desc="Downloading images", bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET)):
# Download the image (the URL is transliterated to ASCII first)
remove_accentuation = unidecode.unidecode(image_url)
response = requests.get(remove_accentuation)
# Check if the request was successful
if response.status_code == 200:
# Read the image content
image_content = response.content
# Wrap the raw image bytes in a BytesIO object
image_data = BytesIO(image_content)
# Open the image using PIL
# NOTE(review): the right-hand side of this assignment is missing.
image =
# Generate the image file name: "<counter>___<title>.webp", lower-cased
image_name = f"{image_counter}{image_name_prefix}{image_name_extension}".lower()
# Save the image to a file
# NOTE(review): the save call this comment refers to is not visible.
# Increment the image counter
image_counter += 1
# If the request was not successful
# Print an error message
# NOTE(review): no "else:" is visible ahead of this print.
print("Erro ao baixar imagem: ", response.status_code)
# Send message to user indicating success
bot.send_message(CHAT_ID, "As imagens foram baixadas com sucesso.")
# Create an empty list to store the image names
images = []
# List all files in the current directory
for file in os.listdir():
# Check if file is an image
# NOTE(review): the body of this check is not visible, so nothing visible
# ever adds a file name to `images`.
if file.endswith(".jpg") or file.endswith(".jpeg") or file.endswith(".webp"):
# Create an XML-RPC client
# NOTE(review): the endpoint URL is an empty string in this copy.
client = xmlrpc_client.ServerProxy("")
# Create a new post
# NOTE(review): PASSWORD is not defined in the visible source, and the dict
# literal / call below are never closed. 'description' receives the `topics`
# list here; the later editPost call passes a string instead.
post = client.metaWeblog.newPost(1, USERNAME, PASSWORD, {
'title': title,
'description': topics,
'post_type': 'post',
'post_status': 'draft',
'post_category': ['entretenimento'],
# Check the return value of the metaWeblog.newPost method
if isinstance(post, dict):
# The return value is a dictionary, which means that the post was created successfully
post_id = post['post_id']
# The return value is a string, which means that it is the ID of the created post
# NOTE(review): no "else:" is visible, so as written this assignment always
# overrides the one above.
post_id = post
# List all .webp files in the current directory
webp_files = [f for f in os.listdir() if f.endswith('.webp')]
# Iterate over the webp files
for image in tqdm(webp_files, desc="Uploading images", bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET)):
# Open the image file
with open(image, 'rb') as img:
# NOTE(review): the right-hand side of this assignment is missing.
img_data =
# Payload for metaWeblog.newMediaObject: file name, MIME type, binary content
# NOTE(review): the closing brace of this dict is not visible.
data = {
'name': image,
'type': 'image/webp',
'bits': xmlrpc_client.Binary(img_data),
'overwrite': True
# Upload the image to the post
# NOTE(review): the "try:" matching the handler below is not visible.
response = client.metaWeblog.newMediaObject(post_id, USERNAME, PASSWORD, data)
print("Uploaded: %s" % str(response))
except xmlrpc_client.Fault as e:
print("Error uploading image: %s" % e)
def extract_number(string):
# Use a regular expression to match the number at the beginning of the string
# (one or more digits immediately followed by "___")
match = re.match(r'(\d+)___', string)
# If a match was found, return the number as an integer
if match:
# NOTE(review): the argument of int() is missing.
return int(
# Otherwise, return 0
return 0
# Order both lists by the numeric prefix extracted above
# NOTE(review): the same key is applied to `topics`, which holds <h3> text;
# entries without a "<digits>___" prefix all get key 0 and keep their order.
sorted_images = sorted(images, key=extract_number)
sorted_topics = sorted(topics, key=extract_number)
output = []
# NOTE(review): the right-hand side of this assignment is missing; the two
# lines after it read .month and .year from it.
today =
month = today.month
year = today.year
# Pair each topic with an image and build one HTML snippet per pair
for topic, image in tqdm(zip(sorted_topics, sorted_images),desc="Making post",bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET)):
# NOTE(review): the line below is the tail of a call whose opening part is
# missing (presumably output.append; confirm). The alt attribute value is
# emitted without quotes, and the two chained replace calls are identical.
f"<h4> {topic} </h4>\n <img src='{year}/{month}/{image.replace(' ', '-').replace(' ', '-')}' alt = {topic} width = '100 % ' height = 'auto'/>\n\n")
# Join the elements in the output list and save the result to a string
formatted_output = "".join(output)
# Update the post with the formatted string
# NOTE(review): the dict literal / call below are never closed.
client.metaWeblog.editPost(post_id, USERNAME, PASSWORD, {
'title': title,
'description': formatted_output,
'post_type': 'post',
'post_status': 'draft',
'post_category': ['entretenimento'],
# Notify the Telegram chat and the console that the post was created
bot.send_message(CHAT_ID, f"A Postagem '{title}' foi criada com sucesso.")
print("Postagem realizada com sucesso!")
# Start the bot
