Created
November 5, 2024 13:57
-
-
Save s-espriz/c4d5a72ddcaee82ef484ddc1442e1bd6 to your computer and use it in GitHub Desktop.
This script fetches the news posts and their corresponding reactions from a given Telegram channel.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from telethon import TelegramClient | |
import datetime | |
import re | |
# Replace these with your own values
api_id = 1111  # Your API ID
api_hash = 'Your API hash'
channel_username = 'Akharinkhabar'
# Lower date bound for the collection: one day before 7 October 2023.
# The value is a naive datetime (no tzinfo attached).
initial_date = datetime.datetime(2023, 10, 6)
def extract_title_remove_water_mark(text):
    """Strip the trailing channel watermark and pull out the post title.

    The watermark is everything from ``@Akharinkhabar`` to the end of the
    text, provided the handle sits on the last line (``.`` does not cross
    newlines and ``$`` is not in multiline mode here).  The title is the
    first non-empty line of what remains.

    Args:
        text: Raw message text; may be empty or ``None``.

    Returns:
        A ``(title, text)`` tuple: the first non-empty line, and the full
        text with the watermark removed and surrounding whitespace stripped.
        Both are ``""`` when there is no text.
    """
    if not text:
        return "", ""

    # ``.*`` rather than ``.+``: the handle is frequently the very last thing
    # in a post, and ``.+`` would require at least one character after it.
    text = re.sub(r"@Akharinkhabar.*$", "", text)

    # With MULTILINE, ``^.+$`` matches the first line that has any content.
    title_match = re.search(r"^.+$", text, re.MULTILINE)
    title_text = title_match.group() if title_match else ""

    return title_text, text.strip()
def get_reaction(message):
    """Summarise the reactions on a message as ``{label: total_count}``.

    Args:
        message: An object exposing ``reactions``; when truthy, it is expected
            to have a ``results`` iterable whose items carry ``count`` and
            ``reaction`` attributes.

    Returns:
        A dict mapping each reaction label to its summed count.  The label is
        the reaction's ``emoticon`` when it has one.  Reactions without an
        ``emoticon`` are labelled ``"custom:<document_id>"`` when a
        ``document_id`` is present, otherwise with the reaction's class name.
        An empty dict is returned when the message has no reactions.
    """
    message_reaction = {}
    if not message.reactions:
        return message_reaction

    for result in message.reactions.results or ():
        reaction = result.reaction
        # Not every reaction type has ``emoticon``; reading it unguarded made
        # a single such reaction abort the whole tally for the message.
        label = getattr(reaction, "emoticon", None)
        if label is None:
            document_id = getattr(reaction, "document_id", None)
            if document_id is not None:
                label = f"custom:{document_id}"
            else:
                label = type(reaction).__name__
        message_reaction[label] = message_reaction.get(label, 0) + result.count

    return message_reaction
# Create the client; 'anon' is the session name passed to Telethon.
client = TelegramClient('anon', api_id, api_hash)
def _append_rows_to_csv(rows, csv_path):
    """Append ``rows`` (a list of dicts) to ``csv_path``, adding a header only for a new file."""
    import os  # local import so this block does not depend on the file's import list

    pd.DataFrame(rows).to_csv(
        csv_path,
        mode='a',
        header=not os.path.exists(csv_path),
        index=False,
    )


async def fetch_messages_to_csv(channel_username, batch_size=100):
    """Collect a channel's messages and append them to a CSV file in batches.

    Messages are requested from ``client.iter_messages`` with
    ``offset_date=initial_date`` and ``reverse=True`` and written to
    ``<channel_username>_messages.csv`` each time ``batch_size`` rows have
    accumulated; any remainder is written once iteration ends.

    Args:
        channel_username: Username of the channel to read; also used as the
            prefix of row ids and of the output file name.
        batch_size: Number of rows to buffer before each CSV append.

    Each row holds: ``id``, ``channel``, ``date``, ``views`` (0 when missing),
    ``reply_to`` (id of the replied-to message, or ``None``), ``title``,
    ``text`` and ``reactions``.
    """
    csv_path = f'{channel_username}_messages.csv'

    await client.start()
    channel = await client.get_entity(channel_username)
    messages_data = []

    async for message in client.iter_messages(channel, offset_date=initial_date, reverse=True):
        # Best-effort parsing: a message that cannot be parsed is still
        # recorded, just with empty reactions/text.  ``except Exception``
        # (not a bare ``except``) lets Ctrl-C and task cancellation propagate.
        try:
            message_reaction = get_reaction(message=message)
        except Exception:
            message_reaction = {}
        try:
            title, text = extract_title_remove_water_mark(message.raw_text)
        except Exception:
            title = ""
            text = ""

        message_info = {
            'id': f"{channel_username}-{message.id}",
            'channel': channel_username,
            'date': message.date,
            'views': message.views if message.views is not None else 0,
            # Point at the message being replied to, not at this message.
            'reply_to': f"{channel_username}-{message.reply_to_msg_id}" if message.is_reply else None,
            'title': title,
            'text': text,
            'reactions': message_reaction,
        }
        messages_data.append(message_info)

        if len(messages_data) >= batch_size:
            _append_rows_to_csv(messages_data, csv_path)
            messages_data = []  # Reset the list for the next batch

    # Save any remaining messages that didn't fill a full batch
    if messages_data:
        _append_rows_to_csv(messages_data, csv_path)
# Run the client: drive the coroutine to completion on the client's event loop.
with client:
    client.loop.run_until_complete(fetch_messages_to_csv(channel_username, batch_size=100))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment