s-espriz · November 5, 2024 13:57
diff --git a/getting_news.py b/getting_news.py
 import pandas as pd
 from telethon import TelegramClient
 import datetime
 import re

 # Script configuration. Replace these with your own values
 # (the api_id / api_hash below are placeholders, not working credentials).
 api_id = 1111 # Your API ID
 api_hash = 'Your API hash'
 # Channel to export; also used as the id prefix and the CSV file-name prefix.
 channel_username = 'Akharinkhabar'

 # Start of the export window: 6 October 2023, one day before 7 October.
 # Passed as offset_date to client.iter_messages together with reverse=True.
 # NOTE(review): this datetime is naive (no tzinfo) - confirm the intended timezone.
 initial_date = datetime.datetime(2023 , 10 , 6)

 def extract_title_remove_water_mark(text):
     """Strip the trailing channel watermark and pull out the title line.

     Args:
         text: Raw message text. ``None`` or an empty string yields two
             empty strings instead of raising.

     Returns:
         A ``(title, text)`` tuple. ``title`` is the first line of the cleaned
         text that contains at least one character (``""`` when there is
         none); ``text`` is the message with the watermark removed and the
         surrounding whitespace stripped.
     """
     if not text:
         # Nothing to clean; avoids a TypeError from the regex calls on None.
         return "", ""
     # Step 1: drop the watermark. Without re.MULTILINE, ``$`` anchors at the
     # end of the string, so only a handle on the last line is removed.
     # ``.*`` (not ``.+``) also removes a handle with nothing after it.
     text = re.sub(r"@Akharinkhabar.*$", "", text)
     # Step 2: the title is the first line holding at least one character.
     title = re.search(r"^.+$", text, re.MULTILINE)
     title_text = title.group() if title else ""
     return title_text, text.strip()

 def get_reaction(message):
     """Tally the reactions attached to a message.

     Args:
         message: Object exposing ``reactions``. When that is truthy, its
             ``results`` is iterated and every entry supplies ``count`` and
             ``reaction``.

     Returns:
         dict mapping a reaction key to its summed count. The key is the
         reaction's ``emoticon`` when it has one, ``"custom:<document_id>"``
         when it only has a ``document_id``, and the reaction's type name
         otherwise. Empty when the message has no reactions.
     """
     message_reaction = {}
     if not message.reactions:
         return message_reaction
     for result in message.reactions.results:
         reaction = result.reaction
         key = getattr(reaction, "emoticon", None)
         if key is None:
             # Reactions without an emoticon would otherwise raise
             # AttributeError and lose every reaction on the message.
             document_id = getattr(reaction, "document_id", None)
             if document_id is not None:
                 key = f"custom:{document_id}"
             else:
                 key = type(reaction).__name__
         message_reaction[key] = message_reaction.get(key, 0) + result.count
     return message_reaction

 # Create the module-level Telegram client shared by fetch_messages_to_csv and
 # the script entry point. It is only constructed here; it is started later.
 # 'anon' is the first positional argument (the session name in Telethon's API).
 client = TelegramClient('anon', api_id, api_hash)

 async def fetch_messages_to_csv(channel_username, batch_size=100):
     """Export a channel's messages to ``<channel_username>_messages.csv``.

     Iterates the channel with ``offset_date=initial_date`` and
     ``reverse=True``, builds one row per message and appends the rows to the
     CSV every ``batch_size`` messages, plus once more for any remainder.

     Args:
         channel_username: Channel resolved through ``client.get_entity``;
             also used as the id prefix and the CSV file-name prefix.
         batch_size: Number of rows buffered in memory before each append.
     """
     await client.start()
     channel = await client.get_entity(channel_username)
     csv_path = f'{channel_username}_messages.csv'

     messages_data = []
     async for message in client.iter_messages(channel, offset_date=initial_date, reverse=True):
         # Best effort per field: a message whose reactions or text cannot be
         # parsed is still exported, with empty values for that field.
         try:
             message_reaction = get_reaction(message=message)
         except Exception:
             message_reaction = {}
         try:
             title, text = extract_title_remove_water_mark(message.raw_text)
         except Exception:
             title, text = "", ""

         messages_data.append({
             'id': f"{channel_username}-{message.id}",
             'channel': channel_username,
             'date': message.date,
             'views': message.views if message.views is not None else 0,
             # Point at the parent message, not at this message itself.
             'reply_to': f"{channel_username}-{message.reply_to_msg_id}" if message.is_reply else None,
             'title': title,
             'text': text,
             'reactions': message_reaction,
         })

         if len(messages_data) >= batch_size:
             _append_rows_to_csv(messages_data, csv_path)
             # Start a fresh batch and keep iterating; exiting here would
             # stop the export after the first batch.
             messages_data = []

     # Save any remaining messages that didn't fill a full batch
     if messages_data:
         _append_rows_to_csv(messages_data, csv_path)


 def _append_rows_to_csv(rows, csv_path):
     """Append ``rows`` (a list of dicts) to ``csv_path``; header only for a new file."""
     import os  # local import so the file's import block stays untouched

     write_header = not os.path.exists(csv_path)
     pd.DataFrame(rows).to_csv(csv_path, mode='a', header=write_header, index=False)

 # Script entry: run the export coroutine to completion on the client's loop
 with client:
     export_job = fetch_messages_to_csv(channel_username, batch_size=100)
     client.loop.run_until_complete(export_job)
	import pandas as pd
	from telethon import TelegramClient
	import datetime
	import re

	# Script configuration. Replace these with your own values
	# (the api_id / api_hash below are placeholders, not working credentials).
	api_id = 1111 # Your API ID
	api_hash = 'Your API hash'
	# Channel to export; also used as the id prefix and the CSV file-name prefix.
	channel_username = 'Akharinkhabar'

	# Start of the export window: 6 October 2023, one day before 7 October.
	# Passed as offset_date to client.iter_messages together with reverse=True.
	# NOTE(review): this datetime is naive (no tzinfo) - confirm the intended timezone.
	initial_date = datetime.datetime(2023 , 10 , 6)

	def extract_title_remove_water_mark(text):
	    """Strip the trailing channel watermark and pull out the title line.

	    Args:
	        text: Raw message text. ``None`` or an empty string yields two
	            empty strings instead of raising.

	    Returns:
	        A ``(title, text)`` tuple. ``title`` is the first line of the cleaned
	        text that contains at least one character (``""`` when there is
	        none); ``text`` is the message with the watermark removed and the
	        surrounding whitespace stripped.
	    """
	    if not text:
	        # Nothing to clean; avoids a TypeError from the regex calls on None.
	        return "", ""
	    # Step 1: drop the watermark. Without re.MULTILINE, ``$`` anchors at the
	    # end of the string, so only a handle on the last line is removed.
	    # ``.*`` (not ``.+``) also removes a handle with nothing after it.
	    text = re.sub(r"@Akharinkhabar.*$", "", text)
	    # Step 2: the title is the first line holding at least one character.
	    title = re.search(r"^.+$", text, re.MULTILINE)
	    title_text = title.group() if title else ""
	    return title_text, text.strip()

	def get_reaction(message):
	    """Tally the reactions attached to a message.

	    Args:
	        message: Object exposing ``reactions``. When that is truthy, its
	            ``results`` is iterated and every entry supplies ``count`` and
	            ``reaction``.

	    Returns:
	        dict mapping a reaction key to its summed count. The key is the
	        reaction's ``emoticon`` when it has one, ``"custom:<document_id>"``
	        when it only has a ``document_id``, and the reaction's type name
	        otherwise. Empty when the message has no reactions.
	    """
	    message_reaction = {}
	    if not message.reactions:
	        return message_reaction
	    for result in message.reactions.results:
	        reaction = result.reaction
	        key = getattr(reaction, "emoticon", None)
	        if key is None:
	            # Reactions without an emoticon would otherwise raise
	            # AttributeError and lose every reaction on the message.
	            document_id = getattr(reaction, "document_id", None)
	            if document_id is not None:
	                key = f"custom:{document_id}"
	            else:
	                key = type(reaction).__name__
	        message_reaction[key] = message_reaction.get(key, 0) + result.count
	    return message_reaction

	# Create the module-level Telegram client shared by fetch_messages_to_csv and
	# the script entry point. It is only constructed here; it is started later.
	# 'anon' is the first positional argument (the session name in Telethon's API).
	client = TelegramClient('anon', api_id, api_hash)

	async def fetch_messages_to_csv(channel_username, batch_size=100):
	    """Export a channel's messages to ``<channel_username>_messages.csv``.

	    Iterates the channel with ``offset_date=initial_date`` and
	    ``reverse=True``, builds one row per message and appends the rows to the
	    CSV every ``batch_size`` messages, plus once more for any remainder.

	    Args:
	        channel_username: Channel resolved through ``client.get_entity``;
	            also used as the id prefix and the CSV file-name prefix.
	        batch_size: Number of rows buffered in memory before each append.
	    """
	    await client.start()
	    channel = await client.get_entity(channel_username)
	    csv_path = f'{channel_username}_messages.csv'

	    messages_data = []
	    async for message in client.iter_messages(channel, offset_date=initial_date, reverse=True):
	        # Best effort per field: a message whose reactions or text cannot be
	        # parsed is still exported, with empty values for that field.
	        try:
	            message_reaction = get_reaction(message=message)
	        except Exception:
	            message_reaction = {}
	        try:
	            title, text = extract_title_remove_water_mark(message.raw_text)
	        except Exception:
	            title, text = "", ""

	        messages_data.append({
	            'id': f"{channel_username}-{message.id}",
	            'channel': channel_username,
	            'date': message.date,
	            'views': message.views if message.views is not None else 0,
	            # Point at the parent message, not at this message itself.
	            'reply_to': f"{channel_username}-{message.reply_to_msg_id}" if message.is_reply else None,
	            'title': title,
	            'text': text,
	            'reactions': message_reaction,
	        })

	        if len(messages_data) >= batch_size:
	            _append_rows_to_csv(messages_data, csv_path)
	            # Start a fresh batch and keep iterating; exiting here would
	            # stop the export after the first batch.
	            messages_data = []

	    # Save any remaining messages that didn't fill a full batch
	    if messages_data:
	        _append_rows_to_csv(messages_data, csv_path)


	def _append_rows_to_csv(rows, csv_path):
	    """Append ``rows`` (a list of dicts) to ``csv_path``; header only for a new file."""
	    import os  # local import so the file's import block stays untouched

	    write_header = not os.path.exists(csv_path)
	    pd.DataFrame(rows).to_csv(csv_path, mode='a', header=write_header, index=False)

	# Run the client: the export coroutine is driven to completion on the
	# client's event loop (the body of the with block must be indented).
	with client:
	    client.loop.run_until_complete(fetch_messages_to_csv(channel_username, batch_size=100))