pixobe · June 5, 2024 11:50
diff --git a/gistfile1.txt b/gistfile1.txt
 from playwright.sync_api import sync_playwright, Playwright,TimeoutError as PlaywrightTimeoutError
 import re
 import csv
 import os
 from datetime import datetime

 # Regex alternation of the company names to look for; it is compiled with
 # re.IGNORECASE and matched against archive link text in test_has_contents.
 text_to_search=r"TCS|Tata Consultancy Services"


 def write_to_csv(publish_date, header, content, filename='tata-2023.csv'):
    """Append one article as a single row to a CSV file.

    Args:
        publish_date: Value for the first column; written via the csv
            module's normal string conversion (``None`` becomes empty).
        header: Article headline, second column.
        content: Article body, third column; formatted with ``f"{...}"``
            so non-string values are stringified.
        filename: Path of the CSV file. It is created if missing and
            otherwise appended to; no header row is ever written.
    """
    # Append mode keeps the rows written by earlier calls. newline='' lets
    # the csv module control line endings. The encoding is pinned to UTF-8
    # so non-ASCII article text does not depend on the platform default
    # (which can raise UnicodeEncodeError, e.g. on Windows code pages).
    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerow([publish_date, header, f"{content}"])

 def convert_last_updated_to_datetime(text):
    """Extract and parse the "Last Updated: ..." timestamp found in *text*.

    The expected shape is ``Last Updated: Jun 05, 2024, 11:50:00 AM IST``.
    Day and hour may be one or two digits, and the month may be abbreviated
    or spelled out in full.

    Args:
        text: String to search for the timestamp.

    Returns:
        A naive ``datetime`` (the literal "IST" suffix is matched but not
        converted into tzinfo), or ``None`` when no timestamp is present or
        the matched text is not a valid date.
    """
    pattern = r"Last Updated: (\w+ \d{1,2}, \d{4}, \d{1,2}:\d{2}:\d{2} [AP]M IST)"

    match = re.search(pattern, text)
    if match is None:
        print("No 'Last Updated' pattern found in the text.")
        return None

    date_str = match.group(1)
    print(f"Extracted datetime string: {date_str}")

    # The regex accepts any word as the month, so strptime can still reject
    # the string. Try the abbreviated name first, then the full name, and
    # report failure as None instead of letting ValueError escape.
    for date_format in ("%b %d, %Y, %I:%M:%S %p IST", "%B %d, %Y, %I:%M:%S %p IST"):
        try:
            return datetime.strptime(date_str, date_format)
        except ValueError:
            continue

    print(f"Could not parse datetime string: {date_str}")
    return None

 def test_has_contents(playwright: Playwright):
    """Scrape Economic Times archive articles matching ``text_to_search``.

    For each year/month/day combination the month archive page is opened,
    the day link is clicked, and every link on the resulting archive list
    whose text matches ``text_to_search`` (case-insensitive) is visited.
    The article's timestamp, title and body are appended to the CSV file
    via ``write_to_csv``.

    Args:
        playwright: Playwright driver object used to launch Chromium.
            NOTE(review): the signature suggests this is meant to run with
            a pytest-playwright style fixture; confirm.

    Any error (including a timeout) while processing a day is printed and
    ends the current month, skipping its remaining days.
    NOTE(review): this also fires when a single article click times out;
    confirm that skipping the rest of the month is intended.
    """
    years = ['2024']
    months = [str(month) for month in range(1, 13)]
    days = [str(day) for day in range(1, 32)]

    # Compile once instead of on every day iteration.
    link_pattern = re.compile(text_to_search, re.IGNORECASE)

    browser = playwright.chromium.launch()
    try:
        context = browser.new_context()
        page = context.new_page()

        for year in years:
            for month in months:
                # The archive URL only depends on year and month.
                url = f"https://economictimes.indiatimes.com/archive/year-{year},month-{month}.cms"
                for day in days:
                    try:
                        # Reload the month calendar each time: the previous
                        # iteration navigated away to an archive list.
                        page.goto(url, wait_until="domcontentloaded")
                        page.get_by_role("link", name=day, exact=True).click(timeout=5000)
                        # Wait until the day's archive list has been reached.
                        page.wait_for_url("**/archivelist/**", timeout=5000)
                        locators = page.get_by_role("link", name=link_pattern, exact=False).all()

                        for locator in locators:
                            locator.click(timeout=5000)

                            publish_date_element = page.query_selector(".jsdtTime")
                            publish_date = ""
                            if publish_date_element:
                                publish_date = convert_last_updated_to_datetime(
                                    publish_date_element.inner_text()
                                )

                            heading = page.query_selector(".artTitle")
                            header = heading.inner_text() if heading else ""

                            content_element = page.query_selector(".artData")
                            content = content_element.inner_text() if content_element else ""

                            write_to_csv(publish_date, header, content)
                            # Return to the archive list for the next link.
                            page.go_back(timeout=5000, wait_until='domcontentloaded')

                    except PlaywrightTimeoutError as e:
                        print(f" ********** Timed out for {year}-{month}-{day}  ********** {e}")
                        break
                    except Exception as e:
                        print(f" ********** An error occurred for {year}-{month}-{day}  ********** {e}")
                        break
    finally:
        # Closing the browser also closes its contexts and pages, so the
        # Chromium process is released even when an unexpected error occurs.
        browser.close()
	from playwright.sync_api import sync_playwright, Playwright,TimeoutError as PlaywrightTimeoutError
	import re
	import csv
	import os
	from datetime import datetime

	# Regex alternation of the company names to look for in archive link text.
	# The pipe must stay unescaped: r"\|" would match a literal "|" character
	# and the pattern would then never match a plain "TCS" headline.
	text_to_search=r"TCS|Tata Consultancy Services"


	def write_to_csv(publish_date, header, content, filename='tata-2023.csv'):
	    """Append one article as a single row to a CSV file.

	    Args:
	        publish_date: Value for the first column; written via the csv
	            module's normal string conversion (``None`` becomes empty).
	        header: Article headline, second column.
	        content: Article body, third column; formatted with ``f"{...}"``
	            so non-string values are stringified.
	        filename: Path of the CSV file. It is created if missing and
	            otherwise appended to; no header row is ever written.
	    """
	    # Append mode keeps the rows written by earlier calls. newline='' lets
	    # the csv module control line endings. The encoding is pinned to UTF-8
	    # so non-ASCII article text does not depend on the platform default.
	    with open(filename, mode='a', newline='', encoding='utf-8') as file:
	        csv.writer(file).writerow([publish_date, header, f"{content}"])

	def convert_last_updated_to_datetime(text):
	    """Extract and parse the "Last Updated: ..." timestamp found in *text*.

	    The expected shape is ``Last Updated: Jun 05, 2024, 11:50:00 AM IST``.
	    Day and hour may be one or two digits, and the month may be abbreviated
	    or spelled out in full.

	    Args:
	        text: String to search for the timestamp.

	    Returns:
	        A naive ``datetime`` (the literal "IST" suffix is matched but not
	        converted into tzinfo), or ``None`` when no timestamp is present or
	        the matched text is not a valid date.
	    """
	    pattern = r"Last Updated: (\w+ \d{1,2}, \d{4}, \d{1,2}:\d{2}:\d{2} [AP]M IST)"

	    match = re.search(pattern, text)
	    if match is None:
	        print("No 'Last Updated' pattern found in the text.")
	        return None

	    date_str = match.group(1)
	    print(f"Extracted datetime string: {date_str}")

	    # The regex accepts any word as the month, so strptime can still reject
	    # the string. Try the abbreviated name first, then the full name, and
	    # report failure as None instead of letting ValueError escape.
	    for date_format in ("%b %d, %Y, %I:%M:%S %p IST", "%B %d, %Y, %I:%M:%S %p IST"):
	        try:
	            return datetime.strptime(date_str, date_format)
	        except ValueError:
	            continue

	    print(f"Could not parse datetime string: {date_str}")
	    return None

	def test_has_contents(playwright: Playwright):
	    """Scrape Economic Times archive articles matching ``text_to_search``.

	    For each year/month/day combination the month archive page is opened,
	    the day link is clicked, and every link on the resulting archive list
	    whose text matches ``text_to_search`` (case-insensitive) is visited.
	    The article's timestamp, title and body are appended to the CSV file
	    via ``write_to_csv``.

	    Args:
	        playwright: Playwright driver object used to launch Chromium.
	            NOTE(review): the signature suggests this is meant to run with
	            a pytest-playwright style fixture; confirm.

	    Any error (including a timeout) while processing a day is printed and
	    ends the current month, skipping its remaining days.
	    NOTE(review): this also fires when a single article click times out;
	    confirm that skipping the rest of the month is intended.
	    """
	    years = ['2024']
	    months = [str(month) for month in range(1, 13)]
	    days = [str(day) for day in range(1, 32)]

	    # Compile once instead of on every day iteration.
	    link_pattern = re.compile(text_to_search, re.IGNORECASE)

	    browser = playwright.chromium.launch()
	    try:
	        context = browser.new_context()
	        page = context.new_page()

	        for year in years:
	            for month in months:
	                # The archive URL only depends on year and month.
	                url = f"https://economictimes.indiatimes.com/archive/year-{year},month-{month}.cms"
	                for day in days:
	                    try:
	                        # Reload the month calendar each time: the previous
	                        # iteration navigated away to an archive list.
	                        page.goto(url, wait_until="domcontentloaded")
	                        page.get_by_role("link", name=day, exact=True).click(timeout=5000)
	                        # Glob wildcards are required here: a bare
	                        # "/archivelist/" would only match that exact URL.
	                        page.wait_for_url("**/archivelist/**", timeout=5000)
	                        locators = page.get_by_role("link", name=link_pattern, exact=False).all()

	                        for locator in locators:
	                            locator.click(timeout=5000)

	                            publish_date_element = page.query_selector(".jsdtTime")
	                            publish_date = ""
	                            if publish_date_element:
	                                publish_date = convert_last_updated_to_datetime(
	                                    publish_date_element.inner_text()
	                                )

	                            heading = page.query_selector(".artTitle")
	                            header = heading.inner_text() if heading else ""

	                            content_element = page.query_selector(".artData")
	                            content = content_element.inner_text() if content_element else ""

	                            write_to_csv(publish_date, header, content)
	                            # Return to the archive list for the next link.
	                            page.go_back(timeout=5000, wait_until='domcontentloaded')

	                    except PlaywrightTimeoutError as e:
	                        print(f" ******** Timed out for {year}-{month}-{day} ******** {e}")
	                        break
	                    except Exception as e:
	                        print(f" ******** An error occurred for {year}-{month}-{day} ******** {e}")
	                        break
	    finally:
	        # Closing the browser also closes its contexts and pages, so the
	        # Chromium process is released even when an unexpected error occurs.
	        browser.close()