Created
June 5, 2024 11:50
-
-
Save pixobe/f20c345de1a3cd466e002a6d1381bcac to your computer and use it in GitHub Desktop.
code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import calendar
import csv
import os
import re
from datetime import datetime

from playwright.sync_api import Playwright, TimeoutError as PlaywrightTimeoutError, sync_playwright
# Regex alternation matched (case-insensitively, by the scraper below) against
# archive link names to pick out the articles of interest.
text_to_search = r"TCS|Tata Consultancy Services"
def write_to_csv(publish_date, header, content, filename='tata-2023.csv'):
    """Append one article row to a CSV file.

    Args:
        publish_date: Value for the first column (written via the csv
            module's normal string conversion).
        header: Article headline, second column.
        content: Article body, third column; formatted to text before writing.
        filename: Path of the CSV file to append to. It is created when it
            does not exist yet.
    """
    # Append mode keeps the rows written by earlier calls; newline='' lets the
    # csv module control line endings. The encoding is pinned to UTF-8 so that
    # scraped text containing non-ASCII characters does not depend on the
    # platform's default encoding.
    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerow([publish_date, header, f"{content}"])
def convert_last_updated_to_datetime(text):
    """Parse a "Last Updated: <date>, <time> IST" stamp out of *text*.

    Args:
        text: Text that may contain a stamp such as
            ``"Last Updated: Jun 05, 2024, 11:50:00 AM IST"``.

    Returns:
        A naive ``datetime`` holding the wall-clock value of the stamp (the
        ``IST`` suffix is matched literally, no timezone is attached), or
        ``None`` when no stamp is present or it cannot be parsed.
    """
    # One- or two-digit day and hour are both accepted; strptime's %d and %I
    # handle either width.
    pattern = r"Last Updated: (\w+ \d{1,2}, \d{4}, \d{1,2}:\d{2}:\d{2} [AP]M IST)"
    match = re.search(pattern, text or "")
    if not match:
        print("No 'Last Updated' pattern found in the text.")
        return None

    date_str = match.group(1)
    print(f"Extracted datetime string: {date_str}")

    # The regex allows any word as the month, so try the abbreviated name
    # first and fall back to the full name instead of raising on "June".
    for date_format in ("%b %d, %Y, %I:%M:%S %p IST", "%B %d, %Y, %I:%M:%S %p IST"):
        try:
            return datetime.strptime(date_str, date_format)
        except ValueError:
            continue

    print(f"Could not parse 'Last Updated' datetime string: {date_str}")
    return None
def _extract_article(page):
    """Read publish date, headline and body text from the open article page.

    The headline and body fall back to an empty string when their element is
    missing; the publish date is an empty string when its element is missing
    and otherwise whatever ``convert_last_updated_to_datetime`` returns.
    """
    publish_date = ""
    publish_date_element = page.query_selector(".jsdtTime")
    if publish_date_element:
        publish_date = convert_last_updated_to_datetime(publish_date_element.inner_text())

    header = ""
    heading = page.query_selector(".artTitle")
    if heading:
        header = heading.inner_text()

    content = ""
    content_element = page.query_selector(".artData")
    if content_element:
        content = content_element.inner_text()

    return publish_date, header, content


def test_has_contents(playwright: Playwright):
    """Scrape matching archive articles for each day and append them to the CSV.

    For every calendar day of the configured years, opens that month's archive
    page, follows the day link to the archive list, opens each link whose name
    matches ``text_to_search`` and writes its date, headline and body through
    ``write_to_csv``. A failure on one day is reported and the scan moves on
    to the next day.

    Args:
        playwright: Playwright driver used to launch a Chromium browser.
    """
    years = ['2024']
    today = datetime.now().date()
    link_name = re.compile(text_to_search, re.IGNORECASE)

    browser = playwright.chromium.launch()
    try:
        context = browser.new_context()
        page = context.new_page()
        for year in years:
            for month in range(1, 13):
                url = f"https://economictimes.indiatimes.com/archive/year-{year},month-{month}.cms"
                # Only visit days that exist in this month (no Feb 30 etc.).
                days_in_month = calendar.monthrange(int(year), month)[1]
                for day in range(1, days_in_month + 1):
                    # Stop at dates that have not happened yet rather than
                    # waiting for a timeout on each of them.
                    if datetime(int(year), month, day).date() > today:
                        break
                    try:
                        # The month page is reloaded for every day because the
                        # previous iteration navigated away from it.
                        page.goto(url, wait_until="domcontentloaded")
                        page.get_by_role("link", name=str(day), exact=True).click(timeout=5000)
                        # wait for all the archive list
                        page.wait_for_url("**/archivelist/**", timeout=5000)
                        locators = page.get_by_role("link", name=link_name, exact=False).all()
                        for locator in locators:
                            locator.click(timeout=5000)
                            # Make sure the article DOM is available before
                            # querying it.
                            page.wait_for_load_state("domcontentloaded", timeout=5000)
                            publish_date, header, content = _extract_article(page)
                            write_to_csv(publish_date, header, content)
                            page.go_back(timeout=5000, wait_until='domcontentloaded')
                    # Report and fall through to the next day: one bad day must
                    # not discard the remainder of the month.
                    except PlaywrightTimeoutError as e:
                        print(f" ********** Timed out for {year}-{month}-{day} ********** {e}")
                    except Exception as e:
                        print(f" ********** An error occurred for {year}-{month}-{day} ********** {e}")
    finally:
        # Closing the browser also closes its contexts and pages, even when
        # something unexpected escapes the loops above.
        browser.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment