Created
September 28, 2023 14:27
-
-
Save Tuhin-thinks/1a08c08f20b034a40c8df885e0caef61 to your computer and use it in GitHub Desktop.
Expands LinkedIn-encoded (shortened) URLs to their actual URLs and saves them as a CSV file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
from typing import Dict, List | |
import re | |
import requests | |
from requests.exceptions import RequestException | |
def expand_url(url, timeout=10):
    """Follow redirects for *url* and return the final URL.

    A HEAD request is sent with redirects enabled and the URL of the final
    response is returned.

    Args:
        url: The (possibly shortened) URL to resolve.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely, so a single
            unresponsive host would stall the whole run.

    Returns:
        The URL of the final response, or *url* unchanged when the request
        fails.
    """
    try:
        resp = requests.head(url, allow_redirects=True, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Best effort: report the failure and fall back to the original URL.
        print(e)
        return url
    return resp.url
def read_text_file(file_path, encoding='utf-8'):
    """Return the entire contents of the text file at *file_path*.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale
            encoding.

    Returns:
        The file contents as a single string.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()
def parse_urls_from_text(text):
    """Find every ``<digit>) <title>: <url>`` entry in *text*.

    Each entry must end at a line end. The title is matched lazily, so it
    stops at the first ``": "`` separator; the rest of the line is the URL.

    Returns:
        An iterator of ``re.Match`` objects with named groups ``title`` and
        ``url``.
    """
    entry_regex = re.compile(r'\d\) (?P<title>.+?)\: (?P<url>.+)$', re.MULTILINE)
    return entry_regex.finditer(text)
def expand_all_urls(file_path: str = 'system-design-45urls.txt') -> List[Dict]:
    """Read the URL list from *file_path* and expand every URL in it.

    Args:
        file_path: Text file holding the ``<n>) <title>: <url>`` entries.
            Defaults to the file name the script originally hard-coded.

    Returns:
        One dict per parsed entry with keys ``title`` and ``url``, where
        ``url`` is the value returned by ``expand_url``. Entries whose
        expansion raises ``RequestException`` are skipped.
    """
    text = read_text_file(file_path)
    expanded_url_list = []
    for index, match in enumerate(parse_urls_from_text(text), 1):
        entry = match.groupdict()
        raw_url = entry['url'].strip()
        try:
            entry['url'] = expand_url(raw_url)
        except RequestException:
            # Safety net only: expand_url handles request errors itself.
            # Log the URL string rather than the Match object.
            print(f"Failed to expand URL ({index}): ", raw_url)
            continue
        expanded_url_list.append(entry)
    return expanded_url_list
def save_as_csv(expanded_url_list: List[Dict[str, str]],
                output_path: str = 'system-design-45urls-expanded.csv'):
    """Write the expanded URL records to a CSV file.

    Args:
        expanded_url_list: Dicts with the keys ``title`` and ``url``.
        output_path: Destination CSV file. Defaults to the file name the
            script originally hard-coded.

    The file gets a ``title,url`` header row followed by one row per record.
    """
    fieldnames = ['title', 'url']
    # newline='' lets the csv module control line endings itself; without it
    # extra blank rows can appear on platforms that translate '\n'.
    # UTF-8 keeps non-ASCII titles writable regardless of the locale.
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(expanded_url_list)
    print("writing complete ✅")
if __name__ == '__main__':
    # Script entry point: expand every URL, then write the results to CSV.
    save_as_csv(expand_all_urls())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If you're too lazy to run this script, here's the output CSV for you:
https://file.io/yHdj5SVXa6CR