Expand link-shortened URLs from .csv files for Twitter imgur results
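The script below pulls every URL out of a chosen text column in one or more .csv files (arguments are re-expanded with glob, so quoted wildcard patterns work too), skips domains that don't need expanding, follows each remaining link with requests, and appends both its progress and the resolved URLs to expanded_urls.txt. A typical invocation, assuming the file is saved as expand_urls_from_csvs.py to match its own usage message (the .csv names here are illustrative):

python expand_urls_from_csvs.py imgur_2014-01-09.csv "imgur_2014-01-*.csv"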
import csv
import requests
import re
import sys
import glob
from urllib.parse import urlsplit
from time import sleep
def expand_url(short_url):
    # requests follows redirects by default, so response.url is the final
    # (expanded) address the short link points to.
    try:
        response = requests.get(short_url, timeout=10)
        if response.status_code == 200:
            return response.url
        else:
            print(f"Error: {short_url} returned status code {response.status_code}")
            return None
    except Exception as e:
        print(f"Error: {short_url} - {str(e)}")
        return None
def extract_urls_from_text(text):
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return url_pattern.findall(text)
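# Illustrative only, not executed: for tweet text such as
#   "new album art https://bit.ly/2hXkQ3x via http://ift.tt/1aBcDeF"
# extract_urls_from_text returns
#   ['https://bit.ly/2hXkQ3x', 'http://ift.tt/1aBcDeF']
# (the short-link paths are made up). The pattern grabs every http/https
# URL in the text; filtering by domain happens in extract_urls_from_csv.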
def extract_urls_from_csv(file_path, excluded_domains, column_name):
    urls = []
    with open(file_path, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            text = row[column_name]
            for url in extract_urls_from_text(text):
                try:
                    domain = urlsplit(url).netloc
                    #if domain in target_domains:
                    #    urls.append(url)
                    if domain not in excluded_domains:
                        urls.append(url)
                except ValueError as e:
                    if str(e) == "Invalid IPv6 URL":
                        print(f"Warning: Skipping invalid IPv6 URL: {url}")
                    else:
                        raise e
    return urls
def main():
    if len(sys.argv) < 2:
        print("Usage: python expand_urls_from_csvs.py file1.csv file2.csv ...")
        sys.exit(1)

    csv_files = []
    for arg in sys.argv[1:]:
        csv_files.extend(glob.glob(arg))

    #input_file = "imgur.com_imgur.io_2014-01-09_2014-01-10_incomplete.csv"
    # target_domains is only used by the commented-out whitelist filter in
    # extract_urls_from_csv; the active filter is the excluded_domains list.
    target_domains = ["bit.ly", "ift.tt"]
    # Note: the domain comparison is case-sensitive, hence duplicate spellings
    # such as "24TOTO.com" and "24toto.com".
    excluded_domains = ["imgur.com", "votejaceysebion.com", "i.imgur.com", "2014SC.com", "2014REEL.com", "24TOTO.com", "m.imgur.com", "dlvr.it", "fb.me", "redd.it", "24toto.com", "2014OS.com", "VoteIsabellaHibbs.com", "reddit.com", "twitter.com"]
    column_name = "Embedded_text"  # Replace with the appropriate column name in your .csv

    file1 = open("expanded_urls.txt", "a")  # append mode
    expanded_urls = []
    for csv_file in csv_files:
        print("processing " + str(csv_file))
        file1.write("processing " + str(csv_file) + "\n")
        short_urls = extract_urls_from_csv(csv_file, excluded_domains, column_name)
        for short_url in short_urls:
            print("expanding url " + short_url)
            file1.write("expanding url " + short_url + "\n")
            expanded_url = expand_url(short_url)
            if expanded_url is not None:
                expanded_urls.append(expanded_url)
                print(expanded_url)
                file1.write(expanded_url + "\n")
            else:
                print("None returned")
                file1.write("None returned\n")
            sleep(0.5)  # throttle requests a bit between expansions
    file1.close()

    # Print the expanded URLs
    #for expanded_url in expanded_urls:
    #    print(expanded_url)

if __name__ == "__main__":
    main()
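Because expanded_urls.txt mixes the resolved links with the "processing ..." and "expanding url ..." progress lines (plus the occasional "None returned"), a quick way to recover just the expanded URLs afterwards is to keep only the lines that start with http. A minimal sketch, separate from the script above:

with open("expanded_urls.txt") as log:
    resolved = [line.strip() for line in log if line.startswith("http")]
print(len(resolved), "expanded URLs recovered")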