Skip to content

Instantly share code, notes, and snippets.

@lennier1
Created May 10, 2023 05:20
Show Gist options
  • Save lennier1/6c4ae0d4945114eeeded59b74cda3ecf to your computer and use it in GitHub Desktop.
Save lennier1/6c4ae0d4945114eeeded59b74cda3ecf to your computer and use it in GitHub Desktop.
Expand shortened URLs found in .csv files of Twitter search results for imgur links
import csv
import requests
import re
import sys
import glob
from urllib.parse import urlsplit
from time import sleep
def expand_url(short_url):
    """Follow redirects from ``short_url`` and return the final URL.

    Args:
        short_url: URL to resolve, typically a link-shortener address.

    Returns:
        The URL of the final response when its status code is 200,
        otherwise ``None``. Failures are printed rather than raised.
    """
    try:
        # stream=True defers the body download: only the final URL is needed,
        # so there is no reason to fetch the (possibly large) content. The
        # ``with`` block closes the response, releasing the connection even
        # though the body was never read.
        with requests.get(short_url, timeout=10, stream=True) as response:
            if response.status_code == 200:
                return response.url
            print(f"Error: {short_url} returned status code {response.status_code}")
            return None
    except Exception as e:
        # Deliberately broad: a single malformed or unreachable URL must not
        # abort a long batch run.
        print(f"Error: {short_url} - {str(e)}")
        return None
# Matches "http://" or "https://" followed by one or more characters from the
# allowed set (letters, digits, assorted punctuation) or %XX hex escapes.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def extract_urls_from_text(text):
    """Return all http(s) URLs found in ``text``, in order of appearance.

    Args:
        text: String to scan.

    Returns:
        List of matched URL strings; empty when nothing matches.
    """
    return _URL_PATTERN.findall(text)
def extract_urls_from_csv(file_path, excluded_domains, column_name):
    """Collect URLs from one column of a CSV file, skipping excluded domains.

    Args:
        file_path: Path of the CSV file; its first row is read as the header.
        excluded_domains: Iterable of network locations (e.g. ``"imgur.com"``)
            whose URLs are dropped. Compared case-insensitively.
        column_name: Header of the column whose text is scanned for URLs.

    Returns:
        List of URL strings in file order; duplicates are kept.

    Raises:
        KeyError: If the rows have no ``column_name`` field.
    """
    # Host names are case-insensitive, so normalise once and use a set for
    # constant-time membership tests.
    excluded = {domain.lower() for domain in excluded_domains}
    urls = []
    with open(file_path, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            text = row[column_name]
            if not text:
                # DictReader fills fields missing from a short row with None;
                # there is nothing to scan in that case (or in an empty cell).
                continue
            for url in extract_urls_from_text(text):
                try:
                    domain = urlsplit(url).netloc
                except ValueError as e:
                    # urlsplit rejects some malformed hosts (e.g. stray
                    # brackets). The message wording differs between Python
                    # versions, so skip on any ValueError instead of matching
                    # the text.
                    print(f"Warning: Skipping invalid URL ({e}): {url}")
                    continue
                if domain.lower() not in excluded:
                    urls.append(url)
    return urls
def _print_and_log(log_file, message):
    """Print ``message`` and append it as one line to ``log_file``."""
    print(message)
    log_file.write(message + "\n")


def main():
    """Expand the URLs found in the CSV files named on the command line.

    Each command-line argument is treated as a glob pattern. For every
    matching file, URLs are extracted from the configured column, expanded
    one at a time, and the progress and results are both printed and appended
    to ``expanded_urls.txt`` in the current directory.

    Exits with status 1 when no arguments are given.
    """
    if len(sys.argv) < 2:
        print("Usage: python expand_urls_from_csvs.py file1.csv file2.csv ...")
        sys.exit(1)

    csv_files = []
    for arg in sys.argv[1:]:
        matches = glob.glob(arg)
        if not matches:
            # Otherwise a mistyped name or pattern is skipped without a trace.
            print(f"Warning: no files match {arg}")
        csv_files.extend(matches)

    excluded_domains = ["imgur.com", "votejaceysebion.com", "i.imgur.com", "2014SC.com", "2014REEL.com", "24TOTO.com", "m.imgur.com", "dlvr.it", "fb.me", "redd.it", "24toto.com", "2014OS.com", "VoteIsabellaHibbs.com", "reddit.com", "twitter.com"]
    column_name = "Embedded_text"  # Replace with the appropriate column name in your .csv

    # Append mode keeps results from earlier runs; the context manager closes
    # the log even if processing a file raises.
    with open("expanded_urls.txt", "a") as log_file:
        for csv_file in csv_files:
            _print_and_log(log_file, "processing " + str(csv_file))
            short_urls = extract_urls_from_csv(csv_file, excluded_domains, column_name)
            for short_url in short_urls:
                _print_and_log(log_file, "expanding url " + short_url)
                expanded_url = expand_url(short_url)
                if expanded_url is not None:
                    _print_and_log(log_file, expanded_url)
                else:
                    _print_and_log(log_file, "None returned")
                # Pause between requests (presumably to rate-limit lookups).
                sleep(0.5)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment