Expand link-shortened URLs from .csv files for Twitter imgur results
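The script below pulls every URL out of a chosen text column in one or more .csv files (arguments are re-expanded with glob, so quoted wildcard patterns work too), skips domains that don't need expanding, follows each remaining link with requests, and appends both its progress and the resolved URLs to expanded_urls.txt. A typical invocation, assuming the file is saved as expand_urls_from_csvs.py to match its own usage message (the .csv names here are illustrative):

python expand_urls_from_csvs.py imgur_2014-01-09.csv "imgur_2014-01-*.csv"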
import csv
import requests
import re
import sys
import glob
from urllib.parse import urlsplit
from time import sleep
def expand_url(short_url):
    # requests follows redirects by default, so response.url is the final
    # (expanded) address the short link points to.
    try:
        response = requests.get(short_url, timeout=10)
        if response.status_code == 200:
            return response.url
        else:
            print(f"Error: {short_url} returned status code {response.status_code}")
            return None
    except Exception as e:
        print(f"Error: {short_url} - {str(e)}")
        return None
def extract_urls_from_text(text):
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return url_pattern.findall(text)
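# Illustrative only, not executed: for tweet text such as
#   "new album art https://bit.ly/2hXkQ3x via http://ift.tt/1aBcDeF"
# extract_urls_from_text returns
#   ['https://bit.ly/2hXkQ3x', 'http://ift.tt/1aBcDeF']
# (the short-link paths are made up). The pattern grabs every http/https
# URL in the text; filtering by domain happens in extract_urls_from_csv.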
def extract_urls_from_csv(file_path, excluded_domains, column_name):
    urls = []
    with open(file_path, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            text = row[column_name]
            for url in extract_urls_from_text(text):
                try:
                    domain = urlsplit(url).netloc
                    #if domain in target_domains:
                    #    urls.append(url)
                    if domain not in excluded_domains:
                        urls.append(url)
                except ValueError as e:
                    if str(e) == "Invalid IPv6 URL":
                        print(f"Warning: Skipping invalid IPv6 URL: {url}")
                    else:
                        raise e
    return urls
def main():
    if len(sys.argv) < 2:
        print("Usage: python expand_urls_from_csvs.py file1.csv file2.csv ...")
        sys.exit(1)

    csv_files = []
    for arg in sys.argv[1:]:
        csv_files.extend(glob.glob(arg))

    #input_file = "imgur.com_imgur.io_2014-01-09_2014-01-10_incomplete.csv"
    # target_domains is only used by the commented-out whitelist filter in
    # extract_urls_from_csv; the active filter is the excluded_domains list.
    target_domains = ["bit.ly", "ift.tt"]
    # Note: the domain comparison is case-sensitive, hence duplicate spellings
    # such as "24TOTO.com" and "24toto.com".
    excluded_domains = ["imgur.com", "votejaceysebion.com", "i.imgur.com", "2014SC.com", "2014REEL.com", "24TOTO.com", "m.imgur.com", "dlvr.it", "fb.me", "redd.it", "24toto.com", "2014OS.com", "VoteIsabellaHibbs.com", "reddit.com", "twitter.com"]
    column_name = "Embedded_text"  # Replace with the appropriate column name in your .csv

    file1 = open("expanded_urls.txt", "a")  # append mode
    expanded_urls = []
    for csv_file in csv_files:
        print("processing " + str(csv_file))
        file1.write("processing " + str(csv_file) + "\n")
        short_urls = extract_urls_from_csv(csv_file, excluded_domains, column_name)
        for short_url in short_urls:
            print("expanding url " + short_url)
            file1.write("expanding url " + short_url + "\n")
            expanded_url = expand_url(short_url)
            if expanded_url is not None:
                expanded_urls.append(expanded_url)
                print(expanded_url)
                file1.write(expanded_url + "\n")
            else:
                print("None returned")
                file1.write("None returned\n")
            sleep(0.5)  # throttle requests a bit between expansions
    file1.close()

    # Print the expanded URLs
    #for expanded_url in expanded_urls:
    #    print(expanded_url)

if __name__ == "__main__":
    main()
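Because expanded_urls.txt mixes the resolved links with the "processing ..." and "expanding url ..." progress lines (plus the occasional "None returned"), a quick way to recover just the expanded URLs afterwards is to keep only the lines that start with http. A minimal sketch, separate from the script above:

with open("expanded_urls.txt") as log:
    resolved = [line.strip() for line in log if line.startswith("http")]
print(len(resolved), "expanded URLs recovered")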