Created
February 24, 2024 06:22
-
-
Save av1d/63bc95eff3045616e734d93195672802 to your computer and use it in GitHub Desktop.
extract relative & absolute paths from CSS and JS files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import os

# Extract relative & absolute paths from CSS and JS files.
# Warning: assumes every asset referenced in JavaScript is enclosed in
# backticks, single quotes or double quotes.

asset_path = './downloaded/'  # location of assets to extract URLs from
outfile = 'asset_links.txt'  # output file
chr_b = ''  # characters to be printed before the asset link. Leave empty ('') if none.
chr_a = ''  # characters to be printed after the asset link. Leave empty ('') if none.
count = 0  # running total of asset links written so far
def process_css(asset_path, outfile):
    """Extract every ``url(...)`` target from the CSS files under a directory.

    The tree rooted at *asset_path* is walked recursively. Each file whose
    name ends in ``.css`` is scanned for ``url(...)`` expressions; the
    targets are appended to *outfile* one per line, wrapped in the
    module-level ``chr_b`` / ``chr_a`` strings, echoed to stdout, and
    counted in the module-level ``count``.

    Args:
        asset_path: Root directory to search for ``.css`` files.
        outfile: Path of the text file the links are appended to.
    """
    global count
    url_pattern = re.compile(r'url\((.*?)\)')
    links = []
    for root, _directories, files in os.walk(asset_path):
        for file_name in files:
            if not file_name.endswith('.css'):
                continue
            css_file_path = os.path.join(root, file_name)
            # Decode explicitly as UTF-8 and replace undecodable bytes so a
            # single oddly-encoded stylesheet cannot abort the whole run.
            with open(css_file_path, 'r', encoding='utf-8',
                      errors='replace') as css_file:
                css_content = css_file.read()
            for match in url_pattern.findall(css_content):
                # CSS permits whitespace between the parentheses and the
                # (optionally quoted) URL; trim it first, otherwise the
                # quotes are no longer at the ends and survive the strip.
                link = match.strip().strip('\'"')
                if link:  # ignore empty url() / url("") targets
                    links.append(link)
    # UTF-8 output so any replacement characters from decoding can be written.
    with open(outfile, 'a', encoding='utf-8') as output_file:
        for link in links:
            file_string = str(chr_b) + str(link) + str(chr_a)
            output_file.write(file_string + '\n')
            print(file_string)
            count += 1
def extract_js_asset_paths(js_content):
    """Return the quoted URLs and relative paths found in JavaScript source.

    A candidate is a string delimited by matching single quotes, double
    quotes or backticks whose content starts with ``http://``, ``https://``,
    ``./`` or ``../`` and contains no whitespace.

    Args:
        js_content: JavaScript source text.

    Returns:
        A list of the matched paths, without their quotes, in order of
        appearance.
    """
    # Both branches use a non-greedy ``\S+?`` so the match ends at the first
    # closing quote. A greedy ``\S+`` would run to the last matching quote of
    # a whitespace-free (e.g. minified) run and merge several strings.
    pattern = r'([\'"`])((?:https?:\/\/|\.{1,2}\/)\S+?)\1'
    return [found.group(2) for found in re.finditer(pattern, js_content)]
def process_javascript(asset_path, outfile):
    """Extract quoted URLs and relative paths from the JS files under a directory.

    The tree rooted at *asset_path* is walked recursively. Each file whose
    name ends in ``.js`` is passed to ``extract_js_asset_paths``; the
    results are appended to *outfile* one per line, wrapped in the
    module-level ``chr_b`` / ``chr_a`` strings, echoed to stdout, and
    counted in the module-level ``count``.

    Args:
        asset_path: Root directory to search for ``.js`` files.
        outfile: Path of the text file the links are appended to.
    """
    global count
    for root, _directories, files in os.walk(asset_path):
        for filename in files:
            if not filename.endswith('.js'):
                continue
            file_path = os.path.join(root, filename)
            # Decode explicitly as UTF-8 and replace undecodable bytes so a
            # single oddly-encoded script cannot abort the whole run.
            with open(file_path, 'r', encoding='utf-8',
                      errors='replace') as file:
                js_content = file.read()
            paths = extract_js_asset_paths(js_content)
            # UTF-8 output so any replacement characters can be written.
            with open(outfile, 'a', encoding='utf-8') as output:
                for path in paths:
                    file_string = str(chr_b) + str(path) + str(chr_a)
                    output.write(file_string + '\n')
                    print(file_string)
                    count += 1
if __name__ == '__main__':
    # Run the stylesheet pass first, then the script pass; both append to
    # the same output file and bump the shared counter.
    for extract in (process_css, process_javascript):
        extract(asset_path, outfile)
    summary = '\nProcessed ' + str(count) + ' assets.'
    print(summary)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment