Skip to content

Instantly share code, notes, and snippets.

@scovetta
Last active January 11, 2022 23:06
Show Gist options
  • Save scovetta/51e15537309d25f3617f908fcd2a1327 to your computer and use it in GitHub Desktop.
Save scovetta/51e15537309d25f3617f908fcd2a1327 to your computer and use it in GitHub Desktop.
Basic script to scan a directory for .cmake and .json files and safely replace `http:` URLs with `https:`, where "safely" is defined as "making requests returns identical bytes".
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
The purpose of this script is to scan a directory (default: ports) for
port configuration files that contain http-based URLs, and convert
them to https if possible.
It only looks for files with an extension of .cmake or .json.
The "if possible" check is whether or not the bytes returned from a
GET request to the http URL are identical to those returned from a
GET request to the https URL.
The only external module needed is requests (pip install requests).
Author: Michael Scovetta <[email protected]>
License: MIT
Copyright: Microsoft Corporation
Last Updated: 1/3/2022
"""
import os
import re
import sys
import requests
import hashlib
import logging
# Version string of this script.
VERSION = "1.0.0"
# Configure the root logger once at import time: every record is printed as
# "<LEVEL> <message>", and DEBUG is enabled so each URL check is traced.
logging.basicConfig(format='%(levelname)s %(message)s', level=logging.DEBUG)
class HttpsChecker:
    """Rewrite ``http://`` URLs inside files to ``https://`` when that is safe.

    A URL is considered safe to upgrade when a GET request to the http URL
    and a GET request to the corresponding https URL both succeed and return
    byte-identical bodies.

    Attributes:
        updated_files: Paths of the files this instance has rewritten, one
            entry per rewritten file.
    """

    # Request timeout (in seconds) for the network calls
    REQUEST_TIMEOUT = 30

    # Matches http:// URLs in raw file bytes. The character class includes
    # '$', '{' and '}' so that CMake-style "${VAR}" references stay part of
    # the match and can be rejected as a whole by is_upgradeable().
    # This regex was modified from one found at:
    # https://stackoverflow.com/questions/55663644/how-to-search-for-a-href-from-a-text-file-through-python-regex
    _URL_PATTERN = re.compile(rb'http://(?:[-\w.\${}/]|(?:%[\da-fA-F]{2}))+')

    def __init__(self) -> None:
        # List of files modified. Created per instance so that separate
        # checkers never share (and double-report) each other's results.
        self.updated_files = []

    @staticmethod
    def _to_https(http_link: str) -> str:
        """Return *http_link* with a leading ``http://`` swapped for ``https://``."""
        # count/flags are passed by keyword: passing them positionally is
        # deprecated since Python 3.13 and easy to misread.
        return re.sub('^http://', 'https://', http_link, count=1, flags=re.IGNORECASE)

    def is_upgradeable(self, http_link: str) -> bool:
        """Check to see if the http URL is upgradeable.

        Args:
            http_link: The ``http://`` URL to test.

        Returns:
            True iff both the http and the https request succeed and return
            identical bytes. False if they differ, if the URL contains a
            ``${...}`` variable, or on any error while fetching.
        """
        logging.debug('is_upgradeable(%s)', http_link)
        if '${' in http_link:
            logging.debug('URL is not upgradeable because it contains a variable.')
            return False

        # Computed outside the try blocks so the error log can always name it.
        https_link = self._to_https(http_link)

        bodies = []
        for link in (http_link, https_link):
            try:
                response = requests.get(link, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
            except Exception as msg:
                # Deliberately broad: the contract is "False on any error",
                # so one bad URL must never abort the whole scan.
                logging.debug('Error loading %s: %s', link, msg)
                return False
            bodies.append(response.content)

        http_body, https_body = bodies
        if http_body == https_body:
            logging.info("Upgradeable URL: %s, Size=%d, Hash=%s", http_link,
                         len(http_body), hashlib.sha256(http_body).hexdigest())
            return True
        return False

    def process_file(self, filename: str) -> None:
        """Upgrade every safely-upgradeable http URL found in *filename*.

        Extracts the URLs, checks each distinct one with is_upgradeable(),
        and rewrites the file in place only if at least one URL was upgraded.
        A rewritten file is recorded exactly once in ``self.updated_files``.
        Paths that are not regular files are ignored.

        Args:
            filename: Path of the file to process.

        Returns:
            None
        """
        logging.info('Processing file: %s', filename)
        if not os.path.isfile(filename):
            return

        # Work on raw bytes so untouched parts of the file are preserved exactly.
        with open(filename, 'rb') as f:
            content = f.read()

        # NOTE: URLs containing "${NAME}" variables (ports like antlr4) are not
        # resolved against 'set(NAME VALUE)' definitions; that needs more work,
        # so for now is_upgradeable() simply rejects them.
        urls = self._URL_PATTERN.findall(content)
        logging.debug("Found %d URLs: [%s]", len(urls), urls)

        # Map each verified http URL to its https form. Sorting gives a
        # deterministic order of network checks and log output.
        replacements = {}
        for url in sorted(set(urls)):
            url_str = url.decode('utf-8')
            if self.is_upgradeable(url_str):
                logging.debug('URL [%s] was upgradeable.', url_str)
                replacements[url] = bytes(self._to_https(url_str), encoding='utf-8')
            else:
                logging.debug('URL [%s] was NOT upgradeable.', url_str)

        if not replacements:
            return

        def _swap(match):
            # Replace only complete URL matches that were verified. A plain
            # bytes.replace() would also rewrite longer, unverified URLs that
            # merely start with a verified one.
            found = match.group(0)
            return replacements.get(found, found)

        new_content = self._URL_PATTERN.sub(_swap, content)
        with open(filename, 'wb') as f:
            f.write(new_content)
        # Record the file once, no matter how many URLs were upgraded in it.
        self.updated_files.append(filename)
if __name__ == '__main__':
    # Directory to scan: the first command-line argument when one is given,
    # otherwise the default "ports" directory.
    scan_root = sys.argv[1] if sys.argv[1:] else 'ports'
    if not os.path.isdir(scan_root):
        print("Usage: python update_https.py DIRECTORY")
        sys.exit(1)

    checker = HttpsChecker()
    # Only port configuration files with these extensions are examined.
    wanted_suffixes = ('.cmake', '.json')
    for parent, _subdirs, names in os.walk(scan_root):
        for name in names:
            if not name.endswith(wanted_suffixes):
                continue
            checker.process_file(os.path.join(parent, name))

    print("Updated %d file(s)." % len(checker.updated_files))
    sys.exit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment