Skip to content

Instantly share code, notes, and snippets.

@ndamulelonemakh
Last active August 10, 2022 01:42
Show Gist options
  • Save ndamulelonemakh/5aea94850411940b588e76051aeb251b to your computer and use it in GitHub Desktop.
Save ndamulelonemakh/5aea94850411940b588e76051aeb251b to your computer and use it in GitHub Desktop.
Google colab utils
"""A naive imnplementation for interactively cloning private python repositories in google colab"""
from google.colab import drive
def get_config(path: str):
    """Load the cached colab configuration from disk.

    The cache file holds the ``str()`` of a plain dict, so it is parsed with
    ``ast.literal_eval``: only Python literals are accepted and a tampered
    file cannot execute code the way ``eval`` would.

    Args:
        path: Location of the cache file.

    Returns:
        The parsed configuration, or an empty dict when the file is missing
        or contains nothing but whitespace.

    Raises:
        ValueError, SyntaxError: If the file content is not a valid literal.
    """
    import ast  # local import: this snippet has no module-level import for it

    try:
        with open(path) as cbf:
            raw = cbf.read()
    except FileNotFoundError:
        # No cache yet - the caller will prompt for every value.
        return {}

    if not raw.strip():
        return {}
    return ast.literal_eval(raw)
def iscolab():
    """Tell whether the ``google.colab`` package can be imported.

    Returns:
        True when the import succeeds, False when the module is not found.
    """
    try:
        import google.colab  # noqa: F401 - imported only to probe availability
    except ModuleNotFoundError:
        return False
    else:
        return True
def interactive_config_info():
    """Collect the project settings, prompting only for values missing from the cache.

    Every value is first looked up in the config cached at ``/root/.colab_cfg``;
    anything absent (or empty) is requested with ``input``. The resulting
    settings are written back to the cache file as the ``str()`` of a dict.

    Returns:
        A ``(github_url, project_name, google_drive_data)`` tuple.
    """
    cache_path = '/root/.colab_cfg'
    cached = get_config(cache_path)

    github_url = cached.get('github_url')
    if not github_url:
        github_url = input("Enter authenticated clone url e.g. https://<username>:<pat>@github.com/path/to/myrepo.git ?")

    project_name = cached.get('project_name')
    if not project_name:
        project_name = input("The name of the github repository e.g. myrepo ")

    google_drive_data = cached.get('google_drive_data')
    if not google_drive_data:
        google_drive_data = input("Enter path to your google drive data e.g. MyDrive/myrepo/data ?")

    # Refresh the on-disk cache with whatever was just resolved.
    settings = {
        'github_url': github_url,
        'project_name': project_name,
        'google_drive_data': google_drive_data,
    }
    with open(cache_path, 'w') as cache_file:
        cache_file.write(str(settings))

    return github_url, project_name, google_drive_data
def setup_colab():
    """Prepare a colab runtime: clone the repo, install requirements, link Drive data.

    Relies on IPython ``!`` shell escapes, so it has to run inside a notebook
    cell. When ``iscolab()`` is false it only prints a message.
    """
    # NOTE(review): `os` is used below, but no `import os` appears before this
    # function in the snippet - confirm it is imported earlier in the notebook.
    if iscolab():
        print("Prepare notebook to run in colab")
        github_url, project_name, google_drive_data = interactive_config_info()
        # A directory named after the project means an earlier run already cloned it.
        if not os.path.exists(project_name):
            print("Downloading project from github..")
            # Only the `dev` branch is cloned.
            !git clone -b dev $github_url
        else:
            print("Skip clone, already downloaded")
        # Everything below runs from inside the project checkout.
        os.chdir(project_name)
        print("Directory changed to", os.getcwd())
        print("Installing project dependencies")
        !python3 -m pip install --quiet -r requirements.txt
        print("Mounting google drive @/gdrive")
        drive.mount('/gdrive')
        # Expose the Drive folder inside the project as ./data via a symlink.
        !ln -fsv /gdrive/$google_drive_data data
        print("Colab runtime setup done")
    else:
        print("This is not colab")


# Run the setup as soon as the cell is executed.
setup_colab()
"""Utility script for setting up a colab notebook to work with a data science project setup with cookiecutter.
Ideally this script should not depend on the core package, because it runs before pip install for the core package.
"""
import os
import io
import sys
import logging
import subprocess
import traceback
from pathlib import Path
from typing import List
# Module-level logger used by every helper in this script.
log = logging.getLogger(__name__)
logs_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
# The level is read from the LOG_LEVEL environment variable and falls back to INFO.
logging.basicConfig(level=os.getenv('LOG_LEVEL', logging.INFO), format=logs_format)
class MissingDepsError(Exception):
    """Raised when a package needed by the colab setup cannot be imported.

    The human-readable message is available both as ``str(exc)`` (so it shows
    up in tracebacks and logs) and as the ``error`` attribute.
    """

    def __init__(self, package, *args, **kwargs):
        """Build the error for *package*.

        Args:
            package: Name (or description) of the dependency that is missing.
            *args: Extra positional arguments forwarded to ``Exception``.
            **kwargs: Forwarded to ``Exception`` unchanged.
        """
        # name/description of the missing dependency, as given by the caller
        self.package = package
        # formatted message, kept for callers that read `.error`
        self.error = f"Missing dependency {package}"
        # Pass the message up so the exception is not rendered as an empty string.
        super().__init__(self.error, *args, **kwargs)
def iscolab() -> bool:
    """Report whether ``google.colab`` is importable in this interpreter.

    Returns:
        True if the import works, False if it raises ``ImportError``
        (which includes ``ModuleNotFoundError``).
    """
    try:
        import google.colab  # noqa: F401 - imported only to probe availability
    except ImportError:  # ModuleNotFoundError is a subclass of ImportError
        return False
    else:
        return True
def __build_gitlab_pypi_url(gitlab_pat: str, project_id: str) -> str:
    """Compose the authenticated GitLab PyPI "simple" index URL for a project.

    Args:
        gitlab_pat: Token placed in the password position of the URL.
        project_id: GitLab project id inserted into the API path.

    Returns:
        The index URL with the ``__token__`` user and the given token embedded.
    """
    credentials = f"__token__:{gitlab_pat}"
    index_path = f"api/v4/projects/{project_id}/packages/pypi/simple"
    return f"https://{credentials}@gitlab.com/{index_path}"
def __write_env_file(secrets: str, path: Path = ".env"):
    """Write *secrets* to an env file, pointing ROOT_DIR at the current directory.

    Each line of *secrets* is written back followed by a newline. A line that
    assigns the ``ROOT_DIR`` variable (optionally prefixed with ``export``) is
    replaced by ``ROOT_DIR=<current working directory>``. Lines that merely
    mention ``ROOT_DIR`` elsewhere, e.g. ``DATA_DIR=${ROOT_DIR}/data``, are
    written unchanged.

    Args:
        secrets: Env-file content, one ``KEY=value`` entry per line.
        path: Destination file; defaults to ``.env`` in the working directory.
    """
    log.debug(f"Writing environment file to {path}")
    root_dir_line = f"ROOT_DIR={os.getcwd()}"
    with open(path, "w") as f:
        for secret in secrets.splitlines():
            # Compare the variable name only, not the whole line, so values
            # that reference ROOT_DIR are not overwritten.
            name = secret.partition("=")[0].strip()
            if name.startswith("export "):
                name = name[len("export "):].strip()
            if name == "ROOT_DIR":
                secret = root_dir_line
            f.write(f"{secret}\n")
def __read_env_file(env_path: str = ".env"):
    """Return the text of the env file, or ``None`` when it does not exist.

    Args:
        env_path: File to read; defaults to ``.env`` in the working directory.

    Returns:
        The full file content as a string, or ``None`` if the path is absent.
    """
    log.debug(f"Reading environment file from {env_path}")
    if not os.path.exists(env_path):
        log.debug(f"Environment file {env_path} not found")
        return None
    with open(env_path, "r") as handle:
        content = handle.read()
    return content
def execute_bash_command(command: List[str]) -> bool:
    """Run *command* as a subprocess and report whether it succeeded.

    Output is captured rather than streamed; on failure the captured stderr is
    logged. A command that cannot be started at all (for example a missing
    executable) is reported as a failure instead of raising.

    Args:
        command: Program and arguments, passed to ``subprocess.run`` as a list.

    Returns:
        True when the process exits with code 0, False otherwise.
    """
    try:
        result: subprocess.CompletedProcess = subprocess.run(command,
                                                             capture_output=True)
    except OSError as err:
        # e.g. FileNotFoundError when the executable is not on PATH
        log.error(f"Command {command} could not be started: {err}")
        return False

    if result.returncode != 0:
        log.error(f"Command {command} failed with return code {result.returncode}")
        # errors="replace": logging a failure must not itself fail on odd bytes
        log.error(result.stderr.decode(errors="replace"))
        return False
    return True
def install_extra_dependencies():
    """Install the helper packages the setup needs, exiting with status 2 on failure.

    Runs three ``pip install`` commands in order: upgrade pip, then install
    ``google-cloud-secret-manager`` and ``python-dotenv``. The first command
    that fails stops the process via ``sys.exit(2)``.
    """
    log.info("Installing extra dependencies for colab environment")

    # (debug message, command) pairs, executed in this order.
    steps = (
        ("1. Updating pip",
         ["pip", "install", "--upgrade", "--quiet", "pip"]),
        ("2. Installing gcp secrets sdk",
         ["pip", "install", "--quiet", "google-cloud-secret-manager"]),
        ("3. Installing environment variables manager",
         ["pip", "install", "--quiet", "python-dotenv"]),
    )
    for description, command in steps:
        log.debug(description)
        succeeded = execute_bash_command(command)
        if not succeeded:
            sys.exit(2)

    log.info("Done installing extra dependencies for colab environment")
def configure_secrets(secrets_url: str) -> None:
    """Load project secrets into the environment and cache them in ``.env``.

    Secrets come from the local env file when one exists; otherwise they are
    fetched from Secret Manager using *secrets_url* as the secret version name.
    Blank lines and ``#`` comments are dropped before the entries are handed to
    ``load_dotenv``. ``ROOT_DIR`` is then set to the current directory and the
    secrets are written back to the env file.

    Args:
        secrets_url: Resource name passed to ``access_secret_version``.

    Raises:
        MissingDepsError: If an ``ImportError`` occurs (e.g. ``dotenv`` or the
            secret manager SDK is not installed).
        SystemExit: With status 1 on any other failure.
    """
    log.info(f"Fetching secrets from {secrets_url}..")
    try:
        from dotenv import load_dotenv
        from google.cloud import secretmanager

        secrets = __read_env_file()
        if secrets is not None:
            log.debug("Using cached secrets from existing environment file")
        else:
            log.debug("No environment file found, fetching secrets from remote")
            client = secretmanager.SecretManagerServiceClient()
            response = client.access_secret_version(request={"name": secrets_url})
            secrets = response.payload.data.decode("UTF-8")

        log.info("Installing secrets to colab environment..")
        # Keep only real entries: no blank lines, no comment lines.
        stripped_lines = (raw_line.strip() for raw_line in secrets.splitlines())
        entries = [entry for entry in stripped_lines
                   if entry and not entry.startswith("#")]
        with io.StringIO('\n'.join(entries)) as config:
            load_dotenv(stream=config)

        log.debug("Override ROOT_DIR with current directory")
        os.environ["ROOT_DIR"] = os.getcwd()
        __write_env_file(secrets)
        log.info("Secrets successfully installed")
    except ImportError as missing:
        raise MissingDepsError(str(missing))
    except Exception as failure:
        log.error(f"Failed to fetch secrets from {secrets_url}")
        traceback.print_exc()
        log.error(failure)
        sys.exit(1)
def install_core_projects(package_url: str, package_name: str):
    """Install the core package from a custom index, exiting with status 2 on failure.

    Args:
        package_url: Index URL passed to pip with ``-i``.
        package_name: Name of the package to install (installed with ``--no-deps``).
    """
    log.info("Installing core project(s)")
    pip_command = ["pip", "install", package_name, "--no-deps", "-i", package_url]
    installed = execute_bash_command(pip_command)
    if not installed:
        sys.exit(2)
def mount_google_drive(data_path: str, mount_path: str = "/gdrive"):
    """Mount Google Drive and link one of its folders as ``./data``.

    Nothing is done when ``data`` already exists in the current directory.
    Otherwise Drive is mounted at *mount_path* and a ``data`` symlink pointing
    to ``<mount_path>/<data_path>`` is created in the current directory.

    Args:
        data_path: Folder inside the mounted Drive to link to.
        mount_path: Where to mount Drive; defaults to ``/gdrive``.

    Raises:
        SystemExit: With status 2 when ``google.colab`` cannot be imported.
    """
    try:
        if Path(os.getcwd()).joinpath("data").exists():
            log.info(f"Data directory already exists, skipping mount\n{os.listdir('data')}")
            return
        log.info(f"Mounting google drive to {mount_path}")
        from google.colab import drive
        drive.mount(mount_path)
        log.debug(f"Mounted google drive to {mount_path}. Create a symlink to {data_path}")
        os.symlink(src=os.path.join(mount_path, data_path), dst="data")
        log.info("Done mounting google drive")
    except ImportError as e:
        log.error(f"Failed to mount google drive. Reason: {e}")
        # sys.exit, not the `exit` helper injected by `site`, which is meant
        # for the interactive shell and is not guaranteed to be available.
        sys.exit(2)
def do_login():
    """Authenticate the current colab user, exiting with status 1 on failure.

    Calls ``google.colab.auth.authenticate_user()``. Any error, including a
    failed import of ``google.colab``, is printed with its traceback and ends
    the process.

    Raises:
        SystemExit: With status 1 when authentication fails.
    """
    log.info("Logging in to google cloud")
    try:
        from google.colab import auth
        auth.authenticate_user()
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a login failure.
        traceback.print_exc()
        log.error("Failed to login to google cloud, exiting")
        sys.exit(1)
def run(secrets_url: str, package_name: str, gdrive_data_path: str):
    """Set up the colab runtime end to end; a no-op outside colab.

    Steps, in order: log in, install the helper packages, load the secrets,
    install the core package from the GitLab index built from the
    ``GITLAB_PAT`` and ``GITLAB_PROJECT_ID`` environment variables, and mount
    Google Drive.

    Args:
        secrets_url: Passed to ``configure_secrets``.
        package_name: Core package to install.
        gdrive_data_path: Drive folder to link as ``./data``.

    Raises:
        SystemExit: With status 1 when a required dependency is missing.
    """
    if iscolab():
        log.info("Running in colab, setting up environment")
        try:
            do_login()
            install_extra_dependencies()
            configure_secrets(secrets_url)
            # Read the environment only after the secrets were loaded into it.
            gitlab_pat = os.environ["GITLAB_PAT"]
            gitlab_project = os.environ["GITLAB_PROJECT_ID"]
            index_url = __build_gitlab_pypi_url(gitlab_pat, gitlab_project)
            install_core_projects(index_url, package_name)
            mount_google_drive(gdrive_data_path)
            log.info("Done setting up environment")
        except MissingDepsError:
            traceback.print_exc()
            log.error("Missing required dependencies, exiting")
            sys.exit(1)
    else:
        log.info("Not running in colab, nothing to do")
if __name__ == '__main__':
    # run() needs three arguments; calling it bare always raised TypeError,
    # so take them from the command line instead.
    import argparse

    parser = argparse.ArgumentParser(
        description="Set up a colab runtime for the project")
    parser.add_argument("secrets_url",
                        help="secret version name to fetch the env file from")
    parser.add_argument("package_name",
                        help="core package to install from the GitLab index")
    parser.add_argument("gdrive_data_path",
                        help="Google Drive folder to link as ./data")
    cli_args = parser.parse_args()
    run(cli_args.secrets_url, cli_args.package_name, cli_args.gdrive_data_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment