Skip to content

Instantly share code, notes, and snippets.

@Enweave
Created February 4, 2025 13:13
Show Gist options
  • Save Enweave/195983e0b68f970552a8c71d19cad26d to your computer and use it in GitHub Desktop.
Save Enweave/195983e0b68f970552a8c71d19cad26d to your computer and use it in GitHub Desktop.
GitLab: get files by extension
import logging
import shutil
from typing import List
import requests
import os
import base64
# --- Configuration ---
# Base URL of the GitLab instance; the API paths are appended to it.
GITLAB_URL = "https://gitlab.com"
# Personal access token sent in the PRIVATE-TOKEN header.
# NOTE(review): the value below looks like a placeholder - replace it with your own token.
PRIVATE_TOKEN = "iseedeadpeople"
# Group whose projects are searched. A falsy value (e.g. None) makes
# get_projects() query the instance-wide /projects endpoint instead.
GROUP_ID = "mygroup"
# File name suffixes to search for.
FILE_EXTENSIONS = (".yaml", ".yml")
# Directory the downloaded files are saved to. retrieve() deletes and
# recreates it on every run.
OUTPUT_DIR = "yaml_files"
# Log at INFO level and above, prefixing each message with timestamp and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class Project:
    """A GitLab project plus the matching file paths discovered in it."""

    def __init__(self, project_id: int, project_name: str, web_url: str, branch: str = "main"):
        # Identifying data, stored exactly as supplied by the caller.
        self.project_id, self.project_name = project_id, project_name
        self.web_url = web_url
        # Branch name associated with the project; "main" unless overridden.
        self.branch = branch
        # Starts out empty; each instance gets its own list.
        self.file_paths = []

    def __str__(self):
        # Short "<id>: <name>" form used in log messages.
        return f"{self.project_id}: {self.project_name}"
def get_main_branch(project_id, headers, timeout: float = 30.0) -> str:
    """Return the default branch name of a GitLab project.

    Args:
        project_id: ID of the project; interpolated into the API URL.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        The ``default_branch`` value of the project resource, or
        ``"master"`` when that field is missing or empty.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or a non-success HTTP status.
    """
    project_url = f"{GITLAB_URL}/api/v4/projects/{project_id}"
    response = requests.get(project_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # ``dict.get`` with a default only covers a *missing* key; a key that is
    # present with a null value would otherwise hand ``None`` to callers that
    # expect a branch name.
    return response.json().get("default_branch") or "master"
def file_is_matching_extension(file_name) -> bool:
    """Tell whether ``file_name`` ends with one of ``FILE_EXTENSIONS``."""
    # ``str.endswith`` accepts a tuple of suffixes and is true if any matches.
    return file_name.endswith(tuple(FILE_EXTENSIONS))
def find_files(project_id, headers, timeout: float = 30.0):
    """List the repository paths of a project that match ``FILE_EXTENSIONS``.

    Requests the project's repository tree recursively, 100 entries per
    page, and keeps asking for the next page until an empty page comes back.

    Args:
        project_id: ID of the project; interpolated into the API URL.
        headers: HTTP headers sent with every request.
        timeout: Seconds to wait for each response before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        A list of ``path`` values for tree entries of type ``"blob"`` whose
        path passes ``file_is_matching_extension``.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or a non-success HTTP status on any page.
    """
    files_url = f"{GITLAB_URL}/api/v4/projects/{project_id}/repository/tree"
    matching_paths = []
    page = 1
    while True:
        params = {"ref_type": "heads", "recursive": True, "per_page": 100, "page": page}
        response = requests.get(files_url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        entries = response.json()
        if not entries:
            # An empty page marks the end of the listing.
            break
        # Keep only "blob" entries; other entry types are skipped.
        matching_paths.extend(
            entry["path"]
            for entry in entries
            if entry["type"] == "blob" and file_is_matching_extension(entry["path"])
        )
        page += 1
    return matching_paths
def get_projects(group_id, headers, timeout: float = 30.0) -> List[Project]:
    """Collect projects and, for each one, its branch and matching file paths.

    Args:
        group_id: Group whose projects are listed. A falsy value switches to
            the instance-wide ``/projects`` endpoint.
        headers: HTTP headers sent with every request.
        timeout: Seconds to wait for each project-listing response before
            giving up, so a stalled connection cannot block the script forever.

    Returns:
        A list of ``Project`` objects. ``branch`` is filled from
        ``get_main_branch`` and ``file_paths`` from ``find_files``; when either
        lookup fails the error is logged and the attribute keeps the value set
        by ``Project.__init__``.

    Raises:
        requests.exceptions.RequestException: When listing the projects
            themselves fails.
    """
    if group_id:
        projects_url = f"{GITLAB_URL}/api/v4/groups/{group_id}/projects"
    else:
        projects_url = f"{GITLAB_URL}/api/v4/projects"

    projects = []
    page = 1
    while True:
        params = {"page": page, "per_page": 100}
        response = requests.get(projects_url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        projects_json = response.json()
        if not projects_json:
            break  # No more projects
        projects.extend(
            Project(project_id=project["id"], project_name=project["name"], web_url=project["web_url"])
            for project in projects_json
        )
        page += 1

    for project in projects:
        try:
            project.branch = get_main_branch(project.project_id, headers)
        except Exception as e:
            logging.error(f"Error getting main branch for project {project.project_id}: {e}")
        logging.info(f"Getting file paths for project {project}")
        # A single project whose tree cannot be listed must not abort the whole
        # run; log it and move on, mirroring the branch lookup above.
        # NOTE(review): GitLab reportedly answers 404 for repositories without
        # commits - confirm against the API docs.
        try:
            project.file_paths = find_files(project.project_id, headers)
        except requests.exceptions.RequestException as e:
            logging.error(f"Error getting file paths for project {project.project_id}: {e}")
    return projects
def get_file_content(project: Project, file_path: str, headers: dict, timeout: float = 30.0):
    """Fetch one repository file and return its text.

    Args:
        project: Project to read from; ``project_id`` and ``branch`` are used.
        file_path: Repository-relative path of the file.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        The file content decoded from base64 and then from UTF-8 (an empty
        string for an empty file), or ``None`` when the response carries no
        ``content`` field.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or a non-success HTTP status.
        UnicodeDecodeError: When the decoded bytes are not valid UTF-8.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import quote

    # The path travels as a single URL segment, so every reserved character
    # (not just "/") has to be percent-encoded; otherwise "#", "?" or "%" in a
    # file name would change the meaning of the URL.
    urlencoded_file_path = quote(file_path, safe="")
    file_url = f"{GITLAB_URL}/api/v4/projects/{project.project_id}/repository/files/{urlencoded_file_path}"
    params = {"ref": project.branch}  # Or specify a branch, tag, or commit SHA
    response = requests.get(file_url, headers=headers, params=params, timeout=timeout)
    response.raise_for_status()
    content_encoded = response.json().get("content")
    # Distinguish "no content field" from "empty file": only the former is
    # reported as None.
    if content_encoded is None:
        return None
    return base64.b64decode(content_encoded).decode("utf-8")
def _save_project_file(project: Project, file_path: str, file_content: str) -> None:
    """Write one downloaded file to ``OUTPUT_DIR/<project name>/<file path>``."""
    output_file_path = os.path.join(OUTPUT_DIR, str(project.project_name), file_path)
    # The repository path may contain sub-directories that do not exist yet.
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write(file_content)


def retrieve():
    """Download every matching file of every project into ``OUTPUT_DIR``.

    Collects the projects via ``get_projects``, wipes and recreates
    ``OUTPUT_DIR``, then fetches each listed file with ``get_file_content``
    and stores it under a per-project sub-directory. Failures for a single
    file are logged and do not stop the remaining downloads.
    """
    headers = {"PRIVATE-TOKEN": PRIVATE_TOKEN}
    retrieved_projects = get_projects(GROUP_ID, headers)

    # Start from a clean directory so files from earlier runs do not linger.
    if os.path.exists(OUTPUT_DIR):
        shutil.rmtree(OUTPUT_DIR)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for project in retrieved_projects:
        logging.info(f"Processing project: {project}")
        for file_path in project.file_paths:
            try:
                file_content = get_file_content(project, file_path, headers)
                # Only None means "nothing came back"; an empty string is a
                # valid (empty) file and must be written, not reported as an
                # error.
                if file_content is None:
                    logging.error(f"Failed to get content for: {file_path} from project {project}")
                    continue
                _save_project_file(project, file_path, file_content)
                logging.info(f"Downloaded: {file_path} from project {project}")
            except requests.exceptions.RequestException as e:
                logging.error(f"Error downloading {file_path} from project {project}: {e}")
            except Exception as e:
                logging.error(f"A general error occurred processing file {file_path} from project {project}: {e}")
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    retrieve()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment