Launch gpt-oss-120b with Ollama on AWS Sydney region
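A single CloudFormation template that provisions a VPC and a GPU EC2 instance (Spot or On-Demand), installs Ollama, pulls a default model, and signals completion through a CloudFormation wait condition.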
AWSTemplateFormatVersion: '2010-09-09'
Description: 'GPU instance for Ollama with flexible instance type selection (P4d/P4de A100 or G5 A10G)'
Parameters:
  InstanceType:
    Type: String
    Default: 'g5.12xlarge'
    AllowedValues:
      # Hourly rates are indicative On-Demand prices; verify current ap-southeast-2 rates before launching.
      - 'g5.2xlarge'    # 1x A10G (24GB) - ~$1.006/hr
      - 'g5.4xlarge'    # 1x A10G (24GB) - ~$1.624/hr
      - 'g5.12xlarge'   # 4x A10G (96GB total) - ~$5.672/hr
      - 'g5.24xlarge'   # 4x A10G (96GB total) - ~$10.888/hr
      - 'g5.48xlarge'   # 8x A10G (192GB total) - ~$16.288/hr
      - 'p4d.24xlarge'  # 8x A100 40GB (320GB total) - ~$32.77/hr
      - 'p4de.24xlarge' # 8x A100 80GB (640GB total) - ~$40.96/hr
    Description: 'GPU instance type - G5 for cost-effectiveness, P4d/P4de for maximum power'
  KeyPairName:
    Type: AWS::EC2::KeyPair::KeyName
    Description: 'EC2 Key Pair for SSH access to the instance'
  UseSpotInstance:
    Type: String
    Default: 'true'
    AllowedValues: ['true', 'false']
    Description: 'Use Spot instance for 60-70% cost savings'
  SpotMaxPrice:
    Type: String
    Default: '3.00'
    Description: 'Maximum Spot price per hour (adjust based on instance type)'
  VpcCidr:
    Type: String
    Default: '10.0.0.0/16'
    Description: 'CIDR block for the VPC'
  AllowedCidrBlock:
    Type: String
    Default: '0.0.0.0/0'
    Description: 'CIDR block allowed to access Ollama API'
  DefaultModel:
    Type: String
    Default: 'llama3.1:8b'
    Description: 'Default Ollama model (use smaller models for G5 instances)'
  ProjectName:
    Type: String
    Default: 'ollama-gpu'
    Description: 'Project name for resource tagging'
  Environment:
    Type: String
    Default: 'development'
    AllowedValues: ['development', 'staging', 'production']
    Description: 'Environment designation'
Conditions:
  UseSpot: !Equals [!Ref UseSpotInstance, 'true']
  UseOnDemand: !Not [!Equals [!Ref UseSpotInstance, 'true']]
  IsP4Instance: !Or
    - !Equals [!Ref InstanceType, 'p4d.24xlarge']
    - !Equals [!Ref InstanceType, 'p4de.24xlarge']
  IsLargeG5: !Or
    - !Equals [!Ref InstanceType, 'g5.24xlarge']
    - !Equals [!Ref InstanceType, 'g5.48xlarge']
Mappings:
  RegionMap:
    ap-southeast-2:
      # Deep Learning AMI Ubuntu 22.04 - verify this ID is current for ap-southeast-2 before deploying
      AMI: 'ami-0c02fb55956c7d316'
  InstanceSpecs:
    g5.2xlarge:
      GPUs: 1
      GPUMemory: 24
      RecommendedModel: 'llama3.1:8b'
    g5.4xlarge:
      GPUs: 1
      GPUMemory: 24
      RecommendedModel: 'llama3.1:8b'
    g5.12xlarge:
      GPUs: 4
      GPUMemory: 96
      RecommendedModel: 'llama3.1:70b'
    g5.24xlarge:
      GPUs: 4
      GPUMemory: 96
      RecommendedModel: 'llama3.1:70b'
    g5.48xlarge:
      GPUs: 8
      GPUMemory: 192
      RecommendedModel: 'llama3.1:405b'
    p4d.24xlarge:
      GPUs: 8
      GPUMemory: 320
      RecommendedModel: 'gpt-oss:120b'
    p4de.24xlarge:
      GPUs: 8
      GPUMemory: 640
      RecommendedModel: 'gpt-oss:120b'
Resources:
  # VPC and Networking
  VPC:
    Type: AWS::EC2::VPC
    Properties:
      CidrBlock: !Ref VpcCidr
      EnableDnsHostnames: true
      EnableDnsSupport: true
      Tags:
        - Key: Name
          Value: !Sub '${ProjectName}-${Environment}-vpc'
  InternetGateway:
    Type: AWS::EC2::InternetGateway
    Properties:
      Tags:
        - Key: Name
          Value: !Sub '${ProjectName}-${Environment}-igw'
  AttachGateway:
    Type: AWS::EC2::VPCGatewayAttachment
    Properties:
      VpcId: !Ref VPC
      InternetGatewayId: !Ref InternetGateway
  PublicSubnet:
    Type: AWS::EC2::Subnet
    Properties:
      VpcId: !Ref VPC
      CidrBlock: '10.0.1.0/24'
      AvailabilityZone: !Select [0, !GetAZs '']
      MapPublicIpOnLaunch: true
      Tags:
        - Key: Name
          Value: !Sub '${ProjectName}-${Environment}-public-subnet'
  PublicRouteTable:
    Type: AWS::EC2::RouteTable
    Properties:
      VpcId: !Ref VPC
      Tags:
        - Key: Name
          Value: !Sub '${ProjectName}-${Environment}-public-rt'
  PublicRoute:
    Type: AWS::EC2::Route
    DependsOn: AttachGateway
    Properties:
      RouteTableId: !Ref PublicRouteTable
      DestinationCidrBlock: '0.0.0.0/0'
      GatewayId: !Ref InternetGateway
  PublicSubnetRouteTableAssociation:
    Type: AWS::EC2::SubnetRouteTableAssociation
    Properties:
      SubnetId: !Ref PublicSubnet
      RouteTableId: !Ref PublicRouteTable
  # Security Group
  OllamaSecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      GroupDescription: 'Security group for Ollama GPU instance'
      VpcId: !Ref VPC
      SecurityGroupIngress:
        - IpProtocol: tcp
          FromPort: 22
          ToPort: 22
          CidrIp: !Ref AllowedCidrBlock
          Description: 'SSH access'
        - IpProtocol: tcp
          FromPort: 11434
          ToPort: 11434
          CidrIp: !Ref AllowedCidrBlock
          Description: 'Ollama API access'
      SecurityGroupEgress:
        - IpProtocol: '-1'
          CidrIp: '0.0.0.0/0'
          Description: 'All outbound traffic'
      Tags:
        - Key: Name
          Value: !Sub '${ProjectName}-${Environment}-ollama-sg'
  # IAM Role
  GPUInstanceRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: ec2.amazonaws.com
            Action: sts:AssumeRole
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy
      Policies:
        - PolicyName: CloudFormationSignaling
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Effect: Allow
                Action:
                  - cloudformation:SignalResource
                Resource: !Sub 'arn:aws:cloudformation:${AWS::Region}:${AWS::AccountId}:stack/${AWS::StackName}/*'
      Tags:
        - Key: Name
          Value: !Sub '${ProjectName}-${Environment}-gpu-instance-role'
  GPUInstanceProfile:
    Type: AWS::IAM::InstanceProfile
    Properties:
      Roles:
        - !Ref GPUInstanceRole
  # Wait Condition
  OllamaWaitConditionHandle:
    Type: AWS::CloudFormation::WaitConditionHandle
  OllamaWaitCondition:
    Type: AWS::CloudFormation::WaitCondition
    # No DependsOn here: the instance resources below are conditional (Spot vs On-Demand),
    # and DependsOn cannot reference a resource whose condition evaluates to false.
    Properties:
      Handle: !Ref OllamaWaitConditionHandle
      Timeout: '1800' # 30 minutes
      Count: 1
  # Launch Template
  GPULaunchTemplate:
    Type: AWS::EC2::LaunchTemplate
    Properties:
      LaunchTemplateName: !Sub '${ProjectName}-${Environment}-gpu-template'
      LaunchTemplateData:
        ImageId: !FindInMap [RegionMap, !Ref 'AWS::Region', AMI]
        InstanceType: !Ref InstanceType
        KeyName: !Ref KeyPairName
        IamInstanceProfile:
          Name: !Ref GPUInstanceProfile
        SecurityGroupIds:
          - !Ref OllamaSecurityGroup
        BlockDeviceMappings:
          - DeviceName: /dev/sda1
            Ebs:
              VolumeSize: !If [IsP4Instance, 500, 200]
              VolumeType: gp3
              DeleteOnTermination: true
              Encrypted: true
        UserData:
          # Two-argument !Sub form: !FindInMap cannot appear inside a Sub string, so the
          # mapping lookups are injected as ${ExpectedGpus}, ${RecommendedModelName}, ${GpuMemoryTotal}.
          Fn::Base64: !Sub
            - |
              #!/bin/bash
              set -e

              # Logging setup
              LOGFILE="/var/log/ollama-deployment.log"
              exec 1>>"$LOGFILE" 2>&1
              log_info() {
                echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO: $1"
              }
              signal_success() {
                curl -X PUT -H 'Content-Type:' \
                  --data-binary '{"Status": "SUCCESS","Reason": "Ollama deployment completed successfully","UniqueId": "ollama-deployment","Data": "Success"}' \
                  "${OllamaWaitConditionHandle}"
              }
              signal_failure() {
                curl -X PUT -H 'Content-Type:' \
                  --data-binary '{"Status": "FAILURE","Reason": "'"$1"'","UniqueId": "ollama-deployment","Data": "Failed"}' \
                  "${OllamaWaitConditionHandle}"
                exit 1
              }
              trap 'signal_failure "Deployment failed at line $LINENO"' ERR
              log_info "Starting GPU instance deployment - ${InstanceType}"

              # System updates (nvidia-smi ships with the GPU driver on the Deep Learning AMI; it is not an apt package)
              log_info "Updating system packages"
              apt-get update -y
              apt-get upgrade -y
              apt-get install -y curl wget jq htop

              # Verify GPU availability
              log_info "Checking GPU status"
              if ! nvidia-smi; then
                signal_failure "NVIDIA GPUs not detected"
              fi
              GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
              EXPECTED_GPUS=${ExpectedGpus}
              if [ "$GPU_COUNT" -ne "$EXPECTED_GPUS" ]; then
                signal_failure "Expected $EXPECTED_GPUS GPUs but found $GPU_COUNT"
              fi
              log_info "Detected $GPU_COUNT GPUs as expected"
              # Install Docker (for optional containerized workloads; Ollama itself runs natively below).
              # apt-key is deprecated on Ubuntu 22.04, so use keyring-based repository setup instead.
              log_info "Installing Docker"
              install -m 0755 -d /etc/apt/keyrings
              curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg
              echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" \
                > /etc/apt/sources.list.d/docker.list
              apt-get update -y
              apt-get install -y docker-ce docker-ce-cli containerd.io

              # Install NVIDIA Container Toolkit (the old nvidia-docker apt repo is deprecated;
              # this uses the current libnvidia-container repository)
              log_info "Installing NVIDIA Container Toolkit"
              curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
                gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
              curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
                sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
                tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
              apt-get update -y
              apt-get install -y nvidia-container-toolkit

              # Configure Docker for GPU
              cat > /etc/docker/daemon.json << 'EOF'
              {
                "default-runtime": "nvidia",
                "runtimes": {
                  "nvidia": {
                    "path": "nvidia-container-runtime",
                    "runtimeArgs": []
                  }
                }
              }
              EOF
              systemctl restart docker
              systemctl enable docker
              usermod -aG docker ubuntu
              # Install Ollama (the official install script also creates an 'ollama' user and a systemd unit)
              log_info "Installing Ollama"
              curl -fsSL https://ollama.com/install.sh | sh

              # Ensure the ollama user exists; guard the useradd so set -e does not abort
              # when the install script has already created it
              id -u ollama >/dev/null 2>&1 || useradd -r -s /bin/false -m -d /usr/share/ollama ollama

              # Overwrite the service unit so the API listens on all interfaces
              cat > /etc/systemd/system/ollama.service << 'EOF'
              [Unit]
              Description=Ollama Service
              After=network-online.target
              Wants=network-online.target
              [Service]
              ExecStart=/usr/local/bin/ollama serve
              User=ollama
              Group=ollama
              Restart=always
              RestartSec=3
              Environment="OLLAMA_HOST=0.0.0.0:11434"
              Environment="OLLAMA_ORIGINS=*"
              [Install]
              WantedBy=multi-user.target
              EOF

              # (Re)start Ollama with the new unit
              systemctl daemon-reload
              systemctl enable ollama
              systemctl restart ollama

              # Wait for Ollama to be ready
              log_info "Waiting for Ollama service to start"
              for i in {1..60}; do
                if curl -f http://localhost:11434/api/tags >/dev/null 2>&1; then
                  log_info "Ollama service is ready"
                  break
                fi
                if [ $i -eq 60 ]; then
                  signal_failure "Ollama service failed to start within timeout"
                fi
                sleep 5
              done

              # Download the requested model, falling back to the recommendation for this instance type
              RECOMMENDED_MODEL="${RecommendedModelName}"
              DEFAULT_MODEL="${DefaultModel}"
              log_info "Downloading model: $DEFAULT_MODEL"
              if ! timeout 1800 ollama pull "$DEFAULT_MODEL"; then
                log_info "Failed to download $DEFAULT_MODEL, trying recommended model: $RECOMMENDED_MODEL"
                if ! timeout 1800 ollama pull "$RECOMMENDED_MODEL"; then
                  signal_failure "Failed to download any model"
                fi
              fi
              # Create startup summary (an IMDSv2 token is required to read instance metadata)
              IMDS_TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 300")
              PUBLIC_IP=$(curl -s -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" http://169.254.169.254/latest/meta-data/public-ipv4)
              cat > /home/ubuntu/deployment-summary.txt << EOF
              ==========================================
              OLLAMA GPU DEPLOYMENT SUMMARY
              ==========================================
              Instance Type: ${InstanceType}
              GPUs: $GPU_COUNT x $(nvidia-smi --query-gpu=name --format=csv,noheader,nounits | head -1)
              GPU Memory: ${GpuMemoryTotal}GB total
              Ollama Endpoint: http://$PUBLIC_IP:11434
              Available Models:
              $(ollama list)
              Test Commands:
              curl http://localhost:11434/api/tags
              ollama run $DEFAULT_MODEL
              GPU Status:
              $(nvidia-smi)
              ==========================================
              EOF
              chown ubuntu:ubuntu /home/ubuntu/deployment-summary.txt
              log_info "Deployment completed successfully"
              signal_success
            - ExpectedGpus: !FindInMap [InstanceSpecs, !Ref InstanceType, GPUs]
              RecommendedModelName: !FindInMap [InstanceSpecs, !Ref InstanceType, RecommendedModel]
              GpuMemoryTotal: !FindInMap [InstanceSpecs, !Ref InstanceType, GPUMemory]
        TagSpecifications:
          - ResourceType: instance
            Tags:
              - Key: Name
                Value: !Sub '${ProjectName}-${Environment}-ollama-gpu'
              - Key: Project
                Value: !Ref ProjectName
              - Key: Environment
                Value: !Ref Environment
              - Key: InstanceType
                Value: !Ref InstanceType
  # GPU Instance - a resource Type cannot be selected with !If, so the Spot and
  # On-Demand variants are declared as separate conditional resources.
  SpotGPUFleet:
    Type: AWS::EC2::SpotFleet
    Condition: UseSpot
    Properties:
      SpotFleetRequestConfigData:
        # This IAM role is not created by the template and must already exist in the account
        IamFleetRole: !Sub 'arn:aws:iam::${AWS::AccountId}:role/aws-ec2-spot-fleet-tagging-role'
        AllocationStrategy: 'diversified'
        TargetCapacity: 1
        SpotPrice: !Ref SpotMaxPrice
        LaunchTemplateConfigs:
          - LaunchTemplateSpecification:
              LaunchTemplateId: !Ref GPULaunchTemplate
              Version: !GetAtt GPULaunchTemplate.LatestVersionNumber
            Overrides:
              - InstanceType: !Ref InstanceType
                SubnetId: !Ref PublicSubnet
                WeightedCapacity: 1
        ReplaceUnhealthyInstances: true
        Type: 'maintain'
  GPUInstance:
    Type: AWS::EC2::Instance
    Condition: UseOnDemand
    Properties:
      LaunchTemplate:
        LaunchTemplateId: !Ref GPULaunchTemplate
        Version: !GetAtt GPULaunchTemplate.LatestVersionNumber
      SubnetId: !Ref PublicSubnet
Outputs:
  InstanceType:
    Description: 'Selected GPU instance type'
    Value: !Ref InstanceType
  ExpectedGPUs:
    Description: 'Number of GPUs in the instance'
    Value: !FindInMap [InstanceSpecs, !Ref InstanceType, GPUs]
  TotalGPUMemory:
    Description: 'Total GPU memory available'
    Value: !Sub
      - '${GpuMemoryTotal}GB'
      - GpuMemoryTotal: !FindInMap [InstanceSpecs, !Ref InstanceType, GPUMemory]
  RecommendedModel:
    Description: 'Recommended model for this instance type'
    Value: !FindInMap [InstanceSpecs, !Ref InstanceType, RecommendedModel]
  OllamaEndpoint:
    Description: 'Ollama API endpoint (get IP from EC2 console)'
    Value: 'http://[INSTANCE-PUBLIC-IP]:11434'
  SSHCommand:
    Description: 'SSH command to connect to the instance'
    Value: !Sub 'ssh -i ${KeyPairName}.pem ubuntu@[INSTANCE-PUBLIC-IP]'
  EstimatedCostPerHour:
    Description: 'Estimated cost per hour for selected instance type'
    Value: !If
      - UseSpot
      - 'Spot: ~30-70% of On-Demand pricing (varies by availability)'
      - 'On-Demand pricing - check the AWS pricing page for current rates'
  CostOptimizationTips:
    Description: 'Tips to reduce costs'
    Value: |
      1. Use Spot instances for development/training (60-70% savings)
      2. Stop instances when not in use
      3. Consider smaller G5 instances for lighter workloads
      4. Use Savings Plans for predictable workloads
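
To deploy, here is a minimal sketch using the AWS CLI; it assumes the template is saved as ollama-gpu.yaml and that an EC2 key pair named my-key already exists in ap-southeast-2 (both names are placeholders):

# CAPABILITY_IAM is required because the template creates an IAM role
aws cloudformation create-stack \
  --stack-name ollama-gpu-dev \
  --template-body file://ollama-gpu.yaml \
  --parameters ParameterKey=KeyPairName,ParameterValue=my-key \
               ParameterKey=InstanceType,ParameterValue=g5.12xlarge \
  --capabilities CAPABILITY_IAM \
  --region ap-southeast-2

# The wait condition allows up to 30 minutes for GPU checks and the model download
aws cloudformation wait stack-create-complete --stack-name ollama-gpu-dev --region ap-southeast-2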
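Once the stack completes, the API can be exercised directly with the standard Ollama endpoints; INSTANCE_IP below is a placeholder for the public IP shown in the EC2 console:

# List the models the instance has pulled
curl http://INSTANCE_IP:11434/api/tags

# One-shot generation against the default model (the model name must match what was pulled)
curl http://INSTANCE_IP:11434/api/generate -d '{
  "model": "llama3.1:8b",
  "prompt": "Why is the sky blue?",
  "stream": false
}'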