Here's a review of the current quickstart and some suggestions for improvement.
Using my devspace as the environment:
https://github.com/faermanj/cluster-api-provider-aws/tree/add-dedicated-hosts
kind create cluster
# OK
kubectl cluster-info
Kubernetes control plane is running at https://127.0.0.1:38683
CoreDNS is running at https://127.0.0.1:38683/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy
# OK
clusterctl version
clusterctl version: &version.Info{Major:"1", Minor:"8", GitVersion:"v1.8.5", GitCommit:"09f2f6b1758bb4e6eb88122209717b2525469258", GitTreeState:"clean", BuildDate:"2024-11-05T16:50:29Z", GoVersion:"go1.22.8", Compiler:"gc", Platform:"linux/amd64"}
# OK
$ echo $AWS_REGION
us-east-1
$ aws sts get-caller-identity
{
    "UserId": "AIDAT5TK4ZONHNABNETAH",
    "Account": "269733383066",
    "Arn": "arn:aws:iam::269733383066:user/jufaerma"
}
# OK
clusterawsadm version
clusterawsadm version: &version.Info{Major:"", Minor:"", GitVersion:"", GitCommit:"", GitTreeState:"", BuildDate:"", GoVersion:"go1.23.2", AwsSdkVersion:"v1.55.5", Compiler:"gc", Platform:"linux/amd64"}
# TODO: Verify whether I should expect a "clean compile" to work like this (all version fields empty)
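# The empty fields above are probably just a local build without version ldflags.
# Rough sketch, assuming the repo Makefile has a clusterawsadm target that injects them
# (target name and output path are my assumption):
cd /workspaces/cluster-api-provider-aws
make clusterawsadm
./bin/clusterawsadm version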
$ clusterawsadm bootstrap iam create-cloudformation-stack
Attempting to create AWS CloudFormation stack cluster-api-provider-aws-sigs-k8s-io
Following resources are in the stack:
Resource |Type |Status
AWS::IAM::InstanceProfile |control-plane.cluster-api-provider-aws.sigs.k8s.io |CREATE_COMPLETE
AWS::IAM::InstanceProfile |controllers.cluster-api-provider-aws.sigs.k8s.io |CREATE_COMPLETE
AWS::IAM::InstanceProfile |nodes.cluster-api-provider-aws.sigs.k8s.io |CREATE_COMPLETE
AWS::IAM::ManagedPolicy |arn:aws:iam::269733383066:policy/control-plane.cluster-api-provider-aws.sigs.k8s.io |CREATE_COMPLETE
AWS::IAM::ManagedPolicy |arn:aws:iam::269733383066:policy/nodes.cluster-api-provider-aws.sigs.k8s.io |CREATE_COMPLETE
AWS::IAM::ManagedPolicy |arn:aws:iam::269733383066:policy/controllers.cluster-api-provider-aws.sigs.k8s.io |CREATE_COMPLETE
AWS::IAM::ManagedPolicy |arn:aws:iam::269733383066:policy/controllers-eks.cluster-api-provider-aws.sigs.k8s.io |CREATE_COMPLETE
AWS::IAM::Role |control-plane.cluster-api-provider-aws.sigs.k8s.io |CREATE_COMPLETE
AWS::IAM::Role |controllers.cluster-api-provider-aws.sigs.k8s.io |CREATE_COMPLETE
AWS::IAM::Role |eks-controlplane.cluster-api-provider-aws.sigs.k8s.io |CREATE_COMPLETE
AWS::IAM::Role |nodes.cluster-api-provider-aws.sigs.k8s.io |CREATE_COMPLETE
# OK
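# Optional sanity check, re-reading the stack status by the name printed above:
aws cloudformation describe-stacks \
  --stack-name cluster-api-provider-aws-sigs-k8s-io \
  --query 'Stacks[0].StackStatus' --output text
# Expect CREATE_COMPLETE (or UPDATE_COMPLETE on a re-run)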
export AWS_B64ENCODED_CREDENTIALS=$(clusterawsadm bootstrap credentials encode-as-profile)
echo $AWS_B64ENCODED_CREDENTIALS | base64 -d
[default]
aws_access_key_id = XXXXX
aws_secret_access_key = YYYYY
region = us-east-1
# OK, but this is odd: why does the provider need explicitly encoded credentials instead of picking them up from the environment?
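# My understanding (worth confirming): clusterctl init below stores this encoded profile in a
# Secret that the CAPA controller mounts, so the controller has static credentials even if the
# management cluster itself has no AWS identity. Sketch for checking it after init
# (secret and namespace names are my assumption):
kubectl -n capa-system get secret capa-manager-bootstrap-credentials \
  -o jsonpath='{.data.credentials}' | base64 -d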
clusterctl init --infrastructure aws
Fetching providers
Installing cert-manager version="v1.16.1"
Waiting for cert-manager to be available...
Installing provider="cluster-api" version="v1.8.5" targetNamespace="capi-system"
Installing provider="bootstrap-kubeadm" version="v1.8.5" targetNamespace="capi-kubeadm-bootstrap-system"
Installing provider="control-plane-kubeadm" version="v1.8.5" targetNamespace="capi-kubeadm-control-plane-system"
Installing provider="infrastructure-aws" version="v2.7.1" targetNamespace="capa-system"
[KubeAPIWarningLogger] spec.template.spec.affinity.nodeAffinity.preferredDuringSchedulingIgnoredDuringExecution[1].preference.matchExpressions[0].key: node-role.kubernetes.io/master is use "node-role.kubernetes.io/control-plane" instead
# OK
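# Not in the quickstart, but a cheap verification that the providers actually came up:
kubectl get pods -n cert-manager
kubectl get pods -n capi-system
kubectl get pods -n capa-system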
# TODO: add a step that picks KUBERNETES_VERSION from clusterawsadm ami list
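# Something like this would make the version choice explicit; the --os value is my assumption,
# any OS that clusterawsadm ami list actually returns AMIs for would do:
clusterawsadm ami list --kubernetes-version v1.30.5 --os ubuntu-22.04 --region "$AWS_REGION"
export KUBERNETES_VERSION=v1.30.5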
export AWS_CONTROL_PLANE_MACHINE_TYPE=t3.large
export AWS_NODE_MACHINE_TYPE=t3.large
# Create an AWS SSH key pair using the CLI
export AWS_SSH_KEY_NAME="capi-quickstart__${AWS_REGION}"
aws ec2 create-key-pair --key-name "$AWS_SSH_KEY_NAME" --query 'KeyMaterial' --output text > "$AWS_SSH_KEY_NAME.pem"
chmod 400 "$AWS_SSH_KEY_NAME.pem"
aws ec2 describe-key-pairs --key-names "$AWS_SSH_KEY_NAME"
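# Re-running the walkthrough fails here because the key pair already exists; a guard like this
# (my addition, not in the quickstart) keeps the step idempotent:
if ! aws ec2 describe-key-pairs --key-names "$AWS_SSH_KEY_NAME" >/dev/null 2>&1; then
  aws ec2 create-key-pair --key-name "$AWS_SSH_KEY_NAME" \
    --query 'KeyMaterial' --output text > "$AWS_SSH_KEY_NAME.pem"
  chmod 400 "$AWS_SSH_KEY_NAME.pem"
fi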
clusterctl generate cluster capi-quickstart --kubernetes-version 1.30.5 > capi-quickstart.yml
kubectl apply -f capi-quickstart.yml
...
# OK
kubectl get cluster
clusterctl describe cluster capi-quickstart
clusterctl get kubeconfig capi-quickstart > capi-quickstart.kubeconfig
# Cluster never gets ready / accessible
# Load balancer health check failed
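# Before SSHing in, the conditions usually narrow this down; rough sketch from the workstation:
clusterctl describe cluster capi-quickstart --show-conditions all
kubectl get awscluster,awsmachines,machines
kubectl describe awscluster capi-quickstart   # control plane endpoint / load balancer conditions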
# SSHing into the instance:
/var/log/cloud-init-output.log
[2024-12-05 15:23:08] [certs] Generating "etcd/peer" certificate and key
[2024-12-05 15:23:08] [certs] etcd/peer serving cert is signed for DNS names [ip-10-0-255-33.ec2.internal localhost] and IPs [10.0.255.33 127.0.0.1 ::1]
[2024-12-05 15:23:09] [certs] Generating "etcd/healthcheck-client" certificate and key
[2024-12-05 15:23:09] [certs] Generating "apiserver-etcd-client" certificate and key
[2024-12-05 15:23:09] [certs] Using the existing "sa" key
[2024-12-05 15:23:09] [kubeconfig] Using kubeconfig folder "/etc/kubernetes"
[2024-12-05 15:23:09] [kubeconfig] Writing "admin.conf" kubeconfig file
[2024-12-05 15:23:09] [kubeconfig] Writing "super-admin.conf" kubeconfig file
[2024-12-05 15:23:09] [kubeconfig] Writing "kubelet.conf" kubeconfig file
[2024-12-05 15:23:09] [kubeconfig] Writing "controller-manager.conf" kubeconfig file
[2024-12-05 15:23:10] [kubeconfig] Writing "scheduler.conf" kubeconfig file
[2024-12-05 15:23:10] [etcd] Creating static Pod manifest for local etcd in "/etc/kubernetes/manifests"
[2024-12-05 15:23:10] [control-plane] Using manifest folder "/etc/kubernetes/manifests"
[2024-12-05 15:23:10] [control-plane] Creating static Pod manifest for "kube-apiserver"
[2024-12-05 15:23:10] [control-plane] Creating static Pod manifest for "kube-controller-manager"
[2024-12-05 15:23:10] [control-plane] Creating static Pod manifest for "kube-scheduler"
[2024-12-05 15:23:10] [kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env"
[2024-12-05 15:23:10] [kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml"
[2024-12-05 15:23:10] [kubelet-start] Starting the kubelet
[2024-12-05 15:23:10] [wait-control-plane] Waiting for the kubelet to boot up the control plane as static Pods from directory "/etc/kubernetes/manifests"
[2024-12-05 15:23:10] [kubelet-check] Waiting for a healthy kubelet at http://127.0.0.1:10248/healthz. This can take up to 4m0s
[2024-12-05 15:23:11] [kubelet-check] The kubelet is healthy after 1.001024842s
[2024-12-05 15:23:11] [api-check] Waiting for a healthy API server. This can take up to 4m0s
[2024-12-05 15:27:11] [api-check] The API server is not healthy after 4m0.091218898s
[2024-12-05 15:27:11]
[2024-12-05 15:27:11] Unfortunately, an error has occurred:
[2024-12-05 15:27:11] context deadline exceeded
[2024-12-05 15:27:11]
[2024-12-05 15:27:11] This error is likely caused by:
[2024-12-05 15:27:11] - The kubelet is not running
[2024-12-05 15:27:11] - The kubelet is unhealthy due to a misconfiguration of the node in some way (required cgroups disabled)
[2024-12-05 15:27:11]
[2024-12-05 15:27:11] If you are on a systemd-powered system, you can try to troubleshoot the error with the following commands:
[2024-12-05 15:27:11] - 'systemctl status kubelet'
[2024-12-05 15:27:11] - 'journalctl -xeu kubelet'
[2024-12-05 15:27:11]
[2024-12-05 15:27:11] Additionally, a control plane component may have crashed or exited when started by the container runtime.
[2024-12-05 15:27:11] To troubleshoot, list all containers using your preferred container runtimes CLI.
[2024-12-05 15:27:11] Here is one example how you may list all running Kubernetes containers by using crictl:
[2024-12-05 15:27:11] - 'crictl --runtime-endpoint unix:///var/run/containerd/containerd.sock ps -a | grep kube | grep -v pause'
[2024-12-05 15:27:11] Once you have found the failing container, you can inspect its logs with:
[2024-12-05 15:27:11] - 'crictl --runtime-endpoint unix:///var/run/containerd/containerd.sock logs CONTAINERID'
[2024-12-05 15:27:11] error execution phase wait-control-plane: could not initialize a Kubernetes cluster
[2024-12-05 15:27:11] To see the stack trace of this error execute with --v=5 or higher
[2024-12-05 15:27:11] 2024-12-05 15:27:11,565 - cc_scripts_user.py[WARNING]: Failed to run module scripts_user (scripts in /var/lib/cloud/instance/scripts)
"Unable to register node with API server"
● kubelet.service - kubelet: The Kubernetes Node Agent
Loaded: loaded (/usr/lib/systemd/system/kubelet.service; enabled; preset: enabled)
Drop-In: /usr/lib/systemd/system/kubelet.service.d
└─10-kubeadm.conf
Active: active (running) since Thu 2024-12-05 15:33:52 UTC; 1min 30s ago
Docs: https://kubernetes.io/docs/
Main PID: 954 (kubelet)
Tasks: 12 (limit: 9367)
Memory: 35.7M (peak: 36.2M)
CPU: 2.581s
CGroup: /system.slice/kubelet.service
└─954 /usr/bin/kubelet --bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf --co>
Dec 05 15:35:08 ip-10-0-255-33 kubelet[954]: I1205 15:35:08.570321 954 kubelet_node_status.go:73] "Attempting to register node" node="ip-10->
Dec 05 15:35:08 ip-10-0-255-33 kubelet[954]: E1205 15:35:08.574519 954 kubelet_node_status.go:96] "Unable to register node with API server" >
Dec 05 15:35:12 ip-10-0-255-33 kubelet[954]: E1205 15:35:12.913876 954 eviction_manager.go:282] "Eviction manager: failed to get summary sta>
Dec 05 15:35:13 ip-10-0-255-33 kubelet[954]: E1205 15:35:13.960193 954 event.go:368] "Unable to write event (may retry after sleeping)" err=>
Dec 05 15:35:15 ip-10-0-255-33 kubelet[954]: I1205 15:35:15.576475 954 kubelet_node_status.go:73] "Attempting to register node" node="ip-10->
Dec 05 15:35:15 ip-10-0-255-33 kubelet[954]: E1205 15:35:15.580341 954 kubelet_node_status.go:96] "Unable to register node with API server" >
Dec 05 15:35:15 ip-10-0-255-33 kubelet[954]: E1205 15:35:15.693188 954 controller.go:145] "Failed to ensure lease exists, will retry" err="G>
Dec 05 15:35:22 ip-10-0-255-33 kubelet[954]: I1205 15:35:22.582828 954 kubelet_node_status.go:73] "Attempting to register node" node="ip-10->
Dec 05 15:35:22 ip-10-0-255-33 kubelet[954]: E1205 15:35:22.586453 954 kubelet_node_status.go:96] "Unable to register node with API server" >
Dec 05 15:35:22 ip-10-0-255-33 kubelet[954]: E1205 15:35:22.914755 954 eviction_manager.go:282] "Eviction manager: failed to get summary sta>
# Root cause still unclear: the kubelet is running but can never register with the API server.
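# What I would check next on the instance (sketch; the container ID is a placeholder):
sudo crictl --runtime-endpoint unix:///var/run/containerd/containerd.sock ps -a | grep -E 'kube-apiserver|etcd'
sudo crictl --runtime-endpoint unix:///var/run/containerd/containerd.sock logs <apiserver-container-id>
grep 'server:' /etc/kubernetes/kubelet.conf   # which endpoint the kubelet is trying to reach
curl -k https://localhost:6443/healthz        # is the API server answering locally at all?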
kubectl --kubeconfig=./capi-quickstart.kubeconfig \
apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.1/manifests/calico.yaml
Suggestions:
- Reduce choices to make it a real "quick start": assume kind, and mention that production "should use a well-architected cluster".
- Remove the other providers? Many boxes do not even have AWS.
- Fix the bootstrap issue?