@apenney
Created December 2, 2025 20:50
A coworker of mine is trying to build the infrastructure for a service we're writing, and things are going horribly wrong!
He has a RestateDeployment that looks like this (cut down to remove noise):
```
apiVersion: restate.dev/v1beta1
kind: RestateDeployment
metadata:
  annotations:
    reloader.stakater.com/auto: "true"
  creationTimestamp: "2025-11-27T23:19:18Z"
  finalizers:
  - deployments.restate.dev
  generation: 10
  labels:
    app.kubernetes.io/instance: relay-backend
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/name: relay-backend
    app.kubernetes.io/version: 1.0.0
  name: relay-backend
  namespace: relay
  resourceVersion: "637555379"
  uid: d6983c95-5086-4609-b31f-7bd6043013e3
spec:
  replicas: 1
  restate:
    register:
      url: http://restate.restate-cluster.svc.cluster.local:9070
      servicePath: /restate/v1
  revisionHistoryLimit: 1
  selector:
    matchLabels:
      app.kubernetes.io/instance: relay-backend
      app.kubernetes.io/name: relay-backend
  template:
    metadata:
      annotations:
        linkerd.io/inject: enabled
      labels:
        app.kubernetes.io/instance: relay-backend
        app.kubernetes.io/name: relay-backend
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: karpenter.sh/nodepool
                operator: In
                values:
                - interruptable
      containers:
      - args:
        - ddtrace-run
        - granian
        - --interface
        - asgi
        - --workers
        - "4"
        - --host
        - 0.0.0.0
        - --port
        - "8000"
        - src.main:app
        env:
        - name: ENVIRONMENT
          value: develop
        - name: SOURCE_VERSION
          value: main-dd623e1
        envFrom:
        - secretRef:
            name: relay-backend-envvars
        - configMapRef:
            name: relay-backend-envvars
        image: xxx.dkr.ecr.us-east-1.amazonaws.com/relay-backend:main-dd623e1
        imagePullPolicy: IfNotPresent
        name: relay-backend
        ports:
        - containerPort: 8000
          name: http
          protocol: TCP
        resources:
          limits:
            memory: 4096Mi
          requests:
            cpu: 1000m
            memory: 4096Mi
        startupProbe:
          failureThreshold: 18
          httpGet:
            path: /health
            port: 8000
          periodSeconds: 10
        volumeMounts:
        - mountPath: /var/run/datadog
          name: apmsocketpath
      nodeSelector: {}
      serviceAccountName: relay-backend-service-account
      terminationGracePeriodSeconds: 10
      volumes:
      - hostPath:
          path: /var/run/datadog/
        name: apmsocketpath
      - emptyDir: {}
        name: temp
status:
  availableReplicas: 1
  conditions:
  - lastTransitionTime: "2025-12-02T20:32:05Z"
    message: 'Failed to make Restate admin API call: HTTP status server error (500
      Internal Server Error) for url (http://restate.restate-cluster.svc.cluster.local:9070/query)'
    reason: FailedReconcile
    status: Unknown
    type: Ready
  labelSelector: app.kubernetes.io/instance=relay-backend,app.kubernetes.io/name=relay-backend
  observedGeneration: 10
  readyReplicas: 1
  replicas: 1
  unavailableReplicas: 0
```
In Restate I see a constant stream of:
```
{"timestamp":"2025-12-02T20:49:08.646149Z","level":"WARN","fields":{"message":"Query failed","err":"External error: node lookup for partition 2 failed"},"target":"restate_admin::query_utils","span":{"http.request.method":"POST","http.version":"HTTP/1.1","url.path":"/query","url.query":"","url.scheme":"http","name":"admin-api-request"},"spans":[{"network.transport":"tcp","server.address":"0.0.0.0","server.port":9070,"server_name":"admin-api-server","name":"server"},{"peer_addr":"172.15.121.250:38836","name":"SocketHandler"},{"http.request.method":"POST","http.version":"HTTP/1.1","url.path":"/query","url.query":"","url.scheme":"http","name":"admin-api-request"}]}
```
This seems related: it triggers when the RestateDeployment tries to roll out a new version of the pod. What happens is we get two ReplicaSets, the old and the new, and then it fails to register the new one, so they both just sit there forever.
I'm no Restate expert, so I don't know what to do to help him!
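
In case it's useful, here's a minimal sketch for reproducing the operator's failing admin API call by hand, to check whether this is really the admin API/cluster being unhealthy rather than anything in the RestateDeployment itself. It assumes you can reach port 9070 (e.g. via `kubectl port-forward` or from inside the cluster), that the `requests` package is installed, and that the `/query` endpoint accepts a JSON body of the form `{"query": "<SQL>"}`; the `sys_deployment` table name is illustrative, not something confirmed from the logs above.
```
# Minimal sketch: poke the same admin /query endpoint the operator is calling.
# Assumptions: admin API reachable at ADMIN_URL, `requests` installed, and the
# /query endpoint accepting a JSON body {"query": "<SQL>"}; the table name is
# illustrative only.
import requests

ADMIN_URL = "http://restate.restate-cluster.svc.cluster.local:9070"  # from the spec above

resp = requests.post(
    f"{ADMIN_URL}/query",
    json={"query": "SELECT * FROM sys_deployment LIMIT 10"},  # illustrative query
    timeout=10,
)
print(resp.status_code)  # a 500 here matches the FailedReconcile condition
print(resp.text)         # look for "node lookup for partition N failed"
```
If this returns the same 500 / "node lookup for partition 2 failed" error outside of any rollout, the problem would seem to sit with the Restate cluster's partition/node state rather than with the new ReplicaSet.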