ollama
# docker needs the NVIDIA Container Toolkit to make the nvidia drivers (and GPU devices) available inside containers.
# - you will need the nvidia drivers on the host too. https://github.com/NVIDIA/nvidia-container-toolkit (setup sketch below)
# - the model directory needs some IOPS to load the models; a dedicated NVMe is both fast and naturally limits the sprawl
# - in GPU stats you will see both (G)raphics and (C)ompute jobs. LLM-related tooling only controls the C jobs.
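# a minimal setup sketch, assuming the toolkit package is already installed from NVIDIA's repo
# and that the distro uses systemd; see the toolkit README above for the package install step
$ sudo nvidia-ctk runtime configure --runtime=docker
$ sudo systemctl restart docker
# quick check that the GPU is visible from inside a container (the CUDA image tag is just an example)
$ docker run --rm --gpus=all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi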
# -- once the Ollama container is running (the systemd unit below keeps it running)
#
# this should produce help output
$ docker exec -it ollama ollama
# ollama.com hosts a registry of models, so pulling by name just works
# ex: https://ollama.com/dengcao/ERNIE-4.5-21B-A3B-PT
$ docker exec -it ollama ollama pull dengcao/ERNIE-4.5-21B-A3B-PT:latest
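# once pulled, the model can be run interactively or given a one-shot prompt (the prompt below is just an illustration)
$ docker exec -it ollama ollama run dengcao/ERNIE-4.5-21B-A3B-PT:latest "Explain what a Modelfile is in one sentence."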
# shows which models are loaded into memory and the balance between layers loaded into the GPU and the CPU
# also check out nvtop
$ docker exec -it ollama ollama ps
# you can create custom configs for models, e.g. set the number of layers offloaded to the GPU, by editing the default modelfile
# to set the number of layers in the GPU, either run `/set parameter num_gpu 16` in the interactive interface or set it in the
# modelfile as `PARAMETER num_gpu 16`. Note - this really means "number of layers offloaded to the GPU"; the name is too generic.
# `num_gpu 0` disables GPU offload for the model
$ docker exec -it ollama ollama show --modelfile dengcao/ERNIE-4.5-21B-A3B-PT > ERNIE.modelfile
# copy the edited file into the container and create a new entry (same model, new config)
$ docker exec -it ollama ollama create dengcao/ERNIE-4.5-21B-A3B-PT -f /app/ollama/modelfiles/ERNIE-16
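# the copy step above can be done with `docker cp`; the appended PARAMETER line and the in-container
# path are assumptions matching the create command, and the target directory must already exist in the container
$ echo "PARAMETER num_gpu 16" >> ERNIE.modelfile
$ docker cp ERNIE.modelfile ollama:/app/ollama/modelfiles/ERNIE-16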
# systemd unit to keep the Ollama container running (drop into /etc/systemd/system/, e.g. as ollama.service)
[Unit]
Description=Ollama Docker Container
Requires=docker.service
After=docker.service

[Service]
Restart=always
User=user
ExecStart=/usr/bin/docker run --rm --name ollama --gpus=all -v /space/ollama:/root/.ollama -p 0.0.0.0:11434:11434 -e OLLAMA_DEBUG=1 ollama/ollama
ExecStop=/usr/bin/docker stop ollama

[Install]
WantedBy=multi-user.target
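# a minimal sketch of enabling the unit and checking the API, assuming the file was saved as ollama.service;
# /api/tags lists the locally available models
$ sudo systemctl daemon-reload && sudo systemctl enable --now ollama.service
$ curl http://localhost:11434/api/tags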
  • chat interface - https://github.com/open-webui/open-webui (run sketch after this list)
    • this allows chat history to be recorded
    • and it can consume API keys for commercial inference providers
    • for commercial inference I like openrouter; it is cheap to test >70B-parameter models I cannot usably run at home, for roughly $0.01 to $2/day (2025)
  • CLI interface - nothing beats llm; it is a CLI tool in the best Unix tradition, modular and just pleasant to use (remote-access sketch after this list)
    • this will produce a description of a photo: $ llm -m moondream:latest -a /space/phonepics/iphone8/YARU7264.JPG
    • for remote access, set OLLAMA_HOST=$ip to point llm at the API; it can be any OpenAI-compatible API (hosted locally via ollama or through openrouter)
  • there are better tools than ollama for hosting models as actual services, with tight control over parallelism, batching, and where and how tensors are hosted, but I have not played with those yet.
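# a minimal sketch of running open-webui against the ollama API above; the image tag, port,
# volume name, and OLLAMA_BASE_URL variable follow the project's README and are assumptions here
$ docker run -d --name open-webui -p 3000:8080 \
    --add-host=host.docker.internal:host-gateway \
    -e OLLAMA_BASE_URL=http://host.docker.internal:11434 \
    -v open-webui:/app/backend/data \
    ghcr.io/open-webui/open-webui:main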
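# a sketch of pointing llm at a remote ollama instance; assumes the llm-ollama plugin is installed,
# and the IP and prompt are placeholders
$ llm install llm-ollama
$ OLLAMA_HOST=http://192.168.1.20:11434 llm -m dengcao/ERNIE-4.5-21B-A3B-PT:latest "what can you tell me about ERNIE?"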
Hugging Face
- set your Local Apps in https://huggingface.co/settings/local-apps#local-apps
- find the model repo, click the `Use this model` button, then select your local app from the dropdown and the quantization (ollama example below).
- the different values signify how much is lost to the decreased precision of the weights, [good overview](https://github.com/ggml-org/llama.cpp/pull/1684#issuecomment-1579252501). For the tl;dr, and if GPU-poor, start with Q4_K.
- at first, stick to official sources and the `GGUF` or `safetensors` formats. PyTorch files (.pt/.pth) are serialized (pickled) Python data structures; deserializing them can execute arbitrary code if the contents are not 100% trustworthy.
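# the `Use this model` flow for Ollama boils down to pulling a GGUF straight from the hf.co registry;
# the repo and quantization tag below are placeholders, pick your own from the model page
$ docker exec -it ollama ollama pull hf.co/bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M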