The following was tested on Google Cloud (GCP) using a g2-standard-48
instance and a Rocky Linux 9 image (GCP optimized, x86_64).
It has 192 GB RAM, 48 vCPU cores, and 4 NVIDIA L4 24 GB GPUs attached.
A 384 GB SSD disk or larger is recommended, depending on the workload.
NOTICE: Make sure you have a positive bank balance before trying this.
Update the system:
sudo dnf update -y
Install my favorite editor:
sudo dnf install -y nano
Install some basic development tools:
sudo dnf groupinstall -y "Development Tools"
sudo dnf install -y python3-pip
Next you need to install drivers for your GPU. I am of course using the NVIDIA L4 GPUs on this instance,
but this should work for almost any recent NVIDIA datacenter GPU.
Enable the CRB repository, add the EL9-compatible EPEL repositories (Fedora), and add the NVIDIA CUDA repository:
sudo dnf config-manager --set-enabled crb
sudo dnf install -y \
https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
https://dl.fedoraproject.org/pub/epel/epel-next-release-latest-9.noarch.rpm
sudo dnf config-manager --add-repo \
http://developer.download.nvidia.com/compute/cuda/repos/rhel9/$(uname -i)/cuda-rhel9.repo
Install some monitoring and execution tools:
sudo dnf install -y htop tmux
Install driver dependencies:
sudo dnf install -y \
kernel-headers-$(uname -r) kernel-devel-$(uname -r) \
tar bzip2 make automake gcc gcc-c++ \
pciutils elfutils-libelf-devel libglvnd-opengl libglvnd-glx libglvnd-devel acpid pkgconfig dkms
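With pciutils installed you can confirm the GPUs are visible on the PCI bus before installing the driver:
lspci | grep -i nvidia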
Install NVIDIA GPU driver:
sudo dnf module install -y nvidia-driver:latest-dkms
Now it's a good time to reboot the system:
sudo reboot
Check the driver installation worked:
nvidia-smi
You should see something like this a second later:
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.28.03              Driver Version: 560.28.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   62C    P8             14W /   72W |       1MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA L4                      Off |   00000000:00:04.0 Off |                    0 |
| N/A   61C    P8             15W /   72W |       1MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   2  NVIDIA L4                      Off |   00000000:00:05.0 Off |                    0 |
| N/A   54C    P8             13W /   72W |       1MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   3  NVIDIA L4                      Off |   00000000:00:06.0 Off |                    0 |
| N/A   59C    P8             14W /   72W |       1MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+
Install the Hugging Face CLI tool using pip:
pip install -U "huggingface_hub[cli]"
Set your Hugging Face access token:
mkdir -p ~/.bashrc.d \
&& echo 'export HUGGINGFACE_TOKEN=<your_access_token>' >> ~/.bashrc.d/hf \
&& source ~/.bashrc.d/hf
Configure git credential storage:
git config --global credential.helper store
Login with CLI tool:
huggingface-cli login --token $HUGGINGFACE_TOKEN --add-to-git-credential
You should see something similar after login:
Token is valid (permission: fineGrained).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/user/.cache/huggingface/token
Login successful
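You can double-check the login at any time with:
huggingface-cli whoami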
Create new project directory:
mkdir llama31_playground && cd llama31_playground
Create requirements.txt (it's in this gist):
nano requirements.txt
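The actual requirements.txt is in this gist; roughly it is the usual Transformers stack, something along these lines (an illustrative sketch, versions omitted):
# requirements.txt (sketch) - the real list is in this gist
torch
transformers
accelerate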
Create a Python virtual environment and install the requirements:
python3 -m venv env
source env/bin/activate
pip install -r requirements.txt
deactivate
Create hello.py (it's in this gist):
nano hello.py
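The actual hello.py is in this gist. As a rough idea, a minimal sketch could look like the following, assuming the meta-llama/Meta-Llama-3.1-8B-Instruct checkpoint and the transformers pipeline API (the model name and prompt here are placeholders, not necessarily what the gist uses):

# hello.py (sketch) - assumes a gated Llama 3.1 checkpoint, which needs the HF token set above
import torch
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",  # spread the model across all visible GPUs
)

messages = [{"role": "user", "content": "Say hello to the world in one sentence."}]
out = pipe(messages, max_new_tokens=64)
print(out[0]["generated_text"][-1]["content"])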
Run a 'hello world' program:
tmux
source env/bin/activate
python hello.py
exit
Install Git LFS (large file support for git):
sudo dnf install -y git-lfs
git lfs install
Clone LLaMA-13b model weights:
git clone https://huggingface.co/huggyllama/llama-13b
Create Vicuna-13b weights output directory:
mkdir vicuna-13b
Clone FastChat repository:
git clone https://github.com/lm-sys/FastChat.git && cd FastChat
Upgrade pip (to enable PEP 660 support):
pip3 install --upgrade pip
Install FastChat and its dependencies:
pip3 install -e .
Apply the delta weights (this will download the delta weights repository):
python3 -m fastchat.model.apply_delta \
--base-model-path ../llama-13b \
--target-model-path ../vicuna-13b \
--delta-path lmsys/vicuna-13b-delta-v1.1
Confirm weights output:
ls -alh ../vicuna-13b/
Test the model with the interactive CLI:
python3 -m fastchat.serve.cli --model-path ../vicuna-13b
tmux makes it easy to run and keep multiple processes alive. It was already installed earlier, but if you skipped that step:
sudo dnf install -y tmux
To run tmux just type tmux in the shell.
The first window is created automatically.
To create another window: ctrl + b, then c.
To switch windows: ctrl + b, then w, and pick the window with the arrow keys.
To detach: ctrl + b, then d.
To reattach the latest session, type tmux at in the shell.
Run each of the servers in a different tmux window so you can switch between
them and leave them running in interactive mode after you log out or disconnect.
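If you'd rather script this than create the windows by hand, here is a sketch using tmux's scripting commands (the session and window names are arbitrary; run it from the FastChat directory):
tmux new-session -d -s fastchat -n controller
tmux send-keys -t fastchat:controller 'python3 -m fastchat.serve.controller' C-m
tmux new-window -t fastchat -n worker
tmux send-keys -t fastchat:worker 'python3 -m fastchat.serve.model_worker --model-path ../vicuna-13b/' C-m
tmux new-window -t fastchat -n web
tmux send-keys -t fastchat:web 'python3 -m fastchat.serve.gradio_web_server' C-m
tmux attach -t fastchat
The steps below do the same thing one window at a time.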
Start the controller server:
python3 -m fastchat.serve.controller
Start the worker server (you can run multiple workers with different models):
python3 -m fastchat.serve.model_worker --model-path ../vicuna-13b/
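The instance has four L4 GPUs; if you want the worker to shard the model across them, model_worker accepts a --num-gpus flag (see python3 -m fastchat.serve.model_worker --help for all options):
python3 -m fastchat.serve.model_worker --model-path ../vicuna-13b/ --num-gpus 4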
Open the default web interface HTTP port (7860) in the firewall:
sudo firewall-cmd --add-port=7860/tcp
sudo firewall-cmd --add-port=7860/tcp --permanent
If you're using Google Cloud, you probably also need to allow ingress to port 7860 in your VPC firewall rules.
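For example, with the gcloud CLI (the rule name and the wide-open source range are just an illustration; scope it to your network and target tags as appropriate):
gcloud compute firewall-rules create allow-gradio-7860 \
  --direction=INGRESS --allow=tcp:7860 --source-ranges=0.0.0.0/0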
Start the GUI web interface:
python3 -m fastchat.serve.gradio_web_server
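Once it's up, the web UI should be reachable at http://<your_instance_external_ip>:7860.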