goflow.go
package goflow

import (
    "encoding/csv"
    "encoding/json"
    "fmt"
    "math"
    "math/rand"
    "os"
    "path/filepath"
    "runtime"
    "strconv"
    "sync"
)
// Tensor represents a multi-dimensional array.
type Tensor struct {
    Data         []float64 // Flat data slice
    Shape        []int     // Shape (e.g., [2, 3] for 2x3 matrix)
    Grad         []float64 // Gradient for backpropagation
    RequiresGrad bool
    mu           sync.RWMutex // RWMutex for thread-safe read/write access
}
// NewTensor creates a tensor with the given data and shape.
func NewTensor(data []float64, shape []int, requiresGrad bool) *Tensor {
    size := 1
    for _, dim := range shape {
        size *= dim
    }
    if len(data) != size {
        panic(fmt.Sprintf("Data length %d does not match shape %v", len(data), shape))
    }
    t := &Tensor{
        Data:         data,
        Shape:        shape,
        RequiresGrad: requiresGrad,
    }
    if requiresGrad {
        t.Grad = make([]float64, size)
    }
    return t
}
// At returns the element at the given indices using row-major indexing.
// The loop handles any rank, not just 1D/2D.
func (t *Tensor) At(indices ...int) float64 {
    t.mu.RLock()
    defer t.mu.RUnlock()
    index := 0
    for i, idx := range indices {
        index = index*t.Shape[i] + idx
    }
    return t.Data[index]
}

// Set updates the element at the given indices.
func (t *Tensor) Set(value float64, indices ...int) {
    t.mu.Lock()
    defer t.mu.Unlock()
    index := 0
    for i, idx := range indices {
        index = index*t.Shape[i] + idx
    }
    t.Data[index] = value
}
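
// tensorIndexingExample is a small illustrative sketch (not part of the
// original API) showing how At/Set resolve row-major indices: for shape
// [2, 3], At(i, j) reads Data[i*3+j].
func tensorIndexingExample() {
    t := NewTensor([]float64{1, 2, 3, 4, 5, 6}, []int{2, 3}, false)
    _ = t.At(1, 2)   // == 6, i.e. Data[1*3+2]
    t.Set(9.5, 0, 1) // Data[0*3+1] becomes 9.5
}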
// OptimizedAdd performs element-wise addition, parallelized across CPU cores.
func OptimizedAdd(a, b *Tensor) *Tensor {
    if len(a.Data) != len(b.Data) {
        panic("Tensor sizes mismatch")
    }
    result := make([]float64, len(a.Data))
    // Split the work into roughly equal chunks, one per worker. Ceiling
    // division plus the end clamp ensures every element is covered even
    // when the length is not a multiple of numCPU.
    var wg sync.WaitGroup
    numWorkers := numCPU
    if numWorkers > len(a.Data) {
        numWorkers = len(a.Data)
    }
    if numWorkers < 1 {
        numWorkers = 1
    }
    chunkSize := (len(a.Data) + numWorkers - 1) / numWorkers
    for i := 0; i < numWorkers; i++ {
        start := i * chunkSize
        end := start + chunkSize
        if end > len(a.Data) {
            end = len(a.Data)
        }
        if start >= end {
            continue
        }
        wg.Add(1)
        go func(start, end int) {
            defer wg.Done()
            for j := start; j < end; j++ {
                result[j] = a.Data[j] + b.Data[j]
            }
        }(start, end)
    }
    wg.Wait()
    t := NewTensor(result, a.Shape, a.RequiresGrad || b.RequiresGrad)
    if t.RequiresGrad {
        // NOTE: t.Grad is all zeros at this point, so this accumulation is
        // a placeholder. A real autograd would record a backward function
        // and run it after the loss gradient has been seeded.
        for i := range t.Grad {
            if a.RequiresGrad {
                a.mu.Lock()
                a.Grad[i] += t.Grad[i]
                a.mu.Unlock()
            }
            if b.RequiresGrad {
                b.mu.Lock()
                b.Grad[i] += t.Grad[i]
                b.mu.Unlock()
            }
        }
    }
    return t
}
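
// optimizedAddExample is an illustrative usage sketch (not part of the
// original file): adding two 2x2 tensors with the chunked parallel loop above.
func optimizedAddExample() {
    a := NewTensor([]float64{1, 2, 3, 4}, []int{2, 2}, false)
    b := NewTensor([]float64{10, 20, 30, 40}, []int{2, 2}, false)
    sum := OptimizedAdd(a, b)
    fmt.Println(sum.Data) // [11 22 33 44]
}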
// OptimizedMul performs element-wise multiplication, parallelized across CPU cores.
func OptimizedMul(a, b *Tensor) *Tensor {
    if len(a.Data) != len(b.Data) {
        panic("Tensor sizes mismatch")
    }
    result := make([]float64, len(a.Data))
    // Same chunking scheme as OptimizedAdd: ceiling division plus an end
    // clamp so remainder elements are not dropped.
    var wg sync.WaitGroup
    numWorkers := numCPU
    if numWorkers > len(a.Data) {
        numWorkers = len(a.Data)
    }
    if numWorkers < 1 {
        numWorkers = 1
    }
    chunkSize := (len(a.Data) + numWorkers - 1) / numWorkers
    for i := 0; i < numWorkers; i++ {
        start := i * chunkSize
        end := start + chunkSize
        if end > len(a.Data) {
            end = len(a.Data)
        }
        if start >= end {
            continue
        }
        wg.Add(1)
        go func(start, end int) {
            defer wg.Done()
            for j := start; j < end; j++ {
                result[j] = a.Data[j] * b.Data[j]
            }
        }(start, end)
    }
    wg.Wait()
    t := NewTensor(result, a.Shape, a.RequiresGrad || b.RequiresGrad)
    if t.RequiresGrad {
        // NOTE: placeholder, as in OptimizedAdd -- t.Grad is still zero here.
        // The product rule (dL/da = dL/dt * b, dL/db = dL/dt * a) only takes
        // effect once a real backward pass populates t.Grad.
        for i := range t.Grad {
            if a.RequiresGrad {
                a.mu.Lock()
                a.Grad[i] += t.Grad[i] * b.Data[i]
                a.mu.Unlock()
            }
            if b.RequiresGrad {
                b.mu.Lock()
                b.Grad[i] += t.Grad[i] * a.Data[i]
                b.mu.Unlock()
            }
        }
    }
    return t
}
// Layer is the interface implemented by all neural network layers.
type Layer interface {
    Forward(input *Tensor) *Tensor
    Parameters() []*Tensor
    Type() string // For serialization
}

// Dense represents a fully connected layer.
type Dense struct {
    Weights *Tensor
    Bias    *Tensor
    mu      sync.RWMutex // RWMutex for thread-safe read/write access
}

// NewDense creates a dense layer with random initialization.
func NewDense(inputSize, outputSize int) *Dense {
    weightsData := make([]float64, inputSize*outputSize)
    for i := range weightsData {
        weightsData[i] = randFloat(-0.1, 0.1)
    }
    biasData := make([]float64, outputSize)
    return &Dense{
        Weights: NewTensor(weightsData, []int{inputSize, outputSize}, true),
        Bias:    NewTensor(biasData, []int{outputSize}, true),
    }
}

// Forward computes the layer output.
func (d *Dense) Forward(input *Tensor) *Tensor {
    d.mu.RLock()
    defer d.mu.RUnlock()
    if len(input.Shape) != 1 || input.Shape[0] != d.Weights.Shape[0] {
        panic("Input shape mismatch")
    }
    result := make([]float64, d.Weights.Shape[1])
    for j := 0; j < d.Weights.Shape[1]; j++ {
        sum := 0.0
        for i := 0; i < d.Weights.Shape[0]; i++ {
            sum += input.Data[i] * d.Weights.At(i, j)
        }
        result[j] = sum + d.Bias.Data[j]
    }
    return NewTensor(result, []int{d.Weights.Shape[1]}, true)
}

// Parameters returns trainable parameters.
func (d *Dense) Parameters() []*Tensor {
    d.mu.RLock()
    defer d.mu.RUnlock()
    return []*Tensor{d.Weights, d.Bias}
}

// Type returns the layer type for serialization.
func (d *Dense) Type() string {
    return "Dense"
}
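
// denseExample is an illustrative sketch (not part of the original file):
// a 3->2 dense layer maps a length-3 vector to length 2. Weights are
// random, so printed values vary from run to run.
func denseExample() {
    layer := NewDense(3, 2)
    out := layer.Forward(NewTensor([]float64{1, 0.5, -1}, []int{3}, false))
    fmt.Println(out.Shape, out.Data) // [2] and two float64 values
}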
// Conv represents a convolutional layer.
type Conv struct {
    Filters    *Tensor
    Bias       *Tensor
    KernelSize int
    mu         sync.RWMutex // RWMutex for thread-safe read/write access
}

// NewConv creates a convolutional layer with random initialization.
func NewConv(inputChannels, outputChannels, kernelSize int) *Conv {
    filtersData := make([]float64, inputChannels*outputChannels*kernelSize*kernelSize)
    for i := range filtersData {
        filtersData[i] = randFloat(-0.1, 0.1)
    }
    biasData := make([]float64, outputChannels)
    return &Conv{
        Filters:    NewTensor(filtersData, []int{outputChannels, inputChannels, kernelSize, kernelSize}, true),
        Bias:       NewTensor(biasData, []int{outputChannels}, true),
        KernelSize: kernelSize,
    }
}

// Forward computes the layer output (a valid convolution, no padding).
func (c *Conv) Forward(input *Tensor) *Tensor {
    c.mu.RLock()
    defer c.mu.RUnlock()
    inputChannels := input.Shape[0]
    inputHeight := input.Shape[1]
    inputWidth := input.Shape[2]
    outputChannels := c.Filters.Shape[0]
    outputHeight := inputHeight - c.KernelSize + 1
    outputWidth := inputWidth - c.KernelSize + 1
    result := make([]float64, outputChannels*outputHeight*outputWidth)
    for oc := 0; oc < outputChannels; oc++ {
        for y := 0; y < outputHeight; y++ {
            for x := 0; x < outputWidth; x++ {
                sum := 0.0
                for ic := 0; ic < inputChannels; ic++ {
                    for ky := 0; ky < c.KernelSize; ky++ {
                        for kx := 0; kx < c.KernelSize; kx++ {
                            sum += input.At(ic, y+ky, x+kx) * c.Filters.At(oc, ic, ky, kx)
                        }
                    }
                }
                result[oc*outputHeight*outputWidth+y*outputWidth+x] = sum + c.Bias.Data[oc]
            }
        }
    }
    return NewTensor(result, []int{outputChannels, outputHeight, outputWidth}, true)
}

// Parameters returns trainable parameters.
func (c *Conv) Parameters() []*Tensor {
    c.mu.RLock()
    defer c.mu.RUnlock()
    return []*Tensor{c.Filters, c.Bias}
}

// Type returns the layer type for serialization.
func (c *Conv) Type() string {
    return "Conv"
}
// LSTM represents an LSTM layer.
type LSTM struct {
    Weights     *Tensor
    Bias        *Tensor
    HiddenState *Tensor
    CellState   *Tensor
    mu          sync.RWMutex // RWMutex for thread-safe read/write access
}

// NewLSTM creates an LSTM layer with random initialization.
func NewLSTM(inputSize, hiddenSize int) *LSTM {
    weightsData := make([]float64, 4*hiddenSize*(inputSize+hiddenSize+1))
    for i := range weightsData {
        weightsData[i] = randFloat(-0.1, 0.1)
    }
    biasData := make([]float64, 4*hiddenSize)
    hiddenState := make([]float64, hiddenSize)
    cellState := make([]float64, hiddenSize)
    return &LSTM{
        Weights:     NewTensor(weightsData, []int{4, hiddenSize, inputSize + hiddenSize + 1}, true),
        Bias:        NewTensor(biasData, []int{4, hiddenSize}, true),
        HiddenState: NewTensor(hiddenState, []int{hiddenSize}, false),
        CellState:   NewTensor(cellState, []int{hiddenSize}, false),
    }
}
// Forward computes the LSTM output over a [batch, seq, feature] input and
// returns the final hidden state, shape [batch, hidden]. (The original code
// assigned scalars to slice expressions and called a nonexistent
// math.Sigmoid; this version computes the gates per unit instead.)
func (l *LSTM) Forward(input *Tensor) *Tensor {
    l.mu.RLock()
    defer l.mu.RUnlock()
    hiddenSize := l.Weights.Shape[1]
    batchSize := input.Shape[0]
    seqLength := input.Shape[1]
    inputSize := input.Shape[2]
    hidden := make([]float64, batchSize*hiddenSize)
    cell := make([]float64, batchSize*hiddenSize)
    prev := make([]float64, batchSize*hiddenSize)
    var gates [4]float64
    for t := 0; t < seqLength; t++ {
        copy(prev, hidden) // the recurrent term must read the previous step's state
        for b := 0; b < batchSize; b++ {
            for u := 0; u < hiddenSize; u++ {
                // Gate order in Weights/Bias: 0=input, 1=forget, 2=output, 3=candidate.
                // The last weight column is the built-in bias term.
                for g := 0; g < 4; g++ {
                    sum := l.Bias.At(g, u) + l.Weights.At(g, u, inputSize+hiddenSize)
                    for i := 0; i < inputSize; i++ {
                        sum += input.At(b, t, i) * l.Weights.At(g, u, i)
                    }
                    for h := 0; h < hiddenSize; h++ {
                        sum += prev[b*hiddenSize+h] * l.Weights.At(g, u, inputSize+h)
                    }
                    gates[g] = sum
                }
                i, f, o := sigmoid(gates[0]), sigmoid(gates[1]), sigmoid(gates[2])
                c := math.Tanh(gates[3])
                idx := b*hiddenSize + u
                cell[idx] = f*cell[idx] + i*c
                hidden[idx] = o * math.Tanh(cell[idx])
            }
        }
    }
    return NewTensor(hidden, []int{batchSize, hiddenSize}, true)
}
// Parameters returns trainable parameters.
func (l *LSTM) Parameters() []*Tensor {
    l.mu.RLock()
    defer l.mu.RUnlock()
    return []*Tensor{l.Weights, l.Bias}
}

// Type returns the layer type for serialization.
func (l *LSTM) Type() string {
    return "LSTM"
}
// MultiHeadAttention represents a multi-head attention layer.
type MultiHeadAttention struct {
    WeightsQKV *Tensor
    BiasQKV    *Tensor
    WeightsO   *Tensor
    BiasO      *Tensor
    NumHeads   int
    mu         sync.RWMutex // RWMutex for thread-safe read/write access
}

// NewMultiHeadAttention creates a multi-head attention layer.
func NewMultiHeadAttention(inputSize, hiddenSize, numHeads int) *MultiHeadAttention {
    weightsQKVData := make([]float64, 3*hiddenSize*(inputSize+1))
    for i := range weightsQKVData {
        weightsQKVData[i] = randFloat(-0.1, 0.1)
    }
    biasQKVData := make([]float64, 3*hiddenSize)
    weightsOData := make([]float64, hiddenSize*(hiddenSize+1))
    for i := range weightsOData {
        weightsOData[i] = randFloat(-0.1, 0.1)
    }
    biasOData := make([]float64, hiddenSize)
    return &MultiHeadAttention{
        WeightsQKV: NewTensor(weightsQKVData, []int{3, hiddenSize, inputSize + 1}, true),
        BiasQKV:    NewTensor(biasQKVData, []int{3, hiddenSize}, true),
        WeightsO:   NewTensor(weightsOData, []int{hiddenSize, hiddenSize + 1}, true),
        BiasO:      NewTensor(biasOData, []int{hiddenSize}, true),
        NumHeads:   numHeads,
    }
}
// Forward computes scaled dot-product attention over a [batch, seq, feature]
// input and returns a [batch, seq, hidden] output. (The original code applied
// a nonexistent math.Softmax to individual scalars and multiplied tensors of
// mismatched sizes; this version projects Q/K/V explicitly and normalizes
// each score row over the keys.)
func (m *MultiHeadAttention) Forward(input *Tensor) *Tensor {
    m.mu.RLock()
    defer m.mu.RUnlock()
    batchSize := input.Shape[0]
    seqLength := input.Shape[1]
    inputSize := input.Shape[2]
    hiddenSize := m.WeightsQKV.Shape[1]
    headSize := hiddenSize / m.NumHeads
    // Project every position to query, key and value vectors.
    q := make([]float64, batchSize*seqLength*hiddenSize)
    k := make([]float64, batchSize*seqLength*hiddenSize)
    v := make([]float64, batchSize*seqLength*hiddenSize)
    proj := [3][]float64{q, k, v}
    for p := 0; p < 3; p++ {
        for b := 0; b < batchSize; b++ {
            for s := 0; s < seqLength; s++ {
                for h := 0; h < hiddenSize; h++ {
                    sum := m.BiasQKV.At(p, h)
                    for i := 0; i < inputSize; i++ {
                        sum += input.At(b, s, i) * m.WeightsQKV.At(p, h, i)
                    }
                    proj[p][(b*seqLength+s)*hiddenSize+h] = sum
                }
            }
        }
    }
    // Per head: scaled dot-product scores, softmax over keys, weighted sum of values.
    attnOut := make([]float64, batchSize*seqLength*hiddenSize)
    scores := make([]float64, seqLength)
    for b := 0; b < batchSize; b++ {
        for h := 0; h < m.NumHeads; h++ {
            for i := 0; i < seqLength; i++ {
                for j := 0; j < seqLength; j++ {
                    dot := 0.0
                    for d := 0; d < headSize; d++ {
                        dot += q[(b*seqLength+i)*hiddenSize+h*headSize+d] *
                            k[(b*seqLength+j)*hiddenSize+h*headSize+d]
                    }
                    scores[j] = dot / math.Sqrt(float64(headSize))
                }
                weights := softmax(scores)
                for j := 0; j < seqLength; j++ {
                    for d := 0; d < headSize; d++ {
                        attnOut[(b*seqLength+i)*hiddenSize+h*headSize+d] +=
                            weights[j] * v[(b*seqLength+j)*hiddenSize+h*headSize+d]
                    }
                }
            }
        }
    }
    // Final linear projection of the concatenated heads.
    out := make([]float64, batchSize*seqLength*hiddenSize)
    for b := 0; b < batchSize; b++ {
        for s := 0; s < seqLength; s++ {
            for o := 0; o < hiddenSize; o++ {
                sum := m.BiasO.Data[o]
                for i := 0; i < hiddenSize; i++ {
                    sum += attnOut[(b*seqLength+s)*hiddenSize+i] * m.WeightsO.At(o, i)
                }
                out[(b*seqLength+s)*hiddenSize+o] = sum
            }
        }
    }
    return NewTensor(out, []int{batchSize, seqLength, hiddenSize}, true)
}
// Parameters returns trainable parameters.
func (m *MultiHeadAttention) Parameters() []*Tensor {
    m.mu.RLock()
    defer m.mu.RUnlock()
    return []*Tensor{m.WeightsQKV, m.BiasQKV, m.WeightsO, m.BiasO}
}

// Type returns the layer type for serialization.
func (m *MultiHeadAttention) Type() string {
    return "MultiHeadAttention"
}
// randFloat returns a uniform random value in [min, max).
func randFloat(min, max float64) float64 {
    return min + (max-min)*rand.Float64()
}
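
// sigmoid and softmax are helpers added here because the standard library's
// math package has no Sigmoid or Softmax; the LSTM and attention layers
// above rely on them.
func sigmoid(x float64) float64 {
    return 1.0 / (1.0 + math.Exp(-x))
}

// softmax returns the normalized exponentials of xs, shifted by the maximum
// value for numerical stability.
func softmax(xs []float64) []float64 {
    maxV := xs[0]
    for _, x := range xs {
        if x > maxV {
            maxV = x
        }
    }
    out := make([]float64, len(xs))
    sum := 0.0
    for i, x := range xs {
        out[i] = math.Exp(x - maxV)
        sum += out[i]
    }
    for i := range out {
        out[i] /= sum
    }
    return out
}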
// Model represents a neural network.
type Model struct {
    Layers []Layer
    mu     sync.RWMutex // RWMutex for thread-safe read/write access
}

// NewModel creates a new model with the given layers.
func NewModel(layers ...Layer) *Model {
    return &Model{Layers: layers}
}

// Forward propagates input through the model.
func (m *Model) Forward(input *Tensor) *Tensor {
    m.mu.RLock()
    defer m.mu.RUnlock()
    output := input
    for _, layer := range m.Layers {
        output = layer.Forward(output)
    }
    return output
}

// Parameters returns all trainable parameters.
func (m *Model) Parameters() []*Tensor {
    m.mu.RLock()
    defer m.mu.RUnlock()
    var params []*Tensor
    for _, layer := range m.Layers {
        params = append(params, layer.Parameters()...)
    }
    return params
}
// Train runs SGD over the dataset. Each sample is processed in its own
// goroutine; totalLoss is guarded by the model mutex.
func (m *Model) Train(inputs, targets []*Tensor, epochs int, learningRate float64) {
    for epoch := 0; epoch < epochs; epoch++ {
        var wg sync.WaitGroup
        totalLoss := 0.0
        for i := range inputs {
            wg.Add(1)
            go func(i int) {
                defer wg.Done()
                output := m.Forward(inputs[i])
                loss := mseLoss(output, targets[i])
                m.mu.Lock()
                totalLoss += loss.Data[0]
                m.mu.Unlock()
                // Zero gradients before the (placeholder) backward pass.
                for _, param := range m.Parameters() {
                    param.mu.Lock()
                    for j := range param.Grad {
                        param.Grad[j] = 0
                    }
                    param.mu.Unlock()
                }
                // Seed the loss gradient. NOTE: this file does not implement
                // a full backward pass, so parameter gradients stay zero and
                // the update below is a no-op until backprop is wired in.
                loss.Grad[0] = 1.0
                for _, param := range m.Parameters() {
                    param.mu.Lock()
                    for j := range param.Data {
                        param.Data[j] -= learningRate * param.Grad[j]
                    }
                    param.mu.Unlock()
                }
            }(i)
        }
        wg.Wait()
        fmt.Printf("Epoch %d, Loss: %.4f\n", epoch, totalLoss/float64(len(inputs)))
    }
}
// mseLoss computes mean squared error.
func mseLoss(pred, target *Tensor) *Tensor {
    if len(pred.Data) != len(target.Data) {
        panic("Prediction and target size mismatch")
    }
    result := make([]float64, 1)
    sum := 0.0
    for i := range pred.Data {
        diff := pred.Data[i] - target.Data[i]
        sum += diff * diff
    }
    result[0] = sum / float64(len(pred.Data))
    return NewTensor(result, []int{1}, true)
}
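
// mseLossExample is an illustrative sketch (not part of the original file):
// for pred [1, 2] and target [1, 4], the loss is ((1-1)^2 + (2-4)^2) / 2 = 2.
func mseLossExample() {
    pred := NewTensor([]float64{1, 2}, []int{2}, false)
    target := NewTensor([]float64{1, 4}, []int{2}, false)
    fmt.Println(mseLoss(pred, target).Data[0]) // 2
}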
// SaveModel saves the model to a file in JSON format.
func (m *Model) SaveModel(filename string) error {
    // TensorData must be declared before LayerData: inside a function body,
    // identifiers have to be declared before they are used.
    type TensorData struct {
        Data  []float64 `json:"data"`
        Shape []int     `json:"shape"`
    }
    type LayerData struct {
        Type       string      `json:"type"`
        Weights    *TensorData `json:"weights"`
        Bias       *TensorData `json:"bias"`
        WeightsO   *TensorData `json:"weights_o,omitempty"`
        BiasO      *TensorData `json:"bias_o,omitempty"`
        KernelSize int         `json:"kernel_size,omitempty"`
        NumHeads   int         `json:"num_heads,omitempty"`
    }
    type ModelData struct {
        Layers []LayerData `json:"layers"`
    }
    data := ModelData{}
    for _, layer := range m.Layers {
        switch l := layer.(type) {
        case *Dense:
            l.mu.RLock()
            data.Layers = append(data.Layers, LayerData{
                Type:    l.Type(),
                Weights: &TensorData{Data: l.Weights.Data, Shape: l.Weights.Shape},
                Bias:    &TensorData{Data: l.Bias.Data, Shape: l.Bias.Shape},
            })
            l.mu.RUnlock()
        case *Conv:
            l.mu.RLock()
            data.Layers = append(data.Layers, LayerData{
                Type:       l.Type(),
                Weights:    &TensorData{Data: l.Filters.Data, Shape: l.Filters.Shape},
                Bias:       &TensorData{Data: l.Bias.Data, Shape: l.Bias.Shape},
                KernelSize: l.KernelSize,
            })
            l.mu.RUnlock()
        case *LSTM:
            l.mu.RLock()
            data.Layers = append(data.Layers, LayerData{
                Type:    l.Type(),
                Weights: &TensorData{Data: l.Weights.Data, Shape: l.Weights.Shape},
                Bias:    &TensorData{Data: l.Bias.Data, Shape: l.Bias.Shape},
            })
            l.mu.RUnlock()
        case *MultiHeadAttention:
            l.mu.RLock()
            // Save the output projection too; without it, LoadModel would
            // rebuild an attention layer with nil WeightsO/BiasO.
            data.Layers = append(data.Layers, LayerData{
                Type:     l.Type(),
                Weights:  &TensorData{Data: l.WeightsQKV.Data, Shape: l.WeightsQKV.Shape},
                Bias:     &TensorData{Data: l.BiasQKV.Data, Shape: l.BiasQKV.Shape},
                WeightsO: &TensorData{Data: l.WeightsO.Data, Shape: l.WeightsO.Shape},
                BiasO:    &TensorData{Data: l.BiasO.Data, Shape: l.BiasO.Shape},
                NumHeads: l.NumHeads,
            })
            l.mu.RUnlock()
        default:
            return fmt.Errorf("unsupported layer type: %T", layer)
        }
    }
    jsonData, err := json.MarshalIndent(data, "", "  ")
    if err != nil {
        return err
    }
    return os.WriteFile(filename, jsonData, 0644)
}
// LoadModel loads a model from a file.
func LoadModel(filename string) (*Model, error) {
    type TensorData struct {
        Data  []float64 `json:"data"`
        Shape []int     `json:"shape"`
    }
    type LayerData struct {
        Type       string      `json:"type"`
        Weights    *TensorData `json:"weights"`
        Bias       *TensorData `json:"bias"`
        WeightsO   *TensorData `json:"weights_o,omitempty"`
        BiasO      *TensorData `json:"bias_o,omitempty"`
        KernelSize int         `json:"kernel_size,omitempty"`
        NumHeads   int         `json:"num_heads,omitempty"`
    }
    type ModelData struct {
        Layers []LayerData `json:"layers"`
    }
    data, err := os.ReadFile(filename)
    if err != nil {
        return nil, err
    }
    var modelData ModelData
    if err := json.Unmarshal(data, &modelData); err != nil {
        return nil, err
    }
    model := &Model{}
    for _, l := range modelData.Layers {
        switch l.Type {
        case "Dense":
            model.Layers = append(model.Layers, &Dense{
                Weights: NewTensor(l.Weights.Data, l.Weights.Shape, true),
                Bias:    NewTensor(l.Bias.Data, l.Bias.Shape, true),
            })
        case "Conv":
            model.Layers = append(model.Layers, &Conv{
                Filters:    NewTensor(l.Weights.Data, l.Weights.Shape, true),
                Bias:       NewTensor(l.Bias.Data, l.Bias.Shape, true),
                KernelSize: l.KernelSize,
            })
        case "LSTM":
            // Re-initialize the state tensors so the loaded layer matches
            // what NewLSTM would have produced.
            hiddenSize := l.Weights.Shape[1]
            model.Layers = append(model.Layers, &LSTM{
                Weights:     NewTensor(l.Weights.Data, l.Weights.Shape, true),
                Bias:        NewTensor(l.Bias.Data, l.Bias.Shape, true),
                HiddenState: NewTensor(make([]float64, hiddenSize), []int{hiddenSize}, false),
                CellState:   NewTensor(make([]float64, hiddenSize), []int{hiddenSize}, false),
            })
        case "MultiHeadAttention":
            if l.WeightsO == nil || l.BiasO == nil {
                return nil, fmt.Errorf("attention layer is missing its output projection")
            }
            model.Layers = append(model.Layers, &MultiHeadAttention{
                WeightsQKV: NewTensor(l.Weights.Data, l.Weights.Shape, true),
                BiasQKV:    NewTensor(l.Bias.Data, l.Bias.Shape, true),
                WeightsO:   NewTensor(l.WeightsO.Data, l.WeightsO.Shape, true),
                BiasO:      NewTensor(l.BiasO.Data, l.BiasO.Shape, true),
                NumHeads:   l.NumHeads,
            })
        default:
            return nil, fmt.Errorf("unknown layer type: %s", l.Type)
        }
    }
    return model, nil
}
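
// saveLoadExample is an illustrative sketch (not part of the original file)
// of a save/load round trip. The path "model.json" is illustrative only.
func saveLoadExample() {
    m := NewModel(NewDense(4, 2))
    if err := m.SaveModel("model.json"); err != nil {
        fmt.Println("save failed:", err)
        return
    }
    loaded, err := LoadModel("model.json")
    if err != nil {
        fmt.Println("load failed:", err)
        return
    }
    fmt.Println(len(loaded.Layers)) // 1
}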
// LoadDatasetFromFile loads a dataset from a single file.
func LoadDatasetFromFile(filePath string) ([]*Tensor, []*Tensor, error) {
    var (
        inputs, targets []*Tensor
        err             error
    )
    // Determine the file type and load accordingly.
    switch ext := filepath.Ext(filePath); ext {
    case ".csv":
        inputs, targets, err = loadCSV(filePath)
    case ".json":
        inputs, targets, err = loadJSON(filePath)
    default:
        return nil, nil, fmt.Errorf("unsupported file type: %s", ext)
    }
    if err != nil {
        return nil, nil, fmt.Errorf("failed to load dataset from file: %v", err)
    }
    return inputs, targets, nil
}
// loadCSV loads data from a CSV file.
func loadCSV(filePath string) ([]*Tensor, []*Tensor, error) {
    file, err := os.Open(filePath)
    if err != nil {
        return nil, nil, err
    }
    defer file.Close()
    reader := csv.NewReader(file)
    records, err := reader.ReadAll()
    if err != nil {
        return nil, nil, err
    }
    var inputs, targets []*Tensor
    for _, record := range records {
        // Assume the last column is the target and the rest are inputs.
        inputData := make([]float64, len(record)-1)
        targetData := make([]float64, 1)
        for i := 0; i < len(record)-1; i++ {
            inputData[i] = parseFloat(record[i])
        }
        targetData[0] = parseFloat(record[len(record)-1])
        inputs = append(inputs, NewTensor(inputData, []int{len(inputData)}, false))
        targets = append(targets, NewTensor(targetData, []int{1}, false))
    }
    return inputs, targets, nil
}
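
// Expected CSV layout (inferred from the loader above): one sample per row,
// features first, target last. For example, a hypothetical data.csv of
//
//	0.1,0.2,0.3,1.0
//	0.4,0.5,0.6,0.0
//
// would yield two inputs of shape [3] and two targets of shape [1].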
// loadJSON loads data from a JSON file.
func loadJSON(filePath string) ([]*Tensor, []*Tensor, error) {
    data, err := os.ReadFile(filePath)
    if err != nil {
        return nil, nil, err
    }
    var records []map[string]interface{}
    if err := json.Unmarshal(data, &records); err != nil {
        return nil, nil, err
    }
    var inputs, targets []*Tensor
    for _, record := range records {
        inputData, ok := record["input"].([]interface{})
        if !ok {
            return nil, nil, fmt.Errorf("invalid input data in JSON")
        }
        targetData, ok := record["target"].(float64)
        if !ok {
            return nil, nil, fmt.Errorf("invalid target data in JSON")
        }
        floatInputData := make([]float64, len(inputData))
        for i, v := range inputData {
            f, ok := v.(float64) // checked assertion instead of panicking on bad data
            if !ok {
                return nil, nil, fmt.Errorf("invalid input element in JSON")
            }
            floatInputData[i] = f
        }
        inputs = append(inputs, NewTensor(floatInputData, []int{len(floatInputData)}, false))
        targets = append(targets, NewTensor([]float64{targetData}, []int{1}, false))
    }
    return inputs, targets, nil
}
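
// Expected JSON layout (inferred from the loader above): an array of objects
// with an "input" array and a scalar "target", e.g.
//
//	[{"input": [0.1, 0.2], "target": 1.0},
//	 {"input": [0.3, 0.4], "target": 0.0}]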
// LoadDatasetFromFolder loads a dataset from a folder containing multiple files.
func LoadDatasetFromFolder(folderPath string) ([]*Tensor, []*Tensor, error) {
    var inputs, targets []*Tensor
    files, err := os.ReadDir(folderPath)
    if err != nil {
        return nil, nil, fmt.Errorf("failed to read folder: %v", err)
    }
    for _, file := range files {
        if file.IsDir() {
            continue
        }
        filePath := filepath.Join(folderPath, file.Name())
        fileInputs, fileTargets, err := LoadDatasetFromFile(filePath)
        if err != nil {
            return nil, nil, fmt.Errorf("failed to load file %s: %v", filePath, err)
        }
        inputs = append(inputs, fileInputs...)
        targets = append(targets, fileTargets...)
    }
    return inputs, targets, nil
}
// AugmentData applies random augmentations, in place, to a single-channel
// [1, H, W] input. Data is indexed directly: calling At/Set here would try
// to re-acquire the tensor's RWMutex while we hold the write lock and
// deadlock (sync.RWMutex is not reentrant).
func AugmentData(input *Tensor) *Tensor {
    input.mu.Lock()
    defer input.mu.Unlock()
    height := input.Shape[1]
    width := input.Shape[2]
    at := func(y, x int) float64 { return input.Data[y*width+x] }
    set := func(v float64, y, x int) { input.Data[y*width+x] = v }
    // Horizontal flip.
    if rand.Float64() > 0.5 {
        for y := 0; y < height; y++ {
            for x := 0; x < width/2; x++ {
                tmp := at(y, x)
                set(at(y, width-1-x), y, x)
                set(tmp, y, width-1-x)
            }
        }
    }
    // Random rotation: 90 degrees clockwise, only valid for square images
    // because the tensor's shape is left unchanged.
    if height == width && rand.Float64() > 0.5 {
        rotated := make([]float64, height*width)
        for y := 0; y < height; y++ {
            for x := 0; x < width; x++ {
                rotated[x*height+height-1-y] = at(y, x)
            }
        }
        copy(input.Data, rotated)
    }
    // Random "zoom": a crude crop that shifts the top-left region toward
    // the middle and zero-pads the border. The clamps keep the indices in
    // range when zoomFactor exceeds 1.
    if rand.Float64() > 0.5 {
        zoomFactor := 0.9 + rand.Float64()*0.2
        newHeight := int(float64(height) * zoomFactor)
        newWidth := int(float64(width) * zoomFactor)
        if newHeight > height {
            newHeight = height
        }
        if newWidth > width {
            newWidth = width
        }
        offsetY := (height - newHeight) / 2
        offsetX := (width - newWidth) / 2
        zoomed := make([]float64, height*width)
        for y := 0; y < newHeight; y++ {
            for x := 0; x < newWidth; x++ {
                zoomed[(y+offsetY)*width+(x+offsetX)] = at(y, x)
            }
        }
        copy(input.Data, zoomed)
    }
    return input
}
// Evaluate evaluates the model on a validation set and returns the accuracy.
func (m *Model) Evaluate(inputs, targets []*Tensor) float64 {
    correct := 0
    total := len(inputs)
    for i := range inputs {
        output := m.Forward(inputs[i])
        predicted := maxIndex(output.Data)
        actual := maxIndex(targets[i].Data)
        if predicted == actual {
            correct++
        }
    }
    return float64(correct) / float64(total)
}

// maxIndex returns the index of the maximum value in a slice.
func maxIndex(data []float64) int {
    maxIdx := 0
    maxVal := data[0]
    for i := 1; i < len(data); i++ {
        if data[i] > maxVal {
            maxVal = data[i]
            maxIdx = i
        }
    }
    return maxIdx
}
// parseFloat converts a string to a float64.
func parseFloat(value string) float64 {
    floatValue, err := strconv.ParseFloat(value, 64)
    if err != nil {
        panic(fmt.Sprintf("failed to parse float: %v", err))
    }
    return floatValue
}
// BLASDot computes a dot product. The original version called cblas_ddot via
// cgo, but a cgo preamble must sit immediately above `import "C"` at the top
// of the file, not mid-file, so a pure-Go loop is used here to keep the file
// compiling. To restore the BLAS path, move the
// /* #cgo LDFLAGS: -lblas ... #include <cblas.h> */ comment and `import "C"`
// up to the import section.
func BLASDot(a, b []float64) float64 {
    sum := 0.0
    for i := range a {
        sum += a[i] * b[i]
    }
    return sum
}

// OptimizedDot performs a dot product, delegating to BLASDot.
func OptimizedDot(a, b *Tensor) float64 {
    if len(a.Data) != len(b.Data) {
        panic("Tensor sizes mismatch")
    }
    return BLASDot(a.Data, b.Data)
}
// numCPU is the number of CPU cores to use for parallel operations.
var numCPU = runtime.NumCPU()
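
// trainXORExample is an end-to-end usage sketch (not part of the original
// file): build a two-layer model, train it on the XOR truth table, and save
// it. The file name "xor.json" is illustrative. Since gradient propagation
// in this file is a placeholder, the printed loss will not actually
// decrease until a full backward pass is wired in.
func trainXORExample() {
    inputs := []*Tensor{
        NewTensor([]float64{0, 0}, []int{2}, false),
        NewTensor([]float64{0, 1}, []int{2}, false),
        NewTensor([]float64{1, 0}, []int{2}, false),
        NewTensor([]float64{1, 1}, []int{2}, false),
    }
    targets := []*Tensor{
        NewTensor([]float64{0}, []int{1}, false),
        NewTensor([]float64{1}, []int{1}, false),
        NewTensor([]float64{1}, []int{1}, false),
        NewTensor([]float64{0}, []int{1}, false),
    }
    model := NewModel(NewDense(2, 4), NewDense(4, 1))
    model.Train(inputs, targets, 10, 0.01)
    if err := model.SaveModel("xor.json"); err != nil {
        fmt.Println("save failed:", err)
    }
}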