mirror of
https://github.com/ollama/ollama.git
synced 2026-03-27 02:58:43 +07:00
TeaCache:
- Timestep-embedding similarity caching for diffusion models
- Polynomial rescaling with configurable thresholds
- Reduces transformer forward passes by ~30-50%

FP8 quantization:
- Support for FP8 quantized models (8-bit weights with scales)
- QuantizedMatmul on Metal, Dequantize on CUDA
- Client-side quantization via `ollama create --quantize fp8`

Other bug fixes:
- Fix `/api/show` API for image generation models
- Server properly returns model info (architecture, parameters, quantization)
- Memory allocation optimizations
- CLI improvements for image generation
23 lines
703 B
Go
package imagegen
|
|
|
|
import (
|
|
"io"
|
|
"strings"
|
|
)
|
|
|
|
// QuantizingTensorLayerCreator creates tensor layers with optional quantization.
//
// r supplies the raw tensor data; name, dtype, and shape describe the tensor
// being written. When quantize is true, the returned slice may contain multiple
// layers (weight + scales + biases); when false, a single unquantized layer is
// expected. LayerInfo is declared elsewhere in this package.
type QuantizingTensorLayerCreator func(r io.Reader, name, dtype string, shape []int32, quantize bool) ([]LayerInfo, error)
|
|
|
|
// ShouldQuantize reports whether the named tensor should be quantized.
//
// Only linear-layer weights qualify: the VAE component is excluded entirely,
// embedding and normalization tensors are excluded by name, and anything that
// is not a ".weight" tensor (e.g. biases) is excluded by suffix.
func ShouldQuantize(name, component string) bool {
	switch {
	case component == "vae":
		// Never quantize VAE tensors.
		return false
	case strings.Contains(name, "embed"), strings.Contains(name, "norm"):
		// Embeddings and norm layers stay at full precision.
		return false
	default:
		// Remaining candidates: linear weights only (excludes biases).
		return strings.HasSuffix(name, ".weight")
	}
}
|