From 8f4a0081398d89a88a34d7c553b74c6578d212be Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Mon, 2 Feb 2026 15:39:18 -0800
Subject: [PATCH] Add GLM-OCR vision model support (#14024)

---
 convert/convert.go                    |   2 +
 convert/convert_glmocr.go             | 455 ++++++++++++++++++++++++++
 convert/reader_safetensors.go         |   1 +
 fs/ggml/ggml.go                       |   2 +
 ml/backend.go                         |   1 +
 ml/backend/ggml/ggml.go               |   7 +
 model/models/glmocr/imageprocessor.go | 174 ++++++++++
 model/models/glmocr/model.go          | 235 +++++++++++++
 model/models/glmocr/model_text.go     | 190 +++++++++++
 model/models/glmocr/model_vision.go   | 355 ++++++++++++++++++++
 model/models/models.go                |   1 +
 model/parsers/glmocr.go               |  17 +
 model/parsers/parsers.go              |   2 +
 model/renderers/glmocr.go             | 109 ++++++
 model/renderers/renderer.go           |   2 +
 15 files changed, 1553 insertions(+)
 create mode 100644 convert/convert_glmocr.go
 create mode 100644 model/models/glmocr/imageprocessor.go
 create mode 100644 model/models/glmocr/model.go
 create mode 100644 model/models/glmocr/model_text.go
 create mode 100644 model/models/glmocr/model_vision.go
 create mode 100644 model/parsers/glmocr.go
 create mode 100644 model/renderers/glmocr.go

diff --git a/convert/convert.go b/convert/convert.go
index df4359224..73b494747 100644
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -313,6 +313,8 @@ func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
 		conv = &deepseek2Model{}
 	case "Glm4MoeLiteForCausalLM":
 		conv = &glm4MoeLiteModel{}
+	case "GlmOcrForConditionalGeneration":
+		conv = &glmOcrModel{}
 	case "Lfm2ForCausalLM":
 		conv = &lfm2Model{}
 	default:
diff --git a/convert/convert_glmocr.go b/convert/convert_glmocr.go
new file mode 100644
index 000000000..c8524fdbf
--- /dev/null
+++ b/convert/convert_glmocr.go
@@ -0,0 +1,455 @@
+package convert
+
+import (
+	"cmp"
+	"encoding/json"
+	"io/fs"
+	"log/slog"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"github.com/ollama/ollama/fs/ggml"
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+)
+
+// normalToNeoXRepacker creates a repacker that permutes Q/K weights from interleaved (LLaMA)
+// to NeoX ordering for compatibility with GGML's M-RoPE kernel.
+//
+// For weights: reshape [out, in] -> [n_heads, head_dim, in], permute rotary dims, reshape back
+// For biases: reshape [out] -> [n_heads, head_dim], permute rotary dims, reshape back
+func normalToNeoXRepacker(nHeads, headDim int, partialRotaryFactor float32) func(string, []float32, []uint64) ([]float32, error) {
+	return func(_ string, data []float32, shape []uint64) ([]float32, error) {
+		rotaryDim := int(float32(headDim) * partialRotaryFactor)
+		if rotaryDim%2 != 0 {
+			rotaryDim = (rotaryDim / 2) * 2 // Round down to even
+		}
+
+		// Handle 1D (bias) or 2D (weight) tensors
+		is1D := len(shape) == 1
+		var inFeatures int
+		if is1D {
+			inFeatures = 1
+		} else {
+			inFeatures = int(shape[1])
+		}
+		outFeatures := int(shape[0])
+		nEffectiveHeads := outFeatures / headDim
+
+		if nEffectiveHeads != nHeads {
+			slog.Warn("normalToNeoX: unexpected head count", "effective", nEffectiveHeads, "expected", nHeads)
+		}
+
+		// Reshape to [n_heads, head_dim, in_features]
+		reshaped := make([]float32, len(data))
+		copy(reshaped, data)
+
+		// Permute the rotary dimensions: even indices first, then odd
+		// For each head, reorder [0,1,2,3,4,5...] to [0,2,4...,1,3,5...]
+		result := make([]float32, len(data))
+		halfRotary := rotaryDim / 2
+
+		for h := range nEffectiveHeads {
+			for f := range inFeatures {
+				for i := range halfRotary {
+					// Even dim (0, 2, 4, ...) -> position i
+					srcIdx := h*headDim*inFeatures + (2*i)*inFeatures + f
+					dstIdx := h*headDim*inFeatures + i*inFeatures + f
+					result[dstIdx] = reshaped[srcIdx]
+
+					// Odd dim (1, 3, 5, ...) -> position halfRotary + i
+					srcIdx = h*headDim*inFeatures + (2*i+1)*inFeatures + f
+					dstIdx = h*headDim*inFeatures + (halfRotary+i)*inFeatures + f
+					result[dstIdx] = reshaped[srcIdx]
+				}
+
+				// Non-rotary part: copy as-is
+				for i := rotaryDim; i < headDim; i++ {
+					srcIdx := h*headDim*inFeatures + i*inFeatures + f
+					result[srcIdx] = reshaped[srcIdx]
+				}
+			}
+		}
+
+		return result, nil
+	}
+}
+
+type glmOcrModel struct {
+	ModelParameters
+
+	TextConfig struct {
+		HiddenSize          uint32  `json:"hidden_size"`
+		IntermediateSize    uint32  `json:"intermediate_size"`
+		NumHiddenLayers     uint32  `json:"num_hidden_layers"`
+		NumAttentionHeads   uint32  `json:"num_attention_heads"`
+		NumKeyValueHeads    uint32  `json:"num_key_value_heads"`
+		HeadDim             uint32  `json:"head_dim"`
+		MaxPositionEmbed    uint32  `json:"max_position_embeddings"`
+		RMSNormEps          float32 `json:"rms_norm_eps"`
+		PartialRotaryFactor float32 `json:"partial_rotary_factor"`
+		RopeParameters      struct {
+			RopeType            string  `json:"rope_type"`
+			MRopeSection        []int32 `json:"mrope_section"`
+			RopeTheta           float32 `json:"rope_theta"`
+			PartialRotaryFactor float32 `json:"partial_rotary_factor"`
+		} `json:"rope_parameters"`
+	} `json:"text_config"`
+
+	VisionConfig struct {
+		HiddenSize        uint32  `json:"hidden_size"`
+		IntermediateSize  uint32  `json:"intermediate_size"`
+		Depth             uint32  `json:"depth"`
+		NumHeads          uint32  `json:"num_heads"`
+		ImageSize         uint32  `json:"image_size"`
+		PatchSize         uint32  `json:"patch_size"`
+		OutHiddenSize     uint32  `json:"out_hidden_size"`
+		RMSNormEps        float32 `json:"rms_norm_eps"`
+		SpatialMergeSize  uint32  `json:"spatial_merge_size"`
+		TemporalPatchSize uint32  `json:"temporal_patch_size"`
+	} `json:"vision_config"`
+
+	ImageStartTokenID uint32 `json:"image_start_token_id"`
+	ImageEndTokenID   uint32 `json:"image_end_token_id"`
+	VideoStartTokenID uint32 `json:"video_start_token_id"`
+	VideoEndTokenID   uint32 `json:"video_end_token_id"`
+	ImageTokenID      uint32 `json:"image_token_id"`
+	VideoTokenID      uint32 `json:"video_token_id"`
+
+	// Preprocessor config (preprocessor_config.json)
+	Preprocessor struct {
+		Size struct {
+			ShortestEdge uint32 `json:"shortest_edge"`
+			LongestEdge  uint32 `json:"longest_edge"`
+		} `json:"size"`
+		PatchSize         uint32    `json:"patch_size"`
+		TemporalPatchSize uint32    `json:"temporal_patch_size"`
+		MergeSize         uint32    `json:"merge_size"`
+		ImageMean         []float32 `json:"image_mean"`
+		ImageStd          []float32 `json:"image_std"`
+	} `json:"-"`
+}
+
+var _ ModelConverter = (*glmOcrModel)(nil)
+
+func (m *glmOcrModel) parseMore(fsys fs.FS) error {
+	bts, err := fs.ReadFile(fsys, "preprocessor_config.json")
+	if err != nil {
+		return err
+	}
+
+	return json.Unmarshal(bts, &m.Preprocessor)
+}
+
+func (m *glmOcrModel) KV(t *Tokenizer) KV {
+	kv := m.ModelParameters.KV(t)
+	kv["general.architecture"] = "glmocr"
+
+	// Text model parameters
+	kv["glmocr.block_count"] = cmp.Or(m.TextConfig.NumHiddenLayers, 16)
+	kv["glmocr.embedding_length"] = cmp.Or(m.TextConfig.HiddenSize, 1536)
+	kv["glmocr.attention.head_count"] = cmp.Or(m.TextConfig.NumAttentionHeads, 16)
+	kv["glmocr.attention.head_count_kv"] = cmp.Or(m.TextConfig.NumKeyValueHeads, 8)
+	headDim := cmp.Or(m.TextConfig.HeadDim, m.TextConfig.HiddenSize/m.TextConfig.NumAttentionHeads)
+	kv["glmocr.attention.key_length"] = headDim
+	kv["glmocr.attention.value_length"] = headDim
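+	// cmp.Or returns the first non-zero value, so explicit config.json values
+	// take precedence and the literals here act as GLM-OCR defaults.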
+	kv["glmocr.feed_forward_length"] = cmp.Or(m.TextConfig.IntermediateSize, 4608)
+	kv["glmocr.attention.layer_norm_rms_epsilon"] = cmp.Or(m.TextConfig.RMSNormEps, 1e-5)
+	kv["glmocr.context_length"] = cmp.Or(m.TextConfig.MaxPositionEmbed, 131072)
+	kv["glmocr.rope.freq_base"] = cmp.Or(m.TextConfig.RopeParameters.RopeTheta, float32(10000))
+	kv["glmocr.rope.partial_rotary_factor"] = cmp.Or(m.TextConfig.RopeParameters.PartialRotaryFactor, m.TextConfig.PartialRotaryFactor, float32(1.0))
+	if len(m.TextConfig.RopeParameters.MRopeSection) > 0 {
+		kv["glmocr.rope.mrope_section"] = m.TextConfig.RopeParameters.MRopeSection
+	}
+
+	// Vision model parameters
+	kv["glmocr.vision.block_count"] = cmp.Or(m.VisionConfig.Depth, 24)
+	kv["glmocr.vision.embedding_length"] = cmp.Or(m.VisionConfig.HiddenSize, 1024)
+	kv["glmocr.vision.attention.head_count"] = cmp.Or(m.VisionConfig.NumHeads, 16)
+	kv["glmocr.vision.image_size"] = cmp.Or(m.VisionConfig.ImageSize, 336)
+	kv["glmocr.vision.patch_size"] = cmp.Or(m.VisionConfig.PatchSize, m.Preprocessor.PatchSize, 14)
+	kv["glmocr.vision.spatial_merge_size"] = cmp.Or(m.VisionConfig.SpatialMergeSize, m.Preprocessor.MergeSize, 2)
+	kv["glmocr.vision.temporal_patch_size"] = cmp.Or(m.VisionConfig.TemporalPatchSize, m.Preprocessor.TemporalPatchSize, 2)
+	kv["glmocr.vision.out_hidden_size"] = cmp.Or(m.VisionConfig.OutHiddenSize, 1536)
+	kv["glmocr.vision.intermediate_size"] = cmp.Or(m.VisionConfig.IntermediateSize, 4096)
+	kv["glmocr.vision.attention.layer_norm_rms_epsilon"] = cmp.Or(m.VisionConfig.RMSNormEps, 1e-5)
+
+	// Preprocessor-derived image settings (min/max pixels and normalization)
+	// Note: fs.Config.keyValue() auto-prepends architecture prefix, so use full key
+	if m.Preprocessor.Size.ShortestEdge > 0 {
+		kv["glmocr.vision.min_pixels"] = m.Preprocessor.Size.ShortestEdge
+	}
+	if m.Preprocessor.Size.LongestEdge > 0 {
+		kv["glmocr.vision.max_pixels"] = m.Preprocessor.Size.LongestEdge
+	}
+	if len(m.Preprocessor.ImageMean) == 3 {
+		kv["glmocr.vision.image_mean"] = m.Preprocessor.ImageMean
+	}
+	if len(m.Preprocessor.ImageStd) == 3 {
+		kv["glmocr.vision.image_std"] = m.Preprocessor.ImageStd
+	}
+
+	// Special tokens
+	kv["glmocr.image_token_id"] = m.ImageTokenID
+	kv["glmocr.image_start_token_id"] = m.ImageStartTokenID
+	kv["glmocr.image_end_token_id"] = m.ImageEndTokenID
+	kv["glmocr.video_token_id"] = m.VideoTokenID
+	kv["glmocr.video_start_token_id"] = m.VideoStartTokenID
+	kv["glmocr.video_end_token_id"] = m.VideoEndTokenID
+
+	return kv
+}
+
+func (m *glmOcrModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
+
+	// Skip layers >= num_hidden_layers (Multi-Token Prediction layers not needed for basic inference)
+	numLayers := int(cmp.Or(m.TextConfig.NumHiddenLayers, 16))
+	skipLayer := func(name string) bool {
+		// Tensor names are already replaced to "blk.N.xxx" format
+		re := regexp.MustCompile(`^blk\.(\d+)`)
+		matches := re.FindStringSubmatch(name)
+		if matches == nil {
+			return false
+		}
+		blkNum, err := strconv.Atoi(matches[1])
+		if err != nil {
+			return false
+		}
+		return blkNum >= numLayers
+	}
+
+	for _, t := range ts {
+		name := t.Name()
+
+		// Skip next-n prediction layers (layers >= num_hidden_layers)
+		if skipLayer(name) {
+			continue
+		}
+
+		// Split ffn_gate_up into separate gate and up projections
+		if strings.Contains(name, "ffn_gate_up") {
+			for t := range splitDim(t, 0,
+				split{Replacer: strings.NewReplacer("ffn_gate_up", "ffn_gate")},
+				split{Replacer: strings.NewReplacer("ffn_gate_up", "ffn_up")},
+			) {
+				out = append(out, t)
+			}
+			continue
+		}
+
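+		// The HF checkpoint stores patch_embed.proj as a single Conv3D weight
+		// [out, in, temporal=2, kh, kw]; the block below slices the temporal
+		// dimension into two Conv2D kernels (patch_embd_0 / patch_embd_1)
+		// whose outputs are summed at inference time to emulate the Conv3D.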
+		if strings.HasSuffix(name, "patch_embd.weight") {
+			shape := t.Shape()
+			if len(shape) == 5 && shape[2] == 2 {
+				newShape := []uint64{shape[0], shape[1], shape[3], shape[4]}
+
+				t0 := t.Clone()
+				t0.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
+					dims := make([]int, len(shape))
+					for i := range shape {
+						dims[i] = int(shape[i])
+					}
+					var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+					tt, err := tt.Slice(nil, nil, tensor.S(0, 1), nil, nil)
+					if err != nil {
+						return nil, err
+					}
+					tt = tensor.Materialize(tt)
+					newDims := []int{int(shape[0]), int(shape[1]), int(shape[3]), int(shape[4])}
+					if err := tt.Reshape(newDims...); err != nil {
+						return nil, err
+					}
+					if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
+						return nil, err
+					}
+					return native.VectorF32(tt.(*tensor.Dense))
+				})
+				out = append(out, &ggml.Tensor{
+					Name:     strings.Replace(name, "patch_embd.weight", "patch_embd_0.weight", 1),
+					Kind:     t.Kind(),
+					Shape:    newShape,
+					WriterTo: t0,
+				})
+
+				t1 := t.Clone()
+				t1.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
+					dims := make([]int, len(shape))
+					for i := range shape {
+						dims[i] = int(shape[i])
+					}
+					var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+					tt, err := tt.Slice(nil, nil, tensor.S(1, 2), nil, nil)
+					if err != nil {
+						return nil, err
+					}
+					tt = tensor.Materialize(tt)
+					newDims := []int{int(shape[0]), int(shape[1]), int(shape[3]), int(shape[4])}
+					if err := tt.Reshape(newDims...); err != nil {
+						return nil, err
+					}
+					if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
+						return nil, err
+					}
+					return native.VectorF32(tt.(*tensor.Dense))
+				})
+				out = append(out, &ggml.Tensor{
+					Name:     strings.Replace(name, "patch_embd.weight", "patch_embd_1.weight", 1),
+					Kind:     t.Kind(),
+					Shape:    newShape,
+					WriterTo: t1,
+				})
+
+				continue
+			}
+
+			if len(shape) == 4 {
+				out = append(out, &ggml.Tensor{
+					Name:     strings.Replace(name, "patch_embd.weight", "patch_embd_0.weight", 1),
+					Kind:     t.Kind(),
+					Shape:    t.Shape(),
+					WriterTo: t,
+				})
+				continue
+			}
+
+			slog.Warn("glmocr: patch_embed weight has unexpected shape - not splitting", "shape", shape)
+			// Fall through to default handling
+		}
+
+		// Handle pre-split patch embedding weights
+		// Pattern 1: v.patch_embd.0.weight, v.patch_embd.1.weight -> patch_embd_0.weight, patch_embd_1.weight
+		// Pattern 2: v.patch_embd.weight.0, v.patch_embd.weight.1 -> patch_embd_0.weight, patch_embd_1.weight
+		if strings.Contains(name, "patch_embd.0.") {
+			out = append(out, &ggml.Tensor{
+				Name:     strings.Replace(name, "patch_embd.0.", "patch_embd_0.", 1),
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: t,
+			})
+			continue
+		}
+		if strings.Contains(name, "patch_embd.1.") {
+			out = append(out, &ggml.Tensor{
+				Name:     strings.Replace(name, "patch_embd.1.", "patch_embd_1.", 1),
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: t,
+			})
+			continue
+		}
+		// Handle .weight.0 and .weight.1 suffix patterns
+		if strings.HasSuffix(name, "patch_embd.weight.0") {
+			out = append(out, &ggml.Tensor{
+				Name:     strings.Replace(name, "patch_embd.weight.0", "patch_embd_0.weight", 1),
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: t,
+			})
+			continue
+		}
+		if strings.HasSuffix(name, "patch_embd.weight.1") {
+			out = append(out, &ggml.Tensor{
+				Name:     strings.Replace(name, "patch_embd.weight.1", "patch_embd_1.weight", 1),
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: t,
+			})
+			continue
+		}
+
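+		// Illustration of the repack below for headDim=8, fully rotary:
+		// interleaved [x0 x1 x2 x3 x4 x5 x6 x7] -> NeoX [x0 x2 x4 x6 x1 x3 x5 x7]
+		// (even rotary dims first, then odd), applied per head and input column.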
+		// Permute Q/K weights for M-RoPE compatibility (interleaved -> NeoX ordering)
+		// GGML's M-RoPE kernel uses NeoX-style rotation, but GLM-OCR uses interleaved (LLaMA-style)
+		// We permute at conversion time so the weights work correctly with GGML's kernel
+		// This aligns Q/K rotary dimensions with GGML's NeoX-style rotation
+		if len(m.TextConfig.RopeParameters.MRopeSection) > 0 &&
+			strings.Contains(name, "blk.") && (strings.Contains(name, "attn_q.") || strings.Contains(name, "attn_k.")) {
+			// Get config values for permutation
+			nHeads := int(cmp.Or(m.TextConfig.NumAttentionHeads, 16))
+			nKVHeads := int(cmp.Or(m.TextConfig.NumKeyValueHeads, 8))
+			hiddenSize := int(cmp.Or(m.TextConfig.HiddenSize, 1536))
+			headDim := int(cmp.Or(m.TextConfig.HeadDim, uint32(hiddenSize/nHeads)))
+			partialRotaryFactor := cmp.Or(m.TextConfig.PartialRotaryFactor, m.TextConfig.RopeParameters.PartialRotaryFactor, float32(1.0))
+
+			// Use appropriate head count: nHeads for Q, nKVHeads for K
+			effectiveHeads := nHeads
+			if strings.Contains(name, "attn_k.") {
+				effectiveHeads = nKVHeads
+			}
+
+			permutedT := t.Clone()
+			permutedT.SetRepacker(normalToNeoXRepacker(effectiveHeads, headDim, partialRotaryFactor))
+			out = append(out, &ggml.Tensor{
+				Name:     name,
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: permutedT,
+			})
+			continue
+		}
+
+		out = append(out, &ggml.Tensor{
+			Name:     name,
+			Kind:     t.Kind(),
+			Shape:    t.Shape(),
+			WriterTo: t,
+		})
+	}
+
+	return out
+}
+
+func (m *glmOcrModel) Replacements() []string {
+	return []string{
+		// Vision encoder
+		"model.visual.patch_embed.proj_1", "v.patch_embd_1", // Second temporal split
+		"model.visual.patch_embed.proj", "v.patch_embd",
+		"model.visual.blocks", "v.blk",
+		"model.visual.post_layernorm", "v.post_ln",
+		"model.visual.downsample", "mm.patch_merger",
+
+		// Vision attention
+		"attn.qkv", "attn_qkv",
+		"attn.proj", "attn_out",
+		"attn.q_norm", "attn_q_norm",
+		"attn.k_norm", "attn_k_norm",
+
+		// Vision norms
+		"norm1", "ln1",
+		"norm2", "ln2",
+
+		// Vision MLP
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.up_proj", "ffn_up",
+		"mlp.down_proj", "ffn_down",
+
+		// Merger (multimodal projector)
+		"model.visual.merger.proj", "mm.model.fc",
+		"model.visual.merger.post_projection_norm", "mm.post_norm",
+		"model.visual.merger.gate_proj", "mm.gate",
+		"model.visual.merger.up_proj", "mm.up",
+		"model.visual.merger.down_proj", "mm.down",
+
+		// Language model
+		"model.language_model.embed_tokens", "token_embd",
+		"model.language_model.layers", "blk",
+		"model.language_model.norm", "output_norm",
+		"lm_head", "output",
+
+		// Language model attention
+		"self_attn.q_proj", "attn_q",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_out",
+
+		// Language model norms
+		"input_layernorm", "attn_norm",
+		"post_attention_layernorm", "ffn_norm",
+		"post_self_attn_layernorm", "post_attn_norm",
+		"post_mlp_layernorm", "post_ffn_norm",
+
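+		// Example: "model.language_model.layers.3.mlp.gate_up_proj.weight"
+		// becomes "blk.3.ffn_gate_up.weight" after these replacements run.
+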
+		// Language model MLP (remove mlp. prefix so ffn_* names work)
+		"mlp.gate_up_proj", "ffn_gate_up",
+		"mlp.down_proj", "ffn_down",
+	}
+}
diff --git a/convert/reader_safetensors.go b/convert/reader_safetensors.go
index f7d9754f0..da1a62e02 100644
--- a/convert/reader_safetensors.go
+++ b/convert/reader_safetensors.go
@@ -99,6 +99,7 @@ func (st safetensor) Kind() uint32 {
 	if st.dtype == "BF16" &&
 		!strings.HasPrefix(st.name, "v.") &&
 		!strings.HasPrefix(st.name, "s.") &&
+		!strings.HasPrefix(st.name, "mm.") &&
 		kind != tensorKindFP32 {
 		kind = tensorKindBF16
 	}
diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go
index 4f31221f9..aa5377ebc 100644
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -270,6 +270,7 @@ func (kv KV) OllamaEngineRequired() bool {
 		"qwen3", "qwen3moe",
 		"qwen3vl", "qwen3vlmoe",
 		"glm4moelite",
+		"glmocr",
 		"lfm2",
 	}, kv.Architecture())
 }
@@ -859,6 +860,7 @@ func (f GGML) FlashAttention() bool {
 		"bert",
 		"gemma3",
 		"glm4moelite",
+		"glmocr",
 		"gptoss", "gpt-oss",
 		"lfm2",
 		"mistral3",
diff --git a/ml/backend.go b/ml/backend.go
index fa1f32b69..624e2c773 100644
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -170,6 +170,7 @@ type Tensor interface {
 	Cos(ctx Context) Tensor
 	Tanh(ctx Context) Tensor
 	GELU(ctx Context, up ...Tensor) Tensor
+	GELU_ERF(ctx Context) Tensor
 	QuickGELU(ctx Context, up ...Tensor) Tensor
 	SILU(ctx Context, up ...Tensor) Tensor
 	RELU(ctx Context, up ...Tensor) Tensor
diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 138c646bd..ea69235b0 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -1581,6 +1581,13 @@ func (t *Tensor) GELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
 	}
 }
 
+func (t *Tensor) GELU_ERF(ctx ml.Context) ml.Tensor {
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_gelu_erf_inplace(ctx.(*Context).ctx, t.t),
+	}
+}
+
 func (t *Tensor) QuickGELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
 	var tt *C.struct_ggml_tensor
 	if len(t2) > 0 {
diff --git a/model/models/glmocr/imageprocessor.go b/model/models/glmocr/imageprocessor.go
new file mode 100644
index 000000000..a26f42c4c
--- /dev/null
+++ b/model/models/glmocr/imageprocessor.go
@@ -0,0 +1,174 @@
+package glmocr
+
+import (
+	"image"
+	"log/slog"
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/model/imageproc"
+)
+
+type ImageProcessor struct {
+	imageSize         int
+	patchSize         int
+	temporalPatchSize int
+	spatialMergeSize  int
+	minPixels         int
+	maxPixels         int
+	factor            int
+	imageMean         [3]float32
+	imageStd          [3]float32
+}
+
+func newImageProcessor(c fs.Config) ImageProcessor {
+	patchSize := int(c.Uint("vision.patch_size", 14))
+	spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
+	temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
+
+	// Read normalization values from config if available, otherwise use CLIP defaults
+	imageMean := c.Floats("vision.image_mean", imageproc.ClipDefaultMean[:])
+	imageStd := c.Floats("vision.image_std", imageproc.ClipDefaultSTD[:])
+
+	// Default max_pixels: 2048 * patchSize^2 * mergeSize^2 * temporal = ~3.2M pixels
+	// This limits to ~16k patches (4k output tokens) to keep memory stable without flash attention
+	defaultMaxPixels := 2048 * patchSize * patchSize * spatialMergeSize * spatialMergeSize * temporalPatchSize
+
+	return ImageProcessor{
+		imageSize:         int(c.Uint("vision.image_size", 336)),
+		patchSize:         patchSize,
+		temporalPatchSize: temporalPatchSize,
+		spatialMergeSize:  spatialMergeSize,
+		minPixels:         int(c.Uint("vision.min_pixels", uint32(8*patchSize*patchSize*spatialMergeSize*spatialMergeSize*temporalPatchSize))),
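+		// The min_pixels fallback above evaluates to 8*14*14*2*2*2 = 12,544
+		// pixels with the default patch/merge/temporal sizes.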
+		maxPixels:         int(c.Uint("vision.max_pixels", uint32(defaultMaxPixels))),
+		factor:            patchSize * spatialMergeSize,
+		imageMean:         [3]float32{imageMean[0], imageMean[1], imageMean[2]},
+		imageStd:          [3]float32{imageStd[0], imageStd[1], imageStd[2]},
+	}
+}
+
+func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
+	factor := p.factor
+	temporalFactor := p.temporalPatchSize
+	numFrames := temporalFactor // single image
+
+	if height < factor || width < factor {
+		// Scale up small images
+		scale := float64(factor) / float64(min(height, width))
+		height = int(math.Ceil(float64(height) * scale))
+		width = int(math.Ceil(float64(width) * scale))
+	}
+
+	if temporalFactor <= 0 {
+		slog.Warn("temporal_patch_size must be > 0, defaulting to 1")
+		temporalFactor = 1
+	}
+	if numFrames < temporalFactor {
+		slog.Warn("num_frames must be >= temporal_patch_size, adjusting num_frames", "num_frames", numFrames, "temporal_patch_size", temporalFactor)
+		numFrames = temporalFactor
+	}
+	if aspectRatio := float64(max(height, width)) / float64(min(height, width)); aspectRatio > 200 {
+		slog.Warn("aspect ratio exceeds 200, image quality may be affected", "aspect_ratio", aspectRatio)
+	}
+
+	round := func(x float64) int { return int(math.RoundToEven(x)) }
+
+	hBar := round(float64(height)/float64(factor)) * factor
+	wBar := round(float64(width)/float64(factor)) * factor
+	tBar := round(float64(numFrames)/float64(temporalFactor)) * temporalFactor
+
+	if tBar*hBar*wBar > p.maxPixels {
+		beta := math.Sqrt(float64(numFrames*height*width) / float64(p.maxPixels))
+		hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
+		wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
+	} else if tBar*hBar*wBar < p.minPixels {
+		beta := math.Sqrt(float64(p.minPixels) / float64(numFrames*height*width))
+		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
+		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
+	}
+
+	return hBar, wBar
+}
+
+func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) {
+	img = imageproc.Composite(img)
+
+	origWidth := img.Bounds().Dx()
+	origHeight := img.Bounds().Dy()
+
+	// Calculate smart resize dimensions
+	resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)
+
+	// Resize image
+	resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeCatmullrom)
+
+	// Normalize pixels - output format is [C, H, W] with rescale and channelFirst
+	// We keep [C, H, W] for patch extraction
+	normalizedPixels := imageproc.Normalize(resizedImg, p.imageMean, p.imageStd, true, true)
+
+	// Calculate grid dimensions (after Conv2D patching)
+	grid := &Grid{
+		Height:      resizedHeight / p.patchSize,
+		Width:       resizedWidth / p.patchSize,
+		Temporal:    1, // Single image
+		ImageHeight: resizedHeight,
+		ImageWidth:  resizedWidth,
+	}
+
+	patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	return patches, grid, nil
+}
+
+func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
+	channels := 3
+	patchSize := p.patchSize
+	mergeSize := p.spatialMergeSize
+	temporalPatchSize := p.temporalPatchSize
+
+	numPatches := grid.Temporal * grid.Height * grid.Width
+	patchDim := channels * temporalPatchSize * patchSize * patchSize
+	result := make([]float32, numPatches*patchDim)
+	patchIndex := 0
+
+	// Single temporal frame handling (copies to all frames)
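+	// Layout example for a 4x4 patch grid with mergeSize=2: patches are
+	// emitted as (0,0),(0,1),(1,0),(1,1), then (0,2),(0,3),(1,2),(1,3), ...
+	// so each 2x2 merge block is contiguous in the output buffer.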
+	for range grid.Temporal {
+		for h := 0; h < grid.Height; h += mergeSize {
+			for w := 0; w < grid.Width; w += mergeSize {
+				for mh := range mergeSize {
+					for mw := range mergeSize {
+						baseOffset := patchIndex * patchDim
+						for c := range channels {
+							channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
+							for py := range patchSize {
+								for px := range patchSize {
+									y := (h+mh)*patchSize + py
+									x := (w+mw)*patchSize + px
+									srcIdx := c*height*width + y*width + x
+									dstIdx := channelOffset + (py * patchSize) + px
+									result[dstIdx] = pixels[srcIdx]
+								}
+							}
+
+							if temporalPatchSize > 1 {
+								frameSize := patchSize * patchSize
+								for tp := 1; tp < temporalPatchSize; tp++ {
+									currentFrameOffset := channelOffset + (tp * frameSize)
+									copy(result[currentFrameOffset:currentFrameOffset+frameSize],
+										result[channelOffset:channelOffset+frameSize])
+								}
+							}
+						}
+
+						patchIndex++
+					}
+				}
+			}
+		}
+	}
+
+	return result, nil
+}
diff --git a/model/models/glmocr/model.go b/model/models/glmocr/model.go
new file mode 100644
index 000000000..82d687660
--- /dev/null
+++ b/model/models/glmocr/model.go
@@ -0,0 +1,235 @@
+package glmocr
+
+import (
+	"bytes"
+	"errors"
+	"image"
+	"slices"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+)
+
+type Model struct {
+	model.Base
+	model.BytePairEncoding
+
+	*TextModel
+	*VisionModel     `gguf:"v"`
+	VisionDownsample *VisionDownsample `gguf:"mm.patch_merger"`
+	PatchMerger      *PatchMerger      `gguf:"mm"`
+
+	ImageProcessor
+
+	imageTokenID      int32
+	imageStartTokenID int32
+	imageEndTokenID   int32
+}
+
+var _ model.MultimodalProcessor = (*Model)(nil)
+
+func New(c fs.Config) (model.Model, error) {
+	eosTokenID := int32(c.Uint("tokenizer.ggml.eos_token_id"))
+	eosTokenIDs := c.Ints("tokenizer.ggml.eos_token_ids")
+	allEOS := append([]int32{eosTokenID}, eosTokenIDs...)
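+	// Example with hypothetical IDs: eos_token_id=1 and eos_token_ids=[2,3]
+	// yield allEOS=[1,2,3]; generation stops on any of them.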
+
+	m := &Model{
+		BytePairEncoding: model.NewBytePairEncoding(
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS:    allEOS,
+			},
+			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
+		),
+		TextModel:         newTextModel(c),
+		VisionModel:       newVisionModel(c),
+		ImageProcessor:    newImageProcessor(c),
+		imageTokenID:      int32(c.Uint("image_token_id", 59280)),
+		imageStartTokenID: int32(c.Uint("image_start_token_id", 59256)),
+		imageEndTokenID:   int32(c.Uint("image_end_token_id", 59257)),
+	}
+
+	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
+
+	return m, nil
+}
+
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
+	if len(m.VisionModel.Blocks) == 0 {
+		return nil, model.ErrNoVisionModel
+	}
+
+	img, _, err := image.Decode(bytes.NewReader(multimodalData))
+	if err != nil {
+		return nil, err
+	}
+
+	f32s, grid, err := m.ImageProcessor.ProcessImage(img)
+	if err != nil {
+		return nil, err
+	}
+
+	// Create pixel values tensor from flattened patches
+	// Shape: [patchDim, numPatches]
+	patchDim := m.VisionModel.numChannels * m.temporalPatchSize * m.patchSize * m.patchSize
+	numPatches := grid.Temporal * grid.Height * grid.Width
+	pixelValues := ctx.Input().FromFloats(f32s, patchDim, numPatches)
+
+	// Forward through vision encoder
+	visionOutputs := m.VisionModel.Forward(ctx, pixelValues, grid)
+
+	// Forward through downsample (patch merger)
+	if m.VisionDownsample == nil || m.VisionDownsample.Weight == nil {
+		return nil, errors.New("glmocr: missing vision downsample weights")
+	}
+	visionOutputs = m.VisionDownsample.Forward(ctx, visionOutputs, grid, m.VisionModel.VisionModelOptions)
+
+	// Forward through patch merger (FC + LayerNorm + GELU + SwiGLU FFN)
+	if m.PatchMerger == nil {
+		return nil, errors.New("glmocr: missing patch merger weights")
+	}
+	visionOutputs = m.PatchMerger.Forward(ctx, visionOutputs, m.VisionModel.VisionModelOptions)
+
+	return []input.Multimodal{{Tensor: visionOutputs, Data: grid}}, nil
+}
+
+func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
+	var result []*input.Input
+
+	// Reset position cache
+	m.TextModel.positionCache = m.TextModel.positionCache[:0]
+	m.TextModel.ropeDelta = 0
+
+	pos := int32(0)
+	for _, inp := range inputs {
+		if inp.Multimodal == nil {
+			result = append(result, inp)
+			m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
+			pos++
+			continue
+		}
+
+		// Get grid info for position calculation
+		grid := inp.Multimodal[0].Data.(*Grid)
+		mergedH := grid.Height / m.VisionModel.spatialMergeSize
+		mergedW := grid.Width / m.VisionModel.spatialMergeSize
+
+		// Add image start token
+		result = append(result, &input.Input{Token: m.imageStartTokenID})
+		m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
+		pos++
+
+		// Add image tokens with multimodal data
+		// All image tokens share the same base position for temporal dimension
+		tokensPerGrid := inp.Multimodal[0].Tensor.Dim(1)
+		basePos := pos
+		sameBatch := tokensPerGrid - 1
+		if sameBatch < 0 {
+			sameBatch = 0
+		}
+		result = append(result, &input.Input{
+			Token:          m.imageTokenID,
+			Multimodal:     inp.Multimodal,
+			MultimodalHash: inp.MultimodalHash,
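+			// SameBatch asks the runner to schedule the next tokensPerGrid-1
+			// placeholder tokens in the same batch as this token, so the
+			// vision embedding is injected in a single forward pass.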
+			SameBatch:      sameBatch,
+		})
+		m.TextModel.positionCache = append(m.TextModel.positionCache, basePos)
+
+		// Add placeholder tokens for remaining positions
+		// All image tokens use the same base position (temporal stays constant)
+		for range tokensPerGrid - 1 {
+			result = append(result, &input.Input{Token: m.imageTokenID})
+			m.TextModel.positionCache = append(m.TextModel.positionCache, basePos)
+		}
+
+		// Advance position by max(mergedH, mergedW) after image tokens
+		pos = basePos + int32(max(mergedH, mergedW))
+
+		// Add image end token
+		result = append(result, &input.Input{Token: m.imageEndTokenID})
+		m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
+		pos++
+	}
+
+	// Compute rope delta for continuation after the prefill segment:
+	// delta = (max_position_id + 1) - sequence_length
+	if len(m.TextModel.positionCache) > 0 {
+		last := m.TextModel.positionCache[len(m.TextModel.positionCache)-1]
+		m.TextModel.ropeDelta = last + 1 - int32(len(m.TextModel.positionCache))
+	}
+
+	return result, nil
+}
+
+func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
+	// Initial token embedding
+	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)
+	ctx.Forward(hiddenStates)
+
+	// Build position slices for M-RoPE
+	positionSlice := func() [][]int32 {
+		s := [][]int32{
+			make([]int32, len(batch.Positions)), // temporal
+			make([]int32, len(batch.Positions)), // height
+			make([]int32, len(batch.Positions)), // width
+			make([]int32, len(batch.Positions)), // unused (zeros)
+		}
+		for i, position := range batch.Positions {
+			// Translate through position cache or continue sequence
+			if position < int32(len(m.TextModel.positionCache)) {
+				position = m.TextModel.positionCache[position]
+			} else if len(m.TextModel.positionCache) > 0 {
+				// Continue sequence after cached positions using ropeDelta
+				position = position + m.TextModel.ropeDelta
+			}
+
+			s[0][i] = position
+			s[1][i] = position
+			s[2][i] = position
+		}
+		return s
+	}()
+
+	// Inject vision embeddings and adjust positions for image tokens
+	for _, mi := range batch.Multimodal {
+		img := mi.Multimodal[0].Tensor
+		ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
+
+		if grid, ok := mi.Multimodal[0].Data.(*Grid); ok {
+			w := grid.Width / m.VisionModel.spatialMergeSize
+			for i := range img.Dim(1) {
+				positionSlice[1][mi.Index+i] += int32(i / w)
+				positionSlice[2][mi.Index+i] += int32(i % w)
+			}
+		}
+	}
+
+	positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice))
+
+	// Process through transformer layers
+	for i, layer := range m.TextModel.Layers {
+		m.Cache.SetLayer(i)
+
+		var lastLayerOutputs ml.Tensor
+		if i == len(m.TextModel.Layers)-1 {
+			lastLayerOutputs = batch.Outputs
+		}
+
+		hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, m.Cache, m.TextModel.TextModelOptions)
+	}
+
+	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.TextModel.eps)
+	return m.Output.Forward(ctx, hiddenStates), nil
+}
+
+func init() {
+	model.Register("glmocr", New)
+}
diff --git a/model/models/glmocr/model_text.go b/model/models/glmocr/model_text.go
new file mode 100644
index 000000000..ec9cd7301
--- /dev/null
+++ b/model/models/glmocr/model_text.go
@@ -0,0 +1,190 @@
+package glmocr
+
+import (
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/rope"
+)
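+
+// M-RoPE splits the rotary dimensions into per-axis sections. With sections
+// [8, 12, 12], the first 8 rotary pairs rotate by the temporal position, the
+// next 12 by height, and the last 12 by width; for plain text all three axes
+// carry the same position, so it degenerates to standard RoPE.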
"github.com/ollama/ollama/ml/nn/rope" +) + +type TextModelOptions struct { + hiddenSize int + numHeads int + numKVHeads int + headDim int + rotaryDim int + intermediateSize int + eps float32 + ropeBase float32 + mropeSections []int +} + +func (o *TextModelOptions) applyMRoPE(ctx ml.Context, states, positions ml.Tensor) ml.Tensor { + // With 4 sections for [temporal, height, width, unused] + return nn.RoPE(ctx, states, positions, o.rotaryDim, o.ropeBase, 1.0, rope.WithMRoPE(o.mropeSections)) +} + +type TextSelfAttention struct { + Query *nn.Linear `gguf:"attn_q"` + Key *nn.Linear `gguf:"attn_k"` + Value *nn.Linear `gguf:"attn_v"` + Output *nn.Linear `gguf:"attn_out"` +} + +func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *TextModelOptions) ml.Tensor { + batchSize := hiddenStates.Dim(1) + + // Separate Q, K, V projections + q := sa.Query.Forward(ctx, hiddenStates) + k := sa.Key.Forward(ctx, hiddenStates) + v := sa.Value.Forward(ctx, hiddenStates) + + // Reshape for GQA + q = q.Reshape(ctx, opts.headDim, opts.numHeads, batchSize) + k = k.Reshape(ctx, opts.headDim, opts.numKVHeads, batchSize) + v = v.Reshape(ctx, opts.headDim, opts.numKVHeads, batchSize) + + // Apply M-RoPE (multi-resolution rotary position embeddings) + q = opts.applyMRoPE(ctx, q, positions) + k = opts.applyMRoPE(ctx, k, positions) + + // Scaled dot-product attention with KV cache + scaleFactor := 1.0 / math.Sqrt(float64(opts.headDim)) + kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache) + // Reshape attention output: [headDim, numHeads, batchSize] -> [numHeads*headDim, batchSize] + // Note: numHeads * headDim = 16 * 128 = 2048, which is the attention hidden size + kqv = kqv.Reshape(ctx, opts.numHeads*opts.headDim, batchSize) + + return sa.Output.Forward(ctx, kqv) +} + +type TextMLP struct { + Gate *nn.Linear `gguf:"ffn_gate"` + Up *nn.Linear `gguf:"ffn_up"` + Down *nn.Linear `gguf:"ffn_down"` +} + +func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextModelOptions) ml.Tensor { + // SwiGLU: down(silu(gate(x)) * up(x)) + gate := mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates)) + return mlp.Down.Forward(ctx, gate) +} + +type TextDecoderLayer struct { + // Input layernorm (before attention) + AttentionNorm *nn.RMSNorm `gguf:"attn_norm"` + SelfAttention *TextSelfAttention + // Post self-attention layernorm (after attention, before residual add) + PostAttnNorm *nn.RMSNorm `gguf:"post_attn_norm"` + + // FFN input layernorm (after first residual, before MLP) + FFNNorm *nn.RMSNorm `gguf:"ffn_norm"` + MLP *TextMLP + // Post MLP layernorm (after MLP, before residual add) + PostFFNNorm *nn.RMSNorm `gguf:"post_ffn_norm"` +} + +func (l *TextDecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *TextModelOptions) ml.Tensor { + // Attention block + residual := hiddenStates + hiddenStates = l.AttentionNorm.Forward(ctx, hiddenStates, opts.eps) + hiddenStates = l.SelfAttention.Forward(ctx, hiddenStates, positions, cache, opts) + hiddenStates = l.PostAttnNorm.Forward(ctx, hiddenStates, opts.eps) + + // Prune to output positions in final layer + if outputs != nil { + hiddenStates = hiddenStates.Rows(ctx, outputs) + residual = residual.Rows(ctx, outputs) + } + + hiddenStates = hiddenStates.Add(ctx, residual) + + // MLP block + residual = hiddenStates + hiddenStates = l.FFNNorm.Forward(ctx, hiddenStates, opts.eps) + hiddenStates = l.MLP.Forward(ctx, hiddenStates, opts) + 
+	hiddenStates = l.PostFFNNorm.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = hiddenStates.Add(ctx, residual)
+
+	return hiddenStates
+}
+
+type TextModel struct {
+	TokenEmbedding *nn.Embedding      `gguf:"token_embd"`
+	Layers         []TextDecoderLayer `gguf:"blk"`
+	OutputNorm     *nn.RMSNorm        `gguf:"output_norm"`
+	Output         *nn.Linear         `gguf:"output,alt:token_embd"`
+
+	*TextModelOptions
+
+	// positionCache stores the M-RoPE position for each token in the sequence.
+	// This is needed because image tokens share the same base position but have
+	// different height/width offsets, and the end token position depends on the
+	// image grid dimensions.
+	positionCache []int32
+	ropeDelta     int32
+}
+
+func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+	// Clear position cache when KV cache shifts
+	m.positionCache = nil
+	m.ropeDelta = 0
+	return m.applyMRoPE(ctx, key, shift), nil
+}
+
+func newTextModel(c fs.Config) *TextModel {
+	hiddenSize := int(c.Uint("embedding_length", 1536))
+	numHeads := int(c.Uint("attention.head_count", 16))
+	numKVHeads := int(c.Uint("attention.head_count_kv", 8))
+	intermediateSize := int(c.Uint("feed_forward_length", 4608))
+	eps := c.Float("attention.layer_norm_rms_epsilon", 1e-5)
+	ropeBase := c.Float("rope.freq_base", 10000)
+
+	headDim := int(c.Uint("attention.key_length", uint32(hiddenSize/numHeads)))
+	ropeDim := int(c.Uint("rope.dimension_count", uint32(headDim)))
+	if ropeDim <= 0 {
+		ropeDim = headDim
+	}
+
+	mropeSections := c.Ints("rope.mrope_section")
+	var sectionInts []int
+
+	if len(mropeSections) > 0 {
+		sectionInts = make([]int, len(mropeSections))
+		for i, section := range mropeSections {
+			sectionInts[i] = int(section)
+		}
+	} else {
+		// Default to GLM-OCR's HF ratio (2:3:3) scaled to ropeDim/2.
+		// For ropeDim=64 this yields [8, 12, 12].
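+		// Worked example, assuming headDim=128 with half the dims rotary:
+		// ropeDim=64, total=32, s0=32*2/8=8, s1=32*3/8=12, s2=32-8-12=12.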
+		total := ropeDim / 2
+		if total <= 0 {
+			total = 32
+		}
+		s0 := total * 2 / 8
+		s1 := total * 3 / 8
+		s2 := total - s0 - s1
+		sectionInts = []int{s0, s1, s2}
+	}
+
+	// GGML rope_multi: sector = (dim_pair) % sum(sections), mapping each pair to its position dim
+	rotaryDim := ropeDim
+
+	return &TextModel{
+		Layers: make([]TextDecoderLayer, c.Uint("block_count", 16)),
+		TextModelOptions: &TextModelOptions{
+			hiddenSize:       hiddenSize,
+			numHeads:         numHeads,
+			numKVHeads:       numKVHeads,
+			headDim:          headDim,
+			rotaryDim:        rotaryDim,
+			intermediateSize: intermediateSize,
+			eps:              eps,
+			ropeBase:         ropeBase,
+			mropeSections:    sectionInts,
+		},
+	}
+}
diff --git a/model/models/glmocr/model_vision.go b/model/models/glmocr/model_vision.go
new file mode 100644
index 000000000..6f8d19311
--- /dev/null
+++ b/model/models/glmocr/model_vision.go
@@ -0,0 +1,355 @@
+package glmocr
+
+import (
+	"log/slog"
+	"math"
+	"slices"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/rope"
+)
+
+type Grid struct {
+	Height      int // Number of patches in height direction
+	Width       int // Number of patches in width direction
+	Temporal    int
+	ImageHeight int // Full image height in pixels
+	ImageWidth  int // Full image width in pixels
+}
+
+type VisionModelOptions struct {
+	hiddenSize        int
+	numHeads          int
+	headDim           int
+	numChannels       int
+	patchSize         int
+	temporalPatchSize int
+	imageSize         int
+	spatialMergeSize  int
+	outHiddenSize     int
+	intermediateSize  int
+	eps               float32
+}
+
+type VisionPatchEmbed struct {
+	Proj  *nn.Conv2D `gguf:"patch_embd_0"`
+	Proj1 *nn.Conv2D `gguf:"patch_embd_1"`
+	Bias  ml.Tensor  `gguf:"patch_embd.bias"`
+}
+
+func (pe *VisionPatchEmbed) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid, opts *VisionModelOptions) ml.Tensor {
+	_ = grid // patches are already in merge-block order
+
+	// pixelValues shape: [patchDim, numPatches]
+	numPatches := pixelValues.Shape()[1]
+
+	// Reshape to [patchSize*patchSize, temporalPatchSize, numChannels, numPatches]
+	pixelValues = pixelValues.Reshape(ctx, opts.patchSize*opts.patchSize, opts.temporalPatchSize, opts.numChannels, numPatches)
+	// Permute to [temporalPatchSize, patchSize*patchSize, numChannels, numPatches]
+	pixelValues = pixelValues.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
+
+	// Slice temporal frames for Conv2D (simulate Conv3D)
+	in0 := pixelValues.View(ctx, 0, 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
+	in0 = in0.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
+
+	s0, s1 := opts.patchSize, opts.patchSize
+	p0, p1 := 0, 0
+	d0, d1 := 1, 1
+	hiddenStates := pe.Proj.Forward(ctx, in0, s0, s1, p0, p1, d0, d1)
+
+	if pe.Proj1 != nil && opts.temporalPatchSize > 1 {
+		in1 := pixelValues.View(ctx, pixelValues.Stride(0), 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
+		in1 = in1.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
+		out1 := pe.Proj1.Forward(ctx, in1, s0, s1, p0, p1, d0, d1)
+		hiddenStates = hiddenStates.Add(ctx, out1)
+	}
+
+	// Flatten to [hidden_size, num_patches]
+	hiddenStates = hiddenStates.Reshape(ctx, opts.hiddenSize, numPatches)
+
+	// Add patch bias - reshape from [hidden_size] to [hidden_size, 1] for broadcasting
+	if pe.Bias != nil {
+		hiddenStates = hiddenStates.Add(ctx, pe.Bias.Reshape(ctx, opts.hiddenSize, 1))
+	}
+
+	return hiddenStates
+}
+
+type VisionSelfAttention struct {
+	QKV    *nn.Linear  `gguf:"attn_qkv"`
+	QNorm  *nn.RMSNorm `gguf:"attn_q_norm"`
+	KNorm  *nn.RMSNorm `gguf:"attn_k_norm"`
+	Output *nn.Linear  `gguf:"attn_out"`
+}
+
+func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	batchSize := hiddenStates.Dim(1)
+
+	// Combined QKV projection: [3*hidden_size, batch_size]
+	qkv := sa.QKV.Forward(ctx, hiddenStates)
+
+	// Split using ChunkSections along dim 0 (handles byte offsets correctly)
+	// ChunkSections returns views - must make contiguous before further operations
+	chunks := qkv.ChunkSections(ctx, 0, opts.hiddenSize, opts.hiddenSize, opts.hiddenSize)
+	q := chunks[0].Contiguous(ctx)
+	k := chunks[1].Contiguous(ctx)
+	v := chunks[2].Contiguous(ctx)
+
+	// Reshape for multi-head attention: [hiddenSize, N] -> [headDim, numHeads, N]
+	q = q.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
+	k = k.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
+	v = v.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
+
+	// Apply Q-norm and K-norm after head reshape
+	// Weights are [headDim]=64, tensor is [headDim, numHeads, N]
+	q = sa.QNorm.Forward(ctx, q, opts.eps)
+	k = sa.KNorm.Forward(ctx, k, opts.eps)
+
+	// Apply rotary position embeddings with vision-style 2D positions.
+	// ggml's vision RoPE uses two position dimensions (H/W) with half-rotation pairs.
+	// We provide H/W sections and leave the remaining sections empty.
+	ropeFreqBase := float32(10000.0)
+	section := opts.headDim / 4
+	if section <= 0 {
+		section = 1
+	}
+	sections := []int{section, section, 0, 0}
+	q = nn.RoPE(ctx, q, positions, opts.headDim/2, ropeFreqBase, 1.0, rope.WithVision(sections))
+	k = nn.RoPE(ctx, k, positions, opts.headDim/2, ropeFreqBase, 1.0, rope.WithVision(sections))
+
+	// Scale factor for scaled dot-product attention
+	scale := 1.0 / math.Sqrt(float64(opts.headDim))
+
+	// Try flash attention first (ScaledDotProductAttention), fall back to manual
+	if sdpa, ok := q.(ml.ScaledDotProductAttention); ok {
+		attention := sdpa.ScaledDotProductAttention(ctx, k, v, nil, nil, nil, scale, false)
+		attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
+		return sa.Output.Forward(ctx, attention)
+	}
+
+	slog.Warn("glmocr: vision attention falling back to manual attention",
+		"batchSize", batchSize, "numHeads", opts.numHeads,
+		"hint", "set OLLAMA_FLASH_ATTENTION=1 to enable flash attention")
+
+	// Manual attention fallback
+	// q, k, v are [headDim, numHeads, batchSize] - GGML treats as 4D with implicit dim 3 = 1
+	q = q.Permute(ctx, 0, 2, 1, 3)
+	k = k.Permute(ctx, 0, 2, 1, 3)
+	v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
+
+	// Attention scores
+	kq := k.MulmatFullPrec(ctx, q)
+	kq = kq.Scale(ctx, scale)
+	kq = kq.Softmax(ctx)
+
+	// Attention output: v @ kq (note: v first)
+	kqv := v.Mulmat(ctx, kq)
+	attention := kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
+
+	return sa.Output.Forward(ctx, attention)
+}
+
+type VisionMLP struct {
+	Gate *nn.Linear `gguf:"ffn_gate"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+}
+
+func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
+	// SwiGLU: down(silu(gate(x)) * up(x))
+	gate := mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
+	return mlp.Down.Forward(ctx, gate)
+}
+
+type VisionBlock struct {
+	Norm1         *nn.RMSNorm `gguf:"ln1"`
+	SelfAttention *VisionSelfAttention
+	Norm2         *nn.RMSNorm `gguf:"ln2"`
+	MLP           *VisionMLP
+}
+
+func (b *VisionBlock) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	// Pre-norm architecture
+	residual := hiddenStates
+	hiddenStates = b.Norm1.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = b.SelfAttention.Forward(ctx, hiddenStates, positions, opts)
+	hiddenStates = hiddenStates.Add(ctx, residual)
+
+	residual = hiddenStates
+	hiddenStates = b.Norm2.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = b.MLP.Forward(ctx, hiddenStates)
+	hiddenStates = hiddenStates.Add(ctx, residual)
+
+	return hiddenStates
+}
+
+type VisionDownsample struct {
+	*nn.Conv2D
+}
+
+func (d *VisionDownsample) Forward(ctx ml.Context, hiddenStates ml.Tensor, grid *Grid, opts *VisionModelOptions) ml.Tensor {
+	// Apply spatial downsampling via Conv2D
+	// Input: [hidden_size, num_patches] where patches are in merge-block order
+
+	if d.Conv2D == nil || d.Weight == nil {
+		slog.Error("VisionDownsample weights not loaded - model may be corrupted or incompatible")
+		return hiddenStates // Return input unchanged as fallback
+	}
+
+	merge := opts.spatialMergeSize
+	numOutputTokens := (grid.Height / merge) * (grid.Width / merge)
+
+	// Step 1: Reshape to [hidden_size, merge, merge, num_output_tokens]
+	hiddenStates = hiddenStates.Reshape(ctx, opts.hiddenSize, merge, merge, numOutputTokens)
+
+	// Step 2: Permute to [merge, merge, hidden_size, num_output_tokens]
+	// ggml semantics: result.ne[perm[i]] = input.ne[i]
+	// So permute(2,0,1,3) on [1024,2,2,N] gives: ne[2]=1024, ne[0]=2, ne[1]=2, ne[3]=N -> [2,2,1024,N]
+	hiddenStates = hiddenStates.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
+
+	// Step 3: Apply Conv2D without bias (bias added after reshape)
+	// Note: ggml_conv_2d takes (kernel, input) - kernel must be receiver in ollama
+	s0, s1 := merge, merge
+	p0, p1 := 0, 0
+	d0, d1 := 1, 1
+	hiddenStates = d.Weight.Conv2D(ctx, hiddenStates, s0, s1, p0, p1, d0, d1)
+
+	// Step 4: Reshape to [out_hidden_size, num_output_tokens]
+	hiddenStates = hiddenStates.Reshape(ctx, opts.outHiddenSize, numOutputTokens)
+
+	// Step 5: Add bias after reshape
+	// Reshape bias from [out_hidden_size] to [out_hidden_size, 1] for proper broadcasting
+	if d.Bias != nil {
+		hiddenStates = hiddenStates.Add(ctx, d.Bias.Reshape(ctx, opts.outHiddenSize, 1))
+	}
+
+	return hiddenStates
+}
+
+type PatchMerger struct {
+	// GGUF tags align with mm.* keys used by the model
+	Proj     *nn.Linear    `gguf:"model.fc"`  // mm.model.fc.weight
+	PostLN   *nn.LayerNorm `gguf:"post_norm"` // mm.post_norm.weight/bias
+	GateProj *nn.Linear    `gguf:"gate"`      // mm.gate.weight
+	UpProj   *nn.Linear    `gguf:"up"`        // mm.up.weight
+	DownProj *nn.Linear    `gguf:"down"`      // mm.down.weight
+}
+
+func (m *PatchMerger) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	// Linear projection
+	hiddenStates = m.Proj.Forward(ctx, hiddenStates)
+
+	// Post-projection layer norm + GELU ERF
+	hiddenStates = m.PostLN.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = hiddenStates.GELU_ERF(ctx)
+	// Force a copy to avoid in-place mutation issues with GELU_ERF
+	hiddenStates = hiddenStates.Contiguous(ctx)
+
+	// SwiGLU MLP: down(silu(gate(x)) * up(x))
+	gateOut := m.GateProj.Forward(ctx, hiddenStates)
+	upOut := m.UpProj.Forward(ctx, hiddenStates)
+	gate := gateOut.SILU(ctx, upOut)
+	return m.DownProj.Forward(ctx, gate)
+}
+
+type VisionModel struct {
+	PatchEmbed *VisionPatchEmbed
+	Blocks     []VisionBlock `gguf:"blk"`
+	PostLN     *nn.RMSNorm   `gguf:"post_ln"`
+	// Note: Downsample is applied at the model level so mm.patch_merger stays separate
+
+	*VisionModelOptions
+}
+
+func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor {
+	// Extract patch embeddings from flattened patches
+	hiddenStates := m.PatchEmbed.Forward(ctx, pixelValues, grid, m.VisionModelOptions)
+
+	// Create position IDs for RoPE (spatial grid)
+	// Patches are already in merge-block order from preprocessing
+	positions := m.createPositions(ctx, grid)
+
+	// Process through vision blocks
+	for _, block := range m.Blocks {
+		hiddenStates = block.Forward(ctx, hiddenStates, positions, m.VisionModelOptions)
+	}
+
+	// Post-layernorm
+	hiddenStates = m.PostLN.Forward(ctx, hiddenStates, m.eps)
+
+	// Note: Downsample is now applied separately in Model.EncodeMultimodal
+	// so mm.patch_merger remains a distinct module
+
+	return hiddenStates
+}
+
+func (m *VisionModel) createPositions(ctx ml.Context, grid *Grid) ml.Tensor {
+	// Create spatial position IDs for vision RoPE
+	// Position layout: [height, width, unused, unused] - 4 sections for mrope
+	// Patches are in MERGE-BLOCK order after VisionPatchEmbed interleaving
+	// This follows the GLM-OCR rot_pos_emb layout
+	numPatches := grid.Height * grid.Width
+	mergeRatio := m.spatialMergeSize
+
+	// Build position arrays in merge-block order
+	// Each merge_ratio x merge_ratio block of patches is grouped together
+	hpos := make([]int32, numPatches)
+	wpos := make([]int32, numPatches)
+	ptr := 0
+	for y := 0; y < grid.Height; y += mergeRatio {
+		for x := 0; x < grid.Width; x += mergeRatio {
+			for dy := range mergeRatio {
+				for dx := range mergeRatio {
+					hpos[ptr] = int32(y + dy)
+					wpos[ptr] = int32(x + dx)
+					ptr++
+				}
+			}
+		}
+	}
+
+	// Build position arrays for 4 sections (mrope). ggml vision RoPE uses only H/W;
+	// keep remaining sections zeroed to match its conventions.
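+	// For a 4x4 grid with mergeRatio=2 the loop above produces
+	// hpos = [0,0,1,1, 0,0,1,1, 2,2,3,3, 2,2,3,3] and
+	// wpos = [0,1,0,1, 2,3,2,3, 0,1,0,1, 2,3,2,3].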
+	zeros := make([]int32, numPatches)
+	s := [][]int32{
+		hpos,  // Section 0: height
+		wpos,  // Section 1: width
+		zeros, // Section 2: unused
+		zeros, // Section 3: unused
+	}
+
+	return ctx.Input().FromInts(slices.Concat(s...), numPatches*4)
+}
+
+func newVisionModel(c fs.Config) *VisionModel {
+	hiddenSize := int(c.Uint("vision.embedding_length", 1024))
+	numHeads := int(c.Uint("vision.attention.head_count", 16))
+	numChannels := int(c.Uint("vision.num_channels", 3))
+	patchSize := int(c.Uint("vision.patch_size", 14))
+	temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
+	imageSize := int(c.Uint("vision.image_size", 336))
+	spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
+	outHiddenSize := int(c.Uint("vision.out_hidden_size", 1536))
+	intermediateSize := int(c.Uint("vision.intermediate_size", 4096))
+	eps := c.Float("vision.attention.layer_norm_rms_epsilon", 1e-5)
+
+	return &VisionModel{
+		Blocks: make([]VisionBlock, c.Uint("vision.block_count", 24)),
+		VisionModelOptions: &VisionModelOptions{
+			hiddenSize:        hiddenSize,
+			numHeads:          numHeads,
+			headDim:           hiddenSize / numHeads,
+			numChannels:       numChannels,
+			patchSize:         patchSize,
+			temporalPatchSize: temporalPatchSize,
+			imageSize:         imageSize,
+			spatialMergeSize:  spatialMergeSize,
+			outHiddenSize:     outHiddenSize,
+			intermediateSize:  intermediateSize,
+			eps:               eps,
+		},
+	}
+}
diff --git a/model/models/models.go b/model/models/models.go
index bf5daea7b..4818518c9 100644
--- a/model/models/models.go
+++ b/model/models/models.go
@@ -8,6 +8,7 @@ import (
 	_ "github.com/ollama/ollama/model/models/gemma3"
 	_ "github.com/ollama/ollama/model/models/gemma3n"
 	_ "github.com/ollama/ollama/model/models/glm4moelite"
+	_ "github.com/ollama/ollama/model/models/glmocr"
 	_ "github.com/ollama/ollama/model/models/gptoss"
 	_ "github.com/ollama/ollama/model/models/lfm2"
 	_ "github.com/ollama/ollama/model/models/llama"
diff --git a/model/parsers/glmocr.go b/model/parsers/glmocr.go
new file mode 100644
index 000000000..671ba939c
--- /dev/null
+++ b/model/parsers/glmocr.go
@@ -0,0 +1,17 @@
+package parsers
+
+import "github.com/ollama/ollama/api"
+
+// GlmOcrParser is the GLM46 parser with thinking disabled.
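+// It reuses the GLM46 tool-call parsing but reports HasThinkingSupport=false,
+// so GLM-OCR output is treated as plain content rather than thinking blocks.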
+type GlmOcrParser struct {
+	GLM46Parser
+}
+
+func (p *GlmOcrParser) HasThinkingSupport() bool {
+	return false
+}
+
+func (p *GlmOcrParser) Init(tools []api.Tool, _ *api.Message, _ *api.ThinkValue) []api.Tool {
+	p.tools = tools
+	return tools
+}
diff --git a/model/parsers/parsers.go b/model/parsers/parsers.go
index 7e5ad4114..2b471d1da 100644
--- a/model/parsers/parsers.go
+++ b/model/parsers/parsers.go
@@ -71,6 +71,8 @@ func ParserForName(name string) Parser {
 		return &FunctionGemmaParser{}
 	case "glm-4.7":
 		return &GLM47Parser{}
+	case "glm-ocr":
+		return &GlmOcrParser{}
 	case "lfm2":
 		return &LFM2Parser{hasThinkingSupport: false}
 	case "lfm2-thinking":
diff --git a/model/renderers/glmocr.go b/model/renderers/glmocr.go
new file mode 100644
index 000000000..b141da07d
--- /dev/null
+++ b/model/renderers/glmocr.go
@@ -0,0 +1,109 @@
+package renderers
+
+import (
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"github.com/ollama/ollama/api"
+)
+
+type GlmOcrRenderer struct{}
+
+func (r *GlmOcrRenderer) Render(messages []api.Message, tools []api.Tool, thinkValue *api.ThinkValue) (string, error) {
+	var sb strings.Builder
+
+	sb.WriteString("[gMASK]<sop>")
+
+	if len(tools) > 0 {
+		sb.WriteString("<|system|>\n")
+		sb.WriteString("# Tools\n\n")
+		sb.WriteString("You may call one or more functions to assist with the user query.\n\n")
+		sb.WriteString("You are provided with function signatures within <tools></tools> XML tags:\n")
+		sb.WriteString("<tools>\n")
+		for _, tool := range tools {
+			d, _ := json.Marshal(tool)
+			sb.WriteString(formatGLM47ToolJSON(d))
+			sb.WriteString("\n")
+		}
+		sb.WriteString("</tools>\n\n")
+		sb.WriteString("For each function call, output the function name and arguments within the following XML format:\n")
+		sb.WriteString("<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call>")
+	}
+
+	enableThinking := false
+	thinkingExplicitlySet := false
+	if thinkValue != nil {
+		enableThinking = thinkValue.Bool()
+		thinkingExplicitlySet = true
+	}
+
+	for i, message := range messages {
+		switch message.Role {
+		case "user":
+			sb.WriteString("<|user|>\n")
+			sb.WriteString(message.Content)
+			if thinkingExplicitlySet && !enableThinking && !strings.HasSuffix(message.Content, "/nothink") {
+				sb.WriteString("/nothink")
+			}
+		case "assistant":
+			sb.WriteString("<|assistant|>\n")
+			if message.Thinking != "" {
+				sb.WriteString("<think>" + strings.TrimSpace(message.Thinking) + "</think>")
+			} else {
+				sb.WriteString("<think></think>")
+			}
+			if message.Content != "" {
+				sb.WriteString("\n" + strings.TrimSpace(message.Content))
+			}
+			if len(message.ToolCalls) > 0 {
+				for _, toolCall := range message.ToolCalls {
+					sb.WriteString("\n<tool_call>" + toolCall.Function.Name)
+					sb.WriteString(renderGlmOcrToolArguments(toolCall.Function.Arguments))
+					sb.WriteString("</tool_call>")
+				}
+			}
+			sb.WriteString("\n")
+		case "tool":
+			if i == 0 || messages[i-1].Role != "tool" {
+				sb.WriteString("<|observation|>")
+			}
+			sb.WriteString("\n<tool_response>\n")
+			sb.WriteString(message.Content)
+			sb.WriteString("\n</tool_response>\n")
+		case "system":
+			sb.WriteString("<|system|>\n")
+			sb.WriteString(message.Content)
+			sb.WriteString("\n")
+		}
+	}
+
+	sb.WriteString("<|assistant|>\n")
+	if thinkingExplicitlySet && !enableThinking {
+		sb.WriteString("<think></think>\n")
+	}
+
+	return sb.String(), nil
+}
+
+func renderGlmOcrToolArguments(args api.ToolCallFunctionArguments) string {
+	var sb strings.Builder
+	for key, value := range args.All() {
+		sb.WriteString("<arg_key>" + key + "</arg_key>")
+		var valueStr string
+		if str, ok := value.(string); ok {
+			valueStr = str
+		} else {
+			jsonBytes, err := json.Marshal(value)
+			if err != nil {
+				valueStr = fmt.Sprintf("%v", value)
+			} else {
+				valueStr = string(jsonBytes)
+			}
+		}
+
+		sb.WriteString("<arg_value>" + valueStr + "</arg_value>")
+	}
+
+	return sb.String()
+}
diff --git a/model/renderers/renderer.go b/model/renderers/renderer.go
index efb966aad..baa0bc8c4 100644
--- a/model/renderers/renderer.go
+++ b/model/renderers/renderer.go
@@ -82,6 +82,8 @@ func rendererForName(name string) Renderer {
 		return &FunctionGemmaRenderer{}
 	case "glm-4.7":
 		return &GLM47Renderer{}
+	case "glm-ocr":
+		return &GlmOcrRenderer{}
 	case "lfm2":
 		return &LFM2Renderer{IsThinking: false}
 	case "lfm2-thinking":