// tokenizer.go - BPE and SentencePiece tokenizer for HuggingFace models
//
// Based on standard BPE algorithm (Sennrich et al. 2015) with:
// - GPT-2 byte-level encoding (OpenAI tiktoken)
// - HuggingFace tokenizer.json pretokenizer patterns
// - SentencePiece ▁-style space handling

package tokenizer

import "regexp"

// TokenizerType identifies the tokenization algorithm
type TokenizerType int

const (
	TokenizerBPE           TokenizerType = iota // GPT-2 style byte-level BPE
	TokenizerSentencePiece                      // SentencePiece with ▁ for spaces
)

// Vocabulary holds the tokenizer vocabulary and merges
type Vocabulary struct {
	Values  []string
	Reverse map[string]int32
	Merges  map[string]int

	BOS    int32
	EOS    []int32 // Multiple EOS tokens supported (e.g., Gemma has <eos> and <end_of_turn>)
	PAD    int32   // Padding token (often <|endoftext|> or <pad>)
	AddBOS bool
	AddEOS bool

	// Precomputed byte token IDs for <0xNN> fallback (256 entries, -1 if not found)
	byteTokens [256]int32
}
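
// Illustrative sketch, not part of the original file: one plausible way the
// byteTokens fallback table could be populated from <0xNN> vocabulary entries.
// The helper name initByteTokens is hypothetical.
func initByteTokens(v *Vocabulary) {
	const hex = "0123456789ABCDEF"
	for b := 0; b < 256; b++ {
		v.byteTokens[b] = -1 // -1 marks bytes with no <0xNN> entry in the vocabulary
		name := "<0x" + string([]byte{hex[b>>4], hex[b&0x0F]}) + ">"
		if id, ok := v.Reverse[name]; ok {
			v.byteTokens[b] = id
		}
	}
}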

// Tokenizer handles BPE and SentencePiece tokenization
type Tokenizer struct {
	vocab               *Vocabulary
	pretokenizer        *regexp.Regexp
	specialTokens       map[string]int32 // Special tokens for direct lookup
	sortedSpecialTokens []string         // Special tokens sorted by length, longest first
	typ                 TokenizerType    // Algorithm type
}
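
// Illustrative sketch, not part of the original file: because
// sortedSpecialTokens is ordered longest first, scanning it in order means a
// shorter special token can never shadow a longer one that also matches at the
// same position. The helper name matchSpecialAt is hypothetical.
func (t *Tokenizer) matchSpecialAt(s string, pos int) (int32, int, bool) {
	for _, tok := range t.sortedSpecialTokens {
		if len(s)-pos >= len(tok) && s[pos:pos+len(tok)] == tok {
			return t.specialTokens[tok], len(tok), true
		}
	}
	return 0, 0, false
}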

// Precomputed GPT-2 byte-level encoding table
// Maps byte values to their encoded rune equivalents
// (a compact form of GPT-2's bytes_to_unicode: non-printable bytes are shifted
// to printable code points, printable bytes map to themselves)
var byteToRune [256]rune

func init() {
	for b := 0; b < 256; b++ {
		r := rune(b)
		switch {
		case r == 0x00ad: // soft hyphen -> U+0143
			r = 0x0143
		case r <= 0x0020: // control characters and space -> U+0100..U+0120 (space becomes 'Ġ')
			r = r + 0x0100
		case r >= 0x007f && r <= 0x00a0: // DEL through NBSP -> U+0121..U+0142
			r = r + 0x00a2
		}
		byteToRune[b] = r
	}
}
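
// Illustrative sketch, not part of the original file: applying byteToRune to
// raw UTF-8 bytes yields the GPT-2 byte-level text form. For example, a
// leading space (0x20) becomes 'Ġ' (U+0120) and a newline (0x0A) becomes 'Ċ'
// (U+010A), while printable ASCII maps to itself. The helper name
// byteLevelEncode is hypothetical.
func byteLevelEncode(s string) string {
	out := make([]rune, 0, len(s))
	for i := 0; i < len(s); i++ {
		out = append(out, byteToRune[s[i]])
	}
	return string(out)
}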

// VocabSize returns the vocabulary size
func (t *Tokenizer) VocabSize() int {
	return len(t.vocab.Values)
}

// BOS returns the beginning of sequence token ID
func (t *Tokenizer) BOS() int32 {
	return t.vocab.BOS
}

// AddBOS returns whether a BOS token should be prepended during encoding.
func (t *Tokenizer) AddBOS() bool {
	return t.vocab.AddBOS
}

// EOS returns the first end of sequence token ID (for backwards compatibility)
func (t *Tokenizer) EOS() int32 {
	if len(t.vocab.EOS) > 0 {
		return t.vocab.EOS[0]
	}
	return -1
}

// EOSTokens returns all end of sequence token IDs
func (t *Tokenizer) EOSTokens() []int32 {
	return t.vocab.EOS
}

// PAD returns the padding token ID, or -1 if not set
func (t *Tokenizer) PAD() int32 {
	return t.vocab.PAD
}

// IsEOS returns true if the token ID is an end of sequence token
func (t *Tokenizer) IsEOS(id int32) bool {
	for _, eos := range t.vocab.EOS {
		if id == eos {
			return true
		}
	}
	return false
}

// GetSpecialToken returns the token ID for a special token string
func (t *Tokenizer) GetSpecialToken(name string) (int32, bool) {
	id, ok := t.specialTokens[name]
	return id, ok
}