Revert "runner: add token history sampling parameters to ollama runner (#14537)" (#14776)

This reverts commit 86513cb697.
This commit is contained in:
Jeffrey Morgan
2026-03-10 21:07:52 -07:00
committed by GitHub
parent 464186e995
commit 54e05172a0
8 changed files with 15 additions and 193 deletions

View File

@@ -1063,7 +1063,7 @@ func DefaultOptions() Options {
 		TopP:             0.9,
 		TypicalP:         1.0,
 		RepeatLastN:      64,
-		RepeatPenalty:    1.0,
+		RepeatPenalty:    1.1,
 		PresencePenalty:  0.0,
 		FrequencyPenalty: 0.0,
 		Seed:             -1,

View File

@@ -152,9 +152,7 @@ PARAMETER <parameter> <parametervalue>
 | -------------- | ------------- | ---------- | -------------------- |
 | num_ctx | Sets the size of the context window used to generate the next token. (Default: 2048) | int | num_ctx 4096 |
 | repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 |
-| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.0) | float | repeat_penalty 1.0 |
-| presence_penalty | Penalizes tokens that have already appeared in the generated text to reduce repetition. (Default: 0.0) | float | presence_penalty 1.5 |
-| frequency_penalty | Penalizes tokens based on how often they have appeared in the generated text. (Default: 0.0) | float | frequency_penalty 1.0 |
+| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 |
 | temperature | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) | float | temperature 0.7 |
 | seed | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0) | int | seed 42 |
 | stop | Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return. Multiple stop patterns may be set by specifying multiple separate `stop` parameters in a modelfile. | string | stop "AI assistant:" |

View File

@@ -562,7 +562,6 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, err error) {
 			if errors.As(err, &reprocess) {
 				// Prepend these inputs to the sequence's inputs queue for reprocessing
 				seq.inputs = append(reprocess.Inputs, seq.inputs...)
-				seq.sampler.Reset()
 				// Skip this sequence but continue processing the rest
 				nextBatch.seqs[seqIdx] = nil // clear this sequence for this batch
 				err = nil
@@ -693,12 +692,6 @@ func (s *Server) computeBatch(activeBatch batchState) {
 		// (unless we take down the whole runner).
 		if len(seq.pendingInputs) > 0 {
 			seq.cache.Inputs = append(seq.cache.Inputs, seq.pendingInputs...)
-			for _, inp := range seq.pendingInputs {
-				if len(inp.Multimodal) != 0 {
-					continue
-				}
-				seq.sampler.Accept(inp.Token)
-			}
 			seq.pendingInputs = []*input.Input{}
 		}
@@ -899,9 +892,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		req.Options.TopK,
 		req.Options.TopP,
 		req.Options.MinP,
-		req.Options.RepeatPenalty,
-		req.Options.PresencePenalty,
-		req.Options.FrequencyPenalty,
 		req.Options.Seed,
 		grammar,
 	)
@@ -948,14 +938,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			return
 		}
-		seq.sampler.Reset()
-		for _, inp := range seq.cache.Inputs {
-			if len(inp.Multimodal) != 0 {
-				continue
-			}
-			seq.sampler.Accept(inp.Token)
-		}
 		s.seqs[i] = seq
 		s.cond.Signal()
 		found = true

View File

@@ -16,49 +16,24 @@ type token struct {
 	value float32 // The raw logit or probability from the model
 }
-const DefaultPenaltyLookback = 64
 type Sampler struct {
 	rng         *rand.Rand
 	topK        int
 	topP        float32
 	minP        float32
 	temperature float32
-	repeat      float32
-	presence    float32
-	frequency   float32
-	history     []int32
 	grammar     *GrammarSampler
 }
-func (s *Sampler) Reset() {
-	s.history = s.history[:0]
-}
-func (s *Sampler) Accept(token int32) {
-	s.history = append(s.history, token)
-	if len(s.history) > DefaultPenaltyLookback {
-		copy(s.history, s.history[len(s.history)-DefaultPenaltyLookback:])
-		s.history = s.history[:DefaultPenaltyLookback]
-	}
-}
 func (s *Sampler) Sample(logits []float32) (int32, error) {
 	if len(logits) == 0 {
 		return -1, errors.New("sample: no logits provided to sample")
 	}
-	counts := tokenCounts(s.history, len(logits))
 	tokens := make([]token, len(logits))
 	for i := range logits {
-		value := logits[i]
-		if count := counts[int32(i)]; count > 0 {
-			value = applyPenalty(value, count, s.repeat, s.presence, s.frequency)
-		}
 		tokens[i].id = int32(i)
-		tokens[i].value = value
+		tokens[i].value = logits[i]
 	}
 	t, err := s.sample(tokens)
t, err := s.sample(tokens) t, err := s.sample(tokens)
@@ -80,12 +55,8 @@ func (s *Sampler) Sample(logits []float32) (int32, error) {
 	// we need to reset them before applying the grammar and
 	// sampling again
 	for i := range logits {
-		value := logits[i]
-		if count := counts[int32(i)]; count > 0 {
-			value = applyPenalty(value, count, s.repeat, s.presence, s.frequency)
-		}
 		tokens[i].id = int32(i)
-		tokens[i].value = value
+		tokens[i].value = logits[i]
 	}
 	s.grammar.Apply(tokens)
 	t, err = s.sample(tokens)
@@ -156,7 +127,7 @@ func (s *Sampler) sample(tokens []token) (token, error) {
 }
 // TODO(parthsareen): update sampler interface to use json unmarshal https://github.com/ollama/ollama/issues/9278
-func NewSampler(temperature float32, topK int, topP float32, minP float32, repeatPenalty float32, presencePenalty float32, frequencyPenalty float32, seed int, grammar *GrammarSampler) Sampler {
+func NewSampler(temperature float32, topK int, topP float32, minP float32, seed int, grammar *GrammarSampler) Sampler {
 	var rng *rand.Rand
 	if seed != -1 {
 		// PCG requires two parameters: sequence and stream
@@ -183,19 +154,12 @@ func NewSampler(temperature float32, topK int, topP float32, minP float32, repea
 		minP = 1.0
 	}
-	if repeatPenalty <= 0 {
-		repeatPenalty = 1.0
-	}
 	return Sampler{
 		rng:         rng,
 		topK:        topK,
 		topP:        topP,
 		minP:        minP,
 		temperature: temperature,
-		repeat:      repeatPenalty,
-		presence:    presencePenalty,
-		frequency:   frequencyPenalty,
 		grammar:     grammar,
 	}
 }

View File

@@ -16,7 +16,7 @@ func BenchmarkWeightedSampler(b *testing.B) {
 		logits[i] = float32(rand.Float64()*10 - 5)
 	}
-	sampler := NewSampler(0.8, 0, 0, 0, 1, 0, 0, 42, nil)
+	sampler := NewSampler(0.8, 0, 0, 0, 42, nil)
 	b.ResetTimer()
 	for b.Loop() {
 		sampler.Sample(logits)
@@ -49,7 +49,7 @@ func BenchmarkWeightedSampler(b *testing.B) {
 	for _, tc := range configs {
 		b.Run("Config"+tc.name, func(b *testing.B) {
-			sampler := NewSampler(tc.temperature, tc.topK, tc.topP, tc.minP, 1, 0, 0, tc.seed, nil)
+			sampler := NewSampler(tc.temperature, tc.topK, tc.topP, tc.minP, tc.seed, nil)
 			sampler.Sample(logits)
 			b.ResetTimer()
@@ -62,7 +62,7 @@ func BenchmarkWeightedSampler(b *testing.B) {
// Test with combined transforms separately - topK influences performance greatly // Test with combined transforms separately - topK influences performance greatly
b.Run("TransformCombined", func(b *testing.B) { b.Run("TransformCombined", func(b *testing.B) {
sampler := NewSampler(0.8, 50, 0.9, 0.05, 1, 0, 0, 42, nil) sampler := NewSampler(0.8, 50, 0.9, 0.05, 42, nil)
b.ResetTimer() b.ResetTimer()
for b.Loop() { for b.Loop() {
@@ -81,7 +81,7 @@ func BenchmarkGreedySampler(b *testing.B) {
 		logits[i] = float32(rand.Float64()*10 - 5)
 	}
-	sampler := NewSampler(0, -1, 0, 0, 1, 0, 0, -1, nil)
+	sampler := NewSampler(0, -1, 0, 0, -1, nil)
 	b.ResetTimer()
 	for b.Loop() {

View File

@@ -13,7 +13,7 @@ import (
 func TestWeighted(t *testing.T) {
 	logits := []float32{-10, 3, -10, -10}
-	sampler := NewSampler(0, 0, 0, 0, 1, 0, 0, 0, nil)
+	sampler := NewSampler(0, 0, 0, 0, 0, nil)
 	got, err := sampler.Sample(logits)
 	if err != nil {
 		t.Error(err)
@@ -25,7 +25,7 @@ func TestWeighted(t *testing.T) {
 	}
 	logits = []float32{-100, -10, 0, 10}
-	sampler = NewSampler(0, 0, 0, 0, 1, 0, 0, 0, nil)
+	sampler = NewSampler(0, 0, 0, 0, 0, nil)
 	got, err = sampler.Sample(logits)
 	if err != nil {
 		t.Error(err)
@@ -39,7 +39,7 @@ func TestWeighted(t *testing.T) {
 	// Test very high p
 	logits = []float32{1.0, 0.9999999999999999, 0.5, 0.1}
 	// Use extremely small topP to filter out all tokens
-	sampler = NewSampler(1.0, 0, 1e-10, 0, 1, 0, 0, 0, nil)
+	sampler = NewSampler(1.0, 0, 1e-10, 0, 0, nil)
 	got, err = sampler.Sample(logits)
 	if err != nil {
 		t.Error(err)
@@ -52,7 +52,7 @@ func TestWeighted(t *testing.T) {
 	}
 	logits = []float32{float32(math.NaN()), float32(math.NaN()), float32(math.NaN())}
-	sampler = NewSampler(1, 0, 0.95, 0.05, 1, 0, 0, 0, nil)
+	sampler = NewSampler(1, 0, 0.95, 0.05, 0, nil)
 	got, err = sampler.Sample(logits)
 	if err == nil {
 		t.Errorf("expected error, got %d", got)
@@ -151,8 +151,8 @@ func TestGrammar(t *testing.T) {
 func BenchmarkSample(b *testing.B) {
 	samplers := map[string]Sampler{
-		"Greedy":   NewSampler(0, 0, 0, 0, 1, 0, 0, 0, nil), // Use NewSampler with temp=0 for greedy
-		"Weighted": NewSampler(0.5, 10, 0.9, 0.2, 1, 0, 0, -1, nil),
+		"Greedy":   NewSampler(0, 0, 0, 0, 0, nil), // Use NewSampler with temp=0 for greedy
+		"Weighted": NewSampler(0.5, 10, 0.9, 0.2, -1, nil),
 	}
 	// Generate random logits for benchmarking
// Generate random logits for benchmarking // Generate random logits for benchmarking

View File

@@ -25,48 +25,6 @@ func (h *tokenHeap) Pop() any {
 	return x
 }
-func tokenCounts(history []int32, vocabSize int) map[int32]int {
-	if len(history) == 0 {
-		return nil
-	}
-	start := 0
-	if len(history) > DefaultPenaltyLookback {
-		start = len(history) - DefaultPenaltyLookback
-	}
-	counts := make(map[int32]int, len(history)-start)
-	for _, token := range history[start:] {
-		if token < 0 || int(token) >= vocabSize {
-			continue
-		}
-		counts[token]++
-	}
-	return counts
-}
-func applyPenalty(logit float32, count int, repeatPenalty float32, presencePenalty float32, frequencyPenalty float32) float32 {
-	if repeatPenalty != 1.0 {
-		// Preserve ordering for negative logits when applying repeat penalty.
-		if logit < 0 {
-			logit *= repeatPenalty
-		} else {
-			logit /= repeatPenalty
-		}
-	}
-	if frequencyPenalty != 0 {
-		logit -= float32(count) * frequencyPenalty
-	}
-	if presencePenalty != 0 {
-		logit -= presencePenalty
-	}
-	return logit
-}
 // temperature applies scaling to the logits
 func temperature(ts []token, temp float32) {
 	// Ensure temperature clipping near 0 to avoid numerical instability

View File

@@ -295,86 +295,6 @@ func TestMinP(t *testing.T) {
 	}
 }
-func TestTokenCounts(t *testing.T) {
-	history := make([]int32, 70)
-	history[0] = 7
-	history[69] = 7
-	counts := tokenCounts(history, 8)
-	if got := counts[7]; got != 1 {
-		t.Fatalf("lookback mismatch: got %d want %d", got, 1)
-	}
-}
-func TestApplyPenalty(t *testing.T) {
-	logit := applyPenalty(5.0, 3, 1.0, 1.5, 0.5)
-	if math.Abs(float64(logit-2.0)) > 1e-6 {
-		t.Fatalf("unexpected penalty result: got %f want %f", logit, 2.0)
-	}
-	logit = applyPenalty(4.0, 1, 2.0, 0, 0)
-	if math.Abs(float64(logit-2.0)) > 1e-6 {
-		t.Fatalf("unexpected repeat penalty result for positive logits: got %f want %f", logit, 2.0)
-	}
-	logit = applyPenalty(-4.0, 1, 2.0, 0, 0)
-	if math.Abs(float64(logit-(-8.0))) > 1e-6 {
-		t.Fatalf("unexpected repeat penalty result for negative logits: got %f want %f", logit, -8.0)
-	}
-}
-func TestSamplerPresencePenalty(t *testing.T) {
-	logits := []float32{0.0, 5.0, 0.0}
-	baseline := NewSampler(0, 0, 1, 0, 1, 0, 0, -1, nil)
-	baseline.Accept(1)
-	got, err := baseline.Sample(logits)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if got != 1 {
-		t.Fatalf("unexpected baseline token: got %d want %d", got, 1)
-	}
-	presence := NewSampler(0, 0, 1, 0, 1, 6, 0, -1, nil)
-	presence.Accept(1)
-	got, err = presence.Sample(logits)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if got == 1 {
-		t.Fatalf("presence penalty did not change repeated token selection")
-	}
-}
-func TestSamplerFrequencyPenalty(t *testing.T) {
-	logits := []float32{0.0, 5.0, 4.0}
-	baseline := NewSampler(0, 0, 1, 0, 1, 0, 0, -1, nil)
-	baseline.Accept(1)
-	baseline.Accept(1)
-	baseline.Accept(1)
-	got, err := baseline.Sample(logits)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if got != 1 {
-		t.Fatalf("unexpected baseline token: got %d want %d", got, 1)
-	}
-	frequency := NewSampler(0, 0, 1, 0, 1, 0, 1.0, -1, nil)
-	frequency.Accept(1)
-	frequency.Accept(1)
-	frequency.Accept(1)
-	got, err = frequency.Sample(logits)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if got != 2 {
-		t.Fatalf("frequency penalty did not demote repeated token as expected: got %d want %d", got, 2)
-	}
-}
 func BenchmarkTransforms(b *testing.B) {
 	// Generate random logits
 	tokens := make([]token, 1<<16)