mirror of
https://github.com/ollama/ollama.git
synced 2026-03-27 02:58:43 +07:00
bench: improve benchmarking tool (#14240)
New features: - Warmup phase to eliminate cold-start outliers - time-to-first-token measured in each epoch - VRAM/memory tracking to identify CPU spillover - Controlled prompt length - Defaults to 6 epochs and 200 tokens max Benchstat fixes: - ns/request instead of ns/op — non-standard unit created a separate group instead of grouping with timing metrics - Token count as the N field — benchstat interprets N as iteration count for statistical weighting, not as a token count
This commit is contained in:
@@ -1,27 +1,31 @@
|
||||
Ollama Benchmark Tool
|
||||
---------------------
|
||||
|
||||
A Go-based command-line tool for benchmarking Ollama models with configurable parameters and multiple output formats.
|
||||
A Go-based command-line tool for benchmarking Ollama models with configurable parameters, warmup phases, TTFT tracking, VRAM monitoring, and benchstat/CSV output.
|
||||
|
||||
## Features
|
||||
|
||||
* Benchmark multiple models in a single run
|
||||
* Support for both text and image prompts
|
||||
* Configurable generation parameters (temperature, max tokens, seed, etc.)
|
||||
* Supports benchstat and CSV output formats
|
||||
* Detailed performance metrics (prefill, generate, load, total durations)
|
||||
* Warmup phase before timed epochs to stabilize measurements
|
||||
* Time-to-first-token (TTFT) tracking per epoch
|
||||
* Model metadata display (parameter size, quantization level, family)
|
||||
* VRAM and CPU memory usage tracking via running process info
|
||||
* Controlled prompt token length for reproducible benchmarks
|
||||
* Benchstat and CSV output formats
|
||||
|
||||
## Building from Source
|
||||
|
||||
```
|
||||
go build -o ollama-bench bench.go
|
||||
./ollama-bench -model gpt-oss:20b -epochs 6 -format csv
|
||||
go build -o ollama-bench ./cmd/bench
|
||||
./ollama-bench -model gemma3 -epochs 6 -format csv
|
||||
```
|
||||
|
||||
Using Go Run (without building)
|
||||
|
||||
```
|
||||
go run bench.go -model gpt-oss:20b -epochs 3
|
||||
go run ./cmd/bench -model gemma3 -epochs 3
|
||||
```
|
||||
|
||||
## Usage
|
||||
@@ -45,10 +49,16 @@ benchstat -col /name gemma.bench
|
||||
./ollama-bench -model qwen3-vl -image photo.jpg -epochs 6 -max-tokens 100 -p "Describe this image"
|
||||
```
|
||||
|
||||
### Controlled Prompt Length
|
||||
|
||||
```
|
||||
./ollama-bench -model gemma3 -epochs 6 -prompt-tokens 512
|
||||
```
|
||||
|
||||
### Advanced Example
|
||||
|
||||
```
|
||||
./ollama-bench -model llama3 -epochs 10 -temperature 0.7 -max-tokens 500 -seed 42 -format csv -output results.csv
|
||||
./ollama-bench -model llama3 -epochs 10 -temperature 0.7 -max-tokens 500 -seed 42 -warmup 2 -format csv -output results.csv
|
||||
```
|
||||
|
||||
## Command Line Options
|
||||
@@ -56,41 +66,48 @@ benchstat -col /name gemma.bench
|
||||
| Option | Description | Default |
|
||||
|----------|-------------|---------|
|
||||
| -model | Comma-separated list of models to benchmark | (required) |
|
||||
| -epochs | Number of iterations per model | 1 |
|
||||
| -max-tokens | Maximum tokens for model response | 0 (unlimited) |
|
||||
| -epochs | Number of iterations per model | 6 |
|
||||
| -max-tokens | Maximum tokens for model response | 200 |
|
||||
| -temperature | Temperature parameter | 0.0 |
|
||||
| -seed | Random seed | 0 (random) |
|
||||
| -timeout | Timeout in seconds | 300 |
|
||||
| -p | Prompt text | "Write a long story." |
|
||||
| -p | Prompt text | (default story prompt) |
|
||||
| -image | Image file to include in prompt | |
|
||||
| -k | Keep-alive duration in seconds | 0 |
|
||||
| -format | Output format (benchstat, csv) | benchstat |
|
||||
| -output | Output file for results | "" (stdout) |
|
||||
| -warmup | Number of warmup requests before timing | 1 |
|
||||
| -prompt-tokens | Generate prompt targeting ~N tokens (0 = use -p) | 0 |
|
||||
| -v | Verbose mode | false |
|
||||
| -debug | Show debug information | false |
|
||||
|
||||
## Output Formats
|
||||
|
||||
### Markdown Format
|
||||
### Benchstat Format (default)
|
||||
|
||||
The default markdown format is suitable for copying and pasting into a GitHub issue and will look like:
|
||||
```
|
||||
| Model | Step | Count | Duration | nsPerToken | tokensPerSec |
|
||||
|-------|------|-------|----------|------------|--------------|
|
||||
| gpt-oss:20b | prefill | 124 | 30.006458ms | 241987.56 | 4132.44 |
|
||||
| gpt-oss:20b | generate | 200 | 2.646843954s | 13234219.77 | 75.56 |
|
||||
| gpt-oss:20b | load | 1 | 121.674208ms | - | - |
|
||||
| gpt-oss:20b | total | 1 | 2.861047625s | - | - |
|
||||
```
|
||||
|
||||
### Benchstat Format
|
||||
|
||||
Compatible with Go's benchstat tool for statistical analysis:
|
||||
Compatible with Go's benchstat tool for statistical analysis. Uses one value/unit pair per line, standard `ns/op` for timing metrics, and `ns/token` for throughput. Each epoch produces one set of lines -- benchstat aggregates across repeated runs to compute statistics.
|
||||
|
||||
```
|
||||
BenchmarkModel/name=gpt-oss:20b/step=prefill 128 78125.00 ns/token 12800.00 token/sec
|
||||
BenchmarkModel/name=gpt-oss:20b/step=generate 512 19531.25 ns/token 51200.00 token/sec
|
||||
BenchmarkModel/name=gpt-oss:20b/step=load 1 1500000000 ns/request
|
||||
# Model: gemma3 | Params: 4.3B | Quant: Q4_K_M | Family: gemma3 | Size: 4080218931 | VRAM: 4080218931
|
||||
BenchmarkModel/name=gemma3/step=prefill 1 78125.00 ns/token 12800.00 token/sec
|
||||
BenchmarkModel/name=gemma3/step=generate 1 19531.25 ns/token 51200.00 token/sec
|
||||
BenchmarkModel/name=gemma3/step=ttft 1 45123000 ns/op
|
||||
BenchmarkModel/name=gemma3/step=load 1 1500000000 ns/op
|
||||
BenchmarkModel/name=gemma3/step=total 1 2861047625 ns/op
|
||||
```
|
||||
|
||||
Use with benchstat:
|
||||
```
|
||||
./ollama-bench -model gemma3 -epochs 6 > gemma3.bench
|
||||
benchstat -col /step gemma3.bench
|
||||
```
|
||||
|
||||
Compare two runs:
|
||||
```
|
||||
./ollama-bench -model gemma3 -epochs 6 > before.bench
|
||||
# ... make changes ...
|
||||
./ollama-bench -model gemma3 -epochs 6 > after.bench
|
||||
benchstat before.bench after.bench
|
||||
```
|
||||
|
||||
### CSV Format
|
||||
@@ -99,17 +116,28 @@ Machine-readable comma-separated values:
|
||||
|
||||
```
|
||||
NAME,STEP,COUNT,NS_PER_COUNT,TOKEN_PER_SEC
|
||||
gpt-oss:20b,prefill,128,78125.00,12800.00
|
||||
gpt-oss:20b,generate,512,19531.25,51200.00
|
||||
gpt-oss:20b,load,1,1500000000,0
|
||||
# Model: gemma3 | Params: 4.3B | Quant: Q4_K_M | Family: gemma3 | Size: 4080218931 | VRAM: 4080218931
|
||||
gemma3,prefill,128,78125.00,12800.00
|
||||
gemma3,generate,512,19531.25,51200.00
|
||||
gemma3,ttft,1,45123000,0
|
||||
gemma3,load,1,1500000000,0
|
||||
gemma3,total,1,2861047625,0
|
||||
```
|
||||
|
||||
## Metrics Explained
|
||||
|
||||
The tool reports four types of metrics for each model:
|
||||
The tool reports the following metrics for each epoch:
|
||||
|
||||
* prefill: Time spent processing the prompt
|
||||
* generate: Time spent generating the response
|
||||
* load: Model loading time (one-time cost)
|
||||
* total: Total request duration
|
||||
* **prefill**: Time spent processing the prompt (ns/token)
|
||||
* **generate**: Time spent generating the response (ns/token)
|
||||
* **ttft**: Time to first token -- latency from request start to first response content
|
||||
* **load**: Model loading time (one-time cost)
|
||||
* **total**: Total request duration
|
||||
|
||||
Additionally, the model info comment line (displayed once per model before epochs) includes:
|
||||
|
||||
* **Params**: Model parameter count (e.g., 4.3B)
|
||||
* **Quant**: Quantization level (e.g., Q4_K_M)
|
||||
* **Family**: Model family (e.g., gemma3)
|
||||
* **Size**: Total model memory in bytes
|
||||
* **VRAM**: GPU memory used by the loaded model (when Size > VRAM, the difference is CPU spill)
|
||||
|
||||
@@ -17,19 +17,21 @@ import (
|
||||
)
|
||||
|
||||
// flagOptions holds pointers to every command-line flag the tool accepts.
// Fields are pointers because they are bound directly by the flag package
// in main. (The scraped diff duplicated the pre- and post-change field
// lists; this is the post-change set, which matches the fields read by
// buildGenerateRequest and the warmup loop.)
type flagOptions struct {
	models       *string  // comma-separated list of models to benchmark
	epochs       *int     // timed iterations per model
	maxTokens    *int     // cap on generated tokens per request (0 = unlimited)
	temperature  *float64 // sampling temperature
	seed         *int     // random seed (0 = random)
	timeout      *int     // per-request timeout in seconds
	prompt       *string  // prompt text (used when promptTokens == 0)
	imageFile    *string  // optional image file to attach to the prompt
	keepAlive    *float64 // keep-alive duration in seconds after each request
	format       *string  // output format: "benchstat" or "csv"
	outputFile   *string  // results file ("" = stdout)
	debug        *bool    // emit debug information on stderr
	verbose      *bool    // emit system information headers
	warmup       *int     // untimed warmup requests before the timed epochs
	promptTokens *int     // synthesize a prompt targeting ~N tokens (0 = use prompt)
}
|
||||
|
||||
type Metrics struct {
|
||||
@@ -39,48 +41,169 @@ type Metrics struct {
|
||||
Duration time.Duration
|
||||
}
|
||||
|
||||
// once guards one-time header emission in OutputMetrics (benchstat system
// info and the CSV column row are printed at most once per process).
var once sync.Once
|
||||
// ModelInfo aggregates the metadata reported for a model: its identity,
// quantization details, and — once the model is loaded — its memory
// footprint as reported by the list-running endpoint.
type ModelInfo struct {
	Name              string // model name as passed on the command line
	ParameterSize     string // e.g. "4.3B" (from show response details)
	QuantizationLevel string // e.g. "Q4_K_M" (from show response details)
	Family            string // model family, e.g. "gemma3"
	SizeBytes         int64  // total model memory in bytes (0 if unknown)
	VRAMBytes         int64  // portion resident in GPU memory; Size-VRAM is CPU spill
}
|
||||
|
||||
const DefaultPrompt = `Please write a descriptive story about a llama named Alonso who grows up to be President of the Land of Llamas. Include details about Alonso's childhood, adolescent years, and how he grew up to be a political mover and shaker. Write the story with a sense of whimsy.`
|
||||
|
||||
// Word list for generating prompts targeting a specific token count.
|
||||
var promptWordList = []string{
|
||||
"the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog",
|
||||
"a", "bright", "sunny", "day", "in", "the", "meadow", "where",
|
||||
"flowers", "bloom", "and", "birds", "sing", "their", "morning",
|
||||
"songs", "while", "gentle", "breeze", "carries", "sweet", "scent",
|
||||
"of", "pine", "trees", "across", "rolling", "hills", "toward",
|
||||
"distant", "mountains", "covered", "with", "fresh", "snow",
|
||||
"beneath", "clear", "blue", "sky", "children", "play", "near",
|
||||
"old", "stone", "bridge", "that", "crosses", "winding", "river",
|
||||
}
|
||||
|
||||
func generatePromptForTokenCount(targetTokens int, epoch int) string {
|
||||
// ~1.3 tokens per word heuristic
|
||||
targetWords := int(float64(targetTokens) / 1.3)
|
||||
if targetWords < 1 {
|
||||
targetWords = 1
|
||||
}
|
||||
|
||||
// Vary the starting offset by epoch to defeat KV cache prefix matching
|
||||
offset := epoch * 7 // stride by a prime to get good distribution
|
||||
n := len(promptWordList)
|
||||
words := make([]string, targetWords)
|
||||
for i := range words {
|
||||
words[i] = promptWordList[((i+offset)%n+n)%n]
|
||||
}
|
||||
return strings.Join(words, " ")
|
||||
}
|
||||
|
||||
func buildGenerateRequest(model string, fOpt flagOptions, imgData api.ImageData, epoch int) *api.GenerateRequest {
|
||||
options := make(map[string]interface{})
|
||||
if *fOpt.maxTokens > 0 {
|
||||
options["num_predict"] = *fOpt.maxTokens
|
||||
}
|
||||
options["temperature"] = *fOpt.temperature
|
||||
if fOpt.seed != nil && *fOpt.seed > 0 {
|
||||
options["seed"] = *fOpt.seed
|
||||
}
|
||||
|
||||
var keepAliveDuration *api.Duration
|
||||
if *fOpt.keepAlive > 0 {
|
||||
duration := api.Duration{Duration: time.Duration(*fOpt.keepAlive * float64(time.Second))}
|
||||
keepAliveDuration = &duration
|
||||
}
|
||||
|
||||
prompt := *fOpt.prompt
|
||||
if *fOpt.promptTokens > 0 {
|
||||
prompt = generatePromptForTokenCount(*fOpt.promptTokens, epoch)
|
||||
} else {
|
||||
// Vary the prompt per epoch to defeat KV cache prefix matching
|
||||
prompt = fmt.Sprintf("[%d] %s", epoch, prompt)
|
||||
}
|
||||
|
||||
req := &api.GenerateRequest{
|
||||
Model: model,
|
||||
Prompt: prompt,
|
||||
Raw: true,
|
||||
Options: options,
|
||||
KeepAlive: keepAliveDuration,
|
||||
}
|
||||
|
||||
if imgData != nil {
|
||||
req.Images = []api.ImageData{imgData}
|
||||
}
|
||||
|
||||
return req
|
||||
}
|
||||
|
||||
func fetchModelInfo(ctx context.Context, client *api.Client, model string) ModelInfo {
|
||||
info := ModelInfo{Name: model}
|
||||
resp, err := client.Show(ctx, &api.ShowRequest{Model: model})
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "WARNING: Could not fetch model info for '%s': %v\n", model, err)
|
||||
return info
|
||||
}
|
||||
info.ParameterSize = resp.Details.ParameterSize
|
||||
info.QuantizationLevel = resp.Details.QuantizationLevel
|
||||
info.Family = resp.Details.Family
|
||||
return info
|
||||
}
|
||||
|
||||
func fetchMemoryUsage(ctx context.Context, client *api.Client, model string) (size, vram int64) {
|
||||
resp, err := client.ListRunning(ctx)
|
||||
if err != nil {
|
||||
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
|
||||
fmt.Fprintf(os.Stderr, "WARNING: Could not fetch memory usage: %v\n", err)
|
||||
}
|
||||
return 0, 0
|
||||
}
|
||||
for _, m := range resp.Models {
|
||||
if m.Name == model || m.Model == model {
|
||||
return m.Size, m.SizeVRAM
|
||||
}
|
||||
}
|
||||
// Try prefix match (model names may include :latest or tags)
|
||||
for _, m := range resp.Models {
|
||||
if strings.HasPrefix(m.Name, model) || strings.HasPrefix(m.Model, model) {
|
||||
return m.Size, m.SizeVRAM
|
||||
}
|
||||
}
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
func outputFormatHeader(w io.Writer, format string, verbose bool) {
|
||||
switch format {
|
||||
case "benchstat":
|
||||
if verbose {
|
||||
fmt.Fprintf(w, "goos: %s\n", runtime.GOOS)
|
||||
fmt.Fprintf(w, "goarch: %s\n", runtime.GOARCH)
|
||||
}
|
||||
case "csv":
|
||||
headings := []string{"NAME", "STEP", "COUNT", "NS_PER_COUNT", "TOKEN_PER_SEC"}
|
||||
fmt.Fprintln(w, strings.Join(headings, ","))
|
||||
}
|
||||
}
|
||||
|
||||
func outputModelInfo(w io.Writer, format string, info ModelInfo) {
|
||||
params := cmp.Or(info.ParameterSize, "unknown")
|
||||
quant := cmp.Or(info.QuantizationLevel, "unknown")
|
||||
family := cmp.Or(info.Family, "unknown")
|
||||
|
||||
memStr := ""
|
||||
if info.SizeBytes > 0 {
|
||||
memStr = fmt.Sprintf(" | Size: %d | VRAM: %d", info.SizeBytes, info.VRAMBytes)
|
||||
}
|
||||
fmt.Fprintf(w, "# Model: %s | Params: %s | Quant: %s | Family: %s%s\n",
|
||||
info.Name, params, quant, family, memStr)
|
||||
}
|
||||
|
||||
func OutputMetrics(w io.Writer, format string, metrics []Metrics, verbose bool) {
|
||||
switch format {
|
||||
case "benchstat":
|
||||
if verbose {
|
||||
printHeader := func() {
|
||||
fmt.Fprintf(w, "sysname: %s\n", runtime.GOOS)
|
||||
fmt.Fprintf(w, "machine: %s\n", runtime.GOARCH)
|
||||
}
|
||||
once.Do(printHeader)
|
||||
}
|
||||
for _, m := range metrics {
|
||||
if m.Step == "generate" || m.Step == "prefill" {
|
||||
if m.Count > 0 {
|
||||
nsPerToken := float64(m.Duration.Nanoseconds()) / float64(m.Count)
|
||||
tokensPerSec := float64(m.Count) / (float64(m.Duration.Nanoseconds()) + 1e-12) * 1e9
|
||||
|
||||
fmt.Fprintf(w, "BenchmarkModel/name=%s/step=%s %d %.2f ns/token %.2f token/sec\n",
|
||||
m.Model, m.Step, m.Count, nsPerToken, tokensPerSec)
|
||||
fmt.Fprintf(w, "BenchmarkModel/name=%s/step=%s 1 %.2f ns/token %.2f token/sec\n",
|
||||
m.Model, m.Step, nsPerToken, tokensPerSec)
|
||||
} else {
|
||||
fmt.Fprintf(w, "BenchmarkModel/name=%s/step=%s %d 0 ns/token 0 token/sec\n",
|
||||
m.Model, m.Step, m.Count)
|
||||
fmt.Fprintf(w, "BenchmarkModel/name=%s/step=%s 1 0 ns/token 0 token/sec\n",
|
||||
m.Model, m.Step)
|
||||
}
|
||||
} else if m.Step == "ttft" {
|
||||
fmt.Fprintf(w, "BenchmarkModel/name=%s/step=ttft 1 %d ns/op\n",
|
||||
m.Model, m.Duration.Nanoseconds())
|
||||
} else {
|
||||
var suffix string
|
||||
if m.Step == "load" {
|
||||
suffix = "/step=load"
|
||||
}
|
||||
fmt.Fprintf(w, "BenchmarkModel/name=%s%s 1 %d ns/request\n",
|
||||
m.Model, suffix, m.Duration.Nanoseconds())
|
||||
fmt.Fprintf(w, "BenchmarkModel/name=%s/step=%s 1 %d ns/op\n",
|
||||
m.Model, m.Step, m.Duration.Nanoseconds())
|
||||
}
|
||||
}
|
||||
case "csv":
|
||||
printHeader := func() {
|
||||
headings := []string{"NAME", "STEP", "COUNT", "NS_PER_COUNT", "TOKEN_PER_SEC"}
|
||||
fmt.Fprintln(w, strings.Join(headings, ","))
|
||||
}
|
||||
once.Do(printHeader)
|
||||
|
||||
for _, m := range metrics {
|
||||
if m.Step == "generate" || m.Step == "prefill" {
|
||||
var nsPerToken float64
|
||||
@@ -94,39 +217,14 @@ func OutputMetrics(w io.Writer, format string, metrics []Metrics, verbose bool)
|
||||
fmt.Fprintf(w, "%s,%s,1,%d,0\n", m.Model, m.Step, m.Duration.Nanoseconds())
|
||||
}
|
||||
}
|
||||
case "markdown":
|
||||
printHeader := func() {
|
||||
fmt.Fprintln(w, "| Model | Step | Count | Duration | nsPerToken | tokensPerSec |")
|
||||
fmt.Fprintln(w, "|-------|------|-------|----------|------------|--------------|")
|
||||
}
|
||||
once.Do(printHeader)
|
||||
|
||||
for _, m := range metrics {
|
||||
var nsPerToken, tokensPerSec float64
|
||||
var nsPerTokenStr, tokensPerSecStr string
|
||||
|
||||
if m.Step == "generate" || m.Step == "prefill" {
|
||||
nsPerToken = float64(m.Duration.Nanoseconds()) / float64(m.Count)
|
||||
tokensPerSec = float64(m.Count) / (float64(m.Duration.Nanoseconds()) + 1e-12) * 1e9
|
||||
nsPerTokenStr = fmt.Sprintf("%.2f", nsPerToken)
|
||||
tokensPerSecStr = fmt.Sprintf("%.2f", tokensPerSec)
|
||||
} else {
|
||||
nsPerTokenStr = "-"
|
||||
tokensPerSecStr = "-"
|
||||
}
|
||||
|
||||
fmt.Fprintf(w, "| %s | %s | %d | %v | %s | %s |\n",
|
||||
m.Model, m.Step, m.Count, m.Duration, nsPerTokenStr, tokensPerSecStr)
|
||||
}
|
||||
default:
|
||||
fmt.Fprintf(os.Stderr, "Unknown output format '%s'\n", format)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkChat(fOpt flagOptions) error {
|
||||
func BenchmarkModel(fOpt flagOptions) error {
|
||||
models := strings.Split(*fOpt.models, ",")
|
||||
|
||||
// todo - add multi-image support
|
||||
var imgData api.ImageData
|
||||
var err error
|
||||
if *fOpt.imageFile != "" {
|
||||
@@ -158,71 +256,124 @@ func BenchmarkChat(fOpt flagOptions) error {
|
||||
out = f
|
||||
}
|
||||
|
||||
outputFormatHeader(out, *fOpt.format, *fOpt.verbose)
|
||||
|
||||
// Log prompt-tokens info in debug mode
|
||||
if *fOpt.debug && *fOpt.promptTokens > 0 {
|
||||
prompt := generatePromptForTokenCount(*fOpt.promptTokens, 0)
|
||||
wordCount := len(strings.Fields(prompt))
|
||||
fmt.Fprintf(os.Stderr, "Generated prompt targeting ~%d tokens (%d words, varied per epoch)\n", *fOpt.promptTokens, wordCount)
|
||||
}
|
||||
|
||||
for _, model := range models {
|
||||
for range *fOpt.epochs {
|
||||
options := make(map[string]interface{})
|
||||
if *fOpt.maxTokens > 0 {
|
||||
options["num_predict"] = *fOpt.maxTokens
|
||||
}
|
||||
options["temperature"] = *fOpt.temperature
|
||||
if fOpt.seed != nil && *fOpt.seed > 0 {
|
||||
options["seed"] = *fOpt.seed
|
||||
}
|
||||
|
||||
var keepAliveDuration *api.Duration
|
||||
if *fOpt.keepAlive > 0 {
|
||||
duration := api.Duration{Duration: time.Duration(*fOpt.keepAlive * float64(time.Second))}
|
||||
keepAliveDuration = &duration
|
||||
}
|
||||
|
||||
req := &api.ChatRequest{
|
||||
Model: model,
|
||||
Messages: []api.Message{
|
||||
{
|
||||
Role: "user",
|
||||
Content: *fOpt.prompt,
|
||||
},
|
||||
},
|
||||
Options: options,
|
||||
KeepAlive: keepAliveDuration,
|
||||
}
|
||||
|
||||
if imgData != nil {
|
||||
req.Messages[0].Images = []api.ImageData{imgData}
|
||||
}
|
||||
|
||||
var responseMetrics *api.Metrics
|
||||
// Fetch model info
|
||||
infoCtx, infoCancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
info := fetchModelInfo(infoCtx, client, model)
|
||||
infoCancel()
|
||||
|
||||
// Warmup phase (uses negative epoch numbers to avoid colliding with timed epochs)
|
||||
for i := range *fOpt.warmup {
|
||||
req := buildGenerateRequest(model, fOpt, imgData, -(i + 1))
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(*fOpt.timeout)*time.Second)
|
||||
defer cancel()
|
||||
|
||||
err = client.Chat(ctx, req, func(resp api.ChatResponse) error {
|
||||
if *fOpt.debug {
|
||||
fmt.Fprintf(os.Stderr, "%s", cmp.Or(resp.Message.Thinking, resp.Message.Content))
|
||||
}
|
||||
|
||||
if resp.Done {
|
||||
responseMetrics = &resp.Metrics
|
||||
}
|
||||
err = client.Generate(ctx, req, func(resp api.GenerateResponse) error {
|
||||
return nil
|
||||
})
|
||||
|
||||
if *fOpt.debug {
|
||||
fmt.Fprintln(os.Stderr)
|
||||
}
|
||||
cancel()
|
||||
|
||||
if err != nil {
|
||||
if ctx.Err() == context.DeadlineExceeded {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: Chat request timed out with model '%s' after %vs\n", model, 1)
|
||||
continue
|
||||
fmt.Fprintf(os.Stderr, "WARNING: Warmup %d/%d for %s failed: %v\n", i+1, *fOpt.warmup, model, err)
|
||||
} else if *fOpt.debug {
|
||||
fmt.Fprintf(os.Stderr, "Warmup %d/%d for %s complete\n", i+1, *fOpt.warmup, model)
|
||||
}
|
||||
}
|
||||
|
||||
// Fetch memory usage once after warmup (model is loaded and stable)
|
||||
memCtx, memCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
info.SizeBytes, info.VRAMBytes = fetchMemoryUsage(memCtx, client, model)
|
||||
memCancel()
|
||||
|
||||
outputModelInfo(out, *fOpt.format, info)
|
||||
|
||||
// Timed epoch loop
|
||||
shortCount := 0
|
||||
for epoch := range *fOpt.epochs {
|
||||
var responseMetrics *api.Metrics
|
||||
var ttft time.Duration
|
||||
short := false
|
||||
|
||||
// Retry loop: if the model hits a stop token before max-tokens,
|
||||
// retry with a different prompt (up to maxRetries times).
|
||||
const maxRetries = 3
|
||||
for attempt := range maxRetries + 1 {
|
||||
responseMetrics = nil
|
||||
ttft = 0
|
||||
var ttftOnce sync.Once
|
||||
|
||||
req := buildGenerateRequest(model, fOpt, imgData, epoch+attempt*1000)
|
||||
requestStart := time.Now()
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(*fOpt.timeout)*time.Second)
|
||||
|
||||
err = client.Generate(ctx, req, func(resp api.GenerateResponse) error {
|
||||
if *fOpt.debug {
|
||||
fmt.Fprintf(os.Stderr, "%s", cmp.Or(resp.Thinking, resp.Response))
|
||||
}
|
||||
|
||||
// Capture TTFT on first content
|
||||
ttftOnce.Do(func() {
|
||||
if resp.Response != "" || resp.Thinking != "" {
|
||||
ttft = time.Since(requestStart)
|
||||
}
|
||||
})
|
||||
|
||||
if resp.Done {
|
||||
responseMetrics = &resp.Metrics
|
||||
}
|
||||
return nil
|
||||
})
|
||||
cancel()
|
||||
|
||||
if *fOpt.debug {
|
||||
fmt.Fprintln(os.Stderr)
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "ERROR: Couldn't chat with model '%s': %v\n", model, err)
|
||||
|
||||
if err != nil {
|
||||
if ctx.Err() == context.DeadlineExceeded {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: Request timed out with model '%s' after %vs\n", model, *fOpt.timeout)
|
||||
} else {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: Couldn't generate with model '%s': %v\n", model, err)
|
||||
}
|
||||
break
|
||||
}
|
||||
|
||||
if responseMetrics == nil {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: No metrics received for model '%s'\n", model)
|
||||
break
|
||||
}
|
||||
|
||||
// Check if the response was shorter than requested
|
||||
short = *fOpt.maxTokens > 0 && responseMetrics.EvalCount < *fOpt.maxTokens
|
||||
if !short || attempt == maxRetries {
|
||||
break
|
||||
}
|
||||
|
||||
if *fOpt.debug {
|
||||
fmt.Fprintf(os.Stderr, "Short response (%d/%d tokens), retrying with different prompt (attempt %d/%d)\n",
|
||||
responseMetrics.EvalCount, *fOpt.maxTokens, attempt+1, maxRetries)
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil || responseMetrics == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if responseMetrics == nil {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: No metrics received for model '%s'\n", model)
|
||||
continue
|
||||
if short {
|
||||
shortCount++
|
||||
if *fOpt.debug {
|
||||
fmt.Fprintf(os.Stderr, "WARNING: Short response (%d/%d tokens) after %d retries for epoch %d\n",
|
||||
responseMetrics.EvalCount, *fOpt.maxTokens, maxRetries, epoch+1)
|
||||
}
|
||||
}
|
||||
|
||||
metrics := []Metrics{
|
||||
@@ -238,6 +389,12 @@ func BenchmarkChat(fOpt flagOptions) error {
|
||||
Count: responseMetrics.EvalCount,
|
||||
Duration: responseMetrics.EvalDuration,
|
||||
},
|
||||
{
|
||||
Model: model,
|
||||
Step: "ttft",
|
||||
Count: 1,
|
||||
Duration: ttft,
|
||||
},
|
||||
{
|
||||
Model: model,
|
||||
Step: "load",
|
||||
@@ -254,15 +411,42 @@ func BenchmarkChat(fOpt flagOptions) error {
|
||||
|
||||
OutputMetrics(out, *fOpt.format, metrics, *fOpt.verbose)
|
||||
|
||||
if *fOpt.debug && *fOpt.promptTokens > 0 {
|
||||
fmt.Fprintf(os.Stderr, "Generated prompt targeting ~%d tokens (actual: %d)\n",
|
||||
*fOpt.promptTokens, responseMetrics.PromptEvalCount)
|
||||
}
|
||||
|
||||
if *fOpt.keepAlive > 0 {
|
||||
time.Sleep(time.Duration(*fOpt.keepAlive*float64(time.Second)) + 200*time.Millisecond)
|
||||
}
|
||||
}
|
||||
|
||||
if shortCount > 0 {
|
||||
fmt.Fprintf(os.Stderr, "WARNING: %d/%d epochs for '%s' had short responses (<%d tokens). Generation metrics may be unreliable.\n",
|
||||
shortCount, *fOpt.epochs, model, *fOpt.maxTokens)
|
||||
}
|
||||
|
||||
// Unload model before moving to the next one
|
||||
unloadModel(client, model, *fOpt.timeout)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func unloadModel(client *api.Client, model string, timeout int) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Second)
|
||||
defer cancel()
|
||||
|
||||
zero := api.Duration{Duration: 0}
|
||||
req := &api.GenerateRequest{
|
||||
Model: model,
|
||||
KeepAlive: &zero,
|
||||
}
|
||||
_ = client.Generate(ctx, req, func(resp api.GenerateResponse) error {
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
func readImage(filePath string) (api.ImageData, error) {
|
||||
file, err := os.Open(filePath)
|
||||
if err != nil {
|
||||
@@ -280,19 +464,21 @@ func readImage(filePath string) (api.ImageData, error) {
|
||||
|
||||
func main() {
|
||||
fOpt := flagOptions{
|
||||
models: flag.String("model", "", "Model to benchmark"),
|
||||
epochs: flag.Int("epochs", 6, "Number of epochs (iterations) per model"),
|
||||
maxTokens: flag.Int("max-tokens", 200, "Maximum tokens for model response"),
|
||||
temperature: flag.Float64("temperature", 0, "Temperature parameter"),
|
||||
seed: flag.Int("seed", 0, "Random seed"),
|
||||
timeout: flag.Int("timeout", 60*5, "Timeout in seconds (default 300s)"),
|
||||
prompt: flag.String("p", DefaultPrompt, "Prompt to use"),
|
||||
imageFile: flag.String("image", "", "Filename for an image to include"),
|
||||
keepAlive: flag.Float64("k", 0, "Keep alive duration in seconds"),
|
||||
format: flag.String("format", "markdown", "Output format [benchstat|csv] (default benchstat)"),
|
||||
outputFile: flag.String("output", "", "Output file for results (stdout if empty)"),
|
||||
verbose: flag.Bool("v", false, "Show system information"),
|
||||
debug: flag.Bool("debug", false, "Show debug information"),
|
||||
models: flag.String("model", "", "Model to benchmark"),
|
||||
epochs: flag.Int("epochs", 6, "Number of epochs (iterations) per model"),
|
||||
maxTokens: flag.Int("max-tokens", 200, "Maximum tokens for model response"),
|
||||
temperature: flag.Float64("temperature", 0, "Temperature parameter"),
|
||||
seed: flag.Int("seed", 0, "Random seed"),
|
||||
timeout: flag.Int("timeout", 60*5, "Timeout in seconds (default 300s)"),
|
||||
prompt: flag.String("p", DefaultPrompt, "Prompt to use"),
|
||||
imageFile: flag.String("image", "", "Filename for an image to include"),
|
||||
keepAlive: flag.Float64("k", 0, "Keep alive duration in seconds"),
|
||||
format: flag.String("format", "benchstat", "Output format [benchstat|csv]"),
|
||||
outputFile: flag.String("output", "", "Output file for results (stdout if empty)"),
|
||||
verbose: flag.Bool("v", false, "Show system information"),
|
||||
debug: flag.Bool("debug", false, "Show debug information"),
|
||||
warmup: flag.Int("warmup", 1, "Number of warmup requests before timing"),
|
||||
promptTokens: flag.Int("prompt-tokens", 0, "Generate prompt targeting ~N tokens (0 = use -p prompt)"),
|
||||
}
|
||||
|
||||
flag.Usage = func() {
|
||||
@@ -302,11 +488,12 @@ func main() {
|
||||
fmt.Fprintf(os.Stderr, "Options:\n")
|
||||
flag.PrintDefaults()
|
||||
fmt.Fprintf(os.Stderr, "\nExamples:\n")
|
||||
fmt.Fprintf(os.Stderr, " bench -model gpt-oss:20b -epochs 3 -temperature 0.7\n")
|
||||
fmt.Fprintf(os.Stderr, " bench -model gemma3,llama3 -epochs 6\n")
|
||||
fmt.Fprintf(os.Stderr, " bench -model gemma3 -epochs 6 -prompt-tokens 512 -format csv\n")
|
||||
}
|
||||
flag.Parse()
|
||||
|
||||
if !slices.Contains([]string{"markdown", "benchstat", "csv"}, *fOpt.format) {
|
||||
if !slices.Contains([]string{"benchstat", "csv"}, *fOpt.format) {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: Unknown format '%s'\n", *fOpt.format)
|
||||
os.Exit(1)
|
||||
}
|
||||
@@ -317,5 +504,5 @@ func main() {
|
||||
return
|
||||
}
|
||||
|
||||
BenchmarkChat(fOpt)
|
||||
BenchmarkModel(fOpt)
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user