mirror of
https://github.com/ollama/ollama.git
synced 2026-03-27 02:58:43 +07:00
261 lines
7.7 KiB
Go
261 lines
7.7 KiB
Go
package lfm2
|
|
|
|
import (
|
|
"image"
|
|
stdimage "image/draw"
|
|
"math"
|
|
"slices"
|
|
|
|
"github.com/ollama/ollama/fs"
|
|
"github.com/ollama/ollama/model/imageproc"
|
|
)
|
|
|
|
// ImageProcessor carries the preprocessing settings for one vision model:
// how images are resized, when and how they are cut into tiles, and how
// pixels are normalized. Values are filled in by newImageProcessor.
type ImageProcessor struct {
	// imageSize is read from "vision.image_size" (default 256).
	imageSize int

	// patchSize multiplied by downsampleFactor is the multiple that resized
	// image dimensions are rounded to.
	patchSize int

	// numChannels is read from "vision.num_channels" (default 3).
	numChannels int

	// downsampleFactor is read from "vision.projector.scale_factor".
	downsampleFactor int

	// imageMean is the per-channel mean handed to imageproc.Normalize.
	imageMean [3]float32

	// imageStd is the per-channel standard deviation handed to
	// imageproc.Normalize.
	imageStd [3]float32

	// doImageSplitting enables cutting oversized images into tiles.
	doImageSplitting bool

	// minTiles is the smallest tile count (width*height) a grid may have.
	minTiles int

	// maxTiles is the largest tile count (width*height) a grid may have.
	maxTiles int

	// useThumbnail requests an extra downscaled copy of the whole image
	// whenever the grid has more than one tile.
	useThumbnail bool

	// tileSize is the edge length, in pixels, of each square tile.
	tileSize int

	// minImageTokens sets the lower pixel budget used when resizing.
	minImageTokens int

	// maxImageTokens sets the upper pixel budget used when resizing and when
	// deciding whether to split.
	maxImageTokens int

	// maxPixelsTolerance scales the upper pixel budget in the split decision.
	maxPixelsTolerance float64
}
|
|
|
|
// processedVisionImage is one preprocessed picture produced by
// ImageProcessor.ProcessImage: a tile, the thumbnail, or the whole image.
type processedVisionImage struct {
	// data is the output of imageproc.Normalize for this picture.
	data []float32

	// size is the pixel size of the picture that was normalized.
	size image.Point

	// row and col are the 1-based grid position of a tile. They are left at
	// zero for the thumbnail and for an image that was not split.
	row, col int

	// thumbnail is true only for the downscaled whole-image copy that is
	// appended after the tiles.
	thumbnail bool
}
|
|
|
|
// processedVisionLayout describes how ImageProcessor.ProcessImage arranged
// its output pictures.
type processedVisionLayout struct {
	// rows and cols give the tile grid dimensions; both are 1 when the image
	// was not split.
	rows, cols int

	// hasThumbnail reports whether a thumbnail follows the tiles.
	hasThumbnail bool
}
|
|
|
|
func newImageProcessor(c fs.Config) ImageProcessor {
|
|
mean := c.Floats("vision.image_mean")
|
|
std := c.Floats("vision.image_std")
|
|
|
|
processor := ImageProcessor{
|
|
imageSize: int(c.Uint("vision.image_size", 256)),
|
|
patchSize: int(c.Uint("vision.patch_size", 16)),
|
|
numChannels: int(c.Uint("vision.num_channels", 3)),
|
|
downsampleFactor: int(c.Uint("vision.projector.scale_factor", 2)),
|
|
imageMean: [3]float32{0.5, 0.5, 0.5},
|
|
imageStd: [3]float32{0.5, 0.5, 0.5},
|
|
doImageSplitting: c.Bool("vision.do_image_splitting", true),
|
|
minTiles: int(c.Uint("vision.min_tiles", 2)),
|
|
maxTiles: int(c.Uint("vision.max_tiles", 10)),
|
|
useThumbnail: c.Bool("vision.use_thumbnail", true),
|
|
tileSize: int(c.Uint("vision.tile_size", 512)),
|
|
minImageTokens: int(c.Uint("vision.min_image_tokens", 64)),
|
|
maxImageTokens: int(c.Uint("vision.max_image_tokens", 256)),
|
|
maxPixelsTolerance: float64(c.Float("vision.max_pixels_tolerance", 2.0)),
|
|
}
|
|
|
|
if len(mean) >= 3 {
|
|
processor.imageMean = [3]float32{mean[0], mean[1], mean[2]}
|
|
}
|
|
if len(std) >= 3 {
|
|
processor.imageStd = [3]float32{std[0], std[1], std[2]}
|
|
}
|
|
|
|
// Keep defaults aligned with HF unless explicitly configured.
|
|
if processor.downsampleFactor <= 0 {
|
|
processor.downsampleFactor = 2
|
|
}
|
|
if processor.patchSize <= 0 {
|
|
processor.patchSize = 16
|
|
}
|
|
if processor.tileSize <= 0 {
|
|
processor.tileSize = 512
|
|
}
|
|
if processor.minTiles <= 0 {
|
|
processor.minTiles = 2
|
|
}
|
|
if processor.maxTiles < processor.minTiles {
|
|
processor.maxTiles = processor.minTiles
|
|
}
|
|
if processor.minImageTokens <= 0 {
|
|
processor.minImageTokens = 64
|
|
}
|
|
if processor.maxImageTokens < processor.minImageTokens {
|
|
processor.maxImageTokens = processor.minImageTokens
|
|
}
|
|
if processor.maxPixelsTolerance <= 0 {
|
|
processor.maxPixelsTolerance = 2.0
|
|
}
|
|
|
|
return processor
|
|
}
|
|
|
|
func (p ImageProcessor) ProcessImage(img image.Image) ([]processedVisionImage, processedVisionLayout, error) {
|
|
img = imageproc.Composite(img)
|
|
|
|
orig := img.Bounds().Size()
|
|
resizedWidth, resizedHeight := p.smartResize(orig.Y, orig.X)
|
|
|
|
layout := processedVisionLayout{rows: 1, cols: 1}
|
|
if p.shouldSplit(orig.Y, orig.X) {
|
|
gridWidth, gridHeight, targetWidth, targetHeight := p.gridLayout(orig.Y, orig.X)
|
|
layout.rows = gridHeight
|
|
layout.cols = gridWidth
|
|
layout.hasThumbnail = p.useThumbnail && gridWidth*gridHeight != 1
|
|
|
|
resized := imageproc.Resize(img, image.Point{X: targetWidth, Y: targetHeight}, imageproc.ResizeBilinear)
|
|
images := make([]processedVisionImage, 0, gridWidth*gridHeight+1)
|
|
for row := range gridHeight {
|
|
for col := range gridWidth {
|
|
rect := image.Rect(
|
|
col*p.tileSize,
|
|
row*p.tileSize,
|
|
(col+1)*p.tileSize,
|
|
(row+1)*p.tileSize,
|
|
)
|
|
tile := cropImage(resized, rect)
|
|
images = append(images, processedVisionImage{
|
|
data: imageproc.Normalize(tile, p.imageMean, p.imageStd, true, true),
|
|
size: tile.Bounds().Size(),
|
|
row: row + 1,
|
|
col: col + 1,
|
|
})
|
|
}
|
|
}
|
|
|
|
if layout.hasThumbnail {
|
|
thumbnail := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)
|
|
images = append(images, processedVisionImage{
|
|
data: imageproc.Normalize(thumbnail, p.imageMean, p.imageStd, true, true),
|
|
size: thumbnail.Bounds().Size(),
|
|
thumbnail: true,
|
|
})
|
|
}
|
|
|
|
return images, layout, nil
|
|
}
|
|
|
|
single := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)
|
|
return []processedVisionImage{{
|
|
data: imageproc.Normalize(single, p.imageMean, p.imageStd, true, true),
|
|
size: single.Bounds().Size(),
|
|
}}, layout, nil
|
|
}
|
|
|
|
func (p ImageProcessor) shouldSplit(height, width int) bool {
|
|
if !p.doImageSplitting || p.minTiles == 1 && p.maxTiles == 1 {
|
|
return false
|
|
}
|
|
|
|
totalFactor := p.patchSize * p.downsampleFactor
|
|
hBar := max(p.patchSize, roundByFactor(height, totalFactor))
|
|
wBar := max(p.patchSize, roundByFactor(width, totalFactor))
|
|
|
|
limit := float64(p.maxImageTokens * p.patchSize * p.patchSize * p.downsampleFactor * p.downsampleFactor)
|
|
limit *= p.maxPixelsTolerance
|
|
|
|
return float64(hBar*wBar) > limit
|
|
}
|
|
|
|
func (p ImageProcessor) smartResize(height, width int) (int, int) {
|
|
totalFactor := p.patchSize * p.downsampleFactor
|
|
minPixels := p.minImageTokens * p.patchSize * p.patchSize * p.downsampleFactor * p.downsampleFactor
|
|
maxPixels := p.maxImageTokens * p.patchSize * p.patchSize * p.downsampleFactor * p.downsampleFactor
|
|
|
|
hBar := max(totalFactor, roundByFactor(height, totalFactor))
|
|
wBar := max(totalFactor, roundByFactor(width, totalFactor))
|
|
|
|
if hBar*wBar > maxPixels {
|
|
beta := math.Sqrt(float64(height*width) / float64(maxPixels))
|
|
hBar = max(totalFactor, int(math.Floor(float64(height)/beta/float64(totalFactor)))*totalFactor)
|
|
wBar = max(totalFactor, int(math.Floor(float64(width)/beta/float64(totalFactor)))*totalFactor)
|
|
} else if hBar*wBar < minPixels {
|
|
beta := math.Sqrt(float64(minPixels) / float64(height*width))
|
|
hBar = int(math.Ceil(float64(height)*beta/float64(totalFactor))) * totalFactor
|
|
wBar = int(math.Ceil(float64(width)*beta/float64(totalFactor))) * totalFactor
|
|
}
|
|
|
|
return wBar, hBar
|
|
}
|
|
|
|
func (p ImageProcessor) gridLayout(height, width int) (gridWidth, gridHeight, targetWidth, targetHeight int) {
|
|
aspectRatio := float64(width) / float64(height)
|
|
targetRatios := p.targetRatios()
|
|
bestRatio := clipImageSize{width: 1, height: 1}
|
|
bestRatioDiff := math.MaxFloat64
|
|
area := float64(width * height)
|
|
|
|
for _, ratio := range targetRatios {
|
|
targetAspect := float64(ratio.width) / float64(ratio.height)
|
|
ratioDiff := math.Abs(aspectRatio - targetAspect)
|
|
|
|
if ratioDiff < bestRatioDiff {
|
|
bestRatioDiff = ratioDiff
|
|
bestRatio = ratio
|
|
continue
|
|
}
|
|
|
|
if ratioDiff == bestRatioDiff {
|
|
targetArea := float64(p.tileSize * p.tileSize * ratio.width * ratio.height)
|
|
if area > 0.5*targetArea {
|
|
bestRatio = ratio
|
|
}
|
|
}
|
|
}
|
|
|
|
return bestRatio.width, bestRatio.height, p.tileSize * bestRatio.width, p.tileSize * bestRatio.height
|
|
}
|
|
|
|
// clipImageSize is a width/height pair. In this file it holds a tile grid
// shape, counted in tiles rather than pixels.
type clipImageSize struct {
	width, height int
}
|
|
|
|
func (p ImageProcessor) targetRatios() []clipImageSize {
|
|
targetRatios := make([]clipImageSize, 0, p.maxTiles*p.maxTiles)
|
|
for n := p.minTiles; n <= p.maxTiles; n++ {
|
|
for w := 1; w <= n; w++ {
|
|
for h := 1; h <= n; h++ {
|
|
if w*h < p.minTiles || w*h > p.maxTiles {
|
|
continue
|
|
}
|
|
targetRatios = append(targetRatios, clipImageSize{width: w, height: h})
|
|
}
|
|
}
|
|
}
|
|
|
|
unique := targetRatios[:0]
|
|
for _, ratio := range targetRatios {
|
|
if slices.Contains(unique, ratio) {
|
|
continue
|
|
}
|
|
unique = append(unique, ratio)
|
|
}
|
|
|
|
slices.SortFunc(unique, func(a, b clipImageSize) int {
|
|
return a.width*a.height - b.width*b.height
|
|
})
|
|
|
|
return unique
|
|
}
|
|
|
|
// roundByFactor returns the multiple of factor nearest to number. When number
// lies exactly halfway between two multiples, the quotient is rounded to the
// even integer (math.RoundToEven). A factor of zero or less returns number
// unchanged.
func roundByFactor(number, factor int) int {
	if factor > 0 {
		quotient := math.RoundToEven(float64(number) / float64(factor))
		return int(quotient) * factor
	}
	return number
}
|
|
|
|
// cropImage copies the region rect of img into a freshly allocated RGBA image
// whose bounds start at the origin and whose size is rect.Dx() by rect.Dy().
// Pixels are written with the Src operator.
func cropImage(img image.Image, rect image.Rectangle) image.Image {
	tile := image.NewRGBA(image.Rect(0, 0, rect.Dx(), rect.Dy()))
	stdimage.Src.Draw(tile, tile.Bounds(), img, rect.Min)
	return tile
}
|