server: reduce download concurrency for HuggingFace URLs

Reduces concurrent download parts from 16 to 4 for HuggingFace URLs
to avoid triggering rate limits (HTTP 429 errors).

Adds OLLAMA_HF_CONCURRENCY environment variable for users who want
to customize the concurrency level.

Fixes #13297
This commit is contained in:
Parth Sareen
2026-01-18 16:38:49 -08:00
parent e4b488a7b5
commit 805ed4644c

View File

@@ -96,10 +96,36 @@ func (p *blobDownloadPart) UnmarshalJSON(b []byte) error {
const (
numDownloadParts = 16
numHFDownloadParts = 4
minDownloadPartSize int64 = 100 * format.MegaByte
maxDownloadPartSize int64 = 1000 * format.MegaByte
)
// isHuggingFaceURL returns true if the URL is from HuggingFace domains
func isHuggingFaceURL(u *url.URL) bool {
if u == nil {
return false
}
host := strings.ToLower(u.Hostname())
return strings.HasSuffix(host, "huggingface.co") ||
strings.HasSuffix(host, ".hf.co") ||
host == "hf.co"
}
// getNumDownloadParts returns the number of concurrent download parts to use
// for the given URL. HuggingFace URLs use reduced concurrency to avoid rate limiting.
func getNumDownloadParts(u *url.URL) int {
if isHuggingFaceURL(u) {
if v := os.Getenv("OLLAMA_HF_CONCURRENCY"); v != "" {
if n, err := strconv.Atoi(v); err == nil && n > 0 {
return n
}
}
return numHFDownloadParts
}
return numDownloadParts
}
func (p *blobDownloadPart) Name() string {
return strings.Join([]string{
p.blobDownload.Name, "partial", strconv.Itoa(p.N),
@@ -271,7 +297,11 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
}
g, inner := errgroup.WithContext(ctx)
g.SetLimit(numDownloadParts)
concurrency := getNumDownloadParts(directURL)
if concurrency != numDownloadParts {
slog.Info(fmt.Sprintf("using reduced concurrency (%d) for HuggingFace download", concurrency))
}
g.SetLimit(concurrency)
for i := range b.Parts {
part := b.Parts[i]
if part.Completed.Load() == part.Size {