mlx: fix subprocess log deadlock

The stderr reader used bufio.Scanner which has a 64KB max line size.
If the subprocess wrote a line exceeding this limit, the scanner would
stop reading, the OS pipe buffer would fill, and the subprocess would
deadlock.

Replace the scanner with a statusWriter used as the io.Copy destination.
The writer forwards all stderr to os.Stderr while capturing the last
line (truncated to 256 bytes) for error reporting, avoiding both the
deadlock and the need to buffer arbitrarily long lines.
This commit is contained in:
Jesse Gross
2026-03-17 17:02:51 -07:00
parent 46cb7795e1
commit 0ff7d724ff

View File

@@ -2,6 +2,7 @@ package mlxrunner
import (
"bufio"
"bytes"
"context"
"encoding/json"
"errors"
@@ -38,12 +39,66 @@ type Client struct {
memory atomic.Uint64
done chan error
client *http.Client
lastErr string
lastErrLock sync.Mutex
status *statusWriter
mu sync.Mutex
cmd *exec.Cmd
}
// statusWriter captures the most recent stderr line emitted by the
// subprocess while passing every byte through to its underlying file
// (normally os.Stderr). At most maxStatusLen bytes of any single line
// are retained, so memory stays bounded no matter how much the
// subprocess prints.
type statusWriter struct {
	lastErrMsg string     // last non-blank line seen (possibly truncated)
	buf        []byte     // bytes of the current, not-yet-terminated line
	discarding bool       // true while skipping the tail of an over-long line
	mu         sync.Mutex // guards all fields above
	out        *os.File   // destination every write is forwarded to
}

// maxStatusLen bounds how many bytes of a single stderr line are kept
// for error reporting.
const maxStatusLen = 256

// Write forwards b to w.out and scans it for line breaks, remembering
// the latest non-empty line (whitespace-trimmed, capped at maxStatusLen
// bytes). It never needs to hold a whole line in memory, so arbitrarily
// long subprocess output cannot stall the copy loop.
func (w *statusWriter) Write(b []byte) (int, error) {
	n, err := w.out.Write(b)

	w.mu.Lock()
	defer w.mu.Unlock()

	w.buf = append(w.buf, b...)

	// Pull every complete line out of the buffer.
	for {
		line, rest, found := bytes.Cut(w.buf, []byte{'\n'})
		if !found {
			break
		}
		if !w.discarding {
			if trimmed := bytes.TrimSpace(line); len(trimmed) > 0 {
				if len(trimmed) > maxStatusLen {
					trimmed = trimmed[:maxStatusLen]
				}
				w.lastErrMsg = string(trimmed)
			}
		}
		w.buf = rest
		w.discarding = false
	}

	// A partial line that has already outgrown maxStatusLen can never be
	// kept whole: record its head once, then throw away everything up to
	// the next newline.
	if len(w.buf) > maxStatusLen {
		if !w.discarding {
			w.discarding = true
			w.lastErrMsg = string(bytes.TrimSpace(w.buf[:maxStatusLen]))
		}
		w.buf = w.buf[:0]
	}

	return n, err
}

// getLastErr returns the most recently captured stderr line, or "" if
// none has been seen yet.
func (w *statusWriter) getLastErr() string {
	w.mu.Lock()
	msg := w.lastErrMsg
	w.mu.Unlock()
	return msg
}
// NewClient prepares a new MLX runner client for LLM models.
// The subprocess is not started until Load() is called.
func NewClient(modelName string) (*Client, error) {
@@ -66,12 +121,6 @@ func NewClient(modelName string) (*Client, error) {
return c, nil
}
func (c *Client) getLastErr() string {
c.lastErrLock.Lock()
defer c.lastErrLock.Unlock()
return c.lastErr
}
// WaitUntilRunning waits for the subprocess to be ready.
func (c *Client) WaitUntilRunning(ctx context.Context) error {
timeout := time.After(2 * time.Minute)
@@ -83,15 +132,13 @@ func (c *Client) WaitUntilRunning(ctx context.Context) error {
case <-ctx.Done():
return ctx.Err()
case err := <-c.done:
errMsg := c.getLastErr()
if errMsg != "" {
return fmt.Errorf("mlx runner failed: %s (exit: %v)", errMsg, err)
if msg := c.status.getLastErr(); msg != "" {
return fmt.Errorf("mlx runner failed: %s (exit: %v)", msg, err)
}
return fmt.Errorf("mlx runner exited unexpectedly: %w", err)
case <-timeout:
errMsg := c.getLastErr()
if errMsg != "" {
return fmt.Errorf("timeout waiting for mlx runner: %s", errMsg)
if msg := c.status.getLastErr(); msg != "" {
return fmt.Errorf("timeout waiting for mlx runner: %s", msg)
}
return errors.New("timeout waiting for mlx runner to start")
case <-ticker.C:
@@ -348,18 +395,13 @@ func (c *Client) Load(ctx context.Context, _ ml.SystemInfo, gpus []ml.DeviceInfo
// Forward subprocess stdout/stderr to server logs
stdout, _ := cmd.StdoutPipe()
stderr, _ := cmd.StderrPipe()
status := &statusWriter{out: os.Stderr}
c.status = status
go func() {
io.Copy(os.Stderr, stdout) //nolint:errcheck
}()
go func() {
scanner := bufio.NewScanner(stderr)
for scanner.Scan() {
line := scanner.Text()
fmt.Fprintln(os.Stderr, line)
c.lastErrLock.Lock()
c.lastErr = line
c.lastErrLock.Unlock()
}
io.Copy(status, stderr) //nolint:errcheck
}()
slog.Info("starting mlx runner subprocess", "model", c.modelName, "port", c.port)