From c47a4518d5f11ba1195cfa309ac1a660a1bb740b Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 11 Oct 2024 12:16:41 +0200 Subject: [PATCH 1/5] fix(llama-cpp): consistently select fallback We didn't took in consideration the case where the host has the CPU flagset, but the binaries were not actually present in the asset dir. This made possible for instance for models that specified the llama-cpp backend directly in the config to not eventually pick-up the fallback binary in case the optimized binaries were not present. Signed-off-by: Ettore Di Giacinto --- pkg/model/initializers.go | 64 ++++++++++++++------- pkg/model/{options.go => loader_options.go} | 0 2 files changed, 44 insertions(+), 20 deletions(-) rename pkg/model/{options.go => loader_options.go} (100%) diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index 1171de4d9418..9ff71303e943 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -62,7 +62,7 @@ func backendPath(assetDir, backend string) string { // backendsInAssetDir returns the list of backends in the asset directory // that should be loaded -func backendsInAssetDir(assetDir string) ([]string, error) { +func backendsInAssetDir(assetDir string) (map[string][]string, error) { // Exclude backends from automatic loading excludeBackends := []string{LocalStoreBackend} entry, err := os.ReadDir(backendPath(assetDir, "")) @@ -136,6 +136,10 @@ ENTRY: } } + return backends, nil +} + +func orderBackends(backends map[string][]string) ([]string, error) { // order backends from the asset directory. // as we scan for backends, we want to keep some order which backends are tried of. // for example, llama.cpp should be tried first, and we want to keep the huggingface backend at the last. @@ -248,18 +252,36 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string { if _, err := os.Stat(p); err == nil { log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) grpcProcess = p + } else { + p = backendPath(assetDir, LLamaCPPFallback) + if _, err := os.Stat(p); err == nil { + log.Info().Msgf("[%s] No AVX2 variant found, trying to load with fallback variant", backend) + grpcProcess = p + } } } else if xsysinfo.HasCPUCaps(cpuid.AVX) { p := backendPath(assetDir, LLamaCPPAVX) if _, err := os.Stat(p); err == nil { log.Info().Msgf("[%s] attempting to load with AVX variant", backend) grpcProcess = p + } else { + p = backendPath(assetDir, LLamaCPPFallback) + if _, err := os.Stat(p); err == nil { + log.Info().Msgf("[%s] No AVX2 variant found, trying to load with fallback variant", backend) + grpcProcess = p + } } } else { p := backendPath(assetDir, LLamaCPPFallback) if _, err := os.Stat(p); err == nil { log.Info().Msgf("[%s] attempting to load with fallback variant", backend) grpcProcess = p + } else { + p = backendPath(assetDir, LLamaCPPFallback) + if _, err := os.Stat(p); err == nil { + log.Info().Msgf("[%s] No AVX2 variant found, trying to load with fallback variant", backend) + grpcProcess = p + } } } @@ -407,7 +429,11 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string } func (ml *ModelLoader) ListAvailableBackends(assetdir string) ([]string, error) { - return backendsInAssetDir(assetdir) + backends, err := backendsInAssetDir(assetdir) + if err != nil { + return nil, err + } + return orderBackends(backends) } func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err error) { @@ -421,13 +447,7 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e 
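[Editor's note] The gist of this first commit is that a CPU flag alone is no longer enough to pick a llama.cpp variant: the chosen binary must also actually exist in the asset directory, and the plain fallback build is used otherwise. Below is a condensed, self-contained sketch of that rule. The helper name `pickLlamaVariant`, the literal file names, and the direct use of `github.com/klauspost/cpuid/v2` (instead of LocalAI's `xsysinfo` wrapper) are illustrative assumptions, not the project's API.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"

	"github.com/klauspost/cpuid/v2"
)

// fileExists reports whether a backend binary is actually shipped in the asset dir.
func fileExists(p string) bool {
	_, err := os.Stat(p)
	return err == nil
}

// pickLlamaVariant starts from the fallback binary and only upgrades to an
// optimized variant when the host supports it AND the file is present on disk.
func pickLlamaVariant(assetDir string) string {
	selected := filepath.Join(assetDir, "llama-cpp-fallback")

	if cpuid.CPU.Supports(cpuid.AVX2) && fileExists(filepath.Join(assetDir, "llama-cpp-avx2")) {
		selected = filepath.Join(assetDir, "llama-cpp-avx2")
	} else if cpuid.CPU.Supports(cpuid.AVX) && fileExists(filepath.Join(assetDir, "llama-cpp-avx")) {
		selected = filepath.Join(assetDir, "llama-cpp-avx")
	}

	// If even the fallback is missing, report that nothing usable was found.
	if !fileExists(selected) {
		return ""
	}
	return selected
}

func main() {
	fmt.Println(pickLlamaVariant("./backend-assets/grpc"))
}
```

This is the shape the series converges on: capability checks decide the preference order, but presence on disk decides what is actually started.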
log.Debug().Msgf("%s is an alias of %s", backend, realBackend) } - if o.singleActiveBackend { - log.Debug().Msgf("Stopping all backends except '%s'", o.modelID) - err := ml.StopGRPC(allExcept(o.modelID)) - if err != nil { - log.Error().Err(err).Str("keptModel", o.modelID).Msg("error while shutting down all backends except for the keptModel") - } - } + ml.stopActiveBackends(o.modelID, o.singleActiveBackend) var backendToConsume string @@ -447,6 +467,17 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e return model.GRPC(o.parallelRequests, ml.wd), nil } +func (ml *ModelLoader) stopActiveBackends(modelID string, singleActiveBackend bool) { + // If we can have only one backend active, kill all the others (except external backends) + if singleActiveBackend { + log.Debug().Msgf("Stopping all backends except '%s'", modelID) + err := ml.StopGRPC(allExcept(modelID)) + if err != nil { + log.Error().Err(err).Str("keptModel", modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing") + } + } +} + func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) { o := NewOptions(opts...) @@ -458,19 +489,12 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) { return m.GRPC(o.parallelRequests, ml.wd), nil } - // If we can have only one backend active, kill all the others (except external backends) - if o.singleActiveBackend { - log.Debug().Msgf("Stopping all backends except '%s'", o.modelID) - err := ml.StopGRPC(allExcept(o.modelID)) - if err != nil { - log.Error().Err(err).Str("keptModel", o.modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing") - } - } + ml.stopActiveBackends(o.modelID, o.singleActiveBackend) var err error // get backends embedded in the binary - autoLoadBackends, err := backendsInAssetDir(o.assetDir) + autoLoadBackends, err := ml.ListAvailableBackends(o.assetDir) if err != nil { return nil, err } @@ -504,7 +528,7 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) { if autoDetect && key == LLamaCPP && err != nil { // try as hard as possible to run the llama.cpp variants - backendToUse := "" + backendToUse := LLamaCPPFallback if xsysinfo.HasCPUCaps(cpuid.AVX2) { if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX2)); err == nil { backendToUse = LLamaCPPAVX2 @@ -523,7 +547,7 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) { } // Autodetection failed, try the fallback - log.Info().Msgf("[%s] Autodetection failed, trying the fallback", key) + log.Info().Msgf("[%s] Autodetection failed, trying the fallback %s", key, backendToUse) options = append(options, WithBackendString(backendToUse)) model, modelerr = ml.BackendLoader(options...) 
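[Editor's note] The first hunk here deduplicates the "single active backend" shutdown that previously lived in both BackendLoader and GreedyLoader. A rough, self-contained illustration of that pattern follows; the `registry` type and per-model stop functions are invented for the sketch, whereas LocalAI actually delegates to `ml.StopGRPC(allExcept(modelID))`.

```go
package main

import "log"

// registry is an invented stand-in for the ModelLoader's table of running backends.
type registry struct {
	active map[string]func() error // modelID -> stop function
}

// stopActiveBackends mirrors the extracted helper: when only one backend may be
// active at a time, stop every running backend except the model we are about to
// load, and keep going even if an individual shutdown fails.
func (r *registry) stopActiveBackends(keep string, singleActiveBackend bool) {
	if !singleActiveBackend {
		return
	}
	for id, stop := range r.active {
		if id == keep {
			continue
		}
		if err := stop(); err != nil {
			// Log and continue: a failed shutdown should not block loading the new model.
			log.Printf("error stopping backend %q while keeping %q: %v", id, keep, err)
		}
		delete(r.active, id)
	}
}

func main() {
	r := &registry{active: map[string]func() error{
		"old-model": func() error { return nil },
	}}
	r.stopActiveBackends("new-model", true)
}
```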
if modelerr == nil && model != nil { diff --git a/pkg/model/options.go b/pkg/model/loader_options.go similarity index 100% rename from pkg/model/options.go rename to pkg/model/loader_options.go From 2df7c2a4ea41f60fd1b13fe189e4154a24f8b211 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 11 Oct 2024 12:32:38 +0200 Subject: [PATCH 2/5] chore: adjust and simplify selection Signed-off-by: Ettore Di Giacinto --- pkg/model/initializers.go | 37 ++++++++++--------------------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index 9ff71303e943..6bb9ce20cd14 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -203,6 +203,7 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string { return backendPath(assetDir, LLamaCPPGRPC) } + // Check for GPU-binaries that are shipped with single binary releases gpus, err := xsysinfo.GPUs() if err == nil { for _, gpu := range gpus { @@ -247,45 +248,27 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string { return grpcProcess } + selectedProcess := backendPath(assetDir, LLamaCPPFallback) + if xsysinfo.HasCPUCaps(cpuid.AVX2) { p := backendPath(assetDir, LLamaCPPAVX2) if _, err := os.Stat(p); err == nil { log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) - grpcProcess = p - } else { - p = backendPath(assetDir, LLamaCPPFallback) - if _, err := os.Stat(p); err == nil { - log.Info().Msgf("[%s] No AVX2 variant found, trying to load with fallback variant", backend) - grpcProcess = p - } + selectedProcess = p } } else if xsysinfo.HasCPUCaps(cpuid.AVX) { p := backendPath(assetDir, LLamaCPPAVX) if _, err := os.Stat(p); err == nil { log.Info().Msgf("[%s] attempting to load with AVX variant", backend) - grpcProcess = p - } else { - p = backendPath(assetDir, LLamaCPPFallback) - if _, err := os.Stat(p); err == nil { - log.Info().Msgf("[%s] No AVX2 variant found, trying to load with fallback variant", backend) - grpcProcess = p - } - } - } else { - p := backendPath(assetDir, LLamaCPPFallback) - if _, err := os.Stat(p); err == nil { - log.Info().Msgf("[%s] attempting to load with fallback variant", backend) - grpcProcess = p - } else { - p = backendPath(assetDir, LLamaCPPFallback) - if _, err := os.Stat(p); err == nil { - log.Info().Msgf("[%s] No AVX2 variant found, trying to load with fallback variant", backend) - grpcProcess = p - } + selectedProcess = p } } - return grpcProcess + if _, err := os.Stat(selectedProcess); err == nil { + return selectedProcess + } + + return "" } // starts the grpcModelProcess for the backend, and returns a grpc client From 2487d94366b36dcc1055555d1f3f052313313b5a Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 11 Oct 2024 12:38:54 +0200 Subject: [PATCH 3/5] fix: move failure recovery to BackendLoader() Signed-off-by: Ettore Di Giacinto --- pkg/model/initializers.go | 57 +++++++++++---------------------------- 1 file changed, 16 insertions(+), 41 deletions(-) diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index 6bb9ce20cd14..42fc2da76f61 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -28,7 +28,7 @@ var Aliases map[string]string = map[string]string{ "langchain-huggingface": LCHuggingFaceBackend, } -var autoDetect = os.Getenv("DISABLE_AUTODETECT") != "true" +var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true" const ( LlamaGGML = "llama-ggml" @@ -86,7 +86,7 @@ ENTRY: // Skip the llama.cpp variants if we are autoDetecting // But we 
always load the fallback variant if it exists - if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && autoDetect { + if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && AutoDetect { continue } @@ -94,7 +94,7 @@ ENTRY: } // if we are autoDetecting, we want to show the llama.cpp variants as a single backend - if autoDetect { + if AutoDetect { // if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up // when starting the service foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false @@ -273,7 +273,7 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string { // starts the grpcModelProcess for the backend, and returns a grpc client // It also loads the model -func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string, string) (*Model, error) { +func (ml *ModelLoader) grpcModel(backend string, autodetect bool, o *Options) func(string, string, string) (*Model, error) { return func(modelID, modelName, modelFile string) (*Model, error) { log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelID, modelFile, backend, *o) @@ -329,7 +329,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string return nil, fmt.Errorf("refering to a backend not in asset dir: %s", err.Error()) } - if autoDetect { + if autodetect { // autoDetect GRPC process to start based on system capabilities if selectedProcess := selectGRPCProcess(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" { grpcProcess = selectedProcess @@ -442,9 +442,17 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e backendToConsume = backend } - model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backendToConsume, o)) - if err != nil { - return nil, err + model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backendToConsume, AutoDetect, o)) + if backend == LLamaCPP && err != nil { + // XXX: This is too backend specific(llama-cpp), remove this bit or generalize further + // We failed somehow starting the binary. For instance, could be that we are missing + // some libraries if running in binary-only mode. + // In this case, we attempt to load the model with the fallback variant. 
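[Editor's note] As an aside on the scan loop this patch also touches (gated by the now-exported `AutoDetect` flag near the top of the diff): when autodetection is on, the optimized llama.cpp variants are hidden from the backend listing and only surface as one logical `llama-cpp` entry, while the fallback always remains listed. A rough sketch of that grouping, with assumed names and a deliberately simplified return shape:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

const (
	llamaCPP         = "llama-cpp"
	llamaCPPFallback = "llama-cpp-fallback"
)

// listBackends scans the asset directory and, when autodetection is enabled, folds the
// optimized llama.cpp variant binaries into one logical "llama-cpp" entry; the fallback
// binary is always kept, so it stays selectable even without any optimized build.
func listBackends(assetDir string, autoDetect bool) (map[string][]string, error) {
	entries, err := os.ReadDir(assetDir)
	if err != nil {
		return nil, err
	}
	backends := map[string][]string{}
	for _, e := range entries {
		name := e.Name()
		if autoDetect && strings.Contains(name, llamaCPP) && !strings.Contains(name, llamaCPPFallback) {
			// Optimized variant: hide it behind the logical llama-cpp backend.
			backends[llamaCPP] = append(backends[llamaCPP], name)
			continue
		}
		backends[name] = []string{name}
	}
	return backends, nil
}

func main() {
	b, err := listBackends("./backend-assets/grpc", true)
	fmt.Println(b, err)
}
```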
+ log.Error().Msgf("[%s] Failed loading model, trying with fallback '%s'", backend, LLamaCPPFallback) + model, err = ml.LoadModel(o.modelID, o.model, ml.grpcModel(LLamaCPPFallback, AutoDetect, o)) + if err != nil { + return nil, err + } } return model.GRPC(o.parallelRequests, ml.wd), nil @@ -508,39 +516,6 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) { err = errors.Join(err, fmt.Errorf("backend %s returned no usable model", key)) log.Info().Msgf("[%s] Fails: %s", key, "backend returned no usable model") } - - if autoDetect && key == LLamaCPP && err != nil { - // try as hard as possible to run the llama.cpp variants - backendToUse := LLamaCPPFallback - if xsysinfo.HasCPUCaps(cpuid.AVX2) { - if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX2)); err == nil { - backendToUse = LLamaCPPAVX2 - } - } else if xsysinfo.HasCPUCaps(cpuid.AVX) { - if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX2)); err == nil { - backendToUse = LLamaCPPAVX - } - } else { - if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPFallback)); err == nil { - backendToUse = LLamaCPPFallback - } else { - // If we don't have a fallback, just skip fallback - continue - } - } - - // Autodetection failed, try the fallback - log.Info().Msgf("[%s] Autodetection failed, trying the fallback %s", key, backendToUse) - options = append(options, WithBackendString(backendToUse)) - model, modelerr = ml.BackendLoader(options...) - if modelerr == nil && model != nil { - log.Info().Msgf("[%s] Loads OK", key) - return model, nil - } else { - err = errors.Join(err, fmt.Errorf("[%s]: %w", key, modelerr)) - log.Info().Msgf("[%s] Fails: %s", key, modelerr.Error()) - } - } } return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error()) From 53cff5ce53e36c4bdf80fa56d617900f23d27b9a Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 11 Oct 2024 12:48:52 +0200 Subject: [PATCH 4/5] comments Signed-off-by: Ettore Di Giacinto --- pkg/model/initializers.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index 42fc2da76f61..c6d702c0b18e 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -185,8 +185,9 @@ func orderBackends(backends map[string][]string) ([]string, error) { return orderedBackends.Keys(), nil } -// selectGRPCProcess selects the GRPC process to start based on system capabilities -func selectGRPCProcess(backend, assetDir string, f16 bool) string { +// selectGRPCProcessByHostCapabilities selects the GRPC process to start based on system capabilities +// Note: this is now relevant only for llama.cpp +func selectGRPCProcessByHostCapabilities(backend, assetDir string, f16 bool) string { foundCUDA := false foundAMDGPU := false foundIntelGPU := false @@ -248,8 +249,12 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string { return grpcProcess } + // No GPU found or no specific binaries found, try to load the CPU variant(s) + + // Select the Fallback by default selectedProcess := backendPath(assetDir, LLamaCPPFallback) + // IF we find any optimized binary, we use that if xsysinfo.HasCPUCaps(cpuid.AVX2) { p := backendPath(assetDir, LLamaCPPAVX2) if _, err := os.Stat(p); err == nil { @@ -264,6 +269,7 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string { } } + // Check if the binary exists! 
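[Editor's note] For context on how the final existence check is consumed: the selector returns the path of the best variant that actually exists on disk, or an empty string when nothing usable was found, in which case `grpcModel` simply keeps the default gRPC binary it already resolved. A minimal sketch of that contract, with stand-in function names rather than the real ones:

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// selectByHostCapabilities stands in for selectGRPCProcessByHostCapabilities: it returns
// an optimized variant only if the binary exists, otherwise an empty string.
func selectByHostCapabilities(assetDir string) string {
	p := filepath.Join(assetDir, "llama-cpp-avx2")
	if _, err := os.Stat(p); err == nil {
		return p
	}
	return ""
}

// resolveGRPCProcess mirrors the caller side: only override the default binary when the
// selector found something usable on disk.
func resolveGRPCProcess(defaultBinary, assetDir string, autodetect bool) string {
	grpcProcess := defaultBinary
	if autodetect {
		if selected := selectByHostCapabilities(assetDir); selected != "" {
			grpcProcess = selected
		}
	}
	return grpcProcess
}

func main() {
	fmt.Println(resolveGRPCProcess("backend-assets/grpc/llama-cpp", "backend-assets/grpc", true))
}
```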
if _, err := os.Stat(selectedProcess); err == nil { return selectedProcess } @@ -331,7 +337,7 @@ func (ml *ModelLoader) grpcModel(backend string, autodetect bool, o *Options) fu if autodetect { // autoDetect GRPC process to start based on system capabilities - if selectedProcess := selectGRPCProcess(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" { + if selectedProcess := selectGRPCProcessByHostCapabilities(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" { grpcProcess = selectedProcess } } From 8055c87be0cdfd9cc76590a7bcf846b7edca5cec Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 11 Oct 2024 15:17:28 +0200 Subject: [PATCH 5/5] minor fixups Signed-off-by: Ettore Di Giacinto --- pkg/model/initializers.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index c6d702c0b18e..c3b3717979e7 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -449,13 +449,20 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e } model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backendToConsume, AutoDetect, o)) - if backend == LLamaCPP && err != nil { + if err != nil { // XXX: This is too backend specific(llama-cpp), remove this bit or generalize further // We failed somehow starting the binary. For instance, could be that we are missing // some libraries if running in binary-only mode. // In this case, we attempt to load the model with the fallback variant. + + // If not llama-cpp backend, return error immediately + if backend != LLamaCPP { + return nil, err + } + + // Otherwise attempt with fallback log.Error().Msgf("[%s] Failed loading model, trying with fallback '%s'", backend, LLamaCPPFallback) - model, err = ml.LoadModel(o.modelID, o.model, ml.grpcModel(LLamaCPPFallback, AutoDetect, o)) + model, err = ml.LoadModel(o.modelID, o.model, ml.grpcModel(LLamaCPPFallback, false, o)) if err != nil { return nil, err }
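[Editor's note] After this last fixup, the recovery path in BackendLoader behaves as follows: backends other than llama.cpp return their load error unchanged, while llama.cpp gets exactly one retry with the fallback binary and autodetection switched off. A compact sketch of that decision matrix; all names below are stand-ins, and `loadModel` merely simulates a failing first attempt rather than starting a real gRPC process.

```go
package main

import (
	"errors"
	"fmt"
)

const (
	llamaCPP         = "llama-cpp"
	llamaCPPFallback = "llama-cpp-fallback"
)

// loadModel is a stand-in for ml.LoadModel(..., ml.grpcModel(backend, autodetect, o)).
func loadModel(backend string, autodetect, firstAttemptFails bool) error {
	if firstAttemptFails && autodetect {
		return errors.New("failed to start auto-detected variant")
	}
	return nil
}

// backendLoader mirrors the final error handling: fail fast for non-llama backends,
// retry once with the fallback (autodetection off) for llama.cpp.
func backendLoader(backend string, firstAttemptFails bool) (string, error) {
	err := loadModel(backend, true, firstAttemptFails)
	if err == nil {
		return backend, nil
	}
	if backend != llamaCPP {
		// Only llama.cpp ships a fallback binary; other backends surface the error as-is.
		return "", err
	}
	// Retry once with the fallback variant and autodetection disabled.
	if err := loadModel(llamaCPPFallback, false, firstAttemptFails); err != nil {
		return "", err
	}
	return llamaCPPFallback, nil
}

func main() {
	cases := []struct {
		backend string
		fails   bool
	}{
		{llamaCPP, false},
		{llamaCPP, true},
		{"whisper", true},
	}
	for _, tc := range cases {
		got, err := backendLoader(tc.backend, tc.fails)
		fmt.Printf("backend=%s firstAttemptFails=%v -> loaded=%q err=%v\n", tc.backend, tc.fails, got, err)
	}
}
```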