fix(llama-cpp): consistently select fallback #3789

Merged: 5 commits, Oct 11, 2024
Changes from all commits
135 changes: 65 additions & 70 deletions pkg/model/initializers.go
@@ -28,7 +28,7 @@ var Aliases map[string]string = map[string]string{
"langchain-huggingface": LCHuggingFaceBackend,
}

var autoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"

const (
LlamaGGML = "llama-ggml"
@@ -62,7 +62,7 @@ func backendPath(assetDir, backend string) string {

// backendsInAssetDir returns the list of backends in the asset directory
// that should be loaded
func backendsInAssetDir(assetDir string) ([]string, error) {
func backendsInAssetDir(assetDir string) (map[string][]string, error) {
// Exclude backends from automatic loading
excludeBackends := []string{LocalStoreBackend}
entry, err := os.ReadDir(backendPath(assetDir, ""))
@@ -86,15 +86,15 @@ ENTRY:

// Skip the llama.cpp variants if we are autoDetecting
// But we always load the fallback variant if it exists
if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && autoDetect {
if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && AutoDetect {
continue
}

backends[e.Name()] = []string{}
}

// if we are autoDetecting, we want to show the llama.cpp variants as a single backend
if autoDetect {
if AutoDetect {
// if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up
// when starting the service
foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false
@@ -136,6 +136,10 @@ ENTRY:
}
}

return backends, nil
}

func orderBackends(backends map[string][]string) ([]string, error) {
// order backends from the asset directory.
// as we scan for backends, we want to keep some order which backends are tried of.
// for example, llama.cpp should be tried first, and we want to keep the huggingface backend at the last.
@@ -181,8 +185,9 @@ ENTRY:
return orderedBackends.Keys(), nil
}

// selectGRPCProcess selects the GRPC process to start based on system capabilities
func selectGRPCProcess(backend, assetDir string, f16 bool) string {
// selectGRPCProcessByHostCapabilities selects the GRPC process to start based on system capabilities
// Note: this is now relevant only for llama.cpp
func selectGRPCProcessByHostCapabilities(backend, assetDir string, f16 bool) string {
foundCUDA := false
foundAMDGPU := false
foundIntelGPU := false
@@ -199,6 +204,7 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string {
return backendPath(assetDir, LLamaCPPGRPC)
}

// Check for GPU-binaries that are shipped with single binary releases
gpus, err := xsysinfo.GPUs()
if err == nil {
for _, gpu := range gpus {
@@ -243,32 +249,37 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string {
return grpcProcess
}

// No GPU found or no specific binaries found, try to load the CPU variant(s)

// Select the Fallback by default
selectedProcess := backendPath(assetDir, LLamaCPPFallback)

// IF we find any optimized binary, we use that
if xsysinfo.HasCPUCaps(cpuid.AVX2) {
p := backendPath(assetDir, LLamaCPPAVX2)
if _, err := os.Stat(p); err == nil {
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
grpcProcess = p
selectedProcess = p
}
} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
p := backendPath(assetDir, LLamaCPPAVX)
if _, err := os.Stat(p); err == nil {
log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
grpcProcess = p
}
} else {
p := backendPath(assetDir, LLamaCPPFallback)
if _, err := os.Stat(p); err == nil {
log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
grpcProcess = p
selectedProcess = p
}
}

return grpcProcess
// Check if the binary exists!
if _, err := os.Stat(selectedProcess); err == nil {
return selectedProcess
}

return ""
}
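
The net effect of this hunk: CPU variant selection now starts from the fallback binary, upgrades to AVX2 or AVX only when both the CPU flag and the corresponding binary are present, and returns an empty string when not even the selected binary exists on disk, instead of handing back a path that may be missing. A minimal, self-contained sketch of that flow; the variant names and path helper are illustrative stand-ins, not the project's actual constants or backendPath:

package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// Illustrative variant names; the real constants live in pkg/model.
const (
	fallbackVariant = "llama-cpp-fallback"
	avx2Variant     = "llama-cpp-avx2"
	avxVariant      = "llama-cpp-avx"
)

// variantPath is a stand-in for the real backendPath helper.
func variantPath(assetDir, variant string) string {
	return filepath.Join(assetDir, variant)
}

func binaryExists(assetDir, variant string) bool {
	_, err := os.Stat(variantPath(assetDir, variant))
	return err == nil
}

// selectCPUVariant mirrors the new logic: default to the fallback, pick an
// optimized variant only when both the CPU capability and the binary are
// present, and return "" when even the selected binary is missing.
func selectCPUVariant(assetDir string, hasAVX2, hasAVX bool) string {
	selected := fallbackVariant
	if hasAVX2 && binaryExists(assetDir, avx2Variant) {
		selected = avx2Variant
	} else if hasAVX && binaryExists(assetDir, avxVariant) {
		selected = avxVariant
	}
	if binaryExists(assetDir, selected) {
		return variantPath(assetDir, selected)
	}
	return ""
}

func main() {
	fmt.Println(selectCPUVariant("./backend-assets/grpc", true, true))
}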

// starts the grpcModelProcess for the backend, and returns a grpc client
// It also loads the model
func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string, string) (*Model, error) {
func (ml *ModelLoader) grpcModel(backend string, autodetect bool, o *Options) func(string, string, string) (*Model, error) {
return func(modelID, modelName, modelFile string) (*Model, error) {

log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelID, modelFile, backend, *o)
@@ -324,9 +335,9 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
return nil, fmt.Errorf("refering to a backend not in asset dir: %s", err.Error())
}

if autoDetect {
if autodetect {
// autoDetect GRPC process to start based on system capabilities
if selectedProcess := selectGRPCProcess(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
if selectedProcess := selectGRPCProcessByHostCapabilities(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
grpcProcess = selectedProcess
}
}
@@ -407,7 +418,11 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
}

func (ml *ModelLoader) ListAvailableBackends(assetdir string) ([]string, error) {
return backendsInAssetDir(assetdir)
backends, err := backendsInAssetDir(assetdir)
if err != nil {
return nil, err
}
return orderBackends(backends)
}

func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err error) {
@@ -421,13 +436,7 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e
log.Debug().Msgf("%s is an alias of %s", backend, realBackend)
}

if o.singleActiveBackend {
log.Debug().Msgf("Stopping all backends except '%s'", o.modelID)
err := ml.StopGRPC(allExcept(o.modelID))
if err != nil {
log.Error().Err(err).Str("keptModel", o.modelID).Msg("error while shutting down all backends except for the keptModel")
}
}
ml.stopActiveBackends(o.modelID, o.singleActiveBackend)

var backendToConsume string

@@ -439,14 +448,40 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e
backendToConsume = backend
}

model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backendToConsume, o))
model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backendToConsume, AutoDetect, o))
if err != nil {
return nil, err
// XXX: This is too backend specific(llama-cpp), remove this bit or generalize further
// We failed somehow starting the binary. For instance, could be that we are missing
// some libraries if running in binary-only mode.
// In this case, we attempt to load the model with the fallback variant.

// If not llama-cpp backend, return error immediately
if backend != LLamaCPP {
return nil, err
}

// Otherwise attempt with fallback
log.Error().Msgf("[%s] Failed loading model, trying with fallback '%s'", backend, LLamaCPPFallback)
model, err = ml.LoadModel(o.modelID, o.model, ml.grpcModel(LLamaCPPFallback, false, o))
if err != nil {
return nil, err
}
}

return model.GRPC(o.parallelRequests, ml.wd), nil
}
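
For callers of BackendLoader, the visible change is that a failure to start the auto-detected llama.cpp variant (for example, missing host libraries in single-binary mode) no longer surfaces immediately; the loader retries once with the fallback variant and auto-detection disabled before giving up. A self-contained sketch of that retry shape, with illustrative names rather than the real loader API:

package main

import (
	"errors"
	"fmt"
)

// loadVariant is a stand-in for ml.LoadModel(..., ml.grpcModel(variant, autodetect, o)).
// Here the optimized variant always fails so the retry path is exercised.
func loadVariant(variant string, autodetect bool) error {
	if variant != "llama-cpp-fallback" {
		return errors.New("missing host libraries for optimized variant")
	}
	return nil
}

// loadLlamaCPP mirrors the new control flow: try the auto-detected variant
// first, and on failure retry once with the fallback, auto-detection off.
func loadLlamaCPP() error {
	if err := loadVariant("llama-cpp-avx2", true); err != nil {
		fmt.Println("optimized variant failed, retrying with fallback:", err)
		return loadVariant("llama-cpp-fallback", false)
	}
	return nil
}

func main() {
	if err := loadLlamaCPP(); err != nil {
		fmt.Println("both variants failed:", err)
		return
	}
	fmt.Println("model loaded")
}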

func (ml *ModelLoader) stopActiveBackends(modelID string, singleActiveBackend bool) {
// If we can have only one backend active, kill all the others (except external backends)
if singleActiveBackend {
log.Debug().Msgf("Stopping all backends except '%s'", modelID)
err := ml.StopGRPC(allExcept(modelID))
if err != nil {
log.Error().Err(err).Str("keptModel", modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
}
}
}

func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
o := NewOptions(opts...)

@@ -458,19 +493,12 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
return m.GRPC(o.parallelRequests, ml.wd), nil
}

// If we can have only one backend active, kill all the others (except external backends)
if o.singleActiveBackend {
log.Debug().Msgf("Stopping all backends except '%s'", o.modelID)
err := ml.StopGRPC(allExcept(o.modelID))
if err != nil {
log.Error().Err(err).Str("keptModel", o.modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
}
}
ml.stopActiveBackends(o.modelID, o.singleActiveBackend)

var err error

// get backends embedded in the binary
autoLoadBackends, err := backendsInAssetDir(o.assetDir)
autoLoadBackends, err := ml.ListAvailableBackends(o.assetDir)
if err != nil {
return nil, err
}
@@ -501,39 +529,6 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
err = errors.Join(err, fmt.Errorf("backend %s returned no usable model", key))
log.Info().Msgf("[%s] Fails: %s", key, "backend returned no usable model")
}

if autoDetect && key == LLamaCPP && err != nil {
// try as hard as possible to run the llama.cpp variants
backendToUse := ""
if xsysinfo.HasCPUCaps(cpuid.AVX2) {
if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX2)); err == nil {
backendToUse = LLamaCPPAVX2
}
} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX2)); err == nil {
backendToUse = LLamaCPPAVX
}
} else {
if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPFallback)); err == nil {
backendToUse = LLamaCPPFallback
} else {
// If we don't have a fallback, just skip fallback
continue
}
}

// Autodetection failed, try the fallback
log.Info().Msgf("[%s] Autodetection failed, trying the fallback", key)
options = append(options, WithBackendString(backendToUse))
model, modelerr = ml.BackendLoader(options...)
if modelerr == nil && model != nil {
log.Info().Msgf("[%s] Loads OK", key)
return model, nil
} else {
err = errors.Join(err, fmt.Errorf("[%s]: %w", key, modelerr))
log.Info().Msgf("[%s] Fails: %s", key, modelerr.Error())
}
}
}

return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error())
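
With the per-backend fallback now living in BackendLoader, GreedyLoader is reduced to walking the ordered backend list from ListAvailableBackends and collecting errors; the variant re-detection block that used to follow each failure is removed above. A small sketch of that simplified loop, with illustrative signatures rather than the project's API:

package main

import (
	"errors"
	"fmt"
)

// greedyLoad sketches the simplified flow: iterate the ordered backends and
// delegate to the per-backend loader, which now owns the llama.cpp fallback
// retry, so no variant re-detection is needed here.
func greedyLoad(backends []string, load func(string) error) error {
	var errs error
	for _, b := range backends {
		if err := load(b); err != nil {
			errs = errors.Join(errs, fmt.Errorf("[%s]: %w", b, err))
			continue
		}
		fmt.Printf("[%s] loads OK\n", b)
		return nil
	}
	return fmt.Errorf("could not load model - all backends returned error: %w", errs)
}

func main() {
	ordered := []string{"llama-cpp", "whisper", "huggingface"}
	err := greedyLoad(ordered, func(b string) error {
		// Stand-in for ml.BackendLoader(...), which has already retried the
		// llama.cpp fallback internally before reporting failure.
		if b == "llama-cpp" {
			return errors.New("both optimized and fallback variants failed")
		}
		return nil
	})
	fmt.Println(err)
}
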
File renamed without changes.