Added support for a model at a remote address; see the mllm-streamlit project for details.

This commit is contained in:
Conner Harkness 2025-08-19 09:24:50 -06:00
parent 3a7eaafe9a
commit 1aa7cc0a1e
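
For context, a model.json that points the bot at a remote server rather than a local .gguf file might look like the snippet below. The address is the example value from the code comment in this diff; the other keys mirror the defaults visible in the change, and the exact set of accepted keys is an assumption inferred from that snippet.

{
    "remote_address": "http://127.0.0.1:11434/",
    "model_path": null,
    "formatter": "chatml",
    "n_gpu_layers": -1
}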

50
app.py

@@ -26,10 +26,9 @@ lock = False
 praise = 0
-print("Loading model...", end=" ")
 model_settings_path = "model.json"
 model_settings = {
+    "remote_address": None,
     "model_path": None,
     "formatter": "chatml",
     "n_gpu_layers": -1,
@@ -58,22 +57,52 @@ if model_settings["model_path"] is None:
 if model_settings["model_path"] is None:
     raise Exception("No .gguf model was found in the program directory. Please specify a model's relative or absolute path using the generated model.json configuration file.")
 formatter = importlib.import_module(model_settings["formatter"])
+LLM = None
+# Enable loading the model only if the remote address is unspecified:
+if model_settings["remote_address"] is None:
+    print("Loading model...", end=" ")
-LLM = Llama(
-    model_path = model_settings["model_path"],
-    n_gpu_layers = model_settings["n_gpu_layers"],
-    n_ctx = model_settings["n_ctx"],
-    verbose = False,
-    n_threads = model_settings["n_threads"])
+    LLM = Llama(
+        model_path = model_settings["model_path"],
+        n_gpu_layers = model_settings["n_gpu_layers"],
+        n_ctx = model_settings["n_ctx"],
+        verbose = False,
+        n_threads = model_settings["n_threads"])
+    print("Loaded model {model_path}".format(model_path=model_settings["model_path"]))
+def get_response_remote(text):
+    global model_settings
+    remote_address = model_settings["remote_address"]
+    # e.g. http://127.0.0.1:11434/
+    # The project mllm-streamlit has a built-in webserver that runs inference on POSTed text:
+    response = requests.post(
+        remote_address,
+        data=text.encode("utf-8"),
+        headers={"Content-Type": "text/plain"},
+        stream=True)
+    response.raise_for_status()
+    for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
+        if chunk:
+            chunk_text = json.loads(chunk)
+            print(chunk_text, end="")
+            yield chunk_text
-print("Loaded model {model_path}".format(model_path=model_settings["model_path"]))
 def get_response(text):
     global lock
     global model_settings
+    # If the remote address is specified, use this routine:
+    if model_settings["remote_address"] is not None:
+        return "".join(get_response_remote(text))
 while lock == True:
     time.sleep(0.1)
@@ -268,7 +297,6 @@ async def on_message(msg):
     print(f"{user_name}: {msg.content}")
     print(f"{bot_name}: ", end="")
     async with chl.typing():
         f_body = formatter.format(context, messages, for_completion=True)
         f_resp = await get_response_wrapper(f_body)
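
The streaming client added above assumes a server that accepts the prompt as a POSTed text/plain body and streams back JSON-encoded text chunks; the mllm-streamlit project provides such a webserver. As a rough illustration of that wire format only (not that project's actual implementation), a minimal stand-in endpoint could look like the following, with the echo loop standing in for real model inference:

import json
from http.server import BaseHTTPRequestHandler, HTTPServer

class InferenceHandler(BaseHTTPRequestHandler):
    # Chunked responses require HTTP/1.1.
    protocol_version = "HTTP/1.1"

    def do_POST(self):
        # Read the prompt exactly as get_response_remote() sends it: a plain-text body.
        length = int(self.headers.get("Content-Length", 0))
        prompt = self.rfile.read(length).decode("utf-8")

        self.send_response(200)
        self.send_header("Content-Type", "text/plain; charset=utf-8")
        self.send_header("Transfer-Encoding", "chunked")
        self.end_headers()

        # Placeholder "inference": echo the prompt word by word. Each HTTP chunk
        # carries one JSON-encoded string, matching the per-chunk json.loads()
        # on the client side.
        for token in ("Echo: " + prompt).split():
            payload = json.dumps(token + " ").encode("utf-8")
            self.wfile.write(b"%x\r\n" % len(payload))
            self.wfile.write(payload + b"\r\n")
        self.wfile.write(b"0\r\n\r\n")  # terminate the chunked stream

if __name__ == "__main__":
    # 11434 matches the example address in the code comment above.
    HTTPServer(("127.0.0.1", 11434), InferenceHandler).serve_forever()

With a server like this running, setting "remote_address" in model.json to its URL should make the bot stream the echoed text back instead of loading a local model.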