Added support for a model at a remote address; see the mllm-streamlit project for details.

This commit is contained in:
Conner Harkness 2025-08-19 09:24:50 -06:00
parent 3a7eaafe9a
commit 1aa7cc0a1e
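
For context, a model.json that points the bot at a remote server rather than a local .gguf file might look like the snippet below. The address is the example value from the code comment in this diff; the other keys mirror the defaults visible in the change, and the exact set of accepted keys is an assumption inferred from that snippet.

{
    "remote_address": "http://127.0.0.1:11434/",
    "model_path": null,
    "formatter": "chatml",
    "n_gpu_layers": -1
}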

50
app.py

@@ -26,10 +26,9 @@ lock = False
 praise = 0
-print("Loading model...", end=" ")
 model_settings_path = "model.json"
 model_settings = {
+    "remote_address": None,
     "model_path": None,
     "formatter": "chatml",
     "n_gpu_layers": -1,
@@ -58,22 +57,52 @@ if model_settings["model_path"] is None:
 if model_settings["model_path"] is None:
     raise Exception("No .gguf model was found in the program directory. Please specify a model's relative or absolute path using the generated model.json configuration file.")
 formatter = importlib.import_module(model_settings["formatter"])
+LLM = None
+# Enable loading the model only if the remote address is unspecified:
+if model_settings["remote_address"] is None:
+    print("Loading model...", end=" ")
-LLM = Llama(
-    model_path = model_settings["model_path"],
-    n_gpu_layers = model_settings["n_gpu_layers"],
-    n_ctx = model_settings["n_ctx"],
-    verbose = False,
-    n_threads = model_settings["n_threads"])
+    LLM = Llama(
+        model_path = model_settings["model_path"],
+        n_gpu_layers = model_settings["n_gpu_layers"],
+        n_ctx = model_settings["n_ctx"],
+        verbose = False,
+        n_threads = model_settings["n_threads"])
+    print("Loaded model {model_path}".format(model_path=model_settings["model_path"]))
+def get_response_remote(text):
+    global model_settings
+    remote_address = model_settings["remote_address"]
+    # e.g. http://127.0.0.1:11434/
+    # The project mllm-streamlit has a built-in webserver that runs inference on POSTed text:
+    response = requests.post(
+        remote_address,
+        data=text.encode("utf-8"),
+        headers={"Content-Type": "text/plain"},
+        stream=True)
+    response.raise_for_status()
+    for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
+        if chunk:
+            chunk_text = json.loads(chunk)
+            print(chunk_text, end="")
+            yield chunk_text
-print("Loaded model {model_path}".format(model_path=model_settings["model_path"]))
 def get_response(text):
     global lock
     global model_settings
+    # If the remote address is specified, use this routine:
+    if model_settings["remote_address"] is not None:
+        return "".join(get_response_remote(text))
 while lock == True:
     time.sleep(0.1)
@@ -268,7 +297,6 @@ async def on_message(msg):
     print(f"{user_name}: {msg.content}")
     print(f"{bot_name}: ", end="")
     async with chl.typing():
         f_body = formatter.format(context, messages, for_completion=True)
         f_resp = await get_response_wrapper(f_body)
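
The streaming client added above assumes a server that accepts the prompt as a POSTed text/plain body and streams back JSON-encoded text chunks; the mllm-streamlit project provides such a webserver. As a rough illustration of that wire format only (not that project's actual implementation), a minimal stand-in endpoint could look like the following, with the echo loop standing in for real model inference:

import json
from http.server import BaseHTTPRequestHandler, HTTPServer

class InferenceHandler(BaseHTTPRequestHandler):
    # Chunked responses require HTTP/1.1.
    protocol_version = "HTTP/1.1"

    def do_POST(self):
        # Read the prompt exactly as get_response_remote() sends it: a plain-text body.
        length = int(self.headers.get("Content-Length", 0))
        prompt = self.rfile.read(length).decode("utf-8")

        self.send_response(200)
        self.send_header("Content-Type", "text/plain; charset=utf-8")
        self.send_header("Transfer-Encoding", "chunked")
        self.end_headers()

        # Placeholder "inference": echo the prompt word by word. Each HTTP chunk
        # carries one JSON-encoded string, matching the per-chunk json.loads()
        # on the client side.
        for token in ("Echo: " + prompt).split():
            payload = json.dumps(token + " ").encode("utf-8")
            self.wfile.write(b"%x\r\n" % len(payload))
            self.wfile.write(payload + b"\r\n")
        self.wfile.write(b"0\r\n\r\n")  # terminate the chunked stream

if __name__ == "__main__":
    # 11434 matches the example address in the code comment above.
    HTTPServer(("127.0.0.1", 11434), InferenceHandler).serve_forever()

With a server like this running, setting "remote_address" in model.json to its URL should make the bot stream the echoed text back instead of loading a local model.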