mllm-streamlit/lib/llmhost.py

import sys
sys.path.append(".")
sys.path.append("./lib")
import re
import requests
import os
import json
import traceback
import time
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from llama_cpp import Llama
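
# Runtime settings live in model.json in the working directory. On first run
# the defaults below are written out; the file is then read back, and if no
# model path is configured, the first .gguf file in the working directory is
# used.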
print("Loading model...", end=" ")
model_settings_path = "model.json"
model_settings = {
"model_path": None,
"n_gpu_layers": -1,
"n_ctx": 32768,
"n_threads": 8,
"max_tokens": 16384,
"stop": ["<|im_end|>", "</s>", "<|im_start|>"],
"repeat_penalty": 1.1,
"temperature": 0.75
}
if not os.path.isfile(model_settings_path):
with open(model_settings_path, "w") as f:
f.write(json.dumps(model_settings, indent=4))
with open(model_settings_path) as f:
model_settings = json.loads(f.read())
if model_settings["model_path"] is None:
for f in os.scandir("."):
if re.search(r"\.gguf$", f.path):
model_settings["model_path"] = f.path
break
if model_settings["model_path"] is None:
raise Exception("No .gguf model was found in the program directory. Please specify a model's relative or absolute path using the generated model.json configuration file.")
LLM = Llama(
model_path = model_settings["model_path"],
n_gpu_layers = model_settings["n_gpu_layers"],
n_ctx = model_settings["n_ctx"],
verbose = False,
n_threads = model_settings["n_threads"])
print("Loaded model {model_path}".format(model_path=model_settings["model_path"]))
class PrivateHandler(BaseHTTPRequestHandler):
    # Chunked transfer encoding is an HTTP/1.1 feature; BaseHTTPRequestHandler
    # defaults to HTTP/1.0, so advertise 1.1 explicitly for the streamed reply.
    protocol_version = "HTTP/1.1"
    # Crude one-request-at-a-time gate shared by all handler threads.
    LOCK = False

    def do_POST(self):
        content_length = int(self.headers.get("Content-Length", 0))
        post_data = self.rfile.read(content_length).decode("utf-8")
        # Route /<name>[/<extra>...] to a public_<name> method when one is
        # defined; anything else falls through to the default index handler.
        trimmed_path = self.path[1:].strip()
        if len(trimmed_path) > 0:
            args = trimmed_path.split("/")
            fn_given = args[0]
            fn_actual = f"public_{fn_given}"
            if hasattr(self, fn_actual):
                attr = getattr(self, fn_actual)
                if callable(attr):
                    extra_args = args[1:] if len(args) > 1 else None
                    attr(post_data, extra_args)
                    return
        self.index(post_data)
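
    # Default handler: serialize requests with the module-wide LOCK flag, run
    # the completion, and stream it back one JSON-encoded token per HTTP chunk.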
    def index(self, post_data):
        try:
            # Busy-wait until the previous request has finished; the model
            # can only serve one completion at a time.
            while PrivateHandler.LOCK:
                print(".", end="")
                time.sleep(0.1)
            PrivateHandler.LOCK = True
            text = post_data
            print("POST:")
            print(text, end="")
            # Set response headers
            self.send_response(200)
            self.send_header("Content-Type", "text/plain")
            self.send_header("Transfer-Encoding", "chunked")
            self.end_headers()
            response = LLM(
                text,
                max_tokens=model_settings["max_tokens"],
                stop=model_settings["stop"],
                echo=False,
                repeat_penalty=model_settings["repeat_penalty"],
                temperature=model_settings["temperature"],
                stream=True)
            # Stream the completion back token by token
            for token in response:
                token_text = token["choices"][0]["text"]
                # json.dumps defaults to ensure_ascii=True, so the string
                # length equals the byte length required by the chunk header.
                token_json = json.dumps(token_text)
                self.wfile.write(f"{len(token_json):x}\r\n".encode("utf-8"))  # Chunk size in hex
                self.wfile.write(f"{token_json}\r\n".encode("utf-8"))
                print(token_text, end="")
            # Signal the end of the stream
            self.wfile.write(b"0\r\n\r\n")
            print("\n-----")
        except Exception as x:
            print(traceback.format_exc())
            print(x)
        # Release the lock whether or not the request succeeded.
        PrivateHandler.LOCK = False
        time.sleep(0.2)


if __name__ == "__main__":
    address = "0.0.0.0"
    port = 11434
    print(f"Listening on {address} port {port}...")
    httpd = ThreadingHTTPServer((address, port), PrivateHandler)
    httpd.serve_forever()
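
# Minimal client sketch (illustrative only, not part of this module; the host,
# port, and prompt below are assumptions that mirror the defaults above). Each
# HTTP chunk carries one JSON-encoded token string, so with a chunked response
# requests typically yields one token per iteration of iter_content.
#
#   import json
#   import requests
#
#   with requests.post("http://127.0.0.1:11434/", data="Hello", stream=True) as r:
#       for piece in r.iter_content(chunk_size=None):
#           if piece:
#               print(json.loads(piece), end="", flush=True)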