mllm-streamlit/lib/llmhost.py

import sys
sys.path.append(".")
sys.path.append("./lib")
import re
import requests
import os
import json
import traceback
import time
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from llama_cpp import Llama
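
# Runtime settings live in model.json in the working directory. On first run
# the defaults below are written out; the file is then read back, and if no
# model path is configured, the first .gguf file in the working directory is
# used.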
print("Loading model...", end=" ")
model_settings_path = "model.json"
model_settings = {
"model_path": None,
"n_gpu_layers": -1,
"n_ctx": 32768,
"n_threads": 8,
"max_tokens": 16384,
"stop": ["<|im_end|>", "</s>", "<|im_start|>"],
"repeat_penalty": 1.1,
"temperature": 0.75
}
if not os.path.isfile(model_settings_path):
with open(model_settings_path, "w") as f:
f.write(json.dumps(model_settings, indent=4))
with open(model_settings_path) as f:
model_settings = json.loads(f.read())
if model_settings["model_path"] is None:
for f in os.scandir("."):
if re.search(r"\.gguf$", f.path):
model_settings["model_path"] = f.path
break
if model_settings["model_path"] is None:
raise Exception("No .gguf model was found in the program directory. Please specify a model's relative or absolute path using the generated model.json configuration file.")
LLM = Llama(
model_path = model_settings["model_path"],
n_gpu_layers = model_settings["n_gpu_layers"],
n_ctx = model_settings["n_ctx"],
verbose = False,
n_threads = model_settings["n_threads"])
print("Loaded model {model_path}".format(model_path=model_settings["model_path"]))
class PrivateHandler(BaseHTTPRequestHandler):
    # Chunked transfer encoding is an HTTP/1.1 feature; BaseHTTPRequestHandler
    # defaults to HTTP/1.0, so advertise 1.1 explicitly for the streamed reply.
    protocol_version = "HTTP/1.1"
    # Crude one-request-at-a-time gate shared by all handler threads.
    LOCK = False

    def do_POST(self):
        content_length = int(self.headers.get("Content-Length", 0))
        post_data = self.rfile.read(content_length).decode("utf-8")
        # Route /<name>[/<extra>...] to a public_<name> method when one is
        # defined; anything else falls through to the default index handler.
        trimmed_path = self.path[1:].strip()
        if len(trimmed_path) > 0:
            args = trimmed_path.split("/")
            fn_given = args[0]
            fn_actual = f"public_{fn_given}"
            if hasattr(self, fn_actual):
                attr = getattr(self, fn_actual)
                if callable(attr):
                    extra_args = args[1:] if len(args) > 1 else None
                    attr(post_data, extra_args)
                    return
        self.index(post_data)
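
    # Default handler: serialize requests with the module-wide LOCK flag, run
    # the completion, and stream it back one JSON-encoded token per HTTP chunk.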
    def index(self, post_data):
        try:
            # Busy-wait until the previous request has finished; the model
            # can only serve one completion at a time.
            while PrivateHandler.LOCK:
                print(".", end="")
                time.sleep(0.1)
            PrivateHandler.LOCK = True
            text = post_data
            print("POST:")
            print(text, end="")
            # Set response headers
            self.send_response(200)
            self.send_header("Content-Type", "text/plain")
            self.send_header("Transfer-Encoding", "chunked")
            self.end_headers()
            response = LLM(
                text,
                max_tokens=model_settings["max_tokens"],
                stop=model_settings["stop"],
                echo=False,
                repeat_penalty=model_settings["repeat_penalty"],
                temperature=model_settings["temperature"],
                stream=True)
            # Stream the completion back token by token
            for token in response:
                token_text = token["choices"][0]["text"]
                # json.dumps defaults to ensure_ascii=True, so the string
                # length equals the byte length required by the chunk header.
                token_json = json.dumps(token_text)
                self.wfile.write(f"{len(token_json):x}\r\n".encode("utf-8"))  # Chunk size in hex
                self.wfile.write(f"{token_json}\r\n".encode("utf-8"))
                print(token_text, end="")
            # Signal the end of the stream
            self.wfile.write(b"0\r\n\r\n")
            print("\n-----")
        except Exception as x:
            print(traceback.format_exc())
            print(x)
        # Release the lock whether or not the request succeeded.
        PrivateHandler.LOCK = False
        time.sleep(0.2)


if __name__ == "__main__":
    address = "0.0.0.0"
    port = 11434
    print(f"Listening on {address} port {port}...")
    httpd = ThreadingHTTPServer((address, port), PrivateHandler)
    httpd.serve_forever()
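
# Minimal client sketch (illustrative only, not part of this module; the host,
# port, and prompt below are assumptions that mirror the defaults above). Each
# HTTP chunk carries one JSON-encoded token string, so with a chunked response
# requests typically yields one token per iteration of iter_content.
#
#   import json
#   import requests
#
#   with requests.post("http://127.0.0.1:11434/", data="Hello", stream=True) as r:
#       for piece in r.iter_content(chunk_size=None):
#           if piece:
#               print(json.loads(piece), end="", flush=True)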