import sys
sys.path.append(".")
sys.path.append("./lib")

import re
import requests
import os
import json
import threading
import traceback
import time
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from llama_cpp import Llama

print("Loading model...", end=" ", flush=True)

# Default settings, written to model.json on first run so they can be edited.
model_settings_path = "model.json"
model_settings = {
    "model_path": None,
    "n_gpu_layers": -1,
    "n_ctx": 32768,
    "n_threads": 8,
    "max_tokens": 16384,
    "stop": ["<|im_end|>", "", "<|im_start|>"],
    "repeat_penalty": 1.1,
    "temperature": 0.75
}

if not os.path.isfile(model_settings_path):
    with open(model_settings_path, "w") as f:
        f.write(json.dumps(model_settings, indent=4))

with open(model_settings_path) as f:
    model_settings = json.loads(f.read())

# If no model path is configured, use the first .gguf file in the program directory.
if model_settings["model_path"] is None:
    for entry in os.scandir("."):
        if re.search(r"\.gguf$", entry.path):
            model_settings["model_path"] = entry.path
            break

if model_settings["model_path"] is None:
    raise Exception(
        "No .gguf model was found in the program directory. Please specify a model's "
        "relative or absolute path using the generated model.json configuration file.")

LLM = Llama(
    model_path=model_settings["model_path"],
    n_gpu_layers=model_settings["n_gpu_layers"],
    n_ctx=model_settings["n_ctx"],
    verbose=False,
    n_threads=model_settings["n_threads"])

print(f"Loaded model {model_settings['model_path']}")


class PrivateHandler(BaseHTTPRequestHandler):
    # Serializes generation so only one request talks to the model at a time,
    # even though ThreadingHTTPServer handles each request on its own thread.
    LOCK = threading.Lock()

    def do_POST(self):
        content_length = int(self.headers.get("Content-Length", 0))
        post_data = self.rfile.read(content_length).decode("utf-8")

        # Requests to /<name>[/<extra>...] are dispatched to a public_<name>
        # method if one exists; everything else falls through to index().
        trimmed_path = self.path[1:].strip()
        if len(trimmed_path) > 0:
            args = trimmed_path.split("/")
            fn_actual = f"public_{args[0]}"
            if hasattr(self, fn_actual):
                attr = getattr(self, fn_actual)
                if callable(attr):
                    extra_args = args[1:] if len(args) > 1 else None
                    attr(post_data, extra_args)
                    return
        self.index(post_data)

    def index(self, post_data):
        # Wait for any in-flight generation to finish, printing dots as feedback.
        while not PrivateHandler.LOCK.acquire(blocking=False):
            print(".", end="", flush=True)
            time.sleep(0.1)
        try:
            text = post_data
            print("POST:")
            print(text, end="")

            # Set response headers for a manually chunked plain-text stream
            self.send_response(200)
            self.send_header("Content-Type", "text/plain")
            self.send_header("Transfer-Encoding", "chunked")
            self.end_headers()

            response = LLM(
                text,
                max_tokens=model_settings["max_tokens"],
                stop=model_settings["stop"],
                echo=False,
                repeat_penalty=model_settings["repeat_penalty"],
                temperature=model_settings["temperature"],
                stream=True)

            # Stream each token as one chunk: the chunk size in hex, then the
            # JSON-encoded token text as the chunk body, each terminated by CRLF.
            for token in response:
                token_text = token["choices"][0]["text"]
                chunk = json.dumps(token_text).encode("utf-8")
                self.wfile.write(f"{len(chunk):x}\r\n".encode("utf-8"))
                self.wfile.write(chunk + b"\r\n")
                print(token_text, end="", flush=True)

            # Signal the end of the chunked stream
            self.wfile.write(b"0\r\n\r\n")
            print("\n-----")
        except Exception:
            print(traceback.format_exc())
        finally:
            PrivateHandler.LOCK.release()
            time.sleep(0.2)


if __name__ == "__main__":
    address = "0.0.0.0"
    port = 11434
    print(f"Listening on {address} port {port}...")
    httpd = ThreadingHTTPServer((address, port), PrivateHandler)
    httpd.serve_forever()
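
# A minimal example client (a sketch, not part of the server): it assumes the
# server above is running locally on port 11434 and relies on index() writing
# each chunk as one JSON-encoded token string terminated by CRLF, so iterating
# the response line by line yields one token per line.
#
#   import json
#   import requests
#
#   with requests.post("http://127.0.0.1:11434/", data="Hello", stream=True) as r:
#       for line in r.iter_lines():
#           if line:
#               print(json.loads(line), end="", flush=True)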