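"""A small local HTTP front end for a llama.cpp model.

The script loads a .gguf model through llama-cpp-python, using settings from a
model.json file that is generated with defaults on first run, and then serves
completions over plain HTTP on port 11434: POST a prompt to / and the generated
text is streamed back as chunked plain text, one JSON-encoded string per chunk.
"""
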
import sys

sys.path.append(".")
sys.path.append("./lib")

import re
import requests
import os
import json
import traceback
import time

from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from llama_cpp import Llama

print("Loading model...", end=" ")

# Default settings; written to model.json if it does not exist yet, then read
# back from it, so the configuration can be edited between runs.
model_settings_path = "model.json"
model_settings = {
    "model_path": None,
    "n_gpu_layers": -1,
    "n_ctx": 32768,
    "n_threads": 8,
    "max_tokens": 16384,
    "stop": ["<|im_end|>", "</s>", "<|im_start|>"],
    "repeat_penalty": 1.1,
    "temperature": 0.75
}

if not os.path.isfile(model_settings_path):
    with open(model_settings_path, "w") as f:
        f.write(json.dumps(model_settings, indent=4))

with open(model_settings_path) as f:
    model_settings = json.loads(f.read())
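
# The generated model.json normally only needs its "model_path" edited to point
# at a .gguf file, e.g. "models/my-model.gguf" (a placeholder name, not a file
# shipped with this script); the remaining keys are read back verbatim and must
# stay present.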

# If no model path is configured, fall back to the first .gguf file found in
# the working directory.
if model_settings["model_path"] is None:
    for f in os.scandir("."):
        if re.search(r"\.gguf$", f.path):
            model_settings["model_path"] = f.path
            break

if model_settings["model_path"] is None:
    raise Exception("No .gguf model was found in the program directory. Please specify a model's relative or absolute path using the generated model.json configuration file.")

LLM = Llama(
    model_path = model_settings["model_path"],
    n_gpu_layers = model_settings["n_gpu_layers"],
    n_ctx = model_settings["n_ctx"],
    verbose = False,
    n_threads = model_settings["n_threads"])

print("Loaded model {model_path}".format(model_path=model_settings["model_path"]))

class PrivateHandler(BaseHTTPRequestHandler):
    # Chunked responses are an HTTP/1.1 feature; BaseHTTPRequestHandler answers
    # with HTTP/1.0 by default, so announce 1.1 explicitly.
    protocol_version = "HTTP/1.1"

    # Single-slot lock: ThreadingHTTPServer handles each request in its own
    # thread, but only one generation should run on the model at a time.
    LOCK = False

    def do_POST(self):
        content_length = int(self.headers.get("Content-Length", 0))
        post_data = self.rfile.read(content_length).decode("utf-8")
        trimmed_path = self.path[1:].strip()

        # Route POST /<name> to a method called public_<name>, if one exists.
        if len(trimmed_path) > 0:
            args = trimmed_path.split("/")
            fn_given = args[0]
            fn_actual = f"public_{fn_given}"

            if hasattr(self, fn_actual):
                attr = getattr(self, fn_actual)
                if hasattr(attr, "__call__"):
                    function = attr
                    extra_args = args[1:] if len(args) > 1 else None
                    function(post_data, extra_args)
                    return

        self.index(post_data)
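
    # Because of the routing above, any method named public_<name> is reachable
    # at POST /<name>.  A hypothetical example of such a handler (not part of
    # the original script):
    #
    # def public_ping(self, post_data, extra_args):
    #     self.send_response(200)
    #     self.send_header("Content-Type", "text/plain")
    #     self.send_header("Content-Length", "4")
    #     self.end_headers()
    #     self.wfile.write(b"pong")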

    def index(self, post_data):
        try:
            # Busy-wait until any in-flight generation has finished.
            while PrivateHandler.LOCK:
                print(".", end="")
                time.sleep(0.1)

            PrivateHandler.LOCK = True

            text = post_data

            print("POST:")
            print(text, end="")

            # Set response headers
            self.send_response(200)
            self.send_header("Content-Type", "text/plain")
            self.send_header("Transfer-Encoding", "chunked")
            self.end_headers()

            response = LLM(
                text,
                max_tokens = model_settings["max_tokens"],
                stop = model_settings["stop"],
                echo = False,
                repeat_penalty = model_settings["repeat_penalty"],
                temperature = model_settings["temperature"],
                stream = True)

            # Stream the completion back as HTTP chunks; every chunk's payload
            # is one JSON-encoded string holding the next piece of generated text.
            for token in response:
                token_text = token["choices"][0]["text"]
                token_json = json.dumps(token_text).encode("utf-8")
                self.wfile.write(f"{len(token_json):x}\r\n".encode("utf-8"))  # Chunk size in bytes, in hex
                self.wfile.write(token_json + b"\r\n")
                print(token_text, end="")

            # Signal the end of the stream
            self.wfile.write(b"0\r\n\r\n")
            print("\n-----")

        except Exception as x:
            print(traceback.format_exc())
            print(x)

        PrivateHandler.LOCK = False
        time.sleep(0.2)

if __name__ == "__main__":
    address = "0.0.0.0"
    port = 11434

    print(f"Listening on {address} port {port}...")

    httpd = ThreadingHTTPServer((address, port), PrivateHandler)
    httpd.serve_forever()
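
# A minimal client sketch (not part of the original script; host, port and
# prompt below are only examples).  The server streams chunked plain text in
# which every chunk payload is one JSON-encoded string, so once the HTTP
# library has undone the chunking, each line of the body can be fed to
# json.loads:
#
# import json
# import urllib.request
#
# req = urllib.request.Request("http://127.0.0.1:11434/",
#                              data="Hello, model!".encode("utf-8"),
#                              method="POST")
# with urllib.request.urlopen(req) as resp:
#     for line in resp:
#         line = line.strip()
#         if line:
#             print(json.loads(line.decode("utf-8")), end="", flush=True)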