# server.py
# Import libs
import sys, os, subprocess, resource, argparse, shutil, time, requests, configparser, json, threading, datetime, logging
import re
from dotenv import load_dotenv
from huggingface_hub import hf_hub_url, HfFileSystem
from flask import Flask, request, jsonify, Response, stream_with_context
from flask_cors import CORS
from transformers import AutoTokenizer
# Local file
from src.classes import *
from src.rkllm import *
from src.process import Request
import src.variables as variables
from src.server_utils import process_ollama_chat_request, process_ollama_generate_request
from src.debug_utils import StreamDebugger, check_response_format
from src.model_utils import (
get_simplified_model_name, get_original_model_path, extract_model_details,
initialize_model_mappings, find_model_by_name, get_huggingface_model_info
)
# Import the config module
import config
# Determine whether debug mode is enabled via the config helper
DEBUG_MODE = config.is_debug_mode()
# Ensure logs directory exists before configuring logging
logs_dir = config.get_path("logs")
os.makedirs(logs_dir, exist_ok=True)
# Set up logging with appropriate level based on debug mode
logging_level = logging.DEBUG if DEBUG_MODE else logging.INFO
logging.basicConfig(
level=logging_level,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler(),
logging.FileHandler(os.path.join(logs_dir, "rkllama_server.log"))
]
)
logger = logging.getLogger("rkllama.server")
def print_color(message, color):
# Function for displaying color messages
colors = {
"red": "\033[91m",
"green": "\033[92m",
"yellow": "\033[93m",
"blue": "\033[94m",
"magenta": "\033[95m",
"cyan": "\033[96m",
"reset": "\033[0m"
}
print(f"{colors.get(color, colors['reset'])}{message}{colors['reset']}")
current_model = None # Global variable for storing the loaded model
modele_rkllm = None # Model instance
def create_modelfile(huggingface_path, From, system="", temperature=1.0):
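    """Write a Modelfile for a model under the configured models directory.

    The file stores the .rkllm filename (FROM), the Hugging Face repo path
    (HUGGINGFACE_PATH), an optional system prompt and a sampling temperature.
    """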
struct_modelfile = f"""
FROM="{From}"
HUGGINGFACE_PATH="{huggingface_path}"
SYSTEM="{system}"
TEMPERATURE={temperature}
"""
# Use config for models path
path = os.path.join(config.get_path("models"), From.replace('.rkllm', ''))
# Create the directory if it doesn't exist
if not os.path.exists(path):
os.makedirs(path)
# Create the Modelfile and write the content
with open(os.path.join(path, "Modelfile"), "w") as f:
f.write(struct_modelfile)
def load_model(model_name, huggingface_path=None, system="", temperature=1.0, From=None):
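    """Load a .rkllm model described by its Modelfile.

    If both `huggingface_path` and `From` are given, the Modelfile is (re)created
    first. Returns a tuple (RKLLM instance, None) on success or (None, error
    message) on failure.
    """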
# Use config for models path
model_dir = os.path.join(config.get_path("models"), model_name)
if not os.path.exists(model_dir):
return None, f"Model directory '{model_name}' not found."
if not os.path.exists(os.path.join(model_dir, "Modelfile")) and (huggingface_path is None and From is None):
return None, f"Modelfile not found in '{model_name}' directory."
elif huggingface_path is not None and From is not None:
create_modelfile(huggingface_path=huggingface_path, From=From, system=system, temperature=temperature)
time.sleep(0.1)
# Load modelfile
load_dotenv(os.path.join(model_dir, "Modelfile"), override=True)
from_value = os.getenv("FROM")
huggingface_path = os.getenv("HUGGINGFACE_PATH")
    # Display the loaded configuration values
print_color(f"FROM: {from_value}\nHuggingFace Path: {huggingface_path}", "green")
if not from_value or not huggingface_path:
return None, "FROM or HUGGINGFACE_PATH not defined in Modelfile."
    # Point model_id at the Hugging Face repo path
variables.model_id = huggingface_path
modele_rkllm = RKLLM(os.path.join(model_dir, from_value))
return modele_rkllm, None
def unload_model():
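    """Release the currently loaded RKLLM instance, if any."""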
global modele_rkllm
if modele_rkllm:
modele_rkllm.release()
modele_rkllm = None
app = Flask(__name__)
# Enable CORS for all routes
CORS(app)
# Original RKLLAMA Routes:
# GET /models
# POST /load_model
# POST /unload_model
# POST /generate
# POST /pull
# DELETE /rm
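# Example calls (host and port depend on your configuration; localhost:8080 is
# only shown for illustration):
#   curl http://localhost:8080/models
#   curl -X POST http://localhost:8080/load_model \
#        -H "Content-Type: application/json" -d '{"model_name": "my-model"}'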
# Route to view models
@app.route('/models', methods=['GET'])
def list_models():
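    """List the available models.

    Loose .rkllm files at the top of the models directory are first moved into a
    directory of the same name, then every subdirectory containing a .rkllm file
    is returned.
    """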
# Return the list of available models using config path
models_dir = config.get_path("models")
if not os.path.exists(models_dir):
return jsonify({"error": f"The models directory {models_dir} is not found."}), 500
direct_models = [f for f in os.listdir(models_dir) if f.endswith(".rkllm")]
for model in direct_models:
model_name = os.path.splitext(model)[0]
model_dir = os.path.join(models_dir, model_name)
os.makedirs(model_dir, exist_ok=True)
shutil.move(os.path.join(models_dir, model), os.path.join(model_dir, model))
model_dirs = []
for subdir in os.listdir(models_dir):
subdir_path = os.path.join(models_dir, subdir)
if os.path.isdir(subdir_path):
for file in os.listdir(subdir_path):
if file.endswith(".rkllm"):
model_dirs.append(subdir)
break
return jsonify({"models": model_dirs}), 200
# Delete a model
@app.route('/rm', methods=['DELETE'])
def Rm_model():
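    """Delete the model named in the request body and refresh the name mappings."""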
data = request.json
if "model" not in data:
return jsonify({"error": "Please specify a model."}), 400
model_path = os.path.join(config.get_path("models"), data['model'])
if not os.path.exists(model_path):
return jsonify({"error": f"The model: {data['model']} cannot be found."}), 404
    # Models are stored as directories (see list_models), so remove a directory or a loose file
    if os.path.isdir(model_path):
        shutil.rmtree(model_path)
    else:
        os.remove(model_path)
    initialize_model_mappings()
    return jsonify({"message": "The model has been successfully deleted!"}), 200
# route to pull a model
@app.route('/pull', methods=['POST'])
def pull_model():
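    """Download a .rkllm file from Hugging Face while streaming progress to the client.

    Expects "model" in the form "<user>/<repo>/<file>.rkllm"; the model directory
    and its Modelfile are created before the download starts.
    """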
@stream_with_context
def generate_progress():
data = request.json
if "model" not in data:
yield "Error: Model not specified.\n"
return
splitted = data["model"].split('/')
if len(splitted) < 3:
yield f"Error: Invalid path '{data['model']}'\n"
return
file = splitted[2]
repo = data["model"].replace(f"/{file}", "")
try:
# Use Hugging Face HfFileSystem to get the file metadata
fs = HfFileSystem()
file_info = fs.info(repo + "/" + file)
total_size = file_info["size"] # File size in bytes
if total_size == 0:
yield "Error: Unable to retrieve file size.\n"
return
# Use config to get models path
model_dir = os.path.join(config.get_path("models"), file.replace('.rkllm', ''))
os.makedirs(model_dir, exist_ok=True)
            # Define the local file to download to
local_filename = os.path.join(model_dir, file)
            # Create the model's Modelfile (configuration file)
create_modelfile(huggingface_path=repo, From=file)
yield f"Downloading {file} ({total_size / (1024**2):.2f} MB)...\n"
try:
# Download the file with progress
url = hf_hub_url(repo_id=repo, filename=file)
with requests.get(url, stream=True) as r, open(local_filename, "wb") as f:
downloaded_size = 0
chunk_size = 8192 # 8KB
for chunk in r.iter_content(chunk_size=chunk_size):
if chunk:
f.write(chunk)
downloaded_size += len(chunk)
progress = int((downloaded_size / total_size) * 100)
yield f"{progress}%\n"
except Exception as download_error:
# Remove the file if an error occurs during download
if os.path.exists(local_filename):
os.remove(local_filename)
yield f"Error during download: {str(download_error)}\n"
return
except Exception as e:
yield f"Error: {str(e)}\n"
# Use the appropriate content type for streaming responses
is_ollama_request = request.path.startswith('/api/')
content_type = 'application/x-ndjson' if is_ollama_request else 'text/plain'
return Response(generate_progress(), content_type=content_type)
# Route for loading a model into the NPU
@app.route('/load_model', methods=['POST'])
def load_model_route():
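    """Load a model into the NPU, optionally creating its Modelfile from the
    "from" and "huggingface_path" fields of the request."""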
global current_model, modele_rkllm
# Check if a model is currently loaded
if modele_rkllm:
return jsonify({"error": "A model is already loaded. Please unload it first."}), 400
data = request.json
if "model_name" not in data:
return jsonify({"error": "Please enter the name of the model to be loaded."}), 400
model_name = data["model_name"]
#print(data)
    # If both "from" and "huggingface_path" are provided, (re)create the Modelfile before loading
    if "from" in data and "huggingface_path" in data:
        modele_rkllm, error = load_model(model_name, From=data["from"], huggingface_path=data["huggingface_path"])
else:
modele_rkllm, error = load_model(model_name)
if error:
return jsonify({"error": error}), 400
current_model = model_name
return jsonify({"message": f"Model {model_name} loaded successfully."}), 200
# Route to unload a model from the NPU
@app.route('/unload_model', methods=['POST'])
def unload_model_route():
global current_model, modele_rkllm
if not modele_rkllm:
return jsonify({"error": "No models are currently loaded."}), 400
unload_model()
current_model = None
return jsonify({"message": "Model successfully unloaded!"}), 200
# Route to retrieve the current model
@app.route('/current_model', methods=['GET'])
def get_current_model():
global current_model, modele_rkllm
if current_model and modele_rkllm:
return jsonify({"model_name": current_model}), 200
else:
return jsonify({"error": "No models are currently loaded."}), 404
# Route to make a request to the model
@app.route('/generate', methods=['POST'])
def recevoir_message():
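    """Native /generate endpoint: acquire the inference lock and delegate the
    request to Request()."""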
global modele_rkllm
if not modele_rkllm:
return jsonify({"error": "No models are currently loaded."}), 400
variables.verrou.acquire()
return Request(modele_rkllm)
# Ollama API compatibility routes
@app.route('/api/tags', methods=['GET'])
def list_ollama_models():
# Return models in Ollama API format
models_dir = config.get_path("models")
if not os.path.exists(models_dir):
return jsonify({"models": []}), 200
models = []
for subdir in os.listdir(models_dir):
subdir_path = os.path.join(models_dir, subdir)
if os.path.isdir(subdir_path):
for file in os.listdir(subdir_path):
if file.endswith(".rkllm"):
size = os.path.getsize(os.path.join(subdir_path, file))
# Generate a simplified model name in Ollama style
simple_name = get_simplified_model_name(subdir)
# Extract parameter size and quantization details if available
model_details = extract_model_details(subdir)
models.append({
"name": simple_name, # Use simplified name like qwen:3b
"model": simple_name, # Match Ollama's format
"modified_at": datetime.datetime.fromtimestamp(
os.path.getmtime(os.path.join(subdir_path, file))
).strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
"size": size,
"digest": "", # Ollama field (not used but included for compatibility)
"details": {
"format": "rkllm",
"family": "llama", # Default family
"parameter_size": model_details.get("parameter_size", "Unknown"),
"quantization_level": model_details.get("quantization_level", "Unknown")
}
})
break
return jsonify({"models": models}), 200
@app.route('/api/show', methods=['POST'])
def show_model_info():
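    """Return Ollama-style model metadata for /api/show.

    Combines the local Modelfile (system prompt, template, license, temperature)
    with optional Hugging Face metadata to fill in family, parameter count and
    quantization details.
    """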
data = request.json
model_name = data.get('name')
if not model_name:
return jsonify({"error": "Missing model name"}), 400
# Handle simplified model names
original_model_path = get_original_model_path(model_name)
if original_model_path:
model_name = original_model_path
model_dir = os.path.join(config.get_path("models"), model_name)
if not os.path.exists(model_dir):
return jsonify({"error": f"Model '{model_name}' not found"}), 404
# Read modelfile content if available
modelfile_path = os.path.join(model_dir, "Modelfile")
modelfile_content = ""
system_prompt = ""
template = "{{ .Prompt }}"
license_text = ""
huggingface_path = None
temperature = 0.8 # Default temperature
if os.path.exists(modelfile_path):
with open(modelfile_path, "r") as f:
modelfile_content = f.read()
# Extract system prompt if available
system_match = re.search(r'SYSTEM="(.*?)"', modelfile_content, re.DOTALL)
if system_match:
system_prompt = system_match.group(1).strip()
# Check for template pattern
template_match = re.search(r'TEMPLATE="(.*?)"', modelfile_content, re.DOTALL)
if template_match:
template = template_match.group(1).strip()
# Check for LICENSE pattern (some modelfiles have this)
license_match = re.search(r'LICENSE="(.*?)"', modelfile_content, re.DOTALL)
if license_match:
license_text = license_match.group(1).strip()
# Extract HuggingFace path for API access
hf_path_match = re.search(r'HUGGINGFACE_PATH="(.*?)"', modelfile_content, re.DOTALL)
if hf_path_match:
huggingface_path = hf_path_match.group(1).strip()
# Extract temperature if available
temp_match = re.search(r'TEMPERATURE=(\d+\.?\d*)', modelfile_content)
if temp_match:
try:
temperature = float(temp_match.group(1))
except ValueError:
pass
# Find the .rkllm file
model_file = None
for file in os.listdir(model_dir):
if file.endswith(".rkllm"):
model_file = file
break
if not model_file:
return jsonify({"error": f"Model file not found in '{model_name}' directory"}), 404
file_path = os.path.join(model_dir, model_file)
size = os.path.getsize(file_path)
# Extract model details
model_details = extract_model_details(model_name)
parameter_size = model_details.get("parameter_size", "Unknown")
quantization_level = model_details.get("quantization_level", "Unknown")
# Determine model family based on name patterns
family = "llama" # default family
families = ["llama"]
# Try to get enhanced information from Hugging Face API
hf_metadata = get_huggingface_model_info(huggingface_path) if huggingface_path else None
# Use HF metadata to improve model info if available
if hf_metadata:
# Extract tags from HF metadata
tags = hf_metadata.get('tags', [])
# Better determine model family based on HF tags or architecture field
if hf_metadata.get('architecture') == 'qwen' or 'qwen' in tags or 'qwen2' in tags:
family = "qwen2"
families = ["qwen2"]
elif hf_metadata.get('architecture') == 'mistral' or 'mistral' in tags:
family = "mistral"
families = ["mistral"]
elif hf_metadata.get('architecture') == 'deepseek' or 'deepseek' in tags:
family = "deepseek"
families = ["deepseek"]
elif hf_metadata.get('architecture') == 'phi' or 'phi' in tags:
family = "phi"
families = ["phi"]
elif hf_metadata.get('architecture') == 'gemma' or 'gemma' in tags:
family = "gemma"
families = ["gemma"]
elif 'tinyllama' in tags:
family = "tinyllama"
families = ["tinyllama", "llama"]
elif any('llama-3' in tag for tag in tags) or any('llama3' in tag for tag in tags):
family = "llama3"
families = ["llama3", "llama"]
elif any('llama-2' in tag for tag in tags) or any('llama2' in tag for tag in tags):
family = "llama2"
families = ["llama2", "llama"]
# Extract model card metadata
model_card = hf_metadata.get('cardData', {})
# Better parameter size from HF metadata
if 'params' in model_card:
try:
params = int(model_card['params'])
if params >= 1_000_000_000:
parameter_size = f"{params/1_000_000_000:.1f}B".replace('.0B', 'B')
# Also store the raw parameter count for model_info
parameter_count = params
except (ValueError, TypeError):
parameter_count = None
else:
parameter_count = None
# Extract quantization info
if 'quantization' in hf_metadata:
quantization_level = hf_metadata['quantization']
# Better license information
if 'license' in hf_metadata and not license_text:
license_text = hf_metadata['license']
else:
# Fallback to pattern matching if no HF metadata
if re.search(r'(?i)Qwen', model_name):
family = "qwen2"
families = ["qwen2"]
elif re.search(r'(?i)Mistral', model_name):
family = "mistral"
families = ["mistral"]
elif re.search(r'(?i)DeepSeek', model_name):
family = "deepseek"
families = ["deepseek"]
elif re.search(r'(?i)Phi', model_name):
family = "phi"
families = ["phi"]
elif re.search(r'(?i)Gemma', model_name):
family = "gemma"
families = ["gemma"]
elif re.search(r'(?i)TinyLlama', model_name):
family = "tinyllama"
families = ["tinyllama", "llama"]
elif re.search(r'(?i)Llama[-_]?3', model_name):
family = "llama3"
families = ["llama3", "llama"]
elif re.search(r'(?i)Llama[-_]?2', model_name):
family = "llama2"
families = ["llama2", "llama"]
parameter_count = None
# Convert modelfile to Ollama-compatible format
ollama_modelfile = f"# Modelfile generated by \"ollama show\"\n"
ollama_modelfile += f"# To build a new Modelfile based on this, replace FROM with:\n"
ollama_modelfile += f"# FROM {get_simplified_model_name(model_name)}\n\n"
# Change this section to use a more compatible FROM format
# Instead of absolute paths, use the model file name which is more compatible with Ollama
# Original: model_blob_path = f"{model_dir}/{model_file}"
simple_name = get_simplified_model_name(model_name)
if DEBUG_MODE:
# In debug mode, use absolute paths to help with troubleshooting
model_blob_path = f"{model_dir}/{model_file}"
ollama_modelfile += f"FROM {model_blob_path}\n"
else:
# In normal mode, use the simplified name format that Ollama clients expect
ollama_modelfile += f"FROM {simple_name}\n"
if template != "{{ .Prompt }}":
ollama_modelfile += f'TEMPLATE """{template}"""\n'
if system_prompt:
ollama_modelfile += f'SYSTEM "{system_prompt}"\n'
if license_text:
ollama_modelfile += f'LICENSE """{license_text}"""\n'
# Additional model info from HF
model_description = ""
repo_url = None
if hf_metadata:
model_description = hf_metadata.get('description', '').strip()
# Add description comment to modelfile if available
if model_description:
desc_lines = model_description.split('\n')
desc_comment = '\n'.join([f"# {line}" for line in desc_lines[:5]]) # First 5 lines only
ollama_modelfile = desc_comment + "\n\n" + ollama_modelfile
# Extract repo URL if available
if huggingface_path:
repo_url = f"https://huggingface.co/{huggingface_path}"
# Parse parameter size into numeric format
numeric_param_size = None
if parameter_size != "Unknown":
param_match = re.search(r'(\d+\.?\d*)B', parameter_size)
if param_match:
try:
size_in_billions = float(param_match.group(1))
numeric_param_size = int(size_in_billions * 1_000_000_000)
except ValueError:
pass
# Use parameter_count from HF metadata if available, otherwise use parsed value
if parameter_count is None and numeric_param_size is not None:
parameter_count = numeric_param_size
elif parameter_count is None:
# Default fallback
if "7B" in model_name or "7b" in model_name:
parameter_count = 7000000000
elif "3B" in model_name or "3b" in model_name:
parameter_count = 3000000000
elif "1.5B" in model_name or "1.5b" in model_name:
parameter_count = 1500000000
else:
parameter_count = 0
# Extract base model name (without fine-tuning suffixes)
base_name = model_name.split('-')[0]
# Determine finetune type if present
finetune = None
if "instruct" in model_name.lower():
finetune = "Instruct"
elif "chat" in model_name.lower():
finetune = "Chat"
# Build a more complete model_info dict with architecture details
model_info = {
"general.architecture": family,
"general.base_model.0.name": f"{base_name} {parameter_size}",
"general.base_model.0.organization": family.capitalize(),
"general.basename": base_name,
"general.file_type": 15, # RKLLM file type
"general.parameter_count": parameter_count,
"general.quantization_version": 2,
"general.size_label": parameter_size,
"general.tags": ["chat", "text-generation"],
"general.type": "model",
"tokenizer.ggml.pre": family
}
# Add repo URL if available
if repo_url:
model_info["general.base_model.0.repo_url"] = repo_url
model_info["general.base_model.count"] = 1
# Add finetune info if available
if finetune:
model_info["general.finetune"] = finetune
# Add license info if available
if license_text:
license_name = "other"
license_link = ""
# Try to detect common licenses
if "apache" in license_text.lower():
license_name = "apache-2.0"
elif "mit" in license_text.lower():
license_name = "mit"
elif "qwen research" in license_text.lower():
license_name = "qwen-research"
if huggingface_path:
license_link = f"https://huggingface.co/{huggingface_path}/blob/main/LICENSE"
model_info["general.license"] = license_name
if license_link:
model_info["general.license.link"] = license_link
model_info["general.license.name"] = license_name
# Add language info if we can detect it
if hf_metadata and 'languages' in hf_metadata:
model_info["general.languages"] = hf_metadata['languages']
else:
# Default to English
model_info["general.languages"] = ["en"]
# Add architecture-specific parameters based on model family
if family == "qwen2":
model_info.update({
"qwen2.attention.head_count": 16,
"qwen2.attention.head_count_kv": 2,
"qwen2.attention.layer_norm_rms_epsilon": 0.000001,
"qwen2.block_count": 36 if "3B" in parameter_size else 24,
"qwen2.context_length": 32768,
"qwen2.embedding_length": 2048 if "3B" in parameter_size else 1536,
"qwen2.feed_forward_length": 11008 if "3B" in parameter_size else 8192,
"qwen2.rope.freq_base": 1000000
})
elif family == "llama" or family == "llama2" or family == "llama3":
model_info.update({
f"{family}.attention.head_count": 32,
f"{family}.attention.head_count_kv": 4,
f"{family}.attention.layer_norm_rms_epsilon": 0.000001,
f"{family}.block_count": 32,
f"{family}.context_length": 4096,
f"{family}.embedding_length": 4096,
f"{family}.feed_forward_length": 11008,
f"{family}.rope.freq_base": 10000
})
elif family == "mistral":
model_info.update({
"mistral.attention.head_count": 32,
"mistral.attention.head_count_kv": 8,
"mistral.attention.layer_norm_rms_epsilon": 0.000001,
"mistral.block_count": 32,
"mistral.context_length": 8192,
"mistral.embedding_length": 4096,
"mistral.feed_forward_length": 14336
})
# Calculate modified timestamp
modified_at = datetime.datetime.fromtimestamp(
os.path.getmtime(file_path)
).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
# Format parameters string nicely
parameters_str = parameter_size
if parameters_str == "Unknown" and parameter_count:
if parameter_count >= 1_000_000_000:
parameters_str = f"{parameter_count/1_000_000_000:.1f}B".replace('.0B', 'B')
else:
parameters_str = f"{parameter_count/1_000_000:.1f}M".replace('.0M', 'M')
# Prepare response with enhanced metadata
response = {
"license": license_text or "Unknown",
"modelfile": ollama_modelfile,
"parameters": parameters_str,
"template": template,
"system": system_prompt,
"name": model_name,
"details": {
"parent_model": huggingface_path or "",
"format": "rkllm",
"family": family,
"families": families,
"parameter_size": parameter_size,
"quantization_level": quantization_level
},
"model_info": model_info,
"size": size,
"modified_at": modified_at
}
# Add Hugging Face specific fields if available
if hf_metadata:
response["huggingface"] = {
"repo_id": huggingface_path,
"description": model_description[:500] if model_description else "", # Truncate if too long
"tags": hf_metadata.get('tags', []),
"downloads": hf_metadata.get('downloads', 0),
"likes": hf_metadata.get('likes', 0)
}
return jsonify(response), 200
@app.route('/api/create', methods=['POST'])
def create_model():
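    """Create a model directory from the Modelfile supplied in the request (Ollama /api/create)."""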
data = request.json
model_name = data.get('name')
modelfile = data.get('modelfile', '')
if not model_name:
return jsonify({"error": "Missing model name"}), 400
model_dir = os.path.join(config.get_path("models"), model_name)
os.makedirs(model_dir, exist_ok=True)
with open(os.path.join(model_dir, "Modelfile"), "w") as f:
f.write(modelfile)
# Parse the modelfile to extract parameters
modelfile_lines = modelfile.strip().split('\n')
from_line = next((line for line in modelfile_lines if line.startswith('FROM=')), None)
huggingface_path = next((line for line in modelfile_lines if line.startswith('HUGGINGFACE_PATH=')), None)
if not from_line or not huggingface_path:
return jsonify({"error": "Invalid Modelfile: missing FROM or HUGGINGFACE_PATH"}), 400
# Extract values
from_value = from_line.split('=')[1].strip('"\'')
huggingface_path = huggingface_path.split('=')[1].strip('"\'')
# For compatibility with existing implementation
return jsonify({"status": "success", "model": model_name}), 200
@app.route('/api/pull', methods=['POST'])
def pull_model_ollama():
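    """Ollama-compatible /api/pull: stream download progress as ndjson."""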
    # TODO: implement a dedicated Ollama pull; for now this reuses the native /pull handler
data = request.json
model = data.get('name')
if not model:
return jsonify({"error": "Missing model name"}), 400
# Ollama API uses application/x-ndjson for streaming
response_stream = pull_model() # Call the existing function directly
response_stream.content_type = 'application/x-ndjson'
return response_stream
@app.route('/api/delete', methods=['DELETE'])
def delete_model_ollama():
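    """Delete a model by its (simplified) name, unloading it first if it is currently loaded."""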
data = request.json
model_name = data.get('name')
if not model_name:
return jsonify({"error": "Missing model name"}), 400
# Resolve simplified name to full model name
full_model_name = find_model_by_name(model_name)
if not full_model_name:
if DEBUG_MODE:
logger.error(f"Model '{model_name}' not found for deletion")
return jsonify({"error": f"Model '{model_name}' not found"}), 404
model_path = os.path.join(config.get_path("models"), full_model_name)
if not os.path.exists(model_path):
return jsonify({"error": f"Model directory for '{model_name}' not found"}), 404
# Check if model is currently loaded
if current_model == full_model_name:
if DEBUG_MODE:
logger.debug(f"Unloading model '{full_model_name}' before deletion")
unload_model()
try:
if DEBUG_MODE:
logger.debug(f"Deleting model directory: {model_path}")
shutil.rmtree(model_path)
initialize_model_mappings()
return jsonify({}), 200
except Exception as e:
logger.error(f"Failed to delete model '{full_model_name}': {str(e)}")
return jsonify({"error": f"Failed to delete model: {str(e)}"}), 500
@app.route('/api/generate', methods=['POST'])
def generate_ollama():
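    """Ollama-compatible /api/generate: resolve and load the requested model,
    then hand the request to GenerateEndpointHandler."""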
global modele_rkllm, current_model
lock_acquired = False # Track lock status
try:
data = request.json
model_name = data.get('model')
prompt = data.get('prompt')
system = data.get('system', '')
stream = data.get('stream', True)
# Support format options for structured JSON output
format_spec = data.get('format')
options = data.get('options', {})
if DEBUG_MODE:
logger.debug(f"API generate request: model={model_name}, stream={stream}, format={format_spec}")
if not model_name:
return jsonify({"error": "Missing model name"}), 400
if not prompt:
return jsonify({"error": "Missing prompt"}), 400
# Improved model resolution
full_model_name = find_model_by_name(model_name)
if not full_model_name:
if DEBUG_MODE:
logger.error(f"Model '{model_name}' not found")
return jsonify({"error": f"Model '{model_name}' not found"}), 404
# Use the full model name for loading
model_name = full_model_name
# Load model if needed
if current_model != model_name:
if current_model:
unload_model()
modele_instance, error = load_model(model_name)
if error:
return jsonify({"error": f"Failed to load model '{model_name}': {error}"}), 500
modele_rkllm = modele_instance
current_model = model_name
# Acquire lock before processing
variables.verrou.acquire()
lock_acquired = True
# DIRECTLY use the GenerateEndpointHandler instead of the process_ollama_generate_request wrapper
from src.server_utils import GenerateEndpointHandler
return GenerateEndpointHandler.handle_request(
modele_rkllm=modele_rkllm,
model_name=model_name,
prompt=prompt,
system=system,
stream=stream,
format_spec=format_spec,
options=options
)
except Exception as e:
if DEBUG_MODE:
logger.exception(f"Error in generate_ollama: {str(e)}")
return jsonify({"error": str(e)}), 500
finally:
# Only release if we acquired it
if lock_acquired and variables.verrou.locked():
variables.verrou.release()
# Also update the chat endpoint for consistency
@app.route('/api/chat', methods=['POST'])
def chat_ollama():
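    """Ollama-compatible /api/chat: manage the system prompt, load the requested
    model if needed, then hand the request to ChatEndpointHandler."""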
global modele_rkllm, current_model
lock_acquired = False # Track lock status
try:
data = request.json
model_name = data.get('model')
messages = data.get('messages', [])
system = data.get('system', '')
stream = data.get('stream', True)
# Extract format parameters - can be object or string
format_spec = data.get('format')
options = data.get('options', {})
if DEBUG_MODE:
logger.debug(f"API chat request: model={model_name}, format={format_spec}")
# Check if we're starting a new conversation
# A new conversation is one that doesn't include any assistant messages
is_new_conversation = not any(msg.get('role') == 'assistant' for msg in messages)
# Always reset system prompt for new conversations
if is_new_conversation:
variables.system = ""
if DEBUG_MODE:
logger.debug("New conversation detected, resetting system prompt")
# Extract system message from messages array if present
system_in_messages = False
filtered_messages = []
for message in messages:
if message.get('role') == 'system':
system = message.get('content', '')
system_in_messages = True
# Don't add system message to filtered messages
else:
filtered_messages.append(message)
# Only use the extracted system message or explicit system parameter if provided
if system_in_messages or system:
variables.system = system
messages = filtered_messages
if DEBUG_MODE:
logger.debug(f"Using system message: {system}")
        # Improved model resolution
        if not model_name:
            return jsonify({"error": "Missing model name"}), 400
        full_model_name = find_model_by_name(model_name)
if not full_model_name:
if DEBUG_MODE:
logger.error(f"Model '{model_name}' not found")
return jsonify({"error": f"Model '{model_name}' not found"}), 404
# Use the full model name for loading
model_name = full_model_name
# Load model if needed
if current_model != model_name:
if current_model:
if DEBUG_MODE:
logger.debug(f"Unloading current model: {current_model}")
unload_model()
if DEBUG_MODE:
logger.debug(f"Loading model: {model_name}")
modele_instance, error = load_model(model_name)
if error:
if DEBUG_MODE:
logger.error(f"Failed to load model {model_name}: {error}")
return jsonify({"error": f"Failed to load model '{model_name}': {error}"}), 500
modele_rkllm = modele_instance
current_model = model_name
if DEBUG_MODE:
logger.debug(f"Model {model_name} loaded successfully")
# Apply options to model parameters if provided
if options and isinstance(options, dict):
if "temperature" in options:
try:
temperature = float(options["temperature"])
# Set temperature for model if supported
except (ValueError, TypeError):
pass
# Store format settings in model instance
if modele_rkllm:
modele_rkllm.format_schema = format_spec
modele_rkllm.format_options = options
# Acquire lock before processing the request
variables.verrou.acquire()
lock_acquired = True # Mark lock as acquired
# Create custom request for processing
custom_req = type('obj', (object,), {
'json': {
"model": model_name,
"messages": messages,
"stream": stream,
"system": system,
"format": format_spec,
"options": options
},
'path': '/api/chat'
})
# Set a flag on the custom request to indicate it should not release the lock
# as we'll handle it here
custom_req.handle_lock = False
# Process the request - this won't release the lock
from src.server_utils import ChatEndpointHandler
return ChatEndpointHandler.handle_request(
modele_rkllm=modele_rkllm,