Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Latest commit

 

History

History
History
96 lines (79 loc) · 2.99 KB

File metadata and controls

96 lines (79 loc) · 2.99 KB
Copy raw file
Download raw file
Open symbols panel
Edit and raw actions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import structlog
from llama_cpp import Llama
logger = structlog.get_logger("codegate")
class LlamaCppInferenceEngine:
    """
    A singleton wrapper class for llama.cpp models.

    Every call to ``LlamaCppInferenceEngine()`` returns the same instance, so
    models loaded through it are cached and shared by all callers.

    Attributes:
        __inference_engine: Singleton instance of this class
        __models: Cache of loaded ``Llama`` objects, keyed by model path
    """

    __inference_engine = None

    def __new__(cls):
        """Create the singleton on first use and return it on every call."""
        if cls.__inference_engine is None:
            cls.__inference_engine = super().__new__(cls)
        return cls.__inference_engine

    def __init__(self):
        """Initialise the model cache once, the first time the singleton is built."""
        # __init__ runs on *every* LlamaCppInferenceEngine() call, even though
        # __new__ hands back the existing instance. Because of private name
        # mangling, ``self.__models`` is stored as
        # ``_LlamaCppInferenceEngine__models``; checking for a plain "models"
        # attribute would never match and would wipe the cache on each call.
        if not hasattr(self, "_LlamaCppInferenceEngine__models"):
            self.__models = {}

    def __del__(self):
        """Release all loaded models when the engine is garbage collected."""
        self._close_models()

    def _close_models(self):
        """
        Closes all open models and samplers, then empties the cache.

        Safe to call when no model was ever loaded, and safe to call more
        than once.
        """
        # getattr keeps __del__ from raising if the cache was never created.
        models = getattr(self, "_LlamaCppInferenceEngine__models", None)
        if not models:
            return
        for model in models.values():
            sampler = getattr(model, "_sampler", None)
            if sampler:
                sampler.close()
            model.close()
        # Drop the closed objects so __get_model cannot hand one back later.
        models.clear()

    async def __get_model(
        self, model_path, embedding=False, n_ctx=512, n_gpu_layers=0
    ) -> "Llama":
        """
        Returns Llama model object from __models if present. Otherwise, the model
        is loaded and added to __models and returned.

        Args:
            model_path: Path of the model file; also used as the cache key.
            embedding: Passed to ``Llama(embedding=...)`` when the model is loaded.
            n_ctx: Passed to ``Llama(n_ctx=...)`` when the model is loaded.
            n_gpu_layers: Passed to ``Llama(n_gpu_layers=...)`` when the model is loaded.

        Returns:
            The cached or newly loaded ``Llama`` object.

        Note:
            The cache is keyed by ``model_path`` only. If a path is already
            cached, ``embedding``, ``n_ctx`` and ``n_gpu_layers`` are ignored
            and the model loaded by the first call is returned.
        """
        if model_path not in self.__models:
            logger.info(
                f"Loading model from {model_path} with parameters "
                f"n_gpu_layers={n_gpu_layers} and n_ctx={n_ctx}"
            )
            self.__models[model_path] = Llama(
                model_path=model_path,
                n_gpu_layers=n_gpu_layers,
                verbose=False,
                n_ctx=n_ctx,
                embedding=embedding,
            )

        return self.__models[model_path]

    async def complete(self, model_path, n_ctx=512, n_gpu_layers=0, **completion_request):
        """
        Generates a text completion using the specified model and request parameters.

        Args:
            model_path: Path of the model to use (loaded on first use).
            n_ctx: Context size used if the model has to be loaded.
            n_gpu_layers: GPU layer count used if the model has to be loaded.
            **completion_request: Forwarded unchanged to ``Llama.create_completion``.

        Returns:
            Whatever ``Llama.create_completion`` returns for the request.
        """
        model = await self.__get_model(
            model_path=model_path, n_ctx=n_ctx, n_gpu_layers=n_gpu_layers
        )
        return model.create_completion(**completion_request)

    async def chat(self, model_path, n_ctx=512, n_gpu_layers=0, **chat_completion_request):
        """
        Generates a chat completion using the specified model and request parameters.

        Args:
            model_path: Path of the model to use (loaded on first use).
            n_ctx: Context size used if the model has to be loaded.
            n_gpu_layers: GPU layer count used if the model has to be loaded.
            **chat_completion_request: Forwarded unchanged to
                ``Llama.create_chat_completion``.

        Returns:
            Whatever ``Llama.create_chat_completion`` returns for the request.
        """
        model = await self.__get_model(
            model_path=model_path, n_ctx=n_ctx, n_gpu_layers=n_gpu_layers
        )
        return model.create_chat_completion(**chat_completion_request)

    async def embed(self, model_path, content):
        """
        Generates an embedding for the given content using the specified model.

        Args:
            model_path: Path of the embedding model (loaded with
                ``embedding=True`` on first use).
            content: Input forwarded to ``Llama.embed``. The debug log reports
                ``len(content[0])``, so this is presumably a sequence of
                strings; verify against callers.

        Returns:
            The value returned by ``Llama.embed(content)``.
        """
        logger.debug(
            "Generating embedding",
            model=model_path.split("/")[-1],
            content=content,
            content_length=len(content[0]) if content else 0,
        )

        model = await self.__get_model(model_path=model_path, embedding=True)
        embedding = model.embed(content)

        logger.debug(
            "Generated embedding",
            model=model_path.split("/")[-1],
            vector_length=len(embedding[0]) if embedding else 0,
        )

        return embedding
Morty Proxy This is a proxified and sanitized view of the page, visit original site.