codegate/src/codegate/inference/inference_engine.py at tests · SuperCollectionAI/codegate

96 lines (79 loc) · 2.99 KB

import structlog
from llama_cpp import Llama

logger = structlog.get_logger("codegate")


class LlamaCppInferenceEngine:
    """
    A wrapper class for llama.cpp models

    Attributes:
        __inference_engine: Singleton instance of this class
    """

    __inference_engine = None

    def __new__(cls):
        if cls.__inference_engine is None:
            cls.__inference_engine = super().__new__(cls)
        return cls.__inference_engine

    def __init__(self):
        if not hasattr(self, "models"):
            self.__models = {}

    def __del__(self):
        self._close_models()

    def _close_models(self):
        """
        Closes all open models and samplers
        """
        for _, model in self.__models.items():
            if model._sampler:
                model._sampler.close()
            model.close()

    async def __get_model(self, model_path, embedding=False, n_ctx=512, n_gpu_layers=0) -> Llama:
        """
        Returns Llama model object from __models if present. Otherwise, the model
        is loaded and added to __models and returned.
        """
        if model_path not in self.__models:
            logger.info(
                f"Loading model from {model_path} with parameters "
                f"n_gpu_layers={n_gpu_layers} and n_ctx={n_ctx}"
            )
            self.__models[model_path] = Llama(
                model_path=model_path,
                n_gpu_layers=n_gpu_layers,
                verbose=False,
                n_ctx=n_ctx,
                embedding=embedding,
            )

        return self.__models[model_path]

    async def complete(self, model_path, n_ctx=512, n_gpu_layers=0, **completion_request):
        """
        Generates a chat completion using the specified model and request parameters.
        """
        model = await self.__get_model(
            model_path=model_path, n_ctx=n_ctx, n_gpu_layers=n_gpu_layers
        )
        return model.create_completion(**completion_request)

    async def chat(self, model_path, n_ctx=512, n_gpu_layers=0, **chat_completion_request):
        """
        Generates a chat completion using the specified model and request parameters.
        """
        model = await self.__get_model(
            model_path=model_path, n_ctx=n_ctx, n_gpu_layers=n_gpu_layers
        )
        return model.create_chat_completion(**chat_completion_request)

    async def embed(self, model_path, content):
        """
        Generates an embedding for the given content using the specified model.
        """
        logger.debug(
            "Generating embedding",
            model=model_path.split("/")[-1],
            content=content,
            content_length=len(content[0]) if content else 0,
        )

        model = await self.__get_model(model_path=model_path, embedding=True)
        embedding = model.embed(content)

        logger.debug(
            "Generated embedding",
            model=model_path.split("/")[-1],
            vector_length=len(embedding[0]) if embedding else 0,
        )

        return embedding

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Expand file tree

Search code, repositories, users, issues, pull requests...

Uh oh!

FilesExpand file tree

inference_engine.py

Latest commit

History

inference_engine.py

File metadata and controls

Expand file tree