Merging llm-monitoring into synced main #4

Merged 16 commits on May 21, 2024
.dockerignore: 1 change (1 addition, 0 deletions)
@@ -2,6 +2,7 @@ _skbuild/

.envrc

# LLMs - comment this line out if you'd like to bake the model into the image
models/

# Byte-compiled / optimized / DLL files
dev.Dockerfile: 44 changes (44 additions, 0 deletions)
@@ -0,0 +1,44 @@
# Define the image argument and provide a default value
ARG IMAGE=python:3.11.8

# Use the image as specified
FROM ${IMAGE}

# Re-declare the ARG after FROM
ARG IMAGE

# Update and upgrade existing packages, then install build dependencies
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
python3 \
python3-pip \
ninja-build \
libopenblas-dev \
build-essential \
git

RUN mkdir /app
WORKDIR /app
COPY . /app

RUN python3 -m pip install --upgrade pip

RUN make deps && make build && make clean

# GitHub token for installing from the private fork (pass with --build-arg GH_TOKEN=...)
ARG GH_TOKEN
ENV GH_TOKEN=$GH_TOKEN

# Server host/port and default model path
ENV HOST=0.0.0.0
ENV PORT=8000
ENV MODEL=/app/models/mistral-7b-openorca.Q5_K_M.gguf

# # Install dependencies
# RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context psutil prometheus_client

# # Install llama-cpp-python (build with METAL)
# RUN CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install git+https://${GH_TOKEN}@github.com/ZenHubHQ/llama-cpp-python.git --force-reinstall --upgrade --no-cache-dir --verbose

# Expose a port for the server
EXPOSE 8000

# Run the server start script
CMD ["/bin/sh", "/app/docker/simple/run.sh"]
# CMD python3 -m llama_cpp.server --n_gpu_layers -1
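
Note: for a quick local test outside compose, the image can be built and run directly (the tag name here is illustrative): docker build -f dev.Dockerfile -t llama-cpp-dev . followed by docker run -p 8000:8000 llama-cpp-dev. The compose file below wires the same image into the shared service network.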
dev.docker-compose: 15 changes (15 additions, 0 deletions)
@@ -0,0 +1,15 @@
version: '3'
services:
  dev-llama-cpp-python:
    build:
      context: .
      dockerfile: dev.Dockerfile
    ports:
      - 8000:8000
    volumes:
      - ./llama_cpp:/app/llama_cpp
    networks:
      - zh-service-network
networks:
  zh-service-network:
    external: true
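
Note: external: true means compose will not create zh-service-network itself; it must already exist (created once with docker network create zh-service-network), or startup will fail with a missing-network error.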
docker/simple/run.sh: 3 changes (2 additions, 1 deletion)
@@ -1,4 +1,5 @@
#!/bin/bash

make build
uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT
# uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT --reload
python3 -m llama_cpp.server --model $MODEL --n_gpu_layers -1
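
Note: --n_gpu_layers -1 asks llama-cpp-python to offload all model layers to the GPU when one is available; on a CPU-only host the server still starts, with no layers offloaded.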
llama_cpp/_utils.py: 50 changes (49 additions, 1 deletion)
@@ -1,7 +1,9 @@
import os
import sys
import psutil
import subprocess

from typing import Any, Dict
from typing import Any, Dict, List, Tuple, Union

# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
outnull_file = open(os.devnull, "w")
@@ -75,3 +77,49 @@ class Singleton(object, metaclass=MetaSingleton):

    def __init__(self):
        super(Singleton, self).__init__()


# Helpers to snapshot CPU, RAM, and GPU usage for a given process.
# Adapted from: https://github.com/abetlen/llama-cpp-python/issues/223#issuecomment-1556203616
def get_cpu_usage(pid: int) -> float:
    """
    CPU usage in percentage by the given process.
    """
    process = psutil.Process(pid)
    # Without an interval, the first cpu_percent() call on a fresh Process
    # object always returns 0.0, so sample over a short window instead.
    return process.cpu_percent(interval=0.1)

def get_ram_usage(pid: int) -> float:
    """
    RAM usage in MiB by the given process.
    """
    process = psutil.Process(pid)
    ram_info = process.memory_info()
    ram_usage = ram_info.rss / (1024 * 1024)  # Convert bytes to MiB
    return ram_usage

def get_gpu_info_by_pid(pid: int) -> float:
    """
    GPU memory usage in MiB by the given process (0.0 if no GPU is available).
    """
    try:
        gpu_info = subprocess.check_output(
            ["nvidia-smi", "--query-compute-apps=pid,used_memory", "--format=csv,noheader"]
        ).decode("utf-8")
        for line in gpu_info.strip().split("\n"):
            if not line:  # nvidia-smi prints nothing when no compute apps are running
                continue
            gpu_pid, gpu_ram_usage = line.split(", ")
            if int(gpu_pid) == pid:
                return float(gpu_ram_usage.split()[0])
    except (subprocess.CalledProcessError, FileNotFoundError):
        pass
    return 0.0

def get_gpu_general_info() -> Tuple[float, float, float]:
    """
    General GPU info: utilization (%), memory used (MiB), memory free (MiB).
    Returns zeros if no GPU is available.
    """
    try:
        gpu_info = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=utilization.gpu,memory.used,memory.free", "--format=csv,noheader"]
        ).decode("utf-8")
        gpu_utilization, gpu_memory_used, gpu_memory_free = gpu_info.strip().split("\n")[0].split(", ")
        return tuple(float(tup.split()[0]) for tup in (gpu_utilization, gpu_memory_used, gpu_memory_free))
    except (subprocess.CalledProcessError, FileNotFoundError):
        pass
    return 0.0, 0.0, 0.0
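
As a usage sketch (not part of this diff), the helpers above can bracket a workload to measure its footprint; the workload placeholder below is hypothetical:

import os
from llama_cpp._utils import (
    get_cpu_usage,
    get_ram_usage,
    get_gpu_info_by_pid,
    get_gpu_general_info,
)

pid = os.getpid()
ram_before = get_ram_usage(pid)            # MiB

# ... run inference or any other workload here ...

cpu = get_cpu_usage(pid)                   # percent, sampled over a short window
ram_after = get_ram_usage(pid)             # MiB
gpu_proc_mem = get_gpu_info_by_pid(pid)    # MiB; 0.0 when no NVIDIA GPU is visible
gpu_util, gpu_used, gpu_free = get_gpu_general_info()

print(f"RAM delta: {ram_after - ram_before:.1f} MiB")
print(f"GPU memory (this process): {gpu_proc_mem:.1f} MiB, utilization: {gpu_util:.0f}%")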