diff --git a/.dockerignore b/.dockerignore index fd64c09b3..1b85f8c9a 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,6 +2,7 @@ _skbuild/ .envrc +# LLMs - comment if you'd like to bake the model into the image models/ # Byte-compiled / optimized / DLL files diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 8fbd68f4a..e736cf964 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-20.04, windows-2019, macos-11] + os: [ubuntu-20.04, windows-2019, macos-12] steps: - uses: actions/checkout@v4 @@ -29,7 +29,7 @@ jobs: python -m pip install -e .[all] - name: Build wheels - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.19.1 env: # disable repair CIBW_REPAIR_WHEEL_COMMAND: "" @@ -56,7 +56,7 @@ jobs: platforms: linux/arm64 - name: Build wheels - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.19.1 env: CIBW_SKIP: "*musllinux* pp*" CIBW_REPAIR_WHEEL_COMMAND: "" diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 4ebe3bb6d..b5c7346db 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -31,7 +31,7 @@ jobs: - name: Build and push id: docker_build - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . file: "docker/simple/Dockerfile" diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index ae9e8632c..78d0cba81 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -20,8 +20,8 @@ jobs: id: set-matrix run: | $matrix = @{ - 'os' = @('ubuntu-20.04', 'windows-latest') - 'pyver' = @("3.10", "3.11", "3.12") + 'os' = @('ubuntu-latest', 'windows-2019') + 'pyver' = @("3.9", "3.10", "3.11", "3.12") 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1") 'releasetag' = @("basic") } @@ -43,6 +43,12 @@ jobs: AVXVER: ${{ matrix.releasetag }} steps: + - name: Add MSBuild to PATH + if: runner.os == 'Windows' + uses: microsoft/setup-msbuild@v1.1 + with: + vs-version: '[16.11,16.12)' + - uses: actions/checkout@v4 with: submodules: "recursive" @@ -50,6 +56,7 @@ jobs: - uses: actions/setup-python@v5 with: python-version: ${{ matrix.pyver }} + cache: 'pip' - name: Setup Mamba uses: conda-incubator/setup-miniconda@v3.0.4 @@ -84,7 +91,7 @@ jobs: if: runner.os == 'Windows' run: | $y = (gi '.\MSBuildExtensions').fullname + '\*' - (gi 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_}) + (gi 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_}) $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_') echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV @@ -107,17 +114,17 @@ jobs: $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH } $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all' + $env:CMAKE_ARGS = '-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all' $env:CMAKE_ARGS = "-DLLAMA_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS" - if ($env:AVXVER -eq 'AVX') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off' - } - if ($env:AVXVER -eq 'AVX512') { - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX512=on' - } - if ($env:AVXVER -eq 'basic') 
{ - $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off' - } + # if ($env:AVXVER -eq 'AVX') { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off' + # } + # if ($env:AVXVER -eq 'AVX512') { + # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX512=on' + # } + # if ($env:AVXVER -eq 'basic') { + # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off' + # } python -m build --wheel # write the build tag to the output Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index fc798c84e..11ab795c8 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -6,81 +6,60 @@ permissions: contents: write jobs: - define_matrix: - name: Define Build Matrix - runs-on: ubuntu-latest - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - defaults: - run: - shell: pwsh - - steps: - - name: Define Job Output - id: set-matrix - run: | - $matrix = @{ - 'os' = @('macos-11', 'macos-12', 'macos-13') - 'pyver' = @('3.10', '3.11', '3.12') - } - - $matrixOut = ConvertTo-Json $matrix -Compress - Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT - build_wheels: - name: ${{ matrix.os }} Python ${{ matrix.pyver }} - needs: define_matrix + name: Build wheels on ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: - matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }} - env: - OSVER: ${{ matrix.os }} + matrix: + os: [macos-12, macos-13, macos-14] steps: - uses: actions/checkout@v4 with: submodules: "recursive" + # Used to host cibuildwheel - uses: actions/setup-python@v5 with: - python-version: ${{ matrix.pyver }} - - - name: Install Dependencies - run: | - python -m pip install build wheel cmake + python-version: "3.12" + cache: 'pip' - - name: Build Wheel + - name: Install dependencies run: | - XCODE15PATH="/Applications/Xcode_15.0.app/Contents/Developer" - XCODE15BINPATH="${XCODE15PATH}/Toolchains/XcodeDefault.xctoolchain/usr/bin" - export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_METAL=on" - [[ "$OSVER" == "macos-13" ]] && export CC="${XCODE15BINPATH}/cc" && export CXX="${XCODE15BINPATH}/c++" && export MACOSX_DEPLOYMENT_TARGET="13.0" - [[ "$OSVER" == "macos-12" ]] && export MACOSX_DEPLOYMENT_TARGET="12.0" - [[ "$OSVER" == "macos-11" ]] && export MACOSX_DEPLOYMENT_TARGET="11.0" + python -m pip install --upgrade pip + python -m pip install -e .[all] - export CMAKE_OSX_ARCHITECTURES="arm64" && export ARCHFLAGS="-arch arm64" - VERBOSE=1 python -m build --wheel - - if [[ "$OSVER" == "macos-13" ]]; then - export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk" - export MACOSX_DEPLOYMENT_TARGET="14.0" - VERBOSE=1 python -m build --wheel - fi - - for file in ./dist/*.whl; do cp "$file" "${file/arm64.whl/aarch64.whl}"; done + - name: Build wheels + uses: pypa/cibuildwheel@v2.19.1 + env: + # disable repair + CIBW_REPAIR_WHEEL_COMMAND: "" + CIBW_ARCHS: "arm64" + CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DLLAMA_METAL=on" + CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*" + with: + package-dir: . 
+ output-dir: wheelhouse2 - export CMAKE_OSX_ARCHITECTURES="x86_64" && export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_METAL=on" && export ARCHFLAGS="-arch x86_64" - VERBOSE=1 python -m build --wheel + - uses: actions/upload-artifact@v4 + with: + name: wheels-mac_${{ matrix.os }} + path: ./wheelhouse2/*.whl - if [[ "$OSVER" == "macos-13" ]]; then - export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk" - export MACOSX_DEPLOYMENT_TARGET="14.0" - VERBOSE=1 python -m build --wheel - fi + release: + name: Release + needs: [build_wheels] + runs-on: ubuntu-latest + steps: + - uses: actions/download-artifact@v4 + with: + merge-multiple: true + path: dist2 + - uses: softprops/action-gh-release@v2 with: - files: dist/* + files: dist2/* # set release name to -metal tag_name: ${{ github.ref_name }}-metal env: diff --git a/.github/workflows/publish-to-test.yaml b/.github/workflows/publish-to-test.yaml index 2bf0ea9ba..19613233b 100644 --- a/.github/workflows/publish-to-test.yaml +++ b/.github/workflows/publish-to-test.yaml @@ -22,7 +22,8 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.11" + cache: 'pip' - name: Append Dev Version to __version__ run: | DEV_VERSION=${{ github.event.inputs.dev_version }} @@ -31,11 +32,11 @@ jobs: sed -i 's/__version__ = \".*\"/__version__ = \"'"${NEW_VERSION}"'\"/' llama_cpp/__init__.py - name: Install dependencies run: | - python3 -m pip install --upgrade pip build - python3 -m pip install -e .[all] + python -m pip install --upgrade pip build + python -m pip install -e .[all] - name: Build source distribution run: | - python3 -m build --sdist + python -m build --sdist - name: Publish to Test PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index bc4ec9063..c6abb43b3 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -16,14 +16,14 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.9" - name: Install dependencies run: | - python3 -m pip install --upgrade pip build - python3 -m pip install -e .[all] + python -m pip install --upgrade pip build + python -m pip install -e .[all] - name: Build source distribution run: | - python3 -m build --sdist + python -m build --sdist - name: Publish distribution to PyPI # TODO: move to tag based releases # if: startsWith(github.ref, 'refs/tags') diff --git a/.github/workflows/test-pypi.yaml b/.github/workflows/test-pypi.yaml index aa8e8fa0b..d7131956d 100644 --- a/.github/workflows/test-pypi.yaml +++ b/.github/workflows/test-pypi.yaml @@ -8,57 +8,60 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + cache: 'pip' - name: Install dependencies run: | - python3 -m pip install --upgrade pip - python3 -m pip install --verbose llama-cpp-python[all] + python -m pip install --upgrade pip + python -m pip install --verbose llama-cpp-python[all] - name: Test with pytest run: | - python3 -c "import llama_cpp" + python -c "import llama_cpp" build-windows: runs-on: windows-latest strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + 
python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + cache: 'pip' - name: Install dependencies run: | - python3 -m pip install --upgrade pip - python3 -m pip install --verbose llama-cpp-python[all] + python -m pip install --upgrade pip + python -m pip install --verbose llama-cpp-python[all] - name: Test with pytest run: | - python3 -c "import llama_cpp" + python -c "import llama_cpp" build-macos: runs-on: macos-latest strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + cache: 'pip' - name: Install dependencies run: | - python3 -m pip install --upgrade pip - python3 -m pip install --verbose llama-cpp-python[all] + python -m pip install --upgrade pip + python -m pip install --verbose llama-cpp-python[all] - name: Test with pytest run: | - python3 -c "import llama_cpp" + python -c "import llama_cpp" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 292343c2b..9a938ae11 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 @@ -24,20 +24,21 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + cache: 'pip' - name: Install dependencies run: | - python3 -m pip install --upgrade pip - python3 -m pip install .[all] -v + python -m pip install --upgrade pip + python -m pip install .[all] -v - name: Test with pytest run: | - python3 -m pytest + python -m pytest build-windows: runs-on: windows-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 @@ -47,20 +48,21 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + cache: 'pip' - name: Install dependencies run: | - python3 -m pip install --upgrade pip - python3 -m pip install .[all] -v + python -m pip install --upgrade pip + python -m pip install .[all] -v - name: Test with pytest run: | - python3 -m pytest + python -m pytest build-macos: - runs-on: macos-13 + runs-on: macos-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 @@ -70,57 +72,32 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + cache: 'pip' - name: Install dependencies run: | - python3 -m pip install --upgrade pip - python3 -m pip install .[all] --verbose + python -m pip install --upgrade pip + python -m pip install .[all] --verbose - name: Test with pytest run: | - python3 -m pytest - - # build-linux-opencl: - - # runs-on: ubuntu-latest - - # steps: - # - uses: actions/checkout@v4 - # with: - # submodules: "recursive" - # - name: Set up Python 3.8 - # uses: actions/setup-python@v5 - # with: - # python-version: "3.8" - # - name: Set up OpenCL & CLBlast - # run: | - # wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null - # echo 
"deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list - # sudo apt-get update - # sudo apt-get install -y --no-install-recommends llvm intel-oneapi-runtime-opencl intel-oneapi-runtime-compilers libclblast-dev - # - name: Install dependencies - # run: | - # python3 -m pip install --upgrade pip - # CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install .[all] --verbose - # - name: Test with pytest - # run: | - # python3 -m pytest + python -m pytest build-macos-metal: - runs-on: macos-13 + runs-on: macos-latest steps: - uses: actions/checkout@v4 with: submodules: "recursive" - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.9" - name: Install dependencies run: | - python3 -m pip install --upgrade pip - CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install .[all] --verbose + python -m pip install --upgrade pip + CMAKE_ARGS="-DLLAMA_METAL=on" python -m pip install .[all] --verbose - name: Test with pytest run: | - python3 -m pytest + python -m pytest diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b995509e..a953af7e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,92 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.81] + +- feat: Update llama.cpp to ggerganov/llama.cpp@968967376dc2c018d29f897c4883d335bbf384fb +- fix(ci): Fix CUDA wheels, use LLAMA_CUDA instead of removed LLAMA_CUBLAS by @abetlen in 4fb6fc12a02a68884c25dd9f6a421cacec7604c6 +- fix(ci): Fix MacOS release, use macos-12 image instead of removed macos-11 by @abetlen in 3a551eb5263fdbd24b36d7770856374c04e92788 + +## [0.2.80] + +- feat: Update llama.cpp to ggerganov/llama.cpp@023b8807e10bc3ade24a255f01c1ad2a01bb4228 +- fix(server): Fix bug in FastAPI streaming response where dependency was released before request completes causing SEGFAULT by @abetlen in 296304b60bb83689659883c9cc24f4c074dd88ff +- fix(server): Update default config value for embeddings to False to fix error in text generation where logits were not allocated by llama.cpp by @abetlen in bf5e0bb4b151f4ca2f5a21af68eb832a96a79d75 +- fix(ci): Fix the CUDA workflow by @oobabooga in #1551 +- docs: Update readme examples to use newer Qwen2 model by @jncraton in #1544 + +## [0.2.79] + +- feat: Update llama.cpp to ggerganov/llama.cpp@9c77ec1d74874ee22bdef8f110e8e8d41389abf2 +- feat(ci): Update workflows and pre-built wheels by @Smartappli in #1416 +- feat: Add .close() method to Llama class to explicitly free model from memory by @jkawamoto in #1513 +- feat: Support SPM infill by @CISC in #1492 + +## [0.2.78] + +- feat: Update llama.cpp to ggerganov/llama.cpp@fd5ea0f897ecb3659d6c269ef6f3d833e865ead7 +- fix: Avoid duplicate special tokens in chat formats by @CISC in #1439 +- fix: fix logprobs when BOS is not present by @ghorbani in #1471 +- feat: adding rpc_servers parameter to Llama class by @chraac in #1477 + +## [0.2.77] + +- feat: Update llama.cpp to ggerganov/llama.cpp@bde7cd3cd949c1a85d3a199498ac98e78039d46f +- fix: string value kv_overrides by @abetlen in df45a4b3fe46e72664bda87301b318210c6d4782 +- fix: Fix typo in Llama3VisionAlphaChatHandler by @abetlen in 165b4dc6c188f8fda2fc616154e111f710484eba +- fix: Use numpy recarray for candidates data, fixes bug with temp < 0 by @abetlen in af3ed503e9ce60fe6b5365031abad4176a3536b3 +fix: Disable Windows+CUDA workaround when compiling for HIPBLAS by Engininja2 in #1493 + +## 
[0.2.76] + +- feat: Update llama.cpp to ggerganov/llama.cpp@0df0aa8e43c3378975269a51f9b876c8692e70da +- feat: Improve Llama.eval performance by avoiding list conversion by @thoughtp0lice in #1476 +- example: LLM inference with Ray Serve by @rgerganov in #1465 + +## [0.2.75] + +- feat: Update llama.cpp to ggerganov/llama.cpp@13ad16af1231ab2d245d35df3295bcfa23de1305 +- fix: segfault for models without eos / bos tokens by @abetlen in d99a6ba607a4885fb00e63e967964aa41bdbbbcb +- feat: add MinTokensLogitProcessor and min_tokens argument to server by @twaka in #1333 +- misc: Remove unnecessary metadata lookups by @CISC in #1448 + +## [0.2.74] + +- feat: Update llama.cpp to ggerganov/llama.cpp@b228aba91ac2cd9eb90e9d423ba1d0d20e0117e2 +- fix: Enable CUDA backend for llava by @abetlen in 7f59856fa6f3e23f07e12fc15aeb9359dc6c3bb4 +- docs: Fix typo in README.md by @yupbank in #1444 + +## [0.2.73] + +- feat: Update llama.cpp to ggerganov/llama.cpp@25c6e82e7a1ad25a42b0894e87d9b5c557409516 +- fix: Clear kv cache at beginning of image chat formats to avoid bug when image is evaluated first by @abetlen in ac55d0a175115d1e719672ce1cb1bec776c738b1 + +## [0.2.72] + +- fix(security): Remote Code Execution by Server-Side Template Injection in Model Metadata by @retr0reg in b454f40a9a1787b2b5659cd2cb00819d983185df +- fix(security): Update remaining jinja chat templates to use immutable sandbox by @CISC in #1441 + +## [0.2.71] + +- feat: Update llama.cpp to ggerganov/llama.cpp@911b3900dded9a1cfe0f0e41b82c7a29baf3a217 +- fix: Make leading bos_token optional for image chat formats, fix nanollava system message by @abetlen in 77122638b4153e31d9f277b3d905c2900b536632 +- fix: free last image embed in llava chat handler by @abetlen in 3757328b703b2cd32dcbd5853271e3a8c8599fe7 + +## [0.2.70] + +- feat: Update llama.cpp to ggerganov/llama.cpp@c0e6fbf8c380718102bd25fcb8d2e55f8f9480d1 +- feat: fill-in-middle support by @CISC in #1386 +- fix: adding missing args in create_completion for functionary chat handler by @skalade in #1430 +- docs: update README.md @eltociear in #1432 +- fix: chat_format log where auto-detected format prints None by @balvisio in #1434 +- feat(server): Add support for setting root_path by @abetlen in 0318702cdc860999ee70f277425edbbfe0e60419 +- feat(ci): Add docker checks and check deps more frequently by @Smartappli in #1426 +- fix: detokenization case where first token does not start with a leading space by @noamgat in #1375 +- feat: Implement streaming for Functionary v2 + Bug fixes by @jeffrey-fong in #1419 +- fix: Use memmove to copy str_value kv_override by @abetlen in 9f7a85571ae80d3b6ddbd3e1bae407b9f1e3448a +- feat(server): Remove temperature bounds checks for server by @abetlen in 0a454bebe67d12a446981eb16028c168ca5faa81 +- fix(server): Propagate flash_attn to model load by @dthuerck in #1424 + ## [0.2.69] - feat: Update llama.cpp to ggerganov/llama.cpp@6ecf3189e00a1e8e737a78b6d10e1d7006e050a2 diff --git a/CMakeLists.txt b/CMakeLists.txt index 70f9b9993..0c4fa026f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,9 +5,56 @@ project(llama_cpp) option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON) option(LLAVA_BUILD "Build llava shared library and install alongside python package" ON) +function(llama_cpp_python_install_target target) + install( + TARGETS ${target} + LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib + RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib + ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib 
+ FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib + RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib + ) + install( + TARGETS ${target} + LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib + RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib + ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib + FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib + RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib + ) + set_target_properties(${target} PROPERTIES + INSTALL_RPATH "$ORIGIN" + BUILD_WITH_INSTALL_RPATH TRUE + ) + if(UNIX) + if(APPLE) + set_target_properties(${target} PROPERTIES + INSTALL_RPATH "@loader_path" + BUILD_WITH_INSTALL_RPATH TRUE + ) + else() + set_target_properties(${target} PROPERTIES + INSTALL_RPATH "$ORIGIN" + BUILD_WITH_INSTALL_RPATH TRUE + ) + endif() + endif() +endfunction() + if (LLAMA_BUILD) set(BUILD_SHARED_LIBS "On") + set(CMAKE_SKIP_BUILD_RPATH FALSE) + + # When building, don't use the install RPATH already + # (but later on when installing) + set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) + + # Add the automatically determined parts of the RPATH + # which point to directories outside the build tree to the install RPATH + set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) + set(CMAKE_SKIP_RPATH FALSE) + # Building llama if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") # Need to disable these llama.cpp flags on Apple x86_64, @@ -23,36 +70,33 @@ if (LLAMA_BUILD) endif() add_subdirectory(vendor/llama.cpp) - install( - TARGETS llama - LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp - RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp - ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp - FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp - RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp - ) - # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374 - install( - TARGETS llama - LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - ) + llama_cpp_python_install_target(llama) + llama_cpp_python_install_target(ggml) + # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563 - install( - FILES $ - DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp - ) - install( - FILES $ - DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - ) + if (WIN32) + install( + FILES $ + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib + ) + install( + FILES $ + DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib + ) + install( + FILES $ + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib + ) + install( + FILES $ + DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib + ) + endif() if (LLAVA_BUILD) - if (LLAMA_CUBLAS) + if (LLAMA_CUBLAS OR LLAMA_CUDA) add_compile_definitions(GGML_USE_CUBLAS) + add_compile_definitions(GGML_USE_CUDA) endif() if (LLAMA_METAL) @@ -66,22 +110,16 @@ if (LLAMA_BUILD) if (WIN32) set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF) endif() - install( - TARGETS llava_shared - LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp - RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp - ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp - FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp - RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp - ) - # Temporary fix for 
https://github.com/scikit-build/scikit-build-core/issues/374 - install( - TARGETS llava_shared - LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - ) + llama_cpp_python_install_target(llava_shared) + if (WIN32) + install( + FILES $ + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib + ) + install( + FILES $ + DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib + ) + endif() endif() -endif() +endif() \ No newline at end of file diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md new file mode 100644 index 000000000..da84dda5c --- /dev/null +++ b/INSTRUCTIONS.md @@ -0,0 +1,35 @@ +## Syncing with upstream repo + +See [here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork#syncing-a-fork-branch-from-the-web-ui) for more details. + +1. On the GitHub UI, create a new branch `repo-sync`, if the branch doesn't exist already. + +2. Click on the "Sync fork" button and then click on the "Update branch" button. This will import all the commits from the upstream repo. + +3. Create a local branch `repo-sync` and pull the contents from the remote `repo-sync` branch. + +4. Solve for any conflicts if they arise. Otherwise, proceed to the next step. + +5. Update all the git submodles: + +``` +git submodule update --recursive +``` + +6. Since changes have probably been made to the vendor libraries (`llama_cpp`, `kompute`), we need to recompile the `llama_cpp` package. Navigate to the `vendor/llama.cpp` folder and clean the build cache: + +``` +make clean +``` +6. Navigate back to the root directory and type the following to recompile the `llama_cpp` package and build the dependenies again: + +``` +make deps && make build +``` +7. Launch the `llama_cpp_python` server using the following command: +``` +python -m llama_cpp.server --model $MODEL --n_gpu_layers -1 +``` +NOTE: Modify the launch arguments as needed. Make sure the `MODEL` environment variable points to an absolute path containing a `.gguf` model. + +8. If the server launches without issues, then you can proceed to create a PR with the latest changes \ No newline at end of file diff --git a/Makefile b/Makefile index 4ae011074..90f562fa1 100644 --- a/Makefile +++ b/Makefile @@ -13,13 +13,16 @@ build: python3 -m pip install --verbose -e . build.debug: - CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" python3 -m pip install --verbose --config-settings=cmake.verbose=true --config-settings=logging.level=INFO --config-settings=install.strip=false --editable . + python3 -m pip install \ + --verbose \ + --config-settings=cmake.verbose=true \ + --config-settings=logging.level=INFO \ + --config-settings=install.strip=false \ + --config-settings=cmake.args="-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_C_FLAGS='-ggdb -O0';-DCMAKE_CXX_FLAGS='-ggdb -O0'" \ + --editable . build.cuda: - CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3 -m pip install --verbose -e . - -build.opencl: - CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e . + CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e . build.openblas: CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" python3 -m pip install --verbose -e . @@ -39,6 +42,9 @@ build.kompute: build.sycl: CMAKE_ARGS="-DLLAMA_SYCL=on" python3 -m pip install --verbose -e . 
+build.rpc: + CMAKE_ARGS="-DLLAMA_RPC=on" python3 -m pip install --verbose -e . + build.sdist: python3 -m build --sdist diff --git a/README.md b/README.md index 640671460..4a710d4e5 100644 --- a/README.md +++ b/README.md @@ -165,17 +165,6 @@ pip install llama-cpp-python \ --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal ``` - -
- -CLBlast (OpenCL) - -To install with CLBlast, set the `LLAMA_CLBLAST=on` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python -``` -
@@ -221,6 +210,17 @@ CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pi ```
+
+RPC + +To install with RPC support, set the `LLAMA_RPC=on` environment variable before installing: + +```bash +CMAKE_ARGS="-DLLAMA_RPC=on" pip install llama-cpp-python +```
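A minimal usage sketch for the RPC support added in this change: the `rpc_servers` argument introduced on `Llama.__init__` takes a comma-separated list of llama.cpp RPC backends. The model path, host addresses, and port below are placeholders, and this assumes the package was built with `-DLLAMA_RPC=on` (e.g. via `make build.rpc`).

```python
from llama_cpp import Llama

# Sketch: offload work to remote llama.cpp RPC backends.
# Assumes an RPC-enabled build and reachable rpc-server instances;
# the addresses, port, and model path are placeholders.
llm = Llama(
    model_path="./models/model.gguf",
    n_gpu_layers=-1,
    rpc_servers="192.168.1.10:50052,192.168.1.11:50052",  # comma-separated list
)

output = llm("Q: Name the planets in the solar system. A:", max_tokens=32)
print(output["choices"][0]["text"])
```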
+ ### Windows Notes @@ -327,7 +327,7 @@ You'll need to install the `huggingface-hub` package to use this feature (`pip i ```python llm = Llama.from_pretrained( - repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF", + repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF", filename="*q8_0.gguf", verbose=False ) @@ -499,13 +499,16 @@ llm = Llama.from_pretrained( `llama-cpp-python` supports such as llava1.5 which allow the language model to read information from both text and images. -You'll first need to download one of the available multi-modal models in GGUF format: +Below are the supported multi-modal models and their respective chat handlers (Python API) and chat formats (Server API). -- [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) -- [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) -- [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1) -- [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf) -- [moondream2](https://huggingface.co/vikhyatk/moondream2) +| Model | `LlamaChatHandler` | `chat_format` | +|:--- |:--- |:--- | +| [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) | `Llava15ChatHandler` | `llava-1-5` | +| [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) | `Llava15ChatHandler` | `llava-1-5` | +| [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf) | `Llava16ChatHandler` | `llava-1-6` | +| [moondream2](https://huggingface.co/vikhyatk/moondream2) | `MoondreamChatHandler` | `moondream2` | +| [nanollava](https://huggingface.co/abetlen/nanollava-gguf) | `NanollavaChatHandler` | `nanollava` | +| [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` | Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. @@ -516,7 +519,7 @@ chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin") llm = Llama( model_path="./path/to/llava/llama-model.gguf", chat_handler=chat_handler, - n_ctx=2048, # n_ctx should be increased to accomodate the image embedding + n_ctx=2048, # n_ctx should be increased to accommodate the image embedding ) llm.create_chat_completion( messages = [ @@ -547,10 +550,10 @@ llm = Llama.from_pretrained( repo_id="vikhyatk/moondream2", filename="*text-model*", chat_handler=chat_handler, - n_ctx=2048, # n_ctx should be increased to accomodate the image embedding + n_ctx=2048, # n_ctx should be increased to accommodate the image embedding ) -respoonse = llm.create_chat_completion( +response = llm.create_chat_completion( messages = [ { "role": "user", @@ -685,7 +688,7 @@ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_ If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub. 
```bash -python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen1.5-0.5B-Chat-GGUF --model '*q8_0.gguf' +python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --model '*q8_0.gguf' ``` ### Web Server Features diff --git a/dev.Dockerfile b/dev.Dockerfile new file mode 100644 index 000000000..24f5be727 --- /dev/null +++ b/dev.Dockerfile @@ -0,0 +1,44 @@ +# Define the image argument and provide a default value +ARG IMAGE=python:3.11.8 + +# Use the image as specified +FROM ${IMAGE} + +# Re-declare the ARG after FROM +ARG IMAGE + +# Update and upgrade the existing packages +RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + python3 \ + python3-pip \ + ninja-build \ + libopenblas-dev \ + build-essential \ + git + +RUN mkdir /app +WORKDIR /app +COPY . /app + +RUN python3 -m pip install --upgrade pip + +RUN make deps && make build && make clean + +# Set environment variable for the host +ENV GH_TOKEN=$GH_TOKEN +ENV HOST=0.0.0.0 +ENV PORT=8000 +ENV MODEL=/app/models/mistral-7b-openorca.Q5_K_M.gguf + +# # Install depencencies +# RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context psutil prometheus_client + +# # Install llama-cpp-python (build with METAL) +# RUN CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install git+https://${GH_TOKEN}@github.com/ZenHubHQ/llama-cpp-python.git --force-reinstall --upgrade --no-cache-dir --verbose + +# Expose a port for the server +EXPOSE 8000 + +# Run the server start script +CMD ["/bin/sh", "/app/docker/simple/run.sh"] +# CMD python3 -m llama_cpp.server --n_gpu_layers -1 diff --git a/dev.docker-compose b/dev.docker-compose new file mode 100644 index 000000000..7b21e468a --- /dev/null +++ b/dev.docker-compose @@ -0,0 +1,15 @@ +version: '3' +services: + dev-llama-cpp-python: + build: + context: . 
+ dockerfile: dev.Dockerfile + ports: + - 8000:8000 + volumes: + - ./llama_cpp:/app/llama_cpp + networks: + - zh-service-network +networks: + zh-service-network: + external: true \ No newline at end of file diff --git a/docker/simple/run.sh b/docker/simple/run.sh index c85e73d2b..d4fd489a0 100644 --- a/docker/simple/run.sh +++ b/docker/simple/run.sh @@ -1,4 +1,5 @@ #!/bin/bash make build -uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT +# uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT --reload +python3 -m llama_cpp.server --model $MODEL --n_gpu_layers -1 \ No newline at end of file diff --git a/examples/high_level_api/high_level_api_infill.py b/examples/high_level_api/high_level_api_infill.py new file mode 100644 index 000000000..27af6367e --- /dev/null +++ b/examples/high_level_api/high_level_api_infill.py @@ -0,0 +1,33 @@ +import argparse + +from llama_cpp import Llama + +parser = argparse.ArgumentParser() +parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") +parser.add_argument("-p", "--prompt", type=str, default="def add(") +parser.add_argument("-s", "--suffix", type=str, default="\n return sum\n\n") +parser.add_argument("-i", "--spm-infill", action='store_true') +args = parser.parse_args() + +llm = Llama(model_path=args.model, n_gpu_layers=-1, spm_infill=args.spm_infill) + +output = llm.create_completion( + temperature = 0.0, + repeat_penalty = 1.0, + prompt = args.prompt, + suffix = args.suffix, +) + +# Models sometimes repeat suffix in response, attempt to filter that +response = output["choices"][0]["text"] +response_stripped = response.rstrip() +unwanted_response_suffix = args.suffix.rstrip() +unwanted_response_length = len(unwanted_response_suffix) + +filtered = False +if unwanted_response_suffix and response_stripped[-unwanted_response_length:] == unwanted_response_suffix: + response = response_stripped[:-unwanted_response_length] + filtered = True + +print(f"Fill-in-Middle completion{' (filtered)' if filtered else ''}:\n\n{args.prompt}\033[32m{response}\033[{'33' if filtered else '0'}m{args.suffix}\033[0m") + diff --git a/examples/ray/README.md b/examples/ray/README.md new file mode 100644 index 000000000..6e338ba17 --- /dev/null +++ b/examples/ray/README.md @@ -0,0 +1,19 @@ +This is an example of doing LLM inference with [Ray](https://docs.ray.io/en/latest/index.html) and [Ray Serve](https://docs.ray.io/en/latest/serve/index.html). + +First, install the requirements: + +```bash +$ pip install -r requirements.txt +``` + +Deploy a GGUF model to Ray Serve with the following command: + +```bash +$ serve run llm:llm_builder model_path='../models/mistral-7b-instruct-v0.2.Q4_K_M.gguf' +``` + +This will start an API endpoint at `http://localhost:8000/`. 
You can query the model like this: + +```bash +$ curl -k -d '{"prompt": "tell me a joke", "max_tokens": 128}' -X POST http://localhost:8000 +``` diff --git a/examples/ray/llm.py b/examples/ray/llm.py new file mode 100755 index 000000000..27b15d21d --- /dev/null +++ b/examples/ray/llm.py @@ -0,0 +1,20 @@ +from starlette.requests import Request +from typing import Dict +from ray import serve +from ray.serve import Application +from llama_cpp import Llama + +@serve.deployment +class LlamaDeployment: + def __init__(self, model_path: str): + self._llm = Llama(model_path=model_path) + + async def __call__(self, http_request: Request) -> Dict: + input_json = await http_request.json() + prompt = input_json["prompt"] + max_tokens = input_json.get("max_tokens", 64) + return self._llm(prompt, max_tokens=max_tokens) + + +def llm_builder(args: Dict[str, str]) -> Application: + return LlamaDeployment.bind(args["model_path"]) diff --git a/examples/ray/requirements.txt b/examples/ray/requirements.txt new file mode 100644 index 000000000..a409fb882 --- /dev/null +++ b/examples/ray/requirements.txt @@ -0,0 +1,3 @@ +ray[serve] +--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu +llama-cpp-python diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 95c881966..f87e2a388 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.69" \ No newline at end of file +__version__ = "0.2.81" \ No newline at end of file diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index b404601d3..423c1b5bc 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -9,6 +9,7 @@ Sequence, ) from dataclasses import dataclass, field +from contextlib import ExitStack import numpy as np import numpy.typing as npt @@ -27,9 +28,6 @@ class _LlamaModel: """Intermediate Python wrapper for a llama.cpp llama_model. 
NOTE: For stability it's recommended you use the Llama class instead.""" - _llama_free_model = None - # NOTE: this must be "saved" here to avoid exceptions when calling __del__ - def __init__( self, *, @@ -40,8 +38,7 @@ def __init__( self.path_model = path_model self.params = params self.verbose = verbose - - self._llama_free_model = llama_cpp._lib.llama_free_model # type: ignore + self._exit_stack = ExitStack() self.model = None @@ -56,11 +53,20 @@ def __init__( if self.model is None: raise ValueError(f"Failed to load model from file: {path_model}") - def __del__(self): - if self.model is not None and self._llama_free_model is not None: - self._llama_free_model(self.model) + def free_model(): + if self.model is None: + return + llama_cpp.llama_free_model(self.model) self.model = None + self._exit_stack.callback(free_model) + + def close(self): + self._exit_stack.close() + + def __del__(self): + self.close() + def vocab_type(self) -> int: assert self.model is not None return llama_cpp.llama_vocab_type(self.model) @@ -128,9 +134,9 @@ def token_get_score(self, token: int) -> float: assert self.model is not None return llama_cpp.llama_token_get_score(self.model, token) - def token_get_type(self, token: int) -> int: + def token_get_attr(self, token: int) -> int: assert self.model is not None - return llama_cpp.llama_token_get_type(self.model, token) + return llama_cpp.llama_token_get_attr(self.model, token) # Special tokens @@ -142,6 +148,14 @@ def token_eos(self) -> int: assert self.model is not None return llama_cpp.llama_token_eos(self.model) + def token_cls(self) -> int: + assert self.model is not None + return llama_cpp.llama_token_cls(self.model) + + def token_sep(self) -> int: + assert self.model is not None + return llama_cpp.llama_token_sep(self.model) + def token_nl(self) -> int: assert self.model is not None return llama_cpp.llama_token_nl(self.model) @@ -162,6 +176,14 @@ def token_eot(self) -> int: assert self.model is not None return llama_cpp.llama_token_eot(self.model) + def add_bos_token(self) -> int: + assert self.model is not None + return llama_cpp.llama_add_bos_token(self.model) + + def add_eos_token(self) -> int: + assert self.model is not None + return llama_cpp.llama_add_eos_token(self.model) + # Tokenization def tokenize(self, text: bytes, add_bos: bool, special: bool): @@ -241,8 +263,6 @@ class _LlamaContext: """Intermediate Python wrapper for a llama.cpp llama_context. 
NOTE: For stability it's recommended you use the Llama class instead.""" - _llama_free = None - def __init__( self, *, @@ -253,24 +273,31 @@ def __init__( self.model = model self.params = params self.verbose = verbose + self._exit_stack = ExitStack() - self._llama_free = llama_cpp._lib.llama_free # type: ignore self.ctx = None assert self.model.model is not None - self.ctx = llama_cpp.llama_new_context_with_model( - self.model.model, self.params - ) + self.ctx = llama_cpp.llama_new_context_with_model(self.model.model, self.params) if self.ctx is None: raise ValueError("Failed to create llama_context") - def __del__(self): - if self.ctx is not None and self._llama_free is not None: - self._llama_free(self.ctx) + def free_ctx(): + if self.ctx is None: + return + llama_cpp.llama_free(self.ctx) self.ctx = None + self._exit_stack.callback(free_ctx) + + def close(self): + self._exit_stack.close() + + def __del__(self): + self.close() + def n_ctx(self) -> int: assert self.ctx is not None return llama_cpp.llama_n_ctx(self.ctx) @@ -485,8 +512,6 @@ def default_params(): class _LlamaBatch: - _llama_batch_free = None - def __init__( self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True ): @@ -494,19 +519,27 @@ def __init__( self.embd = embd self.n_seq_max = n_seq_max self.verbose = verbose - - self._llama_batch_free = llama_cpp._lib.llama_batch_free # type: ignore + self._exit_stack = ExitStack() self.batch = None self.batch = llama_cpp.llama_batch_init( self._n_tokens, self.embd, self.n_seq_max ) - def __del__(self): - if self.batch is not None and self._llama_batch_free is not None: - self._llama_batch_free(self.batch) + def free_batch(): + if self.batch is None: + return + llama_cpp.llama_batch_free(self.batch) self.batch = None + self._exit_stack.callback(free_batch) + + def close(self): + self._exit_stack.close() + + def __del__(self): + self.close() + def n_tokens(self) -> int: assert self.batch is not None return self.batch.n_tokens @@ -545,13 +578,12 @@ def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool): class _LlamaTokenDataArray: def __init__(self, *, n_vocab: int): self.n_vocab = n_vocab - self.candidates_data = np.array( - [], + self.candidates_data = np.recarray( + (self.n_vocab,), dtype=np.dtype( [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True ), ) - self.candidates_data.resize(3, self.n_vocab, refcheck=False) self.candidates = llama_cpp.llama_token_data_array( data=self.candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p), size=self.n_vocab, @@ -561,14 +593,11 @@ def __init__(self, *, n_vocab: int): self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single) def copy_logits(self, logits: npt.NDArray[np.single]): - self.candidates_data["id"][:] = self.default_candidates_data_id - self.candidates_data["logit"][:] = logits - self.candidates_data["p"][:] = self.default_candidates_data_p - self.candidates.data = self.candidates_data.ctypes.data_as( - llama_cpp.llama_token_data_p - ) - self.candidates.sorted = ctypes.c_bool(False) - self.candidates.size = ctypes.c_size_t(self.n_vocab) + self.candidates_data.id[:] = self.default_candidates_data_id + self.candidates_data.logit[:] = logits + self.candidates_data.p[:] = self.default_candidates_data_p + self.candidates.sorted = False + self.candidates.size = self.n_vocab # Python wrappers over common/common @@ -759,14 +788,14 @@ def sample( self.params.penalty_present, ) if not self.params.penalize_nl: - token_data_array.candidates_data["logit"][nl_token] = 
nl_logit + token_data_array.candidates_data.logit[nl_token] = nl_logit if self.grammar is not None: ctx_main.sample_grammar(token_data_array, self.grammar) if self.params.temp < 0: ctx_main.sample_softmax(token_data_array) - id = token_data_array.candidates_data["id"][0] + id = token_data_array.candidates_data.id[0] elif self.params.temp == 0: id = ctx_main.sample_token_greedy(token_data_array) else: diff --git a/llama_cpp/_logger.py b/llama_cpp/_logger.py index 7638170a9..f3407c92b 100644 --- a/llama_cpp/_logger.py +++ b/llama_cpp/_logger.py @@ -1,6 +1,7 @@ import sys import ctypes import logging +import logging.config import llama_cpp @@ -17,8 +18,38 @@ 5: logging.DEBUG, } -logger = logging.getLogger("llama-cpp-python") +UVICORN_LOGGING_CONFIG = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "standard": {"format": "%(asctime)s [%(levelname)s] %(message)s"}, + }, + "handlers": { + "default": { + "level": "INFO", + "formatter": "standard", + "class": "logging.StreamHandler", + "stream": "ext://sys.stdout", # Default is stderr + }, + }, + "loggers": { + "uvicorn.error": { + "level": "DEBUG", + "handlers": ["default"], + }, + "uvicorn.access": { + "level": "DEBUG", + "handlers": ["default"], + }, + }, +} +# Set up llama-cpp-python logger matching the format of uvicorn logger +logger = logging.getLogger("llama-cpp-python") +handler = logging.StreamHandler() +formatter = logging.Formatter("%(asctime)s - [%(levelname)s] - %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) @llama_cpp.llama_log_callback def llama_log_callback( @@ -27,7 +58,13 @@ def llama_log_callback( user_data: ctypes.c_void_p, ): if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]: - print(text.decode("utf-8"), end="", flush=True, file=sys.stderr) + _text = text.decode("utf-8") + if _text.endswith("\n"): + _text = _text[:-1] + + # Skip if the message only contains "." + if not _text == ".": + logger.log(GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level], _text) llama_cpp.llama_log_set(llama_log_callback, ctypes.c_void_p(0)) diff --git a/llama_cpp/_utils.py b/llama_cpp/_utils.py index 7ab94964b..14d0542fc 100644 --- a/llama_cpp/_utils.py +++ b/llama_cpp/_utils.py @@ -1,9 +1,13 @@ import os import sys import psutil +import asyncio import subprocess -from typing import Any, Dict, List +from typing import Any, Dict, List, Tuple, Union + +from llama_cpp.llama_metrics import QueueMetrics, MetricsExporter + # Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor outnull_file = open(os.devnull, "w") @@ -12,6 +16,7 @@ STDOUT_FILENO = 1 STDERR_FILENO = 2 + class suppress_stdout_stderr(object): # NOTE: these must be "saved" here to avoid exceptions when using # this context manager inside of a __del__ method @@ -88,6 +93,7 @@ def get_cpu_usage(pid) -> float: process = psutil.Process(pid) return process.cpu_percent() + def get_ram_usage(pid) -> float: """ RAM usage in MiB by the current process. 
@@ -97,12 +103,19 @@ def get_ram_usage(pid) -> float: ram_usage = ram_info.rss / (1024 * 1024) # Convert to MiB return ram_usage + def get_gpu_info_by_pid(pid) -> float: """ GPU memory usage by the current process (if GPU is available) """ try: - gpu_info = subprocess.check_output(["nvidia-smi", "--query-compute-apps=pid,used_memory", "--format=csv,noheader"]).decode("utf-8") + gpu_info = subprocess.check_output( + [ + "nvidia-smi", + "--query-compute-apps=pid,used_memory", + "--format=csv,noheader", + ] + ).decode("utf-8") gpu_info = gpu_info.strip().split("\n") for info in gpu_info: gpu_pid, gpu_ram_usage = info.split(", ") @@ -112,36 +125,74 @@ def get_gpu_info_by_pid(pid) -> float: pass return 0.0 -def get_gpu_general_info() -> tuple[float, float, float]: + +def get_gpu_general_info() -> Tuple[float, float, float]: """ GPU general info (if GPU is available) """ try: - gpu_info = subprocess.check_output(["nvidia-smi", "--query-gpu=utilization.gpu,memory.used,memory.free", "--format=csv,noheader"]).decode("utf-8") - gpu_utilization, gpu_memory_used, gpu_memory_free = gpu_info.strip().split("\n")[0].split(", ") - return tuple(float(tup.split()[0]) for tup in [gpu_utilization, gpu_memory_used, gpu_memory_free]) + gpu_info = subprocess.check_output( + [ + "nvidia-smi", + "--query-gpu=utilization.gpu,memory.used,memory.free", + "--format=csv,noheader", + ] + ).decode("utf-8") + gpu_utilization, gpu_memory_used, gpu_memory_free = ( + gpu_info.strip().split("\n")[0].split(", ") + ) + return tuple( + float(tup.split()[0]) + for tup in [gpu_utilization, gpu_memory_used, gpu_memory_free] + ) except (subprocess.CalledProcessError, FileNotFoundError): pass return 0.0, 0.0, 0.0 -def infer_service_from_prompt(prompt: str | List[str]): + +async def monitor_task_queue( + status_dict: Dict[str, Union[int, float]], metrics_exporter: MetricsExporter +): """ - Infer the service for which a completion request is sent based on the prompt. + An asynchronous function that monitors the task queue and updates + a shared status dictionary with the number of tasks that have not + started and the number of tasks that are currently running. + It recursively calls itself to continuously monitor the task queue. + NOTE: There will always be 3 tasks running in the task queue: + - LifespanOn.main: Main application coroutine + - Server.serve: Server coroutine + - monitor_task_queue: Task queue monitoring coroutine + Any upcoming requests will be added to the task queue in the form of + another RequestReponseCycle.run_asgi coroutine. """ - LABEL_SUGGESTIONS_TASK = "Your task is to select the most relevant labels for a GitHub issue title from a list of labels provided." - ACCEPTANCE_CRITERIA_TASK = "Your task is to write the acceptance criteria for a GitHub issue." - SPRINT_REVIEW_TASK = "You are helping me prepare a sprint review." 
- - if isinstance(prompt, list): - prompt = " ".join(prompt) - - if LABEL_SUGGESTIONS_TASK in prompt: - return "label-suggestions" - - elif ACCEPTANCE_CRITERIA_TASK in prompt: - return "acceptance-criteria" - - elif SPRINT_REVIEW_TASK in prompt: - return "sprint-review" - - return "not-specified" + if not isinstance(metrics_exporter, MetricsExporter): + raise ValueError("metrics_exporter must be an instance of MetricsExporter") + + all_tasks = asyncio.all_tasks() + + # Get count of all running tasks + _all_tasks = [task for task in all_tasks if task._state == "PENDING"] + status_dict["running_tasks_count"] = len(_all_tasks) + # Get basic metadata of all running tasks + status_dict["running_tasks"] = { + task.get_name(): str(task.get_coro()) + .lstrip("\u003C") + .rstrip("\u003E") + for task in all_tasks + } + + assert status_dict is not None + + # Register current running tasks as a Prometheus metric + _labels = { + "service": "general", + "request_type": "health_check", + } + _queue_metrics = QueueMetrics(**status_dict) + metrics_exporter.log_queue_metrics(_queue_metrics, _labels) + + await asyncio.sleep(5) # adds a delay of 5 seconds to avoid overloading the CPU + + asyncio.create_task( + monitor_task_queue(status_dict, metrics_exporter) + ) # pass status_dict to the next task diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 432f4db3b..476d75d54 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -6,8 +6,12 @@ import time import json import ctypes +import typing import fnmatch +import warnings +import contextlib import multiprocessing +from types import TracebackType from typing import ( List, @@ -19,6 +23,7 @@ Deque, Callable, Dict, + Type, ) from collections import deque from pathlib import Path @@ -38,14 +43,13 @@ import llama_cpp.llama_cpp as llama_cpp import llama_cpp.llama_chat_format as llama_chat_format -from llama_cpp.llama_metrics import Metrics, MetricsExporter +from llama_cpp.llama_metrics import RequestMetrics from llama_cpp._utils import ( - infer_service_from_prompt, get_cpu_usage, get_ram_usage, get_gpu_info_by_pid, - get_gpu_general_info, + get_gpu_general_info ) from llama_cpp.llama_speculative import LlamaDraftModel @@ -62,7 +66,7 @@ _LlamaSamplingContext, # type: ignore _normalize_embedding, # type: ignore ) -from ._logger import set_verbose +from ._logger import set_verbose, logger from ._utils import suppress_stdout_stderr @@ -80,6 +84,7 @@ def __init__( split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER, main_gpu: int = 0, tensor_split: Optional[List[float]] = None, + rpc_servers: Optional[str] = None, vocab_only: bool = False, use_mmap: bool = True, use_mlock: bool = False, @@ -122,6 +127,7 @@ def __init__( type_k: Optional[int] = None, type_v: Optional[int] = None, # Misc + spm_infill: bool = False, verbose: bool = True, # Extra Params **kwargs, # type: ignore @@ -158,6 +164,7 @@ def __init__( split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options. main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split. + rpc_servers: Comma separated list of RPC servers to use for offloading vocab_only: Only load the vocabulary no weights. use_mmap: Use mmap if possible. use_mlock: Force the system to keep the model in RAM. 
@@ -191,6 +198,7 @@ def __init__( verbose: Print verbose output to stderr. type_k: KV cache data type for K (default: f16) type_v: KV cache data type for V (default: f16) + spm_infill: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. Raises: ValueError: If the model path does not exist. @@ -229,6 +237,11 @@ def __init__( ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers self.model_params.split_mode = split_mode self.model_params.main_gpu = main_gpu + if rpc_servers is not None: + self.model_params.rpc_servers = rpc_servers.encode('utf-8') + self._rpc_servers = rpc_servers + else: + self._rpc_servers = None self.tensor_split = tensor_split self._c_tensor_split = None if self.tensor_split is not None: @@ -259,13 +272,13 @@ def __init__( self._kv_overrides_array[i].key = k.encode("utf-8") if isinstance(v, bool): self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL - self._kv_overrides_array[i].value.bool_value = v + self._kv_overrides_array[i].value.val_bool = v elif isinstance(v, int): self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT - self._kv_overrides_array[i].value.int_value = v + self._kv_overrides_array[i].value.val_i64 = v elif isinstance(v, float): self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT - self._kv_overrides_array[i].value.float_value = v + self._kv_overrides_array[i].value.val_f64 = v elif isinstance(v, str): # type: ignore v_bytes = v.encode("utf-8") if len(v_bytes) > 128: # TODO: Make this a constant @@ -273,10 +286,12 @@ def __init__( v_bytes = v_bytes.ljust(128, b"\0") self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR # copy min(v_bytes, 128) to str_value + address = typing.cast(int, ctypes.addressof(self._kv_overrides_array[i].value) + llama_cpp.llama_model_kv_override_value.val_str.offset) + buffer_start = ctypes.cast(address, ctypes.POINTER(ctypes.c_char)) ctypes.memmove( - self._kv_overrides_array[i].value.str_value, + buffer_start, v_bytes, - min(len(v_bytes), 128), + 128, ) else: raise ValueError(f"Unknown value type for {k}: {v}") @@ -342,12 +357,16 @@ def __init__( self.lora_scale = lora_scale self.lora_path = lora_path + self.spm_infill = spm_infill + if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") - self._model = _LlamaModel( + self._stack = contextlib.ExitStack() + + self._model = self._stack.enter_context(contextlib.closing(_LlamaModel( path_model=self.model_path, params=self.model_params, verbose=self.verbose - ) + ))) # Override tokenizer self.tokenizer_ = tokenizer or LlamaTokenizer(self) @@ -359,18 +378,18 @@ def __init__( self.context_params.n_ctx = self._model.n_ctx_train() self.context_params.n_batch = self.n_batch - self._ctx = _LlamaContext( + self._ctx = self._stack.enter_context(contextlib.closing(_LlamaContext( model=self._model, params=self.context_params, verbose=self.verbose, - ) + ))) - self._batch = _LlamaBatch( + self._batch = self._stack.enter_context(contextlib.closing(_LlamaBatch( n_tokens=self.n_batch, embd=0, n_seq_max=self.context_params.n_ctx, verbose=self.verbose, - ) + ))) if self.lora_path: if self._model.apply_lora_from_file( @@ -384,10 +403,11 @@ def __init__( ) if self.verbose: - print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + logger.info(f'System info: {llama_cpp.llama_print_system_info().decode("utf-8")}') self.chat_format = chat_format self.chat_handler = chat_handler + self._chat_handlers: 
Dict[str, llama_chat_format.LlamaChatCompletionHandler] = {} self.draft_model = draft_model @@ -414,15 +434,38 @@ def __init__( except Exception as e: self.metadata = {} if self.verbose: - print(f"Failed to load metadata: {e}", file=sys.stderr) + logger.error(f"Failed to load metadata: {e}") if self.verbose: - print(f"Model metadata: {self.metadata}", file=sys.stderr) + logger.info(f"Model metadata: {self.metadata}") + + eos_token_id = self.token_eos() + bos_token_id = self.token_bos() + + eos_token = self._model.token_get_text(eos_token_id) if eos_token_id != -1 else "" + bos_token = self._model.token_get_text(bos_token_id) if bos_token_id != -1 else "" + + # Unfortunately the llama.cpp API does not return metadata arrays, so we can't get template names from tokenizer.chat_templates + template_choices = dict((name[10:], template) for name, template in self.metadata.items() if name.startswith("tokenizer.chat_template.")) + + if "tokenizer.chat_template" in self.metadata: + template_choices["chat_template.default"] = self.metadata["tokenizer.chat_template"] + + if self.verbose and template_choices: + logger.info(f"Available chat formats from metadata: {', '.join(template_choices.keys())}") + + for name, template in template_choices.items(): + self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter( + template=template, + eos_token=eos_token, + bos_token=bos_token, + stop_token_ids=[eos_token_id], + ).to_chat_handler() if ( self.chat_format is None and self.chat_handler is None - and "tokenizer.chat_template" in self.metadata + and "chat_template.default" in template_choices ): chat_format = llama_chat_format.guess_chat_format_from_gguf_metadata( self.metadata @@ -431,41 +474,20 @@ def __init__( if chat_format is not None: self.chat_format = chat_format if self.verbose: - print(f"Guessed chat format: {chat_format}", file=sys.stderr) + logger.info(f"Guessed chat format: {chat_format}") else: - template = self.metadata["tokenizer.chat_template"] - try: - eos_token_id = int(self.metadata["tokenizer.ggml.eos_token_id"]) - except: - eos_token_id = self.token_eos() - try: - bos_token_id = int(self.metadata["tokenizer.ggml.bos_token_id"]) - except: - bos_token_id = self.token_bos() - - eos_token = self._model.token_get_text(eos_token_id) - bos_token = self._model.token_get_text(bos_token_id) - if self.verbose: - print(f"Using gguf chat template: {template}", file=sys.stderr) - print(f"Using chat eos_token: {eos_token}", file=sys.stderr) - print(f"Using chat bos_token: {bos_token}", file=sys.stderr) + logger.info(f"Using gguf chat template: {template_choices['chat_template.default']}") + logger.info(f"Using chat eos_token: {eos_token}") + logger.info(f"Using chat bos_token: {bos_token}") - self.chat_handler = llama_chat_format.Jinja2ChatFormatter( - template=template, - eos_token=eos_token, - bos_token=bos_token, - stop_token_ids=[eos_token_id], - ).to_chat_handler() + self.chat_format = "chat_template.default" if self.chat_format is None and self.chat_handler is None: self.chat_format = "llama-2" if self.verbose: - print(f"Using fallback chat format: {chat_format}", file=sys.stderr) - - # Prometheus metrics - self.metrics = MetricsExporter() - + logger.info(f"Using fallback chat format: {self.chat_format}") + @property def ctx(self) -> llama_cpp.llama_context_p: assert self._ctx.ctx is not None @@ -569,12 +591,12 @@ def eval(self, tokens: Sequence[int]): if self.context_params.logits_all: rows = n_tokens cols = self._n_vocab - logits = self._ctx.get_logits()[: rows * cols] + logits = 
np.ctypeslib.as_array(self._ctx.get_logits(), shape=(rows * cols, )) self.scores[n_past : n_past + n_tokens, :].reshape(-1)[: :] = logits else: rows = 1 cols = self._n_vocab - logits = self._ctx.get_logits()[: rows * cols] + logits = np.ctypeslib.as_array(self._ctx.get_logits(), shape=(rows * cols, )) self.scores[n_past + n_tokens - 1, :].reshape(-1)[: :] = logits # Update n_tokens self.n_tokens += n_tokens @@ -706,7 +728,8 @@ def generate( break if longest_prefix > 0: if self.verbose: - print("Llama.generate: prefix-match hit", file=sys.stderr) + # print("Llama.generate: prefix-match hit", file=sys.stderr) + logger.info("Llama.generate: prefix-match hit") reset = False tokens = tokens[longest_prefix:] self.n_tokens = longest_prefix @@ -932,7 +955,7 @@ def decode_batch(seq_sizes: List[int]): return output, total_tokens else: return output - + def _create_completion( self, prompt: Union[str, List[int]], @@ -960,40 +983,88 @@ def _create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, + ai_service: Optional[str] = None ) -> Union[ Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] ]: assert self._ctx is not None assert suffix is None or suffix.__class__ is str - - completion_id: str = f"cmpl-{str(uuid.uuid4())}" - created: int = int(time.time()) - # Variables required for metric collection _metrics_dict = {} _ttft_start = time.time() _pid = os.getpid() _tpot_metrics = [] _labels = { - "service": infer_service_from_prompt(prompt), # Infer the service for which the completion is being generated + "service": ai_service if ai_service is not None else "not-specified", "request_type": "chat/completions", } # Get CPU usage before generating completion so it can be used to calculate CPU when called after completing the process _ = get_cpu_usage(_pid) + completion_id: str = f"cmpl-{str(uuid.uuid4())}" + created: int = int(time.time()) + bos_token_id: int = self.token_bos() + cls_token_id: int = self._model.token_cls() + sep_token_id: int = self._model.token_sep() + prefix_token_id: int = self._model.token_prefix() + middle_token_id: int = self._model.token_middle() + suffix_token_id: int = self._model.token_suffix() + add_space_prefix: bool = self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true" + bos_tokens: List[int] = [cls_token_id if cls_token_id != -1 else bos_token_id] + eos_tokens: List[int] = [sep_token_id if sep_token_id != -1 else self.token_eos()] + + if (isinstance(prompt, list) and suffix is None) or self._model.add_bos_token() == 0 or bos_tokens[:1] == [-1]: + bos_tokens = [] + + if (isinstance(prompt, list) and suffix is None) or (self._model.add_eos_token() != 1 and sep_token_id == -1): + eos_tokens = [] + + suffix_space_prefix: int = 0 + # Tokenizer hack to remove leading space + if add_space_prefix and suffix_token_id >= 0 and suffix: + suffix = "☺" + suffix + suffix_space_prefix = 2 + # If prompt is empty, initialize completion with BOS token to avoid # detokenization including a space at the beginning of the completion - completion_tokens: List[int] = [] if len(prompt) > 0 else [self.token_bos()] + completion_tokens: List[int] = [] if len(prompt) > 0 else [bos_token_id] # Add blank space to start of prompt to match OG llama tokenizer - prompt_tokens: List[int] = ( + prefix_tokens: List[int] = ( ( - self.tokenize(prompt.encode("utf-8"), special=True) - if prompt != "" - else [self.token_bos()] + [prefix_token_id] + if prefix_token_id 
>= 0 and suffix is not None + else [] ) - if isinstance(prompt, str) - else prompt + + + ( + ( + self.tokenize(prompt.encode("utf-8"), add_bos=False, special=(prefix_token_id < 0 or suffix is None)) + if prompt != "" + else [] + ) + if isinstance(prompt, str) + else prompt + ) + ) + suffix_tokens: List[int] = ( + ( + [suffix_token_id] + + + ( + self.tokenize(suffix.encode("utf-8"), add_bos=False, special=False)[suffix_space_prefix:] + if suffix + else [] + ) + ) + if suffix_token_id >= 0 and suffix is not None + else [] + ) + middle_tokens: List[int] = ( + [middle_token_id] + if middle_token_id >= 0 and suffix is not None + else [] ) + prompt_tokens: List[int] = bos_tokens + ((suffix_tokens + prefix_tokens + middle_tokens) if self.spm_infill else (prefix_tokens + suffix_tokens + middle_tokens)) + eos_tokens text: bytes = b"" returned_tokens: int = 0 stop = ( @@ -1001,6 +1072,12 @@ def _create_completion( ) model_name: str = model if model is not None else self.model_path + if prompt_tokens[:2] == [self.token_bos()] * 2: + warnings.warn( + f'Detected duplicate leading "{self._model.token_get_text(self.token_bos())}" in prompt, this will likely reduce response quality, consider removing it...', + RuntimeWarning, + ) + # NOTE: This likely doesn't work correctly for the first token in the prompt # because of the extra space added to the start of the prompt_tokens if logit_bias is not None: @@ -1147,7 +1224,7 @@ def logit_bias_processor( # not sure how to handle this branch when dealing # with CJK output, so keep it unchanged for token in remaining_tokens: - if token == self.token_bos(): + if token == bos_token_id: continue token_end_position += len(self.detokenize([token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens])) # Check if stop sequence is in the token @@ -1278,11 +1355,19 @@ def logit_bias_processor( token_end_position = 0 for token in remaining_tokens: + # Record TTFT metric (once) + if idx == 0: + _metrics_dict["time_to_first_token"] = time.time() - _ttft_start + # Record TPOT metric + else: + _tpot_metrics.append(time.time() - _tpot_start) + _tpot_start = time.time() # reset + token_end_position += len(self.detokenize([token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens])) logprobs_or_none: Optional[CompletionLogprobs] = None if logprobs is not None: - if token == self.token_bos(): + if token == bos_token_id: continue token_str = self.detokenize([token]).decode( "utf-8", errors="ignore" @@ -1371,6 +1456,53 @@ def logit_bias_processor( print("Llama._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() print("Llama._create_completion: cache saved", file=sys.stderr) + + ## PROMETHEUS METRICS IN STREAMING MODE ## + # Record TTFT metric -- Setting to None if no tokens were generated + if not _metrics_dict.get("time_to_first_token"): + _metrics_dict["time_to_first_token"] = None + + # Record TPOT metrics (per generated token) + _metrics_dict["time_per_output_token"] = _tpot_metrics + + # Record metrics from the C++ backend (converted to seconds) + _timings = llama_cpp.llama_get_timings(self._ctx.ctx) + _metrics_dict["load_time"] = round(_timings.t_load_ms / 1e3, 2) + _metrics_dict["sample_time"] = round(_timings.t_sample_ms / 1e3, 2) + _metrics_dict["sample_throughput"] = round(1e3 / _timings.t_sample_ms * _timings.n_sample, 2) if _timings.t_sample_ms > 0 else 0.0 + _metrics_dict["prompt_eval_time"] = round(_timings.t_p_eval_ms / 1e3, 2) + _metrics_dict["prompt_eval_throughput"] = round(1e3 
/ _timings.t_p_eval_ms * _timings.n_p_eval, 2) if _timings.t_p_eval_ms > 0 else 0.0 + _metrics_dict["completion_eval_time"] = round(_timings.t_eval_ms / 1e3, 2) + _metrics_dict["completion_eval_throughput"] = round(1e3 / _timings.t_eval_ms * _timings.n_eval, 2) if _timings.t_eval_ms > 0 else 0.0 + _metrics_dict["end_to_end_latency"] = round((_timings.t_end_ms - _timings.t_start_ms) / 1e3, 2) + + # Record prefill and generation token metrics + _metrics_dict["prefill_tokens"] = len(prompt_tokens) + _metrics_dict["generation_tokens"] = len(completion_tokens) + + # Record system info + _gpu_utilization, _gpu_memory_used, _gpu_memory_free = get_gpu_general_info() + _metrics_dict["cpu_utilization"] = get_cpu_usage(_pid) # TODO: Returning always 0.0 -> check + _metrics_dict["cpu_ram_pid"] = get_ram_usage(_pid) + _metrics_dict["gpu_utilization"] = _gpu_utilization + _metrics_dict["gpu_ram_usage"] = _gpu_memory_used + _metrics_dict["gpu_ram_free"] = _gpu_memory_free + _metrics_dict["gpu_ram_pid"] = get_gpu_info_by_pid(_pid) + _metrics_dict["state_size"] = llama_cpp.llama_get_state_size(self._ctx.ctx) + _metrics_dict["kv_cache_usage_ratio"] = round(1. * llama_cpp.llama_get_kv_cache_used_cells(self._ctx.ctx) / self.n_ctx(), 2) + _metrics_dict["system_info"] = { + "model": model_name, + "n_params": str(llama_cpp.llama_model_n_params(self.model)), + "n_embd": str(self.n_embd()), + "n_ctx": str(self.n_ctx()), + "n_vocab": str(self.n_vocab()), + "n_threads": str(self.n_threads) + } + + # Log metrics to Prometheus + _all_metrics = RequestMetrics(**_metrics_dict) + self.metrics.log_request_metrics(_all_metrics, labels=_labels) + return if self.cache: @@ -1383,7 +1515,7 @@ def logit_bias_processor( if echo: text_str = prompt + text_str - if suffix is not None: + if suffix_token_id < 0 and suffix is not None: text_str = text_str + suffix logprobs_or_none: Optional[CompletionLogprobs] = None @@ -1396,8 +1528,8 @@ def logit_bias_processor( top_logprobs: List[Optional[Dict[str, float]]] = [] if echo: - # Remove leading BOS token - all_tokens = prompt_tokens[1:] + completion_tokens + # Remove leading BOS token if exists + all_tokens = prompt_tokens[1 if prompt_tokens[0] == self.token_bos() else 0:] + completion_tokens else: all_tokens = completion_tokens @@ -1410,7 +1542,7 @@ def logit_bias_processor( for idx, (token, token_str, logprobs_token) in enumerate( zip(all_tokens, all_token_strs, all_logprobs) ): - if token == self.token_bos(): + if token == bos_token_id: continue text_offsets.append( text_offset @@ -1446,6 +1578,11 @@ def logit_bias_processor( "top_logprobs": top_logprobs, } + ## PROMETHEUS METRICS IN CHAT COMPLETION MODE ## + # Record TTFT metric -- Setting to None if no tokens were generated + if not _metrics_dict.get("time_to_first_token"): + _metrics_dict["time_to_first_token"] = None + # Record TPOT metrics (per generated token) _metrics_dict["time_per_output_token"] = _tpot_metrics @@ -1484,15 +1621,15 @@ def logit_bias_processor( } # Log metrics to Prometheus - #print(_metrics_dict, file=sys.stderr) - _all_metrics = Metrics(**_metrics_dict) - self.metrics.log_metrics(_all_metrics, labels=_labels) + _all_metrics = RequestMetrics(**_metrics_dict) + self.metrics.log_request_metrics(_all_metrics, labels=_labels) yield { "id": completion_id, "object": "text_completion", "created": created, "model": model_name, + "service": ai_service, "choices": [ { "text": text_str, @@ -1535,6 +1672,7 @@ def create_completion( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = 
None, logit_bias: Optional[Dict[str, float]] = None, + ai_service: Optional[str] = None ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -1598,6 +1736,7 @@ def create_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + ai_service=ai_service ) if stream: chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks @@ -1632,6 +1771,7 @@ def __call__( logits_processor: Optional[LogitsProcessorList] = None, grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, + ai_service: Optional[str] = None ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. @@ -1695,6 +1835,7 @@ def __call__( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + ai_service=ai_service ) def create_chat_completion( @@ -1727,6 +1868,7 @@ def create_chat_completion( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + ai_service: Optional[str] = None ) -> Union[ CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] ]: @@ -1763,7 +1905,7 @@ def create_chat_completion( Returns: Generated chat completion or a stream of chat completion chunks. """ - handler = self.chat_handler or llama_chat_format.get_chat_completion_handler( + handler = self.chat_handler or self._chat_handlers.get(self.chat_format) or llama_chat_format.get_chat_completion_handler( self.chat_format ) return handler( @@ -1796,6 +1938,7 @@ def create_chat_completion( logits_processor=logits_processor, grammar=grammar, logit_bias=logit_bias, + ai_service=ai_service ) def create_chat_completion_openai_v1( @@ -1879,6 +2022,7 @@ def __getstate__(self): type_k=self.context_params.type_k, type_v=self.context_params.type_v, # Misc + spm_infill=self.spm_infill, verbose=self.verbose, ) @@ -1961,6 +2105,13 @@ def pooling_type(self) -> str: """Return the pooling type.""" return self._ctx.pooling_type() + def close(self) -> None: + """Explicitly free the model from memory.""" + self._stack.close() + + def __del__(self) -> None: + self.close() + @staticmethod def logits_to_logprobs( logits: Union[npt.NDArray[np.single], List], axis: int = -1 @@ -2122,3 +2273,19 @@ def __call__( self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single] ) -> bool: return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) + + +class MinTokensLogitsProcessor(LogitsProcessor): + def __init__(self, min_tokens: int, token_eos: int): + self.min_tokens = min_tokens + self.token_eos = token_eos + self.prompt_tokens = None + + def __call__( + self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single] + ) -> npt.NDArray[np.single]: + if self.prompt_tokens is None: + self.prompt_tokens = len(input_ids) + if len(input_ids) - self.prompt_tokens < self.min_tokens: + scores[self.token_eos] = -np.inf + return scores diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 3ab94e0d3..60bef9e84 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -11,6 +11,7 @@ from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol, cast import jinja2 +from jinja2.sandbox import ImmutableSandboxedEnvironment import numpy as np import numpy.typing as npt @@ -87,6 +88,7 @@ def __call__( grammar: Optional[llama.LlamaGrammar] = None, logprobs: Optional[bool] = None, top_logprobs: 
Optional[int] = None, + ai_service: Optional[str] = None, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -159,6 +161,7 @@ class ChatFormatterResponse: prompt: str stop: Optional[Union[str, List[str]]] = None stopping_criteria: Optional[llama.StoppingCriteriaList] = None + added_special: bool = False class ChatFormatter(Protocol): @@ -191,7 +194,7 @@ def __init__( self.add_generation_prompt = add_generation_prompt self.stop_token_ids = set(stop_token_ids) if stop_token_ids is not None else None - self._environment = jinja2.Environment( + self._environment = ImmutableSandboxedEnvironment( loader=jinja2.BaseLoader(), trim_blocks=True, lstrip_blocks=True, @@ -231,7 +234,7 @@ def stop_on_last_token( return tokens[-1] in self.stop_token_ids stopping_criteria = llama.StoppingCriteriaList([stop_on_last_token]) - return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token], stopping_criteria=stopping_criteria) + return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token], stopping_criteria=stopping_criteria, added_special=True) def to_chat_handler(self) -> LlamaChatCompletionHandler: return chat_formatter_to_chat_completion_handler(self) @@ -535,6 +538,7 @@ def chat_completion_handler( logit_bias: Optional[Dict[str, float]] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, + ai_service: Optional[str] = None, **kwargs, # type: ignore ) -> Union[ llama_types.CreateChatCompletionResponse, @@ -547,7 +551,7 @@ def chat_completion_handler( tools=tools, tool_choice=tool_choice, ) - prompt = result.prompt + prompt = llama.tokenize(result.prompt.encode("utf-8"), add_bos=not result.added_special, special=True) if result.stop is not None: stop = [] if stop is None else [stop] if isinstance(stop, str) else stop rstop = result.stop if isinstance(result.stop, list) else [result.stop] @@ -625,6 +629,7 @@ def chat_completion_handler( stopping_criteria=stopping_criteria, grammar=grammar, logit_bias=logit_bias, + ai_service=ai_service ) if tool is not None: tool_name = tool["function"]["name"] @@ -654,7 +659,7 @@ def format_autotokenizer( prompt: str = tokenizer.apply_chat_template(messages, tokenize=False) # type: ignore assert isinstance(prompt, str) # Return formatted prompt and eos token by default - return ChatFormatterResponse(prompt=prompt, stop=tokenizer.eos_token) + return ChatFormatterResponse(prompt=prompt, stop=tokenizer.eos_token, added_special=True) return format_autotokenizer @@ -684,8 +689,7 @@ def hf_tokenizer_config_to_chat_formatter( assert isinstance(tokenizer_config["eos_token"], str) eos_token = tokenizer_config["eos_token"] - env = jinja2.Environment( - loader=jinja2.BaseLoader(), + env = ImmutableSandboxedEnvironment( trim_blocks=True, lstrip_blocks=True, ).from_string(chat_template) @@ -708,7 +712,7 @@ def format_tokenizer_config( bos_token=bos_token, eos_token=eos_token, ) - return ChatFormatterResponse(prompt=prompt, stop=[eos_token, bos_token]) + return ChatFormatterResponse(prompt=prompt, stop=[eos_token, bos_token], added_special=True) return format_tokenizer_config @@ -918,7 +922,7 @@ def format_llama2( messages: List[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: - _system_template = "[INST] <>\n{system_message}\n<>" + _system_template = "[INST] <>\n{system_message}\n<>" _roles = dict(user="[INST]", assistant="[/INST]") _messages = _map_roles(messages, _roles) system_message = _get_system_message(messages) @@ -940,11 +944,10 @@ def format_llama3( 
user="<|start_header_id|>user<|end_header_id|>\n\n", assistant="<|start_header_id|>assistant<|end_header_id|>\n\n", ) - _begin_token = "<|begin_of_text|>" _sep = "<|eot_id|>" _messages = _map_roles(messages, _roles) _messages.append((_roles["assistant"], None)) - _prompt = _format_no_colon_single(_begin_token, _messages, _sep) + _prompt = _format_no_colon_single("", _messages, _sep) return ChatFormatterResponse(prompt=_prompt, stop=_sep) @@ -1229,10 +1232,9 @@ def format_mistral_instruct( messages: List[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: - bos = "" eos = "" stop = eos - prompt = bos + prompt = "" for message in messages: if ( message["role"] == "user" @@ -1715,6 +1717,7 @@ def functionary_v1_v2_chat_handler( model: Optional[str] = None, logits_processor: Optional[llama.LogitsProcessorList] = None, grammar: Optional[llama.LlamaGrammar] = None, + ai_service: Optional[str] = None, **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" @@ -1931,6 +1934,7 @@ def prepare_messages_for_inference( model=model, logits_processor=logits_processor, grammar=grammar, + ai_service=ai_service ) if stream is False: completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip() @@ -2322,7 +2326,7 @@ def generate_streaming(tools, functions, function_call, prompt): prompt = prompt stops = ["\n", END_ASSISTANT_TOKEN] - completion = create_completion(stop=stops) + completion = create_completion(prompt=prompt, stop=stops, grammar=grammar) completion_text = completion["choices"][0]["text"] completion_tokens += completion["usage"]["completion_tokens"] @@ -2349,7 +2353,7 @@ def generate_streaming(tools, functions, function_call, prompt): completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip() ) grammar = get_grammar(function_calls[-1]) - completion = create_completion(stop=END_FUNCTION_CALL_TOKEN) + completion = create_completion(prompt=prompt, stop=END_FUNCTION_CALL_TOKEN, grammar=grammar) completion_tokens += completion["usage"]["completion_tokens"] function_bodies.append(completion["choices"][0]["text"].strip()) # If the prompt involves a function call, just append generated parameters to function_bodies @@ -2363,7 +2367,7 @@ def generate_streaming(tools, functions, function_call, prompt): function_calls.append(function_call) grammar = get_grammar(function_call) stops = [STOP_TOKEN, FROM_TOKEN] - completion = create_completion(stop=stops) + completion = create_completion(prompt=prompt, stop=stops, grammar=grammar) completion_text = completion["choices"][0]["text"] completion_tokens += completion["usage"]["completion_tokens"] function_bodies.append(completion_text.strip()) @@ -2373,7 +2377,7 @@ def generate_streaming(tools, functions, function_call, prompt): # Generate function name first grammar = None stops = CONTENT_TOKEN - completion = create_completion(stop=stops) + completion = create_completion(prompt=prompt, stop=stops, grammar=grammar) completion_text = completion["choices"][0]["text"] completion_tokens += completion["usage"]["completion_tokens"] function_name = completion_text.strip() @@ -2386,7 +2390,7 @@ def generate_streaming(tools, functions, function_call, prompt): grammar = 
get_grammar(function_call) # Generate content stops = [RECIPIENT_TOKEN, STOP_TOKEN] - completion = create_completion(stop=stops) + completion = create_completion(prompt=prompt, stop=stops, grammar=grammar) completion_text = completion["choices"][0]["text"] completion_tokens += completion["usage"]["completion_tokens"] if function_name == "all": @@ -2413,7 +2417,7 @@ def generate_streaming(tools, functions, function_call, prompt): # Check whether the model wants to generate another turn prompt += completion_text.strip() grammar = None - completion = create_completion(stop=stops) + completion = create_completion(prompt=prompt, stop=stops, grammar=grammar) completion_tokens += completion["usage"]["completion_tokens"] if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]: prompt += "\n<|from|>assistant\n<|recipient|>" @@ -2602,14 +2606,27 @@ def __call__( messages = [llama_types.ChatCompletionRequestSystemMessage(role="system", content=self.DEFAULT_SYSTEM_MESSAGE)] + messages image_urls = self.get_image_urls(messages) - template = jinja2.Template(self.CHAT_FORMAT) - text = template.render(messages=messages, add_generation_prompt=True) + template = ImmutableSandboxedEnvironment( + trim_blocks=True, + lstrip_blocks=True, + ).from_string(self.CHAT_FORMAT) + text = template.render( + messages=messages, + add_generation_prompt=True, + eos_token=llama.detokenize([llama.token_eos()]), + bos_token=llama.detokenize([llama.token_bos()]), + ) split_text = self.split_text_on_image_urls(text, image_urls) def embed_image_bytes(image_bytes: bytes): if self._last_image_embed is not None and self._last_image_hash is not None and hash(image_bytes) == self._last_image_hash: return self._last_image_embed with suppress_stdout_stderr(disable=self.verbose): + # Free the previous image embed + if self._last_image_embed is not None: + self._llava_cpp.llava_image_embed_free(self._last_image_embed) + self._last_image_embed = None + self._last_image_hash = None embed = ( self._llava_cpp.llava_image_embed_make_with_bytes( self.clip_ctx, @@ -2624,17 +2641,18 @@ def embed_image_bytes(image_bytes: bytes): # Evaluate prompt llama.reset() - for i, (type_, value) in enumerate(split_text): + llama._ctx.kv_cache_clear() + for type_, value in split_text: if type_ == "text": - tokens = llama.tokenize(value.encode("utf8"), add_bos=i == 0) + tokens = llama.tokenize(value.encode("utf8"), add_bos=False, special=True) if llama.n_tokens + len(tokens) > llama.n_ctx(): - raise ValueError("Prompt exceeds n_ctx") # TODO: Fix + raise ValueError(f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}") llama.eval(tokens) else: image_bytes = self.load_image(value) embed = embed_image_bytes(image_bytes) if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx(): - raise ValueError("Prompt exceeds n_ctx") # TODO: Fix + raise ValueError(f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}") n_past = ctypes.c_int(llama.n_tokens) n_past_p = ctypes.pointer(n_past) with suppress_stdout_stderr(disable=self.verbose): @@ -2644,6 +2662,8 @@ def embed_image_bytes(image_bytes: bytes): llama.n_batch, n_past_p, ) + # Required to avoid issues with hf tokenizer + llama.input_ids[llama.n_tokens : n_past.value] = -1 llama.n_tokens = n_past.value # Get prompt tokens to avoid a cache miss @@ -3033,6 +3053,7 @@ class NanoLlavaChatHandler(Llava15ChatHandler): # Answer the question<|im_end|><|im_start|>user # # What is the picture 
about?<|im_end|><|im_start|>assistant + DEFAULT_SYSTEM_MESSAGE = "Answer the question" CHAT_FORMAT = ( "{% for message in messages %}" @@ -3081,7 +3102,7 @@ class NanoLlavaChatHandler(Llava15ChatHandler): "{% endif %}" ) -class Llama3VisionAlpha(Llava15ChatHandler): +class Llama3VisionAlphaChatHandler(Llava15ChatHandler): # question = "" + q # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" @@ -3142,6 +3163,10 @@ class Llama3VisionAlpha(Llava15ChatHandler): "{% endif %}" ) +# alias +Llama3VisionAlpha = Llama3VisionAlphaChatHandler + + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama, @@ -3176,7 +3201,6 @@ def chatml_function_calling( llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse], ]: - print(logprobs) function_calling_template = ( "{% for message in messages %}" "<|im_start|>{{ message.role }}\n" @@ -3228,8 +3252,7 @@ def chatml_function_calling( "{% endfor %}" "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" ) - template_renderer = jinja2.Environment( - loader=jinja2.BaseLoader(), + template_renderer = ImmutableSandboxedEnvironment( autoescape=jinja2.select_autoescape(["html", "xml"]), undefined=jinja2.StrictUndefined, ).from_string(function_calling_template) @@ -3564,4 +3587,4 @@ def chatml_function_calling( }, } - raise ValueError("Automatic streaming tool choice is not supported") \ No newline at end of file + raise ValueError("Automatic streaming tool choice is not supported") diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 9e934e052..ee63de7cf 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -23,7 +23,7 @@ # Load the library def _load_shared_library(lib_base_name: str): # Construct the paths to the possible shared library names - _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) + _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" # Searching for the library in the current directory under the name "libllama" (default name # for llamacpp) and "llama" (default name for this repo) _lib_paths: List[pathlib.Path] = [] @@ -52,7 +52,12 @@ def _load_shared_library(lib_base_name: str): _lib_paths = [_lib.resolve()] cdll_args = dict() # type: ignore + # Add the library directory to the DLL search path on Windows (if needed) + if sys.platform == "win32": + os.add_dll_directory(str(_base_path)) + os.environ['PATH'] = str(_base_path) + os.pathsep + os.environ['PATH'] + if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) if "CUDA_PATH" in os.environ: @@ -273,6 +278,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa # LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback # LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE # LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece +# LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram # }; LLAMA_VOCAB_TYPE_NONE = 0 """For models without vocab""" @@ -282,6 +288,8 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa """GPT-2 tokenizer based on byte-level BPE""" LLAMA_VOCAB_TYPE_WPM = 3 """BERT tokenizer based on WordPiece""" +LLAMA_VOCAB_TYPE_UGM = 4 +"""T5 tokenizer based on Unigram""" # // pre-tokenization types @@ -296,6 +304,14 @@ def byref(obj: CtypesCData, offset: Optional[int] = 
None) -> CtypesRef[CtypesCDa # LLAMA_VOCAB_PRE_TYPE_GPT2 = 7, # LLAMA_VOCAB_PRE_TYPE_REFACT = 8, # LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9, +# LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10, +# LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11, +# LLAMA_VOCAB_PRE_TYPE_OLMO = 12, +# LLAMA_VOCAB_PRE_TYPE_DBRX = 13, +# LLAMA_VOCAB_PRE_TYPE_SMAUG = 14, +# LLAMA_VOCAB_PRE_TYPE_PORO = 15, +# LLAMA_VOCAB_PRE_TYPE_VIKING = 16, +# LLAMA_VOCAB_PRE_TYPE_JAIS = 17, # }; LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 @@ -307,6 +323,14 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa LLAMA_VOCAB_PRE_TYPE_GPT2 = 7 LLAMA_VOCAB_PRE_TYPE_REFACT = 8 LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9 +LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10 +LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11 +LLAMA_VOCAB_PRE_TYPE_OLMO = 12 +LLAMA_VOCAB_PRE_TYPE_DBRX = 13 +LLAMA_VOCAB_PRE_TYPE_SMAUG = 14 +LLAMA_VOCAB_PRE_TYPE_PORO = 15 +LLAMA_VOCAB_PRE_TYPE_VIKING = 16 +LLAMA_VOCAB_PRE_TYPE_JAIS = 17 # // note: these values should be synchronized with ggml_rope @@ -323,7 +347,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa LLAMA_ROPE_TYPE_GLM = 4 -# enum llama_token_type { +# enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file # LLAMA_TOKEN_TYPE_UNDEFINED = 0, # LLAMA_TOKEN_TYPE_NORMAL = 1, # LLAMA_TOKEN_TYPE_UNKNOWN = 2, @@ -341,6 +365,32 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa LLAMA_TOKEN_TYPE_BYTE = 6 +# enum llama_token_attr { +# LLAMA_TOKEN_ATTR_UNDEFINED = 0, +# LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0, +# LLAMA_TOKEN_ATTR_UNUSED = 1 << 1, +# LLAMA_TOKEN_ATTR_NORMAL = 1 << 2, +# LLAMA_TOKEN_ATTR_CONTROL = 1 << 3, // SPECIAL? +# LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4, +# LLAMA_TOKEN_ATTR_BYTE = 1 << 5, +# LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6, +# LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7, +# LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8, +# LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9, +# }; +LLAMA_TOKEN_ATTR_UNDEFINED = 0 +LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0 +LLAMA_TOKEN_ATTR_UNUSED = 1 << 1 +LLAMA_TOKEN_ATTR_NORMAL = 1 << 2 +LLAMA_TOKEN_ATTR_CONTROL = 1 << 3 +LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4 +LLAMA_TOKEN_ATTR_BYTE = 1 << 5 +LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6 +LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7 +LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8 +LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9 + + # // model file types # enum llama_ftype { # LLAMA_FTYPE_ALL_F32 = 0, @@ -375,6 +425,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa # LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors # LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors # LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file # }; @@ -407,6 +458,8 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa LLAMA_FTYPE_MOSTLY_IQ2_S = 28 LLAMA_FTYPE_MOSTLY_IQ2_M = 29 LLAMA_FTYPE_MOSTLY_IQ4_XS = 30 +LLAMA_FTYPE_MOSTLY_IQ1_M = 31 +LLAMA_FTYPE_MOSTLY_BF16 = 32 LLAMA_FTYPE_GUESSED = 1024 # enum llama_rope_scaling_type { @@ -427,11 +480,13 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa # LLAMA_POOLING_TYPE_NONE = 0, # LLAMA_POOLING_TYPE_MEAN = 1, # LLAMA_POOLING_TYPE_CLS = 2, +# LLAMA_POOLING_TYPE_LAST = 3, # }; LLAMA_POOLING_TYPE_UNSPECIFIED = -1 LLAMA_POOLING_TYPE_NONE = 0 LLAMA_POOLING_TYPE_MEAN = 1 LLAMA_POOLING_TYPE_CLS = 2 +LLAMA_POOLING_TYPE_LAST = 3 # enum llama_split_mode { # 
LLAMA_SPLIT_MODE_NONE = 0, // single GPU @@ -600,17 +655,17 @@ class llama_batch(ctypes.Structure): # }; class llama_model_kv_override_value(ctypes.Union): _fields_ = [ - ("int_value", ctypes.c_int64), - ("float_value", ctypes.c_double), - ("bool_value", ctypes.c_bool), - ("str_value", ctypes.c_char * 128), + ("val_i64", ctypes.c_int64), + ("val_f64", ctypes.c_double), + ("val_bool", ctypes.c_bool), + ("val_str", ctypes.c_char * 128), ] if TYPE_CHECKING: - int_value: int - float_value: float - bool_value: bool - str_value: bytes + val_i64: int + val_f64: float + val_bool: bool + val_str: bytes class llama_model_kv_override(ctypes.Structure): @@ -639,6 +694,9 @@ class llama_model_kv_override(ctypes.Structure): # // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() # const float * tensor_split; +# // comma separated list of RPC servers to use for offloading +# const char * rpc_servers; + # // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. # // If the provided progress_callback returns true, model loading continues. # // If it returns false, model loading is immediately aborted. @@ -665,6 +723,7 @@ class llama_model_params(ctypes.Structure): split_mode (int): how to split the model across multiple GPUs main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored tensor_split (ctypes.Array[ctypes.ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() + rpc_servers (ctypes.c_char_p): comma separated list of RPC servers to use for offloading progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted. 
progress_callback_user_data (ctypes.ctypes.c_void_p): context pointer passed to the progress callback kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data @@ -678,6 +737,7 @@ class llama_model_params(ctypes.Structure): split_mode: int main_gpu: int tensor_split: CtypesArray[ctypes.c_float] + rpc_servers: ctypes.c_char_p progress_callback: Callable[[float, ctypes.c_void_p], bool] progress_callback_user_data: ctypes.c_void_p kv_overrides: CtypesArray[llama_model_kv_override] @@ -691,6 +751,7 @@ class llama_model_params(ctypes.Structure): ("split_mode", ctypes.c_int), ("main_gpu", ctypes.c_int32), ("tensor_split", ctypes.POINTER(ctypes.c_float)), + ("rpc_servers", ctypes.c_char_p), ("progress_callback", llama_progress_callback), ("progress_callback_user_data", ctypes.c_void_p), ("kv_overrides", ctypes.POINTER(llama_model_kv_override)), @@ -701,6 +762,8 @@ class llama_model_params(ctypes.Structure): ] +# // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations +# // https://github.com/ggerganov/llama.cpp/pull/7544 # struct llama_context_params { # uint32_t seed; // RNG seed, -1 for random # uint32_t n_ctx; // text context, 0 = from model @@ -712,7 +775,6 @@ class llama_model_params(ctypes.Structure): # enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` # enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id -# // (ignored if no pooling layer) # // ref: https://github.com/ggerganov/llama.cpp/pull/2054 # float rope_freq_base; // RoPE base frequency, 0 = from model @@ -727,15 +789,14 @@ class llama_model_params(ctypes.Structure): # ggml_backend_sched_eval_callback cb_eval; # void * cb_eval_user_data; -# enum ggml_type type_k; // data type for K cache -# enum ggml_type type_v; // data type for V cache +# enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] +# enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] # // Keep the booleans together to avoid misalignment during copy-by-value. # bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) # bool embeddings; // if true, extract embeddings (together with logits) # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU -# bool flash_attn; // whether to use flash attention - +# bool flash_attn; // whether to use flash attention [EXPERIMENTAL] # // Abort callback # // if it returns true, execution of llama_decode() will be aborted @@ -939,6 +1000,9 @@ class llama_model_quantize_params(ctypes.Structure): # // modifies a preceding LLAMA_GRETYPE_CHAR or # // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA]) # LLAMA_GRETYPE_CHAR_ALT = 6, + +# // any character (.) +# LLAMA_GRETYPE_CHAR_ANY = 7, # }; LLAMA_GRETYPE_END = 0 LLAMA_GRETYPE_ALT = 1 @@ -947,6 +1011,7 @@ class llama_model_quantize_params(ctypes.Structure): LLAMA_GRETYPE_CHAR_NOT = 4 LLAMA_GRETYPE_CHAR_RNG_UPPER = 5 LLAMA_GRETYPE_CHAR_ALT = 6 +LLAMA_GRETYPE_CHAR_ANY = 7 # typedef struct llama_grammar_element { @@ -1213,12 +1278,12 @@ def llama_n_seq_max(ctx: llama_context_p, /) -> int: ... def llama_pooling_type(ctx: llama_context_p, /) -> int: ... 
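(Illustrative sketch, not part of the patch: filling in one llama_model_kv_override entry with the renamed union members val_bool/val_i64/val_f64/val_str used elsewhere in this diff. The key shown is only an example.)

    import llama_cpp.llama_cpp as llama_cpp

    override = llama_cpp.llama_model_kv_override()
    override.key = b"tokenizer.ggml.add_bos_token"         # example key
    override.tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
    override.value.val_bool = False                         # was value.bool_value before this change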
-# LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); +# LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); @ctypes_function("llama_vocab_type", [llama_model_p_ctypes], ctypes.c_int) def llama_vocab_type(model: llama_model_p, /) -> int: ... -# LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); +# LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); @ctypes_function("llama_rope_type", [llama_model_p_ctypes], ctypes.c_int) def llama_rope_type(model: llama_model_p, /) -> int: ... @@ -2248,6 +2313,32 @@ def llama_set_n_threads( ... +# // Get the number of threads used for generation of a single token. +# LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx); +@ctypes_function("llama_n_threads", [llama_context_p_ctypes], ctypes.c_uint32) +def llama_n_threads(ctx: llama_context_p, /) -> int: + """Get the number of threads used for generation of a single token""" + ... + + +# // Get the number of threads used for prompt and batch processing (multiple token). +# LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx); +@ctypes_function("llama_n_threads_batch", [llama_context_p_ctypes], ctypes.c_uint32) +def llama_n_threads_batch(ctx: llama_context_p, /) -> int: + """Get the number of threads used for prompt and batch processing (multiple token)""" + ... + + +# // Set whether the model is in embeddings mode or not +# // If true, embeddings will be returned but logits will not +# LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings); +@ctypes_function("llama_set_embeddings", [llama_context_p_ctypes, ctypes.c_bool], None) +def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /): + """Set whether the model is in embeddings model or not + If true, embeddings will be returned but logits will not""" + ... + + # // Set whether to use causal attention or not # // If set to true, the model will only attend to the past tokens # LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn); @@ -2402,11 +2493,11 @@ def llama_token_get_score( ) -> float: ... -# LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token); +# LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token); @ctypes_function( - "llama_token_get_type", [llama_model_p_ctypes, llama_token], ctypes.c_int + "llama_token_get_attr", [llama_model_p_ctypes, llama_token], ctypes.c_int ) -def llama_token_get_type( +def llama_token_get_attr( model: llama_model_p, token: Union[llama_token, int], / ) -> int: ... @@ -2421,6 +2512,16 @@ def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /) ... +# // Identify if Token Id is a control token or a render-able token +# LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token); +@ctypes_function( + "llama_token_is_control", [llama_model_p_ctypes, llama_token], ctypes.c_bool +) +def llama_token_is_control(model: llama_model_p, token: Union[llama_token, int], /) -> bool: + """Identify if Token Id is a control token or a render-able token""" + ... 
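(Illustrative sketch, not part of the patch: using the new llama_token_is_control binding above to drop non-renderable tokens; `model` is assumed to be an already-loaded llama_model_p.)

    import llama_cpp.llama_cpp as llama_cpp

    def renderable_tokens(model, token_ids):
        # Control tokens (BOS/EOS and similar) should not be shown to the user.
        return [t for t in token_ids if not llama_cpp.llama_token_is_control(model, t)]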
+ + # // Special tokens @@ -3154,104 +3255,9 @@ def llama_grammar_accept_token( # // -# // Beam search +# // Model split # // -# struct llama_beam_view { -# const llama_token * tokens; - - -# size_t n_tokens; -# float p; // Cumulative beam probability (renormalized relative to all beams) -# bool eob; // Callback should set this to true when a beam is at end-of-beam. -# }; -class llama_beam_view(ctypes.Structure): - if TYPE_CHECKING: - tokens: CtypesArray[llama_token] - n_tokens: int - p: float - eob: bool - - _fields_ = [ - ("tokens", llama_token_p), - ("n_tokens", ctypes.c_size_t), - ("p", ctypes.c_float), - ("eob", ctypes.c_bool), - ] - - -# // Passed to beam_search_callback function. -# // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams -# // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks. -# // These pointers are valid only during the synchronous callback, so should not be saved. -# struct llama_beams_state { -# struct llama_beam_view * beam_views; -# size_t n_beams; // Number of elements in beam_views[]. -# size_t common_prefix_length; // Current max length of prefix tokens shared by all beams. -# bool last_call; // True iff this is the last callback invocation. -# }; -class llama_beams_state(ctypes.Structure): - if TYPE_CHECKING: - beam_views: CtypesArray[llama_beam_view] - n_beams: int - common_prefix_length: int - last_call: bool - - _fields_ = [ - ("beam_views", ctypes.POINTER(llama_beam_view)), - ("n_beams", ctypes.c_size_t), - ("common_prefix_length", ctypes.c_size_t), - ("last_call", ctypes.c_bool), - ] - - -# // Type of pointer to the beam_search_callback function. -# // void* callback_data is any custom data passed to llama_beam_search, that is subsequently -# // passed back to beam_search_callback. This avoids having to use global variables in the callback. -# typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state); -llama_beam_search_callback_fn_t = ctypes.CFUNCTYPE( - None, ctypes.c_void_p, llama_beams_state -) - - -# /// @details Deterministically returns entire sentence constructed by a beam search. -# /// @param ctx Pointer to the llama_context. -# /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state. -# /// @param callback_data A pointer that is simply passed back to callback. -# /// @param n_beams Number of beams to use. -# /// @param n_past Number of tokens already evaluated. -# /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier. -# /// @param n_threads Number of threads as passed to llama_eval(). -# LLAMA_API void llama_beam_search( -# struct llama_context * ctx, -# llama_beam_search_callback_fn_t callback, -# void * callback_data, -# size_t n_beams, -# int32_t n_past, -# int32_t n_predict); -@ctypes_function( - "llama_beam_search", - [ - llama_context_p_ctypes, - llama_beam_search_callback_fn_t, - ctypes.c_void_p, - ctypes.c_size_t, - ctypes.c_int32, - ctypes.c_int32, - ], - None, -) -def llama_beam_search( - ctx: llama_context_p, - callback: CtypesFuncPointer, - callback_data: ctypes.c_void_p, - n_beams: Union[ctypes.c_size_t, int], - n_past: Union[ctypes.c_int, int], - n_predict: Union[ctypes.c_int, int], - /, -): ... - - # /// @details Build a split GGUF final path for this chunk. 
# /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" # // Returns the split_path length. diff --git a/llama_cpp/llama_metrics.py b/llama_cpp/llama_metrics.py index 7334c7140..a7ffc7a09 100644 --- a/llama_cpp/llama_metrics.py +++ b/llama_cpp/llama_metrics.py @@ -6,11 +6,13 @@ LABELS = ["request_type", "service"] + @dataclass -class Metrics: +class RequestMetrics: """ - A dataclass to store metrics for a request. + A dataclass to store metrics for a given request. """ + # System metrics system_info: Dict[str, Any] state_size: int @@ -33,26 +35,54 @@ class Metrics: completion_eval_throughput: float end_to_end_latency: float prefill_tokens: int - generation_tokens: int + generation_tokens: int kv_cache_usage_ratio: float +@dataclass +class QueueMetrics: + """ + A dataclass to store metrics for the task queue. + """ + + running_tasks_count: int + running_tasks: dict + + class MetricsExporter: """ A custom Prometheus Metrics Explorer for the LLAMA C++ backend. Collects metrics per request sent to the backend. """ + def __init__(self): self.labels = LABELS # One-time metrics - self._histrogram_load_time = Histogram( + self._histogram_load_time = Histogram( name="llama_cpp_python:load_t_seconds", documentation="Histogram of load time in seconds", labelnames=self.labels, buckets=[ - 0.1, 0.25, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, - 8.0, 9.0, 10.0, 12.5, 15.0, 20.0, 25.0, 30.0 - ] + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.0, + 3.0, + 4.0, + 5.0, + 6.0, + 7.0, + 8.0, + 9.0, + 10.0, + 12.5, + 15.0, + 20.0, + 25.0, + 30.0, + ], ) # Request-level latencies self._histogram_sample_time = Histogram( @@ -60,54 +90,151 @@ def __init__(self): documentation="Histogram of token sampling time in seconds", labelnames=self.labels, buckets=[ - 0.00001, 0.00005, 0.0001, 0.00025, 0.0005, 0.001, 0.0025, - 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, - ] + 0.00001, + 0.00005, + 0.0001, + 0.00025, + 0.0005, + 0.001, + 0.0025, + 0.005, + 0.0075, + 0.01, + 0.025, + 0.05, + 0.075, + 0.1, + 0.25, + 0.5, + ], ) self._histogram_time_to_first_token = Histogram( name="llama_cpp_python:ttft_seconds", documentation="Histogram of time to first token in seconds", labelnames=self.labels, buckets=[ - 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, - 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 20.0, 25.0, 30.0 - ] + 0.001, + 0.005, + 0.01, + 0.02, + 0.04, + 0.06, + 0.08, + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + 12.5, + 15.0, + 20.0, + 25.0, + 30.0, + ], ) self._histogram_time_per_output_token = Histogram( name="llama_cpp_python:tpot_seconds", documentation="Histogram of time per output token in seconds", labelnames=self.labels, buckets=[ - 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, - 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 20.0, 25.0, 30.0 - ] + 0.001, + 0.005, + 0.01, + 0.02, + 0.04, + 0.06, + 0.08, + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + 12.5, + 15.0, + 20.0, + 25.0, + 30.0, + ], ) self._histogram_prompt_eval_time = Histogram( name="llama_cpp_python:p_eval_t_seconds", documentation="Histogram of prompt evaluation time in seconds", labelnames=self.labels, buckets=[ - 0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, - 20.0, 25.0, 30.0, 40.0, 50.0, 60.0 - ] + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + 12.5, + 15.0, + 20.0, + 25.0, + 30.0, + 40.0, + 50.0, + 60.0, + ], ) 
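(Illustrative sketch, not part of the patch: the Histogram/Gauge/Info collectors defined in this class register against prometheus_client's default registry, so exposing them only requires starting a scrape endpoint; the port below is a placeholder.)

    from prometheus_client import start_http_server
    from llama_cpp.llama_metrics import MetricsExporter, RequestMetrics, QueueMetrics

    exporter = MetricsExporter()
    start_http_server(9464)  # placeholder scrape port
    # Per-request metrics are recorded via exporter.log_request_metrics(RequestMetrics(...), labels)
    # and queue depth via exporter.log_queue_metrics(QueueMetrics(...), labels), as wired up in this patch.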
self._histogram_completion_eval_time = Histogram( name="llama_cpp_python:c_eval_t_seconds", documentation="Histogram of completion evaluation time in seconds", labelnames=self.labels, buckets=[ - 0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, - 20.0, 25.0, 30.0, 40.0, 50.0, 60.0 - ] + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + 12.5, + 15.0, + 20.0, + 25.0, + 30.0, + 40.0, + 50.0, + 60.0, + ], ) self._histogram_e2e_request_latency = Histogram( name="llama_cpp_python:e2e_seconds", documentation="Histogram of end-to-end request latency in seconds", labelnames=self.labels, buckets=[ - 0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, - 20.0, 25.0, 30.0, 40.0, 50.0, 60.0 - ] + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + 12.5, + 15.0, + 20.0, + 25.0, + 30.0, + 40.0, + 50.0, + 60.0, + ], ) # Prefill and generation tokens self._histogram_prefill_tokens = Histogram( @@ -115,97 +242,169 @@ def __init__(self): documentation="Histogram of number of prefill tokens processed", labelnames=self.labels, buckets=[ - 1, 10, 25, 50, 100, 250, 500, 750, 1000, 1500, 2000, 2500, 3000, - 3500, 4000, 4500, 5000 - ] + 1, + 10, + 25, + 50, + 100, + 250, + 500, + 750, + 1000, + 1500, + 2000, + 2500, + 3000, + 3500, + 4000, + 4500, + 5000, + ], ) self._histogram_generation_tokens = Histogram( name="llama_cpp_python:completion_tokens_total", documentation="Histogram of number of generation tokens processed", labelnames=self.labels, buckets=[ - 1, 10, 25, 50, 100, 250, 500, 750, 1000, 1500, 2000, 2500, 3000, - 3500, 4000, 4500, 5000 - ] + 1, + 10, + 25, + 50, + 100, + 250, + 500, + 750, + 1000, + 1500, + 2000, + 2500, + 3000, + 3500, + 4000, + 4500, + 5000, + ], ) # Current throughput self._gauge_prompt_eval_throughput = Gauge( name="llama_cpp_python:prompt_eval_throughput", documentation="Current throughput of the prompt evaluation process (in tokens/second)", - labelnames=self.labels + labelnames=self.labels, ) self._gauge_completion_eval_throughput = Gauge( name="llama_cpp_python:completion_eval_throughput", documentation="Current throughput of the completion evaluation process (in tokens/second)", - labelnames=self.labels + labelnames=self.labels, ) self._gauge_sample_throughput = Gauge( name="llama_cpp_python:sample_throughput", documentation="Current throughput of the token sampling process (in tokens/second)", - labelnames=self.labels + labelnames=self.labels, ) # System info self._gauge_state_size = Gauge( name="llama_cpp_python:state_size", documentation="Current state size in bytes of various components such as rng (random number generator), logits, embedding, and kv_cache (key-value cache)", - labelnames=self.labels + labelnames=self.labels, ) self._gauge_cpu_utilization = Gauge( name="llama_cpp_python:cpu_utilization", documentation="Current CPU utilization", - labelnames=self.labels + labelnames=self.labels, ) self._gauge_cpu_ram_usage_by_pid = Gauge( name="llama_cpp_python:cpu_memory_usage_by_pid", documentation="Current CPU memory usage during the request", - labelnames=self.labels + labelnames=self.labels, ) self._gauge_gpu_utilization = Gauge( name="llama_cpp_python:gpu_utilization", documentation="Current GPU utilization", - labelnames=self.labels + labelnames=self.labels, ) self._gauge_gpu_memory_usage = Gauge( name="llama_cpp_python:gpu_memory_usage", documentation="Current GPU memory usage", - labelnames=self.labels + labelnames=self.labels, ) self._gauge_gpu_memory_free = Gauge( name="llama_cpp_python:gpu_memory_free", 
documentation="Current free GPU memory", - labelnames=self.labels + labelnames=self.labels, ) self._gauge_gpu_memory_usage_by_pid = Gauge( name="llama_cpp_python:gpu_memory_usage_by_pid", documentation="Current GPU memory usage during the request", - labelnames=self.labels + labelnames=self.labels, ) self._gauge_kv_cache_usage_ratio = Gauge( name="llama_cpp_python:kv_cache_usage_ratio", documentation="KV-cache usage. 1 means 100 percent usage", - labelnames=self.labels + labelnames=self.labels, ) - self._info = Info( - name="llama_cpp_python:info", - documentation="Server metadata" + self._gauge_running_tasks = Histogram( + name="llama_cpp_python:running_tasks", + documentation="Number of running tasks in the task queue", + labelnames=self.labels, + buckets=[ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 12, + 14, + 16, + 18, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + ], ) - def log_metrics(self, metrics: Metrics, labels: Dict[str, str]): + # Server metadata + self._info = Info(name="llama_cpp_python:info", documentation="Server metadata") + + def log_request_metrics(self, metrics: RequestMetrics, labels: Dict[str, str]): """ Log the metrics using the Prometheus client. """ - self._histrogram_load_time.labels(**labels).observe(metrics.load_time) + self._histogram_load_time.labels(**labels).observe(metrics.load_time) self._histogram_sample_time.labels(**labels).observe(metrics.sample_time) - self._histogram_time_to_first_token.labels(**labels).observe(metrics.time_to_first_token) + if metrics.time_to_first_token: + self._histogram_time_to_first_token.labels(**labels).observe( + metrics.time_to_first_token + ) for _tpot in metrics.time_per_output_token: self._histogram_time_per_output_token.labels(**labels).observe(_tpot) - self._histogram_prompt_eval_time.labels(**labels).observe(metrics.prompt_eval_time) - self._histogram_completion_eval_time.labels(**labels).observe(metrics.completion_eval_time) - self._histogram_e2e_request_latency.labels(**labels).observe(metrics.end_to_end_latency) + self._histogram_prompt_eval_time.labels(**labels).observe( + metrics.prompt_eval_time + ) + self._histogram_completion_eval_time.labels(**labels).observe( + metrics.completion_eval_time + ) + self._histogram_e2e_request_latency.labels(**labels).observe( + metrics.end_to_end_latency + ) self._histogram_prefill_tokens.labels(**labels).observe(metrics.prefill_tokens) - self._histogram_generation_tokens.labels(**labels).observe(metrics.generation_tokens) - self._gauge_prompt_eval_throughput.labels(**labels).set(metrics.prompt_eval_throughput) - self._gauge_completion_eval_throughput.labels(**labels).set(metrics.completion_eval_throughput) + self._histogram_generation_tokens.labels(**labels).observe( + metrics.generation_tokens + ) + self._gauge_prompt_eval_throughput.labels(**labels).set( + metrics.prompt_eval_throughput + ) + self._gauge_completion_eval_throughput.labels(**labels).set( + metrics.completion_eval_throughput + ) self._gauge_sample_throughput.labels(**labels).set(metrics.sample_throughput) self._gauge_cpu_utilization.labels(**labels).set(metrics.cpu_utilization) self._gauge_cpu_ram_usage_by_pid.labels(**labels).set(metrics.cpu_ram_pid) @@ -214,5 +413,13 @@ def log_metrics(self, metrics: Metrics, labels: Dict[str, str]): self._gauge_gpu_memory_free.labels(**labels).set(metrics.gpu_ram_free) self._gauge_gpu_memory_usage_by_pid.labels(**labels).set(metrics.gpu_ram_pid) self._gauge_state_size.labels(**labels).set(metrics.state_size) - 
self._gauge_kv_cache_usage_ratio.labels(**labels).set(metrics.kv_cache_usage_ratio) - self._info.info(metrics.system_info) \ No newline at end of file + self._gauge_kv_cache_usage_ratio.labels(**labels).set( + metrics.kv_cache_usage_ratio + ) + self._info.info(metrics.system_info) + + def log_queue_metrics(self, metrics: QueueMetrics, labels: Dict[str, str]): + """ + Log the metrics for the task queue. + """ + self._gauge_running_tasks.labels(**labels).observe(metrics.running_tasks_count) diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index 3ded1f25d..b80d85913 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -35,7 +35,7 @@ # Load the library def _load_shared_library(lib_base_name: str): # Construct the paths to the possible shared library names - _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) + _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" # Searching for the library in the current directory under the name "libllama" (default name # for llamacpp) and "llama" (default name for this repo) _lib_paths: List[pathlib.Path] = [] diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index a6f1f4e9c..6ced7bc10 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -37,6 +37,7 @@ ConfigFileSettings, ) from llama_cpp.server.cli import add_args_from_model, parse_model_from_args +from llama_cpp._logger import logger, UVICORN_LOGGING_CONFIG def main(): @@ -75,7 +76,7 @@ def main(): server_settings = parse_model_from_args(ServerSettings, args) model_settings = [parse_model_from_args(ModelSettings, args)] except Exception as e: - print(e, file=sys.stderr) + logger.error(e) parser.print_help() sys.exit(1) assert server_settings is not None @@ -90,6 +91,7 @@ def main(): port=int(os.getenv("PORT", server_settings.port)), ssl_keyfile=server_settings.ssl_keyfile, ssl_certfile=server_settings.ssl_certfile, + log_config=UVICORN_LOGGING_CONFIG ) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index cb3a30582..ed90fa155 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,7 +1,11 @@ from __future__ import annotations +from contextlib import asynccontextmanager + import os import json +import typing +import contextlib from threading import Lock from functools import partial @@ -15,6 +19,7 @@ from anyio.streams.memory import MemoryObjectSendStream from starlette.concurrency import run_in_threadpool, iterate_in_threadpool from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body +from fastapi.responses import JSONResponse from fastapi.middleware import Middleware from fastapi.middleware.cors import CORSMiddleware from fastapi.security import HTTPBearer @@ -41,8 +46,12 @@ TokenizeInputCountResponse, DetokenizeInputRequest, DetokenizeInputResponse, + HealthMetrics ) from llama_cpp.server.errors import RouteErrorHandler +from llama_cpp._utils import monitor_task_queue +from llama_cpp.llama_metrics import MetricsExporter +from llama_cpp._logger import logger router = APIRouter(route_class=RouteErrorHandler) @@ -89,11 +98,35 @@ def get_llama_proxy(): llama_outer_lock.release() -_ping_message_factory = None +_ping_message_factory: typing.Optional[typing.Callable[[], bytes]] = None + + +def set_ping_message_factory(factory: typing.Callable[[], bytes]): + global _ping_message_factory + _ping_message_factory = factory + + +def set_metrics_exporter(): + global metrics_exporter + try: + metrics_exporter + except NameError: + 
metrics_exporter = MetricsExporter() + + return metrics_exporter + +task_queue_status = {} + -def set_ping_message_factory(factory): - global _ping_message_factory - _ping_message_factory = factory +@asynccontextmanager +async def lifespan(app: FastAPI): + """ + A context manager that launches tasks to be run during the application's lifespan. + """ + metrics_exporter = set_metrics_exporter() + + await monitor_task_queue(task_queue_status, metrics_exporter) + yield def create_app( @@ -135,6 +168,7 @@ def create_app( title="🦙 llama.cpp Python API", version=llama_cpp.__version__, root_path=server_settings.root_path, + lifespan=lifespan ) app.add_middleware( CORSMiddleware, @@ -161,27 +195,30 @@ def create_app( async def get_event_publisher( request: Request, - inner_send_chan: MemoryObjectSendStream, - iterator: Iterator, + inner_send_chan: MemoryObjectSendStream[typing.Any], + iterator: Iterator[typing.Any], + on_complete: typing.Optional[typing.Callable[[], None]] = None, ): + server_settings = next(get_server_settings()) + interrupt_requests = server_settings.interrupt_requests if server_settings else False async with inner_send_chan: try: async for chunk in iterate_in_threadpool(iterator): await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() - if ( - next(get_server_settings()).interrupt_requests - and llama_outer_lock.locked() - ): + if interrupt_requests and llama_outer_lock.locked(): await inner_send_chan.send(dict(data="[DONE]")) raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) except anyio.get_cancelled_exc_class() as e: - print("disconnected") + logger.warning(f"Disconnected from client {request.client}") with anyio.move_on_after(1, shield=True): - print(f"Disconnected from client (via refresh/close) {request.client}") + logger.error(f"Disconnected from client (via refresh/close) {request.client}") raise e + finally: + if on_complete: + on_complete() def _logit_bias_tokens_to_input_ids( @@ -223,6 +260,32 @@ async def authenticate( openai_v1_tag = "OpenAI V1" +@router.get( + "/v1/health", + response_model=HealthMetrics, + summary="Server's health", +) +async def check_health(): + # 3 running tasks + new scheduled request + if 0 <= task_queue_status.get("running_tasks_count", 0) <= 4: + return JSONResponse( + content={"status": "OK", "task_queue_status": task_queue_status} + ) + # 2 - 6 scheduled requests + elif 4 < task_queue_status.get("running_tasks_count", 0) < 10: + return JSONResponse( + content={"status": "Warning", "task_queue_status": task_queue_status} + ) + # 7+ scheduled requests + # TODO: Evaluate if in this case we should manually stop the execution of certain tasks to clear the queue + elif task_queue_status.get("running_tasks_count", 0) >= 10: + return JSONResponse( + content={"status": "Critical", "task_queue_status": task_queue_status} + ) + else: + pass + + @router.post( "/v1/completions", summary="Completion", @@ -265,8 +328,16 @@ async def authenticate( async def create_completion( request: Request, body: CreateCompletionRequest, - llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> llama_cpp.Completion: + exit_stack = contextlib.ExitStack() + llama_proxy = await run_in_threadpool( + lambda: exit_stack.enter_context(contextlib.contextmanager(get_llama_proxy)()) + ) + if llama_proxy is None: + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="Service is not available", + ) if isinstance(body.prompt, list): 
assert len(body.prompt) <= 1 body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" @@ -276,12 +347,12 @@ async def create_completion( if request.url.path != "/v1/engines/copilot-codex/completions" else "copilot-codex" ) - exclude = { "n", "best_of", "logit_bias_type", "user", + "min_tokens", } kwargs = body.model_dump(exclude=exclude) @@ -295,6 +366,15 @@ async def create_completion( if body.grammar is not None: kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) + if body.min_tokens > 0: + _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( + [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] + ) + if "logits_processor" not in kwargs: + kwargs["logits_processor"] = _min_tokens_logits_processor + else: + kwargs["logits_processor"].extend(_min_tokens_logits_processor) + iterator_or_completion: Union[ llama_cpp.CreateCompletionResponse, Iterator[llama_cpp.CreateCompletionStreamResponse], @@ -309,6 +389,7 @@ async def create_completion( def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: yield first_response yield from iterator_or_completion + exit_stack.close() send_chan, recv_chan = anyio.create_memory_object_stream(10) return EventSourceResponse( @@ -318,6 +399,7 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: request=request, inner_send_chan=send_chan, iterator=iterator(), + on_complete=exit_stack.close, ), sep="\n", ping_message_factory=_ping_message_factory, @@ -341,7 +423,6 @@ async def create_embedding( **request.model_dump(exclude={"user"}), ) - @router.post( "/v1/chat/completions", summary="Chat", @@ -386,6 +467,7 @@ async def create_chat_completion( {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What is the capital of France?"}, ], + "ai_service": "copilot" }, }, "json_mode": { @@ -396,7 +478,7 @@ async def create_chat_completion( {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Who won the world series in 2020"}, ], - "response_format": { "type": "json_object" } + "response_format": {"type": "json_object"}, }, }, "tool_calling": { @@ -421,15 +503,15 @@ async def create_chat_completion( }, "required": ["name", "age"], }, - } + }, } ], "tool_choice": { "type": "function", "function": { "name": "User", - } - } + }, + }, }, }, "logprobs": { @@ -441,19 +523,39 @@ async def create_chat_completion( {"role": "user", "content": "What is the capital of France?"}, ], "logprobs": True, - "top_logprobs": 10 + "top_logprobs": 10, }, }, } ), - llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> llama_cpp.ChatCompletion: + # This is a workaround for an issue in FastAPI dependencies + # where the dependency is cleaned up before a StreamingResponse + # is complete. 
+ # https://github.com/tiangolo/fastapi/issues/11143 + exit_stack = contextlib.ExitStack() + llama_proxy = await run_in_threadpool( + lambda: exit_stack.enter_context(contextlib.contextmanager(get_llama_proxy)()) + ) + if llama_proxy is None: + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="Service is not available", + ) exclude = { "n", "logit_bias_type", "user", + "min_tokens", } + + # Extract relevant kwargs from the request body kwargs = body.model_dump(exclude=exclude) + + # Adds the ai_service value from the request body to the kwargs + # to be passed downstream to the llama_cpp.ChatCompletion object + kwargs["ai_service"] = body.ai_service + llama = llama_proxy(body.model) if body.logit_bias is not None: kwargs["logit_bias"] = ( @@ -465,10 +567,22 @@ async def create_chat_completion( if body.grammar is not None: kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) + if body.min_tokens > 0: + _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( + [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] + ) + if "logits_processor" not in kwargs: + kwargs["logits_processor"] = _min_tokens_logits_processor + else: + kwargs["logits_processor"].extend(_min_tokens_logits_processor) + + # Set the metrics exporter for the llama object + llama.metrics = set_metrics_exporter() + iterator_or_completion: Union[ llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk] ] = await run_in_threadpool(llama.create_chat_completion, **kwargs) - + if isinstance(iterator_or_completion, Iterator): # EAFP: It's easier to ask for forgiveness than permission first_response = await run_in_threadpool(next, iterator_or_completion) @@ -478,6 +592,7 @@ async def create_chat_completion( def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: yield first_response yield from iterator_or_completion + exit_stack.close() send_chan, recv_chan = anyio.create_memory_object_stream(10) return EventSourceResponse( @@ -487,11 +602,13 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: request=request, inner_send_chan=send_chan, iterator=iterator(), + on_complete=exit_stack.close, ), sep="\n", ping_message_factory=_ping_message_factory, ) else: + exit_stack.close() return iterator_or_completion diff --git a/llama_cpp/server/errors.py b/llama_cpp/server/errors.py index fbf9fd80d..826e7ed94 100644 --- a/llama_cpp/server/errors.py +++ b/llama_cpp/server/errors.py @@ -21,6 +21,7 @@ CreateEmbeddingRequest, CreateChatCompletionRequest, ) +from llama_cpp._logger import logger class ErrorResponse(TypedDict): @@ -134,7 +135,7 @@ def error_message_wrapper( ] = None, ) -> Tuple[int, ErrorResponse]: """Wraps error message in OpenAI style error response""" - print(f"Exception: {str(error)}", file=sys.stderr) + logger.error(f"Exception: {str(error)}") traceback.print_exc(file=sys.stderr) if body is not None and isinstance( body, diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index f00292410..c35fd37dd 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -9,6 +9,7 @@ import llama_cpp.llama_tokenizer as llama_tokenizer from llama_cpp.server.settings import ModelSettings +from llama_cpp._logger import logger class LlamaProxy: @@ -44,6 +45,8 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: if self._current_model is not None: return self._current_model + if self._current_model: + self._current_model.close() self._current_model = None settings = self._model_settings_dict[model] @@ -65,6 
+68,7 @@ def __iter__(self): def free(self): if self._current_model: + self._current_model.close() del self._current_model @staticmethod @@ -183,7 +187,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: num_pred_tokens=settings.draft_model_num_pred_tokens ) - kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None + kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None if settings.kv_overrides is not None: assert isinstance(settings.kv_overrides, list) kv_overrides = {} @@ -197,6 +201,8 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: kv_overrides[key] = int(value) elif value_type == "float": kv_overrides[key] = float(value) + elif value_type == "str": + kv_overrides[key] = value else: raise ValueError(f"Unknown value type {value_type}") @@ -224,6 +230,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: use_mmap=settings.use_mmap, use_mlock=settings.use_mlock, kv_overrides=kv_overrides, + rpc_servers=settings.rpc_servers, # Context Params seed=settings.seed, n_ctx=settings.n_ctx, @@ -266,11 +273,11 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: if settings.cache: if settings.cache_type == "disk": if settings.verbose: - print(f"Using disk cache with size {settings.cache_size}") + logger.info(f"Using disk cache with size {settings.cache_size}") cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) else: if settings.verbose: - print(f"Using ram cache with size {settings.cache_size}") + logger.info(f"Using ram cache with size {settings.cache_size}") cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) _model.set_cache(cache) return _model diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index a3e185007..9d9b533d5 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -58,6 +58,10 @@ class ModelSettings(BaseSettings): default=None, description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.", ) + rpc_servers: Optional[str] = Field( + default=None, + description="comma seperated list of rpc servers for offloading", + ) # Context Params seed: int = Field( default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random." @@ -92,7 +96,7 @@ class ModelSettings(BaseSettings): default=True, description="if true, use experimental mul_mat_q kernels" ) logits_all: bool = Field(default=True, description="Whether to return logits.") - embedding: bool = Field(default=True, description="Whether to use embeddings.") + embedding: bool = Field(default=False, description="Whether to use embeddings.") offload_kqv: bool = Field( default=True, description="Whether to offload kqv to the GPU." ) diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index a20b3940f..9c32fe568 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import List, Optional, Union, Dict +from typing import List, Optional, Union, Dict, Any from typing_extensions import TypedDict, Literal from pydantic import BaseModel, Field @@ -16,6 +16,12 @@ default=16, ge=1, description="The maximum number of tokens to generate." ) +min_tokens_field = Field( + default=0, + ge=0, + description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. 
max_tokens, stop).", +) + temperature_field = Field( default=0.8, description="Adjust the randomness of the generated text.\n\n" @@ -111,6 +117,7 @@ class CreateCompletionRequest(BaseModel): max_tokens: Optional[int] = Field( default=16, ge=0, description="The maximum number of tokens to generate." ) + min_tokens: int = min_tokens_field temperature: float = temperature_field top_p: float = top_p_field min_p: float = min_p_field @@ -206,6 +213,7 @@ class CreateChatCompletionRequest(BaseModel): default=None, description="The maximum number of tokens to generate. Defaults to inf", ) + min_tokens: int = min_tokens_field logprobs: Optional[bool] = Field( default=False, description="Whether to output the logprobs or not. Default is True" @@ -259,6 +267,9 @@ class CreateChatCompletionRequest(BaseModel): } } + # AI service added as request body parameter by Client + ai_service: Optional[str] = None + class ModelData(TypedDict): id: str @@ -306,3 +317,7 @@ class DetokenizeInputResponse(BaseModel): model_config = { "json_schema_extra": {"example": {"text": "How many tokens in this query?"}} } + +class HealthMetrics(BaseModel): + model_config = {"arbitrary_types_allowed": True} + task_queue_status: Dict[str, Any] \ No newline at end of file diff --git a/scripts/releases-to-pep-503.sh b/scripts/releases-to-pep-503.sh index 00ae56721..44fbbf3cf 100755 --- a/scripts/releases-to-pep-503.sh +++ b/scripts/releases-to-pep-503.sh @@ -44,7 +44,9 @@ releases=$(echo $releases | tr ' ' '\n' | grep -E $pattern) # For each release, get all assets for release in $releases; do assets=$(curl -s https://api.github.com/repos/abetlen/llama-cpp-python/releases/tags/$release | jq -r .assets) - echo "
    <h2>$release</h2>" >> index.html + # Get release version from release ie v0.1.0-cu121 -> v0.1.0 + release_version=$(echo $release | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+") + echo "    <h2>$release_version</h2>
" >> index.html for asset in $(echo $assets | jq -r .[].browser_download_url); do if [[ $asset == *".whl" ]]; then echo " $asset" >> index.html diff --git a/tests/test_llama.py b/tests/test_llama.py index 469ef91ca..49150c9f2 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -5,9 +5,18 @@ from scipy.special import log_softmax import llama_cpp +from llama_cpp.llama_metrics import MetricsExporter MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama-spm.gguf" +def set_test_metrics_exporter(): + global metrics_exporter + try: + metrics_exporter + except NameError: + metrics_exporter = MetricsExporter() + + return metrics_exporter def test_llama_cpp_tokenization(): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, verbose=False) @@ -153,7 +162,11 @@ def mock_kv_cache_seq_add( def test_llama_patch(mock_llama): n_ctx = 128 + ai_service_completion = "test-label-suggestions" + ai_service_streaming = "test-acceptance-criteria" llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx) + llama.metrics = set_test_metrics_exporter() + n_vocab = llama_cpp.llama_n_vocab(llama._model.model) assert n_vocab == 32000 @@ -163,32 +176,32 @@ def test_llama_patch(mock_llama): ## Test basic completion from bos until eos mock_llama(llama, all_text) - completion = llama.create_completion("", max_tokens=36) + completion = llama.create_completion("", max_tokens=36, ai_service=ai_service_completion) assert completion["choices"][0]["text"] == all_text assert completion["choices"][0]["finish_reason"] == "stop" ## Test basic completion until eos mock_llama(llama, all_text) - completion = llama.create_completion(text, max_tokens=20) + completion = llama.create_completion(text, max_tokens=20, ai_service=ai_service_completion) assert completion["choices"][0]["text"] == output_text assert completion["choices"][0]["finish_reason"] == "stop" ## Test streaming completion until eos mock_llama(llama, all_text) - chunks = list(llama.create_completion(text, max_tokens=20, stream=True)) + chunks = list(llama.create_completion(text, max_tokens=20, stream=True, ai_service=ai_service_streaming)) assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == output_text assert chunks[-1]["choices"][0]["finish_reason"] == "stop" ## Test basic completion until stop sequence mock_llama(llama, all_text) - completion = llama.create_completion(text, max_tokens=20, stop=["lazy"]) + completion = llama.create_completion(text, max_tokens=20, stop=["lazy"], ai_service=ai_service_completion) assert completion["choices"][0]["text"] == " jumps over the " assert completion["choices"][0]["finish_reason"] == "stop" ## Test streaming completion until stop sequence mock_llama(llama, all_text) chunks = list( - llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"]) + llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"], ai_service=ai_service_streaming) ) assert ( "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps over the " @@ -197,13 +210,13 @@ def test_llama_patch(mock_llama): ## Test basic completion until length mock_llama(llama, all_text) - completion = llama.create_completion(text, max_tokens=2) + completion = llama.create_completion(text, max_tokens=2, ai_service=ai_service_completion) assert completion["choices"][0]["text"] == " jumps" assert completion["choices"][0]["finish_reason"] == "length" ## Test streaming completion until length mock_llama(llama, all_text) - chunks = list(llama.create_completion(text, max_tokens=2, stream=True)) + chunks = 
list(llama.create_completion(text, max_tokens=2, stream=True, ai_service=ai_service_streaming)) assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps" assert chunks[-1]["choices"][0]["finish_reason"] == "length" @@ -228,17 +241,19 @@ def test_llama_pickle(): def test_utf8(mock_llama): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, logits_all=True) + llama.metrics = set_test_metrics_exporter() output_text = "😀" + ai_service = "label-suggestions" ## Test basic completion with utf8 multibyte mock_llama(llama, output_text) - completion = llama.create_completion("", max_tokens=4) + completion = llama.create_completion("", max_tokens=4, ai_service=ai_service) assert completion["choices"][0]["text"] == output_text ## Test basic completion with incomplete utf8 multibyte mock_llama(llama, output_text) - completion = llama.create_completion("", max_tokens=1) + completion = llama.create_completion("", max_tokens=1, ai_service=ai_service) assert completion["choices"][0]["text"] == "" @@ -266,6 +281,22 @@ def test_llama_server(): } +def test_metrics_endpoint(): + from fastapi.testclient import TestClient + from llama_cpp.server.app import create_app, Settings + + settings = Settings( + model=MODEL, + vocab_only=True, + ) + app = create_app(settings) + client = TestClient(app) + response = client.get("/metrics") + assert response.status_code == 200 + assert "test-label-suggestions" in response.text + assert "test-acceptance-criteria" in response.text + + @pytest.mark.parametrize( "size_and_axis", [ diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py index c10aee42e..f031bf72b 100644 --- a/tests/test_llama_chat_format.py +++ b/tests/test_llama_chat_format.py @@ -21,12 +21,13 @@ def test_mistral_instruct(): response = llama_chat_format.format_mistral_instruct( messages=messages, ) + prompt = ("" if response.added_special else "") + response.prompt reference = chat_formatter.render( messages=messages, bos_token="", eos_token="", ) - assert response.prompt == reference + assert prompt == reference mistral_7b_tokenizer_config = """{ diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f364eb6fb..968967376 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f364eb6fb5d46118a76fa045f487318de4c24961 +Subproject commit 968967376dc2c018d29f897c4883d335bbf384fb
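The MetricsExporter changes above are thin wrappers around the standard prometheus_client primitives: labelled Histogram and Gauge objects with pre-declared buckets. As a reference for readers unfamiliar with that API, here is a minimal, self-contained sketch of the same calls; the metric names, the "service" label and port 9100 are illustrative and are not the ones registered by the exporter.

```python
# Minimal prometheus_client usage mirroring what MetricsExporter relies on:
# a labelled Histogram for latencies and a labelled Gauge for throughput.
# Metric names, the "service" label and port 9100 are illustrative only.
import random
import time

from prometheus_client import Gauge, Histogram, start_http_server

REQUEST_LATENCY = Histogram(
    name="example_request_latency_seconds",
    documentation="End-to-end request latency in seconds",
    labelnames=["service"],
    buckets=[0.1, 0.5, 1.0, 2.5, 5.0, 10.0],
)
THROUGHPUT = Gauge(
    name="example_throughput_tokens_per_second",
    documentation="Current generation throughput",
    labelnames=["service"],
)

if __name__ == "__main__":
    start_http_server(9100)  # metrics are scraped from http://localhost:9100/metrics
    for _ in range(60):      # emit one synthetic sample per second
        REQUEST_LATENCY.labels(service="demo").observe(random.uniform(0.1, 3.0))
        THROUGHPUT.labels(service="demo").set(random.uniform(10.0, 50.0))
        time.sleep(1)
```

Prometheus histograms store cumulative bucket counts, which is why the exporter pre-declares the bucket boundaries seen in the diff rather than computing them per request.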
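The new /v1/health route classifies the server by running_tasks_count: 0-4 reports OK, 5-9 Warning, and 10 or more Critical. A hedged client-side sketch of polling that endpoint follows; the base URL and the use of httpx as the HTTP client are assumptions made only for this example.

```python
# Hedged example: poll the new /v1/health endpoint and report its status.
# Thresholds mirror the handler added in llama_cpp/server/app.py:
# 0-4 running tasks -> "OK", 5-9 -> "Warning", 10+ -> "Critical".
# The base URL and the choice of httpx are assumptions for this example.
import httpx


def check_server_health(base_url: str = "http://localhost:8000") -> str:
    response = httpx.get(f"{base_url}/v1/health", timeout=5.0)
    response.raise_for_status()
    payload = response.json()
    running = payload["task_queue_status"].get("running_tasks_count", 0)
    print(f"status={payload['status']} running_tasks={running}")
    return payload["status"]


if __name__ == "__main__":
    check_server_health()
```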
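Both completion routes now enter get_llama_proxy through a contextlib.ExitStack and release it from the stream's on_complete callback, because FastAPI tears down ordinary dependencies before a StreamingResponse finishes (see the linked issue in the diff). The pattern in isolation, with placeholder names, looks roughly like this:

```python
# Sketch of the ExitStack pattern from app.py: a generator-based context
# manager is entered manually and released from the stream's completion
# callback instead of FastAPI's dependency teardown. Names are placeholders.
import contextlib
from typing import Iterator


@contextlib.contextmanager
def get_resource() -> Iterator[str]:  # stand-in for get_llama_proxy()
    print("acquired")
    try:
        yield "llama-proxy"
    finally:
        print("released")


exit_stack = contextlib.ExitStack()
resource = exit_stack.enter_context(get_resource())


def stream() -> Iterator[str]:
    yield f"using {resource}"
    # in app.py this happens via on_complete=exit_stack.close
    exit_stack.close()


for chunk in stream():
    print(chunk)
```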
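When min_tokens > 0, the handlers attach llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos()) via a LogitsProcessorList. The sketch below illustrates the general technique such a processor uses, namely masking the EOS logit until enough tokens have been produced; it is an illustration under that assumption, not the library's own implementation.

```python
# Illustration of a "minimum tokens" logits processor: while fewer than
# min_tokens tokens have been generated past the prompt, the EOS logit is
# forced to -inf so sampling cannot terminate early. This is a sketch of
# the technique, not llama_cpp's MinTokensLogitsProcessor itself.
from typing import Optional

import numpy as np


class MinTokensProcessorSketch:
    def __init__(self, min_tokens: int, token_eos: int):
        self.min_tokens = min_tokens
        self.token_eos = token_eos
        self.prompt_length: Optional[int] = None

    def __call__(self, input_ids: np.ndarray, scores: np.ndarray) -> np.ndarray:
        if self.prompt_length is None:
            self.prompt_length = len(input_ids)  # first call sees only the prompt
        generated = len(input_ids) - self.prompt_length
        if generated < self.min_tokens:
            scores[self.token_eos] = -np.inf
        return scores
```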
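load_llama_from_model_settings now also accepts str values in the key=type:value override syntax. A standalone sketch of that parsing rule follows; the helper name and error text are made up for the example.

```python
# Sketch of the key=type:value parsing used for kv_overrides, including the
# "str" branch added in this diff. The helper name and messages are made up.
from typing import Dict, List, Union


def parse_kv_overrides(items: List[str]) -> Dict[str, Union[bool, int, float, str]]:
    overrides: Dict[str, Union[bool, int, float, str]] = {}
    for item in items:
        key, typed_value = item.split("=", 1)
        value_type, value = typed_value.split(":", 1)
        if value_type == "bool":
            overrides[key] = value in ("true", "TRUE", "1")
        elif value_type == "int":
            overrides[key] = int(value)
        elif value_type == "float":
            overrides[key] = float(value)
        elif value_type == "str":
            overrides[key] = value
        else:
            raise ValueError(f"Unknown value type {value_type}")
    return overrides


print(parse_kv_overrides(["tokenizer.ggml.add_bos_token=bool:true", "general.name=str:my-model"]))
```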
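releases-to-pep-503.sh now derives release_version by trimming the build suffix from the tag with grep -oE. The same extraction expressed in Python, for clarity; the helper name is illustrative.

```python
# Python equivalent of the shell step:
#   release_version=$(echo $release | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+")
# e.g. "v0.1.0-cu121" -> "v0.1.0". The helper name is illustrative.
import re


def release_version(tag: str) -> str:
    match = re.match(r"^v?[0-9]+\.[0-9]+\.[0-9]+", tag)
    if match is None:
        raise ValueError(f"Unrecognised release tag: {tag}")
    return match.group(0)


assert release_version("v0.1.0-cu121") == "v0.1.0"
assert release_version("0.2.79") == "0.2.79"
```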