diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index def441a3..00000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,80 +0,0 @@ -# Use the latest 2.1 version of CircleCI pipeline process engine. -# See: https://circleci.com/docs/2.0/configuration-reference -version: 2.1 - -# Orbs are reusable packages of CircleCI configuration that you may share across projects, enabling you to create encapsulated, parameterized commands, jobs, and executors that can be used across multiple projects. -# See: https://circleci.com/docs/2.0/orb-intro/ -orbs: - # The python orb contains a set of prepackaged CircleCI configuration you can use repeatedly in your configuration files - # Orb commands and jobs help you with common scripting around a language/tool - # so you dont have to copy and paste it everywhere. - # See the orb documentation here: https://circleci.com/developer/orbs/orb/circleci/python - python: circleci/python@1.5.0 - -# Define a job to be invoked later in a workflow. -# See: https://circleci.com/docs/2.0/configuration-reference/#jobs -jobs: - build-and-test: # This is the name of the job, feel free to change it to better match what you're trying to do! - # These next lines defines a Docker executors: https://circleci.com/docs/2.0/executor-types/ - # You can specify an image from Dockerhub or use one of the convenience images from CircleCI's Developer Hub - # A list of available CircleCI Docker convenience images are available here: https://circleci.com/developer/images/image/cimg/python - # The executor is the environment in which the steps below will be executed - below will use a python 3.10.2 container - # Change the version below to your required version of python - docker: - # Important: Don't change this otherwise we will stop testing the earliest - # python version we have to support. - - image: python:3.8-bullseye - resource_class: small - steps: - - checkout # checkout source code to working directory - - run: - name: Install Environment Dependencies - command: | # install dependencies - apt-get -y install curl - pip install --upgrade pip - pip install poetry - poetry install --no-ansi - - - run: - name: Black Formatting Check # Only validation, without re-formatting - command: | - poetry run black --check -t py36 launch - - run: - name: Ruff Lint Check # Uses pyproject.toml for configuration - command: | - poetry run ruff launch - - run: - name: Pylint Lint Check # Uses .pylintrc for configuration - command: | - poetry run pylint launch --ignore=api_client,openapi_client - - run: - name: MyPy typing check - command: | - poetry run mypy --ignore-missing-imports launch --exclude launch/api_client --exclude launch/openapi_client - - run: - name: Isort Import Formatting Check # Only validation, without re-formatting - command: | - poetry run isort --check-only launch - - run: - name: Pytest Test Cases - command: | - mkdir test_results - set -e - TEST_FILES=$(circleci tests glob "tests/**/test_*.py") - poetry run coverage run --include=launch/* -m pytest $TEST_FILES - poetry run coverage report - poetry run coverage html - - store_test_results: - path: htmlcov - - store_test_results: - path: test_results - - store_artifacts: - path: test_results - -# Invoke jobs via workflows -# See: https://circleci.com/docs/2.0/configuration-reference/#workflows -workflows: - build_and_test: # This is the name of the workflow, feel free to change it to better match your workflow. - # Inside the workflow, you define the jobs you want to run. 
- jobs: - - build-and-test diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 568870df..00000000 --- a/.gitignore +++ /dev/null @@ -1,17 +0,0 @@ -# python -**/__pycache__/ - -# editor -.vscode -**/.idea/ -**/scratch/ -*.swp -.vim/ - -# Sphinx documentation -docs/_sources -docs/.doctrees - -site/ - -.DS_Store diff --git a/.isort.cfg b/.isort.cfg deleted file mode 100644 index c7c32c0a..00000000 --- a/.isort.cfg +++ /dev/null @@ -1,4 +0,0 @@ -[settings] -profile=black -# Enforced in .flake8 -line_length=79 diff --git a/docs/.nojekyll b/.nojekyll similarity index 100% rename from docs/.nojekyll rename to .nojekyll diff --git a/.openapi-generator-ignore b/.openapi-generator-ignore deleted file mode 100644 index 140a9113..00000000 --- a/.openapi-generator-ignore +++ /dev/null @@ -1,19 +0,0 @@ -# OpenAPI Generator Ignore -# Files listed here will not be overwritten by the generator - -# Custom wrapper code -launch/__init__.py -launch/client.py -launch/model_endpoint.py - -# Project files we maintain manually -README.md -.gitignore -requirements.txt - -# Type stub files (have syntax errors due to generator bug) -**/*.pyi - -# Generated docs (have invalid Python syntax in examples due to generator bug) -launch/api_client_README.md -launch/api_client/docs/** diff --git a/.openapi-generator/FILES b/.openapi-generator/FILES deleted file mode 100644 index 3b4c5ac1..00000000 --- a/.openapi-generator/FILES +++ /dev/null @@ -1,456 +0,0 @@ -launch/api_client/__init__.py -launch/api_client/api_client.py -launch/api_client/apis/__init__.py -launch/api_client/apis/tags/default_api.py -launch/api_client/configuration.py -launch/api_client/docs/apis/tags/DefaultApi.md -launch/api_client/docs/models/Annotation.md -launch/api_client/docs/models/Audio.md -launch/api_client/docs/models/Audio1.md -launch/api_client/docs/models/Audio2.md -launch/api_client/docs/models/BatchCompletionsJob.md -launch/api_client/docs/models/BatchCompletionsJobStatus.md -launch/api_client/docs/models/BatchCompletionsModelConfig.md -launch/api_client/docs/models/BatchJobSerializationFormat.md -launch/api_client/docs/models/BatchJobStatus.md -launch/api_client/docs/models/CallbackAuth.md -launch/api_client/docs/models/CallbackBasicAuth.md -launch/api_client/docs/models/CallbackmTLSAuth.md -launch/api_client/docs/models/CancelBatchCompletionsV2Response.md -launch/api_client/docs/models/CancelFineTuneResponse.md -launch/api_client/docs/models/ChatCompletionFunctionCallOption.md -launch/api_client/docs/models/ChatCompletionFunctions.md -launch/api_client/docs/models/ChatCompletionMessageToolCall.md -launch/api_client/docs/models/ChatCompletionMessageToolCallChunk.md -launch/api_client/docs/models/ChatCompletionMessageToolCallsInput.md -launch/api_client/docs/models/ChatCompletionMessageToolCallsOutput.md -launch/api_client/docs/models/ChatCompletionNamedToolChoice.md -launch/api_client/docs/models/ChatCompletionRequestAssistantMessage.md -launch/api_client/docs/models/ChatCompletionRequestAssistantMessageContentPart.md -launch/api_client/docs/models/ChatCompletionRequestDeveloperMessage.md -launch/api_client/docs/models/ChatCompletionRequestFunctionMessage.md -launch/api_client/docs/models/ChatCompletionRequestMessage.md -launch/api_client/docs/models/ChatCompletionRequestMessageContentPartAudio.md -launch/api_client/docs/models/ChatCompletionRequestMessageContentPartFile.md -launch/api_client/docs/models/ChatCompletionRequestMessageContentPartImage.md 
-launch/api_client/docs/models/ChatCompletionRequestMessageContentPartRefusal.md -launch/api_client/docs/models/ChatCompletionRequestMessageContentPartText.md -launch/api_client/docs/models/ChatCompletionRequestSystemMessage.md -launch/api_client/docs/models/ChatCompletionRequestSystemMessageContentPart.md -launch/api_client/docs/models/ChatCompletionRequestToolMessage.md -launch/api_client/docs/models/ChatCompletionRequestToolMessageContentPart.md -launch/api_client/docs/models/ChatCompletionRequestUserMessage.md -launch/api_client/docs/models/ChatCompletionRequestUserMessageContentPart.md -launch/api_client/docs/models/ChatCompletionResponseMessage.md -launch/api_client/docs/models/ChatCompletionStreamOptions.md -launch/api_client/docs/models/ChatCompletionStreamResponseDelta.md -launch/api_client/docs/models/ChatCompletionTokenLogprob.md -launch/api_client/docs/models/ChatCompletionTool.md -launch/api_client/docs/models/ChatCompletionToolChoiceOption.md -launch/api_client/docs/models/ChatCompletionV2Request.md -launch/api_client/docs/models/ChatCompletionV2StreamErrorChunk.md -launch/api_client/docs/models/Choice.md -launch/api_client/docs/models/Choice1.md -launch/api_client/docs/models/Choice2.md -launch/api_client/docs/models/CloneModelBundleV1Request.md -launch/api_client/docs/models/CloneModelBundleV2Request.md -launch/api_client/docs/models/CloudpickleArtifactFlavor.md -launch/api_client/docs/models/CompletionOutput.md -launch/api_client/docs/models/CompletionStreamOutput.md -launch/api_client/docs/models/CompletionStreamV1Request.md -launch/api_client/docs/models/CompletionStreamV1Response.md -launch/api_client/docs/models/CompletionSyncV1Request.md -launch/api_client/docs/models/CompletionSyncV1Response.md -launch/api_client/docs/models/CompletionTokensDetails.md -launch/api_client/docs/models/CompletionUsage.md -launch/api_client/docs/models/CompletionV2Request.md -launch/api_client/docs/models/CompletionV2StreamErrorChunk.md -launch/api_client/docs/models/Content.md -launch/api_client/docs/models/Content1.md -launch/api_client/docs/models/Content2.md -launch/api_client/docs/models/Content3.md -launch/api_client/docs/models/Content4.md -launch/api_client/docs/models/Content8.md -launch/api_client/docs/models/CreateAsyncTaskV1Response.md -launch/api_client/docs/models/CreateBatchCompletionsV1ModelConfig.md -launch/api_client/docs/models/CreateBatchCompletionsV1Request.md -launch/api_client/docs/models/CreateBatchCompletionsV1RequestContent.md -launch/api_client/docs/models/CreateBatchCompletionsV1Response.md -launch/api_client/docs/models/CreateBatchCompletionsV2Request.md -launch/api_client/docs/models/CreateBatchJobResourceRequests.md -launch/api_client/docs/models/CreateBatchJobV1Request.md -launch/api_client/docs/models/CreateBatchJobV1Response.md -launch/api_client/docs/models/CreateChatCompletionResponse.md -launch/api_client/docs/models/CreateChatCompletionStreamResponse.md -launch/api_client/docs/models/CreateCompletionResponse.md -launch/api_client/docs/models/CreateDeepSpeedModelEndpointRequest.md -launch/api_client/docs/models/CreateDockerImageBatchJobBundleV1Request.md -launch/api_client/docs/models/CreateDockerImageBatchJobBundleV1Response.md -launch/api_client/docs/models/CreateDockerImageBatchJobResourceRequests.md -launch/api_client/docs/models/CreateDockerImageBatchJobV1Request.md -launch/api_client/docs/models/CreateDockerImageBatchJobV1Response.md -launch/api_client/docs/models/CreateFineTuneRequest.md -launch/api_client/docs/models/CreateFineTuneResponse.md 
-launch/api_client/docs/models/CreateLLMModelEndpointV1Request.md -launch/api_client/docs/models/CreateLLMModelEndpointV1Response.md -launch/api_client/docs/models/CreateLightLLMModelEndpointRequest.md -launch/api_client/docs/models/CreateModelBundleV1Request.md -launch/api_client/docs/models/CreateModelBundleV1Response.md -launch/api_client/docs/models/CreateModelBundleV2Request.md -launch/api_client/docs/models/CreateModelBundleV2Response.md -launch/api_client/docs/models/CreateModelEndpointV1Request.md -launch/api_client/docs/models/CreateModelEndpointV1Response.md -launch/api_client/docs/models/CreateSGLangModelEndpointRequest.md -launch/api_client/docs/models/CreateTensorRTLLMModelEndpointRequest.md -launch/api_client/docs/models/CreateTextGenerationInferenceModelEndpointRequest.md -launch/api_client/docs/models/CreateTriggerV1Request.md -launch/api_client/docs/models/CreateTriggerV1Response.md -launch/api_client/docs/models/CreateVLLMModelEndpointRequest.md -launch/api_client/docs/models/CustomFramework.md -launch/api_client/docs/models/DeleteFileResponse.md -launch/api_client/docs/models/DeleteLLMEndpointResponse.md -launch/api_client/docs/models/DeleteModelEndpointV1Response.md -launch/api_client/docs/models/DeleteTriggerV1Response.md -launch/api_client/docs/models/DockerImageBatchJob.md -launch/api_client/docs/models/DockerImageBatchJobBundleV1Response.md -launch/api_client/docs/models/EndpointPredictV1Request.md -launch/api_client/docs/models/File.md -launch/api_client/docs/models/FilteredChatCompletionV2Request.md -launch/api_client/docs/models/FilteredCompletionV2Request.md -launch/api_client/docs/models/Function1.md -launch/api_client/docs/models/Function2.md -launch/api_client/docs/models/Function3.md -launch/api_client/docs/models/FunctionCall.md -launch/api_client/docs/models/FunctionCall2.md -launch/api_client/docs/models/FunctionObject.md -launch/api_client/docs/models/FunctionParameters.md -launch/api_client/docs/models/GetAsyncTaskV1Response.md -launch/api_client/docs/models/GetBatchCompletionV2Response.md -launch/api_client/docs/models/GetBatchJobV1Response.md -launch/api_client/docs/models/GetDockerImageBatchJobV1Response.md -launch/api_client/docs/models/GetFileContentResponse.md -launch/api_client/docs/models/GetFileResponse.md -launch/api_client/docs/models/GetFineTuneEventsResponse.md -launch/api_client/docs/models/GetFineTuneResponse.md -launch/api_client/docs/models/GetLLMModelEndpointV1Response.md -launch/api_client/docs/models/GetModelEndpointV1Response.md -launch/api_client/docs/models/GetTriggerV1Response.md -launch/api_client/docs/models/GpuType.md -launch/api_client/docs/models/HTTPValidationError.md -launch/api_client/docs/models/ImageUrl.md -launch/api_client/docs/models/InputAudio.md -launch/api_client/docs/models/JsonSchema.md -launch/api_client/docs/models/LLMFineTuneEvent.md -launch/api_client/docs/models/LLMInferenceFramework.md -launch/api_client/docs/models/LLMSource.md -launch/api_client/docs/models/ListDockerImageBatchJobBundleV1Response.md -launch/api_client/docs/models/ListDockerImageBatchJobsV1Response.md -launch/api_client/docs/models/ListFilesResponse.md -launch/api_client/docs/models/ListFineTunesResponse.md -launch/api_client/docs/models/ListLLMModelEndpointsV1Response.md -launch/api_client/docs/models/ListModelBundlesV1Response.md -launch/api_client/docs/models/ListModelBundlesV2Response.md -launch/api_client/docs/models/ListModelEndpointsV1Response.md -launch/api_client/docs/models/ListTriggersV1Response.md 
-launch/api_client/docs/models/Logprobs.md -launch/api_client/docs/models/Logprobs2.md -launch/api_client/docs/models/Metadata.md -launch/api_client/docs/models/ModelBundleEnvironmentParams.md -launch/api_client/docs/models/ModelBundleFrameworkType.md -launch/api_client/docs/models/ModelBundleOrderBy.md -launch/api_client/docs/models/ModelBundlePackagingType.md -launch/api_client/docs/models/ModelBundleV1Response.md -launch/api_client/docs/models/ModelBundleV2Response.md -launch/api_client/docs/models/ModelDownloadRequest.md -launch/api_client/docs/models/ModelDownloadResponse.md -launch/api_client/docs/models/ModelEndpointDeploymentState.md -launch/api_client/docs/models/ModelEndpointOrderBy.md -launch/api_client/docs/models/ModelEndpointResourceState.md -launch/api_client/docs/models/ModelEndpointStatus.md -launch/api_client/docs/models/ModelEndpointType.md -launch/api_client/docs/models/ParallelToolCalls.md -launch/api_client/docs/models/PredictionContent.md -launch/api_client/docs/models/Prompt.md -launch/api_client/docs/models/Prompt1.md -launch/api_client/docs/models/Prompt1Item.md -launch/api_client/docs/models/PromptTokensDetails.md -launch/api_client/docs/models/PytorchFramework.md -launch/api_client/docs/models/Quantization.md -launch/api_client/docs/models/ReasoningEffort.md -launch/api_client/docs/models/RequestSchema.md -launch/api_client/docs/models/ResponseFormatJsonObject.md -launch/api_client/docs/models/ResponseFormatJsonSchema.md -launch/api_client/docs/models/ResponseFormatJsonSchemaSchema.md -launch/api_client/docs/models/ResponseFormatText.md -launch/api_client/docs/models/ResponseModalities.md -launch/api_client/docs/models/ResponseSchema.md -launch/api_client/docs/models/RestartModelEndpointV1Response.md -launch/api_client/docs/models/RunnableImageFlavor.md -launch/api_client/docs/models/ServiceTier.md -launch/api_client/docs/models/StopConfiguration.md -launch/api_client/docs/models/StopConfiguration1.md -launch/api_client/docs/models/StreamError.md -launch/api_client/docs/models/StreamErrorContent.md -launch/api_client/docs/models/StreamingEnhancedRunnableImageFlavor.md -launch/api_client/docs/models/SyncEndpointPredictV1Request.md -launch/api_client/docs/models/SyncEndpointPredictV1Response.md -launch/api_client/docs/models/TaskStatus.md -launch/api_client/docs/models/TensorflowFramework.md -launch/api_client/docs/models/TokenOutput.md -launch/api_client/docs/models/ToolConfig.md -launch/api_client/docs/models/TopLogprob.md -launch/api_client/docs/models/TritonEnhancedRunnableImageFlavor.md -launch/api_client/docs/models/UpdateBatchCompletionsV2Request.md -launch/api_client/docs/models/UpdateBatchCompletionsV2Response.md -launch/api_client/docs/models/UpdateBatchJobV1Request.md -launch/api_client/docs/models/UpdateBatchJobV1Response.md -launch/api_client/docs/models/UpdateDeepSpeedModelEndpointRequest.md -launch/api_client/docs/models/UpdateDockerImageBatchJobV1Request.md -launch/api_client/docs/models/UpdateDockerImageBatchJobV1Response.md -launch/api_client/docs/models/UpdateLLMModelEndpointV1Request.md -launch/api_client/docs/models/UpdateLLMModelEndpointV1Response.md -launch/api_client/docs/models/UpdateModelEndpointV1Request.md -launch/api_client/docs/models/UpdateModelEndpointV1Response.md -launch/api_client/docs/models/UpdateSGLangModelEndpointRequest.md -launch/api_client/docs/models/UpdateTextGenerationInferenceModelEndpointRequest.md -launch/api_client/docs/models/UpdateTriggerV1Request.md -launch/api_client/docs/models/UpdateTriggerV1Response.md 
-launch/api_client/docs/models/UpdateVLLMModelEndpointRequest.md -launch/api_client/docs/models/UploadFileResponse.md -launch/api_client/docs/models/UrlCitation.md -launch/api_client/docs/models/UserLocation.md -launch/api_client/docs/models/ValidationError.md -launch/api_client/docs/models/VoiceIdsShared.md -launch/api_client/docs/models/WebSearchContextSize.md -launch/api_client/docs/models/WebSearchLocation.md -launch/api_client/docs/models/WebSearchOptions.md -launch/api_client/docs/models/ZipArtifactFlavor.md -launch/api_client/exceptions.py -launch/api_client/model/__init__.py -launch/api_client/model/annotation.py -launch/api_client/model/audio.py -launch/api_client/model/audio1.py -launch/api_client/model/audio2.py -launch/api_client/model/batch_completions_job.py -launch/api_client/model/batch_completions_job_status.py -launch/api_client/model/batch_completions_model_config.py -launch/api_client/model/batch_job_serialization_format.py -launch/api_client/model/batch_job_status.py -launch/api_client/model/callback_auth.py -launch/api_client/model/callback_basic_auth.py -launch/api_client/model/callbackm_tls_auth.py -launch/api_client/model/cancel_batch_completions_v2_response.py -launch/api_client/model/cancel_fine_tune_response.py -launch/api_client/model/chat_completion_function_call_option.py -launch/api_client/model/chat_completion_functions.py -launch/api_client/model/chat_completion_message_tool_call.py -launch/api_client/model/chat_completion_message_tool_call_chunk.py -launch/api_client/model/chat_completion_message_tool_calls_input.py -launch/api_client/model/chat_completion_message_tool_calls_output.py -launch/api_client/model/chat_completion_named_tool_choice.py -launch/api_client/model/chat_completion_request_assistant_message.py -launch/api_client/model/chat_completion_request_assistant_message_content_part.py -launch/api_client/model/chat_completion_request_developer_message.py -launch/api_client/model/chat_completion_request_function_message.py -launch/api_client/model/chat_completion_request_message.py -launch/api_client/model/chat_completion_request_message_content_part_audio.py -launch/api_client/model/chat_completion_request_message_content_part_file.py -launch/api_client/model/chat_completion_request_message_content_part_image.py -launch/api_client/model/chat_completion_request_message_content_part_refusal.py -launch/api_client/model/chat_completion_request_message_content_part_text.py -launch/api_client/model/chat_completion_request_system_message.py -launch/api_client/model/chat_completion_request_system_message_content_part.py -launch/api_client/model/chat_completion_request_tool_message.py -launch/api_client/model/chat_completion_request_tool_message_content_part.py -launch/api_client/model/chat_completion_request_user_message.py -launch/api_client/model/chat_completion_request_user_message_content_part.py -launch/api_client/model/chat_completion_response_message.py -launch/api_client/model/chat_completion_stream_options.py -launch/api_client/model/chat_completion_stream_response_delta.py -launch/api_client/model/chat_completion_token_logprob.py -launch/api_client/model/chat_completion_tool.py -launch/api_client/model/chat_completion_tool_choice_option.py -launch/api_client/model/chat_completion_v2_request.py -launch/api_client/model/chat_completion_v2_stream_error_chunk.py -launch/api_client/model/choice.py -launch/api_client/model/choice1.py -launch/api_client/model/choice2.py -launch/api_client/model/clone_model_bundle_v1_request.py 
-launch/api_client/model/clone_model_bundle_v2_request.py -launch/api_client/model/cloudpickle_artifact_flavor.py -launch/api_client/model/completion_output.py -launch/api_client/model/completion_stream_output.py -launch/api_client/model/completion_stream_v1_request.py -launch/api_client/model/completion_stream_v1_response.py -launch/api_client/model/completion_sync_v1_request.py -launch/api_client/model/completion_sync_v1_response.py -launch/api_client/model/completion_tokens_details.py -launch/api_client/model/completion_usage.py -launch/api_client/model/completion_v2_request.py -launch/api_client/model/completion_v2_stream_error_chunk.py -launch/api_client/model/content.py -launch/api_client/model/content1.py -launch/api_client/model/content2.py -launch/api_client/model/content3.py -launch/api_client/model/content4.py -launch/api_client/model/content8.py -launch/api_client/model/create_async_task_v1_response.py -launch/api_client/model/create_batch_completions_v1_model_config.py -launch/api_client/model/create_batch_completions_v1_request.py -launch/api_client/model/create_batch_completions_v1_request_content.py -launch/api_client/model/create_batch_completions_v1_response.py -launch/api_client/model/create_batch_completions_v2_request.py -launch/api_client/model/create_batch_job_resource_requests.py -launch/api_client/model/create_batch_job_v1_request.py -launch/api_client/model/create_batch_job_v1_response.py -launch/api_client/model/create_chat_completion_response.py -launch/api_client/model/create_chat_completion_stream_response.py -launch/api_client/model/create_completion_response.py -launch/api_client/model/create_deep_speed_model_endpoint_request.py -launch/api_client/model/create_docker_image_batch_job_bundle_v1_request.py -launch/api_client/model/create_docker_image_batch_job_bundle_v1_response.py -launch/api_client/model/create_docker_image_batch_job_resource_requests.py -launch/api_client/model/create_docker_image_batch_job_v1_request.py -launch/api_client/model/create_docker_image_batch_job_v1_response.py -launch/api_client/model/create_fine_tune_request.py -launch/api_client/model/create_fine_tune_response.py -launch/api_client/model/create_light_llm_model_endpoint_request.py -launch/api_client/model/create_llm_model_endpoint_v1_request.py -launch/api_client/model/create_llm_model_endpoint_v1_response.py -launch/api_client/model/create_model_bundle_v1_request.py -launch/api_client/model/create_model_bundle_v1_response.py -launch/api_client/model/create_model_bundle_v2_request.py -launch/api_client/model/create_model_bundle_v2_response.py -launch/api_client/model/create_model_endpoint_v1_request.py -launch/api_client/model/create_model_endpoint_v1_response.py -launch/api_client/model/create_sg_lang_model_endpoint_request.py -launch/api_client/model/create_tensor_rtllm_model_endpoint_request.py -launch/api_client/model/create_text_generation_inference_model_endpoint_request.py -launch/api_client/model/create_trigger_v1_request.py -launch/api_client/model/create_trigger_v1_response.py -launch/api_client/model/create_vllm_model_endpoint_request.py -launch/api_client/model/custom_framework.py -launch/api_client/model/delete_file_response.py -launch/api_client/model/delete_llm_endpoint_response.py -launch/api_client/model/delete_model_endpoint_v1_response.py -launch/api_client/model/delete_trigger_v1_response.py -launch/api_client/model/docker_image_batch_job.py -launch/api_client/model/docker_image_batch_job_bundle_v1_response.py 
-launch/api_client/model/endpoint_predict_v1_request.py -launch/api_client/model/file.py -launch/api_client/model/filtered_chat_completion_v2_request.py -launch/api_client/model/filtered_completion_v2_request.py -launch/api_client/model/function1.py -launch/api_client/model/function2.py -launch/api_client/model/function3.py -launch/api_client/model/function_call.py -launch/api_client/model/function_call2.py -launch/api_client/model/function_object.py -launch/api_client/model/function_parameters.py -launch/api_client/model/get_async_task_v1_response.py -launch/api_client/model/get_batch_completion_v2_response.py -launch/api_client/model/get_batch_job_v1_response.py -launch/api_client/model/get_docker_image_batch_job_v1_response.py -launch/api_client/model/get_file_content_response.py -launch/api_client/model/get_file_response.py -launch/api_client/model/get_fine_tune_events_response.py -launch/api_client/model/get_fine_tune_response.py -launch/api_client/model/get_llm_model_endpoint_v1_response.py -launch/api_client/model/get_model_endpoint_v1_response.py -launch/api_client/model/get_trigger_v1_response.py -launch/api_client/model/gpu_type.py -launch/api_client/model/http_validation_error.py -launch/api_client/model/image_url.py -launch/api_client/model/input_audio.py -launch/api_client/model/json_schema.py -launch/api_client/model/list_docker_image_batch_job_bundle_v1_response.py -launch/api_client/model/list_docker_image_batch_jobs_v1_response.py -launch/api_client/model/list_files_response.py -launch/api_client/model/list_fine_tunes_response.py -launch/api_client/model/list_llm_model_endpoints_v1_response.py -launch/api_client/model/list_model_bundles_v1_response.py -launch/api_client/model/list_model_bundles_v2_response.py -launch/api_client/model/list_model_endpoints_v1_response.py -launch/api_client/model/list_triggers_v1_response.py -launch/api_client/model/llm_fine_tune_event.py -launch/api_client/model/llm_inference_framework.py -launch/api_client/model/llm_source.py -launch/api_client/model/logprobs.py -launch/api_client/model/logprobs2.py -launch/api_client/model/metadata.py -launch/api_client/model/model_bundle_environment_params.py -launch/api_client/model/model_bundle_framework_type.py -launch/api_client/model/model_bundle_order_by.py -launch/api_client/model/model_bundle_packaging_type.py -launch/api_client/model/model_bundle_v1_response.py -launch/api_client/model/model_bundle_v2_response.py -launch/api_client/model/model_download_request.py -launch/api_client/model/model_download_response.py -launch/api_client/model/model_endpoint_deployment_state.py -launch/api_client/model/model_endpoint_order_by.py -launch/api_client/model/model_endpoint_resource_state.py -launch/api_client/model/model_endpoint_status.py -launch/api_client/model/model_endpoint_type.py -launch/api_client/model/parallel_tool_calls.py -launch/api_client/model/prediction_content.py -launch/api_client/model/prompt.py -launch/api_client/model/prompt1.py -launch/api_client/model/prompt1_item.py -launch/api_client/model/prompt_tokens_details.py -launch/api_client/model/pytorch_framework.py -launch/api_client/model/quantization.py -launch/api_client/model/reasoning_effort.py -launch/api_client/model/request_schema.py -launch/api_client/model/response_format_json_object.py -launch/api_client/model/response_format_json_schema.py -launch/api_client/model/response_format_json_schema_schema.py -launch/api_client/model/response_format_text.py -launch/api_client/model/response_modalities.py 
-launch/api_client/model/response_schema.py -launch/api_client/model/restart_model_endpoint_v1_response.py -launch/api_client/model/runnable_image_flavor.py -launch/api_client/model/service_tier.py -launch/api_client/model/stop_configuration.py -launch/api_client/model/stop_configuration1.py -launch/api_client/model/stream_error.py -launch/api_client/model/stream_error_content.py -launch/api_client/model/streaming_enhanced_runnable_image_flavor.py -launch/api_client/model/sync_endpoint_predict_v1_request.py -launch/api_client/model/sync_endpoint_predict_v1_response.py -launch/api_client/model/task_status.py -launch/api_client/model/tensorflow_framework.py -launch/api_client/model/token_output.py -launch/api_client/model/tool_config.py -launch/api_client/model/top_logprob.py -launch/api_client/model/triton_enhanced_runnable_image_flavor.py -launch/api_client/model/update_batch_completions_v2_request.py -launch/api_client/model/update_batch_completions_v2_response.py -launch/api_client/model/update_batch_job_v1_request.py -launch/api_client/model/update_batch_job_v1_response.py -launch/api_client/model/update_deep_speed_model_endpoint_request.py -launch/api_client/model/update_docker_image_batch_job_v1_request.py -launch/api_client/model/update_docker_image_batch_job_v1_response.py -launch/api_client/model/update_llm_model_endpoint_v1_request.py -launch/api_client/model/update_llm_model_endpoint_v1_response.py -launch/api_client/model/update_model_endpoint_v1_request.py -launch/api_client/model/update_model_endpoint_v1_response.py -launch/api_client/model/update_sg_lang_model_endpoint_request.py -launch/api_client/model/update_text_generation_inference_model_endpoint_request.py -launch/api_client/model/update_trigger_v1_request.py -launch/api_client/model/update_trigger_v1_response.py -launch/api_client/model/update_vllm_model_endpoint_request.py -launch/api_client/model/upload_file_response.py -launch/api_client/model/url_citation.py -launch/api_client/model/user_location.py -launch/api_client/model/validation_error.py -launch/api_client/model/voice_ids_shared.py -launch/api_client/model/web_search_context_size.py -launch/api_client/model/web_search_location.py -launch/api_client/model/web_search_options.py -launch/api_client/model/zip_artifact_flavor.py -launch/api_client/models/__init__.py -launch/api_client/rest.py -launch/api_client/schemas.py -launch/api_client/test/__init__.py -launch/api_client/test/test_models/__init__.py -launch/api_client_README.md diff --git a/.openapi-generator/VERSION b/.openapi-generator/VERSION deleted file mode 100644 index c0be8a79..00000000 --- a/.openapi-generator/VERSION +++ /dev/null @@ -1 +0,0 @@ -6.4.0 \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index 477ceb1f..00000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,41 +0,0 @@ -fail_fast: false -repos: -- repo: local - hooks: - - id: system - name: Black - entry: poetry run black . - pass_filenames: false - language: system - -- repo: local - hooks: - - id: system - name: ruff - entry: poetry run ruff launch - pass_filenames: false - language: system - -- repo: local - hooks: - - id: system - name: isort - entry: poetry run isort . 
- pass_filenames: false - language: system - -- repo: local - hooks: - - id: system - name: pylint - entry: poetry run pylint launch - pass_filenames: false - language: system - -- repo: local - hooks: - - id: system - name: mypy - entry: poetry run mypy --ignore-missing-imports launch - pass_filenames: false - language: system diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index 439abffd..00000000 --- a/.pylintrc +++ /dev/null @@ -1,31 +0,0 @@ -[tool.pylint.MESSAGE_CONTROL] -disable= - no-else-return, - too-few-public-methods, - line-too-long, - duplicate-code, - import-error, - unused-argument, - import-outside-toplevel, - too-many-instance-attributes, - no-member, - W3101, - R1735, - W0511, - R0914, - R0913, - C0114, - C0111, - C0103, - R0904 - -[tool.pylint.REPORTS] -reports=no - -[tool.pylint.FORMAT] -max-line-length=79 - -[MASTER] -# Ignore anything inside launch/clientlib (since it's documentation) -ignore=clientlib,api_client -extension-pkg-whitelist=pydantic diff --git a/404.html b/404.html new file mode 100644 index 00000000..050a34bf --- /dev/null +++ b/404.html @@ -0,0 +1,619 @@ + + + +
LaunchClient(api_key: str, endpoint: Optional[str] = None, self_hosted: bool = False, use_path_with_custom_endpoint: bool = False)

Scale Launch Python Client.

Initializes a Scale Launch Client.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `api_key` | `str` | Your Scale API key. | required |
| `endpoint` | `Optional[str]` | The Scale Launch endpoint (this should not need to be changed). | `None` |
| `self_hosted` | `bool` | `True` iff you are connecting to a self-hosted Scale Launch. | `False` |
| `use_path_with_custom_endpoint` | `bool` | `True` iff you are not using the default Scale Launch endpoint but your endpoint has path routing (to SCALE_LAUNCH_VX_PATH) set up. | `False` |
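Example (a minimal sketch; the API key and the self-hosted URL below are placeholder values, and the client is assumed to be importable as `from launch import LaunchClient`):

```python
from launch import LaunchClient

# Hosted Scale Launch: only an API key is needed (placeholder shown).
client = LaunchClient(api_key="YOUR_SCALE_API_KEY")

# Self-hosted deployment: point the client at your own gateway instead.
self_hosted_client = LaunchClient(
    api_key="YOUR_API_KEY",
    endpoint="https://launch.internal.example.com",  # hypothetical URL
    self_hosted=True,
)
```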
batch_async_request(*, model_bundle: Union[ModelBundle, str], urls: Optional[List[str]] = None, inputs: Optional[List[Dict[str, Any]]] = None, batch_url_file_location: Optional[str] = None, serialization_format: str = 'JSON', labels: Optional[Dict[str, str]] = None, cpus: Optional[int] = None, memory: Optional[str] = None, gpus: Optional[int] = None, gpu_type: Optional[str] = None, storage: Optional[str] = None, max_workers: Optional[int] = None, per_worker: Optional[int] = None, timeout_seconds: Optional[float] = None) -> Dict[str, Any]
Sends a batch inference request using a given bundle. Returns a key that can be used to retrieve the results of inference at a later time.

Must have exactly one of `urls` or `inputs` passed in.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `model_bundle` | `Union[ModelBundle, str]` | The bundle, or the name of the bundle, to use for inference. | required |
| `urls` | `Optional[List[str]]` | A list of URLs, each pointing to a file containing model input. Must be accessible by Scale Launch, hence the URLs need to be either public or signed URLs. | `None` |
| `inputs` | `Optional[List[Dict[str, Any]]]` | A list of model inputs. If provided, the inputs are uploaded and passed in to Launch. | `None` |
| `batch_url_file_location` | `Optional[str]` | In self-hosted mode, the input to the batch job will be uploaded to this location if provided. Otherwise, one will be determined from `bundle_location_fn()`. | `None` |
| `serialization_format` | `str` | Serialization format of the output, either `'PICKLE'` or `'JSON'`. | `'JSON'` |
| `labels` | `Optional[Dict[str, str]]` | An optional dictionary of key/value pairs to associate with this endpoint. | `None` |
| `cpus` | `Optional[int]` | Number of cpus each worker should get, e.g. 1, 2, etc. This must be greater than or equal to 1. | `None` |
| `memory` | `Optional[str]` | Amount of memory each worker should get, e.g. "4Gi", "512Mi", etc. This must be a positive amount of memory. | `None` |
| `storage` | `Optional[str]` | Amount of local ephemeral storage each worker should get, e.g. "4Gi", "512Mi", etc. This must be a positive amount of storage. | `None` |
| `gpus` | `Optional[int]` | Number of gpus each worker should get, e.g. 0, 1, etc. | `None` |
| `max_workers` | `Optional[int]` | The maximum number of workers. Must be greater than or equal to 0. | `None` |
| `per_worker` | `Optional[int]` | The maximum number of concurrent requests that an individual worker can service. Launch automatically scales the number of workers for the endpoint based on this value. | `None` |
| `gpu_type` | `Optional[str]` | If specifying a non-zero number of gpus, this controls the type of gpu requested. | `None` |
| `timeout_seconds` | `Optional[float]` | The maximum amount of time (in seconds) that the batch job can take. If not specified, the server defaults to 12 hours. This includes the time required to build the endpoint and the total time required for all the individual tasks. | `None` |

Returns:

| Type | Description |
|---|---|
| `Dict[str, Any]` | A dictionary that contains the key used to retrieve the results of the batch inference later. |
Cancel a fine-tune.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `fine_tune_id` | `str` | ID of the fine-tune. | required |

Returns:

| Name | Type | Description |
|---|---|---|
| `CancelFineTuneResponse` | `CancelFineTuneResponse` | Whether the cancellation was successful. |
clone_model_bundle_with_changes(model_bundle: Union[ModelBundle, str], app_config: Optional[Dict] = None) -> ModelBundle
This method is deprecated. Use `clone_model_bundle_with_changes_v2` instead.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `model_bundle` | `Union[ModelBundle, str]` | The existing bundle or its ID. | required |
| `app_config` | `Optional[Dict]` | The new bundle's app config. If not passed in, the new bundle's `app_config` is copied from the existing bundle. | `None` |

Returns:

| Type | Description |
|---|---|
| `ModelBundle` | The cloned `ModelBundle` object. |
clone_model_bundle_with_changes_v2(original_model_bundle_id: str, new_app_config: Optional[Dict[str, Any]] = None) -> CreateModelBundleV2Response
Clone a model bundle with an optional new `app_config`.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `original_model_bundle_id` | `str` | The ID of the model bundle you want to clone. | required |
| `new_app_config` | `Optional[Dict[str, Any]]` | A dictionary of new app config values to use for the cloned model. | `None` |

Returns:

| Type | Description |
|---|---|
| `CreateModelBundleV2Response` | An object containing the ID of the newly created model bundle. |
completions_stream(endpoint_name: str, prompt: str, max_new_tokens: int, temperature: float, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, timeout: float = DEFAULT_LLM_COMPLETIONS_TIMEOUT) -> Iterable[CompletionStreamV1Response]
Run prompt completion on an LLM endpoint in streaming fashion. Will fail if the endpoint does not support streaming.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `endpoint_name` | `str` | The name of the LLM endpoint to make the request to. | required |
| `prompt` | `str` | The prompt to send to the endpoint. | required |
| `max_new_tokens` | `int` | The maximum number of tokens to generate for each prompt. | required |
| `temperature` | `float` | The temperature to use for sampling. | required |
| `stop_sequences` | `Optional[List[str]]` | List of sequences to stop the completion at. | `None` |
| `return_token_log_probs` | `Optional[bool]` | Whether to return the log probabilities of the tokens. | `False` |

Returns:

| Type | Description |
|---|---|
| `Iterable[CompletionStreamV1Response]` | Iterable responses for prompt completion. |
completions_sync(endpoint_name: str, prompt: str, max_new_tokens: int, temperature: float, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, timeout: float = DEFAULT_LLM_COMPLETIONS_TIMEOUT) -> CompletionSyncV1Response
Run prompt completion on a sync LLM endpoint. Will fail if the endpoint is not sync.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `endpoint_name` | `str` | The name of the LLM endpoint to make the request to. | required |
| `prompt` | `str` | The completion prompt to send to the endpoint. | required |
| `max_new_tokens` | `int` | The maximum number of tokens to generate for each prompt. | required |
| `temperature` | `float` | The temperature to use for sampling. | required |
| `stop_sequences` | `Optional[List[str]]` | List of sequences to stop the completion at. | `None` |
| `return_token_log_probs` | `Optional[bool]` | Whether to return the log probabilities of the tokens. | `False` |

Returns:

| Type | Description |
|---|---|
| `CompletionSyncV1Response` | Response for prompt completion. |
create_docker_image_batch_job(*, labels: Dict[str, str], docker_image_batch_job_bundle: Optional[Union[str, DockerImageBatchJobBundleResponse]] = None, docker_image_batch_job_bundle_name: Optional[str] = None, job_config: Optional[Dict[str, Any]] = None, cpus: Optional[int] = None, memory: Optional[str] = None, gpus: Optional[int] = None, gpu_type: Optional[str] = None, storage: Optional[str] = None)
For self-hosted mode only.

Parameters:

- `docker_image_batch_job_bundle`: Specifies the docker image bundle to use for the batch job. Either the string ID of a docker image bundle, or a `DockerImageBatchJobBundleResponse` object. Only one of `docker_image_batch_job_bundle` and `docker_image_batch_job_bundle_name` can be specified.
- `docker_image_batch_job_bundle_name`: The name of a batch job bundle. If specified, Launch will use the most recent bundle with that name owned by the current user. Only one of `docker_image_batch_job_bundle` and `docker_image_batch_job_bundle_name` can be specified.
- `labels`: Kubernetes labels that are present on the batch job.
- `job_config`: A JSON-serializable python object that will get passed to the batch job, specifically as the contents of a file mounted at `mount_location` inside the bundle. You can call python's `json.load()` on the file to retrieve the contents.
- `cpus`: Optional override for the number of cpus to give to your job. Either the default must be specified in the bundle, or this must be specified.
- `memory`: Optional override for the amount of memory to give to your job. Either the default must be specified in the bundle, or this must be specified.
- `gpus`: Optional number of gpus to give to the bundle. If not specified in the bundle or here, it will be interpreted as 0 gpus.
- `gpu_type`: Optional type of gpu. If the final number of gpus is positive, this must be specified either in the bundle or here.
- `storage`: Optional reserved amount of disk to give to your batch job. If not specified, your job may be evicted if it is using too much disk.
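Example (a sketch for self-hosted mode; `client` is assumed to be an initialized `LaunchClient`, and the bundle name, labels, and job config are placeholders):

```python
# Launch a batch job from an existing docker image batch job bundle. The
# job_config dictionary is written to the bundle's mount_location and can be
# read inside the job with json.load().
client.create_docker_image_batch_job(
    labels={"team": "ml-infra", "product": "reports"},
    docker_image_batch_job_bundle_name="nightly-report",  # hypothetical bundle name
    job_config={"date": "2024-01-01", "dry_run": False},
    cpus=2,
    memory="4Gi",
)
```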
create_docker_image_batch_job_bundle(*, name: str, image_repository: str, image_tag: str, command: List[str], env: Optional[Dict[str, str]] = None, mount_location: Optional[str] = None, cpus: Optional[int] = None, memory: Optional[str] = None, gpus: Optional[int] = None, gpu_type: Optional[str] = None, storage: Optional[str] = None) -> CreateDockerImageBatchJobBundleResponse
For self-hosted mode only.

Creates a Docker Image Batch Job Bundle.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `name` | `str` | A user-defined name for the bundle. Does not need to be unique. | required |
| `image_repository` | `str` | The (short) repository of your image. For example, if your image is located at `123456789012.dkr.ecr.us-west-2.amazonaws.com/repo:tag`, and your version of Launch is configured to look at `123456789012.dkr.ecr.us-west-2.amazonaws.com` for Docker images, you would pass the value `repo`. | required |
| `image_tag` | `str` | The tag of your image inside of the repo. In the example above, you would pass the value `tag`. | required |
| `command` | `List[str]` | The command to run inside the docker image. | required |
| `env` | `Optional[Dict[str, str]]` | A dictionary of environment variables to inject into your docker image. | `None` |
| `mount_location` | `Optional[str]` | A location in the filesystem where you would like a json-formatted file, controllable at runtime, to be mounted. This allows behavior to be specified at runtime. (Specifically, the contents of this file can be read via python's `json.load()` inside the job.) | `None` |
| `cpus` | `Optional[int]` | Optional default value for the number of cpus to give the job. | `None` |
| `memory` | `Optional[str]` | Optional default value for the amount of memory to give the job. | `None` |
| `gpus` | `Optional[int]` | Optional default value for the number of gpus to give the job. | `None` |
| `gpu_type` | `Optional[str]` | Optional default value for the type of gpu to give the job. | `None` |
| `storage` | `Optional[str]` | Optional default value for the amount of disk to give the job. | `None` |
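Example (a sketch for self-hosted mode; `client` is assumed to be an initialized `LaunchClient`, and the name, image coordinates, command, and mount location are placeholders):

```python
bundle_response = client.create_docker_image_batch_job_bundle(
    name="nightly-report",
    image_repository="repo",   # short repository name, as described above
    image_tag="tag",
    command=["python", "run_report.py"],
    env={"LOG_LEVEL": "INFO"},
    mount_location="/app/config/job_config.json",  # hypothetical path
    cpus=2,
    memory="4Gi",
)
print(bundle_response)
```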
create_fine_tune(model: str, training_file: str, validation_file: Optional[str] = None, fine_tuning_method: Optional[str] = None, hyperparameters: Optional[Dict[str, str]] = None, wandb_config: Optional[Dict[str, Any]] = None, suffix: str = None) -> CreateFineTuneResponse
Create a fine-tune.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `model` | `str` | Identifier of the base model to train from. | required |
| `training_file` | `str` | Path to file of training dataset. Dataset must be a csv with columns 'prompt' and 'response'. | required |
| `validation_file` | `Optional[str]` | Path to file of validation dataset. Has the same format as `training_file`. If not provided, a split will be generated from the training dataset. | `None` |
| `fine_tuning_method` | `Optional[str]` | Fine-tuning method. Currently unused, but this field will be exposed when different techniques are implemented. | `None` |
| `hyperparameters` | `Optional[Dict[str, str]]` | Hyperparameters to pass in to the training job. | `None` |
| `wandb_config` | `Optional[Dict[str, Any]]` | Configuration for Weights and Biases. | `None` |
| `suffix` | `str` | Optional user-provided identifier suffix for the fine-tuned model. | `None` |

Returns:

| Name | Type | Description |
|---|---|---|
| `CreateFineTuneResponse` | `CreateFineTuneResponse` | ID of the created fine-tune. |
create_llm_model_endpoint(endpoint_name: str, model_name: str, inference_framework_image_tag: str, source: LLMSource = LLMSource.HUGGING_FACE, inference_framework: LLMInferenceFramework = LLMInferenceFramework.DEEPSPEED, num_shards: int = 4, quantize: Optional[Quantization] = None, checkpoint_path: Optional[str] = None, cpus: int = 32, memory: str = '192Gi', storage: Optional[str] = None, gpus: int = 4, min_workers: int = 0, max_workers: int = 1, per_worker: int = 10, gpu_type: Optional[str] = 'nvidia-ampere-a10', endpoint_type: str = 'sync', high_priority: Optional[bool] = False, post_inference_hooks: Optional[List[PostInferenceHooks]] = None, default_callback_url: Optional[str] = None, default_callback_auth_kind: Optional[Literal['basic', 'mtls']] = None, default_callback_auth_username: Optional[str] = None, default_callback_auth_password: Optional[str] = None, default_callback_auth_cert: Optional[str] = None, default_callback_auth_key: Optional[str] = None, public_inference: Optional[bool] = None, update_if_exists: bool = False, labels: Optional[Dict[str, str]] = None)
Creates and registers a model endpoint in Scale Launch. The returned object is an instance of type `Endpoint`, which is a base class of either `SyncEndpoint` or `AsyncEndpoint`. This is the object to which you send inference requests.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `endpoint_name` | `str` | The name of the model endpoint you want to create. The name must be unique across all endpoints that you own. | required |
| `model_name` | `str` | Name for the LLM. List can be found at (TODO: add list of supported models). | required |
| `inference_framework_image_tag` | `str` | Image tag for the inference framework. (TODO: use latest image tag when unspecified) | required |
| `source` | `LLMSource` | Source of the LLM. Currently only HuggingFace is supported. | `HUGGING_FACE` |
| `inference_framework` | `LLMInferenceFramework` | Inference framework for the LLM. Currently only DeepSpeed is supported. | `DEEPSPEED` |
| `num_shards` | `int` | Number of shards for the LLM. When bigger than 1, the LLM will be sharded across multiple GPUs. The number of GPUs must be larger than `num_shards`. | `4` |
| `quantize` | `Optional[Quantization]` | Quantization method for the LLM. Only affects behavior for text-generation-inference models. | `None` |
| `checkpoint_path` | `Optional[str]` | Path to the checkpoint to load the model from. Only affects behavior for text-generation-inference models. | `None` |
| `cpus` | `int` | Number of cpus each worker should get, e.g. 1, 2, etc. This must be greater than or equal to 1. | `32` |
| `memory` | `str` | Amount of memory each worker should get, e.g. "4Gi", "512Mi", etc. This must be a positive amount of memory. | `'192Gi'` |
| `storage` | `Optional[str]` | Amount of local ephemeral storage each worker should get, e.g. "4Gi", "512Mi", etc. This must be a positive amount of storage. | `None` |
| `gpus` | `int` | Number of gpus each worker should get, e.g. 0, 1, etc. | `4` |
| `min_workers` | `int` | The minimum number of workers. Must be greater than or equal to 0. This should be determined by computing the minimum throughput of your workload and dividing it by the throughput of a single worker. | `0` |
| `max_workers` | `int` | The maximum number of workers. Must be greater than or equal to 0, as well as greater than or equal to `min_workers`. | `1` |
| `per_worker` | `int` | The maximum number of concurrent requests that an individual worker can service. Launch automatically scales the number of workers for the endpoint based on this value. | `10` |
| `gpu_type` | `Optional[str]` | If specifying a non-zero number of gpus, this controls the type of gpu requested. | `'nvidia-ampere-a10'` |
| `endpoint_type` | `str` | The endpoint type, e.g. `'sync'` or `'async'`. | `'sync'` |
| `high_priority` | `Optional[bool]` | Either `True` or `False`. | `False` |
| `post_inference_hooks` | `Optional[List[PostInferenceHooks]]` | List of hooks to trigger after inference tasks are served. | `None` |
| `default_callback_url` | `Optional[str]` | The default callback URL to use for async endpoints. This can be overridden in the task parameters for each individual task. `post_inference_hooks` must contain "callback" for the callback to be triggered. | `None` |
| `default_callback_auth_kind` | `Optional[Literal['basic', 'mtls']]` | The default callback auth kind to use for async endpoints. Either "basic" or "mtls". This can be overridden in the task parameters for each individual task. | `None` |
| `default_callback_auth_username` | `Optional[str]` | The default callback auth username to use. This only applies if `default_callback_auth_kind` is "basic". This can be overridden in the task parameters for each individual task. | `None` |
| `default_callback_auth_password` | `Optional[str]` | The default callback auth password to use. This only applies if `default_callback_auth_kind` is "basic". This can be overridden in the task parameters for each individual task. | `None` |
| `default_callback_auth_cert` | `Optional[str]` | The default callback auth cert to use. This only applies if `default_callback_auth_kind` is "mtls". This can be overridden in the task parameters for each individual task. | `None` |
| `default_callback_auth_key` | `Optional[str]` | The default callback auth key to use. This only applies if `default_callback_auth_kind` is "mtls". This can be overridden in the task parameters for each individual task. | `None` |
| `public_inference` | `Optional[bool]` | If `True`, the endpoint is available for public inference. | `None` |
| `update_if_exists` | `bool` | If `True`, attempt to update the endpoint if one with this name already exists. | `False` |
| `labels` | `Optional[Dict[str, str]]` | An optional dictionary of key/value pairs to associate with this endpoint. | `None` |

Returns:

| Type | Description |
|---|---|
| | An `Endpoint` object that can be used to make requests to the endpoint. |
create_model_bundle(model_bundle_name: str, env_params: Dict[str, str], *, load_predict_fn: Optional[Callable[[LaunchModel_T], Callable[[Any], Any]]] = None, predict_fn_or_cls: Optional[Callable[[Any], Any]] = None, requirements: Optional[List[str]] = None, model: Optional[LaunchModel_T] = None, load_model_fn: Optional[Callable[[], LaunchModel_T]] = None, app_config: Optional[Union[Dict[str, Any], str]] = None, globals_copy: Optional[Dict[str, Any]] = None, request_schema: Optional[Type[BaseModel]] = None, response_schema: Optional[Type[BaseModel]] = None) -> ModelBundle
This method is deprecated. Use `create_model_bundle_from_callable_v2` instead.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `model_bundle_name` | `str` | The name of the model bundle you want to create. The name must be unique across all bundles that you own. | required |
| `predict_fn_or_cls` | `Optional[Callable[[Any], Any]]` | | `None` |
| `model` | `Optional[LaunchModel_T]` | Typically a trained Neural Network, e.g. a Pytorch module. Exactly one of `model` and `load_model_fn` must be provided. | `None` |
| `load_model_fn` | `Optional[Callable[[], LaunchModel_T]]` | A function that, when run, loads a model; essentially a deferred wrapper around the `model` argument. Exactly one of `model` and `load_model_fn` must be provided. | `None` |
| `load_predict_fn` | `Optional[Callable[[LaunchModel_T], Callable[[Any], Any]]]` | Function that, when called with a model, returns a function that carries out inference. | `None` |
| `requirements` | `Optional[List[str]]` | A list of python package requirements, where each list element is a pip-style requirement string. | `None` |
| `app_config` | `Optional[Union[Dict[str, Any], str]]` | Either a Dictionary that represents a YAML file contents or a local path to a YAML file. | `None` |
| `env_params` | `Dict[str, str]` | A dictionary that dictates environment information, e.g. the use of pytorch or tensorflow, which base image tag to use, etc. | required |
| `globals_copy` | `Optional[Dict[str, Any]]` | Dictionary of the global symbol table. Normally provided by `globals()`. | `None` |
| `request_schema` | `Optional[Type[BaseModel]]` | A pydantic model that represents the request schema for the model bundle. This is used to validate the request body for the model bundle's endpoint. | `None` |
| `response_schema` | `Optional[Type[BaseModel]]` | A pydantic model that represents the response schema for the model bundle. This is used to validate the response for the model bundle's endpoint. Note: If `request_schema` is specified, then `response_schema` must also be specified. | `None` |
create_model_bundle_from_callable_v2(*, model_bundle_name: str, load_predict_fn: Callable[[LaunchModel_T], Callable[[Any], Any]], load_model_fn: Callable[[], LaunchModel_T], request_schema: Type[BaseModel], response_schema: Type[BaseModel], requirements: Optional[List[str]] = None, pytorch_image_tag: Optional[str] = None, tensorflow_version: Optional[str] = None, custom_base_image_repository: Optional[str] = None, custom_base_image_tag: Optional[str] = None, app_config: Optional[Union[Dict[str, Any], str]] = None, metadata: Optional[Dict[str, Any]] = None) -> CreateModelBundleV2Response
Uploads and registers a model bundle to Scale Launch.

The parameters mirror those of `create_model_bundle_from_dirs_v2`, except that the bundle's code is supplied via the `load_model_fn` and `load_predict_fn` callables instead of directory paths.

Returns:

| Type | Description |
|---|---|
| `CreateModelBundleV2Response` | An object containing the ID of the newly created model bundle. |
create_model_bundle_from_dirs(*, model_bundle_name: str, base_paths: List[str], requirements_path: str, env_params: Dict[str, str], load_predict_fn_module_path: str, load_model_fn_module_path: str, app_config: Optional[Union[Dict[str, Any], str]] = None, request_schema: Optional[Type[BaseModel]] = None, response_schema: Optional[Type[BaseModel]] = None) -> ModelBundle

This method is deprecated. Use create_model_bundle_from_dirs_v2 instead.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model_bundle_name | str | The name of the model bundle you want to create. The name must be unique across all bundles that you own. | required |
| base_paths | List[str] | The paths on the local filesystem where the bundle code lives. | required |
| requirements_path | str | A path on the local filesystem where a requirements.txt file lives. | required |
| env_params | Dict[str, str] | A dictionary that dictates environment information, e.g. the use of pytorch or tensorflow and which base image tag to use. | required |
| load_predict_fn_module_path | str | A python module path for a function that, when called with the output of load_model_fn_module_path, returns a function that carries out inference. | required |
| load_model_fn_module_path | str | A python module path for a function that returns a model. The output feeds into the function located at load_predict_fn_module_path. | required |
| app_config | Optional[Union[Dict[str, Any], str]] | Either a dictionary that represents a YAML file's contents or a local path to a YAML file. | None |
| request_schema | Optional[Type[BaseModel]] | A pydantic model that represents the request schema for the model bundle. This is used to validate the request body for the model bundle's endpoint. | None |
| response_schema | Optional[Type[BaseModel]] | A pydantic model that represents the response schema for the model bundle. This is used to validate the response for the model bundle's endpoint. Note: if request_schema is specified, then response_schema must also be specified. | None |
create_model_bundle_from_dirs_v2(*, model_bundle_name: str, base_paths: List[str], load_predict_fn_module_path: str, load_model_fn_module_path: str, request_schema: Type[BaseModel], response_schema: Type[BaseModel], requirements_path: Optional[str] = None, pytorch_image_tag: Optional[str] = None, tensorflow_version: Optional[str] = None, custom_base_image_repository: Optional[str] = None, custom_base_image_tag: Optional[str] = None, app_config: Optional[Dict[str, Any]] = None, metadata: Optional[Dict[str, Any]] = None) -> CreateModelBundleV2Response
+Packages up code from one or more local filesystem folders and uploads them as a bundle +to Scale Launch. In this mode, a bundle is just local code instead of a serialized object.
+For example, if you have a directory structure like so, and your current working
+directory is my_root:
my_root/
+ my_module1/
+ __init__.py
+ ...files and directories
+ my_inference_file.py
+ my_module2/
+ __init__.py
+ ...files and directories
+then calling create_model_bundle_from_dirs_v2 with base_paths=["my_module1",
+"my_module2"] essentially creates a zip file without the root directory, e.g.:
my_module1/
+ __init__.py
+ ...files and directories
+ my_inference_file.py
+ my_module2/
+ __init__.py
+ ...files and directories
+and these contents will be unzipped relative to the server side application root. Bear
+these points in mind when referencing Python module paths for this bundle. For instance,
+if my_inference_file.py has def f(...) as the desired inference loading function,
+then the load_predict_fn_module_path argument should be my_module1.my_inference_file.f.
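As a sketch of how the layout above maps onto a call, assuming my_inference_file.py defines hypothetical load_model() and load_predict(model) functions and my_module1/schemas.py defines hypothetical pydantic models:

```python
import os

from launch import LaunchClient

# Hypothetical: my_module1/schemas.py defines the MyRequest / MyResponse pydantic models,
# and my_module1/my_inference_file.py defines load_model() and load_predict(model).
from my_module1.schemas import MyRequest, MyResponse

client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
response = client.create_model_bundle_from_dirs_v2(
    model_bundle_name="test-bundle-from-dirs",
    base_paths=["my_module1", "my_module2"],
    load_predict_fn_module_path="my_module1.my_inference_file.load_predict",
    load_model_fn_module_path="my_module1.my_inference_file.load_model",
    request_schema=MyRequest,
    response_schema=MyResponse,
    requirements_path="requirements.txt",  # relative to the current working directory
    pytorch_image_tag="1.7.1-cuda11.0-cudnn8-runtime",  # hypothetical base image tag
)
print(response.model_bundle_id)
```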
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model_bundle_name | str | The name of the model bundle you want to create. | required |
| base_paths | List[str] | A list of paths to directories that will be zipped up and uploaded as a bundle. Each path must be relative to the current working directory. | required |
| load_predict_fn_module_path | str | The Python module path to the function that, when called with the output of the function at load_model_fn_module_path, returns a function that carries out inference. | required |
| load_model_fn_module_path | str | The Python module path to the function that returns a model. The output feeds into the function located at load_predict_fn_module_path. | required |
| request_schema | Type[BaseModel] | A Pydantic model that defines the request schema for the bundle. | required |
| response_schema | Type[BaseModel] | A Pydantic model that defines the response schema for the bundle. | required |
| requirements_path | Optional[str] | Path to a requirements.txt file that will be used to install dependencies for the bundle. This file must be relative to the current working directory. | None |
| pytorch_image_tag | Optional[str] | The image tag for the PyTorch image that will be used to run the bundle. Exactly one of pytorch_image_tag, tensorflow_version, or custom_base_image_repository must be specified. | None |
| tensorflow_version | Optional[str] | The version of TensorFlow that will be used to run the bundle. If not specified, the default version will be used. Exactly one of pytorch_image_tag, tensorflow_version, or custom_base_image_repository must be specified. | None |
| custom_base_image_repository | Optional[str] | The repository for a custom base image that will be used to run the bundle. If not specified, the default base image will be used. Exactly one of pytorch_image_tag, tensorflow_version, or custom_base_image_repository must be specified. | None |
| custom_base_image_tag | Optional[str] | The tag for a custom base image that will be used to run the bundle. Must be specified if custom_base_image_repository is specified. | None |
| app_config | Optional[Dict[str, Any]] | An optional dictionary of configuration values that will be passed to the bundle when it is run. | None |
| metadata | Optional[Dict[str, Any]] | Metadata to record with the bundle. | None |

Returns:

| Type | Description |
|---|---|
| CreateModelBundleV2Response | An object containing the ID of the created Model Bundle (model_bundle_id). |
create_model_bundle_from_runnable_image_v2(*, model_bundle_name: str, request_schema: Type[BaseModel], response_schema: Type[BaseModel], repository: str, tag: str, command: List[str], healthcheck_route: Optional[str] = None, predict_route: Optional[str] = None, env: Dict[str, str], readiness_initial_delay_seconds: int, metadata: Optional[Dict[str, Any]] = None) -> CreateModelBundleV2Response

Create a model bundle from a runnable image. The specified command must start a process that will listen for requests on port 5005 using HTTP.

Inference requests must be served at the POST /predict route, while the GET /readyz route is a healthcheck.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model_bundle_name | str | The name of the model bundle you want to create. | required |
| request_schema | Type[BaseModel] | A Pydantic model that defines the request schema for the bundle. | required |
| response_schema | Type[BaseModel] | A Pydantic model that defines the response schema for the bundle. | required |
| repository | str | The name of the Docker repository for the runnable image. | required |
| tag | str | The tag for the runnable image. | required |
| command | List[str] | The command that will be used to start the process that listens for requests. | required |
| predict_route | Optional[str] | The endpoint route on the runnable image that will be called. | None |
| healthcheck_route | Optional[str] | The healthcheck endpoint route on the runnable image. | None |
| env | Dict[str, str] | A dictionary of environment variables that will be passed to the bundle when it is run. | required |
| readiness_initial_delay_seconds | int | The number of seconds to wait for the HTTP server to become ready and successfully respond on its healthcheck. | required |
| metadata | Optional[Dict[str, Any]] | Metadata to record with the bundle. | None |

Returns:

| Type | Description |
|---|---|
| CreateModelBundleV2Response | An object containing the ID of the created Model Bundle (model_bundle_id). |
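A minimal sketch of registering a runnable-image bundle; the repository, tag, command, and environment variables are placeholders for an image you have already pushed that serves POST /predict and GET /readyz on port 5005.

```python
import os

from pydantic import BaseModel

from launch import LaunchClient


class MyRequest(BaseModel):
    x: int
    y: str


class MyResponse(BaseModel):
    result: int


client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
response = client.create_model_bundle_from_runnable_image_v2(
    model_bundle_name="test-runnable-bundle",
    request_schema=MyRequest,
    response_schema=MyResponse,
    repository="my-org/my-model-server",  # hypothetical Docker repository
    tag="v1.0.0",  # hypothetical image tag
    command=["python", "-m", "my_server", "--port", "5005"],  # hypothetical entrypoint
    env={"MODEL_NAME": "my-model"},  # hypothetical env vars
    readiness_initial_delay_seconds=30,
)
print(response.model_bundle_id)
```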
create_model_bundle_from_streaming_enhanced_runnable_image_v2(*, model_bundle_name: str, request_schema: Type[BaseModel], response_schema: Type[BaseModel], repository: str, tag: str, command: Optional[List[str]] = None, healthcheck_route: Optional[str] = None, predict_route: Optional[str] = None, streaming_command: List[str], streaming_predict_route: Optional[str] = None, env: Dict[str, str], readiness_initial_delay_seconds: int, metadata: Optional[Dict[str, Any]] = None) -> CreateModelBundleV2Response

Create a model bundle from a runnable image. The specified command must start a process that will listen for requests on port 5005 using HTTP.

Inference requests must be served at the POST /predict route, while the GET /readyz route is a healthcheck.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model_bundle_name | str | The name of the model bundle you want to create. | required |
| request_schema | Type[BaseModel] | A Pydantic model that defines the request schema for the bundle. | required |
| response_schema | Type[BaseModel] | A Pydantic model that defines the response schema for the bundle. | required |
| repository | str | The name of the Docker repository for the runnable image. | required |
| tag | str | The tag for the runnable image. | required |
| command | Optional[List[str]] | The command that will be used to start the process that listens for requests if this bundle is used as a SYNC or ASYNC endpoint. | None |
| healthcheck_route | Optional[str] | The healthcheck endpoint route on the runnable image. | None |
| predict_route | Optional[str] | The endpoint route on the runnable image that will be called if this bundle is used as a SYNC or ASYNC endpoint. | None |
| streaming_command | List[str] | The command that will be used to start the process that listens for requests if this bundle is used as a STREAMING endpoint. | required |
| streaming_predict_route | Optional[str] | The endpoint route on the runnable image that will be called if this bundle is used as a STREAMING endpoint. | None |
| env | Dict[str, str] | A dictionary of environment variables that will be passed to the bundle when it is run. | required |
| readiness_initial_delay_seconds | int | The number of seconds to wait for the HTTP server to become ready and successfully respond on its healthcheck. | required |
| metadata | Optional[Dict[str, Any]] | Metadata to record with the bundle. | None |

Returns:

| Type | Description |
|---|---|
| CreateModelBundleV2Response | An object containing the ID of the created Model Bundle (model_bundle_id). |
create_model_bundle_from_triton_enhanced_runnable_image_v2(*, model_bundle_name: str, request_schema: Type[BaseModel], response_schema: Type[BaseModel], repository: str, tag: str, command: List[str], healthcheck_route: Optional[str] = None, predict_route: Optional[str] = None, env: Dict[str, str], readiness_initial_delay_seconds: int, triton_model_repository: str, triton_model_replicas: Optional[Dict[str, str]] = None, triton_num_cpu: float, triton_commit_tag: str, triton_storage: Optional[str] = None, triton_memory: Optional[str] = None, triton_readiness_initial_delay_seconds: int, metadata: Optional[Dict[str, Any]] = None) -> CreateModelBundleV2Response

Create a model bundle from a runnable image and a tritonserver image.

Same requirements as create_model_bundle_from_runnable_image_v2, with additional constraints necessary for configuring tritonserver's execution.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model_bundle_name | str | The name of the model bundle you want to create. | required |
| request_schema | Type[BaseModel] | A Pydantic model that defines the request schema for the bundle. | required |
| response_schema | Type[BaseModel] | A Pydantic model that defines the response schema for the bundle. | required |
| repository | str | The name of the Docker repository for the runnable image. | required |
| tag | str | The tag for the runnable image. | required |
| command | List[str] | The command that will be used to start the process that listens for requests. | required |
| predict_route | Optional[str] | The endpoint route on the runnable image that will be called. | None |
| healthcheck_route | Optional[str] | The healthcheck endpoint route on the runnable image. | None |
| env | Dict[str, str] | A dictionary of environment variables that will be passed to the bundle when it is run. | required |
| readiness_initial_delay_seconds | int | The number of seconds to wait for the HTTP server to become ready and successfully respond on its healthcheck. | required |
| triton_model_repository | str | The S3 prefix that contains the contents of the model repository, formatted according to https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md | required |
| triton_model_replicas | Optional[Dict[str, str]] | If supplied, the name and number of replicas to make for each model. | None |
| triton_num_cpu | float | Number of CPUs, fractional, to allocate to tritonserver. | required |
| triton_commit_tag | str | The image tag of the specific tritonserver version. | required |
| triton_storage | Optional[str] | Amount of storage space to allocate for the tritonserver container. | None |
| triton_memory | Optional[str] | Amount of memory to allocate for the tritonserver container. | None |
| triton_readiness_initial_delay_seconds | int | Like readiness_initial_delay_seconds, but for tritonserver's own healthcheck. | required |
| metadata | Optional[Dict[str, Any]] | Metadata to record with the bundle. | None |

Returns:

| Type | Description |
|---|---|
| CreateModelBundleV2Response | An object containing the ID of the created Model Bundle (model_bundle_id). |
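A minimal sketch of the triton-enhanced variant; every repository, tag, command, and S3 prefix below is a hypothetical placeholder.

```python
import os

from pydantic import BaseModel

from launch import LaunchClient


class MyRequest(BaseModel):
    prompt: str


class MyResponse(BaseModel):
    text: str


client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
response = client.create_model_bundle_from_triton_enhanced_runnable_image_v2(
    model_bundle_name="test-triton-bundle",
    request_schema=MyRequest,
    response_schema=MyResponse,
    repository="my-org/my-model-server",  # hypothetical
    tag="v1.0.0",  # hypothetical
    command=["python", "-m", "my_server", "--port", "5005"],  # hypothetical
    env={"TRITON_URL": "localhost:8001"},  # hypothetical
    readiness_initial_delay_seconds=30,
    triton_model_repository="s3://my-bucket/triton-model-repo",  # hypothetical S3 prefix
    triton_num_cpu=2.0,
    triton_commit_tag="some-tritonserver-tag",  # hypothetical
    triton_readiness_initial_delay_seconds=60,
)
print(response.model_bundle_id)
```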
create_model_endpoint(*, endpoint_name: str, model_bundle: Union[ModelBundle, str], cpus: int = 3, memory: str = '8Gi', storage: str = '16Gi', gpus: int = 0, min_workers: int = 1, max_workers: int = 1, per_worker: int = 10, gpu_type: Optional[str] = None, endpoint_type: str = 'sync', high_priority: Optional[bool] = False, post_inference_hooks: Optional[List[PostInferenceHooks]] = None, default_callback_url: Optional[str] = None, default_callback_auth_kind: Optional[Literal['basic', 'mtls']] = None, default_callback_auth_username: Optional[str] = None, default_callback_auth_password: Optional[str] = None, default_callback_auth_cert: Optional[str] = None, default_callback_auth_key: Optional[str] = None, public_inference: Optional[bool] = None, update_if_exists: bool = False, labels: Optional[Dict[str, str]] = None) -> Optional[Endpoint]

Creates and registers a model endpoint in Scale Launch. The returned object is an instance of type Endpoint, which is a base class of either SyncEndpoint or AsyncEndpoint. This is the object to which you send inference requests.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| endpoint_name | str | The name of the model endpoint you want to create. The name must be unique across all endpoints that you own. | required |
| model_bundle | Union[ModelBundle, str] | The ModelBundle (or its name) that the endpoint should serve. | required |
| cpus | int | Number of cpus each worker should get, e.g. 1, 2, etc. This must be greater than or equal to 1. | 3 |
| memory | str | Amount of memory each worker should get, e.g. "4Gi", "512Mi", etc. This must be a positive amount of memory. | '8Gi' |
| storage | str | Amount of local ephemeral storage each worker should get, e.g. "4Gi", "512Mi", etc. This must be a positive amount of storage. | '16Gi' |
| gpus | int | Number of gpus each worker should get, e.g. 0, 1, etc. | 0 |
| min_workers | int | The minimum number of workers. Must be greater than or equal to 0. This should be determined by computing the minimum throughput of your workload and dividing it by the throughput of a single worker. | 1 |
| max_workers | int | The maximum number of workers. Must be greater than or equal to 0, as well as greater than or equal to min_workers. | 1 |
| per_worker | int | The maximum number of concurrent requests that an individual worker can service. Launch automatically scales the number of workers for the endpoint so that each worker is processing per_worker requests, subject to the limits defined by min_workers and max_workers. | 10 |
| gpu_type | Optional[str] | If specifying a non-zero number of gpus, this controls the type of gpu requested. | None |
| endpoint_type | str | Either "sync", "async", or "streaming". | 'sync' |
| high_priority | Optional[bool] | Either True or False. | False |
| post_inference_hooks | Optional[List[PostInferenceHooks]] | List of hooks to trigger after inference tasks are served. | None |
| default_callback_url | Optional[str] | The default callback url to use for async endpoints. This can be overridden in the task parameters for each individual task. post_inference_hooks must contain "callback" for the callback to be triggered. | None |
| default_callback_auth_kind | Optional[Literal['basic', 'mtls']] | The default callback auth kind to use for async endpoints. Either "basic" or "mtls". This can be overridden in the task parameters for each individual task. | None |
| default_callback_auth_username | Optional[str] | The default callback auth username to use. This only applies if default_callback_auth_kind is "basic". This can be overridden in the task parameters for each individual task. | None |
| default_callback_auth_password | Optional[str] | The default callback auth password to use. This only applies if default_callback_auth_kind is "basic". This can be overridden in the task parameters for each individual task. | None |
| default_callback_auth_cert | Optional[str] | The default callback auth cert to use. This only applies if default_callback_auth_kind is "mtls". This can be overridden in the task parameters for each individual task. | None |
| default_callback_auth_key | Optional[str] | The default callback auth key to use. This only applies if default_callback_auth_kind is "mtls". This can be overridden in the task parameters for each individual task. | None |
| public_inference | Optional[bool] | Controls whether the endpoint is available for public inference. | None |
| update_if_exists | bool | If True, and an endpoint with endpoint_name already exists, the existing endpoint is updated; otherwise a new endpoint is created. | False |
| labels | Optional[Dict[str, str]] | An optional dictionary of key/value pairs to associate with this endpoint. | None |

Returns:

| Type | Description |
|---|---|
| Optional[Endpoint] | An Endpoint object that can be used to make requests to the endpoint. |
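A minimal sketch of creating a sync endpoint from an existing bundle; the endpoint and bundle names are placeholders.

```python
import os

from launch import LaunchClient

client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
endpoint = client.create_model_endpoint(
    endpoint_name="demo-endpoint-sync",
    model_bundle="test-bundle",  # a bundle name you have already created
    cpus=1,
    memory="8Gi",
    gpus=0,
    min_workers=1,
    max_workers=2,
    per_worker=10,
    endpoint_type="sync",
    update_if_exists=True,
    labels={"team": "MY_TEAM", "product": "MY_PRODUCT"},
)
```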
Delete a file

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| file_id | str | ID of the file | required |

Returns:

| Name | Type | Description |
|---|---|---|
| DeleteFileResponse | DeleteFileResponse | whether the deletion was successful |

Deletes an LLM model endpoint.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model_endpoint_name | str | The name of the model endpoint to delete. | required |

Deletes a model endpoint.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model_endpoint |  | A ModelEndpoint object (or its name) for the endpoint to delete. | required |
edit_model_endpoint(*, model_endpoint: Union[ModelEndpoint, str], model_bundle: Optional[Union[ModelBundle, str]] = None, cpus: Optional[float] = None, memory: Optional[str] = None, storage: Optional[str] = None, gpus: Optional[int] = None, min_workers: Optional[int] = None, max_workers: Optional[int] = None, per_worker: Optional[int] = None, gpu_type: Optional[str] = None, high_priority: Optional[bool] = None, post_inference_hooks: Optional[List[PostInferenceHooks]] = None, default_callback_url: Optional[str] = None, default_callback_auth_kind: Optional[Literal['basic', 'mtls']] = None, default_callback_auth_username: Optional[str] = None, default_callback_auth_password: Optional[str] = None, default_callback_auth_cert: Optional[str] = None, default_callback_auth_key: Optional[str] = None, public_inference: Optional[bool] = None) -> None

Edits an existing model endpoint. Here are the fields that cannot be edited on an existing endpoint:

- The endpoint's type, i.e. you cannot change a SyncEndpoint to an AsyncEndpoint or vice versa.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model_endpoint | Union[ModelEndpoint, str] | The model endpoint (or its name) you want to edit. The name must be unique across all endpoints that you own. | required |
| model_bundle | Optional[Union[ModelBundle, str]] | The ModelBundle (or its name) that the endpoint should serve. | None |
| cpus | Optional[float] | Number of cpus each worker should get, e.g. 1, 2, etc. This must be greater than or equal to 1. | None |
| memory | Optional[str] | Amount of memory each worker should get, e.g. "4Gi", "512Mi", etc. This must be a positive amount of memory. | None |
| storage | Optional[str] | Amount of local ephemeral storage each worker should get, e.g. "4Gi", "512Mi", etc. This must be a positive amount of storage. | None |
| gpus | Optional[int] | Number of gpus each worker should get, e.g. 0, 1, etc. | None |
| min_workers | Optional[int] | The minimum number of workers. Must be greater than or equal to 0. | None |
| max_workers | Optional[int] | The maximum number of workers. Must be greater than or equal to 0, as well as greater than or equal to min_workers. | None |
| per_worker | Optional[int] | The maximum number of concurrent requests that an individual worker can service. Launch automatically scales the number of workers for the endpoint so that each worker is processing per_worker requests, subject to the limits defined by min_workers and max_workers. | None |
| gpu_type | Optional[str] | If specifying a non-zero number of gpus, this controls the type of gpu requested. | None |
| high_priority | Optional[bool] | Either True or False. | None |
| post_inference_hooks | Optional[List[PostInferenceHooks]] | List of hooks to trigger after inference tasks are served. | None |
| default_callback_url | Optional[str] | The default callback url to use for async endpoints. This can be overridden in the task parameters for each individual task. post_inference_hooks must contain "callback" for the callback to be triggered. | None |
| default_callback_auth_kind | Optional[Literal['basic', 'mtls']] | The default callback auth kind to use for async endpoints. Either "basic" or "mtls". This can be overridden in the task parameters for each individual task. | None |
| default_callback_auth_username | Optional[str] | The default callback auth username to use. This only applies if default_callback_auth_kind is "basic". This can be overridden in the task parameters for each individual task. | None |
| default_callback_auth_password | Optional[str] | The default callback auth password to use. This only applies if default_callback_auth_kind is "basic". This can be overridden in the task parameters for each individual task. | None |
| default_callback_auth_cert | Optional[str] | The default callback auth cert to use. This only applies if default_callback_auth_kind is "mtls". This can be overridden in the task parameters for each individual task. | None |
| default_callback_auth_key | Optional[str] | The default callback auth key to use. This only applies if default_callback_auth_kind is "mtls". This can be overridden in the task parameters for each individual task. | None |
| public_inference | Optional[bool] | Controls whether the endpoint is available for public inference. | None |
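A minimal sketch of editing an existing endpoint, for example to scale it up; the endpoint name is a placeholder.

```python
import os

from launch import LaunchClient

client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))

# Scale an existing endpoint up and give each worker more memory.
client.edit_model_endpoint(
    model_endpoint="demo-endpoint-sync",  # hypothetical endpoint name
    min_workers=2,
    max_workers=8,
    memory="16Gi",
)
```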
Gets inference results from a previously created batch job.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| batch_job_id | str | An id representing the batch task job. This id is in the response from calling batch_async_request. | required |

Returns:

| Type | Description |
|---|---|
| Dict[str, Any] | A dictionary that contains the status and results of the batch job. |
For self-hosted mode only. Gets information about a batch job given a batch job id.

get_docker_image_batch_job_bundle(docker_image_batch_job_bundle_id: str) -> DockerImageBatchJobBundleResponse

For self-hosted mode only. Gets information for a single batch job bundle with a given id.

Get metadata about a file

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| file_id | str | ID of the file | required |

Returns:

| Name | Type | Description |
|---|---|---|
| GetFileResponse | GetFileResponse | ID, filename, and size of the requested file |
Get a file's content

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| file_id | str | ID of the file | required |

Returns:

| Name | Type | Description |
|---|---|---|
| GetFileContentResponse | GetFileContentResponse | ID and content of the requested file |
Get status of a fine-tune

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| fine_tune_id | str | ID of the fine-tune | required |

Returns:

| Name | Type | Description |
|---|---|---|
| GetFineTuneResponse | GetFineTuneResponse | ID and status of the requested fine-tune |
Get list of fine-tune events

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| fine_tune_id | str | ID of the fine-tune | required |

Returns:

| Name | Type | Description |
|---|---|---|
| GetFineTuneEventsResponse | GetFineTuneEventsResponse | a list of all the events of the fine-tune |
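A minimal sketch of checking on a fine-tune, assuming the client exposes these operations as get_fine_tune and get_fine_tune_events; the fine-tune ID is a placeholder.

```python
import os

from launch import LaunchClient

client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))

fine_tune_id = "ft-abc123"  # hypothetical ID returned when the fine-tune was created

status = client.get_fine_tune(fine_tune_id)        # ID and status of the fine-tune
print(status)

events = client.get_fine_tune_events(fine_tune_id)  # all events of the fine-tune
print(events)
```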
For self-hosted mode only. Gets information for the latest batch job bundle with a given name.

Get the latest version of a model bundle.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model_bundle_name | str | The name of the model bundle you want to get. | required |

Returns:

| Type | Description |
|---|---|
| ModelBundleV2Response | An object describing the bundle (its id, name, flavor, metadata, and related fields). |
get_llm_model_endpoint(endpoint_name: str) -> Optional[Union[AsyncEndpoint, SyncEndpoint, StreamingEndpoint]]

Gets a model endpoint associated with a name that the user has access to.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| endpoint_name | str | The name of the endpoint to retrieve. | required |
Returns a model bundle specified by bundle_name that the user owns.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model_bundle | Union[ModelBundle, str] | The bundle or its name. | required |

Returns:

| Type | Description |
|---|---|
| ModelBundle | A ModelBundle object. |
Get a model bundle.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model_bundle_id | str | The ID of the model bundle you want to get. | required |

Returns:

| Type | Description |
|---|---|
| ModelBundleV2Response | An object describing the bundle (its id, name, flavor, metadata, and related fields). |
Gets a model endpoint associated with a name.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| endpoint_name | str | The name of the endpoint to retrieve. | required |
list_docker_image_batch_job_bundles(bundle_name: Optional[str] = None, order_by: Optional[Literal['newest', 'oldest']] = None) -> ListDockerImageBatchJobBundleResponse

For self-hosted mode only. Gets information for multiple bundles.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| bundle_name | Optional[str] | The name of the bundles to retrieve. If not specified, this will retrieve all bundles. | None |
| order_by | Optional[Literal['newest', 'oldest']] | Either "newest", "oldest", or not specified. Specify to sort by newest/oldest. | None |
List files

Returns:

| Name | Type | Description |
|---|---|---|
| ListFilesResponse | ListFilesResponse | list of all files (ID, filename, and size) |

List fine-tunes

Returns:

| Name | Type | Description |
|---|---|---|
| ListFineTunesResponse | ListFineTunesResponse | list of all fine-tunes and their statuses |

Lists all LLM model endpoints that the user has access to.

Returns:

| Type | Description |
|---|---|
| List[Endpoint] | A list of Endpoint objects. |

Returns a list of model bundles that the user owns.

Returns:

| Type | Description |
|---|---|
| List[ModelBundle] | A list of ModelBundle objects |

List all model bundles.

Returns:

| Type | Description |
|---|---|
| ListModelBundlesV2Response | An object containing the list of Model Bundles (model_bundles). |

Lists all model endpoints that the user owns.

Returns:

| Type | Description |
|---|---|
| List[Endpoint] | A list of Endpoint objects. |
Download a fine-tuned model.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model_name | str | name of the model to download | required |
| download_format | str | format of the model to download | 'hugging_face' |

Returns:

| Name | Type | Description |
|---|---|---|
| ModelDownloadResponse | ModelDownloadResponse | dictionary with file names and urls to download the model |
Retrieves the logs for the creation of the endpoint.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model_endpoint | Union[ModelEndpoint, str] | The endpoint or its name. | required |
For self-hosted mode only. Registers a function that gives a location for batch CSV inputs. Should give different locations each time. This function is called as batch_csv_location_fn(), and should return a batch_csv_url that upload_batch_csv_fn can take.

Strictly, batch_csv_location_fn() does not need to return a str. The only requirement is that if batch_csv_location_fn returns a value of type T, then upload_batch_csv_fn() takes in an object of type T as its second argument (i.e. batch_csv_url).

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| batch_csv_location_fn | Callable[[], str] | Function that generates batch_csv_urls for upload_batch_csv_fn. | required |
For self-hosted mode only. Registers a function that gives a location for a model bundle. Should give different locations each time. This function is called as bundle_location_fn(), and should return a bundle_url that register_upload_bundle_fn can take.

Strictly, bundle_location_fn() does not need to return a str. The only requirement is that if bundle_location_fn returns a value of type T, then upload_bundle_fn() takes in an object of type T as its second argument (i.e. bundle_url).

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| bundle_location_fn | Callable[[], str] | Function that generates bundle_urls for upload_bundle_fn. | required |
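A minimal sketch for self-hosted mode that stores bundles in an S3 bucket; the bucket name and boto3-based uploader are placeholders, and it assumes the upload registration method is named register_upload_bundle_fn and that the client accepts a self_hosted flag.

```python
import os
import uuid

import boto3

from launch import LaunchClient

BUCKET = "my-self-hosted-launch-bucket"  # hypothetical bucket


def bundle_location_fn() -> str:
    # Return a fresh, unique location each time this is called.
    return f"s3://{BUCKET}/bundles/{uuid.uuid4()}"


def upload_bundle_fn(serialized_bundle: bytes, bundle_url: str) -> None:
    # Write the serialized bundle directly to the location chosen above.
    bucket, key = bundle_url.replace("s3://", "").split("/", 1)
    boto3.client("s3").put_object(Bucket=bucket, Key=key, Body=serialized_bundle)


client = LaunchClient(
    api_key=os.getenv("LAUNCH_API_KEY"),
    endpoint=os.getenv("LAUNCH_ENDPOINT"),
    self_hosted=True,  # assumes the client supports a self_hosted flag
)
client.register_bundle_location_fn(bundle_location_fn)
client.register_upload_bundle_fn(upload_bundle_fn)  # assumed method name
```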
For self-hosted mode only. Registers a function that handles batch text upload. This function is called as

upload_batch_csv_fn(csv_text, csv_url)

This function should directly write the contents of csv_text as a text string into csv_url.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| upload_batch_csv_fn | Callable[[str, str], None] | Function that takes in a csv text (string type), and uploads that CSV to an appropriate location. Only needed for self-hosted mode. | required |
For self-hosted mode only. Registers a function that handles model bundle upload. This function is called as

upload_bundle_fn(serialized_bundle, bundle_url)

This function should directly write the contents of serialized_bundle as a binary string into bundle_url.

See register_bundle_location_fn for more notes on the signature of upload_bundle_fn.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| upload_bundle_fn | Callable[[str, str], None] | Function that takes in a serialized bundle (bytes type), and uploads that bundle to an appropriate location. Only needed for self-hosted mode. | required |
For self-hosted mode only. Updates a batch job by id. Use this if you want to cancel/delete a batch job.

Upload a file

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| file_path | str | Path to a local file to upload. | required |

Returns:

| Name | Type | Description |
|---|---|---|
| UploadFileResponse | UploadFileResponse | ID of the created file |
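A minimal sketch of the file workflow, assuming the client exposes these operations as upload_file, get_file, and get_file_content; the local path is a placeholder.

```python
import os

from launch import LaunchClient

client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))

upload_response = client.upload_file("./training_data.csv")  # hypothetical local file
file_id = upload_response.id  # assumes the response exposes the file ID as `.id`

print(client.get_file(file_id))          # metadata: ID, filename, and size
print(client.get_file_content(file_id))  # ID and content of the file
```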
EndpointRequest(url: Optional[str] = None, args: Optional[Dict] = None, callback_url: Optional[str] = None, callback_auth_kind: Optional[Literal['basic', 'mtls']] = None, callback_auth_username: Optional[str] = None, callback_auth_password: Optional[str] = None, callback_auth_cert: Optional[str] = None, callback_auth_key: Optional[str] = None, return_pickled: Optional[bool] = False, request_id: Optional[str] = None)

Represents a single request to either a SyncEndpoint, StreamingEndpoint, or AsyncEndpoint.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| url | Optional[str] | A url to some file that can be read in to a ModelBundle's predict function. Can be an image, raw text, etc. Note: the contents of the file located at url are read and passed to the predict function. Exactly one of url and args must be specified. | None |
| args | Optional[Dict] | A dictionary with arguments to a ModelBundle's predict function. If the predict function has signature predict_fn(foo, bar), then args should be a dictionary with keys foo and bar. Exactly one of url and args must be specified. | None |
| return_pickled | Optional[bool] | Whether the output should be a pickled python object, or directly returned serialized json. | False |
| callback_url | Optional[str] | The callback url to use for this task. If None, then the default_callback_url of the endpoint is used. The endpoint must specify "callback" as a post-inference hook for the callback to be triggered. | None |
| callback_auth_kind | Optional[Literal['basic', 'mtls']] | The callback auth kind to use for this task. Either "basic" or "mtls". If None, the endpoint's default callback auth is used. | None |
| callback_auth_username | Optional[str] | The callback auth username to use for this task. This only applies if callback_auth_kind is "basic". | None |
| callback_auth_password | Optional[str] | The callback auth password to use for this task. This only applies if callback_auth_kind is "basic". | None |
| callback_auth_cert | Optional[str] | The callback auth cert to use for this task. This only applies if callback_auth_kind is "mtls". | None |
| callback_auth_key | Optional[str] | The callback auth key to use for this task. This only applies if callback_auth_kind is "mtls". | None |
| request_id | Optional[str] | (deprecated) A user-specifiable id for requests. Should be unique among EndpointRequests made in the same batch call. If one isn't provided the client will generate its own. | None |
EndpointResponse(client, status: str, result_url: Optional[str] = None, result: Optional[str] = None, traceback: Optional[str] = None)

Represents a response received from an Endpoint.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| client |  | An instance of LaunchClient. | required |
| status | str | A string representing the status of the request. | required |
| result_url | Optional[str] | A string that is a url containing the pickled python object from the Endpoint's predict function. Exactly one of result_url or result will be populated, depending on the value of return_pickled in the request. | None |
| result | Optional[str] | A string that is the serialized return value (in json form) of the Endpoint's predict function. Exactly one of result_url or result will be populated, depending on the value of return_pickled in the request. | None |
| traceback | Optional[str] | The stack trace if the inference endpoint raised an error. Can be used for debugging. | None |
Represents a future response from an Endpoint. Specifically, when the EndpointResponseFuture is ready, then its get method will return an actual instance of EndpointResponse.

This object should not be directly instantiated by the user.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| client |  | An instance of LaunchClient. | required |
| endpoint_name | str | The name of the endpoint. | required |
| async_task_id | str | An async task id. | required |

Retrieves the EndpointResponse for the prediction request after it completes. This method blocks.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| timeout | Optional[float] | The maximum number of seconds to wait for the response. If None, then the method will block indefinitely until the response is ready. | None |
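A small usage sketch: block on an async prediction with a timeout. The endpoint name is a placeholder for an async endpoint you have already created.

```python
import os

from launch import EndpointRequest, LaunchClient

client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
endpoint = client.get_model_endpoint("demo-endpoint-async")  # hypothetical async endpoint

future = endpoint.predict(request=EndpointRequest(args={"x": 2, "y": "hello"}))
response = future.get(timeout=60.0)  # blocks for at most 60 seconds
print(response.status, response.result)
```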
Bases: Iterator

Represents a stream response from an Endpoint. This object is iterable and yields EndpointResponse objects.

This object should not be directly instantiated by the user.

Bases: str, Enum

Post-inference hooks are functions that are called after inference is complete.

Attributes:

| Name | Type | Description |
|---|---|---|
| CALLBACK | str | The callback hook is called with the inference response and the task ID. |
We provide some APIs to conveniently create, list and inference with LLMs. Under the hood they are Launch model endpoints.
+import os
+
+from rich import print
+
+from launch import LaunchClient
+from launch.api_client.model.llm_inference_framework import (
+ LLMInferenceFramework,
+)
+from launch.api_client.model.llm_source import LLMSource
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"), endpoint=os.getenv("LAUNCH_ENDPOINT"))
+
+endpoints = client.list_llm_model_endpoints()
+
+print(endpoints)
+
+endpoint_name = "test-flan-t5-xxl"
+client.create_llm_model_endpoint(
+ endpoint_name=endpoint_name,
+ model_name="flan-t5-xxl",
+ source=LLMSource.HUGGING_FACE,
+ inference_framework=LLMInferenceFramework.DEEPSPEED,
+ inference_framework_image_tag=os.getenv("INFERENCE_FRAMEWORK_IMAGE_TAG"),
+ num_shards=4,
+ min_workers=1,
+ max_workers=1,
+ gpus=4,
+ endpoint_type="sync",
+)
+
+# Wait for the endpoint to be ready
+
+output = client.completions_sync(endpoint_name, prompt="What is Deep Learning?", max_new_tokens=10, temperature=0)
+print(output)
+
+ Bases: BaseModel
instance-attribute
+
+
+¶Optional configuration for the application.
+class-attribute
+ instance-attribute
+
+
+¶framework: Union[PytorchFramework, TensorflowFramework, CustomFramework] = Field(..., discriminator='framework_type')
+Machine Learning framework specification. Either
+PytorchFramework,
+TensorflowFramework, or
+CustomFramework.
instance-attribute
+
+
+¶Function which, when called, returns the model object.
+instance-attribute
+
+
+¶Function which, when called, returns the prediction function.
+instance-attribute
+
+
+¶List of requirements to install in the environment before running the model.
+
+ Bases: BaseModel
Response object for creating a Model Bundle.
+ + + + +instance-attribute
+
+
+¶ID of the Model Bundle.
+
+ Bases: BaseModel
+ Bases: BaseModel
Response object for listing Model Bundles.
+ + + + +instance-attribute
+
+
+¶A list of Model Bundles.
+dataclass
+
+
+¶Represents a ModelBundle.
+ + + + +class-attribute
+ instance-attribute
+
+
+¶An optional user-specified configuration mapping for the bundle.
+class-attribute
+ instance-attribute
+
+
+¶A dictionary that dictates environment information. See LaunchClient.create_model_bundle +for more information.
+class-attribute
+ instance-attribute
+
+
+¶A globally unique identifier for the bundle.
+class-attribute
+ instance-attribute
+
+
+¶An opaque location for the bundle.
+class-attribute
+ instance-attribute
+
+
+¶Arbitrary metadata for the bundle.
+instance-attribute
+
+
+¶The name of the bundle. Must be unique across all bundles that the user owns.
+class-attribute
+ instance-attribute
+
+
+¶The packaging type for the bundle. Can be cloudpickle or zip.
class-attribute
+ instance-attribute
+
+
+¶A list of Python package requirements for the bundle. See LaunchClient.create_model_bundle +for more information.
+
+ Bases: BaseModel
Response object for a single Model Bundle.
+ + + + +instance-attribute
+
+
+¶Timestamp of when the Model Bundle was created.
+class-attribute
+ instance-attribute
+
+
+¶Flavor of the Model Bundle, representing how the model bundle was packaged.
+See ModelBundleFlavors for details.
instance-attribute
+
+
+¶Metadata associated with the Model Bundle.
+instance-attribute
+
+
+¶IDs of the Model Artifacts associated with the Model Bundle.
+instance-attribute
+
+
+¶Name of the Model Bundle.
+
+ Bases: BaseModel
instance-attribute
+
+
+¶Image tag of the Pytorch image to use.
+
+ Bases: RunnableImageLike
Model bundles that use custom docker images that expose an HTTP server for inference.
+ + + + +
+ Bases: BaseModel
instance-attribute
+
+
+¶Tensorflow version to use.
+
+ Bases: BaseModel
class-attribute
+ instance-attribute
+
+
+¶Optional configuration for the application.
+class-attribute
+ instance-attribute
+
+
+¶framework: Union[PytorchFramework, TensorflowFramework, CustomFramework] = Field(..., discriminator='framework_type')
+Machine Learning framework specification. Either
+PytorchFramework,
+TensorflowFramework, or
+CustomFramework.
instance-attribute
+
+
+¶Path to the module to load the model object.
+instance-attribute
+
+
+¶Path to the module to load the prediction function.
+instance-attribute
+
+
+¶List of requirements to install in the environment before running the model.
+All classes here are returned by the
+get_model_endpoint
+method and provide a predict function.
Bases: Endpoint

An asynchronous model endpoint.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model_endpoint | ModelEndpoint | ModelEndpoint object. | required |
| client |  | A LaunchClient object | required |

Runs an asynchronous prediction request.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| request | EndpointRequest | The EndpointRequest to send to the endpoint. | required |

Returns:

| Type | Description |
|---|---|
| EndpointResponseFuture | An EndpointResponseFuture whose get method returns the EndpointResponse once the prediction completes. |

Example:

    my_endpoint = AsyncEndpoint(...)
    f: EndpointResponseFuture = my_endpoint.predict(EndpointRequest(...))
    result = f.get()  # blocks on completion

(deprecated) Runs inference on the data items specified by urls. Returns an AsyncEndpointBatchResponse.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| requests | Sequence[EndpointRequest] | List of EndpointRequests. Request_ids must all be distinct. | required |

Returns:

| Type | Description |
|---|---|
| AsyncEndpointBatchResponse | an AsyncEndpointBatchResponse keeping track of the inference requests made |

Bases: Endpoint

A synchronous model endpoint.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model_endpoint | ModelEndpoint | ModelEndpoint object. | required |
| client |  | A LaunchClient object | required |

Runs a synchronous prediction request.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| request | EndpointRequest | The EndpointRequest to send to the endpoint. | required |

Bases: Endpoint

A streaming model endpoint.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model_endpoint | ModelEndpoint | ModelEndpoint object. | required |
| client |  | A LaunchClient object | required |

Runs a streaming prediction request.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| request | EndpointRequest | The EndpointRequest to send to the endpoint. | required |

Returns:

| Type | Description |
|---|---|
| EndpointResponseStream | An EndpointResponseStream that iterates over the streamed response chunks. |
Launch comes with a CLI for listing bundles / endpoints, editing endpoints, +and sending tasks to endpoints.
+The CLI can be used as scale-launch ....
Run scale-launch --help for more options.
This is the command line interface (CLI) package for Scale Launch.
+
+ ██╗ █████╗ ██╗ ██╗███╗ ██╗ ██████╗██╗ ██╗
+ ██║ ██╔══██╗██║ ██║████╗ ██║██╔════╝██║ ██║
+ ██║ ███████║██║ ██║██╔██╗ ██║██║ ███████║
+ ██║ ██╔══██║██║ ██║██║╚██╗██║██║ ██╔══██║
+ ███████╗██║ ██║╚██████╔╝██║ ╚████║╚██████╗██║ ██║
+ ╚══════╝╚═╝ ╚═╝ ╚═════╝ ╚═╝ ╚═══╝ ╚═════╝╚═╝ ╚═╝
+
+Usage: scale-launch [OPTIONS] COMMAND [ARGS]...
+
+Options:
+ --help Show this message and exit.
+
+Commands:
+ batch-jobs Batch Jobs is a wrapper around batch jobs in Scale Launch
+ bundles Bundles is a wrapper around model bundles in Scale Launch
+ config Config is a wrapper around getting and setting your API key and other configuration options
+ endpoints Endpoints is a wrapper around model endpoints in Scale Launch
+ tasks Tasks is a wrapper around sending requests to endpoints
+For predicting over a larger set of tasks (> 50) at once, it is recommended to +use batch jobs. Batch jobs are a way to send a large number of tasks to a model +bundle. The tasks are processed in parallel, and the results are returned as a +list of predictions.
+Batch jobs are created using the
+batch_async_request
+method of the
+LaunchClient.
import logging
+import os
+import time
+from launch import LaunchClient
+
+logger = logging.getLogger(__name__)
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+batch_job = client.batch_async_request(
+ model_bundle="test-bundle",
+ inputs=[
+ {"x": 2, "y": "hello"},
+ {"x": 3, "y": "world"},
+ ],
+ gpus=0,
+ labels={
+ "team": "MY_TEAM",
+ "product": "MY_PRODUCT",
+ }
+)
+
+status = "PENDING"
+res = None
+while status != "SUCCESS" and status != "FAILURE" and status != "CANCELLED":
+ time.sleep(30)
+ res = client.get_batch_async_response(batch_job["job_id"])
+ status = res["status"]
+ logging.info(f"the batch job is {status}")
+
+logging.info(res)
+Async model endpoints can be configured to send callbacks to a user-defined +callback URL. Callbacks are sent as HTTP POST requests with a JSON body. The +following code snippet shows how to create an async model endpoint with a +callback URL.
+To configure an async endpoint to send callbacks, set the post_inference_hooks
+field to include
+launch.PostInferenceHooks.CALLBACK.
+A callback URL also needs to be specified, and it can be configured as a default
+using the default_callback_url argument to
+launch.LaunchClient.create_model_endpoint
+or as a per-task override using the callback_url field of
+launch.EndpointRequest.
Note
+Callbacks will not be sent if the endpoint does not have any post-inference
+hooks specified, even if a default_callback_url is provided to the endpoint
+creation method or if the prediction request has a callback_url override.
import os
+import time
+from launch import EndpointRequest, LaunchClient, PostInferenceHooks
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+endpoint = client.create_model_endpoint(
+ endpoint_name="demo-endpoint-callback",
+ model_bundle="test-bundle",
+ cpus=1,
+ min_workers=1,
+ endpoint_type="async",
+ update_if_exists=True,
+ labels={
+ "team": "MY_TEAM",
+ "product": "MY_PRODUCT",
+ },
+ post_inference_hooks=[PostInferenceHooks.CALLBACK],
+ default_callback_url="https://example.com",
+)
+
+while endpoint.status() != "READY":
+ time.sleep(10)
+
+future_default = endpoint.predict(
+ request=EndpointRequest(args={"x": 2, "y": "hello"})
+)
+"""
+A callback is sent to https://example.com with the following JSON body:
+{
+ "task_id": "THE_TASK_ID",
+ "result": 7
+}
+"""
+
+future_custom_callback_url = endpoint.predict(
+ request=EndpointRequest(
+ args={"x": 3, "y": "hello"}, callback_url="https://example.com/custom"
+ ),
+)
+
+"""
+A callback is sent to https://example.com/custom with the following JSON body:
+{
+ "task_id": "THE_TASK_ID",
+ "result": 8
+}
+"""
+Warning
+This feature is currently in beta, and the API is likely to change.
+Callbacks can be authenticated using shared authentication headers. To enable authentication,
+set either default_callback_auth_kind when creating the endpoint or callback_auth_kind
+when making a prediction request.
Currently, the supported authentication methods are basic and mtls. If basic is used,
+then the default_callback_auth_username and default_callback_auth_password fields must be
+specified when creating the endpoint, or the callback_auth_username and callback_auth_password
+fields must be specified when making a prediction request. If mtls is used, then the
+same is true for the default_callback_auth_cert and default_callback_auth_key fields,
+or the callback_auth_cert and callback_auth_key fields.
import os
+import time
+from launch import EndpointRequest, LaunchClient, PostInferenceHooks
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+endpoint = client.create_model_endpoint(
+ endpoint_name="demo-endpoint-callback",
+ model_bundle="test-bundle",
+ cpus=1,
+ min_workers=1,
+ endpoint_type="async",
+ update_if_exists=True,
+ labels={
+ "team": "MY_TEAM",
+ "product": "MY_PRODUCT",
+ },
+ post_inference_hooks=[PostInferenceHooks.CALLBACK],
+ default_callback_url="https://example.com",
+ default_callback_auth_kind="basic",
+ default_callback_auth_username="user",
+ default_callback_auth_password="password",
+)
+
+while endpoint.status() != "READY":
+ time.sleep(10)
+
+future_default = endpoint.predict(
+ request=EndpointRequest(args={"x": 2, "y": "hello"})
+)
+"""
+A callback is sent to https://example.com with ("user", "password") as the basic auth.
+"""
+
+future_custom_callback_auth = endpoint.predict(
+ request=EndpointRequest(
+ args={"x": 3, "y": "hello"},
+ callback_auth_kind="mtls",
+ callback_auth_cert="cert",
+ callback_auth_key="key",
+ ),
+)
+"""
+A callback is sent with mTLS authentication.
+"""
+
+client.edit_model_endpoint(
+ model_endpoint=endpoint.model_endpoint,
+ default_callback_auth_kind="mtls",
+ default_callback_auth_cert="cert",
+ default_callback_auth_key="key",
+)
+
+while endpoint.status() != "READY":
+ time.sleep(10)
+
+future_default = endpoint.predict(
+ request=EndpointRequest(args={"x": 2, "y": "hello"})
+)
+"""
+A callback is sent with mTLS auth.
+"""
+
+future_custom_callback_auth = endpoint.predict(
+ request=EndpointRequest(
+ args={"x": 3, "y": "hello"},
+ callback_auth_kind="basic",
+ callback_auth_username="user",
+ callback_auth_password="pass",
+ ),
+)
+"""
+A callback is sent with ("user", "pass") as the basic auth.
+"""
+Once endpoints have been created, users can send tasks to them to make +predictions. The following code snippet shows how to send tasks to endpoints.
+import os
+from launch import EndpointRequest, LaunchClient
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+endpoint = client.get_model_endpoint("demo-endpoint-async")
+future = endpoint.predict(request=EndpointRequest(args={"x": 2, "y": "hello"}))
+response = future.get()
+print(response)
+import os
+from launch import EndpointRequest, LaunchClient
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+endpoint = client.get_model_endpoint("demo-endpoint-streaming")
+response = endpoint.predict(request=EndpointRequest(args={"x": 2, "y": "hello"}))
+for chunk in response:
+ print(chunk)
+EndpointRequest(url: Optional[str] = None, args: Optional[Dict] = None, callback_url: Optional[str] = None, callback_auth_kind: Optional[Literal['basic', 'mtls']] = None, callback_auth_username: Optional[str] = None, callback_auth_password: Optional[str] = None, callback_auth_cert: Optional[str] = None, callback_auth_key: Optional[str] = None, return_pickled: Optional[bool] = False, request_id: Optional[str] = None)
+Represents a single request to either a SyncEndpoint, StreamingEndpoint, or AsyncEndpoint.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `url` | `Optional[str]` | A url to some file that can be read in to a ModelBundle's predict function. Can be an image, raw text, etc. Exactly one of `url` and `args` must be specified. | `None` |
| `args` | `Optional[Dict]` | A dictionary of arguments to a ModelBundle's predict function. If the predict function has signature `predict_fn(foo, bar)`, then `args` should have the keys `foo` and `bar`. Exactly one of `url` and `args` must be specified. | `None` |
| `return_pickled` | `Optional[bool]` | Whether the output should be a pickled python object, or directly returned serialized json. | `False` |
| `callback_url` | `Optional[str]` | The callback url to use for this task. If `None`, then the `default_callback_url` of the endpoint is used. The endpoint must specify "callback" as a post-inference hook for the callback to be triggered. | `None` |
| `callback_auth_kind` | `Optional[Literal['basic', 'mtls']]` | The callback auth kind to use for this task, either "basic" or "mtls". Overrides the endpoint's default callback auth kind. | `None` |
| `callback_auth_username` | `Optional[str]` | The callback auth username to use for this task. Only applies if `callback_auth_kind` is "basic". | `None` |
| `callback_auth_password` | `Optional[str]` | The callback auth password to use for this task. Only applies if `callback_auth_kind` is "basic". | `None` |
| `callback_auth_cert` | `Optional[str]` | The callback auth cert to use for this task. Only applies if `callback_auth_kind` is "mtls". | `None` |
| `callback_auth_key` | `Optional[str]` | The callback auth key to use for this task. Only applies if `callback_auth_kind` is "mtls". | `None` |
| `request_id` | `Optional[str]` | (deprecated) A user-specifiable id for requests. Should be unique among EndpointRequests made in the same batch call. If one isn't provided, the client will generate its own. | `None` |
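For instance, a task that requests a callback protected by basic auth might look like the following sketch. The callback URL and credentials are placeholders, and the endpoint must have "callback" configured as a post-inference hook for the callback to fire.

```python
import os
from launch import EndpointRequest, LaunchClient

client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
endpoint = client.get_model_endpoint("demo-endpoint-async")

# Override the endpoint's default callback settings for this one task.
request = EndpointRequest(
    args={"x": 2, "y": "hello"},
    callback_url="https://example.com/launch-callback",  # placeholder
    callback_auth_kind="basic",
    callback_auth_username="my-username",  # placeholder
    callback_auth_password="my-password",  # placeholder
)
future = endpoint.predict(request=request)
```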
Represents a future response from an Endpoint. Specifically, when the EndpointResponseFuture is ready,
its get method will return an actual instance of EndpointResponse.

This object should not be directly instantiated by the user.

Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `client` | `LaunchClient` | An instance of `LaunchClient`. | required |
| `endpoint_name` | `str` | The name of the endpoint. | required |
| `async_task_id` | `str` | An async task id. | required |
Retrieves the EndpointResponse for the prediction request after it completes. This method blocks.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `timeout` | `Optional[float]` | The maximum number of seconds to wait for the response. If `None`, then the method blocks indefinitely until the response is ready. | `None` |
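For example, to bound how long the client waits on an async prediction, pass a timeout to `get`. This is a minimal sketch reusing the async endpoint from the snippets above:

```python
import os
from launch import EndpointRequest, LaunchClient

client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
endpoint = client.get_model_endpoint("demo-endpoint-async")
future = endpoint.predict(request=EndpointRequest(args={"x": 2, "y": "hello"}))

# Wait at most 60 seconds for the prediction to complete.
response = future.get(timeout=60.0)
print(response)
```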
EndpointResponse(client, status: str, result_url: Optional[str] = None, result: Optional[str] = None, traceback: Optional[str] = None)
Represents a response received from an Endpoint.

Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `client` | `LaunchClient` | An instance of `LaunchClient`. | required |
| `status` | `str` | A string representing the status of the request. | required |
| `result_url` | `Optional[str]` | A string that is a url containing the pickled python object from the Endpoint's predict function. Exactly one of `result_url` and `result` is populated, depending on the value of `return_pickled` in the request. | `None` |
| `result` | `Optional[str]` | A string that is the serialized return value (in json form) of the Endpoint's predict function; it can be deserialized with `json.loads()`. Exactly one of `result_url` and `result` is populated, depending on the value of `return_pickled` in the request. | `None` |
| `traceback` | `Optional[str]` | The stack trace if the inference endpoint raised an error. Can be used for debugging. | `None` |
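A minimal sketch of consuming an EndpointResponse follows. The exact status strings are illustrative here, so check the status values returned by your deployment before relying on them.

```python
import json
import os
from launch import EndpointRequest, LaunchClient

client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
endpoint = client.get_model_endpoint("demo-endpoint-async")
future = endpoint.predict(
    request=EndpointRequest(args={"x": 2, "y": "hello"}, return_pickled=False)
)
response = future.get()

if response.status == "SUCCESS":  # illustrative status string
    # With return_pickled=False, result holds serialized json, so json.loads recovers the value.
    print(json.loads(response.result))
else:
    # The traceback field carries the stack trace from the endpoint for debugging.
    print(response.traceback)
```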
Bases: Iterator
Represents a stream response from an Endpoint. This object is iterable and yields
+EndpointResponse objects.
This object should not be directly instantiated by the user.
Model Bundles are deployable models that can be used to make predictions. They are created by packaging a model up into a deployable format.
+There are five methods for creating model bundles:
+create_model_bundle_from_callable_v2,
+create_model_bundle_from_dirs_v2,
+create_model_bundle_from_runnable_image_v2,
+create_model_bundle_from_triton_enhanced_runnable_image_v2,
+and create_model_bundle_from_streaming_enhanced_runnable_image_v2.
The first directly pickles a user-specified load_predict_fn, a function which
+loads the model and returns a predict_fn, a function which takes in a request.
+The second takes in directories containing a load_predict_fn and the
+module path to the load_predict_fn.
+The third takes a Docker image and a command that starts a process listening for
+requests at port 5005 using HTTP and exposes POST /predict and
+GET /readyz endpoints.
The fourth is a variant of the third that also starts an instance of the NVIDIA
Triton Inference Server for efficient model serving.
+The fifth is a variant of the third that responds with a stream of SSEs at POST /stream (the user
+can decide whether POST /predict is also exposed).
Each of these modes of creating a model bundle is called a "Flavor".
Info

Here are some tips for how to choose between the different flavors of ModelBundle:

A CloudpickleArtifactFlavor (creating from callable) is good if:

A ZipArtifactFlavor (creating from directories) is good if:

A RunnableImageFlavor (creating from runnable image) is good if:

A TritonEnhancedRunnableImageFlavor (a runnable image variant) is good if:

- You want the benefits of a RunnableImageFlavor, and
- you use tritonserver to accelerate model inference.

A StreamingEnhancedRunnableImageFlavor (a runnable image variant) is good if:

- You want the benefits of a RunnableImageFlavor, and
- you need results streamed back (for example via SSEs at POST /stream).

import os
+from pydantic import BaseModel
+from launch import LaunchClient
+
+
+class MyRequestSchema(BaseModel):
+ x: int
+ y: str
+
+class MyResponseSchema(BaseModel):
+ __root__: int
+
+
+def my_load_predict_fn(model):
+ def returns_model_of_x_plus_len_of_y(x: int, y: str) -> int:
+ """MyRequestSchema -> MyResponseSchema"""
+ assert isinstance(x, int) and isinstance(y, str)
+ return model(x) + len(y)
+
+ return returns_model_of_x_plus_len_of_y
+
+
+def my_load_model_fn():
+ def my_model(x):
+ return x * 2
+
+ return my_model
+
+BUNDLE_PARAMS = {
+ "model_bundle_name": "test-bundle",
+ "load_model_fn": my_load_model_fn,
+ "load_predict_fn": my_load_predict_fn,
+ "request_schema": MyRequestSchema,
+ "response_schema": MyResponseSchema,
+ "requirements": ["pytest==7.2.1", "numpy"], # list your requirements here
+ "pytorch_image_tag": "1.7.1-cuda11.0-cudnn8-runtime",
+}
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+client.create_model_bundle_from_callable_v2(**BUNDLE_PARAMS)
+import os
+import tempfile
+from pydantic import BaseModel
+from launch import LaunchClient
+
+directory = tempfile.mkdtemp()
+model_filename = os.path.join(directory, "model.py")
+with open(model_filename, "w") as f:
+ f.write("""def my_load_model_fn(deserialized_config):
+ def my_model(x):
+ return x * 2
+
+ return my_model
+""")
+
+predict_filename = os.path.join(directory, "predict.py")
+with open(predict_filename, "w") as f:
+ f.write("""def my_load_predict_fn(deserialized_config, model):
+ def returns_model_of_x_plus_len_of_y(x: int, y: str) -> int:
+ assert isinstance(x, int) and isinstance(y, str)
+ return model(x) + len(y)
+
+ return returns_model_of_x_plus_len_of_y
+""")
+
+requirements_filename = os.path.join(directory, "requirements.txt")
+with open(requirements_filename, "w") as f:
+ f.write("""
+pytest==7.2.1
+numpy
+""")
+
+"""
+The directory structure should now look like
+
+directory/
+ model.py
+ predict.py
+ requirements.txt
+"""
+
+
+class MyRequestSchema(BaseModel):
+ x: int
+ y: str
+
+class MyResponseSchema(BaseModel):
+ __root__: int
+
+print(directory)
+print(model_filename)
+print(predict_filename)
+print(requirements_filename)
+
+BUNDLE_PARAMS = {
+ "model_bundle_name": "test-bundle-from-dirs",
+ "base_paths": [directory],
+ "load_predict_fn_module_path": f"{os.path.basename(directory)}.predict.my_load_predict_fn",
+ "load_model_fn_module_path": f"{os.path.basename(directory)}.model.my_load_model_fn",
+ "request_schema": MyRequestSchema,
+ "response_schema": MyResponseSchema,
+ "requirements_path": requirements_filename,
+ "pytorch_image_tag": "1.7.1-cuda11.0-cudnn8-runtime",
+}
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+client.create_model_bundle_from_dirs_v2(**BUNDLE_PARAMS)
+
+# Clean up files from demo
+os.remove(model_filename)
+os.remove(predict_filename)
+os.remove(requirements_filename)
+os.rmdir(directory)
+import os
+from pydantic import BaseModel
+from launch import LaunchClient
+
+
+class MyRequestSchema(BaseModel):
+ x: int
+ y: str
+
+class MyResponseSchema(BaseModel):
+ __root__: int
+
+
+BUNDLE_PARAMS = {
+ "model_bundle_name": "test-bundle",
+ "request_schema": MyRequestSchema,
+ "response_schema": MyResponseSchema,
+ "repository": "...",
+ "tag": "...",
+ "command": ...,
+ "predict_route": "/predict",
+ "healthcheck_route": "/readyz",
+ "env": {
+ "TEST_KEY": "test_value",
+ },
+ "readiness_initial_delay_seconds": 30,
+}
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+client.create_model_bundle_from_runnable_image_v2(**BUNDLE_PARAMS)
+import os
+from pydantic import BaseModel
+from launch import LaunchClient
+
+
+class MyRequestSchema(BaseModel):
+ x: int
+ y: str
+
+class MyResponseSchema(BaseModel):
+ __root__: int
+
+
+BUNDLE_PARAMS = {
+ "model_bundle_name": "test-triton-bundle",
+ "request_schema": MyRequestSchema,
+ "response_schema": MyResponseSchema,
+ "repository": "...",
+ "tag": "...",
+ "command": ...,
+ "predict_route": "/predict",
+ "healthcheck_route": "/readyz",
+ "env": {
+ "TEST_KEY": "test_value",
+ },
+ "readiness_initial_delay_seconds": 30,
+ "triton_model_repository": "...",
+ "triton_model_replicas": {"": ""},
+ "triton_num_cpu": 4.0,
+ "triton_commit_tag": "",
+ "triton_storage": "",
+ "triton_memory": "",
+ "triton_readiness_initial_delay_seconds": 300,
+}
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+client.create_model_bundle_from_triton_enhanced_runnable_image_v2(**BUNDLE_PARAMS)
+import os
+from pydantic import BaseModel
+from launch import LaunchClient
+
+
+class MyRequestSchema(BaseModel):
+ x: int
+ y: str
+
+class MyResponseSchema(BaseModel):
+ __root__: int
+
+
+BUNDLE_PARAMS = {
+ "model_bundle_name": "test-streaming-bundle",
+ "request_schema": MyRequestSchema,
+ "response_schema": MyResponseSchema,
+ "repository": "...",
+ "tag": "...",
+ "command": ..., # optional; if provided, will also expose the /predict endpoint
+ "predict_route": "/predict",
+ "healthcheck_route": "/readyz",
+ "streaming_command": ..., # required
+ "streaming_predict_route": "/stream",
+ "env": {
+ "TEST_KEY": "test_value",
+ },
+ "readiness_initial_delay_seconds": 30,
+}
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+client.create_model_bundle_from_streaming_enhanced_runnable_image_v2(**BUNDLE_PARAMS)
+The app_config field of a model bundle is a dictionary that can be used to
+configure the model bundle. If specified, the app_config is passed to the
+load_predict_fn when the model bundle is deployed, alongside the model. This
+can allow for more code reuse between multiple bundles that perform similar
+tasks.
import os
+from launch import LaunchClient
+from pydantic import BaseModel
+from typing import List, Union
+from typing_extensions import Literal
+
+
+class MyRequestSchemaSingle(BaseModel):
+ kind: Literal['single']
+ x: int
+ y: str
+
+class MyRequestSchemaBatched(BaseModel):
+ kind: Literal['batched']
+ x: List[int]
+ y: List[str]
+
+class MyRequestSchema(BaseModel):
+ __root__: Union[MyRequestSchemaSingle, MyRequestSchemaBatched]
+
+class MyResponseSchema(BaseModel):
+ __root__: Union[int, List[int]]
+
+
+def my_load_predict_fn(app_config, model):
+ def returns_model_of_x_plus_len_of_y(x: Union[int, List[int]], y: Union[str, List[str]]) -> Union[int, List[int]]:
+ """MyRequestSchema -> MyResponseSchema"""
+ if app_config["mode"] == "single":
+ assert isinstance(x, int) and isinstance(y, str)
+ return model(x) + len(y)
+
+ result = []
+ for x_i, y_i in zip(x, y):
+ result.append(model(x_i) + len(y_i))
+ return result
+
+ return returns_model_of_x_plus_len_of_y
+
+
+def my_load_model_fn(app_config):
+ def my_model_single(x: int):
+ return x * 2
+
+ def my_model_batched(x: List[int]):
+ return [my_model_single(x_i) for x_i in x]
+
+ if app_config["mode"] == "single":
+ return my_model_single
+
+ return my_model_batched
+
+
+BUNDLE_PARAMS_SINGLE = {
+ "model_bundle_name": "test-bundle-single",
+ "load_predict_fn": my_load_predict_fn,
+ "load_model_fn": my_load_model_fn,
+ "requirements": ["pytest==7.2.1", "numpy"],
+ "request_schema": MyRequestSchema,
+ "response_schema": MyResponseSchema,
+ "pytorch_image_tag": "1.7.1-cuda11.0-cudnn8-runtime",
+ "app_config": {"mode": "single"},
+}
+BUNDLE_PARAMS_BATCHED = {
+ "model_bundle_name": "test-bundle-batched",
+ "load_predict_fn": my_load_predict_fn,
+ "load_model_fn": my_load_model_fn,
+ "requirements": ["pytest==7.2.1", "numpy"],
+ "request_schema": MyRequestSchema,
+ "response_schema": MyResponseSchema,
+ "pytorch_image_tag": "1.7.1-cuda11.0-cudnn8-runtime",
+ "app_config": {"mode": "batched"},
+}
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+bundle_single = client.create_model_bundle_from_callable_v2(**BUNDLE_PARAMS_SINGLE)
+bundle_batch = client.create_model_bundle_from_callable_v2(**BUNDLE_PARAMS_BATCHED)
+Model Bundles are immutable, meaning they cannot be edited once created.
+However, it is possible to clone an existing model bundle with a new app_config
+using
+clone_model_bundle_with_changes_v2.
To list all the model bundles you own, use
+list_model_bundles_v2.
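A minimal sketch of cloning a bundle with a different app_config and then listing bundles. The bundle id shown is illustrative, and the exact fields of the responses may differ.

```python
import os
from launch import LaunchClient

client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))

# Clone an existing bundle, overriding only its app_config.
cloned = client.clone_model_bundle_with_changes_v2(
    original_model_bundle_id="bun_1234567890",  # illustrative id
    new_app_config={"mode": "batched"},
)

# List all model bundles owned by the caller.
bundles = client.list_model_bundles_v2()
print(cloned, bundles)
```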
Model Endpoints are deployments of models that can receive requests and return predictions containing the results of the model's inference. Each model endpoint is associated with a model bundle, which contains the model's code. An endpoint specifies deployment parameters, such as the minimum and maximum number of workers, as well as the requested resources for each worker, such as the number of CPUs, amount of memory, GPU count, and type of GPU.

Endpoints can be asynchronous, synchronous, or streaming. Asynchronous endpoints return a future immediately after receiving a request, and the future can be used to retrieve the prediction once it is ready. Synchronous endpoints return the prediction directly after receiving a request. Streaming endpoints are variants of synchronous endpoints that return a stream of SSEs instead of a single HTTP response.
+Info
+Here are some tips for how to choose between SyncEndpoint, StreamingEndpoint, AsyncEndpoint, and BatchJob for deploying your ModelBundle:
+A SyncEndpoint is good if:
+A StreamingEndpoint is good if:
+An AsyncEndpoint is good if:
+A BatchJob is good if:
Async model endpoints are the most cost-efficient way to perform inference on tasks that are less latency-sensitive.
+import os
+from launch import LaunchClient
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+endpoint = client.create_model_endpoint(
+ endpoint_name="demo-endpoint-async",
+ model_bundle="test-bundle",
+ cpus=1,
+ min_workers=0,
+ endpoint_type="async",
+ update_if_exists=True,
+ labels={
+ "team": "MY_TEAM",
+ "product": "MY_PRODUCT",
+ },
+)
Sync model endpoints are useful for latency-sensitive tasks, such as real-time inference. Sync endpoints are more expensive than async endpoints.
+Note
+Sync model endpoints require at least 1 min_worker.
import os
+from launch import LaunchClient
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+endpoint = client.create_model_endpoint(
+ endpoint_name="demo-endpoint-sync",
+ model_bundle="test-bundle",
+ cpus=1,
+ min_workers=1,
+ endpoint_type="sync",
+ update_if_exists=True,
+ labels={
+ "team": "MY_TEAM",
+ "product": "MY_PRODUCT",
+ },
+)
Streaming model endpoints are variants of sync model endpoints that are useful for tasks with strict requirements on perceived latency. Streaming endpoints are more expensive than async endpoints.
+Note
+Streaming model endpoints require at least 1 min_worker.
import os
+from launch import LaunchClient
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+endpoint = client.create_model_endpoint(
+ endpoint_name="demo-endpoint-streaming",
+ model_bundle="test-streaming-bundle",
+ cpus=1,
+ min_workers=1,
+ per_worker=1,
+ endpoint_type="streaming",
+ update_if_exists=True,
+ labels={
+ "team": "MY_TEAM",
+ "product": "MY_PRODUCT",
+ },
+)
+Model endpoints can be listed, updated, and deleted using the Launch API.
+import os
+from launch import LaunchClient
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+endpoints = client.list_model_endpoints()
+import os
+from launch import LaunchClient
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+client.edit_model_endpoint(
+ model_endpoint="demo-endpoint-sync",
+ max_workers=2,
+)
+import time
+import os
+from launch import LaunchClient
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+endpoint = client.create_model_endpoint(
+ endpoint_name="demo-endpoint-tmp",
+ model_bundle="test-bundle",
+ cpus=1,
+ min_workers=0,
+ endpoint_type="async",
+ update_if_exists=True,
+ labels={
+ "team": "MY_TEAM",
+ "product": "MY_PRODUCT",
+ },
+)
+time.sleep(15) # Wait for Launch to build the endpoint
+client.delete_model_endpoint(model_endpoint_name="demo-endpoint-tmp")
+Creating deployments on Launch generally involves three steps:
+Create and upload a ModelBundle. Pass your trained model
+ as well as pre-/post-processing code to the Scale Launch Python client, and
+ we’ll create a model bundle based on the code and store it in our Bundle Store.
Create a ModelEndpoint. Pass a ModelBundle as well as
+ infrastructure settings such as the desired number of GPUs to our client.
+ This provisions resources on Scale’s cluster dedicated to your ModelEndpoint.
Make requests to the ModelEndpoint. You can make requests through the Python client, or make HTTP requests directly to Scale.
+Warning
This feature is currently in beta, and the API is likely to change. Please contact us if you are interested in using this feature.

If you need more customization than what cloudpickle or zip artifacts can offer, or if you already have a pre-built docker image, then you can create a Model Bundle with that docker image. You will need to modify your image to run a web server that exposes HTTP port 5005.
+In our example below, we assume that you have some existing Python function my_inference_fn that can be imported.
+If you need to invoke some other binary (e.g. a custom C++ binary), then you can shell out to the OS to call that binary;
+subsequent versions of this document will have native examples for non-Python binaries.
For the web server, we recommend FastAPI for its speed and ergonomics. Any web server will work, although our examples use FastAPI.
+You can add fastapi and uvicorn to the requirements.txt file that gets installed as part of your Dockerfile. Alternatively,
+you can add pip install fastapi uvicorn to the Dockerfile directly.
Inside your project workspace, create a server.py file with these contents:
# test='skip'
+from fastapi import FastAPI
+
+from pydantic import BaseModel
+
+app = FastAPI()
+
+class MyRequestSchema(BaseModel):
+ url: str
+
+
+class MyResponseSchema(BaseModel):
+ response: str
+
+def my_inference_fn(req: MyRequestSchema) -> MyResponseSchema:
+ # This is an example inference function - you can instead import a function from your own codebase,
+ # or shell out to the OS, etc.
+ resp = req.url + "_hello"
+ return MyResponseSchema(response=resp)
+
+@app.post("/predict")
+async def predict(request: MyRequestSchema) -> MyResponseSchema:
+ response = my_inference_fn(request)
+ return response
+
+@app.get("/readyz")
+def readyz():
+ return "ok"
Build your updated Docker image and push it to a location that is accessible by Scale. For instance, if you are using AWS ECR, please make sure that the necessary cross-account permissions allow Scale to pull your docker image.
+Now you can upload your docker image as a Model Bundle, and then create a Model Endpoint referencing that Model Bundle. Note that path.to.your.server.file:app in the command section below should be relative to the WORKDIR of your docker image.
# test='skip'
+import os
+
+from launch import LaunchClient
+
+from server import MyRequestSchema, MyResponseSchema # Defined as part of your server.py
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+
+model_bundle_name = "my_bundle_name"
+
+client.create_model_bundle_from_runnable_image_v2(
+ model_bundle_name=model_bundle_name,
+ request_schema=MyRequestSchema,
+ response_schema=MyResponseSchema,
+ repository="$YOUR_ECR_REPO",
+ tag="$YOUR_IMAGE_TAG",
+ command=[
+ "dumb-init",
+ "--",
+ "uvicorn",
+ "path.to.your.server.file:app",
+ "--port",
+ "5005",
+ "--host",
+ "::",
+ ],
+ predict_route="/predict",
+ healthcheck_route="/readyz",
+ readiness_initial_delay_seconds=120,
+ env={},
+)
+
+client.create_model_endpoint(
+ endpoint_name=f"endpoint-{model_bundle_name}",
+ model_bundle=model_bundle_name,
+ endpoint_type="async",
+ min_workers=0,
+ max_workers=1,
+ per_worker=1,
+ memory="30Gi",
+ storage="40Gi",
+ cpus=4, # This must be at least 2 because forwarding services consume 1 cpu.
+ gpus=1,
+ gpu_type="nvidia-ampere-a10",
+ update_if_exists=True,
+)
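Once the endpoint is ready, you can send requests to it like any other async endpoint. This is a minimal sketch that reuses `client`, `model_bundle_name`, and the schemas imported from server.py in the previous snippet; the url value is a placeholder.

```python
from launch import EndpointRequest

endpoint = client.get_model_endpoint(f"endpoint-{model_bundle_name}")
future = endpoint.predict(
    request=EndpointRequest(
        args=MyRequestSchema(url="https://example.com/image.png").dict(),
        return_pickled=False,
    )
)
raw_response = future.get()

# The result is serialized json, so the pydantic schema can parse it directly.
print(MyResponseSchema.parse_raw(raw_response.result))
```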
Simple, scalable, and high-performance ML service deployment in Python.
+import os
+import time
+from launch import LaunchClient
+from launch import EndpointRequest
+from pydantic import BaseModel
+from rich import print
+
+
+class MyRequestSchema(BaseModel):
+ x: int
+ y: str
+
+class MyResponseSchema(BaseModel):
+ __root__: int
+
+
+def my_load_predict_fn(model):
+ def returns_model_of_x_plus_len_of_y(x: int, y: str) -> int:
+ """MyRequestSchema -> MyResponseSchema"""
+ assert isinstance(x, int) and isinstance(y, str)
+ return model(x) + len(y)
+
+ return returns_model_of_x_plus_len_of_y
+
+
+def my_load_model_fn():
+ def my_model(x):
+ return x * 2
+
+ return my_model
+
+BUNDLE_PARAMS = {
+ "model_bundle_name": "test-bundle",
+ "load_predict_fn": my_load_predict_fn,
+ "load_model_fn": my_load_model_fn,
+ "request_schema": MyRequestSchema,
+ "response_schema": MyResponseSchema,
+ "requirements": ["pytest==7.2.1", "numpy"], # list your requirements here
+ "pytorch_image_tag": "1.7.1-cuda11.0-cudnn8-runtime",
+}
+
+ENDPOINT_PARAMS = {
+ "endpoint_name": "demo-endpoint",
+ "model_bundle": "test-bundle",
+ "cpus": 1,
+ "min_workers": 0,
+ "endpoint_type": "async",
+ "update_if_exists": True,
+ "labels": {
+ "team": "MY_TEAM",
+ "product": "launch",
+ }
+}
+
+def predict_on_endpoint(request: MyRequestSchema) -> MyResponseSchema:
+ # Wait for the endpoint to be ready first before submitting a task
+ endpoint = client.get_model_endpoint(endpoint_name="demo-endpoint")
+ while endpoint.status() != "READY":
+ time.sleep(10)
+
+ endpoint_request = EndpointRequest(args=request.dict(), return_pickled=False)
+
+ future = endpoint.predict(request=endpoint_request)
+ raw_response = future.get()
+
+ response = MyResponseSchema.parse_raw(raw_response.result)
+ return response
+
+
+client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))
+
+client.create_model_bundle_from_callable_v2(**BUNDLE_PARAMS)
+endpoint = client.create_model_endpoint(**ENDPOINT_PARAMS)
+
+request = MyRequestSchema(x=5, y="hello")
+response = predict_on_endpoint(request)
+print(response)
+"""
+MyResponseSchema(__root__=10)
+"""
+What's going on here:
1. First, we use pydantic to define our request and response schemas, MyRequestSchema and MyResponseSchema. These schemas are used to generate the API documentation for our models.
2. We then define the model and the load_predict_fn, which tells Launch how to load our model and how to make predictions with it. In this case, we're just returning a function that adds the length of the string y to model(x), where model doubles the integer x.
3. Next, we create a model bundle by passing in the load_predict_fn, the request_schema, and the response_schema. We also specify the env_params, which tell Launch environment settings like the base image to use. In this case, we're using a PyTorch image.
4. We then create a model endpoint that uses the model_bundle we created above, and we specify the endpoint_type, which tells Launch whether to use a synchronous or asynchronous endpoint. In this case, we're using an asynchronous endpoint, which means that we can make predictions and return immediately with a future object. We can then use the future object to get the prediction result later.
5. Finally, we call predict_on_endpoint with a MyRequestSchema object. This function first waits for the endpoint to be ready, then it submits a prediction request to the endpoint. It then waits for the prediction result and returns it.

Notice that we specified min_workers=0, meaning that the endpoint will scale down to 0 workers when it's not being used.
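If you later want the endpoint to keep a worker warm, you can raise min_workers with edit_model_endpoint. This is a sketch only; the document shows edit_model_endpoint being used with max_workers, and passing min_workers here is an assumption, so check the client reference for the exact parameters supported.

```python
import os
from launch import LaunchClient

client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))

# Keep at least one worker running so requests don't wait for scale-up from zero.
client.edit_model_endpoint(
    model_endpoint="demo-endpoint",
    min_workers=1,  # assumed parameter; max_workers is the documented example
)
```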
To use Scale Launch, first install it using pip:

pip install -U scale-launch
Launch comes with a CLI for listing bundles / endpoints, editing endpoints, and sending tasks to endpoints.
The CLI can be used as scale-launch ....
Run scale-launch --help for more options.
This is the command line interface (CLI) package for Scale Launch.

Usage: scale-launch [OPTIONS] COMMAND [ARGS]...

Options:
  --help  Show this message and exit.

Commands:
  batch-jobs  Batch Jobs is a wrapper around batch jobs in Scale Launch
  bundles     Bundles is a wrapper around model bundles in Scale Launch
  config      Config is a wrapper around getting and setting your API key and other configuration options
  endpoints   Endpoints is a wrapper around model endpoints in Scale Launch
  tasks       Tasks is a wrapper around sending requests to endpoints

Launch Client

LaunchClient(api_key: str, endpoint: Optional[str] = None, self_hosted: bool = False, use_path_with_custom_endpoint: bool = False)

Scale Launch Python Client.
Initializes a Scale Launch Client.
Parameters:
Name Type Description Defaultapi_key str Your Scale API key
requiredendpoint Optional[str] The Scale Launch Endpoint (this should not need to be changed)
None self_hosted bool True iff you are connecting to a self-hosted Scale Launch
False use_path_with_custom_endpoint bool True iff you are not using the default Scale Launch endpoint but your endpoint has path routing (to SCALE_LAUNCH_VX_PATH) set up
False"},{"location":"api/client/#launch.client.LaunchClient.batch_async_request","title":"batch_async_request","text":"batch_async_request(*, model_bundle: Union[ModelBundle, str], urls: Optional[List[str]] = None, inputs: Optional[List[Dict[str, Any]]] = None, batch_url_file_location: Optional[str] = None, serialization_format: str = 'JSON', labels: Optional[Dict[str, str]] = None, cpus: Optional[int] = None, memory: Optional[str] = None, gpus: Optional[int] = None, gpu_type: Optional[str] = None, storage: Optional[str] = None, max_workers: Optional[int] = None, per_worker: Optional[int] = None, timeout_seconds: Optional[float] = None) -> Dict[str, Any]\n Sends a batch inference request using a given bundle. Returns a key that can be used to retrieve the results of inference at a later time.
Must have exactly one of urls or inputs passed in.
Parameters:
Name Type Description Defaultmodel_bundle Union[ModelBundle, str] The bundle or the name of a the bundle to use for inference.
requiredurls Optional[List[str]] A list of urls, each pointing to a file containing model input. Must be accessible by Scale Launch, hence urls need to either be public or signedURLs.
None inputs Optional[List[Dict[str, Any]]] A list of model inputs, if exists, we will upload the inputs and pass it in to Launch.
None batch_url_file_location Optional[str] In self-hosted mode, the input to the batch job will be uploaded to this location if provided. Otherwise, one will be determined from bundle_location_fn()
None serialization_format str Serialization format of output, either 'PICKLE' or 'JSON'. 'pickle' corresponds to pickling results + returning
'JSON' labels Optional[Dict[str, str]] An optional dictionary of key/value pairs to associate with this endpoint.
None cpus Optional[int] Number of cpus each worker should get, e.g. 1, 2, etc. This must be greater than or equal to 1.
None memory Optional[str] Amount of memory each worker should get, e.g. \"4Gi\", \"512Mi\", etc. This must be a positive amount of memory.
None storage Optional[str] Amount of local ephemeral storage each worker should get, e.g. \"4Gi\", \"512Mi\", etc. This must be a positive amount of storage.
None gpus Optional[int] Number of gpus each worker should get, e.g. 0, 1, etc.
None max_workers Optional[int] The maximum number of workers. Must be greater than or equal to 0, and as well as greater than or equal to min_workers.
None per_worker Optional[int] The maximum number of concurrent requests that an individual worker can service. Launch automatically scales the number of workers for the endpoint so that each worker is processing per_worker requests:
per_worker, then the number of workers will be reduced.per_worker, then the number of workers will be increased to meet the elevated traffic.None gpu_type Optional[str] If specifying a non-zero number of gpus, this controls the type of gpu requested. Here are the supported values:
nvidia-tesla-t4nvidia-ampere-a10nvidia-hopper-h100nvidia-hopper-h100-1g20gnvidia-hopper-h100-3g40gNone timeout_seconds Optional[float] The maximum amount of time (in seconds) that the batch job can take. If not specified, the server defaults to 12 hours. This includes the time required to build the endpoint and the total time required for all the individual tasks.
None Returns:
Type DescriptionDict[str, Any] A dictionary that contains job_id as a key, and the ID as the value.
cancel_fine_tune(fine_tune_id: str) -> CancelFineTuneResponse\n Cancel a fine-tune
Parameters:
Name Type Description Defaultfine_tune_id str ID of the fine-tune
requiredReturns:
Name Type DescriptionCancelFineTuneResponse CancelFineTuneResponse whether the cancellation was successful
"},{"location":"api/client/#launch.client.LaunchClient.clone_model_bundle_with_changes","title":"clone_model_bundle_with_changes","text":"clone_model_bundle_with_changes(model_bundle: Union[ModelBundle, str], app_config: Optional[Dict] = None) -> ModelBundle\n Warning This method is deprecated. Use clone_model_bundle_with_changes_v2 instead.
Parameters:
Name Type Description Defaultmodel_bundle Union[ModelBundle, str] The existing bundle or its ID.
requiredapp_config Optional[Dict] The new bundle's app config, if not passed in, the new bundle's app_config will be set to None
None Returns:
Type DescriptionModelBundle A ModelBundle object
clone_model_bundle_with_changes_v2(original_model_bundle_id: str, new_app_config: Optional[Dict[str, Any]] = None) -> CreateModelBundleV2Response\n Clone a model bundle with an optional new app_config.
Parameters:
Name Type Description Defaultoriginal_model_bundle_id str The ID of the model bundle you want to clone.
requirednew_app_config Optional[Dict[str, Any]] A dictionary of new app config values to use for the cloned model.
None Returns:
Type DescriptionCreateModelBundleV2Response An object containing the following keys:
model_bundle_id: The ID of the cloned model bundle.completions_stream(endpoint_name: str, prompt: str, max_new_tokens: int, temperature: float, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, timeout: float = DEFAULT_LLM_COMPLETIONS_TIMEOUT) -> Iterable[CompletionStreamV1Response]\n Run prompt completion on an LLM endpoint in streaming fashion. Will fail if endpoint does not support streaming.
Parameters:
Name Type Description Defaultendpoint_name str The name of the LLM endpoint to make the request to
requiredprompt str The prompt to send to the endpoint
requiredmax_new_tokens int The maximum number of tokens to generate for each prompt
requiredtemperature float The temperature to use for sampling
requiredstop_sequences Optional[List[str]] List of sequences to stop the completion at
None return_token_log_probs Optional[bool] Whether to return the log probabilities of the tokens
False Returns:
Type DescriptionIterable[CompletionStreamV1Response] Iterable responses for prompt completion
"},{"location":"api/client/#launch.client.LaunchClient.completions_sync","title":"completions_sync","text":"completions_sync(endpoint_name: str, prompt: str, max_new_tokens: int, temperature: float, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, timeout: float = DEFAULT_LLM_COMPLETIONS_TIMEOUT) -> CompletionSyncV1Response\n Run prompt completion on a sync LLM endpoint. Will fail if the endpoint is not sync.
Parameters:
Name Type Description Defaultendpoint_name str The name of the LLM endpoint to make the request to
requiredprompt str The completion prompt to send to the endpoint
requiredmax_new_tokens int The maximum number of tokens to generate for each prompt
requiredtemperature float The temperature to use for sampling
requiredstop_sequences Optional[List[str]] List of sequences to stop the completion at
None return_token_log_probs Optional[bool] Whether to return the log probabilities of the tokens
False Returns:
Type DescriptionCompletionSyncV1Response Response for prompt completion
"},{"location":"api/client/#launch.client.LaunchClient.create_docker_image_batch_job","title":"create_docker_image_batch_job","text":"create_docker_image_batch_job(*, labels: Dict[str, str], docker_image_batch_job_bundle: Optional[Union[str, DockerImageBatchJobBundleResponse]] = None, docker_image_batch_job_bundle_name: Optional[str] = None, job_config: Optional[Dict[str, Any]] = None, cpus: Optional[int] = None, memory: Optional[str] = None, gpus: Optional[int] = None, gpu_type: Optional[str] = None, storage: Optional[str] = None)\n For self hosted mode only. Parameters: docker_image_batch_job_bundle: Specifies the docker image bundle to use for the batch job. Either the string id of a docker image bundle, or a DockerImageBatchJobBundleResponse object. Only one of docker_image_batch_job_bundle and docker_image_batch_job_bundle_name can be specified. docker_image_batch_job_bundle_name: The name of a batch job bundle. If specified, Launch will use the most recent bundle with that name owned by the current user. Only one of docker_image_batch_job_bundle and docker_image_batch_job_bundle_name can be specified. labels: Kubernetes labels that are present on the batch job. job_config: A JSON-serializable python object that will get passed to the batch job, specifically as the contents of a file mounted at mount_location inside the bundle. You can call python's json.load() on the file to retrieve the contents. cpus: Optional override for the number of cpus to give to your job. Either the default must be specified in the bundle, or this must be specified. memory: Optional override for the amount of memory to give to your job. Either the default must be specified in the bundle, or this must be specified. gpus: Optional number of gpus to give to the bundle. If not specified in the bundle or here, will be interpreted as 0 gpus. gpu_type: Optional type of gpu. If the final number of gpus is positive, must be specified either in the bundle or here. storage: Optional reserved amount of disk to give to your batch job. If not specified, your job may be evicted if it is using too much disk.
create_docker_image_batch_job_bundle(*, name: str, image_repository: str, image_tag: str, command: List[str], env: Optional[Dict[str, str]] = None, mount_location: Optional[str] = None, cpus: Optional[int] = None, memory: Optional[str] = None, gpus: Optional[int] = None, gpu_type: Optional[str] = None, storage: Optional[str] = None) -> CreateDockerImageBatchJobBundleResponse\n For self hosted mode only.
Creates a Docker Image Batch Job Bundle.
Parameters:
Name Type Description Defaultname str A user-defined name for the bundle. Does not need to be unique.
requiredimage_repository str The (short) repository of your image. For example, if your image is located at 123456789012.dkr.ecr.us-west-2.amazonaws.com/repo:tag, and your version of Launch is configured to look at 123456789012.dkr.ecr.us-west-2.amazonaws.com for Docker Images, you would pass the value repo for the image_repository parameter.
image_tag str The tag of your image inside of the repo. In the example above, you would pass the value tag for the image_tag parameter.
command List[str] The command to run inside the docker image.
requiredenv Optional[Dict[str, str]] A dictionary of environment variables to inject into your docker image.
None mount_location Optional[str] A location in the filesystem where you would like a json-formatted file, controllable on runtime, to be mounted. This allows behavior to be specified on runtime. (Specifically, the contents of this file can be read via json.load() inside of the user-defined code.)
None cpus Optional[int] Optional default value for the number of cpus to give the job.
None memory Optional[str] Optional default value for the amount of memory to give the job.
None gpus Optional[int] Optional default value for the number of gpus to give the job.
None gpu_type Optional[str] Optional default value for the type of gpu to give the job.
None storage Optional[str] Optional default value for the amount of disk to give the job.
None"},{"location":"api/client/#launch.client.LaunchClient.create_fine_tune","title":"create_fine_tune","text":"create_fine_tune(model: str, training_file: str, validation_file: Optional[str] = None, fine_tuning_method: Optional[str] = None, hyperparameters: Optional[Dict[str, str]] = None, wandb_config: Optional[Dict[str, Any]] = None, suffix: str = None) -> CreateFineTuneResponse\n Create a fine-tune
Parameters:
Name Type Description Defaultmodel str Identifier of base model to train from.
requiredtraining_file str Path to file of training dataset. Dataset must be a csv with columns 'prompt' and 'response'.
requiredvalidation_file Optional[str] Path to file of validation dataset. Has the same format as training_file. If not provided, we will generate a split from the training dataset.
None fine_tuning_method Optional[str] Fine-tuning method. Currently unused, but when different techniques are implemented we will expose this field.
None hyperparameters Optional[Dict[str, str]] Hyperparameters to pass in to training job.
None wandb_config Optional[Dict[str, Any]] Configuration for Weights and Biases. To enable set hyperparameters[\"report_to\"] to wandb. api_key must be provided which is the API key.
None suffix str Optional user-provided identifier suffix for the fine-tuned model.
None Returns:
Name Type DescriptionCreateFineTuneResponse CreateFineTuneResponse ID of the created fine-tune
"},{"location":"api/client/#launch.client.LaunchClient.create_llm_model_endpoint","title":"create_llm_model_endpoint","text":"create_llm_model_endpoint(endpoint_name: str, model_name: str, inference_framework_image_tag: str, source: LLMSource = LLMSource.HUGGING_FACE, inference_framework: LLMInferenceFramework = LLMInferenceFramework.DEEPSPEED, num_shards: int = 4, quantize: Optional[Quantization] = None, checkpoint_path: Optional[str] = None, cpus: int = 32, memory: str = '192Gi', storage: Optional[str] = None, gpus: int = 4, min_workers: int = 0, max_workers: int = 1, per_worker: int = 10, gpu_type: Optional[str] = 'nvidia-ampere-a10', endpoint_type: str = 'sync', high_priority: Optional[bool] = False, post_inference_hooks: Optional[List[PostInferenceHooks]] = None, default_callback_url: Optional[str] = None, default_callback_auth_kind: Optional[Literal['basic', 'mtls']] = None, default_callback_auth_username: Optional[str] = None, default_callback_auth_password: Optional[str] = None, default_callback_auth_cert: Optional[str] = None, default_callback_auth_key: Optional[str] = None, public_inference: Optional[bool] = None, update_if_exists: bool = False, labels: Optional[Dict[str, str]] = None)\n Creates and registers a model endpoint in Scale Launch. The returned object is an instance of type Endpoint, which is a base class of either SyncEndpoint or AsyncEndpoint. This is the object to which you sent inference requests.
Parameters:
Name Type Description Defaultendpoint_name str The name of the model endpoint you want to create. The name must be unique across all endpoints that you own.
requiredmodel_name str name for the LLM. List can be found at (TODO: add list of supported models)
requiredinference_framework_image_tag str image tag for the inference framework. (TODO: use latest image tag when unspecified)
requiredsource LLMSource source of the LLM. Currently only HuggingFace is supported.
HUGGING_FACE inference_framework LLMInferenceFramework inference framework for the LLM. Currently only DeepSpeed is supported.
DEEPSPEED num_shards int number of shards for the LLM. When bigger than 1, LLM will be sharded to multiple GPUs. Number of GPUs must be larger than num_shards.
4 quantize Optional[Quantization] Quantization method for the LLM. Only affects behavior for text-generation-inference models.
None checkpoint_path Optional[str] Path to the checkpoint to load the model from. Only affects behavior for text-generation-inference models.
None cpus int Number of cpus each worker should get, e.g. 1, 2, etc. This must be greater than or equal to 1.
32 memory str Amount of memory each worker should get, e.g. \"4Gi\", \"512Mi\", etc. This must be a positive amount of memory.
'192Gi' storage Optional[str] Amount of local ephemeral storage each worker should get, e.g. \"4Gi\", \"512Mi\", etc. This must be a positive amount of storage.
None gpus int Number of gpus each worker should get, e.g. 0, 1, etc.
4 min_workers int The minimum number of workers. Must be greater than or equal to 0. This should be determined by computing the minimum throughput of your workload and dividing it by the throughput of a single worker. This field must be at least 1 for synchronous endpoints.
0 max_workers int The maximum number of workers. Must be greater than or equal to 0, and as well as greater than or equal to min_workers. This should be determined by computing the maximum throughput of your workload and dividing it by the throughput of a single worker.
1 per_worker int The maximum number of concurrent requests that an individual worker can service. Launch automatically scales the number of workers for the endpoint so that each worker is processing per_worker requests, subject to the limits defined by min_workers and max_workers.
per_worker, then the number of workers will be reduced. - Otherwise, if the average number of concurrent requests per worker is higher than per_worker, then the number of workers will be increased to meet the elevated traffic.Here is our recommendation for computing per_worker:
min_workers and max_workers per your minimum and maximum throughput requirements. 2. Determine a value for the maximum number of concurrent requests in the workload. Divide this number by max_workers. Doing this ensures that the number of workers will \"climb\" to max_workers.10 gpu_type Optional[str] If specifying a non-zero number of gpus, this controls the type of gpu requested. Here are the supported values:
nvidia-tesla-t4nvidia-ampere-a10nvidia-hopper-h100nvidia-hopper-h100-1g20gnvidia-hopper-h100-3g40g'nvidia-ampere-a10' endpoint_type str Either \"sync\" or \"async\".
'sync' high_priority Optional[bool] Either True or False. Enabling this will allow the created endpoint to leverage the shared pool of prewarmed nodes for faster spinup time.
False post_inference_hooks Optional[List[PostInferenceHooks]] List of hooks to trigger after inference tasks are served.
None default_callback_url Optional[str] The default callback url to use for async endpoints. This can be overridden in the task parameters for each individual task. post_inference_hooks must contain \"callback\" for the callback to be triggered.
None default_callback_auth_kind Optional[Literal['basic', 'mtls']] The default callback auth kind to use for async endpoints. Either \"basic\" or \"mtls\". This can be overridden in the task parameters for each individual task.
None default_callback_auth_username Optional[str] The default callback auth username to use. This only applies if default_callback_auth_kind is \"basic\". This can be overridden in the task parameters for each individual task.
None default_callback_auth_password Optional[str] The default callback auth password to use. This only applies if default_callback_auth_kind is \"basic\". This can be overridden in the task parameters for each individual task.
None default_callback_auth_cert Optional[str] The default callback auth cert to use. This only applies if default_callback_auth_kind is \"mtls\". This can be overridden in the task parameters for each individual task.
None default_callback_auth_key Optional[str] The default callback auth key to use. This only applies if default_callback_auth_kind is \"mtls\". This can be overridden in the task parameters for each individual task.
None public_inference Optional[bool] If True, this endpoint will be available to all user IDs for inference.
None update_if_exists bool If True, will attempt to update the endpoint if it exists. Otherwise, will unconditionally try to create a new endpoint. Note that endpoint names for a given user must be unique, so attempting to call this function with update_if_exists=False for an existing endpoint will raise an error.
False labels Optional[Dict[str, str]] An optional dictionary of key/value pairs to associate with this endpoint.
None Returns:
Type DescriptionA Endpoint object that can be used to make requests to the endpoint.
"},{"location":"api/client/#launch.client.LaunchClient.create_model_bundle","title":"create_model_bundle","text":"create_model_bundle(model_bundle_name: str, env_params: Dict[str, str], *, load_predict_fn: Optional[Callable[[LaunchModel_T], Callable[[Any], Any]]] = None, predict_fn_or_cls: Optional[Callable[[Any], Any]] = None, requirements: Optional[List[str]] = None, model: Optional[LaunchModel_T] = None, load_model_fn: Optional[Callable[[], LaunchModel_T]] = None, app_config: Optional[Union[Dict[str, Any], str]] = None, globals_copy: Optional[Dict[str, Any]] = None, request_schema: Optional[Type[BaseModel]] = None, response_schema: Optional[Type[BaseModel]] = None) -> ModelBundle\n Warning This method is deprecated. Use create_model_bundle_from_callable_v2 instead.
Parameters:
Name Type Description Defaultmodel_bundle_name str The name of the model bundle you want to create. The name must be unique across all bundles that you own.
requiredpredict_fn_or_cls Optional[Callable[[Any], Any]] Function or a Callable class that runs end-to-end (pre/post processing and model inference) on the call. i.e. predict_fn_or_cls(REQUEST) -> RESPONSE.
None model Optional[LaunchModel_T] Typically a trained Neural Network, e.g. a Pytorch module.
Exactly one of model and load_model_fn must be provided.
None load_model_fn Optional[Callable[[], LaunchModel_T]] A function that, when run, loads a model. This function is essentially a deferred wrapper around the model argument.
Exactly one of model and load_model_fn must be provided.
None load_predict_fn Optional[Callable[[LaunchModel_T], Callable[[Any], Any]]] Function that, when called with a model, returns a function that carries out inference.
If model is specified, then this is equivalent to: load_predict_fn(model, app_config=optional_app_config]) -> predict_fn
Otherwise, if load_model_fn is specified, then this is equivalent to: load_predict_fn(load_model_fn(), app_config=optional_app_config]) -> predict_fn
In both cases, predict_fn is then the inference function, i.e.: predict_fn(REQUEST) -> RESPONSE
None requirements Optional[List[str]] A list of python package requirements, where each list element is of the form <package_name>==<package_version>, e.g.
[\"tensorflow==2.3.0\", \"tensorflow-hub==0.11.0\"]
If you do not pass in a value for requirements, then you must pass in globals() for the globals_copy argument.
None app_config Optional[Union[Dict[str, Any], str]] Either a Dictionary that represents a YAML file contents or a local path to a YAML file.
None env_params Dict[str, str] A dictionary that dictates environment information e.g. the use of pytorch or tensorflow, which base image tag to use, etc. Specifically, the dictionary should contain the following keys:
framework_type: either tensorflow or pytorch. - PyTorch fields: - pytorch_image_tag: An image tag for the pytorch docker base image. The list of tags can be found from https://hub.docker.com/r/pytorch/pytorch/tags. - Example:
.. code-block:: python
{ \"framework_type\": \"pytorch\", \"pytorch_image_tag\": \"1.10.0-cuda11.3-cudnn8-runtime\" }
Tensorflow fields:
tensorflow_version: Version of tensorflow, e.g. \"2.3.0\".globals_copy Optional[Dict[str, Any]] Dictionary of the global symbol table. Normally provided by globals() built-in function.
None request_schema Optional[Type[BaseModel]] A pydantic model that represents the request schema for the model bundle. This is used to validate the request body for the model bundle's endpoint.
None response_schema Optional[Type[BaseModel]] A pydantic model that represents the request schema for the model bundle. This is used to validate the response for the model bundle's endpoint. Note: If request_schema is specified, then response_schema must also be specified.
None"},{"location":"api/client/#launch.client.LaunchClient.create_model_bundle_from_callable_v2","title":"create_model_bundle_from_callable_v2","text":"create_model_bundle_from_callable_v2(*, model_bundle_name: str, load_predict_fn: Callable[[LaunchModel_T], Callable[[Any], Any]], load_model_fn: Callable[[], LaunchModel_T], request_schema: Type[BaseModel], response_schema: Type[BaseModel], requirements: Optional[List[str]] = None, pytorch_image_tag: Optional[str] = None, tensorflow_version: Optional[str] = None, custom_base_image_repository: Optional[str] = None, custom_base_image_tag: Optional[str] = None, app_config: Optional[Union[Dict[str, Any], str]] = None, metadata: Optional[Dict[str, Any]] = None) -> CreateModelBundleV2Response\n Uploads and registers a model bundle to Scale Launch.
Parameters:
Name Type Description Defaultmodel_bundle_name str Name of the model bundle.
requiredload_predict_fn Callable[[LaunchModel_T], Callable[[Any], Any]] Function that takes in a model and returns a predict function. When your model bundle is deployed, this predict function will be called as follows:
input = {\"input\": \"some input\"} # or whatever your request schema is.\n\ndef load_model_fn():\n # load model\n return model\n\ndef load_predict_fn(model, app_config=None):\n def predict_fn(input):\n # do pre-processing\n output = model(input)\n # do post-processing\n return output\n return predict_fn\n\npredict_fn = load_predict_fn(load_model_fn(), app_config=optional_app_config)\nresponse = predict_fn(input)\n required load_model_fn Callable[[], LaunchModel_T] A function that, when run, loads a model.
requiredrequest_schema Type[BaseModel] A pydantic model that represents the request schema for the model bundle. This is used to validate the request body for the model bundle's endpoint.
requiredresponse_schema Type[BaseModel] A pydantic model that represents the request schema for the model bundle. This is used to validate the response for the model bundle's endpoint.
requiredrequirements Optional[List[str]] List of pip requirements.
None pytorch_image_tag Optional[str] The image tag for the PyTorch image that will be used to run the bundle. Exactly one of pytorch_image_tag, tensorflow_version, or custom_base_image_repository must be specified.
None tensorflow_version Optional[str] The version of TensorFlow that will be used to run the bundle. If not specified, the default version will be used. Exactly one of pytorch_image_tag, tensorflow_version, or custom_base_image_repository must be specified.
None custom_base_image_repository Optional[str] The repository for a custom base image that will be used to run the bundle. If not specified, the default base image will be used. Exactly one of pytorch_image_tag, tensorflow_version, or custom_base_image_repository must be specified.
None custom_base_image_tag Optional[str] The tag for a custom base image that will be used to run the bundle. Must be specified if custom_base_image_repository is specified.
None app_config Optional[Union[Dict[str, Any], str]] An optional dictionary of configuration values that will be passed to the bundle when it is run. These values can be accessed by the bundle via the app_config global variable.
None metadata Optional[Dict[str, Any]] Metadata to record with the bundle.
None Returns:
Type DescriptionCreateModelBundleV2Response An object containing the following keys:
model_bundle_id: The ID of the created model bundle.create_model_bundle_from_dirs(*, model_bundle_name: str, base_paths: List[str], requirements_path: str, env_params: Dict[str, str], load_predict_fn_module_path: str, load_model_fn_module_path: str, app_config: Optional[Union[Dict[str, Any], str]] = None, request_schema: Optional[Type[BaseModel]] = None, response_schema: Optional[Type[BaseModel]] = None) -> ModelBundle\n Warning This method is deprecated. Use create_model_bundle_from_dirs_v2 instead.
Parameters:
Name Type Description Defaultmodel_bundle_name str The name of the model bundle you want to create. The name must be unique across all bundles that you own.
requiredbase_paths List[str] The paths on the local filesystem where the bundle code lives.
requiredrequirements_path str A path on the local filesystem where a requirements.txt file lives.
env_params Dict[str, str] A dictionary that dictates environment information e.g. the use of pytorch or tensorflow, which base image tag to use, etc. Specifically, the dictionary should contain the following keys:
framework_type: either tensorflow or pytorch. pytorch_image_tag: An image tag for the pytorch docker base image. The list of tags can be found at https://hub.docker.com/r/pytorch/pytorch/tags. Example:
{\n\"framework_type\": \"pytorch\",\n\"pytorch_image_tag\": \"1.10.0-cuda11.3-cudnn8-runtime\",\n}\n required load_predict_fn_module_path str A python module path for a function that, when called with the output of load_model_fn_module_path, returns a function that carries out inference.
requiredload_model_fn_module_path str A python module path for a function that returns a model. The output feeds into the function located at load_predict_fn_module_path.
requiredapp_config Optional[Union[Dict[str, Any], str]] Either a Dictionary that represents a YAML file contents or a local path to a YAML file.
None request_schema Optional[Type[BaseModel]] A pydantic model that represents the request schema for the model bundle. This is used to validate the request body for the model bundle's endpoint.
None response_schema Optional[Type[BaseModel]] A pydantic model that represents the response schema for the model bundle. This is used to validate the response for the model bundle's endpoint. Note: If request_schema is specified, then response_schema must also be specified.
None"},{"location":"api/client/#launch.client.LaunchClient.create_model_bundle_from_dirs_v2","title":"create_model_bundle_from_dirs_v2","text":"create_model_bundle_from_dirs_v2(*, model_bundle_name: str, base_paths: List[str], load_predict_fn_module_path: str, load_model_fn_module_path: str, request_schema: Type[BaseModel], response_schema: Type[BaseModel], requirements_path: Optional[str] = None, pytorch_image_tag: Optional[str] = None, tensorflow_version: Optional[str] = None, custom_base_image_repository: Optional[str] = None, custom_base_image_tag: Optional[str] = None, app_config: Optional[Dict[str, Any]] = None, metadata: Optional[Dict[str, Any]] = None) -> CreateModelBundleV2Response\n Packages up code from one or more local filesystem folders and uploads them as a bundle to Scale Launch. In this mode, a bundle is just local code instead of a serialized object.
For example, if you have a directory structure like so, and your current working directory is my_root:
my_root/\n my_module1/\n __init__.py\n ...files and directories\n my_inference_file.py\n my_module2/\n __init__.py\n ...files and directories\n then calling create_model_bundle_from_dirs_v2 with base_paths=[\"my_module1\", \"my_module2\"] essentially creates a zip file without the root directory, e.g.:
my_module1/\n __init__.py\n ...files and directories\n my_inference_file.py\n my_module2/\n __init__.py\n ...files and directories\n and these contents will be unzipped relative to the server side application root. Bear these points in mind when referencing Python module paths for this bundle. For instance, if my_inference_file.py has def f(...) as the desired inference loading function, then the load_predict_fn_module_path argument should be my_module1.my_inference_file.f.
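Under those assumptions, a call might look like the sketch below; the schema classes, the requirements file, and the load_model loader name are hypothetical placeholders rather than part of the layout above.
import os\nfrom pydantic import BaseModel\nfrom launch import LaunchClient\n\nclass MyRequestSchema(BaseModel):\n    x: int\n    y: str\n\nclass MyResponseSchema(BaseModel):\n    result: int\n\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nresponse = client.create_model_bundle_from_dirs_v2(\n    model_bundle_name=\"my-dirs-bundle\",\n    base_paths=[\"my_module1\", \"my_module2\"],\n    load_predict_fn_module_path=\"my_module1.my_inference_file.f\",\n    load_model_fn_module_path=\"my_module1.my_inference_file.load_model\",  # hypothetical loader\n    request_schema=MyRequestSchema,\n    response_schema=MyResponseSchema,\n    requirements_path=\"requirements.txt\",  # assumed to exist in the working directory\n    pytorch_image_tag=\"1.10.0-cuda11.3-cudnn8-runtime\",\n)\nprint(response.model_bundle_id)\n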
Parameters:
Name Type Description Defaultmodel_bundle_name str The name of the model bundle you want to create.
requiredbase_paths List[str] A list of paths to directories that will be zipped up and uploaded as a bundle. Each path must be relative to the current working directory.
requiredload_predict_fn_module_path str The Python module path to a function that, when called with the output of the function at load_model_fn_module_path, returns a function that carries out inference.
requiredload_model_fn_module_path str The Python module path to a function that, when run, loads and returns a model. Its output feeds into the function located at load_predict_fn_module_path.
requiredrequest_schema Type[BaseModel] A Pydantic model that defines the request schema for the bundle.
requiredresponse_schema Type[BaseModel] A Pydantic model that defines the response schema for the bundle.
requiredrequirements_path Optional[str] Path to a requirements.txt file that will be used to install dependencies for the bundle. This file must be relative to the current working directory.
None pytorch_image_tag Optional[str] The image tag for the PyTorch image that will be used to run the bundle. Exactly one of pytorch_image_tag, tensorflow_version, or custom_base_image_repository must be specified.
None tensorflow_version Optional[str] The version of TensorFlow that will be used to run the bundle. If not specified, the default version will be used. Exactly one of pytorch_image_tag, tensorflow_version, or custom_base_image_repository must be specified.
None custom_base_image_repository Optional[str] The repository for a custom base image that will be used to run the bundle. If not specified, the default base image will be used. Exactly one of pytorch_image_tag, tensorflow_version, or custom_base_image_repository must be specified.
None custom_base_image_tag Optional[str] The tag for a custom base image that will be used to run the bundle. Must be specified if custom_base_image_repository is specified.
None app_config Optional[Dict[str, Any]] An optional dictionary of configuration values that will be passed to the bundle when it is run. These values can be accessed by the bundle via the app_config global variable.
None metadata Optional[Dict[str, Any]] Metadata to record with the bundle.
None Returns:
Type DescriptionCreateModelBundleV2Response An object containing the following keys:
model_bundle_id: The ID of the created model bundle.create_model_bundle_from_runnable_image_v2(*, model_bundle_name: str, request_schema: Type[BaseModel], response_schema: Type[BaseModel], repository: str, tag: str, command: List[str], healthcheck_route: Optional[str] = None, predict_route: Optional[str] = None, env: Dict[str, str], readiness_initial_delay_seconds: int, metadata: Optional[Dict[str, Any]] = None) -> CreateModelBundleV2Response\n Create a model bundle from a runnable image. The specified command must start a process that will listen for requests on port 5005 using HTTP.
Inference requests must be served at the POST /predict route while the GET /readyz route is a healthcheck.
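As a concrete illustration of that contract, the sketch below uses FastAPI (an arbitrary choice of HTTP framework, not something Launch requires) to expose the two routes on port 5005; the request schema and inference logic are placeholders.
# server.py - a minimal sketch of the HTTP contract described above\nfrom fastapi import FastAPI\nfrom pydantic import BaseModel\n\nclass MyRequest(BaseModel):\n    x: int\n    y: str\n\napp = FastAPI()\n\n@app.get(\"/readyz\")\ndef readyz():\n    # healthcheck route: return 200 once the model is ready to serve\n    return \"ok\"\n\n@app.post(\"/predict\")\ndef predict(request: MyRequest):\n    # replace with real inference logic\n    return {\"result\": request.x + len(request.y)}\n\n# The bundle's command would then start this server on port 5005, for example:\n# command=[\"uvicorn\", \"server:app\", \"--host\", \"0.0.0.0\", \"--port\", \"5005\"]\n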
Parameters:
Name Type Description Defaultmodel_bundle_name str The name of the model bundle you want to create.
requiredrequest_schema Type[BaseModel] A Pydantic model that defines the request schema for the bundle.
requiredresponse_schema Type[BaseModel] A Pydantic model that defines the response schema for the bundle.
requiredrepository str The name of the Docker repository for the runnable image.
requiredtag str The tag for the runnable image.
requiredcommand List[str] The command that will be used to start the process that listens for requests.
requiredpredict_route Optional[str] The endpoint route on the runnable image that will be called.
None healthcheck_route Optional[str] The healthcheck endpoint route on the runnable image.
None env Dict[str, str] A dictionary of environment variables that will be passed to the bundle when it is run.
requiredreadiness_initial_delay_seconds int The number of seconds to wait for the HTTP server to become ready and successfully respond on its healthcheck.
requiredmetadata Optional[Dict[str, Any]] Metadata to record with the bundle.
None Returns:
Type DescriptionCreateModelBundleV2Response An object containing the following keys:
model_bundle_id: The ID of the created model bundle.create_model_bundle_from_streaming_enhanced_runnable_image_v2(*, model_bundle_name: str, request_schema: Type[BaseModel], response_schema: Type[BaseModel], repository: str, tag: str, command: Optional[List[str]] = None, healthcheck_route: Optional[str] = None, predict_route: Optional[str] = None, streaming_command: List[str], streaming_predict_route: Optional[str] = None, env: Dict[str, str], readiness_initial_delay_seconds: int, metadata: Optional[Dict[str, Any]] = None) -> CreateModelBundleV2Response\n Create a model bundle from a runnable image. The specified command must start a process that will listen for requests on port 5005 using HTTP.
Inference requests must be served at the POST /predict route while the GET /readyz route is a healthcheck.
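A hedged call sketch is shown below; the repository, tag, commands, route, environment variables, and metadata are placeholders, not values prescribed by the API.
import os\nfrom pydantic import BaseModel\nfrom launch import LaunchClient\n\nclass MyRequest(BaseModel):\n    prompt: str\n\nclass MyResponse(BaseModel):\n    text: str\n\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nresponse = client.create_model_bundle_from_streaming_enhanced_runnable_image_v2(\n    model_bundle_name=\"my-streaming-bundle\",\n    request_schema=MyRequest,\n    response_schema=MyResponse,\n    repository=\"my-org/my-image\",  # placeholder repository\n    tag=\"latest\",  # placeholder tag\n    command=[\"python\", \"serve.py\"],  # used if deployed as a SYNC or ASYNC endpoint\n    streaming_command=[\"python\", \"serve_stream.py\"],  # used if deployed as a STREAMING endpoint\n    streaming_predict_route=\"/stream\",  # placeholder route\n    env={\"MODEL_NAME\": \"my-model\"},\n    readiness_initial_delay_seconds=60,\n    metadata={\"team\": \"MY_TEAM\"},\n)\nprint(response.model_bundle_id)\n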
Parameters:
Name Type Description Defaultmodel_bundle_name str The name of the model bundle you want to create.
requiredrequest_schema Type[BaseModel] A Pydantic model that defines the request schema for the bundle.
requiredresponse_schema Type[BaseModel] A Pydantic model that defines the response schema for the bundle.
requiredrepository str The name of the Docker repository for the runnable image.
requiredtag str The tag for the runnable image.
requiredcommand Optional[List[str]] The command that will be used to start the process that listens for requests if this bundle is used as a SYNC or ASYNC endpoint.
None healthcheck_route Optional[str] The healthcheck endpoint route on the runnable image.
None predict_route Optional[str] The endpoint route on the runnable image that will be called if this bundle is used as a SYNC or ASYNC endpoint.
None streaming_command List[str] The command that will be used to start the process that listens for requests if this bundle is used as a STREAMING endpoint.
requiredstreaming_predict_route Optional[str] The endpoint route on the runnable image that will be called if this bundle is used as a STREAMING endpoint.
None env Dict[str, str] A dictionary of environment variables that will be passed to the bundle when it is run.
requiredreadiness_initial_delay_seconds int The number of seconds to wait for the HTTP server to become ready and successfully respond on its healthcheck.
requiredmetadata Optional[Dict[str, Any]] Metadata to record with the bundle.
None Returns:
Type DescriptionCreateModelBundleV2Response An object containing the following keys:
model_bundle_id: The ID of the created model bundle.create_model_bundle_from_triton_enhanced_runnable_image_v2(*, model_bundle_name: str, request_schema: Type[BaseModel], response_schema: Type[BaseModel], repository: str, tag: str, command: List[str], healthcheck_route: Optional[str] = None, predict_route: Optional[str] = None, env: Dict[str, str], readiness_initial_delay_seconds: int, triton_model_repository: str, triton_model_replicas: Optional[Dict[str, str]] = None, triton_num_cpu: float, triton_commit_tag: str, triton_storage: Optional[str] = None, triton_memory: Optional[str] = None, triton_readiness_initial_delay_seconds: int, metadata: Optional[Dict[str, Any]] = None) -> CreateModelBundleV2Response\n Create a model bundle from a runnable image and a tritonserver image.
Same requirements as create_model_bundle_from_runnable_image_v2, with additional constraints necessary for configuring tritonserver's execution.
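One hedged sketch of such a call is shown below; every repository, tag, command, S3 prefix, and resource value is a placeholder rather than a recommended setting.
import os\nfrom pydantic import BaseModel\nfrom launch import LaunchClient\n\nclass MyRequest(BaseModel):\n    x: int\n\nclass MyResponse(BaseModel):\n    y: float\n\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nresponse = client.create_model_bundle_from_triton_enhanced_runnable_image_v2(\n    model_bundle_name=\"my-triton-bundle\",\n    request_schema=MyRequest,\n    response_schema=MyResponse,\n    repository=\"my-org/my-image\",  # placeholder\n    tag=\"latest\",  # placeholder\n    command=[\"python\", \"serve.py\"],  # placeholder\n    env={\"TRITON_URL\": \"http://localhost:8000\"},  # placeholder\n    readiness_initial_delay_seconds=60,\n    triton_model_repository=\"s3://my-bucket/triton-models\",  # placeholder S3 prefix\n    triton_num_cpu=2.0,\n    triton_commit_tag=\"some-tritonserver-tag\",  # placeholder image tag\n    triton_readiness_initial_delay_seconds=300,\n)\nprint(response.model_bundle_id)\n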
Parameters:
Name Type Description Defaultmodel_bundle_name str The name of the model bundle you want to create.
requiredrequest_schema Type[BaseModel] A Pydantic model that defines the request schema for the bundle.
requiredresponse_schema Type[BaseModel] A Pydantic model that defines the response schema for the bundle.
requiredrepository str The name of the Docker repository for the runnable image.
requiredtag str The tag for the runnable image.
requiredcommand List[str] The command that will be used to start the process that listens for requests.
requiredpredict_route Optional[str] The endpoint route on the runnable image that will be called.
None healthcheck_route Optional[str] The healthcheck endpoint route on the runnable image.
None env Dict[str, str] A dictionary of environment variables that will be passed to the bundle when it is run.
requiredreadiness_initial_delay_seconds int The number of seconds to wait for the HTTP server to become ready and successfully respond on its healthcheck.
requiredtriton_model_repository str The S3 prefix that contains the contents of the model repository, formatted according to https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md
requiredtriton_model_replicas Optional[Dict[str, str]] If supplied, the name and number of replicas to make for each model.
None triton_num_cpu float Number of CPUs, fractional, to allocate to tritonserver.
requiredtriton_commit_tag str The image tag of the specific tritonserver version.
requiredtriton_storage Optional[str] Amount of storage space to allocate for the tritonserver container.
None triton_memory Optional[str] Amount of memory to allocate for the tritonserver container.
None triton_readiness_initial_delay_seconds int Like readiness_initial_delay_seconds, but for tritonserver's own healthcheck.
requiredmetadata Optional[Dict[str, Any]] Metadata to record with the bundle.
None Returns:
Type DescriptionCreateModelBundleV2Response An object containing the following keys:
model_bundle_id: The ID of the created model bundle.create_model_endpoint(*, endpoint_name: str, model_bundle: Union[ModelBundle, str], cpus: int = 3, memory: str = '8Gi', storage: str = '16Gi', gpus: int = 0, min_workers: int = 1, max_workers: int = 1, per_worker: int = 10, gpu_type: Optional[str] = None, endpoint_type: str = 'sync', high_priority: Optional[bool] = False, post_inference_hooks: Optional[List[PostInferenceHooks]] = None, default_callback_url: Optional[str] = None, default_callback_auth_kind: Optional[Literal['basic', 'mtls']] = None, default_callback_auth_username: Optional[str] = None, default_callback_auth_password: Optional[str] = None, default_callback_auth_cert: Optional[str] = None, default_callback_auth_key: Optional[str] = None, public_inference: Optional[bool] = None, update_if_exists: bool = False, labels: Optional[Dict[str, str]] = None) -> Optional[Endpoint]\n Creates and registers a model endpoint in Scale Launch. The returned object is an instance of type Endpoint, which is a base class of either SyncEndpoint or AsyncEndpoint. This is the object to which you send inference requests.
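A minimal end-to-end sketch follows; it reuses the test-bundle name and labels that appear in the other examples in these docs and assumes that bundle already exists.
import os\nimport time\nfrom launch import EndpointRequest, LaunchClient\n\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nendpoint = client.create_model_endpoint(\n    endpoint_name=\"demo-endpoint-sync\",\n    model_bundle=\"test-bundle\",  # an existing bundle, as in the other examples\n    cpus=1,\n    min_workers=1,\n    endpoint_type=\"sync\",\n    update_if_exists=True,\n    labels={\"team\": \"MY_TEAM\", \"product\": \"MY_PRODUCT\"},\n)\nwhile endpoint.status() != \"READY\":\n    time.sleep(10)\nresponse = endpoint.predict(request=EndpointRequest(args={\"x\": 2, \"y\": \"hello\"}))\nprint(response.result)\n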
Parameters:
Name Type Description Defaultendpoint_name str The name of the model endpoint you want to create. The name must be unique across all endpoints that you own.
requiredmodel_bundle Union[ModelBundle, str] The ModelBundle that the endpoint should serve.
cpus int Number of cpus each worker should get, e.g. 1, 2, etc. This must be greater than or equal to 1.
3 memory str Amount of memory each worker should get, e.g. \"4Gi\", \"512Mi\", etc. This must be a positive amount of memory.
'8Gi' storage str Amount of local ephemeral storage each worker should get, e.g. \"4Gi\", \"512Mi\", etc. This must be a positive amount of storage.
'16Gi' gpus int Number of gpus each worker should get, e.g. 0, 1, etc.
0 min_workers int The minimum number of workers. Must be greater than or equal to 0. This should be determined by computing the minimum throughput of your workload and dividing it by the throughput of a single worker. This field must be at least 1 for synchronous endpoints.
1 max_workers int The maximum number of workers. Must be greater than or equal to 0, and as well as greater than or equal to min_workers. This should be determined by computing the maximum throughput of your workload and dividing it by the throughput of a single worker.
1 per_worker int The maximum number of concurrent requests that an individual worker can service. Launch automatically scales the number of workers for the endpoint so that each worker is processing per_worker requests, subject to the limits defined by min_workers and max_workers.
- If the average number of concurrent requests per worker is lower than per_worker, then the number of workers will be reduced. - Otherwise, if the average number of concurrent requests per worker is higher than per_worker, then the number of workers will be increased to meet the elevated traffic. Here is our recommendation for computing per_worker:
1. Compute min_workers and max_workers per your minimum and maximum throughput requirements. 2. Determine a value for the maximum number of concurrent requests in the workload. Divide this number by max_workers; for example, a workload that peaks at 100 concurrent requests with max_workers = 10 gives per_worker = 10. Doing this ensures that the number of workers will \"climb\" to max_workers. 10 gpu_type Optional[str] If specifying a non-zero number of gpus, this controls the type of gpu requested. Here are the supported values:
nvidia-tesla-t4, nvidia-ampere-a10, nvidia-hopper-h100, nvidia-hopper-h100-1g20g, nvidia-hopper-h100-3g40g. None endpoint_type str Either \"sync\", \"async\", or \"streaming\".
'sync' high_priority Optional[bool] Either True or False. Enabling this will allow the created endpoint to leverage the shared pool of prewarmed nodes for faster spinup time.
False post_inference_hooks Optional[List[PostInferenceHooks]] List of hooks to trigger after inference tasks are served.
None default_callback_url Optional[str] The default callback url to use for async endpoints. This can be overridden in the task parameters for each individual task. post_inference_hooks must contain \"callback\" for the callback to be triggered.
None default_callback_auth_kind Optional[Literal['basic', 'mtls']] The default callback auth kind to use for async endpoints. Either \"basic\" or \"mtls\". This can be overridden in the task parameters for each individual task.
None default_callback_auth_username Optional[str] The default callback auth username to use. This only applies if default_callback_auth_kind is \"basic\". This can be overridden in the task parameters for each individual task.
None default_callback_auth_password Optional[str] The default callback auth password to use. This only applies if default_callback_auth_kind is \"basic\". This can be overridden in the task parameters for each individual task.
None default_callback_auth_cert Optional[str] The default callback auth cert to use. This only applies if default_callback_auth_kind is \"mtls\". This can be overridden in the task parameters for each individual task.
None default_callback_auth_key Optional[str] The default callback auth key to use. This only applies if default_callback_auth_kind is \"mtls\". This can be overridden in the task parameters for each individual task.
None public_inference Optional[bool] If True, this endpoint will be available to all user IDs for inference.
None update_if_exists bool If True, will attempt to update the endpoint if it exists. Otherwise, will unconditionally try to create a new endpoint. Note that endpoint names for a given user must be unique, so attempting to call this function with update_if_exists=False for an existing endpoint will raise an error.
False labels Optional[Dict[str, str]] An optional dictionary of key/value pairs to associate with this endpoint.
None Returns:
Type DescriptionOptional[Endpoint] An Endpoint object that can be used to make requests to the endpoint.
"},{"location":"api/client/#launch.client.LaunchClient.delete_file","title":"delete_file","text":"delete_file(file_id: str) -> DeleteFileResponse\n Delete a file
Parameters:
Name Type Description Defaultfile_id str ID of the file
requiredReturns:
Name Type DescriptionDeleteFileResponse DeleteFileResponse whether the deletion was successful
"},{"location":"api/client/#launch.client.LaunchClient.delete_llm_model_endpoint","title":"delete_llm_model_endpoint","text":"delete_llm_model_endpoint(model_endpoint_name: str) -> bool\n Deletes an LLM model endpoint.
Parameters:
Name Type Description Defaultmodel_endpoint_name str The name of the model endpoint to delete.
required"},{"location":"api/client/#launch.client.LaunchClient.delete_model_endpoint","title":"delete_model_endpoint","text":"delete_model_endpoint(model_endpoint_name: str)\n Deletes a model endpoint.
Parameters:
Name Type Description Defaultmodel_endpoint_name str The name of the model endpoint to delete.
edit_model_endpoint(*, model_endpoint: Union[ModelEndpoint, str], model_bundle: Optional[Union[ModelBundle, str]] = None, cpus: Optional[float] = None, memory: Optional[str] = None, storage: Optional[str] = None, gpus: Optional[int] = None, min_workers: Optional[int] = None, max_workers: Optional[int] = None, per_worker: Optional[int] = None, gpu_type: Optional[str] = None, high_priority: Optional[bool] = None, post_inference_hooks: Optional[List[PostInferenceHooks]] = None, default_callback_url: Optional[str] = None, default_callback_auth_kind: Optional[Literal['basic', 'mtls']] = None, default_callback_auth_username: Optional[str] = None, default_callback_auth_password: Optional[str] = None, default_callback_auth_cert: Optional[str] = None, default_callback_auth_key: Optional[str] = None, public_inference: Optional[bool] = None) -> None\n Edits an existing model endpoint. Here are the fields that cannot be edited on an existing endpoint:
The endpoint's type, i.e. you cannot convert a SyncEndpoint to an AsyncEndpoint or vice versa. Parameters:
Name Type Description Defaultmodel_endpoint Union[ModelEndpoint, str] The model endpoint (or its name) you want to edit. The name must be unique across all endpoints that you own.
requiredmodel_bundle Optional[Union[ModelBundle, str]] The ModelBundle that the endpoint should serve.
None cpus Optional[float] Number of cpus each worker should get, e.g. 1, 2, etc. This must be greater than or equal to 1.
None memory Optional[str] Amount of memory each worker should get, e.g. \"4Gi\", \"512Mi\", etc. This must be a positive amount of memory.
None storage Optional[str] Amount of local ephemeral storage each worker should get, e.g. \"4Gi\", \"512Mi\", etc. This must be a positive amount of storage.
None gpus Optional[int] Number of gpus each worker should get, e.g. 0, 1, etc.
None min_workers Optional[int] The minimum number of workers. Must be greater than or equal to 0.
None max_workers Optional[int] The maximum number of workers. Must be greater than or equal to 0, and as well as greater than or equal to min_workers.
None per_worker Optional[int] The maximum number of concurrent requests that an individual worker can service. Launch automatically scales the number of workers for the endpoint so that each worker is processing per_worker requests:
- If the average number of concurrent requests per worker is lower than per_worker, then the number of workers will be reduced. - Otherwise, if the average number of concurrent requests per worker is higher than per_worker, then the number of workers will be increased to meet the elevated traffic. None gpu_type Optional[str] If specifying a non-zero number of gpus, this controls the type of gpu requested. Here are the supported values:
nvidia-tesla-t4, nvidia-ampere-a10, nvidia-hopper-h100, nvidia-hopper-h100-1g20g, nvidia-hopper-h100-3g40g. None high_priority Optional[bool] Either True or False. Enabling this will allow the created endpoint to leverage the shared pool of prewarmed nodes for faster spinup time.
None post_inference_hooks Optional[List[PostInferenceHooks]] List of hooks to trigger after inference tasks are served.
None default_callback_url Optional[str] The default callback url to use for async endpoints. This can be overridden in the task parameters for each individual task. post_inference_hooks must contain \"callback\" for the callback to be triggered.
None default_callback_auth_kind Optional[Literal['basic', 'mtls']] The default callback auth kind to use for async endpoints. Either \"basic\" or \"mtls\". This can be overridden in the task parameters for each individual task.
None default_callback_auth_username Optional[str] The default callback auth username to use. This only applies if default_callback_auth_kind is \"basic\". This can be overridden in the task parameters for each individual task.
None default_callback_auth_password Optional[str] The default callback auth password to use. This only applies if default_callback_auth_kind is \"basic\". This can be overridden in the task parameters for each individual task.
None default_callback_auth_cert Optional[str] The default callback auth cert to use. This only applies if default_callback_auth_kind is \"mtls\". This can be overridden in the task parameters for each individual task.
None default_callback_auth_key Optional[str] The default callback auth key to use. This only applies if default_callback_auth_kind is \"mtls\". This can be overridden in the task parameters for each individual task.
None public_inference Optional[bool] If True, this endpoint will be available to all user IDs for inference.
None"},{"location":"api/client/#launch.client.LaunchClient.get_batch_async_response","title":"get_batch_async_response","text":"get_batch_async_response(batch_job_id: str) -> Dict[str, Any]\n Gets inference results from a previously created batch job.
Parameters:
Name Type Description Defaultbatch_job_id str An id representing the batch task job. This id is in the response from calling batch_async_request.
Returns:
Type DescriptionDict[str, Any] A dictionary that contains the following fields:
Dict[str, Any] status: The status of the job.Dict[str, Any] result: The url where the result is stored.Dict[str, Any] duration: A string representation of how long the job took to finish or how long it has been running, for a job currently in progress.Dict[str, Any] num_tasks_pending: The number of tasks that are still pending.Dict[str, Any] num_tasks_completed: The number of tasks that have completed.get_docker_image_batch_job(batch_job_id: str)\n For self hosted mode only. Gets information about a batch job given a batch job id.
"},{"location":"api/client/#launch.client.LaunchClient.get_docker_image_batch_job_bundle","title":"get_docker_image_batch_job_bundle","text":"get_docker_image_batch_job_bundle(docker_image_batch_job_bundle_id: str) -> DockerImageBatchJobBundleResponse\n For self hosted mode only. Gets information for a single batch job bundle with a given id.
"},{"location":"api/client/#launch.client.LaunchClient.get_file","title":"get_file","text":"get_file(file_id: str) -> GetFileResponse\n Get metadata about a file
Parameters:
Name Type Description Defaultfile_id str ID of the file
requiredReturns:
Name Type DescriptionGetFileResponse GetFileResponse ID, filename, and size of the requested file
"},{"location":"api/client/#launch.client.LaunchClient.get_file_content","title":"get_file_content","text":"get_file_content(file_id: str) -> GetFileContentResponse\n Get a file's content
Parameters:
Name Type Description Defaultfile_id str ID of the file
requiredReturns:
Name Type DescriptionGetFileContentResponse GetFileContentResponse ID and content of the requested file
"},{"location":"api/client/#launch.client.LaunchClient.get_fine_tune","title":"get_fine_tune","text":"get_fine_tune(fine_tune_id: str) -> GetFineTuneResponse\n Get status of a fine-tune
Parameters:
Name Type Description Defaultfine_tune_id str ID of the fine-tune
requiredReturns:
Name Type DescriptionGetFineTuneResponse GetFineTuneResponse ID and status of the requested fine-tune
"},{"location":"api/client/#launch.client.LaunchClient.get_fine_tune_events","title":"get_fine_tune_events","text":"get_fine_tune_events(fine_tune_id: str) -> GetFineTuneEventsResponse\n Get list of fine-tune events
Parameters:
Name Type Description Defaultfine_tune_id str ID of the fine-tune
requiredReturns:
Name Type DescriptionGetFineTuneEventsResponse GetFineTuneEventsResponse a list of all the events of the fine-tune
"},{"location":"api/client/#launch.client.LaunchClient.get_latest_docker_image_batch_job_bundle","title":"get_latest_docker_image_batch_job_bundle","text":"get_latest_docker_image_batch_job_bundle(bundle_name: str) -> DockerImageBatchJobBundleResponse\n For self hosted mode only. Gets information for the latest batch job bundle with a given name.
"},{"location":"api/client/#launch.client.LaunchClient.get_latest_model_bundle_v2","title":"get_latest_model_bundle_v2","text":"get_latest_model_bundle_v2(model_bundle_name: str) -> ModelBundleV2Response\n Get the latest version of a model bundle.
Parameters:
Name Type Description Defaultmodel_bundle_name str The name of the model bundle you want to get.
requiredReturns:
Type DescriptionModelBundleV2Response An object containing the following keys:
id: The ID of the model bundle.name: The name of the model bundle.schema_location: The location of the schema for the model bundle.flavor: The flavor of the model bundle. Either RunnableImage, CloudpickleArtifact, ZipArtifact, or TritonEnhancedRunnableImageFlavor.created_at: The time the model bundle was created.metadata: A dictionary of metadata associated with the model bundle.model_artifact_ids: A list of IDs of model artifacts associated with the bundle.get_llm_model_endpoint(endpoint_name: str) -> Optional[Union[AsyncEndpoint, SyncEndpoint, StreamingEndpoint]]\n Gets a model endpoint associated with a name that the user has access to.
Parameters:
Name Type Description Defaultendpoint_name str The name of the endpoint to retrieve.
required"},{"location":"api/client/#launch.client.LaunchClient.get_model_bundle","title":"get_model_bundle","text":"get_model_bundle(model_bundle: Union[ModelBundle, str]) -> ModelBundle\n Returns a model bundle specified by bundle_name that the user owns.
Parameters:
Name Type Description Defaultmodel_bundle Union[ModelBundle, str] The bundle or its name.
requiredReturns:
Type DescriptionModelBundle A ModelBundle object
get_model_bundle_v2(model_bundle_id: str) -> ModelBundleV2Response\n Get a model bundle.
Parameters:
Name Type Description Defaultmodel_bundle_id str The ID of the model bundle you want to get.
requiredReturns:
Type DescriptionModelBundleV2Response An object containing the following fields:
id: The ID of the model bundle.name: The name of the model bundle.flavor: The flavor of the model bundle. Either RunnableImage, CloudpickleArtifact, ZipArtifact, or TritonEnhancedRunnableImageFlavor.created_at: The time the model bundle was created.metadata: A dictionary of metadata associated with the model bundle.model_artifact_ids: A list of IDs of model artifacts associated with the bundle.get_model_endpoint(endpoint_name: str) -> Optional[Union[AsyncEndpoint, SyncEndpoint]]\n Gets a model endpoint associated with a name.
Parameters:
Name Type Description Defaultendpoint_name str The name of the endpoint to retrieve.
required"},{"location":"api/client/#launch.client.LaunchClient.list_docker_image_batch_job_bundles","title":"list_docker_image_batch_job_bundles","text":"list_docker_image_batch_job_bundles(bundle_name: Optional[str] = None, order_by: Optional[Literal['newest', 'oldest']] = None) -> ListDockerImageBatchJobBundleResponse\n For self hosted mode only. Gets information for multiple bundles.
Parameters:
Name Type Description Defaultbundle_name Optional[str] The name of the bundles to retrieve. If not specified, this will retrieve all
None order_by Optional[Literal['newest', 'oldest']] Either \"newest\", \"oldest\", or not specified. Specify to sort by newest/oldest.
None"},{"location":"api/client/#launch.client.LaunchClient.list_files","title":"list_files","text":"list_files() -> ListFilesResponse\n List files
Returns:
Name Type DescriptionListFilesResponse ListFilesResponse list of all files (ID, filename, and size)
"},{"location":"api/client/#launch.client.LaunchClient.list_fine_tunes","title":"list_fine_tunes","text":"list_fine_tunes() -> ListFineTunesResponse\n List fine-tunes
Returns:
Name Type DescriptionListFineTunesResponse ListFineTunesResponse list of all fine-tunes and their statuses
"},{"location":"api/client/#launch.client.LaunchClient.list_llm_model_endpoints","title":"list_llm_model_endpoints","text":"list_llm_model_endpoints() -> List[Endpoint]\n Lists all LLM model endpoints that the user has access to.
Returns:
Type DescriptionList[Endpoint] A list of ModelEndpoint objects.
list_model_bundles() -> List[ModelBundle]\n Returns a list of model bundles that the user owns.
Returns:
Type DescriptionList[ModelBundle] A list of ModelBundle objects
"},{"location":"api/client/#launch.client.LaunchClient.list_model_bundles_v2","title":"list_model_bundles_v2","text":"list_model_bundles_v2() -> ListModelBundlesV2Response\n List all model bundles.
Returns:
Type DescriptionListModelBundlesV2Response An object containing the following keys:
model_bundles: A list of model bundles. Each model bundle is an object.list_model_endpoints() -> List[Endpoint]\n Lists all model endpoints that the user owns.
Returns:
Type DescriptionList[Endpoint] A list of ModelEndpoint objects.
model_download(model_name: str, download_format: str = 'hugging_face') -> ModelDownloadResponse\n Download a fine-tuned model.
Parameters:
Name Type Description Defaultmodel_name str name of the model to download
requireddownload_format str format of the model to download
'hugging_face' Returns:
Name Type DescriptionModelDownloadResponse ModelDownloadResponse dictionary with file names and urls to download the model
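The short sketch below ties the fine-tune methods together; the fine-tune ID and model name are placeholders, and printing the response objects avoids assuming their attribute names.
import os\nfrom launch import LaunchClient\n\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nprint(client.list_fine_tunes())  # all fine-tunes and their statuses\n\nfine_tune_id = \"ft-abc123\"  # placeholder ID of an existing fine-tune\nprint(client.get_fine_tune(fine_tune_id))  # ID and status\nprint(client.get_fine_tune_events(fine_tune_id))  # list of events\n\n# Download the resulting model in Hugging Face format (the default):\nprint(client.model_download(\"my-finetuned-model\", download_format=\"hugging_face\"))\n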
"},{"location":"api/client/#launch.client.LaunchClient.read_endpoint_creation_logs","title":"read_endpoint_creation_logs","text":"read_endpoint_creation_logs(model_endpoint: Union[ModelEndpoint, str])\n Retrieves the logs for the creation of the endpoint.
Parameters:
Name Type Description Defaultmodel_endpoint Union[ModelEndpoint, str] The endpoint or its name.
required"},{"location":"api/client/#launch.client.LaunchClient.register_batch_csv_location_fn","title":"register_batch_csv_location_fn","text":"register_batch_csv_location_fn(batch_csv_location_fn: Callable[[], str])\n For self-hosted mode only. Registers a function that gives a location for batch CSV inputs. Should give different locations each time. This function is called as batch_csv_location_fn(), and should return a batch_csv_url that upload_batch_csv_fn can take.
Strictly, batch_csv_location_fn() does not need to return a str. The only requirement is that if batch_csv_location_fn returns a value of type T, then upload_batch_csv_fn() takes in an object of type T as its second argument (i.e. batch_csv_url).
Parameters:
Name Type Description Defaultbatch_csv_location_fn Callable[[], str] Function that generates batch_csv_urls for upload_batch_csv_fn.
required"},{"location":"api/client/#launch.client.LaunchClient.register_bundle_location_fn","title":"register_bundle_location_fn","text":"register_bundle_location_fn(bundle_location_fn: Callable[[], str])\n For self-hosted mode only. Registers a function that gives a location for a model bundle. Should give different locations each time. This function is called as bundle_location_fn(), and should return a bundle_url that register_upload_bundle_fn can take.
Strictly, bundle_location_fn() does not need to return a str. The only requirement is that if bundle_location_fn returns a value of type T, then upload_bundle_fn() takes in an object of type T as its second argument (i.e. bundle_url).
Parameters:
Name Type Description Defaultbundle_location_fn Callable[[], str] Function that generates bundle_urls for upload_bundle_fn.
required"},{"location":"api/client/#launch.client.LaunchClient.register_upload_batch_csv_fn","title":"register_upload_batch_csv_fn","text":"register_upload_batch_csv_fn(upload_batch_csv_fn: Callable[[str, str], None])\n For self-hosted mode only. Registers a function that handles batch text upload. This function is called as
upload_batch_csv_fn(csv_text, csv_url)\n This function should directly write the contents of csv_text as a text string into csv_url.
Parameters:
Name Type Description Defaultupload_batch_csv_fn Callable[[str, str], None] Function that takes in a csv text (string type), and uploads that bundle to an appropriate location. Only needed for self-hosted mode.
required"},{"location":"api/client/#launch.client.LaunchClient.register_upload_bundle_fn","title":"register_upload_bundle_fn","text":"register_upload_bundle_fn(upload_bundle_fn: Callable[[str, str], None])\n For self-hosted mode only. Registers a function that handles model bundle upload. This function is called as
upload_bundle_fn(serialized_bundle, bundle_url)\n This function should directly write the contents of serialized_bundle as a binary string into bundle_url.
See register_bundle_location_fn for more notes on the signature of upload_bundle_fn
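A hedged sketch of registering these hooks is shown below; it uses boto3 and S3 as one arbitrary storage backend, and the bucket name and the self-hosted client configuration are assumptions.
import os\nimport uuid\nimport boto3\nfrom launch import LaunchClient\n\ns3 = boto3.client(\"s3\")\nBUCKET = \"my-launch-bundles\"  # placeholder bucket\n\ndef bundle_location_fn():\n    # must return a new location on each call\n    return f\"s3://{BUCKET}/bundles/{uuid.uuid4()}\"\n\ndef upload_bundle_fn(serialized_bundle, bundle_url):\n    # write the serialized bundle bytes to the location produced above\n    key = bundle_url.split(f\"s3://{BUCKET}/\", 1)[1]\n    s3.put_object(Bucket=BUCKET, Key=key, Body=serialized_bundle)\n\n# Self-hosted client construction is assumed to be configured appropriately elsewhere.\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"), endpoint=os.getenv(\"LAUNCH_ENDPOINT\"))\nclient.register_bundle_location_fn(bundle_location_fn)\nclient.register_upload_bundle_fn(upload_bundle_fn)\n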
Parameters:
Name Type Description Defaultupload_bundle_fn Callable[[str, str], None] Function that takes in a serialized bundle (bytes type), and uploads that bundle to an appropriate location. Only needed for self-hosted mode.
required"},{"location":"api/client/#launch.client.LaunchClient.update_docker_image_batch_job","title":"update_docker_image_batch_job","text":"update_docker_image_batch_job(batch_job_id: str, cancel: bool)\n For self hosted mode only. Updates a batch job by id. Use this if you want to cancel/delete a batch job.
"},{"location":"api/client/#launch.client.LaunchClient.upload_file","title":"upload_file","text":"upload_file(file_path: str) -> UploadFileResponse\n Upload a file
Parameters:
Name Type Description Defaultfile_path str Path to a local file to upload.
requiredReturns:
Name Type DescriptionUploadFileResponse UploadFileResponse ID of the created file
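Taken together, the file methods compose as in the following sketch; the local path is a placeholder and the id attribute on the upload response is an assumption based on the descriptions above.
import os\nfrom launch import LaunchClient\n\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nuploaded = client.upload_file(\"./training_data.jsonl\")  # placeholder local path\nfile_id = uploaded.id  # assumed attribute name on UploadFileResponse\nprint(client.get_file(file_id))  # ID, filename, and size\nprint(client.get_file_content(file_id))  # ID and content\nprint(client.list_files())  # all files\nclient.delete_file(file_id)\n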
"},{"location":"api/endpoint_predictions/","title":"Endpoint Predictions","text":""},{"location":"api/endpoint_predictions/#launch.model_endpoint.EndpointRequest","title":"EndpointRequest","text":"EndpointRequest(url: Optional[str] = None, args: Optional[Dict] = None, callback_url: Optional[str] = None, callback_auth_kind: Optional[Literal['basic', 'mtls']] = None, callback_auth_username: Optional[str] = None, callback_auth_password: Optional[str] = None, callback_auth_cert: Optional[str] = None, callback_auth_key: Optional[str] = None, return_pickled: Optional[bool] = False, request_id: Optional[str] = None)\n Represents a single request to either a SyncEndpoint, StreamingEndpoint, or AsyncEndpoint.
Parameters:
Name Type Description Defaulturl Optional[str] A url to some file that can be read in to a ModelBundle's predict function. Can be an image, raw text, etc. Note: the contents of the file located at url are opened as a sequence of bytes and passed to the predict function. If you instead want to pass the url itself as an input to the predict function, see args.
Exactly one of url and args must be specified.
None args Optional[Dict] A Dictionary with arguments to a ModelBundle's predict function. If the predict function has signature predict_fn(foo, bar), then the keys in the dictionary should be \"foo\" and \"bar\". Values must be native Python objects.
Exactly one of url and args must be specified.
None return_pickled Optional[bool] Whether the output should be a pickled python object, or directly returned serialized json.
False callback_url Optional[str] The callback url to use for this task. If None, then the default_callback_url of the endpoint is used. The endpoint must specify \"callback\" as a post-inference hook for the callback to be triggered.
None callback_auth_kind Optional[Literal['basic', 'mtls']] The default callback auth kind to use for async endpoints. Either \"basic\" or \"mtls\". This can be overridden in the task parameters for each individual task.
None callback_auth_username Optional[str] The default callback auth username to use. This only applies if callback_auth_kind is \"basic\". This can be overridden in the task parameters for each individual task.
None callback_auth_password Optional[str] The default callback auth password to use. This only applies if callback_auth_kind is \"basic\". This can be overridden in the task parameters for each individual task.
None callback_auth_cert Optional[str] The default callback auth cert to use. This only applies if callback_auth_kind is \"mtls\". This can be overridden in the task parameters for each individual task.
None callback_auth_key Optional[str] The default callback auth key to use. This only applies if callback_auth_kind is \"mtls\". This can be overridden in the task parameters for each individual task.
None request_id Optional[str] (deprecated) A user-specifiable id for requests. Should be unique among EndpointRequests made in the same batch call. If one isn't provided the client will generate its own.
None"},{"location":"api/endpoint_predictions/#launch.model_endpoint.EndpointResponse","title":"EndpointResponse","text":"EndpointResponse(client, status: str, result_url: Optional[str] = None, result: Optional[str] = None, traceback: Optional[str] = None)\n Represents a response received from a Endpoint.
Parameters:
Name Type Description Defaultclient An instance of LaunchClient.
status str A string representing the status of the request, i.e. SUCCESS, FAILURE, or PENDING
result_url Optional[str] A string that is a url containing the pickled python object from the Endpoint's predict function.
Exactly one of result_url or result will be populated, depending on the value of return_pickled in the request.
None result Optional[str] A string that is the serialized return value (in json form) of the Endpoint's predict function. Specifically, one can json.loads() the value of result to get the original python object back.
Exactly one of result_url or result will be populated, depending on the value of return_pickled in the request.
None traceback Optional[str] The stack trace if the inference endpoint raised an error. Can be used for debugging
None"},{"location":"api/endpoint_predictions/#launch.model_endpoint.EndpointResponseFuture","title":"EndpointResponseFuture","text":"EndpointResponseFuture(client, endpoint_name: str, async_task_id: str)\n Represents a future response from an Endpoint. Specifically, when the EndpointResponseFuture is ready, then its get method will return an actual instance of EndpointResponse.
This object should not be directly instantiated by the user.
Parameters:
Name Type Description Defaultclient An instance of LaunchClient.
endpoint_name str The name of the endpoint.
requiredasync_task_id str An async task id.
required"},{"location":"api/endpoint_predictions/#launch.model_endpoint.EndpointResponseFuture.get","title":"get","text":"get(timeout: Optional[float] = None) -> EndpointResponse\n Retrieves the EndpointResponse for the prediction request after it completes. This method blocks.
Parameters:
Name Type Description Defaulttimeout Optional[float] The maximum number of seconds to wait for the response. If None, then the method will block indefinitely until the response is ready.
None"},{"location":"api/endpoint_predictions/#launch.model_endpoint.EndpointResponseStream","title":"EndpointResponseStream","text":"EndpointResponseStream(response)\n Bases: Iterator
Represents a stream response from an Endpoint. This object is iterable and yields EndpointResponse objects.
This object should not be directly instantiated by the user.
"},{"location":"api/endpoint_predictions/#launch.model_endpoint.EndpointResponseStream.__iter__","title":"__iter__","text":"__iter__()\n Uses server-sent events to iterate through the stream.
"},{"location":"api/endpoint_predictions/#launch.model_endpoint.EndpointResponseStream.__next__","title":"__next__","text":"__next__()\n Uses server-sent events to iterate through the stream.
"},{"location":"api/hooks/","title":"Hooks","text":""},{"location":"api/hooks/#launch.hooks.PostInferenceHooks","title":"PostInferenceHooks","text":" Bases: str, Enum
Post-inference hooks are functions that are called after inference is complete.
Attributes:
Name Type DescriptionCALLBACK str The callback hook is called with the inference response and the task ID.
"},{"location":"api/llms/","title":"LLM APIs","text":"We provide some APIs to conveniently create, list and inference with LLMs. Under the hood they are Launch model endpoints.
"},{"location":"api/llms/#example","title":"Example","text":"LLM APIs Usageimport os\nfrom rich import print\nfrom launch import LaunchClient\nfrom launch.api_client.model.llm_inference_framework import (\nLLMInferenceFramework,\n)\nfrom launch.api_client.model.llm_source import LLMSource\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"), endpoint=os.getenv(\"LAUNCH_ENDPOINT\"))\nendpoints = client.list_llm_model_endpoints()\nprint(endpoints)\nendpoint_name = \"test-flan-t5-xxl\"\nclient.create_llm_model_endpoint(\nendpoint_name=endpoint_name,\nmodel_name=\"flan-t5-xxl\",\nsource=LLMSource.HUGGING_FACE,\ninference_framework=LLMInferenceFramework.DEEPSPEED,\ninference_framework_image_tag=os.getenv(\"INFERENCE_FRAMEWORK_IMAGE_TAG\"),\nnum_shards=4,\nmin_workers=1,\nmax_workers=1,\ngpus=4,\nendpoint_type=\"sync\",\n)\n# Wait for the endpoint to be ready\noutput = client.completions_sync(endpoint_name, prompt=\"What is Deep Learning?\", max_new_tokens=10, temperature=0)\nprint(output)\n"},{"location":"api/model_bundles/","title":"Model Bundles","text":""},{"location":"api/model_bundles/#launch.model_bundle.CloudpickleArtifactFlavor","title":"CloudpickleArtifactFlavor","text":" Bases: BaseModel
instance-attribute","text":"app_config: Optional[Dict[str, Any]]\n Optional configuration for the application.
"},{"location":"api/model_bundles/#launch.model_bundle.CloudpickleArtifactFlavor.framework","title":"frameworkclass-attribute instance-attribute","text":"framework: Union[PytorchFramework, TensorflowFramework, CustomFramework] = Field(..., discriminator='framework_type')\n Machine Learning framework specification. Either PytorchFramework, TensorflowFramework, or CustomFramework.
instance-attribute","text":"load_model_fn: str\n Function which, when called, returns the model object.
"},{"location":"api/model_bundles/#launch.model_bundle.CloudpickleArtifactFlavor.load_predict_fn","title":"load_predict_fninstance-attribute","text":"load_predict_fn: str\n Function which, when called, returns the prediction function.
"},{"location":"api/model_bundles/#launch.model_bundle.CloudpickleArtifactFlavor.requirements","title":"requirementsinstance-attribute","text":"requirements: List[str]\n List of requirements to install in the environment before running the model.
"},{"location":"api/model_bundles/#launch.model_bundle.CreateModelBundleV2Response","title":"CreateModelBundleV2Response","text":" Bases: BaseModel
Response object for creating a Model Bundle.
"},{"location":"api/model_bundles/#launch.model_bundle.CreateModelBundleV2Response.model_bundle_id","title":"model_bundle_idinstance-attribute","text":"model_bundle_id: str\n ID of the Model Bundle.
"},{"location":"api/model_bundles/#launch.model_bundle.CustomFramework","title":"CustomFramework","text":" Bases: BaseModel
instance-attribute","text":"image_repository: str\n Docker image repository to use as the base image.
"},{"location":"api/model_bundles/#launch.model_bundle.CustomFramework.image_tag","title":"image_taginstance-attribute","text":"image_tag: str\n Docker image tag to use as the base image.
"},{"location":"api/model_bundles/#launch.model_bundle.ListModelBundlesV2Response","title":"ListModelBundlesV2Response","text":" Bases: BaseModel
Response object for listing Model Bundles.
"},{"location":"api/model_bundles/#launch.model_bundle.ListModelBundlesV2Response.model_bundles","title":"model_bundlesinstance-attribute","text":"model_bundles: List[ModelBundleV2Response]\n A list of Model Bundles.
"},{"location":"api/model_bundles/#launch.model_bundle.ModelBundle","title":"ModelBundledataclass","text":"Represents a ModelBundle.
"},{"location":"api/model_bundles/#launch.model_bundle.ModelBundle.app_config","title":"app_configclass-attribute instance-attribute","text":"app_config: Optional[Dict[Any, Any]] = None\n An optional user-specified configuration mapping for the bundle.
"},{"location":"api/model_bundles/#launch.model_bundle.ModelBundle.env_params","title":"env_paramsclass-attribute instance-attribute","text":"env_params: Optional[Dict[str, str]] = None\n A dictionary that dictates environment information. See LaunchClient.create_model_bundle for more information.
"},{"location":"api/model_bundles/#launch.model_bundle.ModelBundle.id","title":"idclass-attribute instance-attribute","text":"id: Optional[str] = None\n A globally unique identifier for the bundle.
"},{"location":"api/model_bundles/#launch.model_bundle.ModelBundle.location","title":"locationclass-attribute instance-attribute","text":"location: Optional[str] = None\n An opaque location for the bundle.
"},{"location":"api/model_bundles/#launch.model_bundle.ModelBundle.metadata","title":"metadataclass-attribute instance-attribute","text":"metadata: Optional[Dict[Any, Any]] = None\n Arbitrary metadata for the bundle.
"},{"location":"api/model_bundles/#launch.model_bundle.ModelBundle.name","title":"nameinstance-attribute","text":"name: str\n The name of the bundle. Must be unique across all bundles that the user owns.
"},{"location":"api/model_bundles/#launch.model_bundle.ModelBundle.packaging_type","title":"packaging_typeclass-attribute instance-attribute","text":"packaging_type: Optional[str] = None\n The packaging type for the bundle. Can be cloudpickle or zip.
class-attribute instance-attribute","text":"requirements: Optional[List[str]] = None\n A list of Python package requirements for the bundle. See LaunchClient.create_model_bundle for more information.
"},{"location":"api/model_bundles/#launch.model_bundle.ModelBundleV2Response","title":"ModelBundleV2Response","text":" Bases: BaseModel
Response object for a single Model Bundle.
"},{"location":"api/model_bundles/#launch.model_bundle.ModelBundleV2Response.created_at","title":"created_atinstance-attribute","text":"created_at: datetime.datetime\n Timestamp of when the Model Bundle was created.
"},{"location":"api/model_bundles/#launch.model_bundle.ModelBundleV2Response.flavor","title":"flavorclass-attribute instance-attribute","text":"flavor: ModelBundleFlavors = Field(..., discriminator='flavor')\n Flavor of the Model Bundle, representing how the model bundle was packaged.
See ModelBundleFlavors for details.
instance-attribute","text":"id: str\n ID of the Model Bundle.
"},{"location":"api/model_bundles/#launch.model_bundle.ModelBundleV2Response.metadata","title":"metadatainstance-attribute","text":"metadata: Dict[str, Any]\n Metadata associated with the Model Bundle.
"},{"location":"api/model_bundles/#launch.model_bundle.ModelBundleV2Response.model_artifact_ids","title":"model_artifact_idsinstance-attribute","text":"model_artifact_ids: List[str]\n IDs of the Model Artifacts associated with the Model Bundle.
"},{"location":"api/model_bundles/#launch.model_bundle.ModelBundleV2Response.name","title":"nameinstance-attribute","text":"name: str\n Name of the Model Bundle.
"},{"location":"api/model_bundles/#launch.model_bundle.PytorchFramework","title":"PytorchFramework","text":" Bases: BaseModel
instance-attribute","text":"pytorch_image_tag: str\n Image tag of the Pytorch image to use.
"},{"location":"api/model_bundles/#launch.model_bundle.RunnableImageFlavor","title":"RunnableImageFlavor","text":" Bases: RunnableImageLike
Model bundles that use custom docker images that expose an HTTP server for inference.
"},{"location":"api/model_bundles/#launch.model_bundle.TensorflowFramework","title":"TensorflowFramework","text":" Bases: BaseModel
instance-attribute","text":"tensorflow_version: str\n Tensorflow version to use.
"},{"location":"api/model_bundles/#launch.model_bundle.ZipArtifactFlavor","title":"ZipArtifactFlavor","text":" Bases: BaseModel
class-attribute instance-attribute","text":"app_config: Optional[Dict[str, Any]] = None\n Optional configuration for the application.
"},{"location":"api/model_bundles/#launch.model_bundle.ZipArtifactFlavor.framework","title":"frameworkclass-attribute instance-attribute","text":"framework: Union[PytorchFramework, TensorflowFramework, CustomFramework] = Field(..., discriminator='framework_type')\n Machine Learning framework specification. Either PytorchFramework, TensorflowFramework, or CustomFramework.
instance-attribute","text":"load_model_fn_module_path: str\n Path to the module to load the model object.
"},{"location":"api/model_bundles/#launch.model_bundle.ZipArtifactFlavor.load_predict_fn_module_path","title":"load_predict_fn_module_pathinstance-attribute","text":"load_predict_fn_module_path: str\n Path to the module to load the prediction function.
"},{"location":"api/model_bundles/#launch.model_bundle.ZipArtifactFlavor.requirements","title":"requirementsinstance-attribute","text":"requirements: List[str]\n List of requirements to install in the environment before running the model.
"},{"location":"api/model_endpoints/","title":"Model Endpoints","text":"All classes here are returned by the get_model_endpoint method and provide a predict function.
AsyncEndpoint(model_endpoint: ModelEndpoint, client: ModelEndpoint)\n Bases: Endpoint
An asynchronous model endpoint.
Parameters:
Name Type Description Defaultmodel_endpoint ModelEndpoint ModelEndpoint object.
requiredclient A LaunchClient object
required"},{"location":"api/model_endpoints/#launch.model_endpoint.AsyncEndpoint.predict","title":"predict","text":"predict(request: EndpointRequest) -> EndpointResponseFuture\n Runs an asynchronous prediction request.
Parameters:
Name Type Description Defaultrequest EndpointRequest The EndpointRequest object that contains the payload.
Returns:
Name Type DescriptionEndpointResponseFuture An EndpointResponseFuture such the user can use to query the status of the request.
Example EndpointResponseFuture EndpointResponseFuture .. code-block:: python
my_endpoint = AsyncEndpoint(...) f: EndpointResponseFuture = my_endpoint.predict(EndpointRequest(...)) result = f.get() # blocks on completion
"},{"location":"api/model_endpoints/#launch.model_endpoint.AsyncEndpoint.predict_batch","title":"predict_batch","text":"predict_batch(requests: Sequence[EndpointRequest]) -> AsyncEndpointBatchResponse\n (deprecated) Runs inference on the data items specified by urls. Returns a AsyncEndpointResponse.
Parameters:
Name Type Description Defaultrequests Sequence[EndpointRequest] List of EndpointRequests. Request_ids must all be distinct.
requiredReturns:
Type DescriptionAsyncEndpointBatchResponse an AsyncEndpointResponse keeping track of the inference requests made
"},{"location":"api/model_endpoints/#launch.model_endpoint.SyncEndpoint","title":"SyncEndpoint","text":"SyncEndpoint(model_endpoint: ModelEndpoint, client: ModelEndpoint)\n Bases: Endpoint
A synchronous model endpoint.
Parameters:
Name Type Description Defaultmodel_endpoint ModelEndpoint ModelEndpoint object.
requiredclient A LaunchClient object
required"},{"location":"api/model_endpoints/#launch.model_endpoint.SyncEndpoint.predict","title":"predict","text":"predict(request: EndpointRequest) -> EndpointResponse\n Runs a synchronous prediction request.
Parameters:
Name Type Description Defaultrequest EndpointRequest The EndpointRequest object that contains the payload.
StreamingEndpoint(model_endpoint: ModelEndpoint, client: ModelEndpoint)\n Bases: Endpoint
A synchronous model endpoint.
Parameters:
Name Type Description Defaultmodel_endpoint ModelEndpoint ModelEndpoint object.
requiredclient A LaunchClient object
required"},{"location":"api/model_endpoints/#launch.model_endpoint.StreamingEndpoint.predict","title":"predict","text":"predict(request: EndpointRequest) -> EndpointResponseStream\n Runs a streaming prediction request.
Parameters:
Name Type Description Defaultrequest EndpointRequest The EndpointRequest object that contains the payload.
Returns:
Type DescriptionEndpointResponseStream An EndpointResponseStream object that can be used to iterate through the stream.
For predicting over a larger set of tasks (> 50) at once, it is recommended to use batch jobs. Batch jobs are a way to send a large number of tasks to a model bundle. The tasks are processed in parallel, and the results are returned as a list of predictions.
Batch jobs are created using the batch_async_request method of the LaunchClient.
import logging\nimport os\nimport time\nfrom launch import LaunchClient\nlogger = logging.getLogger(__name__)\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nbatch_job = client.batch_async_request(\nmodel_bundle=\"test-bundle\",\ninputs=[\n{\"x\": 2, \"y\": \"hello\"},\n{\"x\": 3, \"y\": \"world\"},\n],\ngpus=0,\nlabels={\n\"team\": \"MY_TEAM\",\n\"product\": \"MY_PRODUCT\",\n}\n)\nstatus = \"PENDING\"\nres = None\nwhile status != \"SUCCESS\" and status != \"FAILURE\" and status != \"CANCELLED\":\ntime.sleep(30)\nres = client.get_batch_async_response(batch_job[\"job_id\"])\nstatus = res[\"status\"]\nlogging.info(f\"the batch job is {status}\")\nlogging.info(res)\n"},{"location":"concepts/callbacks/","title":"Callbacks","text":"Async model endpoints can be configured to send callbacks to a user-defined callback URL. Callbacks are sent as HTTP POST requests with a JSON body. The following code snippet shows how to create an async model endpoint with a callback URL.
To configure an async endpoint to send callbacks, set the post_inference_hooks field to include launch.PostInferenceHooks.CALLBACK. A callback URL also needs to be specified, and it can be configured as a default using the default_callback_url argument to launch.LaunchClient.create_model_endpoint or as a per-task override using the callback_url field of launch.EndpointRequest.
Note
Callbacks will not be sent if the endpoint does not have any post-inference hooks specified, even if a default_callback_url is provided to the endpoint creation method or if the prediction request has a callback_url override.
import os\nimport time\nfrom launch import EndpointRequest, LaunchClient, PostInferenceHooks\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nendpoint = client.create_model_endpoint(\nendpoint_name=\"demo-endpoint-callback\",\nmodel_bundle=\"test-bundle\",\ncpus=1,\nmin_workers=1,\nendpoint_type=\"async\",\nupdate_if_exists=True,\nlabels={\n\"team\": \"MY_TEAM\",\n\"product\": \"MY_PRODUCT\",\n},\npost_inference_hooks=[PostInferenceHooks.CALLBACK],\ndefault_callback_url=\"https://example.com\",\n)\nwhile endpoint.status() != \"READY\":\ntime.sleep(10)\nfuture_default = endpoint.predict(\nrequest=EndpointRequest(args={\"x\": 2, \"y\": \"hello\"})\n)\n\"\"\"\nA callback is sent to https://example.com with the following JSON body:\n{\n \"task_id\": \"THE_TASK_ID\",\n \"result\": 7\n}\n\"\"\"\nfuture_custom_callback_url = endpoint.predict(\nrequest=EndpointRequest(\nargs={\"x\": 3, \"y\": \"hello\"}, callback_url=\"https://example.com/custom\"\n),\n)\n\"\"\"\nA callback is sent to https://example.com/custom with the following JSON body:\n{\n \"task_id\": \"THE_TASK_ID\",\n \"result\": 8\n}\n\"\"\"\n"},{"location":"concepts/callbacks/#authentication-for-callbacks","title":"Authentication for callbacks","text":"Warning
This feature is currently in beta, and the API is likely to change.
Callbacks can be authenticated using shared authentication headers. To enable authentication, set either default_callback_auth_kind when creating the endpoint or callback_auth_kind when making a prediction request.
Currently, the supported authentication methods are basic and mtls. If basic is used, then the default_callback_auth_username and default_callback_auth_password fields must be specified when creating the endpoint, or the callback_auth_username and callback_auth_password fields must be specified when making a prediction request. If mtls is used, then the same is true for the default_callback_auth_cert and default_callback_auth_key fields, or the callback_auth_cert and callback_auth_key fields.
import os\nimport time\nfrom launch import EndpointRequest, LaunchClient, PostInferenceHooks\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nendpoint = client.create_model_endpoint(\nendpoint_name=\"demo-endpoint-callback\",\nmodel_bundle=\"test-bundle\",\ncpus=1,\nmin_workers=1,\nendpoint_type=\"async\",\nupdate_if_exists=True,\nlabels={\n\"team\": \"MY_TEAM\",\n\"product\": \"MY_PRODUCT\",\n},\npost_inference_hooks=[PostInferenceHooks.CALLBACK],\ndefault_callback_url=\"https://example.com\",\ndefault_callback_auth_kind=\"basic\",\ndefault_callback_auth_username=\"user\",\ndefault_callback_auth_password=\"password\",\n)\nwhile endpoint.status() != \"READY\":\ntime.sleep(10)\nfuture_default = endpoint.predict(\nrequest=EndpointRequest(args={\"x\": 2, \"y\": \"hello\"})\n)\n\"\"\"\nA callback is sent to https://example.com with (\"user\", \"password\") as the basic auth.\n\"\"\"\nfuture_custom_callback_auth = endpoint.predict(\nrequest=EndpointRequest(\nargs={\"x\": 3, \"y\": \"hello\"},\ncallback_auth_kind=\"mtls\", \ncallback_auth_cert=\"cert\", \ncallback_auth_key=\"key\",\n),\n)\n\"\"\"\nA callback is sent with mTLS authentication.\n\"\"\"\nclient.edit_model_endpoint(\nmodel_endpoint=endpoint.model_endpoint,\ndefault_callback_auth_kind=\"mtls\",\ndefault_callback_auth_cert=\"cert\",\ndefault_callback_auth_key=\"key\",\n)\nwhile endpoint.status() != \"READY\":\ntime.sleep(10)\nfuture_default = endpoint.predict(\nrequest=EndpointRequest(args={\"x\": 2, \"y\": \"hello\"})\n)\n\"\"\"\nA callback is sent with mTLS auth.\n\"\"\"\nfuture_custom_callback_auth = endpoint.predict(\nrequest=EndpointRequest(\nargs={\"x\": 3, \"y\": \"hello\"},\ncallback_auth_kind=\"basic\",\ncallback_auth_username=\"user\",\ncallback_auth_password=\"pass\",\n),\n)\n\"\"\"\nA callback is sent with (\"user\", \"pass\") as the basic auth.\n\"\"\"\n"},{"location":"concepts/endpoint_predictions/","title":"Endpoint Predictions","text":"Once endpoints have been created, users can send tasks to them to make predictions. The following code snippet shows how to send tasks to endpoints.
Sending a Task to an Async EndpointSending a Task to a Sync EndpointSending a Task to a Streaming Endpointimport os\nfrom launch import EndpointRequest, LaunchClient\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nendpoint = client.get_model_endpoint(\"demo-endpoint-async\")\nfuture = endpoint.predict(request=EndpointRequest(args={\"x\": 2, \"y\": \"hello\"}))\nresponse = future.get()\nprint(response)\n import os\nfrom launch import EndpointRequest, LaunchClient\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nendpoint = client.get_model_endpoint(\"demo-endpoint-sync\")\nresponse = endpoint.predict(request=EndpointRequest(args={\"x\": 2, \"y\": \"hello\"}))\nprint(response)\n import os\nfrom launch import EndpointRequest, LaunchClient\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nendpoint = client.get_model_endpoint(\"demo-endpoint-streaming\")\nresponse = endpoint.predict(request=EndpointRequest(args={\"x\": 2, \"y\": \"hello\"}))\nfor chunk in response:\nprint(chunk)\n"},{"location":"concepts/endpoint_predictions/#launch.model_endpoint.EndpointRequest","title":"EndpointRequest","text":"EndpointRequest(url: Optional[str] = None, args: Optional[Dict] = None, callback_url: Optional[str] = None, callback_auth_kind: Optional[Literal['basic', 'mtls']] = None, callback_auth_username: Optional[str] = None, callback_auth_password: Optional[str] = None, callback_auth_cert: Optional[str] = None, callback_auth_key: Optional[str] = None, return_pickled: Optional[bool] = False, request_id: Optional[str] = None)\n Represents a single request to either a SyncEndpoint, StreamingEndpoint, or AsyncEndpoint.
Parameters:
Name Type Description Defaulturl Optional[str] A url to some file that can be read in to a ModelBundle's predict function. Can be an image, raw text, etc. Note: the contents of the file located at url are opened as a sequence of bytes and passed to the predict function. If you instead want to pass the url itself as an input to the predict function, see args.
Exactly one of url and args must be specified.
None args Optional[Dict] A Dictionary with arguments to a ModelBundle's predict function. If the predict function has signature predict_fn(foo, bar), then the keys in the dictionary should be \"foo\" and \"bar\". Values must be native Python objects.
Exactly one of url and args must be specified.
None return_pickled Optional[bool] Whether the output should be a pickled Python object, or returned directly as serialized JSON.
False callback_url Optional[str] The callback url to use for this task. If None, then the default_callback_url of the endpoint is used. The endpoint must specify \"callback\" as a post-inference hook for the callback to be triggered.
None callback_auth_kind Optional[Literal['basic', 'mtls']] The callback auth kind to use for this task. Either "basic" or "mtls". If None, the endpoint's default callback auth settings are used.
None callback_auth_username Optional[str] The callback auth username to use for this task. This only applies if callback_auth_kind is "basic".
None callback_auth_password Optional[str] The callback auth password to use for this task. This only applies if callback_auth_kind is "basic".
None callback_auth_cert Optional[str] The callback auth cert to use for this task. This only applies if callback_auth_kind is "mtls".
None callback_auth_key Optional[str] The callback auth key to use for this task. This only applies if callback_auth_kind is "mtls".
None request_id Optional[str] (deprecated) A user-specifiable id for requests. Should be unique among EndpointRequests made in the same batch call. If one isn't provided the client will generate its own.
None"},{"location":"concepts/endpoint_predictions/#launch.model_endpoint.EndpointResponseFuture","title":"EndpointResponseFuture","text":"EndpointResponseFuture(client, endpoint_name: str, async_task_id: str)\n Represents a future response from an Endpoint. Specifically, when the EndpointResponseFuture is ready, then its get method will return an actual instance of EndpointResponse.
This object should not be directly instantiated by the user.
Parameters:
Name Type Description Defaultclient An instance of LaunchClient.
endpoint_name str The name of the endpoint.
requiredasync_task_id str An async task id.
required"},{"location":"concepts/endpoint_predictions/#launch.model_endpoint.EndpointResponseFuture.get","title":"get","text":"get(timeout: Optional[float] = None) -> EndpointResponse\n Retrieves the EndpointResponse for the prediction request after it completes. This method blocks.
Parameters:
Name Type Description Defaulttimeout Optional[float] The maximum number of seconds to wait for the response. If None, then the method will block indefinitely until the response is ready.
None"},{"location":"concepts/endpoint_predictions/#launch.model_endpoint.EndpointResponse","title":"EndpointResponse","text":"EndpointResponse(client, status: str, result_url: Optional[str] = None, result: Optional[str] = None, traceback: Optional[str] = None)\n Represents a response received from a Endpoint.
Parameters:
Name Type Description Defaultclient An instance of LaunchClient.
status str A string representing the status of the request, i.e. SUCCESS, FAILURE, or PENDING
result_url Optional[str] A string that is a url containing the pickled python object from the Endpoint's predict function.
Exactly one of result_url or result will be populated, depending on the value of return_pickled in the request.
None result Optional[str] A string that is the serialized return value (in json form) of the Endpoint's predict function. Specifically, one can json.loads() the value of result to get the original python object back.
Exactly one of result_url or result will be populated, depending on the value of return_pickled in the request.
None traceback Optional[str] The stack trace if the inference endpoint raised an error. Can be used for debugging
None"},{"location":"concepts/endpoint_predictions/#launch.model_endpoint.EndpointResponseStream","title":"EndpointResponseStream","text":"EndpointResponseStream(response)\n Bases: Iterator
Represents a stream response from an Endpoint. This object is iterable and yields EndpointResponse objects.
This object should not be directly instantiated by the user.
"},{"location":"concepts/endpoint_predictions/#launch.model_endpoint.EndpointResponseStream.__iter__","title":"__iter__","text":"__iter__()\n Uses server-sent events to iterate through the stream.
"},{"location":"concepts/endpoint_predictions/#launch.model_endpoint.EndpointResponseStream.__next__","title":"__next__","text":"__next__()\n Uses server-sent events to iterate through the stream.
"},{"location":"concepts/model_bundles/","title":"Model Bundles","text":"Model Bundles are deployable models that can be used to make predictions. They are created by packaging a model up into a deployable format.
"},{"location":"concepts/model_bundles/#creating-model-bundles","title":"Creating Model Bundles","text":"There are five methods for creating model bundles: create_model_bundle_from_callable_v2, create_model_bundle_from_dirs_v2, create_model_bundle_from_runnable_image_v2, create_model_bundle_from_triton_enhanced_runnable_image_v2, and create_model_bundle_from_streaming_enhanced_runnable_image_v2.
The first directly pickles a user-specified load_predict_fn, a function which loads the model and returns a predict_fn, a function which takes in a request. The second takes in directories containing a load_predict_fn and the module path to the load_predict_fn. The third takes a Docker image and a command that starts a process listening for HTTP requests on port 5005 and exposing POST /predict and GET /readyz endpoints. The fourth is a variant of the third that also starts an instance of the NVIDIA Triton framework for efficient model serving. The fifth is a variant of the third that responds with a stream of SSEs at POST /stream (the user can decide whether POST /predict is also exposed).
Each of these modes of creating a model bundle is called a \"Flavor\".
Info
Creating From CallablesCreating From DirectoriesCreating From a Runnable ImageCreating From a Triton Enhanced Runnable ImageCreating From a Streaming Enhanced Runnable Imageimport os\nfrom pydantic import BaseModel\nfrom launch import LaunchClient\nclass MyRequestSchema(BaseModel):\nx: int\ny: str\nclass MyResponseSchema(BaseModel):\n__root__: int\ndef my_load_predict_fn(model):\ndef returns_model_of_x_plus_len_of_y(x: int, y: str) -> int:\n\"\"\"MyRequestSchema -> MyResponseSchema\"\"\"\nassert isinstance(x, int) and isinstance(y, str)\nreturn model(x) + len(y)\nreturn returns_model_of_x_plus_len_of_y\ndef my_load_model_fn():\ndef my_model(x):\nreturn x * 2\nreturn my_model\nBUNDLE_PARAMS = {\n\"model_bundle_name\": \"test-bundle\",\n\"load_model_fn\": my_load_model_fn,\n\"load_predict_fn\": my_load_predict_fn,\n\"request_schema\": MyRequestSchema,\n\"response_schema\": MyResponseSchema,\n\"requirements\": [\"pytest==7.2.1\", \"numpy\"], # list your requirements here\n\"pytorch_image_tag\": \"1.7.1-cuda11.0-cudnn8-runtime\",\n}\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nclient.create_model_bundle_from_callable_v2(**BUNDLE_PARAMS)\n import os\nimport tempfile\nfrom pydantic import BaseModel\nfrom launch import LaunchClient\ndirectory = tempfile.mkdtemp()\nmodel_filename = os.path.join(directory, \"model.py\")\nwith open(model_filename, \"w\") as f:\nf.write(\"\"\"def my_load_model_fn(deserialized_config):\n def my_model(x):\n return x * 2\n return my_model\n\"\"\")\npredict_filename = os.path.join(directory, \"predict.py\")\nwith open(predict_filename, \"w\") as f:\nf.write(\"\"\"def my_load_predict_fn(deserialized_config, model):\n def returns_model_of_x_plus_len_of_y(x: int, y: str) -> int:\n assert isinstance(x, int) and isinstance(y, str)\n return model(x) + len(y)\n return returns_model_of_x_plus_len_of_y\n\"\"\")\nrequirements_filename = os.path.join(directory, \"requirements.txt\")\nwith open(requirements_filename, \"w\") as f:\nf.write(\"\"\"\npytest==7.2.1\nnumpy\n\"\"\")\n\"\"\"\nThe directory structure should now look like\ndirectory/\n model.py\n predict.py\n requirements.txt\n\"\"\"\nclass MyRequestSchema(BaseModel):\nx: int\ny: str\nclass MyResponseSchema(BaseModel):\n__root__: int\nprint(directory)\nprint(model_filename)\nprint(predict_filename)\nprint(requirements_filename)\nBUNDLE_PARAMS = {\n\"model_bundle_name\": \"test-bundle-from-dirs\",\n\"base_paths\": [directory],\n\"load_predict_fn_module_path\": f\"{os.path.basename(directory)}.predict.my_load_predict_fn\",\n\"load_model_fn_module_path\": f\"{os.path.basename(directory)}.model.my_load_model_fn\",\n\"request_schema\": MyRequestSchema,\n\"response_schema\": MyResponseSchema,\n\"requirements_path\": requirements_filename,\n\"pytorch_image_tag\": \"1.7.1-cuda11.0-cudnn8-runtime\",\n}\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nclient.create_model_bundle_from_dirs_v2(**BUNDLE_PARAMS)\n# Clean up files from demo\nos.remove(model_filename)\nos.remove(predict_filename)\nos.remove(requirements_filename)\nos.rmdir(directory)\n import os\nfrom pydantic import BaseModel\nfrom launch import LaunchClient\nclass MyRequestSchema(BaseModel):\nx: int\ny: str\nclass MyResponseSchema(BaseModel):\n__root__: int\nBUNDLE_PARAMS = {\n\"model_bundle_name\": \"test-bundle\",\n\"request_schema\": MyRequestSchema,\n\"response_schema\": MyResponseSchema,\n\"repository\": \"...\",\n\"tag\": \"...\",\n\"command\": ...,\n\"predict_route\": \"/predict\",\n\"healthcheck_route\": \"/readyz\",\n\"env\": 
{\n\"TEST_KEY\": \"test_value\",\n},\n\"readiness_initial_delay_seconds\": 30,\n}\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nclient.create_model_bundle_from_runnable_image_v2(**BUNDLE_PARAMS)\n import os\nfrom pydantic import BaseModel\nfrom launch import LaunchClient\nclass MyRequestSchema(BaseModel):\nx: int\ny: str\nclass MyResponseSchema(BaseModel):\n__root__: int\nBUNDLE_PARAMS = {\n\"model_bundle_name\": \"test-triton-bundle\",\n\"request_schema\": MyRequestSchema,\n\"response_schema\": MyResponseSchema,\n\"repository\": \"...\",\n\"tag\": \"...\",\n\"command\": ...,\n\"predict_route\": \"/predict\",\n\"healthcheck_route\": \"/readyz\",\n\"env\": {\n\"TEST_KEY\": \"test_value\",\n},\n\"readiness_initial_delay_seconds\": 30,\n\"triton_model_repository\": \"...\",\n\"triton_model_replicas\": {\"\": \"\"},\n\"triton_num_cpu\": 4.0,\n\"triton_commit_tag\": \"\",\n\"triton_storage\": \"\",\n\"triton_memory\": \"\",\n\"triton_readiness_initial_delay_seconds\": 300,\n}\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nclient.create_model_bundle_from_triton_enhanced_runnable_image_v2(**BUNDLE_PARAMS)\n import os\nfrom pydantic import BaseModel\nfrom launch import LaunchClient\nclass MyRequestSchema(BaseModel):\nx: int\ny: str\nclass MyResponseSchema(BaseModel):\n__root__: int\nBUNDLE_PARAMS = {\n\"model_bundle_name\": \"test-streaming-bundle\",\n\"request_schema\": MyRequestSchema,\n\"response_schema\": MyResponseSchema,\n\"repository\": \"...\",\n\"tag\": \"...\",\n\"command\": ..., # optional; if provided, will also expose the /predict endpoint\n\"predict_route\": \"/predict\",\n\"healthcheck_route\": \"/readyz\",\n\"streaming_command\": ..., # required\n\"streaming_predict_route\": \"/stream\",\n\"env\": {\n\"TEST_KEY\": \"test_value\",\n},\n\"readiness_initial_delay_seconds\": 30,\n}\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nclient.create_model_bundle_from_streaming_enhanced_runnable_image_v2(**BUNDLE_PARAMS)\n"},{"location":"concepts/model_bundles/#choosing-the-right-model-bundle-flavor","title":"Choosing the right model bundle flavor","text":"Here are some tips for how to choose between the different flavors of ModelBundle:
A CloudpickleArtifactFlavor (creating from callable) is good if:
A ZipArtifactFlavor (creating from directories) is good if:
A RunnableImageFlavor (creating from runnable image) is good if:
A TritonEnhancedRunnableImageFlavor (a runnable image variant) is good if:
your use case otherwise fits a RunnableImageFlavor, and you want to use tritonserver to accelerate model inference.
A StreamingEnhancedRunnableImageFlavor (a runnable image variant) is good if: your use case otherwise fits a RunnableImageFlavor, and you need responses streamed back as server-sent events.
The app_config field of a model bundle is a dictionary that can be used to configure the model bundle. If specified, the app_config is passed to the load_predict_fn when the model bundle is deployed, alongside the model. This allows for more code reuse between multiple bundles that perform similar tasks.
import os\nfrom launch import LaunchClient\nfrom pydantic import BaseModel\nfrom typing import List, Union\nfrom typing_extensions import Literal\nclass MyRequestSchemaSingle(BaseModel):\nkind: Literal['single']\nx: int\ny: str\nclass MyRequestSchemaBatched(BaseModel):\nkind: Literal['batched']\nx: List[int]\ny: List[str]\nclass MyRequestSchema(BaseModel):\n__root__: Union[MyRequestSchemaSingle, MyRequestSchemaBatched]\nclass MyResponseSchema(BaseModel):\n__root__: Union[int, List[int]]\ndef my_load_predict_fn(app_config, model):\ndef returns_model_of_x_plus_len_of_y(x: Union[int, List[int]], y: Union[str, List[str]]) -> Union[int, List[int]]:\n\"\"\"MyRequestSchema -> MyResponseSchema\"\"\"\nif app_config[\"mode\"] == \"single\":\nassert isinstance(x, int) and isinstance(y, str)\nreturn model(x) + len(y)\nresult = []\nfor x_i, y_i in zip(x, y):\nresult.append(model(x_i) + len(y_i))\nreturn result\nreturn returns_model_of_x_plus_len_of_y\ndef my_load_model_fn(app_config):\ndef my_model_single(x: int):\nreturn x * 2\ndef my_model_batched(x: List[int]):\nreturn [my_model_single(x_i) for x_i in x]\nif app_config[\"mode\"] == \"single\":\nreturn my_model_single\nreturn my_model_batched\nBUNDLE_PARAMS_SINGLE = {\n\"model_bundle_name\": \"test-bundle-single\",\n\"load_predict_fn\": my_load_predict_fn,\n\"load_model_fn\": my_load_model_fn,\n\"requirements\": [\"pytest==7.2.1\", \"numpy\"],\n\"request_schema\": MyRequestSchema,\n\"response_schema\": MyResponseSchema,\n\"pytorch_image_tag\": \"1.7.1-cuda11.0-cudnn8-runtime\",\n\"app_config\": {\"mode\": \"single\"},\n}\nBUNDLE_PARAMS_BATCHED = {\n\"model_bundle_name\": \"test-bundle-batched\",\n\"load_predict_fn\": my_load_predict_fn,\n\"load_model_fn\": my_load_model_fn,\n\"requirements\": [\"pytest==7.2.1\", \"numpy\"],\n\"request_schema\": MyRequestSchema,\n\"response_schema\": MyResponseSchema,\n\"pytorch_image_tag\": \"1.7.1-cuda11.0-cudnn8-runtime\",\n\"app_config\": {\"mode\": \"batched\"},\n}\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nbundle_single = client.create_model_bundle_from_callable_v2(**BUNDLE_PARAMS_SINGLE)\nbundle_batch = client.create_model_bundle_from_callable_v2(**BUNDLE_PARAMS_BATCHED)\n"},{"location":"concepts/model_bundles/#updating-model-bundles","title":"Updating Model Bundles","text":"Model Bundles are immutable, meaning they cannot be edited once created. However, it is possible to clone an existing model bundle with a new app_config using clone_model_bundle_with_changes_v2.
To list all the model bundles you own, use list_model_bundles_v2.
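A hedged sketch of the listing and cloning workflow described above. The method names list_model_bundles_v2 and clone_model_bundle_with_changes_v2 come from this page, but the clone parameters shown here (model_bundle, app_config) are assumptions for illustration and may differ from the actual client signature.
import os

from launch import LaunchClient

client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))

# List the model bundles you own.
bundles = client.list_model_bundles_v2()
print(bundles)

# Clone an existing bundle with a different app_config (parameter names assumed).
client.clone_model_bundle_with_changes_v2(
    model_bundle="test-bundle-single",
    app_config={"mode": "batched"},
)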
"},{"location":"concepts/model_endpoints/","title":"Model Endpoints","text":"Model Endpoints are deployments of models that can receive requests and return predictions containing the results of the model's inference. Each model endpoint is associated with a model bundle, which contains the model's code. An endpoint specifies deployment parameters, such as the minimum and maximum number of workers, as well as the requested resources for each worker, such as the number of CPUs, amount of memory, GPU count, and type of GPU.
Endpoints can be asynchronous, synchronous, or streaming. Asynchronous endpoints return a future immediately after receiving a request, and the future can be used to retrieve the prediction once it is ready. Synchronous endpoints return the prediction directly after receiving a request. Streaming endpoints are variants of synchronous endpoints that return a stream of SSEs instead of a single HTTP response.
Info
"},{"location":"concepts/model_endpoints/#choosing-the-right-inference-mode","title":"Choosing the right inference mode","text":"Here are some tips for how to choose between SyncEndpoint, StreamingEndpoint, AsyncEndpoint, and BatchJob for deploying your ModelBundle:
A SyncEndpoint is good if:
A StreamingEndpoint is good if:
An AsyncEndpoint is good if:
A BatchJob is good if:
"},{"location":"concepts/model_endpoints/#creating-async-model-endpoints","title":"Creating Async Model Endpoints","text":"Async model endpoints are the most cost-efficient way to perform inference on tasks that are less latency-sensitive.
Creating an Async Model Endpointimport os\nfrom launch import LaunchClient\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nendpoint = client.create_model_endpoint(\nendpoint_name=\"demo-endpoint-async\",\nmodel_bundle=\"test-bundle\",\ncpus=1,\nmin_workers=0,\nendpoint_type=\"async\",\nupdate_if_exists=True,\nlabels={\n\"team\": \"MY_TEAM\",\n\"product\": \"MY_PRODUCT\",\n},\n)\n"},{"location":"concepts/model_endpoints/#creating-sync-model-endpoints","title":"Creating Sync Model Endpoints","text":"Sync model endpoints are useful for latency-sensitive tasks, such as real-time inference. Sync endpoints are more expensive than async endpoints.
Note
Sync model endpoints require at least 1 min_worker.
import os\nfrom launch import LaunchClient\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nendpoint = client.create_model_endpoint(\nendpoint_name=\"demo-endpoint-sync\",\nmodel_bundle=\"test-bundle\",\ncpus=1,\nmin_workers=1,\nendpoint_type=\"sync\",\nupdate_if_exists=True,\nlabels={\n\"team\": \"MY_TEAM\",\n\"product\": \"MY_PRODUCT\",\n},\n)\n"},{"location":"concepts/model_endpoints/#creating-streaming-model-endpoints","title":"Creating Streaming Model Endpoints","text":"Streaming model endpoints are variants of sync model endpoints that are useful for tasks with strict requirements on perceived latency. Streaming endpoints are more expensive than async endpoints.
Note
Streaming model endpoints require at least 1 min_worker.
import os\nfrom launch import LaunchClient\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nendpoint = client.create_model_endpoint(\nendpoint_name=\"demo-endpoint-streaming\",\nmodel_bundle=\"test-streaming-bundle\",\ncpus=1,\nmin_workers=1,\nper_worker=1,\nendpoint_type=\"streaming\",\nupdate_if_exists=True,\nlabels={\n\"team\": \"MY_TEAM\",\n\"product\": \"MY_PRODUCT\",\n},\n)\n"},{"location":"concepts/model_endpoints/#managing-model-endpoints","title":"Managing Model Endpoints","text":"Model endpoints can be listed, updated, and deleted using the Launch API.
Listing Model Endpointsimport os\nfrom launch import LaunchClient\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nendpoints = client.list_model_endpoints()\n Updating a Model Endpointimport os\nfrom launch import LaunchClient\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nclient.edit_model_endpoint(\nmodel_endpoint=\"demo-endpoint-sync\",\nmax_workers=2,\n)\n Deleting a Model Endpointimport time\nimport os\nfrom launch import LaunchClient\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nendpoint = client.create_model_endpoint(\nendpoint_name=\"demo-endpoint-tmp\",\nmodel_bundle=\"test-bundle\",\ncpus=1,\nmin_workers=0,\nendpoint_type=\"async\",\nupdate_if_exists=True,\nlabels={\n\"team\": \"MY_TEAM\",\n\"product\": \"MY_PRODUCT\",\n},\n)\ntime.sleep(15) # Wait for Launch to build the endpoint\nclient.delete_model_endpoint(model_endpoint_name=\"demo-endpoint-tmp\")\n"},{"location":"concepts/overview/","title":"Overview","text":"Creating deployments on Launch generally involves three steps:
Create and upload a ModelBundle. Pass your trained model as well as pre-/post-processing code to the Scale Launch Python client, and we\u2019ll create a model bundle based on the code and store it in our Bundle Store.
Create a ModelEndpoint. Pass a ModelBundle as well as infrastructure settings such as the desired number of GPUs to our client. This provisions resources on Scale\u2019s cluster dedicated to your ModelEndpoint.
Make requests to the ModelEndpoint. You can make requests through the Python client, or make HTTP requests directly to Scale.
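As a rough end-to-end sketch of these three steps, condensed from the examples elsewhere on this site; the bundle and endpoint names are illustrative.
import os
import time

from pydantic import BaseModel

from launch import EndpointRequest, LaunchClient

class MyRequestSchema(BaseModel):
    x: int
    y: str

class MyResponseSchema(BaseModel):
    __root__: int

def my_load_model_fn():
    def my_model(x):
        return x * 2
    return my_model

def my_load_predict_fn(model):
    def returns_model_of_x_plus_len_of_y(x: int, y: str) -> int:
        return model(x) + len(y)
    return returns_model_of_x_plus_len_of_y

client = LaunchClient(api_key=os.getenv("LAUNCH_API_KEY"))

# Step 1: create and upload a ModelBundle.
client.create_model_bundle_from_callable_v2(
    model_bundle_name="overview-bundle",
    load_model_fn=my_load_model_fn,
    load_predict_fn=my_load_predict_fn,
    request_schema=MyRequestSchema,
    response_schema=MyResponseSchema,
    requirements=["pytest==7.2.1", "numpy"],
    pytorch_image_tag="1.7.1-cuda11.0-cudnn8-runtime",
)

# Step 2: create a ModelEndpoint backed by that bundle.
endpoint = client.create_model_endpoint(
    endpoint_name="overview-endpoint",
    model_bundle="overview-bundle",
    cpus=1,
    min_workers=1,
    endpoint_type="async",
    update_if_exists=True,
    labels={"team": "MY_TEAM", "product": "MY_PRODUCT"},
)
while endpoint.status() != "READY":
    time.sleep(10)

# Step 3: make a request to the ModelEndpoint.
future = endpoint.predict(request=EndpointRequest(args={"x": 2, "y": "hello"}))
print(future.get())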
Warning
This feature is currently in beta, and the API is likely to change. Please contact us if you are interested in using this feature.
If you need more customization than what cloudpickle or zip artifacts can offer, or if you already have a pre-built Docker image, you can create a Model Bundle from that Docker image. You will need to modify your image to run a web server that exposes HTTP port 5005.
In our example below, we assume that you have some existing Python function my_inference_fn that can be imported. If you need to invoke some other binary (e.g. a custom C++ binary), then you can shell out to the OS to call that binary; subsequent versions of this document will have native examples for non-Python binaries.
For choice of web server, we recommend FastAPI due to its speed and ergonomics. Any web server would work, although we give examples with FastAPI.
"},{"location":"guides/custom_docker_images/#step-1-install-requirements","title":"Step 1: Install Requirements","text":"You can add fastapi and uvicorn to the requirements.txt file that gets installed as part of your Dockerfile. Alternatively, you can add pip install fastapi uvicorn to the Dockerfile directly.
Inside your project workspace, create a server.py file with these contents:
# test='skip'\nfrom fastapi import FastAPI\nfrom pydantic import BaseModel\napp = FastAPI()\nclass MyRequestSchema(BaseModel):\nurl: str\nclass MyResponseSchema(BaseModel):\nresponse: str\ndef my_inference_fn(req: MyRequestSchema) -> MyResponseSchema:\n# This is an example inference function - you can instead import a function from your own codebase,\n# or shell out to the OS, etc.\nresp = req.url + \"_hello\"\nreturn MyResponseSchema(response=resp)\n@app.post(\"/predict\")\nasync def predict(request: MyRequestSchema) -> MyResponseSchema:\nresponse = my_inference_fn(request)\nreturn response\n@app.get(\"/readyz\")\ndef readyz():\nreturn \"ok\"\n"},{"location":"guides/custom_docker_images/#step-3-rebuild-and-push-your-image","title":"Step 3: Rebuild and push your image","text":"Build your updated Dockerfile and push the image to a location that is accessible by Scale. For instance, if you are using AWS ECR, please make sure that the necessary cross-account permissions allow Scale to pull your docker image.
"},{"location":"guides/custom_docker_images/#step-4-deploy","title":"Step 4: Deploy!","text":"Now you can upload your docker image as a Model Bundle, and then create a Model Endpoint referencing that Model Bundle. Note that path.to.your.server.file:app in the command section below should be relative to the WORKDIR of your docker image.
# test='skip'\nimport os\nfrom launch import LaunchClient\nfrom server import MyRequestSchema, MyResponseSchema # Defined as part of your server.py\nclient = LaunchClient(api_key=os.getenv(\"LAUNCH_API_KEY\"))\nmodel_bundle_name = \"my_bundle_name\"\nclient.create_model_bundle_from_runnable_image_v2(\nmodel_bundle_name=model_bundle_name,\nrequest_schema=MyRequestSchema,\nresponse_schema=MyResponseSchema,\nrepository=\"$YOUR_ECR_REPO\",\ntag=\"$YOUR_IMAGE_TAG\",\ncommand=[\n\"dumb-init\",\n\"--\",\n\"uvicorn\",\n\"path.to.your.server.file:app\",\n\"--port\",\n\"5005\",\n\"--host\",\n\"::\",\n],\npredict_route=\"/predict\",\nhealthcheck_route=\"/readyz\",\nreadiness_initial_delay_seconds=120,\nenv={},\n)\nclient.create_model_endpoint(\nendpoint_name=f\"endpoint-{model_bundle_name}\",\nmodel_bundle=model_bundle_name,\nendpoint_type=\"async\",\nmin_workers=0,\nmax_workers=1,\nper_worker=1,\nmemory=\"30Gi\",\nstorage=\"40Gi\",\ncpus=4, # This must be at least 2 because forwarding services consume 1 cpu.\ngpus=1,\ngpu_type=\"nvidia-ampere-a10\",\nupdate_if_exists=True,\n)\n"}]}
\ No newline at end of file
diff --git a/sitemap.xml b/sitemap.xml
new file mode 100644
index 00000000..7b5d1a63
--- /dev/null
+++ b/sitemap.xml
@@ -0,0 +1,78 @@
+
+