diff --git a/.env.example b/.env.example index 01dc40e8..47311d1e 100644 --- a/.env.example +++ b/.env.example @@ -1,30 +1,33 @@ # Azure Subscription Variables SUBSCRIPTION_ID = '' -LOCATION = 'westeurope' +LOCATION = '' TENANT_ID = '' BASE_NAME = '' SP_APP_ID = '' SP_APP_SECRET = '' -RESOUCE_GROUP = 'mlops-rg' +RESOURCE_GROUP = 'mlops-RG' # Mock build/release ID for local testing BUILD_BUILDID = '001' # Azure ML Workspace Variables -WORKSPACE_NAME = 'aml-workspace' -EXPERIMENT_NAME = '' +WORKSPACE_NAME = 'mlops-aml-ws' +EXPERIMENT_NAME = 'mlopspython' # AML Compute Cluster Config AML_ENV_NAME='diabetes_regression_training_env' +AML_ENV_TRAIN_CONDA_DEP_FILE="conda_dependencies.yml" AML_COMPUTE_CLUSTER_NAME = 'train-cluster' AML_COMPUTE_CLUSTER_CPU_SKU = 'STANDARD_DS2_V2' AML_CLUSTER_MAX_NODES = '4' AML_CLUSTER_MIN_NODES = '0' AML_CLUSTER_PRIORITY = 'lowpriority' # Training Config -MODEL_NAME = 'sklearn_regression_model.pkl' +MODEL_NAME = 'diabetes_regression_model.pkl' MODEL_VERSION = '1' -TRAIN_SCRIPT_PATH = 'training/train.py' +TRAIN_SCRIPT_PATH = 'training/train_aml.py' + + # AML Pipeline Config TRAINING_PIPELINE_NAME = 'Training Pipeline' MODEL_PATH = '' @@ -51,3 +54,28 @@ ALLOW_RUN_CANCEL = 'true' # Flag to allow rebuilding the AML Environment after it was built for the first time. This enables dependency updates from conda_dependencies.yaml. AML_REBUILD_ENVIRONMENT = 'false' + + + +USE_GPU_FOR_SCORING = "false" +AML_ENV_SCORE_CONDA_DEP_FILE="conda_dependencies_scoring.yml" +AML_ENV_SCORECOPY_CONDA_DEP_FILE="conda_dependencies_scorecopy.yml" +# AML Compute Cluster Config for parallel batch scoring +AML_ENV_NAME_SCORING='diabetes_regression_scoring_env' +AML_ENV_NAME_SCORE_COPY='diabetes_regression_score_copy_env' +AML_COMPUTE_CLUSTER_NAME_SCORING = 'score-cluster' +AML_COMPUTE_CLUSTER_CPU_SKU_SCORING = 'STANDARD_DS2_V2' +AML_CLUSTER_MAX_NODES_SCORING = '4' +AML_CLUSTER_MIN_NODES_SCORING = '0' +AML_CLUSTER_PRIORITY_SCORING = 'lowpriority' +AML_REBUILD_ENVIRONMENT_SCORING = 'true' +BATCHSCORE_SCRIPT_PATH = 'scoring/parallel_batchscore.py' +BATCHSCORE_COPY_SCRIPT_PATH = 'scoring/parallel_batchscore_copyoutput.py' + + +SCORING_DATASTORE_INPUT_CONTAINER = 'input' +SCORING_DATASTORE_INPUT_FILENAME = 'diabetes_scoring_input.csv' +SCORING_DATASTORE_OUTPUT_CONTAINER = 'output' +SCORING_DATASTORE_OUTPUT_FILENAME = 'diabetes_scoring_output.csv' +SCORING_DATASET_NAME = 'diabetes_scoring_ds' +SCORING_PIPELINE_NAME = 'diabetes-scoring-pipeline' diff --git a/.pipelines/diabetes_regression-batchscoring-ci.yml b/.pipelines/diabetes_regression-batchscoring-ci.yml new file mode 100644 index 00000000..1392fddb --- /dev/null +++ b/.pipelines/diabetes_regression-batchscoring-ci.yml @@ -0,0 +1,89 @@ +# Continuous Integration (CI) pipeline that orchestrates the batch scoring of the diabetes_regression model. + +# Runtime parameters to select artifacts +parameters: +- name : artifactBuildId + displayName: Model Train CI Build ID. Default is 'latest'. 
+ type: string + default: latest + +pr: none + +# Trigger this pipeline on model-train pipeline completion +resources: + containers: + - container: mlops + image: mcr.microsoft.com/mlops/python:latest + pipelines: + - pipeline: model-train-ci + source: Model-Train-Register-CI # Name of the triggering pipeline + trigger: + branches: + include: + - master + +trigger: + branches: + include: + - master + paths: + include: + - diabetes_regression/scoring/parallel_batchscore.py + - ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py + - ml_service/pipelines/run_parallel_batchscore_pipeline.py + +variables: +- template: diabetes_regression-variables-template.yml +- group: devopsforai-aml-vg + +pool: + vmImage: ubuntu-latest + +stages: +- stage: 'Batch_Scoring_Pipeline_CI' + displayName: 'Batch Scoring Pipeline CI' + jobs: + - job: "Build_Batch_Scoring_Pipeline" + displayName: "Build Batch Scoring Pipeline" + container: mlops + timeoutInMinutes: 0 + steps: + - template: code-quality-template.yml + - template: diabetes_regression-get-model-id-artifact-template.yml + parameters: + projectId: '$(resources.pipeline.model-train-ci.projectID)' + pipelineId: '$(resources.pipeline.model-train-ci.pipelineID)' + artifactBuildId: ${{ parameters.artifactBuildId }} + - task: AzureCLI@1 + displayName: "Publish Batch Scoring Pipeline" + name: publish_batchscore + inputs: + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + scriptLocation: inlineScript + workingDirectory: $(Build.SourcesDirectory) + inlineScript: | + set -e # fail on error + export SUBSCRIPTION_ID=$(az account show --query id -o tsv) + # Invoke the Python building and publishing a training pipeline + python -m ml_service.pipelines.diabetes_regression_build_parallel_batchscore_pipeline + env: + SCORING_DATASTORE_ACCESS_KEY: $(SCORING_DATASTORE_ACCESS_KEY) + + - job: "Run_Batch_Score_Pipeline" + displayName: "Run Batch Scoring Pipeline" + dependsOn: ["Build_Batch_Scoring_Pipeline"] + timeoutInMinutes: 240 + pool: server + variables: + pipeline_id: $[ dependencies.Build_Batch_Scoring_Pipeline.outputs['publish_batchscore.pipeline_id']] + model_name: $[ dependencies.Build_Batch_Scoring_Pipeline.outputs['get_model.MODEL_NAME']] + model_version: $[ dependencies.Build_Batch_Scoring_Pipeline.outputs['get_model.MODEL_VERSION']] + steps: + - task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0 + displayName: 'Invoke Batch Scoring pipeline' + inputs: + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + PipelineId: '$(pipeline_id)' + ExperimentName: '$(EXPERIMENT_NAME)' + PipelineParameters: '"ParameterAssignments": {"model_name": "$(model_name)", "model_version": "$(model_version)"}' + \ No newline at end of file diff --git a/.pipelines/diabetes_regression-cd.yml b/.pipelines/diabetes_regression-cd.yml new file mode 100644 index 00000000..a691cc47 --- /dev/null +++ b/.pipelines/diabetes_regression-cd.yml @@ -0,0 +1,161 @@ +# Continuous Integration (CI) pipeline that orchestrates the deployment of the diabetes_regression model. + +# Runtime parameters to select artifacts +parameters: +- name : artifactBuildId + displayName: Model Train CI Build ID. Default is 'latest'. 
+ type: string + default: latest + +pr: none + +# Trigger this pipeline on model-train pipeline completion +trigger: none +resources: + containers: + - container: mlops + image: mcr.microsoft.com/mlops/python:latest + pipelines: + - pipeline: model-train-ci + source: Model-Train-Register-CI # Name of the triggering pipeline + trigger: + branches: + include: + - master + +variables: +- template: diabetes_regression-variables-template.yml +- group: devopsforai-aml-vg + +stages: +- stage: 'Deploy_ACI' + displayName: 'Deploy to ACI' + condition: variables['ACI_DEPLOYMENT_NAME'] + jobs: + - job: "Deploy_ACI" + displayName: "Deploy to ACI" + container: mlops + timeoutInMinutes: 0 + steps: + - download: none + - template: diabetes_regression-get-model-id-artifact-template.yml + parameters: + projectId: '$(resources.pipeline.model-train-ci.projectID)' + pipelineId: '$(resources.pipeline.model-train-ci.pipelineID)' + artifactBuildId: ${{ parameters.artifactBuildId }} + - task: AzureCLI@1 + displayName: 'Install AzureML CLI' + inputs: + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + scriptLocation: inlineScript + workingDirectory: $(Build.SourcesDirectory) + inlineScript: 'az extension add --source https://azurecliext.blob.core.windows.net/release/azure_cli_ml-1.27.0-py3-none-any.whl --yes' + - task: AzureCLI@1 + displayName: "Deploy to ACI (CLI)" + inputs: + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + scriptLocation: inlineScript + workingDirectory: $(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/scoring + inlineScript: | + set -e # fail on error + + az ml model deploy --name $(ACI_DEPLOYMENT_NAME) --model '$(MODEL_NAME):$(get_model.MODEL_VERSION)' \ + --ic inference_config.yml \ + --dc deployment_config_aci.yml \ + -g $(RESOURCE_GROUP) --workspace-name $(WORKSPACE_NAME) \ + --overwrite -v + - task: AzureCLI@1 + displayName: 'Smoke test' + inputs: + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + scriptLocation: inlineScript + inlineScript: | + set -e # fail on error + export SUBSCRIPTION_ID=$(az account show --query id -o tsv) + python -m ml_service.util.smoke_test_scoring_service --type ACI --service "$(ACI_DEPLOYMENT_NAME)" + +- stage: 'Deploy_AKS' + displayName: 'Deploy to AKS' + dependsOn: Deploy_ACI + condition: and(succeeded(), variables['AKS_DEPLOYMENT_NAME']) + jobs: + - job: "Deploy_AKS" + displayName: "Deploy to AKS" + container: mlops + timeoutInMinutes: 0 + steps: + - template: diabetes_regression-get-model-id-artifact-template.yml + parameters: + projectId: '$(resources.pipeline.model-train-ci.projectID)' + pipelineId: '$(resources.pipeline.model-train-ci.pipelineID)' + artifactBuildId: ${{ parameters.artifactBuildId }} + - task: AzureCLI@1 + displayName: 'Install AzureML CLI' + inputs: + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + scriptLocation: inlineScript + workingDirectory: $(Build.SourcesDirectory) + inlineScript: 'az extension add --source https://azurecliext.blob.core.windows.net/release/azure_cli_ml-1.27.0-py3-none-any.whl --yes' + - task: AzureCLI@1 + displayName: "Deploy to AKS (CLI)" + inputs: + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + scriptLocation: inlineScript + workingDirectory: $(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/scoring + inlineScript: | + set -e # fail on error + + az ml model deploy --name $(AKS_DEPLOYMENT_NAME) --model '$(MODEL_NAME):$(get_model.MODEL_VERSION)' \ + --compute-target $(AKS_COMPUTE_NAME) \ + --ic inference_config.yml \ + --dc deployment_config_aks.yml \ + -g $(RESOURCE_GROUP) --workspace-name 
$(WORKSPACE_NAME) \ + --overwrite -v + - task: AzureCLI@1 + displayName: 'Smoke test' + inputs: + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + scriptLocation: inlineScript + inlineScript: | + set -e # fail on error + export SUBSCRIPTION_ID=$(az account show --query id -o tsv) + python -m ml_service.util.smoke_test_scoring_service --type AKS --service "$(AKS_DEPLOYMENT_NAME)" + +- stage: 'Deploy_Webapp' + displayName: 'Deploy to Webapp' + condition: variables['WEBAPP_DEPLOYMENT_NAME'] + jobs: + - job: "Deploy_Webapp" + displayName: "Package and deploy model" + container: mlops + timeoutInMinutes: 0 + steps: + - template: diabetes_regression-get-model-id-artifact-template.yml + parameters: + projectId: '$(resources.pipeline.model-train-ci.projectID)' + pipelineId: '$(resources.pipeline.model-train-ci.pipelineID)' + artifactBuildId: ${{ parameters.artifactBuildId }} + - template: diabetes_regression-package-model-template.yml + parameters: + modelId: $(MODEL_NAME):$(get_model.MODEL_VERSION) + scoringScriptPath: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/scoring/score.py' + condaFilePath: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/conda_dependencies.yml' + - script: echo $(IMAGE_LOCATION) >image_location.txt + displayName: "Write image location file" + - task: AzureWebAppContainer@1 + name: WebAppDeploy + displayName: 'Azure Web App on Container Deploy' + inputs: + azureSubscription: '$(AZURE_RM_SVC_CONNECTION)' + appName: '$(WEBAPP_DEPLOYMENT_NAME)' + resourceGroupName: '$(RESOURCE_GROUP)' + imageName: '$(IMAGE_LOCATION)' + - task: AzureCLI@1 + displayName: 'Smoke test' + inputs: + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + scriptLocation: inlineScript + inlineScript: | + set -e # fail on error + export SUBSCRIPTION_ID=$(az account show --query id -o tsv) + python -m ml_service.util.smoke_test_scoring_service --type Webapp --service "$(WebAppDeploy.AppServiceApplicationUrl)/score" diff --git a/.pipelines/diabetes_regression-ci-image.yml b/.pipelines/diabetes_regression-ci-image.yml index 6282fd31..d7c925bf 100644 --- a/.pipelines/diabetes_regression-ci-image.yml +++ b/.pipelines/diabetes_regression-ci-image.yml @@ -30,14 +30,9 @@ variables: value: 'scoring/scoreB.py' steps: -- task: AzureCLI@1 - inputs: - azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' - scriptLocation: inlineScript - workingDirectory: $(Build.SourcesDirectory) - inlineScript: | - set -e - export SUBSCRIPTION_ID=$(az account show --query id -o tsv) - python3 -m ml_service.util.create_scoring_image - displayName: 'Create Scoring Image' +- template: diabetes_regression-package-model-template.yml + parameters: + modelId: $(MODEL_NAME):$(MODEL_VERSION) + scoringScriptPath: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/$(SCORE_SCRIPT)' + condaFilePath: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/conda_dependencies.yml' diff --git a/.pipelines/diabetes_regression-ci.yml b/.pipelines/diabetes_regression-ci.yml index 56258d50..5a539af0 100644 --- a/.pipelines/diabetes_regression-ci.yml +++ b/.pipelines/diabetes_regression-ci.yml @@ -1,4 +1,4 @@ -# Continuous Integration (CI) pipeline that orchestrates the training, evaluation, registration, deployment, and testing of the diabetes_regression model. +# Continuous Integration (CI) pipeline that orchestrates the training, evaluation, and registration of the diabetes_regression model. 
resources: containers: @@ -27,7 +27,6 @@ pool: stages: - stage: 'Model_CI' displayName: 'Model CI' - condition: not(variables['MODEL_BUILD_ID']) jobs: - job: "Model_CI_Pipeline" displayName: "Model CI Pipeline" @@ -48,8 +47,8 @@ stages: displayName: 'Publish Azure Machine Learning Pipeline' - stage: 'Trigger_AML_Pipeline' - displayName: 'Train model' - condition: and(succeeded(), not(variables['MODEL_BUILD_ID'])) + displayName: 'Train and evaluate model' + condition: succeeded() variables: BUILD_URI: '$(SYSTEM.COLLECTIONURI)$(SYSTEM.TEAMPROJECT)/_build/results?buildId=$(BUILD.BUILDID)' jobs: @@ -91,116 +90,8 @@ stages: - job: "Training_Run_Report" dependsOn: "Run_ML_Pipeline" condition: always() - displayName: "Determine if evaluation succeeded and new model is registered" + displayName: "Publish artifact if new model was registered" container: mlops timeoutInMinutes: 0 steps: - - template: diabetes_regression-get-model-version-template.yml - -- stage: 'Deploy_ACI' - displayName: 'Deploy to ACI' - dependsOn: Trigger_AML_Pipeline - condition: and(or(succeeded(), variables['MODEL_BUILD_ID']), variables['ACI_DEPLOYMENT_NAME']) - jobs: - - job: "Deploy_ACI" - displayName: "Deploy to ACI" - container: mlops - timeoutInMinutes: 0 - steps: - - template: diabetes_regression-get-model-version-template.yml - - task: ms-air-aiagility.vss-services-azureml.azureml-model-deploy-task.AMLModelDeploy@0 - displayName: 'Azure ML Model Deploy' - inputs: - azureSubscription: $(WORKSPACE_SVC_CONNECTION) - modelSourceType: manualSpec - modelName: '$(MODEL_NAME)' - modelVersion: $(MODEL_VERSION) - inferencePath: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/scoring/inference_config.yml' - deploymentTarget: ACI - deploymentName: $(ACI_DEPLOYMENT_NAME) - deployConfig: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/scoring/deployment_config_aci.yml' - overwriteExistingDeployment: true - - task: AzureCLI@1 - displayName: 'Smoke test' - inputs: - azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' - scriptLocation: inlineScript - inlineScript: | - set -e # fail on error - export SUBSCRIPTION_ID=$(az account show --query id -o tsv) - python -m ml_service.util.smoke_test_scoring_service --type ACI --service "$(ACI_DEPLOYMENT_NAME)" - -- stage: 'Deploy_AKS' - displayName: 'Deploy to AKS' - dependsOn: Deploy_ACI - condition: and(succeeded(), variables['AKS_DEPLOYMENT_NAME']) - jobs: - - job: "Deploy_AKS" - displayName: "Deploy to AKS" - container: mlops - timeoutInMinutes: 0 - steps: - - template: diabetes_regression-get-model-version-template.yml - - task: ms-air-aiagility.vss-services-azureml.azureml-model-deploy-task.AMLModelDeploy@0 - displayName: 'Azure ML Model Deploy' - inputs: - azureSubscription: $(WORKSPACE_SVC_CONNECTION) - modelSourceType: manualSpec - modelName: '$(MODEL_NAME)' - modelVersion: $(MODEL_VERSION) - inferencePath: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/scoring/inference_config.yml' - deploymentTarget: AKS - aksCluster: $(AKS_COMPUTE_NAME) - deploymentName: $(AKS_DEPLOYMENT_NAME) - deployConfig: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/scoring/deployment_config_aks.yml' - overwriteExistingDeployment: true - - task: AzureCLI@1 - displayName: 'Smoke test' - inputs: - azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' - scriptLocation: inlineScript - inlineScript: | - set -e # fail on error - export SUBSCRIPTION_ID=$(az account show --query id -o tsv) - python -m ml_service.util.smoke_test_scoring_service --type AKS --service "$(AKS_DEPLOYMENT_NAME)" - -- stage: 'Deploy_Webapp' - 
displayName: 'Deploy to Webapp' - dependsOn: Trigger_AML_Pipeline - condition: and(or(succeeded(), variables['MODEL_BUILD_ID']), variables['WEBAPP_DEPLOYMENT_NAME']) - jobs: - - job: "Deploy_Webapp" - displayName: "Deploy to Webapp" - container: mlops - timeoutInMinutes: 0 - steps: - - template: diabetes_regression-get-model-version-template.yml - - task: AzureCLI@1 - displayName: 'Create scoring image and set IMAGE_LOCATION variable' - inputs: - azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' - scriptLocation: inlineScript - inlineScript: | - set -e # fail on error - export SUBSCRIPTION_ID=$(az account show --query id -o tsv) - python -m ml_service.util.create_scoring_image --output_image_location_file image_location.txt - # Output image location to Azure DevOps job - IMAGE_LOCATION="$(cat image_location.txt)" - echo "##vso[task.setvariable variable=IMAGE_LOCATION]$IMAGE_LOCATION" - - task: AzureWebAppContainer@1 - name: WebAppDeploy - displayName: 'Azure Web App on Container Deploy' - inputs: - azureSubscription: '$(AZURE_RM_SVC_CONNECTION)' - appName: '$(WEBAPP_DEPLOYMENT_NAME)' - resourceGroupName: '$(RESOURCE_GROUP)' - imageName: '$(IMAGE_LOCATION)' - - task: AzureCLI@1 - displayName: 'Smoke test' - inputs: - azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' - scriptLocation: inlineScript - inlineScript: | - set -e # fail on error - export SUBSCRIPTION_ID=$(az account show --query id -o tsv) - python -m ml_service.util.smoke_test_scoring_service --type Webapp --service "$(WebAppDeploy.AppServiceApplicationUrl)/score" + - template: diabetes_regression-publish-model-artifact-template.yml diff --git a/.pipelines/diabetes_regression-get-model-id-artifact-template.yml b/.pipelines/diabetes_regression-get-model-id-artifact-template.yml new file mode 100644 index 00000000..b9e61306 --- /dev/null +++ b/.pipelines/diabetes_regression-get-model-id-artifact-template.yml @@ -0,0 +1,48 @@ +# Pipeline template that gets the model name and version from a previous build's artifact + +parameters: +- name: projectId + type: string + default: '' +- name: pipelineId + type: string + default: '' +- name: artifactBuildId + type: string + default: latest + +steps: + - download: none + - task: DownloadPipelineArtifact@2 + displayName: Download Pipeline Artifacts + inputs: + source: 'specific' + project: '${{ parameters.projectId }}' + pipeline: '${{ parameters.pipelineId }}' + preferTriggeringPipeline: true + ${{ if eq(parameters.artifactBuildId, 'latest') }}: + buildVersionToDownload: 'latestFromBranch' + ${{ if ne(parameters.artifactBuildId, 'latest') }}: + buildVersionToDownload: 'specific' + runId: '${{ parameters.artifactBuildId }}' + runBranch: '$(Build.SourceBranch)' + path: $(Build.SourcesDirectory)/bin + - task: Bash@3 + name: get_model + displayName: Parse Json for Model Name and Version + inputs: + targetType: 'inline' + script: | + # Print JSON + cat $(Build.SourcesDirectory)/bin/model/model.json | jq '.' 
+ + # Set model name and version variables + MODEL_NAME=$(jq -r '.name' <$(Build.SourcesDirectory)/bin/model/model.json) + MODEL_VERSION=$(jq -r '.version' <$(Build.SourcesDirectory)/bin/model/model.json) + + echo "Model Name: $MODEL_NAME" + echo "Model Version: $MODEL_VERSION" + + # Set environment variables + echo "##vso[task.setvariable variable=MODEL_VERSION;isOutput=true]$MODEL_VERSION" + echo "##vso[task.setvariable variable=MODEL_NAME;isOutput=true]$MODEL_NAME" diff --git a/.pipelines/diabetes_regression-get-model-version-template.yml b/.pipelines/diabetes_regression-get-model-version-template.yml deleted file mode 100644 index 870985a6..00000000 --- a/.pipelines/diabetes_regression-get-model-version-template.yml +++ /dev/null @@ -1,15 +0,0 @@ -# Pipeline template that attempts to get the latest model version and adds it to the environment for subsequent tasks to use. -steps: -- task: AzureCLI@1 - inputs: - azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' - scriptLocation: inlineScript - inlineScript: | - set -e # fail on error - export SUBSCRIPTION_ID=$(az account show --query id -o tsv) - python -m ml_service.pipelines.diabetes_regression_verify_train_pipeline --build_id $(modelbuildid) --output_model_version_file "model_version.txt" - # Output model version to Azure DevOps job - MODEL_VERSION="$(cat model_version.txt)" - echo "##vso[task.setvariable variable=MODEL_VERSION]$MODEL_VERSION" - name: 'getversion' - displayName: "Determine if evaluation succeeded and new model is registered" diff --git a/.pipelines/diabetes_regression-package-model-template.yml b/.pipelines/diabetes_regression-package-model-template.yml new file mode 100644 index 00000000..16fc1c1d --- /dev/null +++ b/.pipelines/diabetes_regression-package-model-template.yml @@ -0,0 +1,42 @@ +# Pipeline template that creates a model package and adds the package location to the environment for subsequent tasks to use. 
+parameters: +- name: modelId + type: string + default: '' +- name: scoringScriptPath + type: string + default: '' +- name: condaFilePath + type: string + default: '' + +steps: + - task: AzureCLI@1 + displayName: 'Install AzureML CLI' + inputs: + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + scriptLocation: inlineScript + workingDirectory: $(Build.SourcesDirectory) + inlineScript: 'az extension add --source https://azurecliext.blob.core.windows.net/release/azure_cli_ml-1.27.0-py3-none-any.whl --yes' + - task: AzureCLI@1 + displayName: 'Create model package and set IMAGE_LOCATION variable' + inputs: + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + scriptLocation: inlineScript + inlineScript: | + set -e # fail on error + + # Create model package using CLI + az ml model package --workspace-name $(WORKSPACE_NAME) -g $(RESOURCE_GROUP) \ + --model '${{ parameters.modelId }}' \ + --entry-script '${{ parameters.scoringScriptPath }}' \ + --cf '${{ parameters.condaFilePath }}' \ + -v \ + --rt python --query 'location' -o tsv > image_logs.txt + + # Show logs + cat image_logs.txt + + # Set environment variable using the last line of logs that has the package location + IMAGE_LOCATION=$(tail -n 1 image_logs.txt) + echo "##vso[task.setvariable variable=IMAGE_LOCATION]$IMAGE_LOCATION" diff --git a/.pipelines/diabetes_regression-publish-model-artifact-template.yml b/.pipelines/diabetes_regression-publish-model-artifact-template.yml new file mode 100644 index 00000000..d666750d --- /dev/null +++ b/.pipelines/diabetes_regression-publish-model-artifact-template.yml @@ -0,0 +1,29 @@ +# Pipeline template to check if a model was registered for the build and publishes an artifact with the model JSON +steps: +- task: AzureCLI@1 + displayName: 'Install AzureML CLI' + inputs: + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + scriptLocation: inlineScript + workingDirectory: $(Build.SourcesDirectory) + inlineScript: 'az extension add --source https://azurecliext.blob.core.windows.net/release/azure_cli_ml-1.27.0-py3-none-any.whl --yes' +- task: AzureCLI@1 + inputs: + azureSubscription: '$(WORKSPACE_SVC_CONNECTION)' + scriptLocation: inlineScript + workingDirectory: $(Build.SourcesDirectory) + inlineScript: | + set -e # fail on error + + # Get the model using the build ID tag + FOUND_MODEL=$(az ml model list -g $(RESOURCE_GROUP) --workspace-name $(WORKSPACE_NAME) --tag BuildId=$(Build.BuildId) --query '[0]') + + # If the variable is empty, print and fail + [[ -z "$FOUND_MODEL" ]] && { echo "Model was not registered for this run." ; exit 1; } + + # Write to a file + echo $FOUND_MODEL >model.json + name: 'getversion' + displayName: "Determine if evaluation succeeded and new model is registered (CLI)" +- publish: model.json + artifact: model diff --git a/.pipelines/diabetes_regression-variables-template.yml b/.pipelines/diabetes_regression-variables-template.yml index def14549..502753fb 100644 --- a/.pipelines/diabetes_regression-variables-template.yml +++ b/.pipelines/diabetes_regression-variables-template.yml @@ -1,8 +1,7 @@ # Pipeline template that defines common runtime environment variables. 
variables: - # Source Config - # The directory containing the scripts for training, evaluating, and registering the model + # The directory containing the scripts for training, evaluating, and registering the model - name: SOURCES_DIR_TRAIN value: diabetes_regression # The path to the model training script under SOURCES_DIR_TRAIN @@ -17,13 +16,14 @@ variables: # The path to the model scoring script relative to SOURCES_DIR_TRAIN - name: SCORE_SCRIPT value: scoring/score.py + # Azure ML Variables - name: EXPERIMENT_NAME value: mlopspython - name: DATASET_NAME value: diabetes_ds - # Uncomment DATASTORE_NAME if you have configured non default datastore to point to your data + # Uncomment DATASTORE_NAME if you have configured non default datastore to point to your data # - name: DATASTORE_NAME # value: datablobstore - name: DATASET_VERSION @@ -36,6 +36,8 @@ variables: # AML Compute Cluster Config - name: AML_ENV_NAME value: diabetes_regression_training_env + - name: AML_ENV_TRAIN_CONDA_DEP_FILE + value: "conda_dependencies.yml" - name: AML_COMPUTE_CLUSTER_CPU_SKU value: STANDARD_DS2_V2 - name: AML_COMPUTE_CLUSTER_NAME @@ -50,25 +52,78 @@ variables: # The name for the (docker/webapp) scoring image - name: IMAGE_NAME value: "diabetestrained" - + # Optional. Used by a training pipeline with R on Databricks - name: DB_CLUSTER_ID value: "" # These are the default values set in ml_service\util\env_variables.py. Uncomment and override if desired. - # Set to false to disable the evaluation step in the ML pipeline and register the newly trained model unconditionally. + # Set to false to disable the evaluation step in the ML pipeline and register the newly trained model unconditionally. # - name: RUN_EVALUATION # value: "true" - # Set to false to register the model regardless of the outcome of the evaluation step in the ML pipeline. + # Set to false to register the model regardless of the outcome of the evaluation step in the ML pipeline. # - name: ALLOW_RUN_CANCEL # value: "true" - # For debugging deployment issues. Specify a build id with the MODEL_BUILD_ID pipeline variable at queue time - # to skip training and deploy a model registered by a previous build. - - name: modelbuildid - value: $[coalesce(variables['MODEL_BUILD_ID'], variables['Build.BuildId'])] - - # Flag to allow rebuilding the AML Environment after it was built for the first time. This enables dependency updates from conda_dependencies.yaml. 
# - name: AML_REBUILD_ENVIRONMENT # value: "false" + + # Variables below are used for controlling various aspects of batch scoring + - name: USE_GPU_FOR_SCORING + value: False + # Conda dependencies for the batch scoring step + - name: AML_ENV_SCORE_CONDA_DEP_FILE + value: "conda_dependencies_scoring.yml" + # Conda dependencies for the score copying step + - name: AML_ENV_SCORECOPY_CONDA_DEP_FILE + value: "conda_dependencies_scorecopy.yml" + # AML Compute Cluster Config for parallel batch scoring + - name: AML_ENV_NAME_SCORING + value: diabetes_regression_scoring_env + - name: AML_ENV_NAME_SCORE_COPY + value: diabetes_regression_score_copy_env + - name: AML_COMPUTE_CLUSTER_CPU_SKU_SCORING + value: STANDARD_DS2_V2 + - name: AML_COMPUTE_CLUSTER_NAME_SCORING + value: score-cluster + - name: AML_CLUSTER_MIN_NODES_SCORING + value: 0 + - name: AML_CLUSTER_MAX_NODES_SCORING + value: 4 + - name: AML_CLUSTER_PRIORITY_SCORING + value: lowpriority + # The path to the batch scoring script relative to SOURCES_DIR_TRAIN + - name: BATCHSCORE_SCRIPT_PATH + value: scoring/parallel_batchscore.py + - name: BATCHSCORE_COPY_SCRIPT_PATH + value: scoring/parallel_batchscore_copyoutput.py + # Flag to allow rebuilding the AML Environment after it was built for the first time. + # This enables dependency updates from the conda dependencies yaml for scoring activities. + - name: AML_REBUILD_ENVIRONMENT_SCORING + value: "true" + + # Datastore config for scoring + # The storage account name and key are supplied as variables in a variable group + # in the Azure Pipelines library for this project. Please refer to repo docs for + # more details + + # Blob container where the input data for scoring can be found + - name: SCORING_DATASTORE_INPUT_CONTAINER + value: "input" + # Blobname for the input data - include any applicable path in the string + - name: SCORING_DATASTORE_INPUT_FILENAME + value: "diabetes_scoring_input.csv" + # Blob container where the output data for scoring can be found + - name: SCORING_DATASTORE_OUTPUT_CONTAINER + value: "output" + # Blobname for the output data - include any applicable path in the string + - name: SCORING_DATASTORE_OUTPUT_FILENAME + value: "diabetes_scoring_output.csv" + # Dataset name for input data for scoring + - name: SCORING_DATASET_NAME + value: "diabetes_scoring_ds" + # Scoring pipeline name + - name: SCORING_PIPELINE_NAME + value: "diabetes-scoring-pipeline" + \ No newline at end of file diff --git a/README.md b/README.md index 0f9ab4a6..434be0df 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,9 @@ description: "Code which demonstrates how to set up and operationalize an MLOps # MLOps with Azure ML -[![Build Status](https://aidemos.visualstudio.com/MLOps/_apis/build/status/microsoft.MLOpsPython?branchName=master)](https://aidemos.visualstudio.com/MLOps/_build/latest?definitionId=151&branchName=master) +CI: [![Build Status](https://aidemos.visualstudio.com/MLOps/_apis/build/status/Model-Train-Register-CI?branchName=master)](https://aidemos.visualstudio.com/MLOps/_build/latest?definitionId=160&branchName=master) + +CD: [![Build Status](https://aidemos.visualstudio.com/MLOps/_apis/build/status/microsoft.MLOpsPython-CD?branchName=master)](https://aidemos.visualstudio.com/MLOps/_build/latest?definitionId=161&branchName=master) MLOps will help you to understand how to build a Continuous Integration and Continuous Delivery pipeline for an ML/AI project. 
We will be using the Azure DevOps Project for build and release/deployment pipelines along with Azure ML services for model retraining pipeline, model management and operationalization. diff --git a/bootstrap/README.md b/bootstrap/README.md index 27051f2b..0841cc30 100644 --- a/bootstrap/README.md +++ b/bootstrap/README.md @@ -1,18 +1,3 @@ # Bootstrap from MLOpsPython repository -To use this existing project structure and scripts for your new ML project, you can quickly get started from the existing repository, bootstrap and create a template that works for your ML project. - -Bootstrapping will prepare a directory structure for your project which includes: - -* renaming files and folders from the base project name `diabetes_regression` to your project name -* fixing imports and absolute path based on your project name -* deleting and cleaning up some directories - -To bootstrap from the existing MLOpsPython repository: - -1. Ensure Python 3 is installed locally -1. Clone this repository locally -1. Run bootstrap.py script -`python bootstrap.py -d [dirpath] -n [projectname]` - * `[dirpath]` is the absolute path to the root of the directory where MLOpsPython is cloned - * `[projectname]` is the name of your ML project +For steps on how to use the bootstrap script, please see the "Bootstrap the project" section of the [custom model guide](../docs/custom_model.md#bootstrap-the-project). diff --git a/bootstrap/bootstrap.py b/bootstrap/bootstrap.py index 6e51b503..02f51bbc 100644 --- a/bootstrap/bootstrap.py +++ b/bootstrap/bootstrap.py @@ -84,17 +84,20 @@ def replace_project_name(project_dir, project_name, rename_name): files = [r".env.example", r".pipelines/code-quality-template.yml", r".pipelines/pr.yml", + r".pipelines/diabetes_regression-cd.yml", r".pipelines/diabetes_regression-ci.yml", r".pipelines/abtest.yml", r".pipelines/diabetes_regression-ci-image.yml", - r".pipelines/diabetes_regression-get-model-version-template.yml", # NOQA: E501 + r".pipelines/diabetes_regression-publish-model-artifact-template.yml", # NOQA: E501 + r".pipelines/diabetes_regression-get-model-id-artifact-template.yml", # NOQA: E501 + r".pipelines/diabetes_regression-batchscoring-ci.yml", r".pipelines/diabetes_regression-variables-template.yml", r"environment_setup/Dockerfile", r"environment_setup/install_requirements.sh", + r"ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py", # NOQA: E501 r"ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r_on_dbricks.py", # NOQA: E501 r"ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py", # NOQA: E501 r"ml_service/pipelines/diabetes_regression_build_train_pipeline.py", # NOQA: E501 - r"ml_service/pipelines/diabetes_regression_verify_train_pipeline.py", # NOQA: E501 r"ml_service/util/create_scoring_image.py", r"diabetes_regression/conda_dependencies.yml", r"diabetes_regression/evaluate/evaluate_model.py", diff --git a/data/README.md b/data/README.md new file mode 100644 index 00000000..d43d139c --- /dev/null +++ b/data/README.md @@ -0,0 +1,3 @@ +This folder is used for example data, and it is not meant to be used for storing training data. + +Follow steps to [Configure Training Data](../docs/custom_model.md#Configure-Custom-Training) to use your own data for training. 
\ No newline at end of file diff --git a/diabetes_regression/ci_dependencies.yml b/diabetes_regression/ci_dependencies.yml index 72c91cd3..73086471 100644 --- a/diabetes_regression/ci_dependencies.yml +++ b/diabetes_regression/ci_dependencies.yml @@ -12,11 +12,12 @@ dependencies: - r=3.6.0 - r-essentials=3.6.0 + - conda-forge::jq - pip=20.0.* - pip: # dependencies with versions aligned with conda_dependencies.yml. - - azureml-sdk==1.3.* + - azureml-sdk==1.27.* # Additional pip dependencies for the CI environment. - pytest==5.4.* diff --git a/diabetes_regression/conda_dependencies.yml b/diabetes_regression/conda_dependencies.yml index 277efbec..e214c7b2 100644 --- a/diabetes_regression/conda_dependencies.yml +++ b/diabetes_regression/conda_dependencies.yml @@ -23,11 +23,11 @@ dependencies: - pip: # Base AzureML SDK - - azureml-sdk==1.3.* + - azureml-sdk==1.27.* - # Minimum required for the scoring environment. Must match AzureML SDK version. + # Must match AzureML SDK version. # https://docs.microsoft.com/en-us/azure/machine-learning/concept-environments - - azureml-defaults==1.3.* + - azureml-defaults==1.27.* # Training deps - scikit-learn diff --git a/diabetes_regression/conda_dependencies_scorecopy.yml b/diabetes_regression/conda_dependencies_scorecopy.yml new file mode 100644 index 00000000..9ed22ccd --- /dev/null +++ b/diabetes_regression/conda_dependencies_scorecopy.yml @@ -0,0 +1,31 @@ +# Conda environment specification. The dependencies defined in this file will +# be automatically provisioned for managed runs. These include runs against +# the localdocker, remotedocker, and cluster compute targets. + +# Note that this file is NOT used to automatically manage dependencies for the +# local compute target. To provision these dependencies locally, run: +# conda env update --file conda_dependencies.yml + +# Details about the Conda environment file format: +# https://conda.io/docs/using/envs.html#create-environment-file-by-hand + +# For managing Spark packages and configuration, see spark_dependencies.yml. +# Version of this configuration file's structure and semantics in AzureML. +# This directive is stored in a comment to preserve the Conda file structure. +# [AzureMlVersion] = 2 + +# These dependencies are used to create the environment used by the batch score +# copy pipeline step +name: diabetes_regression_score_copy_env +dependencies: + # The python interpreter version. + # Currently Azure ML Workbench only supports 3.5.2 and later. + - python=3.7.* + - pip + + - pip: + # Base AzureML SDK + - azureml-sdk==1.27.* + + # Score copying deps + - azure-storage-blob diff --git a/diabetes_regression/conda_dependencies_scoring.yml b/diabetes_regression/conda_dependencies_scoring.yml new file mode 100644 index 00000000..e744b369 --- /dev/null +++ b/diabetes_regression/conda_dependencies_scoring.yml @@ -0,0 +1,32 @@ +# Conda environment specification. The dependencies defined in this file will +# be automatically provisioned for managed runs. These include runs against +# the localdocker, remotedocker, and cluster compute targets. + +# Note that this file is NOT used to automatically manage dependencies for the +# local compute target. To provision these dependencies locally, run: +# conda env update --file conda_dependencies.yml + +# Details about the Conda environment file format: +# https://conda.io/docs/using/envs.html#create-environment-file-by-hand + +# For managing Spark packages and configuration, see spark_dependencies.yml. 
+# Version of this configuration file's structure and semantics in AzureML. +# This directive is stored in a comment to preserve the Conda file structure. +# [AzureMlVersion] = 2 + +# These dependencies are used to create the environment used by the batch score +# pipeline step +name: diabetes_regression_scoring_env +dependencies: + # The python interpreter version. + # Currently Azure ML Workbench only supports 3.5.2 and later. + - python=3.7.* + - pip + + - pip: + # Base AzureML SDK + - azureml-sdk==1.27.* + + # Scoring deps + - scikit-learn + - pandas diff --git a/diabetes_regression/evaluate/evaluate_model.py b/diabetes_regression/evaluate/evaluate_model.py index 125a16a5..d1ff3c6a 100644 --- a/diabetes_regression/evaluate/evaluate_model.py +++ b/diabetes_regression/evaluate/evaluate_model.py @@ -26,7 +26,7 @@ from azureml.core import Run import argparse import traceback -from util.model_helper import get_latest_model +from util.model_helper import get_model run = Run.get_context() @@ -45,7 +45,7 @@ # sources_dir = 'diabetes_regression' # path_to_util = os.path.join(".", sources_dir, "util") # sys.path.append(os.path.abspath(path_to_util)) # NOQA: E402 -# from model_helper import get_latest_model +# from model_helper import get_model # workspace_name = os.environ.get("WORKSPACE_NAME") # experiment_name = os.environ.get("EXPERIMENT_NAME") # resource_group = os.environ.get("RESOURCE_GROUP") @@ -108,24 +108,31 @@ firstRegistration = False tag_name = 'experiment_name' - model = get_latest_model( - model_name, tag_name, exp.name, ws) + model = get_model( + model_name=model_name, + tag_name=tag_name, + tag_value=exp.name, + aml_workspace=ws) if (model is not None): production_model_mse = 10000 if (metric_eval in model.tags): production_model_mse = float(model.tags[metric_eval]) - new_model_mse = float(run.parent.get_metrics().get(metric_eval)) + try: + new_model_mse = float(run.parent.get_metrics().get(metric_eval)) + except TypeError: + new_model_mse = None if (production_model_mse is None or new_model_mse is None): - print("Unable to find", metric_eval, "metrics, " + print("Unable to find ", metric_eval, " metrics, " "exiting evaluation") if((allow_run_cancel).lower() == 'true'): run.parent.cancel() else: print( - "Current Production model mse: {}, " - "New trained model mse: {}".format( - production_model_mse, new_model_mse + "Current Production model {}: {}, ".format( + metric_eval, production_model_mse) + + "New trained model {}: {}".format( + metric_eval, new_model_mse ) ) diff --git a/diabetes_regression/scoring/deployment_config_aks.yml b/diabetes_regression/scoring/deployment_config_aks.yml index 1299dc9d..cd81009d 100644 --- a/diabetes_regression/scoring/deployment_config_aks.yml +++ b/diabetes_regression/scoring/deployment_config_aks.yml @@ -7,8 +7,8 @@ autoScaler: targetUtilization: 70 authEnabled: True containerResourceRequirements: - cpu: 1 - memoryInGB: 4 + cpu: 0.5 + memoryInGB: 2 appInsightsEnabled: True scoringTimeoutMs: 5000 maxConcurrentRequestsPerContainer: 2 diff --git a/diabetes_regression/scoring/parallel_batchscore.py b/diabetes_regression/scoring/parallel_batchscore.py new file mode 100644 index 00000000..cd42c79c --- /dev/null +++ b/diabetes_regression/scoring/parallel_batchscore.py @@ -0,0 +1,157 @@ +""" +Copyright (C) Microsoft Corporation. All rights reserved.​ + ​ +Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, +royalty-free right to use, copy, and modify the software code provided by us +("Software Code"). 
You may not sublicense the Software Code or any use of it +(except to your affiliates and to vendors to perform work on your behalf) +through distribution, network access, service agreement, lease, rental, or +otherwise. This license does not purport to express any claim of ownership over +data you may have shared with Microsoft in the creation of the Software Code. +Unless applicable law gives you more rights, Microsoft reserves all other +rights not expressly granted herein, whether by implication, estoppel or +otherwise. ​ + ​ +THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +""" + +import numpy as np +import pandas as pd +import joblib +import sys +from typing import List +from util.model_helper import get_model +from azureml.core import Model + +model = None + + +def parse_args() -> List[str]: + """ + The AML pipeline calls this file with a set of additional command + line arguments whose names are not documented. As such using the + ArgumentParser which necessitates that we supply the names of the + arguments is risky should those undocumented names change. Hence + we parse the arguments manually. + + :returns: List of model filters + + :raises: ValueError + """ + model_name_param = [ + (sys.argv[idx], sys.argv[idx + 1]) + for idx, itm in enumerate(sys.argv) + if itm == "--model_name" + ] + + if len(model_name_param) == 0: + raise ValueError( + "Model name is required but no model name parameter was passed to the script" # NOQA: E501 + ) + + model_name = model_name_param[0][1] + + model_version_param = [ + (sys.argv[idx], sys.argv[idx + 1]) + for idx, itm in enumerate(sys.argv) + if itm == "--model_version" + ] + model_version = ( + None + if len(model_version_param) < 1 + or len(model_version_param[0][1].strip()) == 0 # NOQA: E501 + else model_version_param[0][1] + ) + + model_tag_name_param = [ + (sys.argv[idx], sys.argv[idx + 1]) + for idx, itm in enumerate(sys.argv) + if itm == "--model_tag_name" + ] + model_tag_name = ( + None + if len(model_tag_name_param) < 1 + or len(model_tag_name_param[0][1].strip()) == 0 # NOQA: E501 + else model_tag_name_param[0][1] + ) + + model_tag_value_param = [ + (sys.argv[idx], sys.argv[idx + 1]) + for idx, itm in enumerate(sys.argv) + if itm == "--model_tag_value" + ] + model_tag_value = ( + None + if len(model_tag_value_param) < 1 + or len(model_tag_name_param[0][1].strip()) == 0 + else model_tag_value_param[0][1] + ) + + return [model_name, model_version, model_tag_name, model_tag_value] + + +def init(): + """ + Initializer called once per node that runs the scoring job. Parse command + line arguments and get the right model to use for scoring. 
+ """ + try: + print("Initializing batch scoring script...") + + # Get the model using name/version/tags filter + model_filter = parse_args() + amlmodel = get_model( + model_name=model_filter[0], + model_version=model_filter[1], + tag_name=model_filter[2], + tag_value=model_filter[3]) + + # Load the model using name/version found + global model + modelpath = Model.get_model_path( + model_name=amlmodel.name, version=amlmodel.version) + model = joblib.load(modelpath) + print("Loaded model {}".format(model_filter[0])) + except Exception as ex: + print("Error: {}".format(ex)) + + +def run(mini_batch: pd.DataFrame) -> pd.DataFrame: + """ + The run method is called multiple times by the runtime. Each time + a mini-batch consisting of a portion of the input data is passed + in as a pandas DataFrame. The run method should return the scoring + results as a List or a pandas DataFrame. + + :param mini_batch: Dataframe containing a portion of the scoring data + + :returns: array containing the scores. + """ + + try: + result = None + + for _, sample in mini_batch.iterrows(): + # prediction + pred = model.predict(sample.values.reshape(1, -1)) + result = ( + np.array(pred) if result is None else np.vstack((result, pred)) + ) # NOQA: E501 + + return ( + [] + if result is None + else mini_batch.join(pd.DataFrame(result, columns=["score"])) + ) + + except Exception as ex: + print(ex) diff --git a/diabetes_regression/scoring/parallel_batchscore_copyoutput.py b/diabetes_regression/scoring/parallel_batchscore_copyoutput.py new file mode 100644 index 00000000..1bcde4b6 --- /dev/null +++ b/diabetes_regression/scoring/parallel_batchscore_copyoutput.py @@ -0,0 +1,91 @@ +""" +Copyright (C) Microsoft Corporation. All rights reserved.​ + ​ +Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, +royalty-free right to use, copy, and modify the software code provided by us +("Software Code"). You may not sublicense the Software Code or any use of it +(except to your affiliates and to vendors to perform work on your behalf) +through distribution, network access, service agreement, lease, rental, or +otherwise. This license does not purport to express any claim of ownership over +data you may have shared with Microsoft in the creation of the Software Code. +Unless applicable law gives you more rights, Microsoft reserves all other +rights not expressly granted herein, whether by implication, estoppel or +otherwise. ​ + ​ +THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+""" + +from azure.storage.blob import ContainerClient +from datetime import datetime, date, timezone +import argparse +import os + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--output_path", type=str, default=None) + parser.add_argument("--scoring_datastore", type=str, default=None) + parser.add_argument("--score_container", type=str, default=None) + parser.add_argument("--scoring_datastore_key", type=str, default=None) + parser.add_argument("--scoring_output_filename", type=str, default=None) + + return parser.parse_args() + + +def copy_output(args): + print("Output : {}".format(args.output_path)) + + accounturl = "https://{}.blob.core.windows.net".format( + args.scoring_datastore + ) # NOQA E501 + + containerclient = ContainerClient( + accounturl, args.score_container, args.scoring_datastore_key + ) + + destfolder = date.today().isoformat() + filetime = ( + datetime.now(timezone.utc) + .time() + .isoformat("milliseconds") + .replace(":", "_") + .replace(".", "_") + ) # noqa E501 + destfilenameparts = args.scoring_output_filename.split(".") + destblobname = "{}/{}_{}.{}".format( + destfolder, destfilenameparts[0], filetime, destfilenameparts[1] + ) + + destblobclient = containerclient.get_blob_client(destblobname) + with open( + os.path.join(args.output_path, "parallel_run_step.txt"), "rb" + ) as scorefile: # noqa E501 + destblobclient.upload_blob(scorefile, blob_type="BlockBlob") + + +if __name__ == "__main__": + args = parse_args() + if ( + args.scoring_datastore is None + or args.scoring_datastore.strip() == "" + or args.score_container is None + or args.score_container.strip() == "" + or args.scoring_datastore_key is None + or args.scoring_datastore_key.strip() == "" + or args.scoring_output_filename is None + or args.scoring_output_filename.strip() == "" + or args.output_path is None + or args.output_path.strip() == "" + ): + print("Missing parameters in parallel_batchscore_copyoutput.py -- Not going to copy inferences to an output datastore") # NOQA E501 + else: + copy_output(args) diff --git a/diabetes_regression/util/model_helper.py b/diabetes_regression/util/model_helper.py index ceceff41..0fd20ef0 100644 --- a/diabetes_regression/util/model_helper.py +++ b/diabetes_regression/util/model_helper.py @@ -8,8 +8,8 @@ def get_current_workspace() -> Workspace: """ - Retrieves and returns the latest model from the workspace - by its name and tag. Will not work when ran locally. + Retrieves and returns the current workspace. + Will not work when ran locally. Parameters: None @@ -22,66 +22,58 @@ def get_current_workspace() -> Workspace: return experiment.workspace -def get_latest_model( +def get_model( model_name: str, + model_version: int = None, # If none, return latest model tag_name: str = None, tag_value: str = None, aml_workspace: Workspace = None ) -> AMLModel: """ - Retrieves and returns the latest model from the workspace - by its name and (optional) tag. + Retrieves and returns a model from the workspace by its name + and (optional) tag. Parameters: aml_workspace (Workspace): aml.core Workspace that the model lives. model_name (str): name of the model we are looking for + (optional) model_version (str): model version. Latest if not provided. (optional) tag (str): the tag value & name the model was registered under. Return: - A single aml model from the workspace that matches the name and tag. + A single aml model from the workspace that matches the name and tag, or + None. """ - try: - # Validate params. cannot be None. 
- if model_name is None: - raise ValueError("model_name[:str] is required") + if aml_workspace is None: + print("No workspace defined - using current experiment workspace.") + aml_workspace = get_current_workspace() - if aml_workspace is None: - print("No workspace defined - using current experiment workspace.") - aml_workspace = get_current_workspace() - - model_list = None - tag_ext = "" - - # Get lastest model - # True: by name and tags - if tag_name is not None and tag_value is not None: - model_list = AMLModel.list( - aml_workspace, name=model_name, - tags=[[tag_name, tag_value]], latest=True + tags = None + if tag_name is not None or tag_value is not None: + # Both a name and value must be specified to use tags. + if tag_name is None or tag_value is None: + raise ValueError( + "model_tag_name and model_tag_value should both be supplied" + + "or excluded" # NOQA: E501 ) - tag_ext = f"tag_name: {tag_name}, tag_value: {tag_value}." - # False: Only by name - else: - model_list = AMLModel.list( - aml_workspace, name=model_name, latest=True) - - # latest should only return 1 model, but if it does, - # then maybe sdk or source code changed. - - # define the error messages - too_many_model_message = ("Found more than one latest model. " - f"Models found: {model_list}. " - f"{tag_ext}") + tags = [[tag_name, tag_value]] - no_model_found_message = (f"No Model found with name: {model_name}. " - f"{tag_ext}") + model = None + if model_version is not None: + # TODO(tcare): Finding a specific version currently expects exceptions + # to propagate in the case we can't find the model. This call may + # result in a WebserviceException that may or may not be due to the + # model not existing. + model = AMLModel( + aml_workspace, + name=model_name, + version=model_version, + tags=tags) + else: + models = AMLModel.list( + aml_workspace, name=model_name, tags=tags, latest=True) + if len(models) == 1: + model = models[0] + elif len(models) > 1: + raise Exception("Expected only one model") - if len(model_list) > 1: - raise ValueError(too_many_model_message) - if len(model_list) == 1: - return model_list[0] - else: - print(no_model_found_message) - return None - except Exception: - raise + return model diff --git a/docs/code_description.md b/docs/code_description.md index 351b4c34..81abc78f 100644 --- a/docs/code_description.md +++ b/docs/code_description.md @@ -8,7 +8,7 @@ High level directory structure for this repository: ├── .pipelines <- Azure DevOps YAML pipelines for CI, PR and model training and deployment. ├── bootstrap <- Python script to initialize this repository with a custom project name. ├── charts <- Helm charts to deploy resources on Azure Kubernetes Service(AKS). -├── data <- Initial set of data to train and evaluate model. +├── data <- Initial set of data to train and evaluate model. Not for use to store data. ├── diabetes_regression <- The top-level folder for the ML project. │ ├── evaluate <- Python script to evaluate trained ML model. │ ├── register <- Python script to register trained ML model with Azure Machine Learning Service. @@ -18,7 +18,8 @@ High level directory structure for this repository: │ ├── util <- Python script for various utility operations specific to this ML project. ├── docs <- Extensive markdown documentation for entire project. ├── environment_setup <- The top-level folder for everything related to infrastructure. -│ ├── arm-templates <- Azure Resource Manager(ARM) templates to build infrastructure needed for this project. 
+│ ├── arm-templates <- Azure Resource Manager(ARM) templates to build infrastructure needed for this project. +│ ├── tf-templates <- Terraform templates to build infrastructure needed for this project. ├── experimentation <- Jupyter notebooks with ML experimentation code. ├── ml_service <- The top-level folder for all Azure Machine Learning resources. │ ├── pipelines <- Python script that builds Azure Machine Learning pipelines. @@ -35,7 +36,11 @@ The repository provides a template with folders structure suitable for maintaini - `environment_setup/install_requirements.sh` : This script prepares a local conda environment i.e. install the Azure ML SDK and the packages specified in environment definitions. -- `environment_setup/iac-*.yml, arm-templates` : Infrastructure as Code piplines to create and delete required resources along with corresponding arm-templates. +- `environment_setup/iac-*-arm.yml, arm-templates` : Infrastructure as Code piplines to create required resources using ARM, along with corresponding arm-templates. Infrastructure as Code can be deployed with this template or with the Terraform template. + +- `environment_setup/iac-*-tf.yml, tf-templates` : Infrastructure as Code piplines to create required resources using Terraform, along with corresponding tf-templates. Infrastructure as Code can be deployed with this template or with the ARM template. + +- `environment_setup/iac-remove-environment.yml` : Infrastructure as Code piplines to delete the created required resources. - `environment_setup/Dockerfile` : Dockerfile of a build agent containing Python 3.6 and all required packages. @@ -47,7 +52,10 @@ The repository provides a template with folders structure suitable for maintaini - `.pipelines/code-quality-template.yml` : a pipeline template used by the CI and PR pipelines. It contains steps performing linting, data and unit testing. - `.pipelines/diabetes_regression-ci-image.yml` : a pipeline building a scoring image for the diabetes regression model. - `.pipelines/diabetes_regression-ci.yml` : a pipeline triggered when the code is merged into **master**. It performs linting, data integrity testing, unit testing, building and publishing an ML pipeline. -- `.pipelines/diabetes_regression-get-model-version-template.yml` : a pipeline template used by the `.pipelines/diabetes_regression-ci.yml` pipeline. It finds out if a new model was registered and retrieves a version of the new model. +- `.pipelines/diabetes_regression-cd.yml` : a pipeline triggered when the code is merged into **master** and the `.pipelines/diabetes_regression-ci.yml` completes. Deploys the model to ACI, AKS or Webapp. +- `.pipelines/diabetes_regression-package-model-template.yml` : Pipeline template that creates a model package and adds the package location to the environment for subsequent tasks to use. +- `.pipelines/diabetes_regression-get-model-id-artifact-template.yml` : a pipeline template used by the `.pipelines/diabetes_regression-cd.yml` pipeline. It takes the model metadata artifact published by the previous pipeline and gets the model ID. +- `.pipelines/diabetes_regression-publish-model-artifact-template.yml` : a pipeline template used by the `.pipelines/diabetes_regression-ci.yml` pipeline. It finds out if a new model was registered and publishes a pipeline artifact containing the model metadata. - `.pipelines/helm-*.yml` : pipeline templates used by the `.pipelines/abtest.yml` pipeline. - `.pipelines/pr.yml` : a pipeline triggered when a **pull request** to the **master** branch is created. 
It performs linting, data integrity testing and unit testing only. @@ -57,7 +65,6 @@ The repository provides a template with folders structure suitable for maintaini - `ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py` : builds and publishes an ML training pipeline. It uses R on ML Compute. - `ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r_on_dbricks.py` : builds and publishes an ML training pipeline. It uses R on Databricks Compute. - `ml_service/pipelines/run_train_pipeline.py` : invokes a published ML training pipeline (Python on ML Compute) via REST API. -- `ml_service/pipelines/diabetes_regression_verify_train_pipeline.py` : determines whether the evaluate_model.py step of the training pipeline registered a new model. - `ml_service/util` : contains common utility functions used to build and publish an ML training pipeline. ### Environment Definitions @@ -77,11 +84,11 @@ The repository provides a template with folders structure suitable for maintaini ### Evaluation Step -- `diabetes_regression/evaluate/evaluate_model.py` : an evaluating step of an ML training pipeline which registers a new trained model if evaluation shows the new model is more performant than the previous one. +- `diabetes_regression/evaluate/evaluate_model.py` : an evaluating step which cancels the pipeline in case of non-improvement. ### Registering Step -- `diabetes_regression/evaluate/register_model.py` : registers a new trained model if evaluation shows the new model is more performant than the previous one. +- `diabetes_regression/register/register_model.py` : registers a new trained model if evaluation shows the new model is more performant than the previous one. ### Scoring diff --git a/docs/custom_container.md b/docs/custom_container.md index 8c031d15..46e692f9 100644 --- a/docs/custom_container.md +++ b/docs/custom_container.md @@ -61,7 +61,11 @@ Edit the [environment_setup/docker-image-pipeline.yml](../environment_setup/dock and modify the string `'public/mlops/python'` with an name suitable to describe your environment, e.g. `'mlops/diabetes_regression'`. -Save and run the pipeline. This will build and push a container image to your Azure Container Registry with +Save and run the pipeline, making sure to set the these runtime variables: `amlsdkversion` and `githubrelease`. The values are up to you to set depending on your environment. These will show as tags on your image. + +![Custom Container Vars](./images/custom-container-variables.png) + +This will build and push a container image to your Azure Container Registry with the name you have just edited. The next step is to modify the build pipeline to run the CI job on a container run from that image. diff --git a/docs/custom_model.md b/docs/custom_model.md index bce1fb8a..28a15d78 100644 --- a/docs/custom_model.md +++ b/docs/custom_model.md @@ -3,13 +3,14 @@ This document provides steps to follow when using this repository as a template to train models and deploy the models with real-time inference in Azure ML with your own scripts and data. 1. Follow the MLOpsPython [Getting Started](getting_started.md) guide -1. Follow the MLOpsPython [bootstrap instructions](../bootstrap/README.md) to create your project starting point +1. Bootstrap the project 1. Configure training data 1. [If necessary] Convert your ML experimental code into production ready code 1. Replace the training code -1. Update the evaluation code +1. [Optional] Update the evaluation code 1. Customize the build agent environment 1. 
[If appropriate] Replace the score code +1. [If appropriate] Configure batch scoring data ## Follow the Getting Started guide @@ -17,24 +18,38 @@ Follow the [Getting Started](getting_started.md) guide to set up the infrastruct Take a look at the [Repo Details](code_description.md) document for a description of the structure of this repository. -## Follow the Bootstrap instructions +## Bootstrap the project -The [Bootstrap from MLOpsPython repository](../bootstrap/README.md) guide will help you to quickly prepare the repository for your project. +Bootstrapping will prepare the directory structure to be used for your project name which includes: + +* renaming files and folders from the base project name `diabetes_regression` to your project name +* fixing imports and absolute path based on your project name +* deleting and cleaning up some directories **Note:** Since the bootstrap script will rename the `diabetes_regression` folder to the project name of your choice, we'll refer to your project as `[project name]` when paths are involved. +To bootstrap from the existing MLOpsPython repository: + +1. Ensure Python 3 is installed locally +1. From a local copy of the code, run the `bootstrap.py` script in the `bootstrap` folder +`python bootstrap.py -d [dirpath] -n [projectname]` + * `[dirpath]` is the absolute path to the root of the directory where MLOpsPython is cloned + * `[projectname]` is the name of your ML project + +# Configure Custom Training + ## Configure training data The training ML pipeline uses a [sample diabetes dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html) as training data. -To use your own data: +**Important** Convert the template to use your own Azure ML Dataset for model training via these steps: 1. [Create a Dataset](https://docs.microsoft.com/azure/machine-learning/how-to-create-register-datasets) in your Azure ML workspace 1. Update the `DATASET_NAME` and `DATASTORE_NAME` variables in `.pipelines/[project name]-variables-template.yml` ## Convert your ML experimental code into production ready code -The MLOpsPython template creates an Azure Machine Learning (ML) pipeline that invokes a set of [Azure ML pipeline steps](https://docs.microsoft.com/python/api/azureml-pipeline-steps/azureml.pipeline.steps) (see `ml_service/pipelines/[project name]_build_train_pipeline.py`). If your experiment is currently in a Jupyter notebook, it will need to be refactored into scripts that can be run independantly and dropped into the template which the existing Azure ML pipeline steps utilize. +The MLOpsPython template creates an Azure Machine Learning (ML) pipeline that invokes a set of [Azure ML pipeline steps](https://docs.microsoft.com/python/api/azureml-pipeline-steps/azureml.pipeline.steps) (see `ml_service/pipelines/[project name]_build_train_pipeline.py`). If your experiment is currently in a Jupyter notebook, it will need to be refactored into scripts that can be run independently and dropped into the template which the existing Azure ML pipeline steps utilize. 1. Refactor your experiment code into scripts 1. [Recommended] Prepare unit tests @@ -65,12 +80,14 @@ To disable the evaluation step, either: ## Customize the build agent environment -The DevOps pipeline definitions in the MLOpsPython template run several steps in a Docker container that contains the dependencies required to work through the Getting Started guide. 
If additional dependencies are required to run your unit tests or generate your Azure ML pipeline, there are a few options: +The DevOps pipeline definitions in the MLOpsPython template run several steps in a Docker container that contains the dependencies required to work through the Getting Started guide. These dependencies may change over time and may not suit your project's needs. To manage your own dependencies, there are a few options: * Add a pipeline step to install dependencies required by unit tests to `.pipelines/code-quality-template.yml`. Recommended if you only have a small number of test dependencies. * Create a new Docker image containing your dependencies. See [docs/custom_container.md](custom_container.md). Recommended if you have a larger number of dependencies, or if the overhead of installing additional dependencies on each run is too high. * Remove the container references from the pipeline definition files and run the pipelines on self hosted agents with dependencies pre-installed. +# Configure Custom Scoring + ## Replace score code For the model to provide real-time inference capabilities, the score code needs to be replaced. The MLOpsPython template uses the score code to deploy the model to do real-time scoring on ACI, AKS, or Web apps. @@ -80,3 +97,28 @@ If you want to keep scoring: 1. Update or replace `[project name]/scoring/score.py` 1. Add any dependencies required by scoring to `[project name]/conda_dependencies.yml` 1. Modify the test cases in the `ml_service/util/smoke_test_scoring_service.py` script to match the schema of the training features in your data +1. Check and modify `[project name]/scoring/deployment_config_aks.yml` if AKS deployment is planned. The deployment configuration should suit your custom model as well as your AKS cluster size. + +# Configure Custom Batch Scoring + +## Configure input and output data + +The batch scoring pipeline is configured to use the default datastore for input and output. It will use sample data for scoring. + +In order to configure your own input datastore and output datastores, you will need to specify an Azure Blob Storage Account and set up input and output containers. + +Configure the variables below in your variable group. + +**Note: The datastore storage resource, input/output containers, and scoring data are not created automatically. Make sure that you have manually provisioned these resources and placed your scoring data in your input container with the proper name.** + + +| Variable Name | Suggested Value | Short description | | ------------------------ | ------------------------- | --------------------------------------------------------------------------------------------------------------------------- | +| SCORING_DATASTORE_STORAGE_NAME | | [Azure Blob Storage Account](https://docs.microsoft.com/en-us/azure/storage/blobs/) name. | +| SCORING_DATASTORE_ACCESS_KEY | | [Azure Storage Account Key](https://docs.microsoft.com/en-us/rest/api/storageservices/authorize-requests-to-azure-storage). You may want to consider linking this variable to Azure KeyVault to avoid storing the access key in plain text. | +| SCORING_DATASTORE_INPUT_CONTAINER | | The name of the container for input data. Defaults to `input` if not set. | +| SCORING_DATASTORE_OUTPUT_CONTAINER| | The name of the container for output data. Defaults to `output` if not set. | +| SCORING_DATASTORE_INPUT_FILENAME | | The filename of the input data in your container. Defaults to `diabetes_scoring_input.csv` if not set. | +| SCORING_DATASET_NAME | | The AzureML Dataset name to use. Defaults to `diabetes_scoring_ds` if not set (optional). | +| SCORING_DATASTORE_OUTPUT_FILENAME | | The filename to use for the output data. The pipeline will create this file. Defaults to `diabetes_scoring_output.csv` if not set (optional). | +
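+As an illustration only (this is a hypothetical sketch, not code from this repository), the snippet below shows how an input datastore and scoring dataset matching the defaults in the table above could be registered with the Azure ML Python SDK; the datastore name and the storage account placeholders are assumptions to replace with your own values:
+
+```python
+from azureml.core import Dataset, Datastore, Workspace
+
+ws = Workspace.from_config()  # assumes a local config.json for the workspace
+
+# Register the input blob container as a datastore. The placeholder values mirror
+# the SCORING_DATASTORE_* variables described in the table above.
+input_datastore = Datastore.register_azure_blob_container(
+    workspace=ws,
+    datastore_name="scoring_input_datastore",          # arbitrary example name
+    container_name="input",                            # SCORING_DATASTORE_INPUT_CONTAINER
+    account_name="<storage account name>",             # SCORING_DATASTORE_STORAGE_NAME
+    account_key="<storage account key>",               # SCORING_DATASTORE_ACCESS_KEY
+)
+
+# Wrap the input CSV as a tabular dataset and register it so the scoring pipeline can consume it.
+scoring_dataset = Dataset.Tabular.from_delimited_files(
+    path=(input_datastore, "diabetes_scoring_input.csv")  # SCORING_DATASTORE_INPUT_FILENAME
+)
+scoring_dataset.register(ws, name="diabetes_scoring_ds", create_new_version=True)
+```
+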
diff --git a/docs/development_setup.md b/docs/development_setup.md index 68e6b6bf..1c8c2479 100644 --- a/docs/development_setup.md +++ b/docs/development_setup.md @@ -10,19 +10,12 @@ In order to configure the project locally, create a copy of `.env.example` in th [Install the Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli). The Azure CLI will be used to log you in interactively. -Create a virtual environment using [venv](https://docs.python.org/3/library/venv.html), [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or [pyenv-virtualenv](https://github.com/pyenv/pyenv-virtualenv). +Install [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). -Here is an example for setting up and activating a `venv` environment with Python 3: +Install the required Python modules. [`install_requirements.sh`](https://github.com/microsoft/MLOpsPython/blob/master/environment_setup/install_requirements.sh) creates and activates a new conda environment with required Python modules. ``` -python3 -mvenv .venv -source .venv/bin/activate -``` - -Install the required Python modules in your virtual environment. - -``` -pip install -r environment_setup/requirements.txt +. environment_setup/install_requirements.sh ``` ### Running local code @@ -30,11 +23,11 @@ pip install -r environment_setup/requirements.txt To run your local ML pipeline code on Azure ML, run a command such as the following (in bash, all on one line): ``` -export BUILD_BUILDID=$(uuidgen); python ml_service/pipelines/build_train_pipeline.py && python ml_service/pipelines/run_train_pipeline.py +export BUILD_BUILDID=$(uuidgen); python ml_service/pipelines/diabetes_regression_build_train_pipeline.py && python ml_service/pipelines/run_train_pipeline.py ``` BUILD_BUILDID is a variable used to uniquely identify the ML pipeline between the -`build_train_pipeline.py` and `run_train_pipeline.py` scripts. In Azure DevOps it is +`diabetes_regression_build_train_pipeline.py` and `run_train_pipeline.py` scripts. In Azure DevOps it is set to the current build number. In a local environment, we can use a command such as `uuidgen` to set a different random identifier on each run, ensuring there are no collisions. diff --git a/docs/getting_started.md b/docs/getting_started.md index 86eb73df..4ba694d7 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -1,11 +1,11 @@ - # Getting Started with MLOpsPython -This guide shows how to get MLOpsPython working with a sample ML project ***diabetes_regression***. The project creates a linear regression model to predict diabetes. You can adapt this example to use with your own project. +This guide shows how to get MLOpsPython working with a sample ML project **_diabetes_regression_**. The project creates a linear regression model to predict diabetes, and has CI/CD DevOps practices enabled for model training and serving once the steps in this getting started guide are completed. -We recommend working through this guide completely to ensure everything is working in your environment.
After the sample is working, follow the [bootstrap instructions](../bootstrap/README.md) to convert the ***diabetes_regression*** sample into a starting point for your project. +If you would like to bring your own model code to use this template structure, follow the [custom model](custom_model.md) guide. We recommend completing this getting started guide with the diabetes model through ACI deployment first to ensure everything is working in your environment before converting the template to use your own model code. - [Setting up Azure DevOps](#setting-up-azure-devops) + - [Install the Azure Machine Learning extension](#install-the-azure-machine-learning-extension) - [Get the code](#get-the-code) - [Create a Variable Group for your Pipeline](#create-a-variable-group-for-your-pipeline) - [Variable Descriptions](#variable-descriptions) @@ -13,8 +13,9 @@ We recommend working through this guide completely to ensure everything is worki - [Create an Azure DevOps Service Connection for the Azure Resource Manager](#create-an-azure-devops-service-connection-for-the-azure-resource-manager) - [Create the IaC Pipeline](#create-the-iac-pipeline) - [Create an Azure DevOps Service Connection for the Azure ML Workspace](#create-an-azure-devops-service-connection-for-the-azure-ml-workspace) -- [Set up Build, Release Trigger, and Release Multi-Stage Pipeline](#set-up-build-release-trigger-and-release-multi-stage-pipeline) - - [Set up the Pipeline](#set-up-the-pipeline) +- [Set up Build, Release Trigger, and Release Multi-Stage Pipelines](#set-up-build-release-trigger-and-release-multi-stage-pipelines) + - [Set up the Model CI Training, Evaluation, and Registration Pipeline](#set-up-the-model-ci-training-evaluation-and-registration-pipeline) + - [Set up the Release Deployment and/or Batch Scoring Pipelines](#set-up-the-release-deployment-andor-batch-scoring-pipelines) - [Further Exploration](#further-exploration) - [Deploy the model to Azure Kubernetes Service](#deploy-the-model-to-azure-kubernetes-service) - [Web Service Authentication on Azure Kubernetes Service](#web-service-authentication-on-azure-kubernetes-service) @@ -33,63 +34,77 @@ You'll use Azure DevOps for running the multi-stage pipeline with build, model t If you already have an Azure DevOps organization, create a new project using the guide at [Create a project in Azure DevOps and TFS](https://docs.microsoft.com/en-us/azure/devops/organizations/projects/create-project?view=azure-devops). +### Install the Azure Machine Learning extension + +Install the **Azure Machine Learning** extension to your Azure DevOps organization from the [Visual Studio Marketplace](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml) by clicking "Get it free" and following the steps. The UI will tell you if you try to add it and it's already installed. + +This extension contains the Azure ML pipeline tasks and adds the ability to create Azure ML Workspace service connections. The documentation page on the marketplace includes detailed instructions and screenshots of the capabilities it provides. + ## Get the code -We recommend using the [repository template](https://github.com/microsoft/MLOpsPython/generate), which effectively forks the repository to your own GitHub location and squashes the history. You can use the resulting repository for this guide and for your own experimentation.
+We recommend using the [repository template](https://github.com/microsoft/MLOpsPython/generate), which effectively forks this repository to your own GitHub location and squashes the history. You can use the resulting repository for this guide and for your own experimentation. ## Create a Variable Group for your Pipeline -MLOpsPython requires some variables to be set before you can run any pipelines. You'll need to create a *variable group* in Azure DevOps to store values that are reused across multiple pipelines or pipeline stages. Either store the values directly in [Azure DevOps](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#create-a-variable-group) or connect to an Azure Key Vault in your subscription. Check out the [Add & use variable groups](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=yaml#use-a-variable-group) documentation to learn more about how to create a variable group and link it to your pipeline. +MLOpsPython requires some variables to be set before you can run any pipelines. You'll need to create a _variable group_ in Azure DevOps to store values that are reused across multiple pipelines or pipeline stages. Either store the values directly in [Azure DevOps](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#create-a-variable-group) or connect to an Azure Key Vault in your subscription. Check out the [Add & use variable groups](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=yaml#use-a-variable-group) documentation to learn more about how to create a variable group and link it to your pipeline. Navigate to **Library** in the **Pipelines** section as indicated below: ![Library Variable Groups](./images/library_variable_groups.png) -Create a variable group named **``devopsforai-aml-vg``**. The YAML pipeline definitions in this repository refer to this variable group by name. +Create a variable group named **`devopsforai-aml-vg`**. The YAML pipeline definitions in this repository refer to this variable group by name. The variable group should contain the following required variables. **Azure resources that don't exist yet will be created in the [Provisioning resources using Azure Pipelines](#provisioning-resources-using-azure-pipelines) step below.** | Variable Name | Suggested Value | Short description | | ------------------------ | ------------------------- | --------------------------------------------------------------------------------------------------------------------------- | | BASE_NAME | [your project name] | Unique naming prefix for created resources - max 10 chars, letters and numbers only | -| LOCATION | centralus | [Azure location](https://azure.microsoft.com/en-us/global-infrastructure/locations/), no spaces | +| LOCATION | centralus | [Azure location](https://azure.microsoft.com/en-us/global-infrastructure/locations/), no spaces. 
You can list all the region codes by running `az account list-locations -o table` in the Azure CLI | | RESOURCE_GROUP | mlops-RG | Azure Resource Group name | | WORKSPACE_NAME | mlops-AML-WS | Azure ML Workspace name | | AZURE_RM_SVC_CONNECTION | azure-resource-connection | [Azure Resource Manager Service Connection](#create-an-azure-devops-service-connection-for-the-azure-resource-manager) name | | WORKSPACE_SVC_CONNECTION | aml-workspace-connection | [Azure ML Workspace Service Connection](#create-an-azure-devops-azure-ml-workspace-service-connection) name | -| ACI_DEPLOYMENT_NAME | mlops-aci | [Azure Container Instances](https://azure.microsoft.com/en-us/services/container-instances/) name | +| ACI_DEPLOYMENT_NAME | mlops-aci | [Azure Container Instances](https://azure.microsoft.com/en-us/services/container-instances/) name | | -Make sure you select the **Allow access to all pipelines** checkbox in the variable group configuration. +Make sure you select the **Allow access to all pipelines** checkbox in the variable group configuration. To do this, first **Save** the variable group, then click **Pipeline Permissions**, then the button with 3 vertical dots, and then **Open access** button. More variables are available for further tweaking, but the above variables are all you need to get started with this example. For more information, see the [Additional Variables and Configuration](#additional-variables-and-configuration) section. ### Variable Descriptions -**BASE_NAME** is used as a prefix for naming Azure resources. When sharing an Azure subscription, the prefix allows you to avoid naming collisions for resources that require unique names, for example, Azure Blob Storage and Registry DNS. Make sure to set BASE_NAME to a unique name so that created resources will have unique names, for example, MyUniqueMLamlcr, MyUniqueML-AML-KV, and so on. The length of the BASE_NAME value shouldn't exceed 10 characters and must contain letters and numbers only. +**BASE_NAME** is used as a prefix for naming Azure resources and should be unique. When sharing an Azure subscription, the prefix allows you to avoid naming collisions for resources that require unique names, for example, Azure Blob Storage and Registry DNS. Make sure to set BASE_NAME to a unique name so that created resources will have unique names, for example, MyUniqueMLamlcr, MyUniqueML-AML-KV, and so on. The length of the BASE_NAME value shouldn't exceed 10 characters and must contain letters and numbers only. -**LOCATION** is the name of the [Azure location](https://azure.microsoft.com/en-us/global-infrastructure/locations/) for your resources. There should be no spaces in the name. For example, central, westus, westus2. +**LOCATION** is the name of the [Azure location](https://azure.microsoft.com/en-us/global-infrastructure/locations/) for your resources. There should be no spaces in the name. For example, central, westus, northeurope. You can list all the region codes by running `az account list-locations -o table` in the Azure CLI. **RESOURCE_GROUP** is used as the name for the resource group that will hold the Azure resources for the solution. If providing an existing Azure ML Workspace, set this value to the corresponding resource group name. -**WORKSPACE_NAME** is used for creating the Azure Machine Learning Workspace. You can provide an existing Azure ML Workspace here if you've got one. +**WORKSPACE_NAME** is used for creating the Azure Machine Learning Workspace. 
*While you should be able to provide an existing Azure ML Workspace if you have one, you will run into problems if this has been provisioned manually and the naming of the associated storage account doesn't follow the convention followed in this repo -- as the environment provisioning will try to associate it with a new Storage Account and this is not supported. To avoid these problems, specify a new workspace/unique name.* -**AZURE_RM_SVC_CONNECTION** is used by the [Azure Pipeline]((../environment_setup/iac-create-environment-pipeline.yml)) in Azure DevOps that creates the Azure ML workspace and associated resources through Azure Resource Manager. You'll create the connection in a [step below](#create-an-azure-devops-service-connection-for-the-azure-resource-manager). +**AZURE_RM_SVC_CONNECTION** is used by the [Azure Pipeline](../environment_setup/iac-create-environment-pipeline.yml) in Azure DevOps that creates the Azure ML workspace and associated resources through Azure Resource Manager. You'll create the connection in a [step below](#create-an-azure-devops-service-connection-for-the-azure-resource-manager). **WORKSPACE_SVC_CONNECTION** is used to reference a [service connection for the Azure ML workspace](#create-an-azure-devops-azure-ml-workspace-service-connection). You'll create the connection after [provisioning the workspace](#provisioning-resources-using-azure-pipelines) in the [Create an Azure DevOps Service Connection for the Azure ML Workspace](#create-an-azure-devops-service-connection-for-the-azure-ml-workspace) section below. **ACI_DEPLOYMENT_NAME** is used for naming the scoring service during deployment to [Azure Container Instances](https://azure.microsoft.com/en-us/services/container-instances/). + ## Provisioning resources using Azure Pipelines -The easiest way to create all required Azure resources (Resource Group, Azure ML Workspace, Container Registry, and others) is to use the **Infrastructure as Code (IaC)** [pipeline in this repository](../environment_setup/iac-create-environment-pipeline.yml). The pipeline takes care of setting up all required resources based on these [Azure Resource Manager templates](../environment_setup/arm-templates/cloud-environment.json). +The easiest way to create all required Azure resources (Resource Group, Azure ML Workspace, Container Registry, and others) is to use the **Infrastructure as Code (IaC)** [pipeline with ARM templates](../environment_setup/iac-create-environment-pipeline-arm.yml) or the [pipeline with Terraform templates](../environment_setup/iac-create-environment-pipeline-tf.yml). The pipeline takes care of setting up all required resources based on these [Azure Resource Manager templates](../environment_setup/arm-templates/cloud-environment.json), or based on these [Terraform templates](../environment_setup/tf-templates). + +**Note:** Since Azure Blob storage account required for batch scoring is optional, the resource provisioning pipelines mentioned above do not create this resource automatically, and manual creation is required before use. ### Create an Azure DevOps Service Connection for the Azure Resource Manager -The [IaC provisioning pipeline]((../environment_setup/iac-create-environment-pipeline.yml)) requires an **Azure Resource Manager** [service connection](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml#create-a-service-connection). 
+The [IaC provisioning pipeline](../environment_setup/iac-create-environment-pipeline.yml) requires an **Azure Resource Manager** [service connection](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml#create-a-service-connection). To create one, in Azure DevOps select **Project Settings**, then **Service Connections**, and create a new one, where: -![Create service connection](./images/create-rm-service-connection.png) +- Type is **Azure Resource Manager** +- Authentication method is **Service principal (automatic)** +- Scope level is **Subscription** +- Leave **`Resource Group`** empty after selecting your subscription in the dropdown +- Use the same **`Service Connection Name`** that you used in the variable group you created +- Select **Grant access permission to all pipelines** -Leave the **``Resource Group``** field empty. +![Create service connection](./images/create-rm-service-connection.png) **Note:** Creating the Azure Resource Manager service connection scope requires 'Owner' or 'User Access Administrator' permissions on the subscription. You'll also need sufficient permissions to register an application with your Azure AD tenant, or you can get the ID and secret of a service principal from your Azure AD Administrator. That principal must have 'Contributor' permissions on the subscription. @@ -100,10 +115,14 @@ In your Azure DevOps project, create a build pipeline from your forked repositor ![Build connect step](./images/build-connect.png) -Select the **Existing Azure Pipelines YAML file** option and set the path to [/environment_setup/iac-create-environment-pipeline.yml](../environment_setup/iac-create-environment-pipeline.yml): +If you are using GitHub, after picking the option above, you'll be asked to authorize to GitHub and select the repo you forked. Then you'll have to select your forked repository on GitHub under the **Repository Access** section, and click **Approve and Install**. + +After the above, and when you're redirected back to Azure DevOps, select the **Existing Azure Pipelines YAML file** option and set the path to [/environment_setup/iac-create-environment-pipeline-arm.yml](../environment_setup/iac-create-environment-pipeline-arm.yml) or to [/environment_setup/iac-create-environment-pipeline-tf.yml](../environment_setup/iac-create-environment-pipeline-tf.yml), depending on if you want to deploy your infrastructure using ARM templates or Terraform: ![Configure step](./images/select-iac-pipeline.png) +If you decide to use Terraform, make sure the ['Terraform Build & Release Tasks' from Charles Zipp](https://marketplace.visualstudio.com/items?itemName=charleszipp.azure-pipelines-tasks-terraform) is installed. + Having done that, run the pipeline: ![IaC run](./images/run-iac-pipeline.png) @@ -112,44 +131,58 @@ Check that the newly created resources appear in the [Azure Portal](https://port ![Created resources](./images/created-resources.png) +**Note**: If you have other errors, one good thing to check is what you used in the variable names. If you end up running the pipeline multiple times, you may also run into errors and need to delete the Azure services and re-run the pipeline -- this should include a resource group, a KeyVault, a Storage Account, a Container Registry, an Application Insights and a Machine Learning workspace. + ## Create an Azure DevOps Service Connection for the Azure ML Workspace At this point, you should have an Azure ML Workspace created. 
Similar to the Azure Resource Manager service connection, you need to create an additional one for the Azure ML Workspace. -Install the **Azure Machine Learning** extension to your Azure DevOps organization from the [Visual Studio Marketplace](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml). The extension is required for the service connection. - -Create a new service connection to your Azure ML Workspace using the [Machine Learning Extension](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml) instructions to enable executing the Azure ML training pipeline. The connection name needs to match `WORKSPACE_SVC_CONNECTION` that you set in the variable group above. +Create a new service connection to your Azure ML Workspace using the [Machine Learning Extension](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml) instructions to enable executing the Azure ML training pipeline. The connection name needs to match `WORKSPACE_SVC_CONNECTION` that you set in the variable group above (e.g., 'aml-workspace-connection'). ![Created resources](./images/ml-ws-svc-connection.png) **Note:** Similar to the Azure Resource Manager service connection you created earlier, creating a service connection with Azure Machine Learning workspace scope requires 'Owner' or 'User Access Administrator' permissions on the Workspace. You'll need sufficient permissions to register an application with your Azure AD tenant, or you can get the ID and secret of a service principal from your Azure AD Administrator. That principal must have Contributor permissions on the Azure ML Workspace. -## Set up Build, Release Trigger, and Release Multi-Stage Pipeline +## Set up Build, Release Trigger, and Release Multi-Stage Pipelines + +Now that you've provisioned all the required Azure resources and service connections, you can set up the pipelines for training (Continuous Integration - **CI**) and deploying (Continuous Deployment - **CD**) your machine learning model to production. Additionally, you can set up a pipeline for batch scoring. -Now that you've provisioned all the required Azure resources and service connections, you can set up the pipeline for deploying your machine learning model to production. The pipeline has a sequence of stages for: +1. **Model CI, training, evaluation, and registration** - triggered on code changes to master branch on GitHub. Runs linting, unit tests, code coverage, and publishes and runs the training pipeline. If a new model is registered after evaluation, it creates a build artifact containing the JSON metadata of the model. Definition: [diabetes_regression-ci.yml](../.pipelines/diabetes_regression-ci.yml). +1. **Release deployment** - consumes the artifact of the previous pipeline and deploys a model to either [Azure Container Instances (ACI)](https://azure.microsoft.com/en-us/services/container-instances/), [Azure Kubernetes Service (AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service), or [Azure App Service](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-app-service) environments. See [Further Exploration](#further-exploration) for other deployment types. Definition: [diabetes_regression-cd.yml](../.pipelines/diabetes_regression-cd.yml). + 1. **Note:** Edit the pipeline definition to remove unused stages. 
For example, if you're deploying to Azure Container Instances and Azure Kubernetes Service only, you'll need to delete the unused `Deploy_Webapp` stage. +1. **Batch Scoring Code Continuous Integration** - consumes the artifact of the model training pipeline. Runs linting, unit tests, code coverage, publishes a batch scoring pipeline, and invokes the published batch scoring pipeline to score a model. -1. **Model Code Continuous Integration:** triggered on code changes to master branch on GitHub. Runs linting, unit tests, code coverage and publishes a training pipeline. -1. **Train Model**: invokes the Azure ML service to trigger the published training pipeline to train, evaluate, and register a model. -1. **Release Deployment:** deploys a model to either [Azure Container Instances (ACI)](https://azure.microsoft.com/en-us/services/container-instances/), [Azure Kubernetes Service (AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service), or [Azure App Service](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-app-service) environments. For simplicity, you're going to initially focus on Azure Container Instances. See [Further Exploration](#further-exploration) for other deployment types. - 1. **Note:** Edit the pipeline definition to remove unused stages. For example, if you're deploying to Azure Container Instances and Azure Kubernetes Service only, delete the unused `Deploy_Webapp` stage. +These pipelines use a Docker container on the Azure Pipelines agents to accomplish the pipeline steps. The container image ***mcr.microsoft.com/mlops/python:latest*** is built with [this Dockerfile](../environment_setup/Dockerfile) and has all the necessary dependencies installed for MLOpsPython and ***diabetes_regression***. This image is an example of a custom Docker image with a pre-baked environment. The environment is guaranteed to be the same on any building agent, VM, or local machine. **In your project, you'll want to build your own Docker image that only contains the dependencies and tools required for your use case. Your image will probably be smaller and faster, and it will be maintained by your team.** -### Set up the Pipeline +### Set up the Model CI, training, evaluation, and registration pipeline -In your Azure DevOps project, create and run a new build pipeline based on the [diabetes_regression-ci.yml](../.pipelines/diabetes_regression-ci.yml) +In your Azure DevOps project, create and run a new build pipeline based on the [./pipelines/diabetes_regression-ci.yml](../.pipelines/diabetes_regression-ci.yml) pipeline definition in your forked repository. -![Configure CI build pipeline](./images/ci-build-pipeline-configure.png) +If you plan to use the release deployment pipeline (in the next section), you will need to rename this pipeline to `Model-Train-Register-CI`. -Once the pipeline is finished, check the execution result: +**Note**: *To rename your pipeline, after you saved it, click **Pipelines** on the left menu on Azure DevOps, then **All** to see all the pipelines, then click the menu with the 3 vertical dots that appears when you hover the name of the new pipeline, and click it to pick **"Rename/move pipeline"**.* + +Start a run of the pipeline if you haven't already, and once the pipeline is finished, check the execution result. Note that the run can take 20 minutes, with time mostly spent in **Trigger ML Training Pipeline > Invoke ML Pipeline** step. You can track the execution of the AML pipeline by opening the AML Workspace user interface. 
Screenshots are below: -![Build](./images/multi-stage-aci.png) +![Build](./images/model-train-register.png) -Also check the published training pipeline in the **mlops-AML-WS** workspace in [Azure Portal](https://portal.azure.com/): +And the pipeline artifacts: + +![Build](./images/model-train-register-artifacts.png) + +Also check the published training pipeline in your newly created AML workspace in [Azure Machine Learning Studio](https://ml.azure.com/): ![Training pipeline](./images/training-pipeline.png) -Great, you now have the build pipeline set up which automatically triggers every time there's a change in the master branch! +Great, you now have the build pipeline for training set up which automatically triggers every time there's a change in the master branch! + +After the pipeline is finished, you'll also see a new model in the **AML Workspace** model registry section: + +![Trained model](./images/trained-model.png) + +To disable the automatic trigger of the training pipeline, change the `auto-trigger-training` variable as listed in the `.pipelines\diabetes_regression-ci.yml` pipeline to `false`. You can also override the variable at runtime execution of the pipeline. The pipeline stages are summarized below: @@ -157,37 +190,148 @@ The pipeline stages are summarized below: - Linting (code quality analysis) - Unit tests and code coverage analysis -- Build and publish *ML Training Pipeline* in an *ML Workspace* +- Build and publish _ML Training Pipeline_ in an _ML Workspace_ #### Train model -- Determine the ID of the *ML Training Pipeline* published in the previous stage. -- Trigger the *ML Training Pipeline* and waits for it to complete. +- Determine the ID of the _ML Training Pipeline_ published in the previous stage. +- Trigger the _ML Training Pipeline_ and waits for it to complete. - This is an **agentless** job. The CI pipeline can wait for ML pipeline completion for hours or even days without using agent resources. -- Determine if a new model was registered by the *ML Training Pipeline*. - - If the model evaluation determines that the new model doesn't perform any better than the previous one, the new model won't register and the *ML Training Pipeline* will be **canceled**. In this case, you'll see a message in the 'Train Model' job under the 'Determine if evaluation succeeded and new model is registered' step saying '**Model was not registered for this run.**' - - See [evaluate_model.py](../diabetes_regression/evaluate/evaluate_model.py#L118) for the evaluation logic and [diabetes_regression_verify_train_pipeline.py](../ml_service/pipelines/diabetes_regression_verify_train_pipeline.py#L54) for the ML pipeline reporting logic. +- Determine if a new model was registered by the _ML Training Pipeline_. + - If the model evaluation step of the AML Pipeline determines that the new model doesn't perform any better than the previous one, the new model won't register and the _ML Training Pipeline_ will be **canceled**. In this case, you'll see a message in the 'Train Model' job under the 'Determine if evaluation succeeded and new model is registered' step saying '**Model was not registered for this run.**' + - See [evaluate_model.py](../diabetes_regression/evaluate/evaluate_model.py#L118) for the evaluation logic. This is a simplified test that just looks at MSE to decide whether or not to register a new model. A more realistic verification would also do some error analysis and verify the inferences/error distribution against a test dataset, for example. 
+ - **Note**: *while it's possible to do an Evaluation Step as part of the ADO pipeline, this evaluation is logically part of the work done by Data Scientists, and as such the recommendation is that this step is done as part of the AML Pipeline and not ADO pipelines.* - [Additional Variables and Configuration](#additional-variables-and-configuration) for configuring this and other behavior. +#### Create pipeline artifact + +- Get the info about the registered model +- Create an Azure DevOps pipeline artifact called `model` that contains a `model.json` file containing the model information, for example: + +```json +{ "createdTime": "2021-12-14T13:03:24.494748+00:00", "framework": "Custom", "frameworkVersion": null, "id": "diabetes_regression_model.pkl:1", "name": "diabetes_regression_model.pkl", "version": 1 } +``` + +- Here's [more information on Azure DevOps Artifacts](https://docs.microsoft.com/en-us/azure/devops/pipelines/artifacts/build-artifacts?view=azure-devops&tabs=yaml#explore-download-and-deploy-your-artifacts) and where to find them on the ADO user interface. + +### Set up the Release Deployment and/or Batch Scoring pipelines + +--- +**PRE-REQUISITES** + +In order to use these pipelines: + +1. Follow the steps to set up the Model CI, training, evaluation, and registration pipeline. +1. You **must** rename your model CI/train/eval/register pipeline to `Model-Train-Register-CI`. + +These pipelines rely on the model CI pipeline and reference it by name. + +If you would like to change the name of your model CI pipeline, you must edit this section of yml for the CD and batch scoring pipeline, where it says `source: Model-Train-Register-CI` to use your own name. +``` +trigger: none +resources: + containers: + - container: mlops + image: mcr.microsoft.com/mlops/python:latest + pipelines: + - pipeline: model-train-ci + source: Model-Train-Register-CI # Name of the triggering pipeline + trigger: + branches: + include: + - master +``` + +--- + +The release deployment and batch scoring pipelines have the following behaviors: + +- The pipeline will **automatically trigger** on completion of the `Model-Train-Register-CI` pipeline for the master branch. +- The pipeline will default to using the latest successful build of the `Model-Train-Register-CI` pipeline. It will deploy the model produced by that build. +- You can specify a `Model-Train-Register-CI` build ID when running the pipeline manually. You can find this in the url of the build, and the model registered from that build will also be tagged with the build ID. This is useful to skip model training and registration, and deploy/score a model successfully registered by a `Model-Train-Register-CI` build. + - For example, if you navigate to a specific run of your CI pipeline, the URL should be something like `https://dev.azure.com/yourOrgName/yourProjectName/_build/results?buildId=653&view=results`. **653** is the build ID in this case. See the second screenshot below to verify where this number would be used. + +### Set up the Release Deployment pipeline + +In your Azure DevOps project, create and run a new **build** pipeline based on the [./pipelines/diabetes_regression-cd.yml](../.pipelines/diabetes_regression-cd.yml) +pipeline definition in your forked repository. It is recommended you rename this pipeline to something like `Model-Deploy-CD` for clarity. + +**Note**: *While Azure DevOps supports both Build and Release pipelines, when using YAML you don't usually need to use Release pipelines. 
This repository assumes the usage only of Build pipelines.* + +Your first run will use the latest model created by the `Model-Train-Register-CI` pipeline. + +Once the pipeline is finished, check the execution result: + +![Build](./images/model-deploy-result.png) + +To specify a particular build's model, set the `Model Train CI Build Id` parameter to the build ID you would like to use: + +![Build](./images/model-deploy-configure.png) + +Once your pipeline run begins, you can see the model name and version downloaded from the `Model-Train-Register-CI` pipeline. The run time will typically be 5-10 minutes. + +![Build](./images/model-deploy-get-artifact-logs.png) + +The pipeline has the following stage: + #### Deploy to ACI - Deploy the model to the QA environment in [Azure Container Instances](https://azure.microsoft.com/en-us/services/container-instances/). - Smoke test - The test sends a sample query to the scoring web service and verifies that it returns the expected response. Have a look at the [smoke test code](../ml_service/util/smoke_test_scoring_service.py) for an example. -The pipeline uses a Docker container on the Azure Pipelines agents to accomplish the pipeline steps. The container image ***mcr.microsoft.com/mlops/python:latest*** is built with [this Dockerfile](../environment_setup/Dockerfile) and has all the necessary dependencies installed for MLOpsPython and ***diabetes_regression***. This image is an example of a custom Docker image with a pre-baked environment. The environment is guaranteed to be the same on any building agent, VM, or local machine. In your project, you'll want to build your own Docker image that only contains the dependencies and tools required for your use case. Your image will probably be smaller and faster, and it will be maintained by your team. +- You can verify that an ACI instance was created in the same resource group you specified: -After the pipeline is finished, you'll see a new model in the **ML Workspace**: +![Created Resouces ](./images/aci-in-azure-portal.png) -![Trained model](./images/trained-model.png) +### Set up the Batch Scoring pipeline -To disable the automatic trigger of the training pipeline, change the `auto-trigger-training` variable as listed in the `.pipelines\diabetes_regression-ci.yml` pipeline to `false`. You can also override the variable at runtime execution of the pipeline. +In your Azure DevOps project, create and run a new build pipeline based on the [.pipelines/diabetes_regression-batchscoring-ci.yml](../.pipelines/diabetes_regression-batchscoring-ci.yml) +pipeline definition in your forked repository. Rename this pipeline to `Batch-Scoring`. + +Once the pipeline is finished, check the execution result: + +![Build](./images/batchscoring-ci-result.png) + +Also check the published batch scoring pipeline in your AML workspace in the [Azure Portal](https://portal.azure.com/): -To skip model training and registration, and deploy a model successfully registered by a previous build (for testing changes to the score file or inference configuration), add the variable `MODEL_BUILD_ID` when the pipeline is queued, and set the value to the ID of the previous build. +![Batch scoring pipeline](./images/batchscoring-pipeline.png) +Great, you now have the build pipeline set up for batch scoring which automatically triggers every time there's a change in the master branch! 
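+
+For reference, the published batch scoring pipeline can also be invoked from Python with the Azure ML SDK instead of the Azure DevOps task. The following is a minimal, hypothetical sketch (it is not the repository's `run_parallel_batchscore_pipeline.py` script); the published pipeline name, experiment name, and parameter values are assumptions to adjust for your own setup:
+
+```python
+from azureml.core import Experiment, Workspace
+from azureml.pipeline.core import PublishedPipeline
+
+ws = Workspace.from_config()  # assumes a local config.json for the workspace
+
+# Look up the published batch scoring pipeline by name (assumed name) and take the first match.
+scoring_pipelines = [p for p in PublishedPipeline.list(ws) if p.name == "diabetes-scoring-pipeline"]
+published_pipeline = scoring_pipelines[0]
+
+# Submit it with the model to score, mirroring the ParameterAssignments passed by
+# the Azure DevOps invocation task.
+run = Experiment(ws, "mlopspython").submit(
+    published_pipeline,
+    pipeline_parameters={"model_name": "diabetes_regression_model.pkl", "model_version": "1"},
+)
+run.wait_for_completion(show_output=True)
+```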
+ +The pipeline stages are described below in detail -- and you must do further configuration to actually see the batch inferences: + +#### Batch Scoring CI + +- Linting (code quality analysis) +- Unit tests and code coverage analysis +- Build and publish *ML Batch Scoring Pipeline* in an *AML Workspace* + +#### Batch Score model + +- Determine the model to be used based on the model name (required), model version, model tag name and model tag value bound pipeline parameters. + - If run via Azure DevOps pipeline, the batch scoring pipeline will take the model name and version from the `Model-Train-Register-CI` build used as input. + - If run locally without the model version, the batch scoring pipeline will use the model's latest version. +- Trigger the *ML Batch Scoring Pipeline* and wait for it to complete. + - This is an **agentless** job. The CI pipeline can wait for ML pipeline completion for hours or even days without using agent resources. +- Create an Azure ML pipeline with two steps. The pipeline is created by the code in `ml_service\pipelines\diabetes_regression_build_parallel_batchscore_pipeline.py`: + - `scoringstep` - this step is a **`ParallelRunStep`** that executes the code in `diabetes_regression\scoring\parallel_batchscore.py` with several different batches of the data to be scored. + - `scorecopystep` - this is a **`PythonScriptStep`** step that copies the output inferences from Azure ML's internal storage into a target location in another storage account. + - If you run the instructions as defined above with no changes to variables, this step will **not** be executed. You'll see a message in the logs for the corresponding step saying `Missing Parameters`. In this case, you'll be able to find the file with the inferences in the same Storage Account associated with Azure ML, in a location similar to `azureml-blobstore-SomeGuid\azureml\SomeOtherGuid\defaultoutput\parallel_run_step.txt`. One way to find the right path is this: + - Open your experiment in Azure ML (by default called `mlopspython`). + - Open the run that you want to look at (named something like `neat_morning_qc10dzjy` or similar). + - In the graphical pipeline view with 2 steps, click the button to open the details tab: `Show run overview`. + - You'll see two steps (corresponding to `scoringstep` and `scorecopystep` as described above). + - Click the step with the older "Submitted time". + - Click "Output + logs" at the top, and you'll see something like the following: + ![Outputs of `scoringstep`](./images/batch-child-run-scoringstep.png) + - The `defaultoutput` file will have JSON content with the path to a file called `parallel_run_step.txt` containing the scoring output. + +To properly configure this step for your own custom scoring data, you must follow the instructions in [Configure Custom Batch Scoring](custom_model.md#Configure-Custom-Batch-Scoring), which let you specify both the location of the files to score (via the `SCORING_DATASTORE_INPUT_*` configuration variables) and where to store the inferences (via the `SCORING_DATASTORE_OUTPUT_*` configuration variables). + ## Further Exploration -You should now have a working pipeline that can get you started with MLOpsPython. Below are some additional features offered that might suit your scenario. +You should now have a working set of pipelines that can get you started with MLOpsPython. Below are some additional features offered that might suit your scenario.
### Deploy the model to Azure Kubernetes Service @@ -195,21 +339,23 @@ MLOpsPython also can deploy to [Azure Kubernetes Service](https://azure.microsof Creating a cluster on Azure Kubernetes Service is out of scope of this tutorial, but you can find set up information on the [Quickstart: Deploy an Azure Kubernetes Service (AKS) cluster using the Azure portal](https://docs.microsoft.com/en-us/azure/aks/kubernetes-walkthrough-portal#create-an-aks-cluster) page. -**Note:** If your target deployment environment is a Kubernetes cluster and you want to implement Canary and/or A/B testing deployment strategies, check out this [tutorial](./canary_ab_deployment.md). +> **_Note_** +> +> If your target deployment environment is a Kubernetes cluster and you want to implement Canary and/or A/B testing deployment strategies, check out this [tutorial](./canary_ab_deployment.md). Keep the Azure Container Instances deployment active because it's a lightweight way to validate changes before deploying to Azure Kubernetes Service. In the Variables tab, edit your variable group (`devopsforai-aml-vg`). In the variable group definition, add these variables: -| Variable Name | Suggested Value | -| ------------------- | --------------- | -| AKS_COMPUTE_NAME | aks | -| AKS_DEPLOYMENT_NAME | mlops-aks | - -Set **AKS_COMPUTE_NAME** to the *Compute name* of the Inference Cluster that references the Azure Kubernetes Service cluster in your Azure ML Workspace. +| Variable Name | Suggested Value | Description | +| ------------------- | --------------- | ----------- | +| AKS_COMPUTE_NAME | aks | The Compute name of the inference cluster, created in the Azure ML Workspace (ml.azure.com). This inference cluster has to be created manually before setting the value! | +| AKS_DEPLOYMENT_NAME | mlops-aks | The name of the deployed AKS cluster in your subscription. | After successfully deploying to Azure Container Instances, the next stage will deploy the model to Kubernetes and run a smoke test. +Set **AKS_COMPUTE_NAME** to the _Compute name_ of the Inference Cluster that references the Azure Kubernetes Service cluster in your Azure ML Workspace. + ![build](./images/multi-stage-aci-aks.png) Consider enabling [manual approvals](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/approvals) before the deployment stages. @@ -220,30 +366,53 @@ When deploying to Azure Kubernetes Service, key-based authentication is enabled ### Deploy the model to Azure App Service (Azure Web App for containers) -If you want to deploy your scoring service as an [Azure App Service](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-app-service) instead of Azure Container Instances and Azure Kubernetes Service, follow these additional steps. +If you want to deploy your scoring service as an [Azure App Service](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-app-service) instead of Azure Container Instances or Azure Kubernetes Service, follow these additional steps. + +- First, you'll need to create an App Service Plan using Linux. The simplest way is to run this from your Azure CLI: `az appservice plan create --name nameOfAppServicePlan --resource-group nameOfYourResourceGroup --sku B1 --is-linux`. + +- Second, you'll need to create a webapp in this App Service Plan, and configure it to run a certain container. As currently there is no UI in the Azure Portal to do this, this has to be done from the command line. We'll come back to this.
+ +- In the Variables tab, edit your variable group (`devopsforai-aml-vg`) and add a variable: -In the Variables tab, edit your variable group (`devopsforai-aml-vg`) and add a variable: + | Variable Name | Suggested Value | + | ---------------------- | ---------------------- | + | WEBAPP_DEPLOYMENT_NAME | _name of your web app_ | -| Variable Name | Suggested Value | -| ---------------------- | ---------------------- | -| WEBAPP_DEPLOYMENT_NAME | _name of your web app_ | + Set **WEBAPP_DEPLOYMENT_NAME** to the name of your Azure Web App. You have not yet created this webapp, so just use the name you're planning on giving it. -Set **WEBAPP_DEPLOYMENT_NAME** to the name of your Azure Web App. This app must exist before you can deploy the model to it. +- Delete the **ACI_DEPLOYMENT_NAME** or any AKS-related variable. -Delete the **ACI_DEPLOYMENT_NAME** variable. +- Next, you'll need to run your `Model-Deploy-CD` pipeline: -The pipeline uses the [Create Image Script](../ml_service/util/create_scoring_image.py) to create a scoring image. The image will be registered under an Azure Container Registry instance that belongs to the Azure Machine Learning Service. Any dependencies that the scoring file depends on can also be packaged with the container with an image config. Learn more about how to create a container using the Azure ML SDK with the [Image class](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.image.image.image?view=azure-ml-py#create-workspace--name--models--image-config-) API documentation. + - The pipeline uses the [Azure ML CLI](../.pipelines/diabetes_regression-package-model-template.yml) to create a scoring image. The image will be registered under an Azure Container Registry instance that belongs to the Azure Machine Learning Service. Any dependencies that the scoring file depends on can also be packaged with the container with an image config. Learn more about how to create a container using the Azure ML SDK with the [Image class](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.image.image.image?view=azure-ml-py#create-workspace--name--models--image-config-) API documentation. -Make sure your webapp has the credentials to pull the image from the Azure Container Registry created by the Infrastructure as Code pipeline. Instructions can be found on the [Configure registry credentials in web app](https://docs.microsoft.com/en-us/azure/devops/pipelines/targets/webapp-on-container-linux?view=azure-devops&tabs=dotnet-core%2Cyaml#configure-registry-credentials-in-web-app) page. You'll need to run the pipeline once (including the Deploy to Webapp stage up to the `Create scoring image` step) so an image is present in the registry. After that, you can connect the Webapp to the Azure Container Registry in the Azure Portal. + - This pipeline will **fail** on the `Azure Web App on Container Deploy` step, with an error saying the webapp doesn't exist yet. This is expected. Go to the next step. -![build](./images/multi-stage-webapp.png) +- If you want to confirm that the scoring image has been created, open the Azure Container Registry mentioned above, which will be in the Resource Group of the Azure ML workspace, and look for the repositories. You'll have one called `package`, which was created by the CD pipeline: + + ![Azure Container Registry repository list](./images/container-registry-webapp-image.png) + +- Note down the name of the Login Server of your Azure Container Registry.
+
+- Going back to step two, now you can create a Web App in your App Service Plan using this scoring image with the `latest` tag. The easiest way to do this is to run this in the Azure CLI: `az webapp create --resource-group yourResourceGroup --plan nameOfAppServicePlan --name nameOfWebApp --deployment-container-image-name YourAcrName.azurecr.io/package:latest`
+  - Here, `nameOfWebApp` is the same name you put in your Azure DevOps `WEBAPP_DEPLOYMENT_NAME` variable.
+
+From now on, whenever you run the CD pipeline, it will update the image in the container registry and the WebApp will automatically pick up the new image. CD pipeline runs will now succeed.
+
+![build](./images/ADO-CD-pipeline-to-webapp.png)
+
+To confirm, you can open the App Service Plan, open your new WebApp, and open the **Deployment Center**, where you'll see something like:
+
+![WebApp Deployment Center page](./images/appservice-webapp-deploymentcenter.png)
+
+If you run into problems, you may have to make sure your webapp has the credentials to pull the image from the Azure Container Registry created by the Infrastructure as Code pipeline. Instructions can be found on the [Configure registry credentials in web app](https://docs.microsoft.com/en-us/azure/devops/pipelines/targets/webapp-on-container-linux?view=azure-devops&tabs=dotnet-core%2Cyaml#configure-registry-credentials-in-web-app) page.

### Example pipelines using R

The build pipeline also supports building and publishing Azure ML pipelines using R to train a model. You can enable it by changing the `build-train-script` pipeline variable to either of the following values:

-* `diabetes_regression_build_train_pipeline_with_r.py` to train a model with R on Azure ML Compute. You'll also need to uncomment (include) the `r-essentials` Conda packages in the environment definition YAML `diabetes_regression/conda_dependencies.yml`.
-* `diabetes_regression_build_train_pipeline_with_r_on_dbricks.py` to train a model with R on Databricks. You'll need to manually create a Databricks cluster and attach it to the Azure ML Workspace as a compute resource. Set the DB_CLUSTER_ID and DATABRICKS_COMPUTE_NAME variables in your variable group.
+- `diabetes_regression_build_train_pipeline_with_r.py` to train a model with R on Azure ML Compute. You'll also need to uncomment (include) the `r-essentials` Conda packages in the environment definition YAML `diabetes_regression/conda_dependencies.yml`.
+- `diabetes_regression_build_train_pipeline_with_r_on_dbricks.py` to train a model with R on Databricks. You'll need to manually create a Databricks cluster and attach it to the Azure ML Workspace as a compute resource. Set the DB_CLUSTER_ID and DATABRICKS_COMPUTE_NAME variables in your variable group.

Example ML pipelines using R have a single step to train a model. They don't demonstrate how to evaluate and register a model. The evaluation and registering techniques are shown only in the Python implementation.

@@ -251,9 +420,9 @@ Example ML pipelines using R have a single step to train a model. They don't dem
You can explore aspects of model observability in the solution, such as:

-* **Logging**: Navigate to the Application Insights instance linked to the Azure ML Portal, then go to the Logs (Analytics) pane. The following sample query correlates HTTP requests with custom logs generated in `score.py`. This can be used, for example, to analyze query duration vs.
scoring batch size: +- **Logging**: Navigate to the Application Insights instance linked to the Azure ML Portal, then go to the Logs (Analytics) pane. The following sample query correlates HTTP requests with custom logs generated in `score.py`. This can be used, for example, to analyze query duration vs. scoring batch size: - ``` + ```sql let Traceinfo=traces | extend d=parse_json(tostring(customDimensions.Content)) | project workspace=customDimensions.["Workspace Name"], @@ -267,8 +436,8 @@ You can explore aspects of model observability in the solution, such as: | project-away id1 ``` -* **Distributed tracing**: The smoke test client code sets an HTTP `traceparent` header (per the [W3C Trace Context proposed specification](https://www.w3.org/TR/trace-context-1)), and the `score.py` code logs the header. The query above shows how to surface this value. You can adapt it to your tracing framework. -* **Monitoring**: You can use [Azure Monitor for containers](https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-overview) to monitor the Azure ML scoring containers' performance. +- **Distributed tracing**: The smoke test client code sets an HTTP `traceparent` header (per the [W3C Trace Context proposed specification](https://www.w3.org/TR/trace-context-1)), and the `score.py` code logs the header. The query above shows how to surface this value. You can adapt it to your tracing framework. +- **Monitoring**: You can use [Azure Monitor for containers](https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-overview) to monitor the Azure ML scoring containers' performance. ### Clean up the example resources @@ -276,8 +445,9 @@ To remove the resources created for this project, use the [/environment_setup/ia ## Next Steps: Integrating your project -* The [custom model](custom_model.md) guide includes information on bringing your own code to this repository template. -* Consider using [Azure Pipelines self-hosted agents](https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/agents?view=azure-devops&tabs=browser#install) to speed up your Azure ML pipeline execution. The Docker container image for the Azure ML pipeline is sizable, and having it cached on the agent between runs can trim several minutes from your runs. +- The [custom model](custom_model.md) guide includes information on bringing your own code to this repository template. +- We recommend using a [custom container](custom_model.md#customize-the-build-agent-environment) to manage your pipeline environment and dependencies. The container provided with the getting started guide may not be suitable or up to date with your project needs. +- Consider using [Azure Pipelines self-hosted agents](https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/agents?view=azure-devops&tabs=browser#install) to speed up your Azure ML pipeline execution. The Docker container image for the Azure ML pipeline is sizable, and having it cached on the agent between runs can trim several minutes from your runs. Additionally, for secure deployments of Azure Machine Learning, you'll probably need to have a self-hosted agent in a Virtual Network. ### Additional Variables and Configuration @@ -287,7 +457,7 @@ There are more variables used in the project. They're defined in two places: one For using Azure Pipelines, all other variables are stored in the file `.pipelines/diabetes_regression-variables-template.yml`. 
Using the default values as a starting point, adjust the variables to suit your requirements. -In that folder, you'll also find the `parameters.json` file that we recommend using to provide parameters for training, evaluation, and scoring scripts. The sample parameter that `diabetes_regression` uses is the ridge regression [*alpha* hyperparameter](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html). We don't provide any serializers for this config file. +In the `diabetes_regression` folder, you'll also find the `parameters.json` file that we recommend using to provide parameters for training, evaluation, and scoring scripts. The sample parameter that `diabetes_regression` uses is the ridge regression [_alpha_ hyperparameter](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html). We don't provide any serializers for this config file. #### Local configuration diff --git a/docs/images/ADO-CD-pipeline-to-webapp.png b/docs/images/ADO-CD-pipeline-to-webapp.png new file mode 100644 index 00000000..aac8c9ee Binary files /dev/null and b/docs/images/ADO-CD-pipeline-to-webapp.png differ diff --git a/docs/images/aci-in-azure-portal.png b/docs/images/aci-in-azure-portal.png new file mode 100644 index 00000000..e7bfa8cd Binary files /dev/null and b/docs/images/aci-in-azure-portal.png differ diff --git a/docs/images/appservice-webapp-deploymentcenter.png b/docs/images/appservice-webapp-deploymentcenter.png new file mode 100644 index 00000000..b79ff615 Binary files /dev/null and b/docs/images/appservice-webapp-deploymentcenter.png differ diff --git a/docs/images/batch-child-run-scoringstep.png b/docs/images/batch-child-run-scoringstep.png new file mode 100644 index 00000000..6b87f52d Binary files /dev/null and b/docs/images/batch-child-run-scoringstep.png differ diff --git a/docs/images/batchscoring-ci-result.png b/docs/images/batchscoring-ci-result.png new file mode 100644 index 00000000..d07d41a8 Binary files /dev/null and b/docs/images/batchscoring-ci-result.png differ diff --git a/docs/images/batchscoring-pipeline.png b/docs/images/batchscoring-pipeline.png new file mode 100644 index 00000000..2b79fe03 Binary files /dev/null and b/docs/images/batchscoring-pipeline.png differ diff --git a/docs/images/container-registry-webapp-image.png b/docs/images/container-registry-webapp-image.png new file mode 100644 index 00000000..4ec09f8f Binary files /dev/null and b/docs/images/container-registry-webapp-image.png differ diff --git a/docs/images/custom-container-variables.png b/docs/images/custom-container-variables.png new file mode 100644 index 00000000..24a6a92a Binary files /dev/null and b/docs/images/custom-container-variables.png differ diff --git a/docs/images/model-deploy-configure.png b/docs/images/model-deploy-configure.png new file mode 100644 index 00000000..fcd87750 Binary files /dev/null and b/docs/images/model-deploy-configure.png differ diff --git a/docs/images/model-deploy-get-artifact-logs.png b/docs/images/model-deploy-get-artifact-logs.png new file mode 100644 index 00000000..2249a8d3 Binary files /dev/null and b/docs/images/model-deploy-get-artifact-logs.png differ diff --git a/docs/images/model-deploy-result.png b/docs/images/model-deploy-result.png new file mode 100644 index 00000000..cd3d166e Binary files /dev/null and b/docs/images/model-deploy-result.png differ diff --git a/docs/images/model-train-register-artifacts.png b/docs/images/model-train-register-artifacts.png new file mode 100644 index 00000000..0d3eed26 Binary files 
/dev/null and b/docs/images/model-train-register-artifacts.png differ diff --git a/docs/images/model-train-register.png b/docs/images/model-train-register.png new file mode 100644 index 00000000..5ce4ef41 Binary files /dev/null and b/docs/images/model-train-register.png differ diff --git a/docs/images/trained-model.png b/docs/images/trained-model.png index 3753fd7d..5bea4fe2 100644 Binary files a/docs/images/trained-model.png and b/docs/images/trained-model.png differ diff --git a/docs/images/training-pipeline.png b/docs/images/training-pipeline.png index cbdaf048..48854513 100644 Binary files a/docs/images/training-pipeline.png and b/docs/images/training-pipeline.png differ diff --git a/environment_setup/arm-templates/cloud-environment.json b/environment_setup/arm-templates/cloud-environment.json index f2b2ac2f..5f102747 100644 --- a/environment_setup/arm-templates/cloud-environment.json +++ b/environment_setup/arm-templates/cloud-environment.json @@ -13,16 +13,6 @@ "location": { "type": "string", "defaultValue": "eastus", - "allowedValues": [ - "eastus", - "eastus2", - "southcentralus", - "southeastasia", - "westcentralus", - "westeurope", - "westus2", - "centralus" - ], "metadata": { "description": "Specifies the location for all resources." } @@ -45,6 +35,17 @@ "acr": { "type": "string", "defaultValue": "[concat(toLower(parameters('baseName')),'amlcr')]" + }, + "sku": { + "type": "string", + "defaultValue": "basic", + "allowedValues": [ + "basic", + "enterprise" + ], + "metadata": { + "description": "Specifies the sku, also referred as 'edition' of the Azure Machine Learning workspace." + } } }, "variables": { @@ -92,7 +93,8 @@ "name": "standard", "family": "A" }, - "accessPolicies": [] + "accessPolicies": [ + ] } }, { @@ -131,6 +133,10 @@ "identity": { "type": "systemAssigned" }, + "sku": { + "tier": "[parameters('sku')]", + "name": "[parameters('sku')]" + }, "properties": { "friendlyName": "[variables('amlWorkspaceName')]", "keyVault": "[resourceId('Microsoft.KeyVault/vaults',variables('keyVaultName'))]", @@ -138,6 +144,6 @@ "containerRegistry": "[resourceId('Microsoft.ContainerRegistry/registries',variables('containerRegistryName'))]", "storageAccount": "[resourceId('Microsoft.Storage/storageAccounts/',variables('storageAccountName'))]" } - } + } ] } \ No newline at end of file diff --git a/environment_setup/iac-create-environment-pipeline-arm.yml b/environment_setup/iac-create-environment-pipeline-arm.yml new file mode 100644 index 00000000..0b9f474c --- /dev/null +++ b/environment_setup/iac-create-environment-pipeline-arm.yml @@ -0,0 +1,36 @@ +# CI/PR Pipeline that deploys an ARM template to create or update the resources needed by the other pipelines. 
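+# Expects AZURE_RM_SVC_CONNECTION, RESOURCE_GROUP, LOCATION, BASE_NAME and
+# WORKSPACE_NAME to be supplied, e.g. via the devopsforai-aml-vg variable group
+# referenced below.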
+trigger: + branches: + include: + - master + paths: + include: + - environment_setup/arm-templates/* +pr: + branches: + include: + - master + paths: + include: + - environment_setup/arm-templates/* + +pool: + vmImage: "ubuntu-latest" + +variables: + - group: devopsforai-aml-vg + - name: WORKSPACE_SKU # https://docs.microsoft.com/en-us/azure/machine-learning/overview-what-is-azure-ml#sku + value: basic + +steps: + - task: AzureResourceGroupDeployment@2 + inputs: + azureSubscription: "$(AZURE_RM_SVC_CONNECTION)" + action: "Create Or Update Resource Group" + resourceGroupName: "$(RESOURCE_GROUP)" + location: $(LOCATION) + templateLocation: "Linked artifact" + csmFile: "$(Build.SourcesDirectory)/environment_setup/arm-templates/cloud-environment.json" + overrideParameters: "-baseName $(BASE_NAME) -location $(LOCATION) -workspace $(WORKSPACE_NAME) -sku $(WORKSPACE_SKU)" + deploymentMode: "Incremental" + displayName: "Deploy MLOps resources to Azure" diff --git a/environment_setup/iac-create-environment-pipeline-tf.yml b/environment_setup/iac-create-environment-pipeline-tf.yml new file mode 100644 index 00000000..ef184546 --- /dev/null +++ b/environment_setup/iac-create-environment-pipeline-tf.yml @@ -0,0 +1,72 @@ +# CI/PR Pipeline that deploys an TF template to create or update the resources needed by the other pipelines. +trigger: + branches: + include: + - master + paths: + include: + - environment_setup/tf-templates/* +pr: + branches: + include: + - master + paths: + include: + - environment_setup/tf-templates/* + +pool: + vmImage: 'ubuntu-latest' + +variables: +- group: devopsforai-aml-vg + +steps: +- task: charleszipp.azure-pipelines-tasks-terraform.azure-pipelines-tasks-terraform-installer.TerraformInstaller@0 + displayName: 'Use Terraform 0.12.24' + inputs: + terraformVersion: 0.12.24 + +- task: charleszipp.azure-pipelines-tasks-terraform.azure-pipelines-tasks-terraform-cli.TerraformCLI@0 + displayName: 'TF init - Deploy MLOps resources to Azure' + inputs: + command: init + commandOptions: '-backend=true -backend-config=$(Build.SourcesDirectory)/environment_setup/tf-templates/backend.tf' + workingDirectory: '$(Build.SourcesDirectory)/environment_setup/tf-templates' + backendType: azurerm + backendServiceArm: $(AZURE_RM_SVC_CONNECTION) + ensureBackend: true + backendAzureRmResourceGroupLocation: $(LOCATION) + backendAzureRmResourceGroupName: $(RESOURCE_GROUP) + backendAzureRmStorageAccountName: '$(BASE_NAME)statestor' + backendAzureRmStorageAccountSku: 'Standard_LRS' + backendAzureRmContainerName: 'tfstate-cont' + backendAzureRmKey: 'mlopsinfra.tfstate' + +- task: charleszipp.azure-pipelines-tasks-terraform.azure-pipelines-tasks-terraform-cli.TerraformCLI@0 + displayName: 'TF validate - Deploy MLOps resources to Azure' + inputs: + command: validate + workingDirectory: '$(Build.SourcesDirectory)/environment_setup/tf-templates' + +- task: charleszipp.azure-pipelines-tasks-terraform.azure-pipelines-tasks-terraform-cli.TerraformCLI@0 + displayName: 'TF plan - Deploy MLOps resources to Azure' + inputs: + command: plan + workingDirectory: '$(Build.SourcesDirectory)/environment_setup/tf-templates' + environmentServiceName: $(AZURE_RM_SVC_CONNECTION) + env: + TF_VAR_BASE_NAME: $(BASE_NAME) + TF_VAR_RESOURCE_GROUP: $(RESOURCE_GROUP) + TF_VAR_WORKSPACE_NAME: $(WORKSPACE_NAME) + +- task: charleszipp.azure-pipelines-tasks-terraform.azure-pipelines-tasks-terraform-cli.TerraformCLI@0 + displayName: 'TF apply - Deploy MLOps resources to Azure' + inputs: + command: apply + workingDirectory: 
'$(Build.SourcesDirectory)/environment_setup/tf-templates' + environmentServiceName: $(AZURE_RM_SVC_CONNECTION) + env: + TF_VAR_BASE_NAME: $(BASE_NAME) + TF_VAR_RESOURCE_GROUP: $(RESOURCE_GROUP) + TF_VAR_WORKSPACE_NAME: $(WORKSPACE_NAME) + diff --git a/environment_setup/iac-create-environment-pipeline.yml b/environment_setup/iac-create-environment-pipeline.yml deleted file mode 100644 index f21c6eab..00000000 --- a/environment_setup/iac-create-environment-pipeline.yml +++ /dev/null @@ -1,37 +0,0 @@ -# CI/PR Pipeline that deploys an ARM template to create or update the resources needed by the other pipelines. -trigger: - branches: - include: - - master - paths: - include: - - environment_setup/arm-templates/* -pr: - branches: - include: - - master - paths: - include: - - environment_setup/arm-templates/* - -pool: - vmImage: 'ubuntu-latest' - -variables: -- group: devopsforai-aml-vg - - -steps: -- task: AzureResourceGroupDeployment@2 - inputs: - azureSubscription: '$(AZURE_RM_SVC_CONNECTION)' - action: 'Create Or Update Resource Group' - resourceGroupName: '$(RESOURCE_GROUP)' - location: $(LOCATION) - templateLocation: 'Linked artifact' - csmFile: '$(Build.SourcesDirectory)/environment_setup/arm-templates/cloud-environment.json' - overrideParameters: '-baseName $(BASE_NAME) -location $(LOCATION) -workspace $(WORKSPACE_NAME)' - deploymentMode: 'Incremental' - displayName: 'Deploy MLOps resources to Azure' - - \ No newline at end of file diff --git a/environment_setup/tf-templates/backend.tf b/environment_setup/tf-templates/backend.tf new file mode 100644 index 00000000..0aec0499 --- /dev/null +++ b/environment_setup/tf-templates/backend.tf @@ -0,0 +1,4 @@ +terraform { + backend "azurerm" { + } +} diff --git a/environment_setup/tf-templates/main.tf b/environment_setup/tf-templates/main.tf new file mode 100644 index 00000000..c57a5a84 --- /dev/null +++ b/environment_setup/tf-templates/main.tf @@ -0,0 +1,71 @@ +provider "azurerm" { + version = "=2.3.0" + features {} +} + +variable BASE_NAME {} +variable RESOURCE_GROUP {} +variable WORKSPACE_NAME {} + +#-------------------------------------------------------------------------------- + +#Set the already-existing resource group +data "azurerm_resource_group" "amlrg" { + name = var.RESOURCE_GROUP +} + +#Set client config for a.o. 
tenant id +data "azurerm_client_config" "currentconfig" { +} + +#-------------------------------------------------------------------------------- + +# Storage account for AML Service +resource "azurerm_storage_account" "amlstor" { + name = "${var.BASE_NAME}amlsa" + location = data.azurerm_resource_group.amlrg.location + resource_group_name = data.azurerm_resource_group.amlrg.name + account_tier = "Standard" + account_replication_type = "LRS" +} + +# Keyvault for AML Service +resource "azurerm_key_vault" "amlkv" { + name = "${var.BASE_NAME}-AML-KV" + location = data.azurerm_resource_group.amlrg.location + resource_group_name = data.azurerm_resource_group.amlrg.name + tenant_id = data.azurerm_client_config.currentconfig.tenant_id + sku_name = "standard" +} + +# App Insights for AML Service +resource "azurerm_application_insights" "amlai" { + name = "${var.BASE_NAME}-AML-AI" + location = data.azurerm_resource_group.amlrg.location + resource_group_name = data.azurerm_resource_group.amlrg.name + application_type = "web" +} + +# Container registry for AML Service +resource "azurerm_container_registry" "amlacr" { + name = "${var.BASE_NAME}amlcr" + resource_group_name = data.azurerm_resource_group.amlrg.name + location = data.azurerm_resource_group.amlrg.location + sku = "Standard" + admin_enabled = true +} + +# ML Workspace for AML Service, depending on the storage account, Keyvault, App Insights and ACR. +resource "azurerm_machine_learning_workspace" "amlws" { + name = var.WORKSPACE_NAME + location = data.azurerm_resource_group.amlrg.location + resource_group_name = data.azurerm_resource_group.amlrg.name + application_insights_id = azurerm_application_insights.amlai.id + key_vault_id = azurerm_key_vault.amlkv.id + storage_account_id = azurerm_storage_account.amlstor.id + container_registry_id = azurerm_container_registry.amlacr.id + + identity { + type = "SystemAssigned" + } +} diff --git a/ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py b/ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py new file mode 100644 index 00000000..5a0f0125 --- /dev/null +++ b/ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py @@ -0,0 +1,428 @@ +""" +Copyright (C) Microsoft Corporation. All rights reserved.​ + ​ +Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, +royalty-free right to use, copy, and modify the software code provided by us +("Software Code"). You may not sublicense the Software Code or any use of it +(except to your affiliates and to vendors to perform work on your behalf) +through distribution, network access, service agreement, lease, rental, or +otherwise. This license does not purport to express any claim of ownership over +data you may have shared with Microsoft in the creation of the Software Code. +Unless applicable law gives you more rights, Microsoft reserves all other +rights not expressly granted herein, whether by implication, estoppel or +otherwise. ​ + ​ +THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +""" +import os +from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep +from ml_service.util.manage_environment import get_environment +from ml_service.pipelines.load_sample_data import create_sample_data_csv +from ml_service.util.env_variables import Env +from ml_service.util.attach_compute import get_compute +from azureml.core import ( + Workspace, + Dataset, + Datastore, + RunConfiguration, +) +from azureml.pipeline.core import Pipeline, PipelineData, PipelineParameter +from azureml.core.compute import ComputeTarget +from azureml.data.datapath import DataPath +from azureml.pipeline.steps import PythonScriptStep +from typing import Tuple + + +def get_or_create_datastore( + datastorename: str, ws: Workspace, env: Env, input: bool = True +) -> Datastore: + """ + Obtains a datastore with matching name. Creates it if none exists. + + :param datastorename: Name of the datastore + :param ws: Current AML Workspace + :param env: Environment variables + :param input: Datastore points to the input container if + this is True(default) or the output storage container otherwise + + :returns: Datastore + + :raises: ValueError + """ + if datastorename is None: + raise ValueError("Datastore name is required.") + + containername = ( + env.scoring_datastore_input_container + if input + else env.scoring_datastore_output_container + ) + + if datastorename in ws.datastores: + + datastore = ws.datastores[datastorename] + + # the datastore is not registered but we have all details to register it + elif ( + env.scoring_datastore_access_key is not None + and containername is not None # NOQA: E501 + ): # NOQA:E501 + + datastore = Datastore.register_azure_blob_container( + workspace=ws, + datastore_name=datastorename, + account_name=env.scoring_datastore_storage_name, + account_key=env.scoring_datastore_access_key, + container_name=containername, + ) + else: + raise ValueError( + "No existing datastore named {} nor was enough information supplied to create one.".format( # NOQA: E501 + datastorename + ) + ) + + return datastore + + +def get_input_dataset(ws: Workspace, ds: Datastore, env: Env) -> Dataset: + """ + Gets an input dataset wrapped around an input data file. The input + data file is assumed to exist in the supplied datastore. + + + :param ws: AML Workspace + :param ds: Datastore containing the data file + :param env: Environment variables + + :returns: Input Dataset + """ + + scoringinputds = Dataset.Tabular.from_delimited_files( + path=DataPath(ds, env.scoring_datastore_input_filename) + ) + + scoringinputds = scoringinputds.register( + ws, + name=env.scoring_dataset_name, + tags={"purpose": "scoring input", "format": "csv"}, + create_new_version=True, + ).as_named_input(env.scoring_dataset_name) + + return scoringinputds + + +def get_fallback_input_dataset(ws: Workspace, env: Env) -> Dataset: + """ + Called when an input datastore does not exist or no input data file exists + at that location. Create a sample dataset using the diabetes dataset from + scikit-learn. 
Useful when debugging this code in the absence of the input + data location Azure blob. + + + :param ws: AML Workspace + :param env: Environment Variables + + :returns: Fallback input dataset + + :raises: FileNotFoundError + """ + # This call creates an example CSV from sklearn sample data. If you + # have already bootstrapped your project, you can comment this line + # out and use your own CSV. + create_sample_data_csv( + file_name=env.scoring_datastore_input_filename, for_scoring=True + ) + + if not os.path.exists(env.scoring_datastore_input_filename): + error_message = ( + "Could not find CSV dataset for scoring at {}. " + + "No alternate data store location was provided either.".format( + env.scoring_datastore_input_filename + ) # NOQA: E501 + ) + + raise FileNotFoundError(error_message) + + # upload the input data to the workspace default datastore + default_datastore = ws.get_default_datastore() + scoreinputdataref = default_datastore.upload_files( + [env.scoring_datastore_input_filename], + target_path="scoringinput", + overwrite=False, + ) + + scoringinputds = ( + Dataset.Tabular.from_delimited_files(scoreinputdataref) + .register(ws, env.scoring_dataset_name, create_new_version=True) + .as_named_input(env.scoring_dataset_name) + ) + + return scoringinputds + + +def get_output_location( + ws: Workspace, env: Env, outputdatastore: Datastore = None +) -> PipelineData: + """ + Returns a Datastore wrapped as a PipelineData instance suitable + for passing into a pipeline step. Represents the location where + the scoring output should be written. Uses the default workspace + blob store if no output datastore is supplied. + + + :param ws: AML Workspace + :param env: Environment Variables + :param outputdatastore: AML Datastore, optional, default is None + + :returns: PipelineData wrapping the output datastore + """ + + if outputdatastore is None: + output_loc = PipelineData( + name="defaultoutput", datastore=ws.get_default_datastore() + ) + else: + output_loc = PipelineData( + name=outputdatastore.name, datastore=outputdatastore + ) # NOQA: E501 + + return output_loc + + +def get_inputds_outputloc( + ws: Workspace, env: Env +) -> Tuple[Dataset, PipelineData]: # NOQA: E501 + """ + Prepare the input and output for the scoring step. Input is a tabular + dataset wrapped around the scoring data. Output is PipelineData + representing a location to write the scores down. + + :param ws: AML Workspace + :param env: Environment Variables + + :returns: Input dataset and output location + """ + + if env.scoring_datastore_storage_name is None: + # fall back to default + scoringinputds = get_fallback_input_dataset(ws, env) + output_loc = get_output_location(ws, env) + else: + inputdatastore = get_or_create_datastore( + "{}_in".format(env.scoring_datastore_storage_name), ws, env + ) + outputdatastore = get_or_create_datastore( + "{}_out".format(env.scoring_datastore_storage_name), + ws, + env, + input=False, # NOQA: E501 + ) + scoringinputds = get_input_dataset(ws, inputdatastore, env) + output_loc = get_output_location(ws, env, outputdatastore) + + return (scoringinputds, output_loc) + + +def get_run_configs( + ws: Workspace, computetarget: ComputeTarget, env: Env +) -> Tuple[ParallelRunConfig, RunConfiguration]: + """ + Creates the necessary run configurations required by the + pipeline to enable parallelized scoring. 
+ + :param ws: AML Workspace + :param computetarget: AML Compute target + :param env: Environment Variables + + :returns: Tuple[Scoring Run configuration, Score copy run configuration] + """ + + # get a conda environment for scoring + environment = get_environment( + ws, + env.aml_env_name_scoring, + conda_dependencies_file=env.aml_env_score_conda_dep_file, + enable_docker=True, + use_gpu=env.use_gpu_for_scoring, + create_new=env.rebuild_env_scoring, + ) + + score_run_config = ParallelRunConfig( + entry_script=env.batchscore_script_path, + source_directory=env.sources_directory_train, + error_threshold=10, + output_action="append_row", + compute_target=computetarget, + node_count=env.max_nodes_scoring, + environment=environment, + run_invocation_timeout=300, + ) + + copy_run_config = RunConfiguration() + copy_run_config.environment = get_environment( + ws, + env.aml_env_name_score_copy, + conda_dependencies_file=env.aml_env_scorecopy_conda_dep_file, + enable_docker=True, + use_gpu=env.use_gpu_for_scoring, + create_new=env.rebuild_env_scoring, + ) + return (score_run_config, copy_run_config) + + +def get_scoring_pipeline( + scoring_dataset: Dataset, + output_loc: PipelineData, + score_run_config: ParallelRunConfig, + copy_run_config: RunConfiguration, + computetarget: ComputeTarget, + ws: Workspace, + env: Env, +) -> Pipeline: + """ + Creates the scoring pipeline. + + :param scoring_dataset: Data to score + :param output_loc: Location to save the scoring results + :param score_run_config: Parallel Run configuration to support + parallelized scoring + :param copy_run_config: Script Run configuration to support + score copying + :param computetarget: AML Compute target + :param ws: AML Workspace + :param env: Environment Variables + + :returns: Scoring pipeline instance + """ + # To help filter the model make the model name, model version and a + # tag/value pair bindable parameters so that they can be passed to + # the pipeline when invoked either over REST or via the AML SDK. 
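+    # The single-space defaults act as placeholders; the real values are supplied
+    # when the pipeline is invoked (see run_parallel_batchscore_pipeline.py).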
+ model_name_param = PipelineParameter( + "model_name", default_value=" " + ) # NOQA: E501 + model_version_param = PipelineParameter( + "model_version", default_value=" " + ) # NOQA: E501 + model_tag_name_param = PipelineParameter( + "model_tag_name", default_value=" " + ) # NOQA: E501 + model_tag_value_param = PipelineParameter( + "model_tag_value", default_value=" " + ) # NOQA: E501 + + scoring_step = ParallelRunStep( + name="scoringstep", + inputs=[scoring_dataset], + output=output_loc, + arguments=[ + "--model_name", + model_name_param, + "--model_version", + model_version_param, + "--model_tag_name", + model_tag_name_param, + "--model_tag_value", + model_tag_value_param, + ], + parallel_run_config=score_run_config, + allow_reuse=False, + ) + + copying_step = PythonScriptStep( + name="scorecopystep", + script_name=env.batchscore_copy_script_path, + source_directory=env.sources_directory_train, + arguments=[ + "--output_path", + output_loc, + "--scoring_output_filename", + env.scoring_datastore_output_filename + if env.scoring_datastore_output_filename is not None + else "", + "--scoring_datastore", + env.scoring_datastore_storage_name + if env.scoring_datastore_storage_name is not None + else "", + "--score_container", + env.scoring_datastore_output_container + if env.scoring_datastore_output_container is not None + else "", + "--scoring_datastore_key", + env.scoring_datastore_access_key + if env.scoring_datastore_access_key is not None + else "", + ], + inputs=[output_loc], + allow_reuse=False, + compute_target=computetarget, + runconfig=copy_run_config, + ) + return Pipeline(workspace=ws, steps=[scoring_step, copying_step]) + + +def build_batchscore_pipeline(): + """ + Main method that builds and publishes a scoring pipeline. + """ + + try: + env = Env() + + # Get Azure machine learning workspace + aml_workspace = Workspace.get( + name=env.workspace_name, + subscription_id=env.subscription_id, + resource_group=env.resource_group, + ) + + # Get Azure machine learning cluster + aml_compute_score = get_compute( + aml_workspace, + env.compute_name_scoring, + env.vm_size_scoring, + for_batch_scoring=True, + ) + + input_dataset, output_location = get_inputds_outputloc( + aml_workspace, env + ) # NOQA: E501 + + scoring_runconfig, score_copy_runconfig = get_run_configs( + aml_workspace, aml_compute_score, env + ) + + scoring_pipeline = get_scoring_pipeline( + input_dataset, + output_location, + scoring_runconfig, + score_copy_runconfig, + aml_compute_score, + aml_workspace, + env, + ) + + published_pipeline = scoring_pipeline.publish( + name=env.scoring_pipeline_name, + description="Diabetes Batch Scoring Pipeline", + ) + pipeline_id_string = "##vso[task.setvariable variable=pipeline_id;isOutput=true]{}".format( # NOQA: E501 + published_pipeline.id + ) + print(pipeline_id_string) + except Exception as e: + print(e) + exit(1) + + +if __name__ == "__main__": + build_batchscore_pipeline() diff --git a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py index dfe3f5b3..03937186 100644 --- a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py +++ b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py @@ -16,81 +16,90 @@ def main(): aml_workspace = Workspace.get( name=e.workspace_name, subscription_id=e.subscription_id, - resource_group=e.resource_group + resource_group=e.resource_group, ) print("get_workspace:") print(aml_workspace) # Get Azure machine learning cluster - aml_compute = 
get_compute( - aml_workspace, - e.compute_name, - e.vm_size) + aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size) if aml_compute is not None: print("aml_compute:") print(aml_compute) # Create a reusable Azure ML environment environment = get_environment( - aml_workspace, e.aml_env_name, create_new=e.rebuild_env) # + aml_workspace, + e.aml_env_name, + conda_dependencies_file=e.aml_env_train_conda_dep_file, + create_new=e.rebuild_env, + ) # run_config = RunConfiguration() run_config.environment = environment - if (e.datastore_name): + if e.datastore_name: datastore_name = e.datastore_name else: datastore_name = aml_workspace.get_default_datastore().name - run_config.environment.environment_variables["DATASTORE_NAME"] = datastore_name # NOQA: E501 + run_config.environment.environment_variables[ + "DATASTORE_NAME" + ] = datastore_name # NOQA: E501 - model_name_param = PipelineParameter( - name="model_name", default_value=e.model_name) + model_name_param = PipelineParameter(name="model_name", default_value=e.model_name) # NOQA: E501 dataset_version_param = PipelineParameter( - name="dataset_version", default_value=e.dataset_version) + name="dataset_version", default_value=e.dataset_version + ) data_file_path_param = PipelineParameter( - name="data_file_path", default_value="none") - caller_run_id_param = PipelineParameter( - name="caller_run_id", default_value="none") + name="data_file_path", default_value="none" + ) + caller_run_id_param = PipelineParameter(name="caller_run_id", default_value="none") # NOQA: E501 # Get dataset name dataset_name = e.dataset_name # Check to see if dataset exists - if (dataset_name not in aml_workspace.datasets): + if dataset_name not in aml_workspace.datasets: # This call creates an example CSV from sklearn sample data. If you # have already bootstrapped your project, you can comment this line # out and use your own CSV. create_sample_data_csv() # Use a CSV to read in the data set. - file_name = 'diabetes.csv' + file_name = "diabetes.csv" - if (not os.path.exists(file_name)): - raise Exception("Could not find CSV dataset at \"%s\". If you have bootstrapped your project, you will need to provide a CSV." % file_name) # NOQA: E501 + if not os.path.exists(file_name): + raise Exception( + 'Could not find CSV dataset at "%s". If you have bootstrapped your project, you will need to provide a CSV.' 
# NOQA: E501 + % file_name + ) # NOQA: E501 # Upload file to default datastore in workspace datatstore = Datastore.get(aml_workspace, datastore_name) - target_path = 'training-data/' + target_path = "training-data/" datatstore.upload_files( files=[file_name], target_path=target_path, overwrite=True, - show_progress=False) + show_progress=False, + ) # Register dataset path_on_datastore = os.path.join(target_path, file_name) dataset = Dataset.Tabular.from_delimited_files( - path=(datatstore, path_on_datastore)) + path=(datatstore, path_on_datastore) + ) dataset = dataset.register( workspace=aml_workspace, name=dataset_name, - description='diabetes training data', - tags={'format': 'CSV'}, - create_new_version=True) + description="diabetes training data", + tags={"format": "CSV"}, + create_new_version=True, + ) # Create a PipelineData to pass data between steps pipeline_data = PipelineData( - 'pipeline_data', - datastore=aml_workspace.get_default_datastore()) + "pipeline_data", datastore=aml_workspace.get_default_datastore() + ) train_step = PythonScriptStep( name="Train Model", @@ -99,12 +108,18 @@ def main(): source_directory=e.sources_directory_train, outputs=[pipeline_data], arguments=[ - "--model_name", model_name_param, - "--step_output", pipeline_data, - "--dataset_version", dataset_version_param, - "--data_file_path", data_file_path_param, - "--caller_run_id", caller_run_id_param, - "--dataset_name", dataset_name, + "--model_name", + model_name_param, + "--step_output", + pipeline_data, + "--dataset_version", + dataset_version_param, + "--data_file_path", + data_file_path_param, + "--caller_run_id", + caller_run_id_param, + "--dataset_name", + dataset_name, ], runconfig=run_config, allow_reuse=True, @@ -117,8 +132,10 @@ def main(): compute_target=aml_compute, source_directory=e.sources_directory_train, arguments=[ - "--model_name", model_name_param, - "--allow_run_cancel", e.allow_run_cancel, + "--model_name", + model_name_param, + "--allow_run_cancel", + e.allow_run_cancel, ], runconfig=run_config, allow_reuse=False, @@ -131,16 +148,13 @@ def main(): compute_target=aml_compute, source_directory=e.sources_directory_train, inputs=[pipeline_data], - arguments=[ - "--model_name", model_name_param, - "--step_input", pipeline_data, - ], + arguments=["--model_name", model_name_param, "--step_input", pipeline_data, ], # NOQA: E501 runconfig=run_config, allow_reuse=False, ) print("Step Register created") # Check run_evaluation flag to include or exclude evaluation step. 
- if ((e.run_evaluation).lower() == 'true'): + if (e.run_evaluation).lower() == "true": print("Include evaluation step before register step.") evaluate_step.run_after(train_step) register_step.run_after(evaluate_step) @@ -156,11 +170,11 @@ def main(): published_pipeline = train_pipeline.publish( name=e.pipeline_name, description="Model training/retraining pipeline", - version=e.build_id + version=e.build_id, ) - print(f'Published pipeline: {published_pipeline.name}') - print(f'for build {published_pipeline.version}') + print(f"Published pipeline: {published_pipeline.name}") + print(f"for build {published_pipeline.version}") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py b/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py index b49ae53d..254f22eb 100644 --- a/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py +++ b/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py @@ -13,16 +13,13 @@ def main(): aml_workspace = Workspace.get( name=e.workspace_name, subscription_id=e.subscription_id, - resource_group=e.resource_group + resource_group=e.resource_group, ) print("get_workspace:") print(aml_workspace) # Get Azure machine learning cluster - aml_compute = get_compute( - aml_workspace, - e.compute_name, - e.vm_size) + aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size) if aml_compute is not None: print("aml_compute:") print(aml_compute) @@ -31,7 +28,11 @@ def main(): # Make sure to include `r-essentials' # in diabetes_regression/conda_dependencies.yml environment = get_environment( - aml_workspace, e.aml_env_name, create_new=e.rebuild_env) # NOQA: E501 + aml_workspace, + e.aml_env_name, + conda_dependencies_file=e.aml_env_train_conda_dep_file, + create_new=e.rebuild_env, + ) # NOQA: E501 run_config = RunConfiguration() run_config.environment = environment @@ -52,11 +53,11 @@ def main(): published_pipeline = train_pipeline.publish( name=e.pipeline_name, description="Model training/retraining pipeline", - version=e.build_id + version=e.build_id, ) - print(f'Published pipeline: {published_pipeline.name}') - print(f'for build {published_pipeline.version}') + print(f"Published pipeline: {published_pipeline.name}") + print(f"for build {published_pipeline.version}") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/ml_service/pipelines/diabetes_regression_verify_train_pipeline.py b/ml_service/pipelines/diabetes_regression_verify_train_pipeline.py deleted file mode 100644 index 306f2259..00000000 --- a/ml_service/pipelines/diabetes_regression_verify_train_pipeline.py +++ /dev/null @@ -1,75 +0,0 @@ -import argparse -import sys -import os -from azureml.core import Run, Experiment, Workspace -from ml_service.util.env_variables import Env -from diabetes_regression.util.model_helper import get_latest_model - - -def main(): - - run = Run.get_context() - - if (run.id.startswith('OfflineRun')): - from dotenv import load_dotenv - load_dotenv() - sources_dir = os.environ.get("SOURCES_DIR_TRAIN") - if (sources_dir is None): - sources_dir = 'diabetes_regression' - workspace_name = os.environ.get("WORKSPACE_NAME") - experiment_name = os.environ.get("EXPERIMENT_NAME") - resource_group = os.environ.get("RESOURCE_GROUP") - subscription_id = os.environ.get("SUBSCRIPTION_ID") - build_id = os.environ.get('BUILD_BUILDID') - aml_workspace = Workspace.get( - name=workspace_name, - subscription_id=subscription_id, - 
resource_group=resource_group - ) - ws = aml_workspace - exp = Experiment(ws, experiment_name) - else: - exp = run.experiment - - e = Env() - - parser = argparse.ArgumentParser("register") - parser.add_argument( - "--build_id", - type=str, - help="The Build ID of the build triggering this pipeline run", - ) - parser.add_argument( - "--output_model_version_file", - type=str, - default="model_version.txt", - help="Name of a file to write model version to" - ) - - args = parser.parse_args() - if (args.build_id is not None): - build_id = args.build_id - model_name = e.model_name - - try: - tag_name = 'BuildId' - model = get_latest_model( - model_name, tag_name, build_id, exp.workspace) - if (model is not None): - print("Model was registered for this build.") - if (model is None): - print("Model was not registered for this run.") - sys.exit(1) - except Exception as e: - print(e) - print("Model was not registered for this run.") - sys.exit(1) - - # Save the Model Version for other AzDO jobs after script is complete - if args.output_model_version_file is not None: - with open(args.output_model_version_file, "w") as out_file: - out_file.write(str(model.version)) - - -if __name__ == '__main__': - main() diff --git a/ml_service/pipelines/load_sample_data.py b/ml_service/pipelines/load_sample_data.py index 717fc7ab..304a8e7b 100644 --- a/ml_service/pipelines/load_sample_data.py +++ b/ml_service/pipelines/load_sample_data.py @@ -5,12 +5,14 @@ # Loads the diabetes sample data from sklearn and produces a csv file that can # be used by the build/train pipeline script. -def create_sample_data_csv(): +def create_sample_data_csv(file_name: str = "diabetes.csv", + for_scoring: bool = False): sample_data = load_diabetes() df = pd.DataFrame( data=sample_data.data, columns=sample_data.feature_names) - df['Y'] = sample_data.target + if not for_scoring: + df['Y'] = sample_data.target # Hard code to diabetes so we fail fast if the project has been # bootstrapped. - df.to_csv('diabetes.csv', index=False) + df.to_csv(file_name, index=False) diff --git a/ml_service/pipelines/run_parallel_batchscore_pipeline.py b/ml_service/pipelines/run_parallel_batchscore_pipeline.py new file mode 100644 index 00000000..c046eb9c --- /dev/null +++ b/ml_service/pipelines/run_parallel_batchscore_pipeline.py @@ -0,0 +1,134 @@ +""" +Copyright (C) Microsoft Corporation. All rights reserved.​ + ​ +Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, +royalty-free right to use, copy, and modify the software code provided by us +("Software Code"). You may not sublicense the Software Code or any use of it +(except to your affiliates and to vendors to perform work on your behalf) +through distribution, network access, service agreement, lease, rental, or +otherwise. This license does not purport to express any claim of ownership over +data you may have shared with Microsoft in the creation of the Software Code. +Unless applicable law gives you more rights, Microsoft reserves all other +rights not expressly granted herein, whether by implication, estoppel or +otherwise. ​ + ​ +THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +""" + +from azure.storage.blob import ContainerClient +from ml_service.util.env_variables import Env +from azureml.core import Experiment, Workspace +from azureml.pipeline.core import PublishedPipeline +import argparse + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--pipeline_id", type=str, default=None) + return parser.parse_args() + + +def get_pipeline(pipeline_id, ws: Workspace, env: Env): + if pipeline_id is not None: + scoringpipeline = PublishedPipeline.get(ws, pipeline_id) + else: + pipelines = PublishedPipeline.list(ws) + scoringpipelinelist = [ + pl for pl in pipelines if pl.name == env.scoring_pipeline_name + ] # noqa E501 + + if scoringpipelinelist.count == 0: + raise Exception( + "No pipeline found matching name:{}".format(env.scoring_pipeline_name) # NOQA: E501 + ) + else: + # latest published + scoringpipeline = scoringpipelinelist[0] + + return scoringpipeline + + +def copy_output(step_id: str, env: Env): + accounturl = "https://{}.blob.core.windows.net".format( + env.scoring_datastore_storage_name + ) + + srcblobname = "azureml/{}/{}_out/parallel_run_step.txt".format( + step_id, env.scoring_datastore_storage_name + ) + + srcbloburl = "{}/{}/{}".format( + accounturl, env.scoring_datastore_output_container, srcblobname + ) + + containerclient = ContainerClient( + accounturl, + env.scoring_datastore_output_container, + env.scoring_datastore_access_key, + ) + srcblobproperties = containerclient.get_blob_client( + srcblobname + ).get_blob_properties() # noqa E501 + + destfolder = srcblobproperties.last_modified.date().isoformat() + filetime = ( + srcblobproperties.last_modified.time() + .isoformat("milliseconds") + .replace(":", "_") + .replace(".", "_") + ) # noqa E501 + destfilenameparts = env.scoring_datastore_output_filename.split(".") + destblobname = "{}/{}_{}.{}".format( + destfolder, destfilenameparts[0], filetime, destfilenameparts[1] + ) + + destblobclient = containerclient.get_blob_client(destblobname) + destblobclient.start_copy_from_url(srcbloburl) + + +def run_batchscore_pipeline(): + try: + env = Env() + + args = parse_args() + + aml_workspace = Workspace.get( + name=env.workspace_name, + subscription_id=env.subscription_id, + resource_group=env.resource_group, + ) + + scoringpipeline = get_pipeline(args.pipeline_id, aml_workspace, env) + + experiment = Experiment(workspace=aml_workspace, name=env.experiment_name) # NOQA: E501 + + run = experiment.submit( + scoringpipeline, + pipeline_parameters={ + "model_name": env.model_name, + "model_version": env.model_version, + "model_tag_name": " ", + "model_tag_value": " ", + }, + ) + + run.wait_for_completion(show_output=True) + + if run.get_status() == "Finished": + copy_output(list(run.get_steps())[0].id, env) + + except Exception as ex: + print("Error: {}".format(ex)) + + +if __name__ == "__main__": + run_batchscore_pipeline() diff --git a/ml_service/util/attach_compute.py b/ml_service/util/attach_compute.py index bcff58da..cf8c07a6 100644 --- 
a/ml_service/util/attach_compute.py +++ b/ml_service/util/attach_compute.py @@ -1,3 +1,5 @@ + +import traceback from azureml.core import Workspace from azureml.core.compute import AmlCompute from azureml.core.compute import ComputeTarget @@ -5,38 +7,33 @@ from ml_service.util.env_variables import Env -def get_compute( - workspace: Workspace, - compute_name: str, - vm_size: str -): +def get_compute(workspace: Workspace, compute_name: str, vm_size: str, for_batch_scoring: bool = False): # NOQA E501 try: if compute_name in workspace.compute_targets: compute_target = workspace.compute_targets[compute_name] if compute_target and type(compute_target) is AmlCompute: - print('Found existing compute target ' + compute_name - + ' so using it.') + print("Found existing compute target " + compute_name + " so using it.") # NOQA else: e = Env() compute_config = AmlCompute.provisioning_configuration( vm_size=vm_size, - vm_priority=e.vm_priority, - min_nodes=e.min_nodes, - max_nodes=e.max_nodes, + vm_priority=e.vm_priority if not for_batch_scoring else e.vm_priority_scoring, # NOQA E501 + min_nodes=e.min_nodes if not for_batch_scoring else e.min_nodes_scoring, # NOQA E501 + max_nodes=e.max_nodes if not for_batch_scoring else e.max_nodes_scoring, # NOQA E501 idle_seconds_before_scaledown="300" # #Uncomment the below lines for VNet support # vnet_resourcegroup_name=vnet_resourcegroup_name, # vnet_name=vnet_name, # subnet_name=subnet_name ) - compute_target = ComputeTarget.create(workspace, compute_name, - compute_config) + compute_target = ComputeTarget.create( + workspace, compute_name, compute_config + ) compute_target.wait_for_completion( - show_output=True, - min_node_count=None, - timeout_in_minutes=10) + show_output=True, min_node_count=None, timeout_in_minutes=10 + ) return compute_target - except ComputeTargetException as e: - print(e) - print('An error occurred trying to provision compute.') + except ComputeTargetException: + traceback.print_exc() + print("An error occurred trying to provision compute.") exit(1) diff --git a/ml_service/util/env_variables.py b/ml_service/util/env_variables.py index c734e098..753c152d 100644 --- a/ml_service/util/env_variables.py +++ b/ml_service/util/env_variables.py @@ -1,174 +1,126 @@ +"""Env dataclass to load and hold all environment variables +""" +from dataclasses import dataclass import os -from dotenv import load_dotenv - - -class Singleton(object): - _instances = {} - - def __new__(class_, *args, **kwargs): - if class_ not in class_._instances: - class_._instances[class_] = super(Singleton, class_).__new__(class_, *args, **kwargs) # noqa E501 - return class_._instances[class_] - - -class Env(Singleton): - - def __init__(self): - load_dotenv() - self._workspace_name = os.environ.get("WORKSPACE_NAME") - self._resource_group = os.environ.get("RESOURCE_GROUP") - self._subscription_id = os.environ.get("SUBSCRIPTION_ID") - self._tenant_id = os.environ.get("TENANT_ID") - self._app_id = os.environ.get("SP_APP_ID") - self._app_secret = os.environ.get("SP_APP_SECRET") - self._vm_size = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU") - self._compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME") - self._vm_priority = os.environ.get("AML_CLUSTER_PRIORITY", 'lowpriority') # noqa E501 - self._min_nodes = int(os.environ.get("AML_CLUSTER_MIN_NODES", 0)) - self._max_nodes = int(os.environ.get("AML_CLUSTER_MAX_NODES", 4)) - self._build_id = os.environ.get("BUILD_BUILDID") - self._pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME") - self._sources_directory_train = 
os.environ.get("SOURCES_DIR_TRAIN") - self._train_script_path = os.environ.get("TRAIN_SCRIPT_PATH") - self._evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH") - self._register_script_path = os.environ.get("REGISTER_SCRIPT_PATH") - self._model_name = os.environ.get("MODEL_NAME") - self._experiment_name = os.environ.get("EXPERIMENT_NAME") - self._model_version = os.environ.get('MODEL_VERSION') - self._image_name = os.environ.get('IMAGE_NAME') - self._db_cluster_id = os.environ.get("DB_CLUSTER_ID") - self._score_script = os.environ.get("SCORE_SCRIPT") - self._build_uri = os.environ.get("BUILD_URI") - self._dataset_name = os.environ.get("DATASET_NAME") - self._datastore_name = os.environ.get("DATASTORE_NAME") - self._dataset_version = os.environ.get("DATASET_VERSION") - self._run_evaluation = os.environ.get("RUN_EVALUATION", "true") - self._allow_run_cancel = os.environ.get( - "ALLOW_RUN_CANCEL", "true") - self._aml_env_name = os.environ.get("AML_ENV_NAME") - self._rebuild_env = os.environ.get("AML_REBUILD_ENVIRONMENT", - "false").lower().strip() == "true" - - @property - def workspace_name(self): - return self._workspace_name - - @property - def resource_group(self): - return self._resource_group - - @property - def subscription_id(self): - return self._subscription_id - - @property - def tenant_id(self): - return self._tenant_id - - @property - def app_id(self): - return self._app_id - - @property - def app_secret(self): - return self._app_secret - - @property - def vm_size(self): - return self._vm_size - - @property - def compute_name(self): - return self._compute_name - - @property - def db_cluster_id(self): - return self._db_cluster_id - - @property - def build_id(self): - return self._build_id - - @property - def pipeline_name(self): - return self._pipeline_name +from typing import Optional - @property - def sources_directory_train(self): - return self._sources_directory_train - - @property - def train_script_path(self): - return self._train_script_path - - @property - def evaluate_script_path(self): - return self._evaluate_script_path - - @property - def register_script_path(self): - return self._register_script_path - - @property - def model_name(self): - return self._model_name - - @property - def experiment_name(self): - return self._experiment_name - - @property - def vm_priority(self): - return self._vm_priority - - @property - def min_nodes(self): - return self._min_nodes - - @property - def max_nodes(self): - return self._max_nodes - - @property - def model_version(self): - return self._model_version - - @property - def image_name(self): - return self._image_name - - @property - def score_script(self): - return self._score_script - - @property - def build_uri(self): - return self._build_uri - - @property - def dataset_name(self): - return self._dataset_name - - @property - def datastore_name(self): - return self._datastore_name - - @property - def dataset_version(self): - return self._dataset_version - - @property - def run_evaluation(self): - return self._run_evaluation - - @property - def allow_run_cancel(self): - return self._allow_run_cancel +from dotenv import load_dotenv - @property - def aml_env_name(self): - return self._aml_env_name - @property - def rebuild_env(self): - return self._rebuild_env +@dataclass(frozen=True) +class Env: + """Loads all environment variables into a predefined set of properties + """ + + # to load .env file into environment variables for local execution + load_dotenv() + workspace_name: Optional[str] = os.environ.get("WORKSPACE_NAME") 
+ resource_group: Optional[str] = os.environ.get("RESOURCE_GROUP") + subscription_id: Optional[str] = os.environ.get("SUBSCRIPTION_ID") + tenant_id: Optional[str] = os.environ.get("TENANT_ID") + app_id: Optional[str] = os.environ.get("SP_APP_ID") + app_secret: Optional[str] = os.environ.get("SP_APP_SECRET") + vm_size: Optional[str] = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU") + compute_name: Optional[str] = os.environ.get("AML_COMPUTE_CLUSTER_NAME") + vm_priority: Optional[str] = os.environ.get( + "AML_CLUSTER_PRIORITY", "lowpriority" + ) # NOQA: E501 + min_nodes: int = int(os.environ.get("AML_CLUSTER_MIN_NODES", 0)) + max_nodes: int = int(os.environ.get("AML_CLUSTER_MAX_NODES", 4)) + build_id: Optional[str] = os.environ.get("BUILD_BUILDID") + pipeline_name: Optional[str] = os.environ.get("TRAINING_PIPELINE_NAME") + sources_directory_train: Optional[str] = os.environ.get( + "SOURCES_DIR_TRAIN" + ) # NOQA: E501 + train_script_path: Optional[str] = os.environ.get("TRAIN_SCRIPT_PATH") + evaluate_script_path: Optional[str] = os.environ.get( + "EVALUATE_SCRIPT_PATH" + ) # NOQA: E501 + register_script_path: Optional[str] = os.environ.get( + "REGISTER_SCRIPT_PATH" + ) # NOQA: E501 + model_name: Optional[str] = os.environ.get("MODEL_NAME") + experiment_name: Optional[str] = os.environ.get("EXPERIMENT_NAME") + model_version: Optional[str] = os.environ.get("MODEL_VERSION") + image_name: Optional[str] = os.environ.get("IMAGE_NAME") + db_cluster_id: Optional[str] = os.environ.get("DB_CLUSTER_ID") + score_script: Optional[str] = os.environ.get("SCORE_SCRIPT") + build_uri: Optional[str] = os.environ.get("BUILD_URI") + dataset_name: Optional[str] = os.environ.get("DATASET_NAME") + datastore_name: Optional[str] = os.environ.get("DATASTORE_NAME") + dataset_version: Optional[str] = os.environ.get("DATASET_VERSION") + run_evaluation: Optional[str] = os.environ.get("RUN_EVALUATION", "true") + allow_run_cancel: Optional[str] = os.environ.get( + "ALLOW_RUN_CANCEL", "true" + ) # NOQA: E501 + aml_env_name: Optional[str] = os.environ.get("AML_ENV_NAME") + aml_env_train_conda_dep_file: Optional[str] = os.environ.get( + "AML_ENV_TRAIN_CONDA_DEP_FILE", "conda_dependencies.yml" + ) + rebuild_env: Optional[bool] = os.environ.get( + "AML_REBUILD_ENVIRONMENT", "false" + ).lower().strip() == "true" + + use_gpu_for_scoring: Optional[bool] = os.environ.get( + "USE_GPU_FOR_SCORING", "false" + ).lower().strip() == "true" + aml_env_score_conda_dep_file: Optional[str] = os.environ.get( + "AML_ENV_SCORE_CONDA_DEP_FILE", "conda_dependencies_scoring.yml" + ) + aml_env_scorecopy_conda_dep_file: Optional[str] = os.environ.get( + "AML_ENV_SCORECOPY_CONDA_DEP_FILE", "conda_dependencies_scorecopy.yml" + ) + vm_size_scoring: Optional[str] = os.environ.get( + "AML_COMPUTE_CLUSTER_CPU_SKU_SCORING" + ) + compute_name_scoring: Optional[str] = os.environ.get( + "AML_COMPUTE_CLUSTER_NAME_SCORING" + ) + vm_priority_scoring: Optional[str] = os.environ.get( + "AML_CLUSTER_PRIORITY_SCORING", "lowpriority" + ) + min_nodes_scoring: int = int( + os.environ.get("AML_CLUSTER_MIN_NODES_SCORING", 0) + ) # NOQA: E501 + max_nodes_scoring: int = int( + os.environ.get("AML_CLUSTER_MAX_NODES_SCORING", 4) + ) # NOQA: E501 + rebuild_env_scoring: Optional[bool] = os.environ.get( + "AML_REBUILD_ENVIRONMENT_SCORING", "false" + ).lower().strip() == "true" + scoring_datastore_storage_name: Optional[str] = os.environ.get( + "SCORING_DATASTORE_STORAGE_NAME" + ) + scoring_datastore_access_key: Optional[str] = os.environ.get( + "SCORING_DATASTORE_ACCESS_KEY" + ) + 
scoring_datastore_input_container: Optional[str] = os.environ.get( + "SCORING_DATASTORE_INPUT_CONTAINER" + ) + scoring_datastore_input_filename: Optional[str] = os.environ.get( + "SCORING_DATASTORE_INPUT_FILENAME" + ) + scoring_datastore_output_container: Optional[str] = os.environ.get( + "SCORING_DATASTORE_OUTPUT_CONTAINER" + ) + scoring_datastore_output_filename: Optional[str] = os.environ.get( + "SCORING_DATASTORE_OUTPUT_FILENAME" + ) + scoring_dataset_name: Optional[str] = os.environ.get( + "SCORING_DATASET_NAME" + ) # NOQA: E501 + scoring_pipeline_name: Optional[str] = os.environ.get( + "SCORING_PIPELINE_NAME" + ) # NOQA: E501 + aml_env_name_scoring: Optional[str] = os.environ.get( + "AML_ENV_NAME_SCORING" + ) # NOQA: E501 + aml_env_name_score_copy: Optional[str] = os.environ.get( + "AML_ENV_NAME_SCORE_COPY" + ) # NOQA: E501 + batchscore_script_path: Optional[str] = os.environ.get( + "BATCHSCORE_SCRIPT_PATH" + ) # NOQA: E501 + batchscore_copy_script_path: Optional[str] = os.environ.get( + "BATCHSCORE_COPY_SCRIPT_PATH" + ) # NOQA: E501 diff --git a/ml_service/util/manage_environment.py b/ml_service/util/manage_environment.py index 43749f3f..b61c97fe 100644 --- a/ml_service/util/manage_environment.py +++ b/ml_service/util/manage_environment.py @@ -1,12 +1,18 @@ + +import os +import traceback from azureml.core import Workspace, Environment from ml_service.util.env_variables import Env -import os +from azureml.core.runconfig import DEFAULT_CPU_IMAGE, DEFAULT_GPU_IMAGE def get_environment( workspace: Workspace, environment_name: str, - create_new: bool = False + conda_dependencies_file: str, + create_new: bool = False, + enable_docker: bool = None, + use_gpu: bool = False ): try: e = Env() @@ -17,13 +23,19 @@ def get_environment( restored_environment = environments[environment_name] if restored_environment is None or create_new: - new_env = Environment.from_conda_specification(environment_name, os.path.join(e.sources_directory_train, "conda_dependencies.yml")) # NOQA: E501 + new_env = Environment.from_conda_specification( + environment_name, + os.path.join(e.sources_directory_train, conda_dependencies_file), # NOQA: E501 + ) # NOQA: E501 restored_environment = new_env + if enable_docker is not None: + restored_environment.docker.enabled = enable_docker + restored_environment.docker.base_image = DEFAULT_GPU_IMAGE if use_gpu else DEFAULT_CPU_IMAGE # NOQA: E501 restored_environment.register(workspace) if restored_environment is not None: print(restored_environment) return restored_environment - except Exception as e: - print(e) + except Exception: + traceback.print_exc() exit(1)
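
Editorial note, not part of the patch: because Env is now a frozen dataclass, callers read configuration as plain typed attributes instead of going through @property accessors, and instances are immutable. A minimal usage sketch, assuming the environment (or a local .env file) supplies the values:

    from ml_service.util.env_variables import Env

    e = Env()                  # load_dotenv() already ran at class definition time
    print(e.workspace_name)    # str or None
    print(e.max_nodes)         # already an int, no casting needed
    print(e.rebuild_env)       # already parsed to a bool
    # e.max_nodes = 8          # would raise dataclasses.FrozenInstanceError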
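
Editorial note, not part of the patch: the extended get_environment() signature lets the batch scoring pipeline choose its own conda dependency file and a Docker base image. A sketch of how a pipeline-building script might call it; the Workspace.get() lookup shown here is illustrative and not taken from this diff:

    from azureml.core import Workspace
    from ml_service.util.env_variables import Env
    from ml_service.util.manage_environment import get_environment

    e = Env()
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    scoring_env = get_environment(
        aml_workspace,
        e.aml_env_name_scoring,
        conda_dependencies_file=e.aml_env_score_conda_dep_file,
        create_new=e.rebuild_env_scoring,
        enable_docker=True,               # also selects the base image below
        use_gpu=e.use_gpu_for_scoring,    # GPU vs CPU default base image
    )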