diff --git a/.env.example b/.env.example
index b7d0d5b5..47311d1e 100644
--- a/.env.example
+++ b/.env.example
@@ -5,44 +5,77 @@
 TENANT_ID = ''
 BASE_NAME = ''
 SP_APP_ID = ''
 SP_APP_SECRET = ''
+RESOURCE_GROUP = 'mlops-RG'
 
-# Mock build/release ID for local testing - update ReleaseID each "release"
+# Mock build/release ID for local testing
 BUILD_BUILDID = '001'
-RELEASE_RELEASEID = '001'
 
 # Azure ML Workspace Variables
-EXPERIMENT_NAME = ''
-SCRIPT_FOLDER = './'
+WORKSPACE_NAME = 'mlops-aml-ws'
+EXPERIMENT_NAME = 'mlopspython'
 
 # AML Compute Cluster Config
-AML_COMPUTE_CLUSTER_NAME = ''
-AML_COMPUTE_CLUSTER_CPU_SKU = ''
-AML_CLUSTER_MAX_NODES = ''
-AML_CLUSTER_MIN_NODES = ''
+AML_ENV_NAME='diabetes_regression_training_env'
+AML_ENV_TRAIN_CONDA_DEP_FILE="conda_dependencies.yml"
+AML_COMPUTE_CLUSTER_NAME = 'train-cluster'
+AML_COMPUTE_CLUSTER_CPU_SKU = 'STANDARD_DS2_V2'
+AML_CLUSTER_MAX_NODES = '4'
+AML_CLUSTER_MIN_NODES = '0'
 AML_CLUSTER_PRIORITY = 'lowpriority'
 
 # Training Config
-MODEL_NAME = 'sklearn_regression_model.pkl'
+MODEL_NAME = 'diabetes_regression_model.pkl'
 MODEL_VERSION = '1'
-TRAIN_SCRIPT_PATH = 'training/train.py'
+TRAIN_SCRIPT_PATH = 'training/train_aml.py'
+
 
 # AML Pipeline Config
-TRAINING_PIPELINE_NAME = ''
-PIPELINE_CONDA_PATH = 'aml_config/conda_dependencies.yml'
+TRAINING_PIPELINE_NAME = 'Training Pipeline'
 MODEL_PATH = ''
 EVALUATE_SCRIPT_PATH = 'evaluate/evaluate_model.py'
 REGISTER_SCRIPT_PATH = 'register/register_model.py'
-SOURCES_DIR_TRAIN = 'code'
-
-# These are not mandatory for the core workflow
-# Remote VM Config
-REMOTE_VM_NAME = ''
-REMOTE_VM_USERNAME = ''
-REMOTE_VM_PASSWORD = ''
-REMOTE_VM_IP = ''
-# Image config
-IMAGE_NAME = ''
-IMAGE_DESCRIPTION = ''
-IMAGE_VERSION = ''
-# ACI Config
-ACI_CPU_CORES = ''
-ACI_MEM_GB = ''
-ACI_DESCRIPTION = ''
\ No newline at end of file
+SOURCES_DIR_TRAIN = 'diabetes_regression'
+DATASET_NAME = 'diabetes_ds'
+DATASET_VERSION = 'latest'
+# Optional. Set this if you have configured a non-default datastore to point to your data
+DATASTORE_NAME = ''
+SCORE_SCRIPT = 'scoring/score.py'
+
+# Optional. Used by a training pipeline with R on Databricks
+DB_CLUSTER_ID = ''
+
+# Optional. Container Image name for image creation
+IMAGE_NAME = 'mltrained'
+
+# Run Evaluation Step in AML pipeline
+RUN_EVALUATION = 'true'
+
+# Set to 'true' to cancel the Azure ML pipeline run when evaluation criteria are not met.
+ALLOW_RUN_CANCEL = 'true'
+
+# Flag to allow rebuilding the AML Environment after it was built for the first time. This enables dependency updates from conda_dependencies.yml.
+AML_REBUILD_ENVIRONMENT = 'false'
+
+
+
+USE_GPU_FOR_SCORING = "false"
+AML_ENV_SCORE_CONDA_DEP_FILE="conda_dependencies_scoring.yml"
+AML_ENV_SCORECOPY_CONDA_DEP_FILE="conda_dependencies_scorecopy.yml"
+# AML Compute Cluster Config for parallel batch scoring
+AML_ENV_NAME_SCORING='diabetes_regression_scoring_env'
+AML_ENV_NAME_SCORE_COPY='diabetes_regression_score_copy_env'
+AML_COMPUTE_CLUSTER_NAME_SCORING = 'score-cluster'
+AML_COMPUTE_CLUSTER_CPU_SKU_SCORING = 'STANDARD_DS2_V2'
+AML_CLUSTER_MAX_NODES_SCORING = '4'
+AML_CLUSTER_MIN_NODES_SCORING = '0'
+AML_CLUSTER_PRIORITY_SCORING = 'lowpriority'
+AML_REBUILD_ENVIRONMENT_SCORING = 'true'
+BATCHSCORE_SCRIPT_PATH = 'scoring/parallel_batchscore.py'
+BATCHSCORE_COPY_SCRIPT_PATH = 'scoring/parallel_batchscore_copyoutput.py'
+
+
+SCORING_DATASTORE_INPUT_CONTAINER = 'input'
+SCORING_DATASTORE_INPUT_FILENAME = 'diabetes_scoring_input.csv'
+SCORING_DATASTORE_OUTPUT_CONTAINER = 'output'
+SCORING_DATASTORE_OUTPUT_FILENAME = 'diabetes_scoring_output.csv'
+SCORING_DATASET_NAME = 'diabetes_scoring_ds'
+SCORING_PIPELINE_NAME = 'diabetes-scoring-pipeline'
diff --git a/.gitignore b/.gitignore
index 3a5a8879..3ab04e2f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,7 @@ wheels/
 .installed.cfg
 *.egg
 MANIFEST
+venv/
 
 # PyInstaller
 #  Usually these files are written by a python script from a template
@@ -46,6 +47,8 @@ coverage.xml
 *.cover
 .hypothesis/
 .pytest_cache/
+*-testresults.xml
+test-output.xml
 
 # Translations
 *.mo
@@ -90,6 +93,7 @@ ENV/
 env.bak/
 venv.bak/
 *.vscode
+condaenv.*
 
 # Spyder project settings
 .spyderproject
diff --git a/.pipelines/abtest.yml b/.pipelines/abtest.yml
new file mode 100644
index 00000000..cf876181
--- /dev/null
+++ b/.pipelines/abtest.yml
@@ -0,0 +1,168 @@
+# Pipeline for the canary deployment workflow.
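+#
+# A rough sketch of the flow implemented by the stages below: build the scoring
+# image, deploy it as the "blue" release, shift Istio traffic to 50/50 and then
+# fully to blue, re-tag the image as the production "green" release, route all
+# traffic back to green, and finally uninstall the blue release. Requests can
+# also be steered explicitly with the x-api-version header handled by the
+# abtest-istio chart, e.g. (the gateway address below is hypothetical):
+#
+#   curl --header "x-api-version: blue" http://<GATEWAY_IP>/score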
+
+resources:
+  containers:
+  - container: mlops
+    image: mcr.microsoft.com/mlops/python:latest
+
+pr: none
+trigger:
+  branches:
+    include:
+    - master
+  paths:
+    exclude:
+    - docs/
+    - environment_setup/
+    - ml_service/util/create_scoring_image.*
+    - ml_service/util/smoke_test_scoring_service.py
+
+variables:
+- template: diabetes_regression-variables-template.yml
+- group: 'devopsforai-aml-vg'
+- name: 'helmVersion'
+  value: 'v3.1.1'
+- name: 'helmDownloadURL'
+  value: 'https://get.helm.sh/helm-$HELM_VERSION-linux-amd64.tar.gz'
+- name: 'blueReleaseName'
+  value: 'model-blue'
+- name: 'greenReleaseName'
+  value: 'model-green'
+- name: 'SCORE_SCRIPT'
+  value: 'scoring/scoreA.py'
+
+stages:
+- stage: 'Building'
+  jobs:
+  - job: "Build_Scoring_image"
+    timeoutInMinutes: 0
+    pool:
+      vmImage: 'ubuntu-latest'
+    container: mlops
+    steps:
+    - task: AzureCLI@1
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+        scriptLocation: inlineScript
+        inlineScript: |
+          set -e
+          export SUBSCRIPTION_ID=$(az account show --query id -o tsv)
+          python -m ml_service.util.create_scoring_image --output_image_location_file image_location.txt
+      displayName: 'Create Scoring Image'
+      name: 'buildscoringimage'
+
+    - publish: image_location.txt
+      artifact: image_location
+
+    - publish: $(System.DefaultWorkingDirectory)/charts
+      artifact: allcharts
+
+- stage: 'Blue_Staging'
+  jobs:
+  - deployment: "Deploy_to_Staging"
+    timeoutInMinutes: 0
+    environment: abtestenv
+    strategy:
+      runOnce:
+        deploy:
+          steps:
+          - script: |
+              IMAGE_LOCATION="$(cat $(Pipeline.Workspace)/image_location/image_location.txt)"
+              echo "##vso[task.setvariable variable=IMAGE_LOCATION]$IMAGE_LOCATION"
+            displayName: 'Get Image Location'
+          - template: helm-upgrade-template.yml
+            parameters:
+              chartPath: '$(Pipeline.Workspace)/allcharts/abtest-model'
+              releaseName: $(blueReleaseName)
+              overrideValues: 'deployment.name=$(blueReleaseName),deployment.bluegreen=blue,deployment.image.name=$(IMAGE_LOCATION)'
+
+- stage: 'Blue_50'
+  jobs:
+  - job: 'Blue_Rollout_50'
+    displayName: '50/50 rollout to blue environment'
+    timeoutInMinutes: 0
+    steps:
+    - template: helm-upgrade-template.yml
+      parameters:
+        chartPath: '$(System.DefaultWorkingDirectory)/charts/abtest-istio'
+        releaseName: 'abtest-istio'
+        overrideValues: 'weight.blue=50,weight.green=50'
+
+- stage: 'Blue_100'
+  jobs:
+  - deployment: 'blue_Rollout_100'
+    timeoutInMinutes: 0
+    environment: abtestenv
+    strategy:
+      runOnce:
+        deploy:
+          steps:
+          - template: helm-upgrade-template.yml
+            parameters:
+              chartPath: '$(Pipeline.Workspace)/allcharts/abtest-istio'
+              releaseName: 'abtest-istio'
+              overrideValues: 'weight.blue=100,weight.green=0'
+
+- stage: 'Rollback'
+  dependsOn: 'Blue_100'
+  condition: failed()
+  jobs:
+  - deployment: 'Roll_Back'
+    displayName: 'Roll Back after failure'
+    environment: abtestenv
+    strategy:
+      runOnce:
+        deploy:
+          steps:
+          - template: helm-upgrade-template.yml
+            parameters:
+              chartPath: '$(Pipeline.Workspace)/allcharts/abtest-istio'
+              releaseName: 'abtest-istio'
+              overrideValues: 'weight.blue=0,weight.green=100'
+
+- stage: 'Set_Production_Tag'
+  dependsOn: 'Blue_100'
+  condition: succeeded()
+  jobs:
+  - deployment: 'green_blue_tagging'
+    timeoutInMinutes: 0
+    environment: abtestenv
+    strategy:
+      runOnce:
+        deploy:
+          steps:
+          - script: |
+              IMAGE_LOCATION="$(cat $(Pipeline.Workspace)/image_location/image_location.txt)"
+              echo "##vso[task.setvariable variable=IMAGE_LOCATION]$IMAGE_LOCATION"
+            displayName: 'Get Image Location'
+          - template: helm-upgrade-template.yml
+            parameters:
+              chartPath: '$(Pipeline.Workspace)/allcharts/abtest-model'
+              releaseName: $(greenReleaseName)
+              overrideValues: 'deployment.name=$(greenReleaseName),deployment.bluegreen=green,deployment.image.name=$(IMAGE_LOCATION)'
+
+- stage: 'Green_100'
+  jobs:
+  - job: 'Prod_Rollout_100'
+    timeoutInMinutes: 0
+    steps:
+    - template: helm-upgrade-template.yml
+      parameters:
+        chartPath: '$(System.DefaultWorkingDirectory)/charts/abtest-istio'
+        releaseName: 'abtest-istio'
+        overrideValues: 'weight.blue=0,weight.green=100'
+
+- stage: 'Disable_blue'
+  condition: always()
+  jobs:
+  - job: 'blue_disable'
+    timeoutInMinutes: 0
+    steps:
+    - template: helm-install-template.yml
+    - task: HelmDeploy@0
+      displayName: 'helm uninstall blue'
+      inputs:
+        connectionType: 'Kubernetes Service Connection'
+        kubernetesServiceConnection: $(K8S_AB_SERVICE_CONNECTION)
+        command: delete
+        arguments: $(blueReleaseName) --namespace $(K8S_AB_NAMESPACE)
diff --git a/.pipelines/azdo-base-pipeline.yml b/.pipelines/azdo-base-pipeline.yml
deleted file mode 100644
index 926b404f..00000000
--- a/.pipelines/azdo-base-pipeline.yml
+++ /dev/null
@@ -1,26 +0,0 @@
-# this pipeline should be ignored for now
-parameters:
-  pipelineType: 'training'
-
-steps:
-- script: |
-   flake8 --output-file=$(Build.BinariesDirectory)/lint-testresults.xml --format junit-xml
-  workingDirectory: '$(Build.SourcesDirectory)'
-  displayName: 'Run code quality tests'
-  enabled: 'true'
-
-- script: |
-   pytest --junitxml=$(Build.BinariesDirectory)/unit-testresults.xml $(Build.SourcesDirectory)/tests/unit
-  displayName: 'Run unit tests'
-  enabled: 'true'
-  env:
-    SP_APP_SECRET: '$(SP_APP_SECRET)'
-
-- task: PublishTestResults@2
-  condition: succeededOrFailed()
-  inputs:
-    testResultsFiles: '$(Build.BinariesDirectory)/*-testresults.xml'
-    testRunTitle: 'Linting & Unit tests'
-    failTaskOnFailedTests: true
-  displayName: 'Publish linting and unit test results'
-  enabled: 'true'
diff --git a/.pipelines/azdo-ci-build-train.yml b/.pipelines/azdo-ci-build-train.yml
deleted file mode 100644
index 1b34b892..00000000
--- a/.pipelines/azdo-ci-build-train.yml
+++ /dev/null
@@ -1,45 +0,0 @@
-pr: none
-trigger:
-  branches:
-    include:
-    - master
-
-pool:
-  vmImage: 'ubuntu-latest'
-
-container: mcr.microsoft.com/mlops/python:latest
-
-
-variables:
-- group: devopsforai-aml-vg
-
-
-steps:
-- template: azdo-base-pipeline.yml
-
-- bash: |
-   # Invoke the Python building and publishing a training pipeline
-   python3 $(Build.SourcesDirectory)/ml_service/pipelines/build_train_pipeline.py
-  failOnStderr: 'false'
-  env:
-    SP_APP_SECRET: '$(SP_APP_SECRET)'
-  displayName: 'Publish Azure Machine Learning Pipeline'
-  enabled: 'true'
-
-- task: CopyFiles@2
-  displayName: 'Copy Files to: $(Build.ArtifactStagingDirectory)'
-  inputs:
-    SourceFolder: '$(Build.SourcesDirectory)'
-    TargetFolder: '$(Build.ArtifactStagingDirectory)'
-    Contents: |
-     ml_service/pipelines/?(run_train_pipeline.py|*.json)
-     code/scoring/**
-
-
-- task: PublishBuildArtifacts@1
-  displayName: 'Publish Artifact'
-  inputs:
-    ArtifactName: 'mlops-pipelines'
-    publishLocation: 'container'
-    pathtoPublish: '$(Build.ArtifactStagingDirectory)'
-    TargetPath: '$(Build.ArtifactStagingDirectory)'
\ No newline at end of file
diff --git a/.pipelines/azdo-pr-build-train.yml b/.pipelines/azdo-pr-build-train.yml
deleted file mode 100644
index 8bf6ca56..00000000
--- a/.pipelines/azdo-pr-build-train.yml
+++ /dev/null
@@ -1,18 +0,0 @@
-trigger: none
-pr:
-  branches:
-    include:
-    - master
-
-pool:
-  vmImage: 'ubuntu-latest'
-
-container: mcr.microsoft.com/mlops/python:latest
-
-
-variables:
-- group: devopsforai-aml-vg
-
-
-steps:
-- template: azdo-base-pipeline.yml
\ No newline at end of file
diff --git a/.pipelines/code-quality-template.yml b/.pipelines/code-quality-template.yml
new file mode 100644
index 00000000..afaf7a9a
--- /dev/null
+++ b/.pipelines/code-quality-template.yml
@@ -0,0 +1,27 @@
+# Pipeline template to run linting, unit tests with code coverage, and publish the results.
+steps:
+- script: |
+   flake8 --output-file=lint-testresults.xml --format junit-xml
+  displayName: 'Run lint tests'
+
+- script: |
+   python -m pytest . --cov=diabetes_regression --cov-report=html --cov-report=xml --junitxml=unit-testresults.xml
+  condition: succeededOrFailed()
+  displayName: 'Run unit tests'
+
+- task: PublishTestResults@2
+  condition: succeededOrFailed()
+  inputs:
+    testResultsFiles: '*-testresults.xml'
+    testRunTitle: 'Linting & Unit tests'
+    failTaskOnFailedTests: true
+  displayName: 'Publish test results'
+
+- task: PublishCodeCoverageResults@1
+  displayName: 'Publish coverage report'
+  condition: succeededOrFailed()
+  inputs:
+    codeCoverageTool: Cobertura
+    summaryFileLocation: 'coverage.xml'
+    reportDirectory: 'htmlcov'
+    failIfCoverageEmpty: true
diff --git a/.pipelines/diabetes_regression-batchscoring-ci.yml b/.pipelines/diabetes_regression-batchscoring-ci.yml
new file mode 100644
index 00000000..1392fddb
--- /dev/null
+++ b/.pipelines/diabetes_regression-batchscoring-ci.yml
@@ -0,0 +1,89 @@
+# Continuous Integration (CI) pipeline that orchestrates the batch scoring of the diabetes_regression model.
+
+# Runtime parameters to select artifacts
+parameters:
+- name : artifactBuildId
+  displayName: Model Train CI Build ID. Default is 'latest'.
+  type: string
+  default: latest
+
+pr: none
+
+# Trigger this pipeline on model-train pipeline completion
+resources:
+  containers:
+  - container: mlops
+    image: mcr.microsoft.com/mlops/python:latest
+  pipelines:
+  - pipeline: model-train-ci
+    source: Model-Train-Register-CI # Name of the triggering pipeline
+    trigger:
+      branches:
+        include:
+        - master
+
+trigger:
+  branches:
+    include:
+    - master
+  paths:
+    include:
+    - diabetes_regression/scoring/parallel_batchscore.py
+    - ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py
+    - ml_service/pipelines/run_parallel_batchscore_pipeline.py
+
+variables:
+- template: diabetes_regression-variables-template.yml
+- group: devopsforai-aml-vg
+
+pool:
+  vmImage: ubuntu-latest
+
+stages:
+- stage: 'Batch_Scoring_Pipeline_CI'
+  displayName: 'Batch Scoring Pipeline CI'
+  jobs:
+  - job: "Build_Batch_Scoring_Pipeline"
+    displayName: "Build Batch Scoring Pipeline"
+    container: mlops
+    timeoutInMinutes: 0
+    steps:
+    - template: code-quality-template.yml
+    - template: diabetes_regression-get-model-id-artifact-template.yml
+      parameters:
+        projectId: '$(resources.pipeline.model-train-ci.projectID)'
+        pipelineId: '$(resources.pipeline.model-train-ci.pipelineID)'
+        artifactBuildId: ${{ parameters.artifactBuildId }}
+    - task: AzureCLI@1
+      displayName: "Publish Batch Scoring Pipeline"
+      name: publish_batchscore
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+        scriptLocation: inlineScript
+        workingDirectory: $(Build.SourcesDirectory)
+        inlineScript: |
+          set -e # fail on error
+          export SUBSCRIPTION_ID=$(az account show --query id -o tsv)
+          # Invoke the Python building and publishing a training pipeline
+          python -m ml_service.pipelines.diabetes_regression_build_parallel_batchscore_pipeline
+      env:
+        SCORING_DATASTORE_ACCESS_KEY: $(SCORING_DATASTORE_ACCESS_KEY)
+
+  - job: "Run_Batch_Score_Pipeline"
+    displayName: "Run Batch Scoring Pipeline"
+    dependsOn: ["Build_Batch_Scoring_Pipeline"]
+    timeoutInMinutes: 240
+    pool: server
+    variables:
+      pipeline_id: $[ dependencies.Build_Batch_Scoring_Pipeline.outputs['publish_batchscore.pipeline_id']]
+      model_name: $[ dependencies.Build_Batch_Scoring_Pipeline.outputs['get_model.MODEL_NAME']]
+      model_version: $[ dependencies.Build_Batch_Scoring_Pipeline.outputs['get_model.MODEL_VERSION']]
+    steps:
+    - task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0
+      displayName: 'Invoke Batch Scoring pipeline'
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+        PipelineId: '$(pipeline_id)'
+        ExperimentName: '$(EXPERIMENT_NAME)'
+        PipelineParameters: '"ParameterAssignments": {"model_name": "$(model_name)", "model_version": "$(model_version)"}'
+
\ No newline at end of file
diff --git a/.pipelines/diabetes_regression-cd.yml b/.pipelines/diabetes_regression-cd.yml
new file mode 100644
index 00000000..a691cc47
--- /dev/null
+++ b/.pipelines/diabetes_regression-cd.yml
@@ -0,0 +1,161 @@
+# Continuous Delivery (CD) pipeline that orchestrates the deployment of the diabetes_regression model.
+
+# Runtime parameters to select artifacts
+parameters:
+- name : artifactBuildId
+  displayName: Model Train CI Build ID. Default is 'latest'.
+  type: string
+  default: latest
+
+pr: none
+
+# Trigger this pipeline on model-train pipeline completion
+trigger: none
+resources:
+  containers:
+  - container: mlops
+    image: mcr.microsoft.com/mlops/python:latest
+  pipelines:
+  - pipeline: model-train-ci
+    source: Model-Train-Register-CI # Name of the triggering pipeline
+    trigger:
+      branches:
+        include:
+        - master
+
+variables:
+- template: diabetes_regression-variables-template.yml
+- group: devopsforai-aml-vg
+
+stages:
+- stage: 'Deploy_ACI'
+  displayName: 'Deploy to ACI'
+  condition: variables['ACI_DEPLOYMENT_NAME']
+  jobs:
+  - job: "Deploy_ACI"
+    displayName: "Deploy to ACI"
+    container: mlops
+    timeoutInMinutes: 0
+    steps:
+    - download: none
+    - template: diabetes_regression-get-model-id-artifact-template.yml
+      parameters:
+        projectId: '$(resources.pipeline.model-train-ci.projectID)'
+        pipelineId: '$(resources.pipeline.model-train-ci.pipelineID)'
+        artifactBuildId: ${{ parameters.artifactBuildId }}
+    - task: AzureCLI@1
+      displayName: 'Install AzureML CLI'
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+        scriptLocation: inlineScript
+        workingDirectory: $(Build.SourcesDirectory)
+        inlineScript: 'az extension add --source https://azurecliext.blob.core.windows.net/release/azure_cli_ml-1.27.0-py3-none-any.whl --yes'
+    - task: AzureCLI@1
+      displayName: "Deploy to ACI (CLI)"
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+        scriptLocation: inlineScript
+        workingDirectory: $(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/scoring
+        inlineScript: |
+          set -e # fail on error
+
+          az ml model deploy --name $(ACI_DEPLOYMENT_NAME) --model '$(MODEL_NAME):$(get_model.MODEL_VERSION)' \
+          --ic inference_config.yml \
+          --dc deployment_config_aci.yml \
+          -g $(RESOURCE_GROUP) --workspace-name $(WORKSPACE_NAME) \
+          --overwrite -v
+    - task: AzureCLI@1
+      displayName: 'Smoke test'
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+        scriptLocation: inlineScript
+        inlineScript: |
+          set -e # fail on error
+          export SUBSCRIPTION_ID=$(az account show --query id -o tsv)
+          python -m ml_service.util.smoke_test_scoring_service --type ACI --service "$(ACI_DEPLOYMENT_NAME)"
+
+- stage: 'Deploy_AKS'
+  displayName: 'Deploy to AKS'
+  dependsOn: Deploy_ACI
+  condition: and(succeeded(), variables['AKS_DEPLOYMENT_NAME'])
+  jobs:
+  - job: "Deploy_AKS"
+    displayName: "Deploy to AKS"
+    container: mlops
+    timeoutInMinutes: 0
+    steps:
+    - template: diabetes_regression-get-model-id-artifact-template.yml
+      parameters:
+        projectId: '$(resources.pipeline.model-train-ci.projectID)'
+        pipelineId: '$(resources.pipeline.model-train-ci.pipelineID)'
+        artifactBuildId: ${{ parameters.artifactBuildId }}
+    - task: AzureCLI@1
+      displayName: 'Install AzureML CLI'
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+        scriptLocation: inlineScript
+        workingDirectory: $(Build.SourcesDirectory)
+        inlineScript: 'az extension add --source https://azurecliext.blob.core.windows.net/release/azure_cli_ml-1.27.0-py3-none-any.whl --yes'
+    - task: AzureCLI@1
+      displayName: "Deploy to AKS (CLI)"
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+        scriptLocation: inlineScript
+        workingDirectory: $(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/scoring
+        inlineScript: |
+          set -e # fail on error
+
+          az ml model deploy --name $(AKS_DEPLOYMENT_NAME) --model '$(MODEL_NAME):$(get_model.MODEL_VERSION)' \
+          --compute-target $(AKS_COMPUTE_NAME) \
+          --ic inference_config.yml \
+          --dc deployment_config_aks.yml \
+          -g $(RESOURCE_GROUP) --workspace-name $(WORKSPACE_NAME) \
+          --overwrite -v
+    - task: AzureCLI@1
+      displayName: 'Smoke test'
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+        scriptLocation: inlineScript
+        inlineScript: |
+          set -e # fail on error
+          export SUBSCRIPTION_ID=$(az account show --query id -o tsv)
+          python -m ml_service.util.smoke_test_scoring_service --type AKS --service "$(AKS_DEPLOYMENT_NAME)"
+
+- stage: 'Deploy_Webapp'
+  displayName: 'Deploy to Webapp'
+  condition: variables['WEBAPP_DEPLOYMENT_NAME']
+  jobs:
+  - job: "Deploy_Webapp"
+    displayName: "Package and deploy model"
+    container: mlops
+    timeoutInMinutes: 0
+    steps:
+    - template: diabetes_regression-get-model-id-artifact-template.yml
+      parameters:
+        projectId: '$(resources.pipeline.model-train-ci.projectID)'
+        pipelineId: '$(resources.pipeline.model-train-ci.pipelineID)'
+        artifactBuildId: ${{ parameters.artifactBuildId }}
+    - template: diabetes_regression-package-model-template.yml
+      parameters:
+        modelId: $(MODEL_NAME):$(get_model.MODEL_VERSION)
+        scoringScriptPath: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/scoring/score.py'
+        condaFilePath: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/conda_dependencies.yml'
+    - script: echo $(IMAGE_LOCATION) >image_location.txt
+      displayName: "Write image location file"
+    - task: AzureWebAppContainer@1
+      name: WebAppDeploy
+      displayName: 'Azure Web App on Container Deploy'
+      inputs:
+        azureSubscription: '$(AZURE_RM_SVC_CONNECTION)'
+        appName: '$(WEBAPP_DEPLOYMENT_NAME)'
+        resourceGroupName: '$(RESOURCE_GROUP)'
+        imageName: '$(IMAGE_LOCATION)'
+    - task: AzureCLI@1
+      displayName: 'Smoke test'
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+        scriptLocation: inlineScript
+        inlineScript: |
+          set -e # fail on error
+          export SUBSCRIPTION_ID=$(az account show --query id -o tsv)
+          python -m ml_service.util.smoke_test_scoring_service --type Webapp --service "$(WebAppDeploy.AppServiceApplicationUrl)/score"
diff --git a/.pipelines/diabetes_regression-ci-image.yml b/.pipelines/diabetes_regression-ci-image.yml
new file mode 100644
index 00000000..d7c925bf
--- /dev/null
+++ b/.pipelines/diabetes_regression-ci-image.yml
@@ -0,0 +1,38 @@
+# Pipeline for building the container image that is used by other pipelines for scoring.
+
+resources:
+  containers:
+  - container: mlops
+    image: mcr.microsoft.com/mlops/python:latest
+
+pr: none
+trigger:
+  branches:
+    include:
+    - master
+  paths:
+    include:
+    - ml_service/util/create_scoring_image.py
+    - ml_service/util/Dockerfile
+    - diabetes_regression/scoring/
+    exclude:
+    - diabetes_regression/scoring/deployment_config_aci.yml
+    - diabetes_regression/scoring/deployment_config_aks.yml
+
+pool:
+  vmImage: 'ubuntu-latest'
+
+container: mlops
+
+variables:
+- group: devopsforai-aml-vg
+- name: 'SCORE_SCRIPT'
+  value: 'scoring/scoreB.py'
+
+steps:
+- template: diabetes_regression-package-model-template.yml
+  parameters:
+    modelId: $(MODEL_NAME):$(MODEL_VERSION)
+    scoringScriptPath: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/$(SCORE_SCRIPT)'
+    condaFilePath: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/conda_dependencies.yml'
+
diff --git a/.pipelines/diabetes_regression-ci.yml b/.pipelines/diabetes_regression-ci.yml
new file mode 100644
index 00000000..5a539af0
--- /dev/null
+++ b/.pipelines/diabetes_regression-ci.yml
@@ -0,0 +1,97 @@
+# Continuous Integration (CI) pipeline that orchestrates the training, evaluation, and registration of the diabetes_regression model.
+
+resources:
+  containers:
+  - container: mlops
+    image: mcr.microsoft.com/mlops/python:latest
+
+pr: none
+trigger:
+  branches:
+    include:
+    - master
+  paths:
+    include:
+    - diabetes_regression/
+    - ml_service/pipelines/diabetes_regression_build_train_pipeline.py
+    - ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py
+    - ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r_on_dbricks.py
+
+variables:
+- template: diabetes_regression-variables-template.yml
+- group: devopsforai-aml-vg
+
+pool:
+  vmImage: ubuntu-latest
+
+stages:
+- stage: 'Model_CI'
+  displayName: 'Model CI'
+  jobs:
+  - job: "Model_CI_Pipeline"
+    displayName: "Model CI Pipeline"
+    container: mlops
+    timeoutInMinutes: 0
+    steps:
+    - template: code-quality-template.yml
+    - task: AzureCLI@1
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+        scriptLocation: inlineScript
+        workingDirectory: $(Build.SourcesDirectory)
+        inlineScript: |
+          set -e # fail on error
+          export SUBSCRIPTION_ID=$(az account show --query id -o tsv)
+          # Invoke the Python building and publishing a training pipeline
+          python -m ml_service.pipelines.diabetes_regression_build_train_pipeline
+      displayName: 'Publish Azure Machine Learning Pipeline'
+
+- stage: 'Trigger_AML_Pipeline'
+  displayName: 'Train and evaluate model'
+  condition: succeeded()
+  variables:
+    BUILD_URI: '$(SYSTEM.COLLECTIONURI)$(SYSTEM.TEAMPROJECT)/_build/results?buildId=$(BUILD.BUILDID)'
+  jobs:
+  - job: "Get_Pipeline_ID"
+    condition: and(succeeded(), eq(coalesce(variables['auto-trigger-training'], 'true'), 'true'))
+    displayName: "Get Pipeline ID for execution"
+    container: mlops
+    timeoutInMinutes: 0
+    steps:
+    - task: AzureCLI@1
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+        scriptLocation: inlineScript
+        workingDirectory: $(Build.SourcesDirectory)
+        inlineScript: |
+          set -e # fail on error
+          export SUBSCRIPTION_ID=$(az account show --query id -o tsv)
+          python -m ml_service.pipelines.run_train_pipeline --output_pipeline_id_file "pipeline_id.txt" --skip_train_execution
+          # Set AMLPIPELINEID variable for next AML Pipeline task in next job
+          AMLPIPELINEID="$(cat pipeline_id.txt)"
+          echo "##vso[task.setvariable variable=AMLPIPELINEID;isOutput=true]$AMLPIPELINEID"
+      name: 'getpipelineid'
+      displayName: 'Get Pipeline ID'
+  - job: "Run_ML_Pipeline"
+    dependsOn: "Get_Pipeline_ID"
+    displayName: "Trigger ML Training Pipeline"
+    timeoutInMinutes: 0
+    pool: server
+    variables:
+      AMLPIPELINE_ID: $[ dependencies.Get_Pipeline_ID.outputs['getpipelineid.AMLPIPELINEID'] ]
+    steps:
+    - task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0
+      displayName: 'Invoke ML pipeline'
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+        PipelineId: '$(AMLPIPELINE_ID)'
+        ExperimentName: '$(EXPERIMENT_NAME)'
+        PipelineParameters: '"ParameterAssignments": {"model_name": "$(MODEL_NAME)"}, "tags": {"BuildId": "$(Build.BuildId)", "BuildUri": "$(BUILD_URI)"}, "StepTags": {"BuildId": "$(Build.BuildId)", "BuildUri": "$(BUILD_URI)"}'
+  - job: "Training_Run_Report"
+    dependsOn: "Run_ML_Pipeline"
+    condition: always()
+    displayName: "Publish artifact if new model was registered"
+    container: mlops
+    timeoutInMinutes: 0
+    steps:
+    - template: diabetes_regression-publish-model-artifact-template.yml
diff --git a/.pipelines/diabetes_regression-get-model-id-artifact-template.yml b/.pipelines/diabetes_regression-get-model-id-artifact-template.yml
new file mode 100644
index 00000000..b9e61306
--- /dev/null
+++ b/.pipelines/diabetes_regression-get-model-id-artifact-template.yml
@@ -0,0 +1,48 @@
+# Pipeline template that gets the model name and version from a previous build's artifact
+
+parameters:
+- name: projectId
+  type: string
+  default: ''
+- name: pipelineId
+  type: string
+  default: ''
+- name: artifactBuildId
+  type: string
+  default: latest
+
+steps:
+  - download: none
+  - task: DownloadPipelineArtifact@2
+    displayName: Download Pipeline Artifacts
+    inputs:
+      source: 'specific'
+      project: '${{ parameters.projectId }}'
+      pipeline: '${{ parameters.pipelineId }}'
+      preferTriggeringPipeline: true
+      ${{ if eq(parameters.artifactBuildId, 'latest') }}:
+        buildVersionToDownload: 'latestFromBranch'
+      ${{ if ne(parameters.artifactBuildId, 'latest') }}:
+        buildVersionToDownload: 'specific'
+        runId: '${{ parameters.artifactBuildId }}'
+      runBranch: '$(Build.SourceBranch)'
+      path: $(Build.SourcesDirectory)/bin
+  - task: Bash@3
+    name: get_model
+    displayName: Parse Json for Model Name and Version
+    inputs:
+      targetType: 'inline'
+      script: |
+        # Print JSON
+        cat $(Build.SourcesDirectory)/bin/model/model.json | jq '.'
+
+        # Set model name and version variables
+        MODEL_NAME=$(jq -r '.name' <$(Build.SourcesDirectory)/bin/model/model.json)
+        MODEL_VERSION=$(jq -r '.version' <$(Build.SourcesDirectory)/bin/model/model.json)
+
+        echo "Model Name: $MODEL_NAME"
+        echo "Model Version: $MODEL_VERSION"
+
+        # Set environment variables
+        echo "##vso[task.setvariable variable=MODEL_VERSION;isOutput=true]$MODEL_VERSION"
+        echo "##vso[task.setvariable variable=MODEL_NAME;isOutput=true]$MODEL_NAME"
diff --git a/.pipelines/diabetes_regression-package-model-template.yml b/.pipelines/diabetes_regression-package-model-template.yml
new file mode 100644
index 00000000..16fc1c1d
--- /dev/null
+++ b/.pipelines/diabetes_regression-package-model-template.yml
@@ -0,0 +1,42 @@
+# Pipeline template that creates a model package and adds the package location to the environment for subsequent tasks to use.
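+#
+# Example usage, taken from .pipelines/diabetes_regression-cd.yml in this change:
+#
+#   - template: diabetes_regression-package-model-template.yml
+#     parameters:
+#       modelId: $(MODEL_NAME):$(get_model.MODEL_VERSION)
+#       scoringScriptPath: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/scoring/score.py'
+#       condaFilePath: '$(Build.SourcesDirectory)/$(SOURCES_DIR_TRAIN)/conda_dependencies.yml'
+#
+# Subsequent tasks can then read the package location from the IMAGE_LOCATION
+# variable that the steps below set.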
+parameters:
+- name: modelId
+  type: string
+  default: ''
+- name: scoringScriptPath
+  type: string
+  default: ''
+- name: condaFilePath
+  type: string
+  default: ''
+
+steps:
+  - task: AzureCLI@1
+    displayName: 'Install AzureML CLI'
+    inputs:
+      azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+      scriptLocation: inlineScript
+      workingDirectory: $(Build.SourcesDirectory)
+      inlineScript: 'az extension add --source https://azurecliext.blob.core.windows.net/release/azure_cli_ml-1.27.0-py3-none-any.whl --yes'
+  - task: AzureCLI@1
+    displayName: 'Create model package and set IMAGE_LOCATION variable'
+    inputs:
+      azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+      scriptLocation: inlineScript
+      inlineScript: |
+        set -e # fail on error
+
+        # Create model package using CLI
+        az ml model package --workspace-name $(WORKSPACE_NAME) -g $(RESOURCE_GROUP) \
+        --model '${{ parameters.modelId }}' \
+        --entry-script '${{ parameters.scoringScriptPath }}' \
+        --cf '${{ parameters.condaFilePath }}' \
+        -v \
+        --rt python --query 'location' -o tsv > image_logs.txt
+
+        # Show logs
+        cat image_logs.txt
+
+        # Set environment variable using the last line of logs that has the package location
+        IMAGE_LOCATION=$(tail -n 1 image_logs.txt)
+        echo "##vso[task.setvariable variable=IMAGE_LOCATION]$IMAGE_LOCATION"
diff --git a/.pipelines/diabetes_regression-publish-model-artifact-template.yml b/.pipelines/diabetes_regression-publish-model-artifact-template.yml
new file mode 100644
index 00000000..d666750d
--- /dev/null
+++ b/.pipelines/diabetes_regression-publish-model-artifact-template.yml
@@ -0,0 +1,29 @@
+# Pipeline template that checks whether a model was registered for the build and publishes an artifact with the model JSON
+steps:
+- task: AzureCLI@1
+  displayName: 'Install AzureML CLI'
+  inputs:
+    azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+    scriptLocation: inlineScript
+    workingDirectory: $(Build.SourcesDirectory)
+    inlineScript: 'az extension add --source https://azurecliext.blob.core.windows.net/release/azure_cli_ml-1.27.0-py3-none-any.whl --yes'
+- task: AzureCLI@1
+  inputs:
+    azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+    scriptLocation: inlineScript
+    workingDirectory: $(Build.SourcesDirectory)
+    inlineScript: |
+      set -e # fail on error
+
+      # Get the model using the build ID tag
+      FOUND_MODEL=$(az ml model list -g $(RESOURCE_GROUP) --workspace-name $(WORKSPACE_NAME) --tag BuildId=$(Build.BuildId) --query '[0]')
+
+      # If the variable is empty, print and fail
+      [[ -z "$FOUND_MODEL" ]] && { echo "Model was not registered for this run." ; exit 1; }
+
+      # Write to a file
+      echo $FOUND_MODEL >model.json
+  name: 'getversion'
+  displayName: "Determine if evaluation succeeded and new model is registered (CLI)"
+- publish: model.json
+  artifact: model
diff --git a/.pipelines/diabetes_regression-variables-template.yml b/.pipelines/diabetes_regression-variables-template.yml
new file mode 100644
index 00000000..502753fb
--- /dev/null
+++ b/.pipelines/diabetes_regression-variables-template.yml
@@ -0,0 +1,129 @@
+# Pipeline template that defines common runtime environment variables.
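+#
+# Pipelines consume this template together with the variable group, for example
+# (taken from .pipelines/diabetes_regression-ci.yml in this change):
+#
+#   variables:
+#   - template: diabetes_regression-variables-template.yml
+#   - group: devopsforai-aml-vg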
+variables:
+  # Source Config
+  # The directory containing the scripts for training, evaluating, and registering the model
+  - name: SOURCES_DIR_TRAIN
+    value: diabetes_regression
+  # The path to the model training script under SOURCES_DIR_TRAIN
+  - name: TRAIN_SCRIPT_PATH
+    value: training/train_aml.py
+  # The path to the model evaluation script under SOURCES_DIR_TRAIN
+  - name: EVALUATE_SCRIPT_PATH
+    value: evaluate/evaluate_model.py
+  # The path to the model registration script under SOURCES_DIR_TRAIN
+  - name: REGISTER_SCRIPT_PATH
+    value: register/register_model.py
+  # The path to the model scoring script relative to SOURCES_DIR_TRAIN
+  - name: SCORE_SCRIPT
+    value: scoring/score.py
+
+
+  # Azure ML Variables
+  - name: EXPERIMENT_NAME
+    value: mlopspython
+  - name: DATASET_NAME
+    value: diabetes_ds
+  # Uncomment DATASTORE_NAME if you have configured a non-default datastore to point to your data
+  # - name: DATASTORE_NAME
+  #   value: datablobstore
+  - name: DATASET_VERSION
+    value: latest
+  - name: TRAINING_PIPELINE_NAME
+    value: "diabetes-Training-Pipeline"
+  - name: MODEL_NAME
+    value: diabetes_regression_model.pkl
+
+  # AML Compute Cluster Config
+  - name: AML_ENV_NAME
+    value: diabetes_regression_training_env
+  - name: AML_ENV_TRAIN_CONDA_DEP_FILE
+    value: "conda_dependencies.yml"
+  - name: AML_COMPUTE_CLUSTER_CPU_SKU
+    value: STANDARD_DS2_V2
+  - name: AML_COMPUTE_CLUSTER_NAME
+    value: train-cluster
+  - name: AML_CLUSTER_MIN_NODES
+    value: 0
+  - name: AML_CLUSTER_MAX_NODES
+    value: 4
+  - name: AML_CLUSTER_PRIORITY
+    value: lowpriority
+
+  # The name for the (docker/webapp) scoring image
+  - name: IMAGE_NAME
+    value: "diabetestrained"
+
+  # Optional. Used by a training pipeline with R on Databricks
+  - name: DB_CLUSTER_ID
+    value: ""
+
+  # These are the default values set in ml_service\util\env_variables.py. Uncomment and override if desired.
+  # Set to false to disable the evaluation step in the ML pipeline and register the newly trained model unconditionally.
+  # - name: RUN_EVALUATION
+  #   value: "true"
+  # Set to false to register the model regardless of the outcome of the evaluation step in the ML pipeline.
+  # - name: ALLOW_RUN_CANCEL
+  #   value: "true"
+
+  # Flag to allow rebuilding the AML Environment after it was built for the first time. This enables dependency updates from conda_dependencies.yml.
+  # - name: AML_REBUILD_ENVIRONMENT
+  #   value: "false"
+
+  # Variables below are used for controlling various aspects of batch scoring
+  - name: USE_GPU_FOR_SCORING
+    value: False
+  # Conda dependencies for the batch scoring step
+  - name: AML_ENV_SCORE_CONDA_DEP_FILE
+    value: "conda_dependencies_scoring.yml"
+  # Conda dependencies for the score copying step
+  - name: AML_ENV_SCORECOPY_CONDA_DEP_FILE
+    value: "conda_dependencies_scorecopy.yml"
+  # AML Compute Cluster Config for parallel batch scoring
+  - name: AML_ENV_NAME_SCORING
+    value: diabetes_regression_scoring_env
+  - name: AML_ENV_NAME_SCORE_COPY
+    value: diabetes_regression_score_copy_env
+  - name: AML_COMPUTE_CLUSTER_CPU_SKU_SCORING
+    value: STANDARD_DS2_V2
+  - name: AML_COMPUTE_CLUSTER_NAME_SCORING
+    value: score-cluster
+  - name: AML_CLUSTER_MIN_NODES_SCORING
+    value: 0
+  - name: AML_CLUSTER_MAX_NODES_SCORING
+    value: 4
+  - name: AML_CLUSTER_PRIORITY_SCORING
+    value: lowpriority
+  # The path to the batch scoring script relative to SOURCES_DIR_TRAIN
+  - name: BATCHSCORE_SCRIPT_PATH
+    value: scoring/parallel_batchscore.py
+  - name: BATCHSCORE_COPY_SCRIPT_PATH
+    value: scoring/parallel_batchscore_copyoutput.py
+  # Flag to allow rebuilding the AML Environment after it was built for the first time.
+  # This enables dependency updates from the conda dependencies yaml for scoring activities.
+  - name: AML_REBUILD_ENVIRONMENT_SCORING
+    value: "true"
+
+  # Datastore config for scoring
+  # The storage account name and key are supplied as variables in a variable group
+  # in the Azure Pipelines library for this project. Please refer to repo docs for
+  # more details
+
+  # Blob container where the input data for scoring can be found
+  - name: SCORING_DATASTORE_INPUT_CONTAINER
+    value: "input"
+  # Blobname for the input data - include any applicable path in the string
+  - name: SCORING_DATASTORE_INPUT_FILENAME
+    value: "diabetes_scoring_input.csv"
+  # Blob container where the output data for scoring can be found
+  - name: SCORING_DATASTORE_OUTPUT_CONTAINER
+    value: "output"
+  # Blobname for the output data - include any applicable path in the string
+  - name: SCORING_DATASTORE_OUTPUT_FILENAME
+    value: "diabetes_scoring_output.csv"
+  # Dataset name for input data for scoring
+  - name: SCORING_DATASET_NAME
+    value: "diabetes_scoring_ds"
+  # Scoring pipeline name
+  - name: SCORING_PIPELINE_NAME
+    value: "diabetes-scoring-pipeline"
+
\ No newline at end of file
diff --git a/.pipelines/helm-install-template.yml b/.pipelines/helm-install-template.yml
new file mode 100644
index 00000000..a4dbd581
--- /dev/null
+++ b/.pipelines/helm-install-template.yml
@@ -0,0 +1,10 @@
+# Pipeline template for installing helm on the agent.
+steps:
+- task: Bash@3
+  displayName: 'Install Helm $(helmVersion)'
+  inputs:
+    targetType: inline
+    script: wget -q $(helmDownloadURL) -O /tmp/$FILENAME && tar -zxvf /tmp/$FILENAME -C /tmp && sudo mv /tmp/linux-amd64/helm /usr/local/bin/helm
+  env:
+    HELM_VERSION: $(helmVersion)
+    FILENAME: helm-$(helmVersion)-linux-amd64.tar.gz
diff --git a/.pipelines/helm-upgrade-template.yml b/.pipelines/helm-upgrade-template.yml
new file mode 100644
index 00000000..4f75c8ed
--- /dev/null
+++ b/.pipelines/helm-upgrade-template.yml
@@ -0,0 +1,20 @@
+# Pipeline template for deploying / upgrading using Helm.
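+#
+# Example invocation, taken from .pipelines/abtest.yml in this change:
+#
+#   - template: helm-upgrade-template.yml
+#     parameters:
+#       chartPath: '$(System.DefaultWorkingDirectory)/charts/abtest-istio'
+#       releaseName: 'abtest-istio'
+#       overrideValues: 'weight.blue=50,weight.green=50'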
+parameters:
+  chartPath: ''
+  releaseName: ''
+  overrideValues: ''
+
+steps:
+- template: helm-install-template.yml
+- task: HelmDeploy@0
+  displayName: 'helm upgrade'
+  inputs:
+    connectionType: 'Kubernetes Service Connection'
+    kubernetesServiceConnection: $(K8S_AB_SERVICE_CONNECTION)
+    command: upgrade
+    chartType: FilePath
+    chartPath: ${{ parameters.chartPath }}
+    releaseName: ${{ parameters.releaseName }}
+    overrideValues: ${{ parameters.overrideValues }}
+    install: true
+    arguments: --namespace $(K8S_AB_NAMESPACE)
diff --git a/.pipelines/pr.yml b/.pipelines/pr.yml
new file mode 100644
index 00000000..765a5fef
--- /dev/null
+++ b/.pipelines/pr.yml
@@ -0,0 +1,24 @@
+# Pipeline to run basic code quality tests as part of pull requests to the master branch.
+
+resources:
+  containers:
+  - container: mlops
+    image: mcr.microsoft.com/mlops/python:latest
+
+trigger: none
+pr:
+  branches:
+    include:
+    - master
+
+pool:
+  vmImage: 'ubuntu-latest'
+
+container: mlops
+
+variables:
+- template: diabetes_regression-variables-template.yml
+- group: devopsforai-aml-vg
+
+steps:
+- template: code-quality-template.yml
diff --git a/README.md b/README.md
index bb202690..434be0df 100644
--- a/README.md
+++ b/README.md
@@ -11,74 +11,49 @@ description: "Code which demonstrates how to set up and operationalize an MLOps
 
 # MLOps with Azure ML
 
+CI: [![Build Status](https://aidemos.visualstudio.com/MLOps/_apis/build/status/Model-Train-Register-CI?branchName=master)](https://aidemos.visualstudio.com/MLOps/_build/latest?definitionId=160&branchName=master)
 
-[![Build Status](https://dev.azure.com/customai/DevopsForAI-AML/_apis/build/status/Microsoft.MLOpsPython?branchName=master)](https://dev.azure.com/customai/DevopsForAI-AML/_build/latest?definitionId=25&branchName=master)
+CD: [![Build Status](https://aidemos.visualstudio.com/MLOps/_apis/build/status/microsoft.MLOpsPython-CD?branchName=master)](https://aidemos.visualstudio.com/MLOps/_build/latest?definitionId=161&branchName=master)
 
-
-MLOps will help you to understand how to build the Continuous Integration and Continuous Delivery pipeline for a ML/AI project. We will be using the Azure DevOps Project for build and release/deployment pipelines along with Azure ML services for model retraining pipeline, model management and operationalization.
+MLOps will help you to understand how to build a Continuous Integration and Continuous Delivery pipeline for an ML/AI project. We will be using the Azure DevOps Project for build and release/deployment pipelines along with Azure ML services for model retraining pipeline, model management and operationalization.
 
 ![ML lifecycle](/docs/images/ml-lifecycle.png)
 
-This template contains code and pipeline definition for a machine learning project demonstrating how to automate an end to end ML/AI workflow. The build pipelines include DevOps tasks for data sanity test, unit test, model training on different compute targets, model version management, model evaluation/model selection, model deployment as realtime web service, staged deployment to QA/prod and integration testing.
-
-
-## Prerequisite
-- Active Azure subscription
-- At least contributor access to Azure subscription
-
-## Getting Started:
-
-To deploy this solution in your subscription, follow the manual instructions in the [getting started](docs/getting_started.md) doc
-
+This template contains code and pipeline definitions for a machine learning project that demonstrates how to automate an end to end ML/AI workflow.
 
-## Architecture Diagram
+## Architecture and Features
 
-This reference architecture shows how to implement continuous integration (CI), continuous delivery (CD), and retraining pipeline for an AI application using Azure DevOps and Azure Machine Learning. The solution is built on the scikit-learn diabetes dataset but can be easily adapted for any AI scenario and other popular build systems such as Jenkins and Travis.
+Architecture Reference: [Machine learning operationalization (MLOps) for Python models using Azure Machine Learning](https://docs.microsoft.com/en-us/azure/architecture/reference-architectures/ai/mlops-python)
 
-![Architecture](/docs/images/main-flow.png)
+This reference architecture shows how to implement continuous integration (CI), continuous delivery (CD), and a retraining pipeline for an AI application using Azure DevOps and [Azure Machine Learning](/azure/machine-learning/service/overview-what-is-azure-ml). The solution is built on the scikit-learn diabetes dataset but can be easily adapted for any AI scenario and other popular build systems such as Jenkins and Travis.
 
+The build pipelines include DevOps tasks for data sanity tests, unit tests, model training on different compute targets, model version management, model evaluation/model selection, model deployment as realtime web service, staged deployment to QA/prod and integration testing.
 
-## Architecture Flow
-
-### Train Model
-1. Data Scientist writes/updates the code and push it to git repo. This triggers the Azure DevOps build pipeline (continuous integration).
-2. Once the Azure DevOps build pipeline is triggered, it performs code quality checks, data sanity tests, unit tests, builds an [Azure ML Pipeline](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-ml-pipelines) and publishes it in an [Azure ML Service Workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace).
-3. The [Azure ML Pipeline](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-ml-pipelines) is triggered once the Azure DevOps build pipeline completes. All the tasks in this pipeline runs on Azure ML Compute. Following are the tasks in this pipeline:
-
-    - **Train Model** task executes model training script on Azure ML Compute. It outputs a [model](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-azure-machine-learning-architecture#model) file which is stored in the [run history](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-azure-machine-learning-architecture#run).
-
-    - **Evaluate Model** task evaluates the performance of the newly trained model with the model in production. If the new model performs better than the production model, the following steps are executed. If not, they will be skipped.
-
-    - **Register Model** task takes the improved model and registers it with the [Azure ML Model registry](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-azure-machine-learning-architecture#model-registry). This allows us to version control it.
+## Prerequisites
 
-### Deploy Model
+- Active Azure subscription
+- At least contributor access to Azure subscription
 
-Once you have registered your ML model, you can use Azure ML + Azure DevOps to deploy it.
+## Getting Started
 
-[Azure DevOps release pipeline](https://docs.microsoft.com/en-us/azure/devops/pipelines/release/?view=azure-devops) packages the new model along with the scoring file and its python dependencies into a [docker image](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-azure-machine-learning-architecture#image) and pushes it to [Azure Container Registry](https://docs.microsoft.com/en-us/azure/container-registry/container-registry-intro). This image is used to deploy the model as [web service](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-azure-machine-learning-architecture#web-service) across QA and Prod environments. The QA environment is running on top of [Azure Container Instances (ACI)](https://azure.microsoft.com/en-us/services/container-instances/) and the Prod environment is built with [Azure Kubernetes Service (AKS)](https://docs.microsoft.com/en-us/azure/aks/intro-kubernetes).
-
+To deploy this solution in your subscription, follow the manual instructions in the [getting started](docs/getting_started.md) doc. Then optionally follow the guide for [integrating your own code](docs/custom_model.md) with this repository template.
 
 ### Repo Details
 
 You can find the details of the code and scripts in the repository [here](/docs/code_description.md)
 
 ### References
+
-- [Azure Machine Learning(Azure ML) Service Workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/overview-what-is-azure-ml)
+- [Azure Machine Learning (Azure ML) Service Workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/overview-what-is-azure-ml)
 - [Azure ML CLI](https://docs.microsoft.com/en-us/azure/machine-learning/service/reference-azure-machine-learning-cli)
 - [Azure ML Samples](https://docs.microsoft.com/en-us/azure/machine-learning/service/samples-notebooks)
 - [Azure ML Python SDK Quickstart](https://docs.microsoft.com/en-us/azure/machine-learning/service/quickstart-create-workspace-with-python)
 - [Azure DevOps](https://docs.microsoft.com/en-us/azure/devops/?view=vsts)
 
-# Contributing
+## Contributing
 
-This project welcomes contributions and suggestions. Most contributions require you to agree to a
-Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
-the rights to use your contribution. For details, visit https://cla.microsoft.com.
+This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.microsoft.com.
 
-When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
-a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
-provided by the bot. You will only need to do this once across all repos using our CLA.
+When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
 
-This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
-For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
-contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
+This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
diff --git a/bootstrap/README.md b/bootstrap/README.md
new file mode 100644
index 00000000..0841cc30
--- /dev/null
+++ b/bootstrap/README.md
@@ -0,0 +1,3 @@
+# Bootstrap from MLOpsPython repository
+
+For steps on how to use the bootstrap script, please see the "Bootstrap the project" section of the [custom model guide](../docs/custom_model.md#bootstrap-the-project).
diff --git a/bootstrap/bootstrap.py b/bootstrap/bootstrap.py
new file mode 100644
index 00000000..02f51bbc
--- /dev/null
+++ b/bootstrap/bootstrap.py
@@ -0,0 +1,155 @@
+import os
+import sys
+import platform
+import argparse
+import re
+
+
+class Helper:
+
+    def __init__(self, project_directory, project_name):
+        self._project_directory = project_directory
+        self._project_name = project_name
+        self._git_repo = "https://github.com/microsoft/MLOpsPython.git"
+
+    @property
+    def project_directory(self):
+        return self._project_directory
+
+    @property
+    def project_name(self):
+        return self._project_name
+
+    @property
+    def git_repo(self):
+        return self._git_repo
+
+    def rename_files(self):
+        # Rename all files starting with diabetes_regression with project name
+        strtoreplace = "diabetes_regression"
+        dirs = [".pipelines", r"ml_service/pipelines"]
+        for dir in dirs:
+            normDir = os.path.normpath(dir)
+            dirpath = os.path.join(self._project_directory, normDir)
+            for filename in os.listdir(dirpath):
+                if(filename.find(strtoreplace) != -1):
+                    src = os.path.join(self._project_directory, normDir, filename)  # NOQA: E501
+                    dst = os.path.join(self._project_directory,
+                                       normDir,
+                                       filename.replace(strtoreplace, self._project_name, 1))  # NOQA: E501
+                    os.rename(src, dst)
+
+    def rename_dir(self):
+        dir = "diabetes_regression"
+        src = os.path.join(self._project_directory, dir)
+        for path, subdirs, files in os.walk(src):
+            for name in files:
+                newPath = path.replace(dir, self._project_name)
+                if (not (os.path.exists(newPath))):
+                    os.mkdir(newPath)
+                file_path = os.path.join(path, name)
+                new_name = os.path.join(newPath, name)
+                os.rename(file_path, new_name)
+
+    def delete_dir(self):
+        # Delete unwanted directories
+        dirs = ["docs", r"diabetes_regression"]
+        if (platform.system() == "Windows"):
+            cmd = 'rmdir /S /Q "{}"'
+        else:
+            cmd = 'rm -r "{}"'
+        for dir in dirs:
+            os.system(cmd.format(os.path.join(self._project_directory, os.path.normpath(dir))))  # NOQA: E501
+
+    def clean_dir(self):
+        # Clean up directories
+        dirs = ["data", "experimentation"]
+        for dir in dirs:
+            for root, dirs, files in os.walk(os.path.join(self._project_directory, dir)):  # NOQA: E501
+                for file in files:
+                    os.remove(os.path.join(root, file))
+
+    def validate_args(self):
+        # Validate arguments
+        if (os.path.isdir(self._project_directory) is False):
+            raise Exception("Not a valid directory. Please provide an absolute directory path.")  # NOQA: E501
+        if (len(self._project_name) < 3 or len(self._project_name) > 15):
+            raise Exception("Invalid project name length. Project name should be 3 to 15 chars long, letters and underscores only.")  # NOQA: E501
+        if (not re.search("^[\\w_]+$", self._project_name)):
+            raise Exception("Invalid characters in project name. Project name should be 3 to 15 chars long, letters and underscores only.")  # NOQA: E501
+
+
+def replace_project_name(project_dir, project_name, rename_name):
+    # Replace instances of rename_name within files with project_name
+    files = [r".env.example",
+             r".pipelines/code-quality-template.yml",
+             r".pipelines/pr.yml",
+             r".pipelines/diabetes_regression-cd.yml",
+             r".pipelines/diabetes_regression-ci.yml",
+             r".pipelines/abtest.yml",
+             r".pipelines/diabetes_regression-ci-image.yml",
+             r".pipelines/diabetes_regression-publish-model-artifact-template.yml",  # NOQA: E501
+             r".pipelines/diabetes_regression-get-model-id-artifact-template.yml",  # NOQA: E501
+             r".pipelines/diabetes_regression-batchscoring-ci.yml",
+             r".pipelines/diabetes_regression-variables-template.yml",
+             r"environment_setup/Dockerfile",
+             r"environment_setup/install_requirements.sh",
+             r"ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py",  # NOQA: E501
+             r"ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r_on_dbricks.py",  # NOQA: E501
+             r"ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py",  # NOQA: E501
+             r"ml_service/pipelines/diabetes_regression_build_train_pipeline.py",  # NOQA: E501
+             r"ml_service/util/create_scoring_image.py",
+             r"diabetes_regression/conda_dependencies.yml",
+             r"diabetes_regression/evaluate/evaluate_model.py",
+             r"diabetes_regression/register/register_model.py",
+             r"diabetes_regression/training/test_train.py"]
+
+    for file in files:
+        path = os.path.join(project_dir, os.path.normpath(file))
+        try:
+            with open(path, "rt", encoding="utf8") as f_in:
+                data = f_in.read()
+                data = data.replace(rename_name, project_name)
+            with open(os.path.join(project_dir, file), "wt", encoding="utf8") as f_out:  # NOQA: E501
+                f_out.write(data)
+        except IOError as e:
+            print("Could not modify \"%s\". Is the MLOpsPython repo already cloned at \"%s\"?"
+                  % (path, project_dir))  # NOQA: E501
+            raise e
+
+
+def main(args):
+    parser = argparse.ArgumentParser(description='New Template')
+    parser.add_argument("-d",
+                        "--directory",
+                        type=str,
+                        required=True,
+                        help="Absolute path to new project directory")
+    parser.add_argument("-n",
+                        "--name",
+                        type=str,
+                        required=True,
+                        help="Name of the project [3-15 chars, letters and underscores only]")  # NOQA: E501
+    try:
+        args = parser.parse_args()
+
+        project_directory = args.directory
+        project_name = args.name
+
+        helper = Helper(project_directory, project_name)
+        helper.validate_args()
+        helper.clean_dir()
+
+        replace_project_name(project_directory, project_name, "diabetes_regression")  # NOQA: E501
+        replace_project_name(project_directory, project_name, "diabetes")
+
+        helper.rename_files()
+        helper.rename_dir()
+        helper.delete_dir()
+    except Exception as e:
+        print(e)
+
+    return 0
+
+
+if '__main__' == __name__:
+    sys.exit(main(sys.argv))
diff --git a/charts/abtest-istio/Chart.yaml b/charts/abtest-istio/Chart.yaml
new file mode 100644
index 00000000..bfcf8584
--- /dev/null
+++ b/charts/abtest-istio/Chart.yaml
@@ -0,0 +1,5 @@
+apiVersion: v1
+appVersion: "1.0"
+description: A Helm chart for Kubernetes
+name: abtest-istio
+version: 0.1.0
diff --git a/charts/abtest-istio/templates/istio-canary.yaml b/charts/abtest-istio/templates/istio-canary.yaml
new file mode 100644
index 00000000..a030fd0d
--- /dev/null
+++ b/charts/abtest-istio/templates/istio-canary.yaml
@@ -0,0 +1,60 @@
+apiVersion: networking.istio.io/v1alpha3
+kind: Gateway
+metadata:
+  name: mlmodel-gateway
+  namespace: abtesting
+spec:
+  selector:
+    istio: ingressgateway
+  servers:
+  - port:
+      number: {{ .Values.ingress.port }}
+      name: http
+      protocol: HTTP
+    hosts:
+    - "*"
+---
+apiVersion: networking.istio.io/v1alpha3
+kind: VirtualService
+metadata:
+  name: mlmodel-virtualservice
+  namespace: abtesting
+spec:
+  gateways:
+  - mlmodel-gateway
+  hosts:
+  - '*'
+  http:
+  - match:
+    - uri:
+        prefix: /score
+      headers:
+        x-api-version:
+          exact: 'blue'
+    route:
+    - destination:
+        host: {{ .Values.svc.name }}-blue.abtesting.svc.cluster.local
+        port:
+          number: {{ .Values.svc.port }}
+  - match:
+    - uri:
+        prefix: /score
+      headers:
+        x-api-version:
+          exact: 'green'
+    route:
+    - destination:
+        host: {{ .Values.svc.name }}-green.abtesting.svc.cluster.local
+        port:
+          number: {{ .Values.svc.port }}
+  - route:
+    - destination:
+        host: {{ .Values.svc.name }}-green.abtesting.svc.cluster.local
+        port:
+          number: {{ .Values.svc.port }}
+      weight: {{ .Values.weight.green }}
+    - destination:
+        host: {{ .Values.svc.name }}-blue.abtesting.svc.cluster.local
+        port:
+          number: {{ .Values.svc.port }}
+      weight: {{ .Values.weight.blue }}
\ No newline at end of file
diff --git a/charts/abtest-istio/values.yaml b/charts/abtest-istio/values.yaml
new file mode 100644
index 00000000..014845bc
--- /dev/null
+++ b/charts/abtest-istio/values.yaml
@@ -0,0 +1,15 @@
+ingress:
+  port: 80
+
+svc:
+  port: 5001
+  name: model-svc
+
+
+weight:
+  green: 50
+  blue: 50
+
+uri:
+  prefix: /score
+
diff --git a/charts/abtest-model/Chart.yaml b/charts/abtest-model/Chart.yaml
new file mode 100644
index 00000000..eeaa24bf
--- /dev/null
+++ b/charts/abtest-model/Chart.yaml
@@ -0,0 +1,5 @@
+apiVersion: v1
+appVersion: "1.0"
+description: A Helm chart for Kubernetes
+name: abtest-model
+version: 0.1.0
diff --git a/charts/abtest-model/templates/deployment.yaml b/charts/abtest-model/templates/deployment.yaml
new file mode 100644
index 00000000..78d01cc4
--- /dev/null
+++ b/charts/abtest-model/templates/deployment.yaml
@@ -0,0 +1,30 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ .Values.deployment.name }}
+  namespace: {{ .Values.namespace }}
+  labels:
+    app: {{ .Values.appname }}
+    model_version: {{ .Values.deployment.bluegreen }}
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: {{ .Values.appname }}
+      model_version: {{ .Values.deployment.bluegreen }}
+  template:
+    metadata:
+      labels:
+        app: {{ .Values.appname }}
+        model_version: {{ .Values.deployment.bluegreen }}
+    spec:
+      containers:
+        - name: {{ .Values.deployment.container.name }}
+          image: "{{ .Values.deployment.image.name }}"
+          imagePullPolicy: Always
+          ports:
+            - name: http
+              containerPort: 5001
+            - name: probe
+              containerPort: 8086
+
diff --git a/charts/abtest-model/templates/service.yaml b/charts/abtest-model/templates/service.yaml
new file mode 100644
index 00000000..a4a6ed8b
--- /dev/null
+++ b/charts/abtest-model/templates/service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: "{{ .Values.svc.name }}-{{ .Values.deployment.bluegreen }}"
+  namespace: {{ .Values.namespace }}
+spec:
+  selector:
+    app: {{ .Values.appname }}
+    model_version: {{ .Values.deployment.bluegreen }}
+  ports:
+  - port: {{ .Values.svc.port }}
+    targetPort: {{ .Values.deployment.container.port }}
+
\ No newline at end of file
diff --git a/charts/abtest-model/values.yaml b/charts/abtest-model/values.yaml
new file mode 100644
index 00000000..c3ab1b60
--- /dev/null
+++ b/charts/abtest-model/values.yaml
@@ -0,0 +1,13 @@
+namespace: abtesting
+appname: model
+
+deployment:
+  name: model-green
+  bluegreen: green
+  container:
+    name: model
+    port: 5001
+
+svc:
+  name: model-svc
+  port: 5001
\ No newline at end of file
diff --git a/charts/load_test.sh b/charts/load_test.sh
new file mode 100755
index 00000000..25a06452
--- /dev/null
+++ b/charts/load_test.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+for ((i=1;i<=$1;i++))
+do
+    curl --header "x-api-version: $3" $2
+    echo
+    sleep .2
+done
\ No newline at end of file
diff --git a/code/evaluate/evaluate_model.py b/code/evaluate/evaluate_model.py
deleted file mode 100644
index ec5dc5e0..00000000
--- a/code/evaluate/evaluate_model.py
+++ /dev/null
@@ -1,113 +0,0 @@
-"""
-Copyright (C) Microsoft Corporation. All rights reserved.
-
-Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual,
-royalty-free right to use, copy, and modify the software code provided by us
-("Software Code"). You may not sublicense the Software Code or any use of it
-(except to your affiliates and to vendors to perform work on your behalf)
-through distribution, network access, service agreement, lease, rental, or
-otherwise. This license does not purport to express any claim of ownership over
-data you may have shared with Microsoft in the creation of the Software Code.
-Unless applicable law gives you more rights, Microsoft reserves all other
-rights not expressly granted herein, whether by implication, estoppel or
-otherwise.
-
-THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS
-OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-"""
-import os
-from azureml.core import Model, Run
-import argparse
-
-
-# Get workspace
-run = Run.get_context()
-exp = run.experiment
-ws = run.experiment.workspace
-
-
-parser = argparse.ArgumentParser("evaluate")
-parser.add_argument(
-    "--release_id",
-    type=str,
-    help="The ID of the release triggering this pipeline run",
-)
-parser.add_argument(
-    "--model_name",
-    type=str,
-    help="Name of the Model",
-    default="sklearn_regression_model.pkl",
-)
-args = parser.parse_args()
-
-print("Argument 1: %s" % args.release_id)
-print("Argument 2: %s" % args.model_name)
-model_name = args.model_name
-release_id = args.release_id
-
-# Paramaterize the matrics on which the models should be compared
-# Add golden data set on which all the model performance can be evaluated
-
-all_runs = exp.get_runs(
-    properties={"release_id": release_id, "run_type": "train"},
-    include_children=True
-    )
-new_model_run = next(all_runs)
-new_model_run_id = new_model_run.id
-print(f'New Run found with Run ID of: {new_model_run_id}')
-
-try:
-    # Get most recently registered model, we assume that
-    # is the model in production.
-    # Download this model and compare it with the recently
-    # trained model by running test with same data set.
-    model_list = Model.list(ws)
-    production_model = next(
-        filter(
-            lambda x: x.created_time == max(
-                model.created_time for model in model_list),
-            model_list,
-        )
-    )
-    production_model_run_id = production_model.tags.get("run_id")
-    run_list = exp.get_runs()
-
-    # Get the run history for both production model and
-    # newly trained model and compare mse
-    production_model_run = Run(exp, run_id=production_model_run_id)
-    new_model_run = Run(exp, run_id=new_model_run_id)
-
-    production_model_mse = production_model_run.get_metrics().get("mse")
-    new_model_mse = new_model_run.get_metrics().get("mse")
-    print(
-        "Current Production model mse: {}, New trained model mse: {}".format(
-            production_model_mse, new_model_mse
-        )
-    )
-
-    promote_new_model = False
-    if new_model_mse < production_model_mse:
-        promote_new_model = True
-        print("New trained model performs better, thus it will be registered")
-except Exception:
-    promote_new_model = True
-    print("This is the first model to be trained, \
-          thus nothing to evaluate for now")
-
-
-# Writing the run id to /aml_config/run_id.json
-if promote_new_model:
-    model_path = os.path.join('outputs', model_name)
-    new_model_run.register_model(
-        model_name=model_name,
-        model_path=model_path,
-        properties={"release_id": release_id})
-    print("Registered new model!")
diff --git a/code/register/register_model.py b/code/register/register_model.py
deleted file mode 100644
index ae2b8216..00000000
--- a/code/register/register_model.py
+++ /dev/null
@@ -1,111 +0,0 @@
-"""
-Copyright (C) Microsoft Corporation. All rights reserved.
-
-Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual,
-royalty-free right to use, copy, and modify the software code provided by us
-("Software Code").
You may not sublicense the Software Code or any use of it -(except to your affiliates and to vendors to perform work on your behalf) -through distribution, network access, service agreement, lease, rental, or -otherwise. This license does not purport to express any claim of ownership over -data you may have shared with Microsoft in the creation of the Software Code. -Unless applicable law gives you more rights, Microsoft reserves all other -rights not expressly granted herein, whether by implication, estoppel or -otherwise. ​ - ​ -THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS -OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. -""" -import os -import json -import sys -from azureml.core import Run -import argparse - -from azureml.core.authentication import AzureCliAuthentication - -cli_auth = AzureCliAuthentication() - -# Get workspace -# ws = Workspace.from_config(auth=cli_auth, path='./') - - -run = Run.get_context() -exp = run.experiment -ws = run.experiment.workspace - -parser = argparse.ArgumentParser("register") -parser.add_argument( - "--config_suffix", type=str, help="Datetime suffix for json config files" -) -parser.add_argument( - "--json_config", - type=str, - help="Directory to write all the intermediate json configs", -) -parser.add_argument( - "--model_name", - type=str, - help="Name of the Model", - default="sklearn_regression_model.pkl", -) - -args = parser.parse_args() - -print("Argument 1: %s" % args.config_suffix) -print("Argument 2: %s" % args.json_config) - -if not (args.json_config is None): - os.makedirs(args.json_config, exist_ok=True) - print("%s created" % args.json_config) - -evaluate_run_id_json = "run_id_{}.json".format(args.config_suffix) -evaluate_output_path = os.path.join(args.json_config, evaluate_run_id_json) -model_name = args.model_name - -# Get the latest evaluation result -try: - with open(evaluate_output_path) as f: - config = json.load(f) - if not config["run_id"]: - raise Exception( - "No new model to register as production model perform better") -except Exception: - print("No new model to register as production model perform better") - sys.exit(0) - -run_id = config["run_id"] -experiment_name = config["experiment_name"] -# exp = Experiment(workspace=ws, name=experiment_name) - -run = Run(experiment=exp, run_id=run_id) -names = run.get_file_names -names() -print("Run ID for last run: {}".format(run_id)) - -model = run.register_model(model_name=model_name, - model_path="./outputs/" + model_name, - tags={"area": "diabetes", "type": "regression"}) -os.chdir("..") -print( - "Model registered: {} \nModel Description: {} \nModel Version: {}".format( - model.name, model.description, model.version - ) -) - -# Writing the registered model details to /aml_config/model.json -model_json = {} -model_json["model_name"] = model.name -model_json["model_version"] = model.version -model_json["run_id"] = run_id -filename = 
"model_{}.json".format(args.config_suffix) -output_path = os.path.join(args.json_config, filename) -with open(output_path, "w") as outfile: - json.dump(model_json, outfile) diff --git a/code/training/train.py b/code/training/train.py deleted file mode 100644 index d703964f..00000000 --- a/code/training/train.py +++ /dev/null @@ -1,102 +0,0 @@ -""" -Copyright (C) Microsoft Corporation. All rights reserved.​ - ​ -Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, -royalty-free right to use, copy, and modify the software code provided by us -("Software Code"). You may not sublicense the Software Code or any use of it -(except to your affiliates and to vendors to perform work on your behalf) -through distribution, network access, service agreement, lease, rental, or -otherwise. This license does not purport to express any claim of ownership over -data you may have shared with Microsoft in the creation of the Software Code. -Unless applicable law gives you more rights, Microsoft reserves all other -rights not expressly granted herein, whether by implication, estoppel or -otherwise. ​ - ​ -THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS -OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER -IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. -""" -from azureml.core.run import Run -import os -import argparse -from sklearn.datasets import load_diabetes -from sklearn.linear_model import Ridge -from sklearn.metrics import mean_squared_error -from sklearn.model_selection import train_test_split -from sklearn.externals import joblib -import numpy as np - - -parser = argparse.ArgumentParser("train") -parser.add_argument( - "--release_id", - type=str, - help="The ID of the release triggering this pipeline run", -) -parser.add_argument( - "--model_name", - type=str, - help="Name of the Model", - default="sklearn_regression_model.pkl", -) - -args = parser.parse_args() - -print("Argument 1: %s" % args.release_id) -print("Argument 2: %s" % args.model_name) - -model_name = args.model_name -release_id = args.release_id - -run = Run.get_context() -exp = run.experiment -ws = run.experiment.workspace - -X, y = load_diabetes(return_X_y=True) -columns = ["age", "gender", "bmi", "bp", "s1", "s2", "s3", "s4", "s5", "s6"] -X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=0) -data = {"train": {"X": X_train, "y": y_train}, - "test": {"X": X_test, "y": y_test}} - -print("Running train.py") - -# Randomly pic alpha -alphas = np.arange(0.0, 1.0, 0.05) -alpha = alphas[np.random.choice(alphas.shape[0], 1, replace=False)][0] -print(alpha) -run.log("alpha", alpha) -reg = Ridge(alpha=alpha) -reg.fit(data["train"]["X"], data["train"]["y"]) -preds = reg.predict(data["test"]["X"]) -run.log("mse", mean_squared_error(preds, data["test"]["y"])) - - -# Save model as part of the run history - -# model_name = "." 
- -with open(model_name, "wb") as file: - joblib.dump(value=reg, filename=model_name) - -# upload the model file explicitly into artifacts -run.upload_file(name="./outputs/" + model_name, path_or_stream=model_name) -print("Uploaded the model {} to experiment {}".format( - model_name, run.experiment.name)) -dirpath = os.getcwd() -print(dirpath) -print("Following files are uploaded ") -print(run.get_file_names()) - -# Add properties to identify this specific training run -run.add_properties({"release_id": release_id, "run_type": "train"}) -print(f"added properties: {run.properties}") - -run.complete() diff --git a/data/README.md b/data/README.md new file mode 100644 index 00000000..d43d139c --- /dev/null +++ b/data/README.md @@ -0,0 +1,3 @@ +This folder is used for example data, and it is not meant to be used for storing training data. + +Follow steps to [Configure Training Data](../docs/custom_model.md#Configure-Custom-Training) to use your own data for training. \ No newline at end of file diff --git a/tests/unit/data_test.py b/data/data_test.py similarity index 93% rename from tests/unit/data_test.py rename to data/data_test.py index 8b40b8bc..6d7d2ddf 100644 --- a/tests/unit/data_test.py +++ b/data/data_test.py @@ -35,7 +35,7 @@ def get_absPath(filename): path = os.path.abspath( os.path.join( os.path.dirname( - __file__), os.path.pardir, os.path.pardir, "data", filename + __file__), os.path.pardir, "data", filename ) ) return path @@ -120,8 +120,8 @@ def test_check_distribution(): mean = np.mean(dataset.values, axis=0) std = np.mean(dataset.values, axis=0) assert ( - np.sum(abs(mean - historical_mean) > - shift_tolerance * abs(historical_mean)) - or np.sum(abs(std - historical_std) > - shift_tolerance * abs(historical_std)) > 0 + np.sum(abs(mean - historical_mean) + > shift_tolerance * abs(historical_mean)) + or np.sum(abs(std - historical_std) + > shift_tolerance * abs(historical_std)) > 0 ) diff --git a/diabetes_regression/.amlignore b/diabetes_regression/.amlignore new file mode 100644 index 00000000..e8705e07 --- /dev/null +++ b/diabetes_regression/.amlignore @@ -0,0 +1,10 @@ +# To prevent unnecessary files from being included in +# the snapshot, make an ignore file (.gitignore or .amlignore). +# Place this file in the Snapshot directory and add the +# filenames to ignore in it. The .amlignore file uses +# the same syntax and patterns as the .gitignore file. +# If both files exist, the .amlignore file takes precedence. + +# We use yml files to configure deployment, +# but we are not deploying them to compute +*.yml diff --git a/diabetes_regression/ci_dependencies.yml b/diabetes_regression/ci_dependencies.yml new file mode 100644 index 00000000..73086471 --- /dev/null +++ b/diabetes_regression/ci_dependencies.yml @@ -0,0 +1,29 @@ +name: mlopspython_ci + +dependencies: + # The python interpreter version. + - python=3.7.* + + # dependencies with versions aligned with conda_dependencies.yml. + - numpy=1.18.* + - pandas=1.0.* + - scikit-learn=0.22.* + # dependencies for MLOps with R. + - r=3.6.0 + - r-essentials=3.6.0 + + - conda-forge::jq + - pip=20.0.* + + - pip: + # dependencies with versions aligned with conda_dependencies.yml. + - azureml-sdk==1.27.* + + # Additional pip dependencies for the CI environment. 
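+    # Versions use wildcard patch pins (e.g. pytest==5.4.*) so the CI
+    # environment picks up bugfix releases without editing this file.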
+ - pytest==5.4.* + - pytest-cov==2.8.* + - requests==2.23.* + - python-dotenv==0.12.* + - flake8==3.7.* + - flake8_formatter_junit_xml==0.0.* + - azure-cli==2.3.* diff --git a/code/scoring/conda_dependencies.yml b/diabetes_regression/conda_dependencies.yml similarity index 63% rename from code/scoring/conda_dependencies.yml rename to diabetes_regression/conda_dependencies.yml index f13c3c3d..e214c7b2 100644 --- a/code/scoring/conda_dependencies.yml +++ b/diabetes_regression/conda_dependencies.yml @@ -1,54 +1,39 @@ # Conda environment specification. The dependencies defined in this file will - # be automatically provisioned for managed runs. These include runs against - # the localdocker, remotedocker, and cluster compute targets. - # Note that this file is NOT used to automatically manage dependencies for the - # local compute target. To provision these dependencies locally, run: - # conda env update --file conda_dependencies.yml - # Details about the Conda environment file format: - # https://conda.io/docs/using/envs.html#create-environment-file-by-hand - # For managing Spark packages and configuration, see spark_dependencies.yml. - - # Version of this configuration file's structure and semantics in AzureML. - # This directive is stored in a comment to preserve the Conda file structure. - # [AzureMlVersion] = 2 - -name: project_environment +name: diabetes_regression_training_env dependencies: # The python interpreter version. - # Currently Azure ML Workbench only supports 3.5.2 and later. + - python=3.7.* + - pip -- python=3.6.2 - # Required by azureml-defaults, installed separately through Conda to + - pip: + # Base AzureML SDK + - azureml-sdk==1.27.* - # get a prebuilt version and not require build tools for the install. + # Must match AzureML SDK version. + # https://docs.microsoft.com/en-us/azure/machine-learning/concept-environments + - azureml-defaults==1.27.* -- psutil=5.3 + # Training deps + - scikit-learn -- pip: - # Required packages for AzureML execution, history, and data preparation. - - azureml-sdk[notebooks] # add the version to lock it ==0.1.74 - - scipy==1.0.0 - - scikit-learn==0.21.3 - - pandas==0.23.1 - - numpy==1.14.5 - - joblib==0.13.2 - - gunicorn==19.9.0 - - flask==1.1.1 - - azure-ml-api-sdk + # Scoring deps + - inference-schema[numpy-support] + # MLOps with R + - azure-storage-blob diff --git a/diabetes_regression/conda_dependencies_scorecopy.yml b/diabetes_regression/conda_dependencies_scorecopy.yml new file mode 100644 index 00000000..9ed22ccd --- /dev/null +++ b/diabetes_regression/conda_dependencies_scorecopy.yml @@ -0,0 +1,31 @@ +# Conda environment specification. The dependencies defined in this file will +# be automatically provisioned for managed runs. These include runs against +# the localdocker, remotedocker, and cluster compute targets. + +# Note that this file is NOT used to automatically manage dependencies for the +# local compute target. To provision these dependencies locally, run: +# conda env update --file conda_dependencies.yml + +# Details about the Conda environment file format: +# https://conda.io/docs/using/envs.html#create-environment-file-by-hand + +# For managing Spark packages and configuration, see spark_dependencies.yml. +# Version of this configuration file's structure and semantics in AzureML. +# This directive is stored in a comment to preserve the Conda file structure. 
+# [AzureMlVersion] = 2 + +# These dependencies are used to create the environment used by the batch score +# copy pipeline step +name: diabetes_regression_score_copy_env +dependencies: + # The python interpreter version. + # Currently Azure ML Workbench only supports 3.5.2 and later. + - python=3.7.* + - pip + + - pip: + # Base AzureML SDK + - azureml-sdk==1.27.* + + # Score copying deps + - azure-storage-blob diff --git a/diabetes_regression/conda_dependencies_scoring.yml b/diabetes_regression/conda_dependencies_scoring.yml new file mode 100644 index 00000000..e744b369 --- /dev/null +++ b/diabetes_regression/conda_dependencies_scoring.yml @@ -0,0 +1,32 @@ +# Conda environment specification. The dependencies defined in this file will +# be automatically provisioned for managed runs. These include runs against +# the localdocker, remotedocker, and cluster compute targets. + +# Note that this file is NOT used to automatically manage dependencies for the +# local compute target. To provision these dependencies locally, run: +# conda env update --file conda_dependencies.yml + +# Details about the Conda environment file format: +# https://conda.io/docs/using/envs.html#create-environment-file-by-hand + +# For managing Spark packages and configuration, see spark_dependencies.yml. +# Version of this configuration file's structure and semantics in AzureML. +# This directive is stored in a comment to preserve the Conda file structure. +# [AzureMlVersion] = 2 + +# These dependencies are used to create the environment used by the batch score +# pipeline step +name: diabetes_regression_scoring_env +dependencies: + # The python interpreter version. + # Currently Azure ML Workbench only supports 3.5.2 and later. + - python=3.7.* + - pip + + - pip: + # Base AzureML SDK + - azureml-sdk==1.27.* + + # Scoring deps + - scikit-learn + - pandas diff --git a/diabetes_regression/evaluate/evaluate_model.py b/diabetes_regression/evaluate/evaluate_model.py new file mode 100644 index 00000000..d1ff3c6a --- /dev/null +++ b/diabetes_regression/evaluate/evaluate_model.py @@ -0,0 +1,154 @@ +""" +Copyright (C) Microsoft Corporation. All rights reserved.​ + ​ +Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, +royalty-free right to use, copy, and modify the software code provided by us +("Software Code"). You may not sublicense the Software Code or any use of it +(except to your affiliates and to vendors to perform work on your behalf) +through distribution, network access, service agreement, lease, rental, or +otherwise. This license does not purport to express any claim of ownership over +data you may have shared with Microsoft in the creation of the Software Code. +Unless applicable law gives you more rights, Microsoft reserves all other +rights not expressly granted herein, whether by implication, estoppel or +otherwise. ​ + ​ +THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +""" +from azureml.core import Run +import argparse +import traceback +from util.model_helper import get_model + +run = Run.get_context() + +# if you would like to run this script on a local computer +# the following code is a good starting point for you +# use +# python -m evaluate.evaluate_model +# in diabetes_regression folder context + +# if (run.id.startswith('OfflineRun')): +# from dotenv import load_dotenv +# # For local development, set values in this section +# load_dotenv() +# sources_dir = os.environ.get("SOURCES_DIR_TRAIN") +# if (sources_dir is None): +# sources_dir = 'diabetes_regression' +# path_to_util = os.path.join(".", sources_dir, "util") +# sys.path.append(os.path.abspath(path_to_util)) # NOQA: E402 +# from model_helper import get_model +# workspace_name = os.environ.get("WORKSPACE_NAME") +# experiment_name = os.environ.get("EXPERIMENT_NAME") +# resource_group = os.environ.get("RESOURCE_GROUP") +# subscription_id = os.environ.get("SUBSCRIPTION_ID") +# tenant_id = os.environ.get("TENANT_ID") +# model_name = os.environ.get("MODEL_NAME") +# app_id = os.environ.get('SP_APP_ID') +# app_secret = os.environ.get('SP_APP_SECRET') +# build_id = os.environ.get('BUILD_BUILDID') +# # run_id useful to query previous runs +# run_id = "57fee47f-5ae8-441c-bc0c-d4c371f32d70" + +# aml_workspace = Workspace.get( +# name=workspace_name, +# subscription_id=subscription_id, +# resource_group=resource_group +# ) +# ws = aml_workspace +# exp = Experiment(ws, experiment_name) + +# comment the following three lines +# if you would like to use Offline mode +exp = run.experiment +ws = run.experiment.workspace +run_id = 'amlcompute' + +parser = argparse.ArgumentParser("evaluate") + +parser.add_argument( + "--run_id", + type=str, + help="Training run ID", +) +parser.add_argument( + "--model_name", + type=str, + help="Name of the Model", + default="diabetes_model.pkl", +) + +parser.add_argument( + "--allow_run_cancel", + type=str, + help="Set this to false to avoid evaluation step from cancelling run after an unsuccessful evaluation", # NOQA: E501 + default="true", +) + +args = parser.parse_args() +if (args.run_id is not None): + run_id = args.run_id +if (run_id == 'amlcompute'): + run_id = run.parent.id +model_name = args.model_name +metric_eval = "mse" + +allow_run_cancel = args.allow_run_cancel +# Parameterize the matrices on which the models should be compared +# Add golden data set on which all the model performance can be evaluated +try: + firstRegistration = False + tag_name = 'experiment_name' + + model = get_model( + model_name=model_name, + tag_name=tag_name, + tag_value=exp.name, + aml_workspace=ws) + + if (model is not None): + production_model_mse = 10000 + if (metric_eval in model.tags): + production_model_mse = float(model.tags[metric_eval]) + try: + new_model_mse = float(run.parent.get_metrics().get(metric_eval)) + except TypeError: + new_model_mse = None + if (production_model_mse is None or new_model_mse is None): + print("Unable to find ", metric_eval, " metrics, " + "exiting 
evaluation") + if((allow_run_cancel).lower() == 'true'): + run.parent.cancel() + else: + print( + "Current Production model {}: {}, ".format( + metric_eval, production_model_mse) + + "New trained model {}: {}".format( + metric_eval, new_model_mse + ) + ) + + if (new_model_mse < production_model_mse): + print("New trained model performs better, " + "thus it should be registered") + else: + print("New trained model metric is worse than or equal to " + "production model so skipping model registration.") + if((allow_run_cancel).lower() == 'true'): + run.parent.cancel() + else: + print("This is the first model, " + "thus it should be registered") + +except Exception: + traceback.print_exc(limit=None, file=None, chain=True) + print("Something went wrong trying to evaluate. Exiting.") + raise diff --git a/diabetes_regression/parameters.json b/diabetes_regression/parameters.json new file mode 100644 index 00000000..48f7227d --- /dev/null +++ b/diabetes_regression/parameters.json @@ -0,0 +1,18 @@ +{ + "training": + { + "alpha": 0.4 + }, + "evaluation": + { + + }, + "registration": + { + "tags": ["mse"] + }, + "scoring": + { + + } +} diff --git a/diabetes_regression/register/register_model.py b/diabetes_regression/register/register_model.py new file mode 100644 index 00000000..bca55a83 --- /dev/null +++ b/diabetes_regression/register/register_model.py @@ -0,0 +1,214 @@ +""" +Copyright (C) Microsoft Corporation. All rights reserved.​ + ​ +Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, +royalty-free right to use, copy, and modify the software code provided by us +("Software Code"). You may not sublicense the Software Code or any use of it +(except to your affiliates and to vendors to perform work on your behalf) +through distribution, network access, service agreement, lease, rental, or +otherwise. This license does not purport to express any claim of ownership over +data you may have shared with Microsoft in the creation of the Software Code. +Unless applicable law gives you more rights, Microsoft reserves all other +rights not expressly granted herein, whether by implication, estoppel or +otherwise. ​ + ​ +THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+""" +import json +import os +import sys +import argparse +import traceback +import joblib +from azureml.core import Run, Experiment, Workspace, Dataset +from azureml.core.model import Model as AMLModel + + +def main(): + + run = Run.get_context() + if (run.id.startswith('OfflineRun')): + from dotenv import load_dotenv + # For local development, set values in this section + load_dotenv() + workspace_name = os.environ.get("WORKSPACE_NAME") + experiment_name = os.environ.get("EXPERIMENT_NAME") + resource_group = os.environ.get("RESOURCE_GROUP") + subscription_id = os.environ.get("SUBSCRIPTION_ID") + # run_id useful to query previous runs + run_id = "bd184a18-2ac8-4951-8e78-e290bef3b012" + aml_workspace = Workspace.get( + name=workspace_name, + subscription_id=subscription_id, + resource_group=resource_group + ) + ws = aml_workspace + exp = Experiment(ws, experiment_name) + else: + ws = run.experiment.workspace + exp = run.experiment + run_id = 'amlcompute' + + parser = argparse.ArgumentParser("register") + + parser.add_argument( + "--run_id", + type=str, + help="Training run ID", + ) + + parser.add_argument( + "--model_name", + type=str, + help="Name of the Model", + default="diabetes_model.pkl", + ) + + parser.add_argument( + "--step_input", + type=str, + help=("input from previous steps") + ) + + args = parser.parse_args() + if (args.run_id is not None): + run_id = args.run_id + if (run_id == 'amlcompute'): + run_id = run.parent.id + model_name = args.model_name + model_path = args.step_input + + print("Getting registration parameters") + + # Load the registration parameters from the parameters file + with open("parameters.json") as f: + pars = json.load(f) + try: + register_args = pars["registration"] + except KeyError: + print("Could not load registration values from file") + register_args = {"tags": []} + + model_tags = {} + for tag in register_args["tags"]: + try: + mtag = run.parent.get_metrics()[tag] + model_tags[tag] = mtag + except KeyError: + print(f"Could not find {tag} metric on parent run.") + + # load the model + print("Loading model from " + model_path) + model_file = os.path.join(model_path, model_name) + model = joblib.load(model_file) + parent_tags = run.parent.get_tags() + try: + build_id = parent_tags["BuildId"] + except KeyError: + build_id = None + print("BuildId tag not found on parent run.") + print(f"Tags present: {parent_tags}") + try: + build_uri = parent_tags["BuildUri"] + except KeyError: + build_uri = None + print("BuildUri tag not found on parent run.") + print(f"Tags present: {parent_tags}") + + if (model is not None): + dataset_id = parent_tags["dataset_id"] + if (build_id is None): + register_aml_model( + model_file, + model_name, + model_tags, + exp, + run_id, + dataset_id) + elif (build_uri is None): + register_aml_model( + model_file, + model_name, + model_tags, + exp, + run_id, + dataset_id, + build_id) + else: + register_aml_model( + model_file, + model_name, + model_tags, + exp, + run_id, + dataset_id, + build_id, + build_uri) + else: + print("Model not found. 
Skipping model registration.") + sys.exit(0) + + +def model_already_registered(model_name, exp, run_id): + model_list = AMLModel.list(exp.workspace, name=model_name, run_id=run_id) + if len(model_list) >= 1: + e = ("Model name:", model_name, "in workspace", + exp.workspace, "with run_id ", run_id, "is already registered.") + print(e) + raise Exception(e) + else: + print("Model is not registered for this run.") + + +def register_aml_model( + model_path, + model_name, + model_tags, + exp, + run_id, + dataset_id, + build_id: str = 'none', + build_uri=None +): + try: + tagsValue = {"area": "diabetes_regression", + "run_id": run_id, + "experiment_name": exp.name} + tagsValue.update(model_tags) + if (build_id != 'none'): + model_already_registered(model_name, exp, run_id) + tagsValue["BuildId"] = build_id + if (build_uri is not None): + tagsValue["BuildUri"] = build_uri + + model = AMLModel.register( + workspace=exp.workspace, + model_name=model_name, + model_path=model_path, + tags=tagsValue, + datasets=[('training data', + Dataset.get_by_id(exp.workspace, dataset_id))]) + os.chdir("..") + print( + "Model registered: {} \nModel Description: {} " + "\nModel Version: {}".format( + model.name, model.description, model.version + ) + ) + except Exception: + traceback.print_exc(limit=None, file=None, chain=True) + print("Model registration failed") + raise + + +if __name__ == '__main__': + main() diff --git a/code/scoring/deployment_config_aci.yml b/diabetes_regression/scoring/deployment_config_aci.yml similarity index 73% rename from code/scoring/deployment_config_aci.yml rename to diabetes_regression/scoring/deployment_config_aci.yml index 939483b5..d2e0ba12 100644 --- a/code/scoring/deployment_config_aci.yml +++ b/diabetes_regression/scoring/deployment_config_aci.yml @@ -1,5 +1,4 @@ ---- +computeType: ACI containerResourceRequirements: cpu: 1 memoryInGB: 4 -computeType: ACI \ No newline at end of file diff --git a/code/scoring/deployment_config_aks.yml b/diabetes_regression/scoring/deployment_config_aks.yml similarity index 79% rename from code/scoring/deployment_config_aks.yml rename to diabetes_regression/scoring/deployment_config_aks.yml index 5cc78847..cd81009d 100644 --- a/code/scoring/deployment_config_aks.yml +++ b/diabetes_regression/scoring/deployment_config_aks.yml @@ -7,10 +7,10 @@ autoScaler: targetUtilization: 70 authEnabled: True containerResourceRequirements: - cpu: 1 - memoryInGB: 4 -appInsightsEnabled: False + cpu: 0.5 + memoryInGB: 2 +appInsightsEnabled: True scoringTimeoutMs: 5000 maxConcurrentRequestsPerContainer: 2 maxQueueWaitMs: 5000 -sslEnabled: True \ No newline at end of file +sslEnabled: True diff --git a/code/scoring/inference_config.yml b/diabetes_regression/scoring/inference_config.yml similarity index 78% rename from code/scoring/inference_config.yml rename to diabetes_regression/scoring/inference_config.yml index 3f65cf33..3fc86686 100644 --- a/code/scoring/inference_config.yml +++ b/diabetes_regression/scoring/inference_config.yml @@ -1,6 +1,6 @@ entryScript: score.py runtime: python -condaFile: conda_dependencies.yml +condaFile: ../conda_dependencies.yml extraDockerfileSteps: schemaFile: sourceDirectory: diff --git a/diabetes_regression/scoring/parallel_batchscore.py b/diabetes_regression/scoring/parallel_batchscore.py new file mode 100644 index 00000000..cd42c79c --- /dev/null +++ b/diabetes_regression/scoring/parallel_batchscore.py @@ -0,0 +1,157 @@ +""" +Copyright (C) Microsoft Corporation. 
All rights reserved.​ + ​ +Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, +royalty-free right to use, copy, and modify the software code provided by us +("Software Code"). You may not sublicense the Software Code or any use of it +(except to your affiliates and to vendors to perform work on your behalf) +through distribution, network access, service agreement, lease, rental, or +otherwise. This license does not purport to express any claim of ownership over +data you may have shared with Microsoft in the creation of the Software Code. +Unless applicable law gives you more rights, Microsoft reserves all other +rights not expressly granted herein, whether by implication, estoppel or +otherwise. ​ + ​ +THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +""" + +import numpy as np +import pandas as pd +import joblib +import sys +from typing import List +from util.model_helper import get_model +from azureml.core import Model + +model = None + + +def parse_args() -> List[str]: + """ + The AML pipeline calls this file with a set of additional command + line arguments whose names are not documented. As such using the + ArgumentParser which necessitates that we supply the names of the + arguments is risky should those undocumented names change. Hence + we parse the arguments manually. + + :returns: List of model filters + + :raises: ValueError + """ + model_name_param = [ + (sys.argv[idx], sys.argv[idx + 1]) + for idx, itm in enumerate(sys.argv) + if itm == "--model_name" + ] + + if len(model_name_param) == 0: + raise ValueError( + "Model name is required but no model name parameter was passed to the script" # NOQA: E501 + ) + + model_name = model_name_param[0][1] + + model_version_param = [ + (sys.argv[idx], sys.argv[idx + 1]) + for idx, itm in enumerate(sys.argv) + if itm == "--model_version" + ] + model_version = ( + None + if len(model_version_param) < 1 + or len(model_version_param[0][1].strip()) == 0 # NOQA: E501 + else model_version_param[0][1] + ) + + model_tag_name_param = [ + (sys.argv[idx], sys.argv[idx + 1]) + for idx, itm in enumerate(sys.argv) + if itm == "--model_tag_name" + ] + model_tag_name = ( + None + if len(model_tag_name_param) < 1 + or len(model_tag_name_param[0][1].strip()) == 0 # NOQA: E501 + else model_tag_name_param[0][1] + ) + + model_tag_value_param = [ + (sys.argv[idx], sys.argv[idx + 1]) + for idx, itm in enumerate(sys.argv) + if itm == "--model_tag_value" + ] + model_tag_value = ( + None + if len(model_tag_value_param) < 1 + or len(model_tag_name_param[0][1].strip()) == 0 + else model_tag_value_param[0][1] + ) + + return [model_name, model_version, model_tag_name, model_tag_value] + + +def init(): + """ + Initializer called once per node that runs the scoring job. Parse command + line arguments and get the right model to use for scoring. 
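+
+    The parsed filters (--model_name, plus the optional --model_version,
+    --model_tag_name and --model_tag_value) select the registered model,
+    and the loaded model is kept in the module-level 'model' global so
+    that repeated run() calls on this node reuse it without reloading.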
+ """ + try: + print("Initializing batch scoring script...") + + # Get the model using name/version/tags filter + model_filter = parse_args() + amlmodel = get_model( + model_name=model_filter[0], + model_version=model_filter[1], + tag_name=model_filter[2], + tag_value=model_filter[3]) + + # Load the model using name/version found + global model + modelpath = Model.get_model_path( + model_name=amlmodel.name, version=amlmodel.version) + model = joblib.load(modelpath) + print("Loaded model {}".format(model_filter[0])) + except Exception as ex: + print("Error: {}".format(ex)) + + +def run(mini_batch: pd.DataFrame) -> pd.DataFrame: + """ + The run method is called multiple times by the runtime. Each time + a mini-batch consisting of a portion of the input data is passed + in as a pandas DataFrame. The run method should return the scoring + results as a List or a pandas DataFrame. + + :param mini_batch: Dataframe containing a portion of the scoring data + + :returns: array containing the scores. + """ + + try: + result = None + + for _, sample in mini_batch.iterrows(): + # prediction + pred = model.predict(sample.values.reshape(1, -1)) + result = ( + np.array(pred) if result is None else np.vstack((result, pred)) + ) # NOQA: E501 + + return ( + [] + if result is None + else mini_batch.join(pd.DataFrame(result, columns=["score"])) + ) + + except Exception as ex: + print(ex) diff --git a/diabetes_regression/scoring/parallel_batchscore_copyoutput.py b/diabetes_regression/scoring/parallel_batchscore_copyoutput.py new file mode 100644 index 00000000..1bcde4b6 --- /dev/null +++ b/diabetes_regression/scoring/parallel_batchscore_copyoutput.py @@ -0,0 +1,91 @@ +""" +Copyright (C) Microsoft Corporation. All rights reserved.​ + ​ +Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, +royalty-free right to use, copy, and modify the software code provided by us +("Software Code"). You may not sublicense the Software Code or any use of it +(except to your affiliates and to vendors to perform work on your behalf) +through distribution, network access, service agreement, lease, rental, or +otherwise. This license does not purport to express any claim of ownership over +data you may have shared with Microsoft in the creation of the Software Code. +Unless applicable law gives you more rights, Microsoft reserves all other +rights not expressly granted herein, whether by implication, estoppel or +otherwise. ​ + ​ +THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+""" + +from azure.storage.blob import ContainerClient +from datetime import datetime, date, timezone +import argparse +import os + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--output_path", type=str, default=None) + parser.add_argument("--scoring_datastore", type=str, default=None) + parser.add_argument("--score_container", type=str, default=None) + parser.add_argument("--scoring_datastore_key", type=str, default=None) + parser.add_argument("--scoring_output_filename", type=str, default=None) + + return parser.parse_args() + + +def copy_output(args): + print("Output : {}".format(args.output_path)) + + accounturl = "https://{}.blob.core.windows.net".format( + args.scoring_datastore + ) # NOQA E501 + + containerclient = ContainerClient( + accounturl, args.score_container, args.scoring_datastore_key + ) + + destfolder = date.today().isoformat() + filetime = ( + datetime.now(timezone.utc) + .time() + .isoformat("milliseconds") + .replace(":", "_") + .replace(".", "_") + ) # noqa E501 + destfilenameparts = args.scoring_output_filename.split(".") + destblobname = "{}/{}_{}.{}".format( + destfolder, destfilenameparts[0], filetime, destfilenameparts[1] + ) + + destblobclient = containerclient.get_blob_client(destblobname) + with open( + os.path.join(args.output_path, "parallel_run_step.txt"), "rb" + ) as scorefile: # noqa E501 + destblobclient.upload_blob(scorefile, blob_type="BlockBlob") + + +if __name__ == "__main__": + args = parse_args() + if ( + args.scoring_datastore is None + or args.scoring_datastore.strip() == "" + or args.score_container is None + or args.score_container.strip() == "" + or args.scoring_datastore_key is None + or args.scoring_datastore_key.strip() == "" + or args.scoring_output_filename is None + or args.scoring_output_filename.strip() == "" + or args.output_path is None + or args.output_path.strip() == "" + ): + print("Missing parameters in parallel_batchscore_copyoutput.py -- Not going to copy inferences to an output datastore") # NOQA E501 + else: + copy_output(args) diff --git a/diabetes_regression/scoring/score.py b/diabetes_regression/scoring/score.py new file mode 100644 index 00000000..4acd5c8d --- /dev/null +++ b/diabetes_regression/scoring/score.py @@ -0,0 +1,90 @@ +""" +Copyright (C) Microsoft Corporation. All rights reserved.​ + ​ +Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, +royalty-free right to use, copy, and modify the software code provided by us +("Software Code"). You may not sublicense the Software Code or any use of it +(except to your affiliates and to vendors to perform work on your behalf) +through distribution, network access, service agreement, lease, rental, or +otherwise. This license does not purport to express any claim of ownership over +data you may have shared with Microsoft in the creation of the Software Code. +Unless applicable law gives you more rights, Microsoft reserves all other +rights not expressly granted herein, whether by implication, estoppel or +otherwise. ​ + ​ +THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +""" +import numpy +import joblib +import os +from azureml.core.model import Model +from inference_schema.schema_decorators \ + import input_schema, output_schema +from inference_schema.parameter_types.numpy_parameter_type \ + import NumpyParameterType + + +def init(): + # load the model from file into a global object + global model + + # we assume that we have just one model + # AZUREML_MODEL_DIR is an environment variable created during deployment. + # It is the path to the model folder + # (./azureml-models/$MODEL_NAME/$VERSION) + model_path = Model.get_model_path( + os.getenv("AZUREML_MODEL_DIR").split('/')[-2]) + + model = joblib.load(model_path) + + +input_sample = numpy.array([ + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], + [10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]]) +output_sample = numpy.array([ + 5021.509689995557, + 3693.645386402646]) + + +# Inference_schema generates a schema for your web service +# It then creates an OpenAPI (Swagger) specification for the web service +# at http:///swagger.json +@input_schema('data', NumpyParameterType(input_sample)) +@output_schema(NumpyParameterType(output_sample)) +def run(data, request_headers): + result = model.predict(data) + + # Demonstrate how we can log custom data into the Application Insights + # traces collection. + # The 'X-Ms-Request-id' value is generated internally and can be used to + # correlate a log entry with the Application Insights requests collection. + # The HTTP 'traceparent' header may be set by the caller to implement + # distributed tracing (per the W3C Trace Context proposed specification) + # and can be used to correlate the request to external systems. 
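+    # For example, a request sent with header "X-Ms-Request-Id: abc-123"
+    # and two input rows would log (identifier hypothetical):
+    #   {"RequestId":"abc-123", "TraceParent":"", "NumberOfPredictions":2}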
+ print(('{{"RequestId":"{0}", ' + '"TraceParent":"{1}", ' + '"NumberOfPredictions":{2}}}' + ).format( + request_headers.get("X-Ms-Request-Id", ""), + request_headers.get("Traceparent", ""), + len(result) + )) + + return {"result": result.tolist()} + + +if __name__ == "__main__": + # Test scoring + init() + test_row = '{"data":[[1,2,3,4,5,6,7,8,9,10],[10,9,8,7,6,5,4,3,2,1]]}' + prediction = run(test_row, {}) + print("Test result: ", prediction) diff --git a/diabetes_regression/scoring/scoreA.py b/diabetes_regression/scoring/scoreA.py new file mode 100644 index 00000000..ac4a6100 --- /dev/null +++ b/diabetes_regression/scoring/scoreA.py @@ -0,0 +1,6 @@ +def init(): + global model + + +def run(raw_data): + return "New Model A" diff --git a/diabetes_regression/scoring/scoreB.py b/diabetes_regression/scoring/scoreB.py new file mode 100644 index 00000000..c0865269 --- /dev/null +++ b/diabetes_regression/scoring/scoreB.py @@ -0,0 +1,6 @@ +def init(): + global model + + +def run(raw_data): + return "New Model B" diff --git a/diabetes_regression/training/R/r_train.r b/diabetes_regression/training/R/r_train.r new file mode 100644 index 00000000..c19a58be --- /dev/null +++ b/diabetes_regression/training/R/r_train.r @@ -0,0 +1,41 @@ +print(R.version.string) + +# COMMAND ---------- + +path="weight_data.csv" +print(paste("Reading file from",path)) + +routes<-read.csv(path, header=TRUE) + +# The predictor vector (height). +x <- routes$height +# The response vector (weight). +y <- routes$weight +# Apply the lm() function. +model <- lm(y~x) + +# COMMAND ---------- + +routes + +# COMMAND ---------- + +# Make Predictions +df_test_heights <- data.frame(x = as.numeric(c(115,20))) +result <- predict(model,df_test_heights) +print(result) + +# COMMAND ---------- + +# Save the model to blob storage +model_path="model.rds" +saveRDS(model, model_path) + +# COMMAND ---------- + +# View model details +print(model) + +# COMMAND ---------- + +print('Completed') \ No newline at end of file diff --git a/diabetes_regression/training/R/train_with_r.py b/diabetes_regression/training/R/train_with_r.py new file mode 100644 index 00000000..b8a0a2c3 --- /dev/null +++ b/diabetes_regression/training/R/train_with_r.py @@ -0,0 +1,3 @@ +import subprocess + +subprocess.check_call(["bash", "-c", "Rscript r_train.r && ls -ltr model.rds"]) diff --git a/diabetes_regression/training/R/train_with_r_on_databricks.py b/diabetes_regression/training/R/train_with_r_on_databricks.py new file mode 100644 index 00000000..c571d609 --- /dev/null +++ b/diabetes_regression/training/R/train_with_r_on_databricks.py @@ -0,0 +1,15 @@ +import os +import argparse + +parser = argparse.ArgumentParser("train") +parser.add_argument( + "--AZUREML_SCRIPT_DIRECTORY_NAME", + type=str, + help="folder", +) + +args, unknown = parser.parse_known_args() +folder = args.AZUREML_SCRIPT_DIRECTORY_NAME + +os.system("cd " + "/dbfs/" + folder + + " && Rscript r_train.r && ls -ltr model.rds") diff --git a/diabetes_regression/training/R/weight_data.csv b/diabetes_regression/training/R/weight_data.csv new file mode 100644 index 00000000..cc441ee9 --- /dev/null +++ b/diabetes_regression/training/R/weight_data.csv @@ -0,0 +1,30 @@ +height,weight +79,174 +63,250 +75,223 +75,130 +70,120 +76,239 +63,129 +64,185 +59,246 +80,241 +79,217 +65,212 +74,242 +71,223 +61,167 +78,148 +75,229 +75,116 +75,182 +72,237 +72,160 +79,169 +67,219 +61,202 +65,168 +79,181 +81,214 +78,216 +59,245 diff --git a/diabetes_regression/training/test_train.py b/diabetes_regression/training/test_train.py new file 
mode 100644 index 00000000..e1a79781 --- /dev/null +++ b/diabetes_regression/training/test_train.py @@ -0,0 +1,32 @@ +import numpy as np +from diabetes_regression.training.train import train_model, get_model_metrics + + +def test_train_model(): + X_train = np.array([1, 2, 3, 4, 5, 6]).reshape(-1, 1) + y_train = np.array([10, 9, 8, 8, 6, 5]) + data = {"train": {"X": X_train, "y": y_train}} + + reg_model = train_model(data, {"alpha": 1.2}) + + preds = reg_model.predict([[1], [2]]) + np.testing.assert_almost_equal(preds, [9.93939393939394, 9.03030303030303]) + + +def test_get_model_metrics(): + + class MockModel: + + @staticmethod + def predict(data): + return ([8.12121212, 7.21212121]) + + X_test = np.array([3, 4]).reshape(-1, 1) + y_test = np.array([8, 7]) + data = {"test": {"X": X_test, "y": y_test}} + + metrics = get_model_metrics(MockModel(), data) + + assert 'mse' in metrics + mse = metrics['mse'] + np.testing.assert_almost_equal(mse, 0.029843893480257067) diff --git a/code/scoring/score.py b/diabetes_regression/training/train.py similarity index 50% rename from code/scoring/score.py rename to diabetes_regression/training/train.py index dafe6bee..22258042 100644 --- a/code/scoring/score.py +++ b/diabetes_regression/training/train.py @@ -23,35 +23,62 @@ ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ -import json -import numpy -from azureml.core.model import Model -import joblib - - -def init(): - global model - - # load the model from file into a global object - model_path = Model.get_model_path( - model_name="sklearn_regression_model.pkl") - model = joblib.load(model_path) - - -def run(raw_data): - try: - data = json.loads(raw_data)["data"] - data = numpy.array(data) - result = model.predict(data) - return json.dumps({"result": result.tolist()}) - except Exception as e: - result = str(e) - return json.dumps({"error": result}) - - -if __name__ == "__main__": - # Test scoring - init() - test_row = '{"data":[[1,2,3,4,5,6,7,8,9,10],[10,9,8,7,6,5,4,3,2,1]]}' - prediction = run(test_row) - print("Test result: ", prediction) + +import os +import pandas as pd +from sklearn.linear_model import Ridge +from sklearn.metrics import mean_squared_error +from sklearn.model_selection import train_test_split + + +# Split the dataframe into test and train data +def split_data(df): + X = df.drop('Y', axis=1).values + y = df['Y'].values + + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=0) + data = {"train": {"X": X_train, "y": y_train}, + "test": {"X": X_test, "y": y_test}} + return data + + +# Train the model, return the model +def train_model(data, ridge_args): + reg_model = Ridge(**ridge_args) + reg_model.fit(data["train"]["X"], data["train"]["y"]) + return reg_model + + +# Evaluate the metrics for the model +def get_model_metrics(model, data): + preds = model.predict(data["test"]["X"]) + mse = mean_squared_error(preds, data["test"]["y"]) + metrics = {"mse": mse} + return metrics + + +def main(): + print("Running train.py") + + # Define training parameters + ridge_args = {"alpha": 0.5} + + # Load the training data as dataframe + data_dir = "data" + data_file = os.path.join(data_dir, 'diabetes.csv') + train_df = pd.read_csv(data_file) + + data = split_data(train_df) + + # Train the model + model = train_model(data, ridge_args) + + # Log the metrics for the model + metrics = get_model_metrics(model, data) + for (k, v) in metrics.items(): + print(f"{k}: {v}") + + +if __name__ == '__main__': + main() 
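As a quick illustration of how the refactored train.py composes, the
functions above can be exercised locally without Azure ML. This is a
minimal sketch with a made-up dataframe standing in for
data/diabetes.csv; every column name except the required "Y" target is
hypothetical:

import pandas as pd
from diabetes_regression.training.train import (
    split_data, train_model, get_model_metrics)

# Tiny synthetic dataset; split_data() expects the target in column "Y".
df = pd.DataFrame({
    "AGE": range(20),
    "BMI": [20 + 0.5 * i for i in range(20)],
    "Y": [100.0 + 3 * i for i in range(20)],
})

data = split_data(df)                      # 80/20 train/test split
model = train_model(data, {"alpha": 0.5})  # kwargs go straight to Ridge()
print(get_model_metrics(model, data))      # e.g. {'mse': ...}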
diff --git a/diabetes_regression/training/train_aml.py b/diabetes_regression/training/train_aml.py new file mode 100644 index 00000000..9303198b --- /dev/null +++ b/diabetes_regression/training/train_aml.py @@ -0,0 +1,176 @@ +""" +Copyright (C) Microsoft Corporation. All rights reserved.​ + ​ +Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, +royalty-free right to use, copy, and modify the software code provided by us +("Software Code"). You may not sublicense the Software Code or any use of it +(except to your affiliates and to vendors to perform work on your behalf) +through distribution, network access, service agreement, lease, rental, or +otherwise. This license does not purport to express any claim of ownership over +data you may have shared with Microsoft in the creation of the Software Code. +Unless applicable law gives you more rights, Microsoft reserves all other +rights not expressly granted herein, whether by implication, estoppel or +otherwise. ​ + ​ +THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +""" +from azureml.core.run import Run +from azureml.core import Dataset, Datastore, Workspace +import os +import argparse +import joblib +import json +from train import split_data, train_model, get_model_metrics + + +def register_dataset( + aml_workspace: Workspace, + dataset_name: str, + datastore_name: str, + file_path: str +) -> Dataset: + datastore = Datastore.get(aml_workspace, datastore_name) + dataset = Dataset.Tabular.from_delimited_files(path=(datastore, file_path)) + dataset = dataset.register(workspace=aml_workspace, + name=dataset_name, + create_new_version=True) + + return dataset + + +def main(): + print("Running train_aml.py") + + parser = argparse.ArgumentParser("train") + parser.add_argument( + "--model_name", + type=str, + help="Name of the Model", + default="diabetes_model.pkl", + ) + + parser.add_argument( + "--step_output", + type=str, + help=("output for passing data to next step") + ) + + parser.add_argument( + "--dataset_version", + type=str, + help=("dataset version") + ) + + parser.add_argument( + "--data_file_path", + type=str, + help=("data file path, if specified,\ + a new version of the dataset will be registered") + ) + + parser.add_argument( + "--caller_run_id", + type=str, + help=("caller run id, for example ADF pipeline run id") + ) + + parser.add_argument( + "--dataset_name", + type=str, + help=("Dataset name. 
Dataset must be passed by name\ + to always get the desired dataset version\ + rather than the one used when the pipeline was created") + ) + + args = parser.parse_args() + + print("Argument [model_name]: %s" % args.model_name) + print("Argument [step_output]: %s" % args.step_output) + print("Argument [dataset_version]: %s" % args.dataset_version) + print("Argument [data_file_path]: %s" % args.data_file_path) + print("Argument [caller_run_id]: %s" % args.caller_run_id) + print("Argument [dataset_name]: %s" % args.dataset_name) + + model_name = args.model_name + step_output_path = args.step_output + dataset_version = args.dataset_version + data_file_path = args.data_file_path + dataset_name = args.dataset_name + + run = Run.get_context() + + print("Getting training parameters") + + # Load the training parameters from the parameters file + with open("parameters.json") as f: + pars = json.load(f) + try: + train_args = pars["training"] + except KeyError: + print("Could not load training values from file") + train_args = {} + + # Log the training parameters + print(f"Parameters: {train_args}") + for (k, v) in train_args.items(): + run.log(k, v) + run.parent.log(k, v) + + # Get the dataset + if (dataset_name): + if (data_file_path == 'none'): + dataset = Dataset.get_by_name(run.experiment.workspace, dataset_name, dataset_version) # NOQA: E402, E501 + else: + dataset = register_dataset(run.experiment.workspace, + dataset_name, + os.environ.get("DATASTORE_NAME"), + data_file_path) + else: + e = ("No dataset provided") + print(e) + raise Exception(e) + + # Link dataset to the step run so it is trackable in the UI + run.input_datasets['training_data'] = dataset + run.parent.tag("dataset_id", value=dataset.id) + + # Split the data into test/train + df = dataset.to_pandas_dataframe() + data = split_data(df) + + # Train the model + model = train_model(data, train_args) + + # Evaluate and log the metrics returned from the train function + metrics = get_model_metrics(model, data) + for (k, v) in metrics.items(): + run.log(k, v) + run.parent.log(k, v) + + # Pass model file to next step + os.makedirs(step_output_path, exist_ok=True) + model_output_path = os.path.join(step_output_path, model_name) + joblib.dump(value=model, filename=model_output_path) + + # Also upload model file to run outputs for history + os.makedirs('outputs', exist_ok=True) + output_path = os.path.join('outputs', model_name) + joblib.dump(value=model, filename=output_path) + + run.tag("run_type", value="train") + print(f"tags now present for run: {run.tags}") + + run.complete() + + +if __name__ == '__main__': + main() diff --git a/diabetes_regression/util/__init__.py b/diabetes_regression/util/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/diabetes_regression/util/model_helper.py b/diabetes_regression/util/model_helper.py new file mode 100644 index 00000000..0fd20ef0 --- /dev/null +++ b/diabetes_regression/util/model_helper.py @@ -0,0 +1,79 @@ +""" +model_helper.py +""" +from azureml.core import Run +from azureml.core import Workspace +from azureml.core.model import Model as AMLModel + + +def get_current_workspace() -> Workspace: + """ + Retrieves and returns the current workspace. + Will not work when run locally. + + Parameters: + None + + Return: + The current workspace.
+ """ + run = Run.get_context(allow_offline=False) + experiment = run.experiment + return experiment.workspace + + +def get_model( + model_name: str, + model_version: int = None, # If none, return latest model + tag_name: str = None, + tag_value: str = None, + aml_workspace: Workspace = None +) -> AMLModel: + """ + Retrieves and returns a model from the workspace by its name + and (optional) tag. + + Parameters: + aml_workspace (Workspace): aml.core Workspace that the model lives. + model_name (str): name of the model we are looking for + (optional) model_version (str): model version. Latest if not provided. + (optional) tag (str): the tag value & name the model was registered under. + + Return: + A single aml model from the workspace that matches the name and tag, or + None. + """ + if aml_workspace is None: + print("No workspace defined - using current experiment workspace.") + aml_workspace = get_current_workspace() + + tags = None + if tag_name is not None or tag_value is not None: + # Both a name and value must be specified to use tags. + if tag_name is None or tag_value is None: + raise ValueError( + "model_tag_name and model_tag_value should both be supplied" + + "or excluded" # NOQA: E501 + ) + tags = [[tag_name, tag_value]] + + model = None + if model_version is not None: + # TODO(tcare): Finding a specific version currently expects exceptions + # to propagate in the case we can't find the model. This call may + # result in a WebserviceException that may or may not be due to the + # model not existing. + model = AMLModel( + aml_workspace, + name=model_name, + version=model_version, + tags=tags) + else: + models = AMLModel.list( + aml_workspace, name=model_name, tags=tags, latest=True) + if len(models) == 1: + model = models[0] + elif len(models) > 1: + raise Exception("Expected only one model") + + return model diff --git a/docs/canary_ab_deployment.md b/docs/canary_ab_deployment.md new file mode 100644 index 00000000..49edb503 --- /dev/null +++ b/docs/canary_ab_deployment.md @@ -0,0 +1,124 @@ +# Model deployment to AKS cluster with Canary deployment + +[![Build Status](https://aidemos.visualstudio.com/MLOps/_apis/build/status/microsoft.MLOpsPython-Canary?branchName=master)](https://aidemos.visualstudio.com/MLOps/_build/latest?definitionId=133&branchName=master) + +If your target deployment environment is a Kubernetes cluster and you want to implement [Canary and/or A/B testing deployment strategies](http://adfpractice-fedor.blogspot.com/2019/04/deployment-strategies-with-kubernetes.html) you can follow this sample guide. + +- [Prerequisites](#prerequisites) +- [Install Istio on a K8s cluster](#install-istio-on-a-k8s-cluster) +- [Set up variables](#set-up-variables) +- [Configure a pipeline to build and deploy a scoring Image](#configure-a-pipeline-to-build-and-deploy-a-scoring-image) +- [Build a new Scoring Image](#build-a-new-scoring-image) + +## Prerequisites + +Before continuing with this guide, you will need: + +* An [Azure Kubernetes Service (AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service) cluster + * This does **not** have to be the same cluster as the example in [Getting Started: Deploy the model to Azure Kubernetes Service](/docs/getting_started.md#deploy-the-model-to-azure-kubernetes-service) + * The cluster does not have to be connected to Azure Machine Learning. 
+ * If you want to deploy a new cluster, see [Quickstart: Deploy an Azure Kubernetes Service cluster using the Azure CLI](https://docs.microsoft.com/en-us/azure/aks/kubernetes-walkthrough) +* An Azure Container Registry instance that is authenticated with your Azure Kubernetes Service cluster. + * The chart you will deploy assumes you are authenticated using a service principal. + * See [Authenticate with Azure Container Registry from Azure Kubernetes Service](https://docs.microsoft.com/en-us/azure/aks/cluster-container-registry-integration#configure-acr-integration-for-existing-aks-clusters) for an authentication guide. +* In Azure DevOps, a service connection to your Kubernetes cluster. + * If you do not currently have a namespace, create one named 'abtesting'. + +## Install Istio on a K8s cluster + +You'll be using the [Istio](https://istio.io) service mesh implementation to control traffic routing between model versions. Follow the instructions at [Install and use Istio in Azure Kubernetes Service (AKS)](https://docs.microsoft.com/azure/aks/servicemesh-istio-install?pivots=client-operating-system-linux). + +After Istio is installed, determine the Istio gateway endpoint on your K8s cluster: + +```bash +GATEWAY_IP=$(kubectl get svc istio-ingressgateway -n istio-system -o jsonpath='{.status.loadBalancer.ingress[0].ip}') +``` + +You don't need to create any Istio resources (e.g. Gateway or VirtualService) at this point. That is handled by the AzDo pipeline that builds and deploys a scoring image. + +## Set up variables + +There are some extra variables that you need to set up in the ***devopsforai-aml-vg*** variable group (see [getting started](./getting_started.md)): + +| Variable Name | Suggested Value | Short Description | +|---------------------------|-----------------------|-----------------------------------------------------------| +| K8S_AB_SERVICE_CONNECTION | mlops-aks | Name of the service connection to your Kubernetes cluster | +| K8S_AB_NAMESPACE | abtesting | Kubernetes namespace for model deployment | +| IMAGE_REPO_NAME | [Your ACR's DNS name] | Image repository name (e.g. mlopspyciamlcr.azurecr.io) | + +## Configure a pipeline to build and deploy a scoring Image + +Import and run the [abtest.yml](./.pipelines/abtest.yml) multistage deployment pipeline. + +After the pipeline completes successfully, you will see a registered Docker image in the ACR repository attached to the Azure ML Service: + +![scoring image](./images/scoring_image.png) + +The pipeline creates an Istio Gateway and a VirtualService and deploys the scoring image to the Kubernetes cluster. + +```bash +kubectl get deployments --namespace abtesting +NAME READY UP-TO-DATE AVAILABLE AGE +model-green 1/1 1 1 19h +``` + +## Build a new Scoring Image + +Change the value of the ***SCORE_SCRIPT*** variable in the [abtest.yml](./.pipelines/abtest.yml) to point to ***scoring/scoreA.py*** and merge it to the master branch. + +**Note:** The ***scoreA.py*** and ***scoreB.py*** files used in this tutorial are just mockups returning either "New Model A" or "New Model B" respectively. They are used to demonstrate the concept of testing two scoring images with different models or scoring code. In real life you would implement a scoring file similar to [score.py](./../diabetes_regression/scoring/score.py) (see the [Getting Started](./getting_started.md) guide).
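The mockups only need to honor the `init()`/`run()` contract that Azure ML scoring containers expect. A sketch of what such a file might look like (the actual `scoreA.py` in the repository may differ in detail):

```python
import json


def init():
    # Intentionally empty: the mockup skips model loading so the canary
    # rollout mechanics can be tested without a real model.
    pass


def run(raw_data):
    # Ignore the payload and return a fixed marker so the response reveals
    # which image (A or B) served the request.
    return json.dumps("New Model A")
```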
+ +It will automatically trigger the pipeline and deploy a new scoring image with the following stages implementing ***Canary*** deployment strategy: + +| Stage | Green Weight | Blue Weight | Description | +|------------|--------------|-------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------| +| Blue_0 | 100 | 0 | New image (blue) is deployed.
But all traffic (100%) is still routed to the old (green) image. | +| Blue_50 | 50 | 50 | Traffic is split between old (green) and new (blue) images 50/50. | +| Blue_100 | 0 | 100 | All traffic (100%) is routed to the blue image. | +| Blue_Green | 0 | 100 | Old green image is removed. The new blue image is copied as green.
Blue and Green images are equal.
All traffic (100%) is routed to the blue image. | +| Green_100 | 100 | 0 | All traffic (100%) is routed to the green image.
The blue image is removed. | + +**Note:** The pipeline performs the rollout without any pausing. You may want to configure [Approvals and Checks](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/approvals?view=azure-devops&tabs=check-pass) for the stages on your environment for better experience of the model testing. The environment ***abtestenv*** will be added automatically to your AzDo project after the first pipeline run. + +At each stage you can verify how the traffic is routed sending requests to $GATEWAY_IP/score with ***Postman*** or with ***curl***: + +```bash +curl $GATEWAY_IP/score +``` + +You can also emulate a simple load test on the gateway with the ***load_test.sh***: + +```bash +./charts/load_test.sh 10 $GATEWAY_IP/score +``` + +The command above sends 10 requests to the gateway. So if the pipeline has completed stage Blue_50, the result will look like this: + +```bash +"New Model A" +"New Model A" +"New Model A" +"New Model B" +"New Model A" +"New Model B" +"New Model B" +"New Model A" +"New Model A" +"New Model A" +``` + +Regardless of the blue/green weight values set on the cluster, you can perform ***A/B testing*** and send requests directly to either blue or green images: + +```bash +curl --header "x-api-version: blue" $GATEWAY_IP/score +curl --header "x-api-version: green" $GATEWAY_IP/score +``` + +or with a load_test.sh script: + +```bash +./charts/load_test.sh 10 $GATEWAY_IP/score blue +./charts/load_test.sh 10 $GATEWAY_IP/score green +``` + +In this case the Istio Virtual Service analyzes the request header and routes the traffic directly to the specified model version. diff --git a/docs/code_description.md b/docs/code_description.md index d93ef077..81abc78f 100644 --- a/docs/code_description.md +++ b/docs/code_description.md @@ -1,38 +1,97 @@ ## Repo Details +### Directory Structure + +High level directory structure for this repository: + +```bash +├── .pipelines <- Azure DevOps YAML pipelines for CI, PR and model training and deployment. +├── bootstrap <- Python script to initialize this repository with a custom project name. +├── charts <- Helm charts to deploy resources on Azure Kubernetes Service(AKS). +├── data <- Initial set of data to train and evaluate model. Not for use to store data. +├── diabetes_regression <- The top-level folder for the ML project. +│ ├── evaluate <- Python script to evaluate trained ML model. +│ ├── register <- Python script to register trained ML model with Azure Machine Learning Service. +│ ├── scoring <- Python score.py to deploy trained ML model. +│ ├── training <- Python script to train ML model. +│ ├── R <- R script to train R based ML model. +│ ├── util <- Python script for various utility operations specific to this ML project. +├── docs <- Extensive markdown documentation for entire project. +├── environment_setup <- The top-level folder for everything related to infrastructure. +│ ├── arm-templates <- Azure Resource Manager(ARM) templates to build infrastructure needed for this project. +│ ├── tf-templates <- Terraform templates to build infrastructure needed for this project. +├── experimentation <- Jupyter notebooks with ML experimentation code. +├── ml_service <- The top-level folder for all Azure Machine Learning resources. +│ ├── pipelines <- Python script that builds Azure Machine Learning pipelines. +│ ├── util <- Python script for various utility operations specific to Azure Machine Learning. +├── .env.example <- Example .env file with environment for local development experience. 
+├── .gitignore <- A gitignore file specifies intentionally un-tracked files that Git should ignore. +├── LICENSE <- License document for this project. +├── README.md <- The top-level README for developers using this project. +``` + +The repository provides a template with a folder structure suitable for maintaining multiple ML projects. There are common folders such as ***.pipelines***, ***environment_setup***, ***ml_service*** and folders containing the code base for each ML project. This repository contains a single sample ML project in the ***diabetes_regression*** folder. This folder will be automatically renamed to your project name if you follow the [bootstrap procedure](../bootstrap/README.md). + ### Environment Setup -- `environment_setup/requirements.txt` : It consists of a list of python packages which are needed by the train.py to run successfully on host agent (locally). +- `environment_setup/install_requirements.sh` : This script prepares a local conda environment, i.e. it installs the Azure ML SDK and the packages specified in the environment definitions. + +- `environment_setup/iac-*-arm.yml, arm-templates` : Infrastructure as Code pipelines to create required resources using ARM, along with corresponding arm-templates. Infrastructure as Code can be deployed with this template or with the Terraform template. -- `environment_setup/install_requirements.sh` : This script prepares the python environment i.e. install the Azure ML SDK and the packages specified in requirements.txt +- `environment_setup/iac-*-tf.yml, tf-templates` : Infrastructure as Code pipelines to create required resources using Terraform, along with corresponding tf-templates. Infrastructure as Code can be deployed with this template or with the ARM template. -- `environment_setup/iac-*.yml, arm-templates` : Infrastructure as Code piplines to create and delete required resources along with corresponding arm-templates. +- `environment_setup/iac-remove-environment.yml` : Infrastructure as Code pipeline to delete the provisioned resources. - `environment_setup/Dockerfile` : Dockerfile of a build agent containing Python 3.6 and all required packages. -- `environment_setup/docker-image-pipeline.yml` : An AzDo pipeline for building and pushing [microsoft/mlopspython](https://hub.docker.com/_/microsoft-mlops-python) image. +- `environment_setup/docker-image-pipeline.yml` : An AzDo pipeline for building and pushing the [microsoft/mlopspython](https://hub.docker.com/_/microsoft-mlops-python) image. ### Pipelines -- `.pipelines/azdo-base-pipeline.yml` : a pipeline template used by ci-build-train pipeline and pr-build-train pipelines. It contains steps performing linting, data and unit testing. -- `.pipelines/azdo-ci-build-train.yml` : a pipeline triggered when the code is merged into **master**. It performs linting, data integrity testing, unit testing, building and publishing an ML pipeline. -- `.pipelines/azdo-pr-build-train.yml` : a pipeline triggered when a **pull request** to the **master** branch is created. It performs linting, data integrity testing and unit testing only. +- `.pipelines/abtest.yml` : a pipeline demonstrating the [Canary deployment strategy](./docs/canary_ab_deployment.md). +- `.pipelines/code-quality-template.yml` : a pipeline template used by the CI and PR pipelines. It contains steps performing linting, data and unit testing. +- `.pipelines/diabetes_regression-ci-image.yml` : a pipeline building a scoring image for the diabetes regression model.
+- `.pipelines/diabetes_regression-ci.yml` : a pipeline triggered when the code is merged into **master**. It performs linting, data integrity testing, unit testing, building and publishing an ML pipeline. +- `.pipelines/diabetes_regression-cd.yml` : a pipeline triggered when the code is merged into **master** and the `.pipelines/diabetes_regression-ci.yml` completes. Deploys the model to ACI, AKS or Webapp. +- `.pipelines/diabetes_regression-package-model-template.yml` : Pipeline template that creates a model package and adds the package location to the environment for subsequent tasks to use. +- `.pipelines/diabetes_regression-get-model-id-artifact-template.yml` : a pipeline template used by the `.pipelines/diabetes_regression-cd.yml` pipeline. It takes the model metadata artifact published by the previous pipeline and gets the model ID. +- `.pipelines/diabetes_regression-publish-model-artifact-template.yml` : a pipeline template used by the `.pipelines/diabetes_regression-ci.yml` pipeline. It finds out if a new model was registered and publishes a pipeline artifact containing the model metadata. +- `.pipelines/helm-*.yml` : pipeline templates used by the `.pipelines/abtest.yml` pipeline. +- `.pipelines/pr.yml` : a pipeline triggered when a **pull request** to the **master** branch is created. It performs linting, data integrity testing and unit testing only. ### ML Services -- `ml_service/pipelines/build_train_pipeline.py` : builds and publishes an ML training pipeline. -- `ml_service/pipelines/run_train_pipeline.py` : invokes a published ML training pipeline via REST API. +- `ml_service/pipelines/diabetes_regression_build_train_pipeline.py` : builds and publishes an ML training pipeline. It uses Python on ML Compute. +- `ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py` : builds and publishes an ML training pipeline. It uses R on ML Compute. +- `ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r_on_dbricks.py` : builds and publishes an ML training pipeline. It uses R on Databricks Compute. +- `ml_service/pipelines/run_train_pipeline.py` : invokes a published ML training pipeline (Python on ML Compute) via REST API. - `ml_service/util` : contains common utility functions used to build and publish an ML training pipeline. -### Code +### Environment Definitions -- `code/training/train.py` : a training step of an ML training pipeline. -- `code/evaluate/evaluate_model.py` : an evaluating step of an ML training pipeline which registers a new trained model if evaluation shows the new model is more performant than the previous one. -- `code/evaluate/register_model.py` : (LEGACY) registers a new trained model if evaluation shows the new model is more performant than the previous one. +- `diabetes_regression/conda_dependencies.yml` : Conda environment definition for the environment used for both training and scoring (Docker image in which train.py and score.py are run). +- `diabetes_regression/ci_dependencies.yml` : Conda environment definition for the CI environment. -### Scoring -- code/scoring/score.py : a scoring script which is about to be packed into a Docker Image along with a model while being deployed to QA/Prod environment. 
-- code/scoring/conda_dependencies.yml : contains a list of dependencies required by score.py to be installed in a deployable Docker Image -- code/scoring/inference_config.yml, deployment_config_aci.yml, deployment_config_aks.yml : configuration files for the [AML Model Deploy](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.private-vss-services-azureml&ssr=false#overview) pipeline task for ACI and AKS deployment targets. +### Training Step +- `diabetes_regression/training/train_aml.py`: a training step of an ML training pipeline. +- `diabetes_regression/training/train.py` : ML functionality called by train_aml.py +- `diabetes_regression/training/R/r_train.r` : trains a model with R based on a sample dataset (weight_data.csv). +- `diabetes_regression/training/R/train_with_r.py` : a python wrapper (ML Pipeline Step) invoking the R training script on ML Compute +- `diabetes_regression/training/R/train_with_r_on_databricks.py` : a python wrapper (ML Pipeline Step) invoking the R training script on Databricks Compute +- `diabetes_regression/training/R/weight_data.csv` : a sample dataset used by the R script (r_train.r) to train a model +- `diabetes_regression/training/R/test_train.py` : a unit test for the training script(s) + +### Evaluation Step + +- `diabetes_regression/evaluate/evaluate_model.py` : an evaluation step which cancels the pipeline in case of non-improvement. + +### Registering Step + +- `diabetes_regression/register/register_model.py` : registers a new trained model if evaluation shows the new model is more performant than the previous one. + +### Scoring +- `diabetes_regression/scoring/score.py` : a scoring script that is packaged into a Docker image along with the model when deploying to the QA/Prod environment. +- `diabetes_regression/scoring/inference_config.yml`, `deployment_config_aci.yml`, `deployment_config_aks.yml` : configuration files for the [AML Model Deploy](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.private-vss-services-azureml&ssr=false#overview) pipeline task for ACI and AKS deployment targets. +- `diabetes_regression/scoring/scoreA.py`, `diabetes_regression/scoring/scoreB.py` : simplified scoring files for the [Canary deployment sample](./docs/canary_ab_deployment.md). diff --git a/docs/custom_container.md b/docs/custom_container.md new file mode 100644 index 00000000..46e692f9 --- /dev/null +++ b/docs/custom_container.md @@ -0,0 +1,113 @@ +# Customizing the Azure DevOps job container + +The model training and deployment pipeline uses a Docker container +on the Azure Pipelines agents to provide a reproducible environment +to run test and deployment code. + The image of the container +`mcr.microsoft.com/mlops/python:latest` is built with this +[Dockerfile](../environment_setup/Dockerfile). + +Additionally, the mcr.microsoft.com/mlops/python image is also tagged with the tags below. + +| Image Tags | Description | +| ----------------------------------------------- | :---------------------------------------------------------------------------------------- | +| mcr.microsoft.com/mlops/python:latest | latest image | +| mcr.microsoft.com/mlops/python:build-[id] | where [id] is Azure Devops build id e.g. mcr.microsoft.com/mlops/python:build-20200325.1 | +| mcr.microsoft.com/mlops/python:amlsdk-[version] | where [version] is aml sdk version e.g. mcr.microsoft.com/mlops/python:amlsdk-1.1.5.1 | +| mcr.microsoft.com/mlops/python:release-[id] | where [id] is github release id e.g.
mcr.microsoft.com/mlops/python:release-3.0.0 | + +In your project you will want to build your own +Docker image that only contains the dependencies and tools required for your +use case. This image will most likely be smaller and therefore faster, and it +will be fully maintained by your team. + +## Provision an Azure Container Registry + +An Azure Container Registry is deployed alongside your Azure ML Workspace to manage models. +You can use that registry instance to store your MLOps container image as well, or +provision a separate instance. + +## Create a Registry Service Connection + +[Create a service connection](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml#sep-docreg) to your Azure Container Registry: + +- As *Connection type*, select *Docker Registry* +- As *Registry type*, select *Azure Container Registry* +- As *Azure container registry*, select your Container registry instance +- As *Service connection name*, enter `acrconnection` + +## Update the environment definition + +Modify the [Dockerfile](../environment_setup/Dockerfile) and/or the +[ci_dependencies.yml](../diabetes_regression/ci_dependencies.yml) CI Conda +environment definition to tailor your environment. +Conda provides a [reusable environment for training and deployment with Azure Machine Learning](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-environments). +The Conda environment used for CI should use the same package versions as the Conda environment +used for the Azure ML training and scoring environments (defined in [conda_dependencies.yml](../diabetes_regression/conda_dependencies.yml)); see the sketch at the end of this page for one way to check this. +This enables you to run unit and integration tests using the exact same dependencies as used in the ML pipeline. + +If a package is available in a Conda package repository, then we recommend that +you use the Conda installation rather than the pip installation. Conda packages +typically come with prebuilt binaries that make installation more reliable. + +## Create a container build pipeline + +In your [Azure DevOps](https://dev.azure.com) project create a new build +pipeline referring to the +[environment_setup/docker-image-pipeline.yml](../environment_setup/docker-image-pipeline.yml) +pipeline definition in your forked repository. + +Edit the [environment_setup/docker-image-pipeline.yml](../environment_setup/docker-image-pipeline.yml) file +and replace the string `'public/mlops/python'` with a name that describes your environment, +e.g. `'mlops/diabetes_regression'`. + +Save and run the pipeline, making sure to set these runtime variables: `amlsdkversion` and `githubrelease`. The values are up to you to set depending on your environment. These will show as tags on your image. + +![Custom Container Vars](./images/custom-container-variables.png) + +This will build and push a container image to your Azure Container Registry with +the name you have just edited. The next step is to modify the build pipeline to run the CI job on a container +run from that image.
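As a side note on the guidance above about keeping the CI Conda environment in lockstep with the training/scoring environment: a small helper script can surface drift between the two definition files. This is only a sketch; it assumes dependencies are pinned as plain `name=version` (conda) or `name==version` (pip) strings, and that the PyYAML package is available:

```python
import yaml  # from the pyyaml package


def pins(path):
    """Collect dependency pins from a conda environment definition."""
    with open(path) as f:
        env = yaml.safe_load(f)
    found = set()
    for dep in env.get("dependencies", []):
        if isinstance(dep, str):
            found.add(dep)
        elif isinstance(dep, dict):  # the nested 'pip:' section
            found.update(dep.get("pip", []))
    return found


ci = pins("diabetes_regression/ci_dependencies.yml")
train = pins("diabetes_regression/conda_dependencies.yml")

# Anything pinned for training but absent from CI deserves a look.
for pin in sorted(train - ci):
    print(f"training pin missing from CI environment: {pin}")
```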
+## Modify the model pipeline + +Modify the model pipeline file [diabetes_regression-ci.yml](../.pipelines/diabetes_regression-ci.yml) by replacing this section: + +``` +resources: + containers: + - container: mlops + image: mcr.microsoft.com/mlops/python:latest +``` + +with (using the image name previously defined): + +``` +resources: + containers: + - container: mlops + image: mlops/diabetes_regression + endpoint: acrconnection +``` + +Run the pipeline and ensure your container has been used. + +## Addressing conflicting dependencies + +Especially when working in a team, it's possible for environment changes across branches to interfere with one another. + +For example, if the master branch is using scikit-learn and you create a branch to use Tensorflow instead, and you +decide to remove scikit-learn from the +[ci_dependencies.yml](../diabetes_regression/ci_dependencies.yml) Conda environment definition +and run the [docker-image-pipeline.yml](../environment_setup/docker-image-pipeline.yml) pipeline to rebuild the Docker image, +then the master branch will stop building. + +You could leave scikit-learn in addition to Tensorflow in the environment, but that is not ideal, as you would have to take an extra step to remove scikit-learn after merging your branch to master. + +A better approach would be to use a distinct name for your modified environment, such as `mlops/diabetes_regression/tensorflow`. +By changing the name of the image in your branch in both the container build pipeline +[environment_setup/docker-image-pipeline.yml](../environment_setup/docker-image-pipeline.yml) +and the model pipeline file +[diabetes_regression-ci.yml](../.pipelines/diabetes_regression-ci.yml), +and running both pipelines in sequence on your branch, +you avoid any branch conflicts, and the name does not have to be changed after merging to master. diff --git a/docs/custom_model.md b/docs/custom_model.md new file mode 100644 index 00000000..28a15d78 --- /dev/null +++ b/docs/custom_model.md @@ -0,0 +1,124 @@ +# Bring your own code with the MLOpsPython repository template + +This document provides steps to follow when using this repository as a template to train models and deploy the models with real-time inference in Azure ML with your own scripts and data. + +1. Follow the MLOpsPython [Getting Started](getting_started.md) guide +1. Bootstrap the project +1. Configure training data +1. [If necessary] Convert your ML experimental code into production ready code +1. Replace the training code +1. [Optional] Update the evaluation code +1. Customize the build agent environment +1. [If appropriate] Replace the score code +1. [If appropriate] Configure batch scoring data + +## Follow the Getting Started guide + +Follow the [Getting Started](getting_started.md) guide to set up the infrastructure and pipelines to execute MLOpsPython. + +Take a look at the [Repo Details](code_description.md) document for a description of the structure of this repository. + +## Bootstrap the project + +Bootstrapping will prepare the directory structure to be used for your project name, which includes: + +* renaming files and folders from the base project name `diabetes_regression` to your project name +* fixing imports and absolute paths based on your project name +* deleting and cleaning up some directories + +**Note:** Since the bootstrap script will rename the `diabetes_regression` folder to the project name of your choice, we'll refer to your project as `[project name]` when paths are involved.
+To bootstrap from the existing MLOpsPython repository: + +1. Ensure Python 3 is installed locally +1. From a local copy of the code, run the `bootstrap.py` script in the `bootstrap` folder +`python bootstrap.py -d [dirpath] -n [projectname]` + * `[dirpath]` is the absolute path to the root of the directory where MLOpsPython is cloned + * `[projectname]` is the name of your ML project + +# Configure Custom Training + +## Configure training data + +The training ML pipeline uses a [sample diabetes dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html) as training data. + +**Important:** Convert the template to use your own Azure ML Dataset for model training via these steps: + +1. [Create a Dataset](https://docs.microsoft.com/azure/machine-learning/how-to-create-register-datasets) in your Azure ML workspace +1. Update the `DATASET_NAME` and `DATASTORE_NAME` variables in `.pipelines/[project name]-variables-template.yml` + +## Convert your ML experimental code into production ready code + +The MLOpsPython template creates an Azure Machine Learning (ML) pipeline that invokes a set of [Azure ML pipeline steps](https://docs.microsoft.com/python/api/azureml-pipeline-steps/azureml.pipeline.steps) (see `ml_service/pipelines/[project name]_build_train_pipeline.py`). If your experiment is currently in a Jupyter notebook, it will need to be refactored into scripts that can be run independently and dropped into the template for the existing Azure ML pipeline steps to use. + +1. Refactor your experiment code into scripts +1. [Recommended] Prepare unit tests + +Examples of all these scripts are provided in this repository. See the [Convert ML experimental code to production code tutorial](https://docs.microsoft.com/azure/machine-learning/tutorial-convert-ml-experiment-to-production) for a step-by-step guide and additional details. + +## Replace training code + +The template contains three scripts in the `[project name]/training` folder. Update these scripts for your experiment code. + +* `train.py` contains the platform-agnostic logic required to do basic data preparation and train the model. This script can be invoked against a static data file for local development. +* `train_aml.py` is the entry script for the ML pipeline step. It invokes the functions in `train.py` in an Azure ML context and adds logging. `train_aml.py` loads parameters for training from `[project name]/parameters.json` and passes them to the training function in `train.py`. If your experiment code can be refactored to match the function signatures in `train.py`, this file shouldn't need many changes. +* `test_train.py` contains tests that guard against functional regressions in `train.py`. Remove this file if you have no tests for your own code. + +Add any dependencies required by training to `[project name]/conda_dependencies.yml`. This file will be used to generate the environment that the pipeline steps will run in. + +## Update evaluation code + +The MLOpsPython template uses the evaluate_model script to compare the performance of the newly trained model and the current production model based on Mean Squared Error. If the performance of the newly trained model is better than the current production model, then the pipelines continue. Otherwise, the pipelines are canceled. + +To keep the evaluation step, replace all instances of `mse` in `[project name]/evaluate/evaluate_model.py` with the metric that you want.
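As an illustration, if mean absolute error fits your problem better than MSE, the corresponding change in `train.py`'s `get_model_metrics` might look like this sketch (the `mae` key is an arbitrary choice; whatever key you emit is the one to reference in `evaluate_model.py`):

```python
from sklearn.metrics import mean_absolute_error


def get_model_metrics(model, data):
    # Same shape as the template's MSE version, reporting MAE instead.
    preds = model.predict(data["test"]["X"])
    mae = mean_absolute_error(data["test"]["y"], preds)
    return {"mae": mae}
```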
+To disable the evaluation step, either: + +* set the DevOps pipeline variable `RUN_EVALUATION` to `false` +* uncomment `RUN_EVALUATION` in `.pipelines/[project name]-variables-template.yml` and set the value to `false` + +## Customize the build agent environment + +The DevOps pipeline definitions in the MLOpsPython template run several steps in a Docker container that contains the dependencies required to work through the Getting Started guide. These dependencies may change over time and may not suit your project's needs. To manage your own dependencies, there are a few options: + +* Add a pipeline step to install dependencies required by unit tests to `.pipelines/code-quality-template.yml`. Recommended if you only have a small number of test dependencies. +* Create a new Docker image containing your dependencies. See [docs/custom_container.md](custom_container.md). Recommended if you have a larger number of dependencies, or if the overhead of installing additional dependencies on each run is too high. +* Remove the container references from the pipeline definition files and run the pipelines on self-hosted agents with dependencies pre-installed. + +# Configure Custom Scoring + +## Replace score code + +For the model to provide real-time inference capabilities, the score code needs to be replaced. The MLOpsPython template uses the score code to deploy the model to do real-time scoring on ACI, AKS, or Web apps. + +If you want to keep scoring: + +1. Update or replace `[project name]/scoring/score.py` +1. Add any dependencies required by scoring to `[project name]/conda_dependencies.yml` +1. Modify the test cases in the `ml_service/util/smoke_test_scoring_service.py` script to match the schema of the training features in your data +1. Check and modify `[project name]/scoring/deployment_config_aks.yml` if AKS deployment is planned. The deployment configuration should suit both your custom model and your AKS cluster size. + +# Configure Custom Batch Scoring + +## Configure input and output data + +The batch scoring pipeline is configured to use the default datastore for input and output. It will use sample data for scoring. + +In order to configure your own input and output datastores, you will need to specify an Azure Blob Storage Account and set up input and output containers. + +Configure the variables below in your variable group. + +**Note: The datastore storage resource, input/output containers, and scoring data are not created automatically. Make sure that you have manually provisioned these resources and placed your scoring data in your input container with the proper name.** + + +| Variable Name | Suggested Value | Short description | +| ------------------------ | ------------------------- | --------------------------------------------------------------------------------------------------------------------------- | +| SCORING_DATASTORE_STORAGE_NAME | | [Azure Blob Storage Account](https://docs.microsoft.com/en-us/azure/storage/blobs/) name. | +| SCORING_DATASTORE_ACCESS_KEY | | [Azure Storage Account Key](https://docs.microsoft.com/en-us/rest/api/storageservices/authorize-requests-to-azure-storage). You may want to consider linking this variable to Azure KeyVault to avoid storing the access key in plain text. | +| SCORING_DATASTORE_INPUT_CONTAINER | | The name of the container for input data. Defaults to `input` if not set. | +| SCORING_DATASTORE_OUTPUT_CONTAINER| | The name of the container for output data. Defaults to `output` if not set.
| SCORING_DATASTORE_INPUT_FILENAME | | The filename of the input data in your container. Defaults to `diabetes_scoring_input.csv` if not set. | +| SCORING_DATASET_NAME | | The AzureML Dataset name to use. Defaults to `diabetes_scoring_ds` if not set (optional). | +| SCORING_DATASTORE_OUTPUT_FILENAME | | The filename to use for the output data. The pipeline will create this file. Defaults to `diabetes_scoring_output.csv` if not set (optional). | + diff --git a/docs/development_setup.md b/docs/development_setup.md new file mode 100644 index 00000000..1c8c2479 --- /dev/null +++ b/docs/development_setup.md @@ -0,0 +1,33 @@ +## Development environment setup + +### Setup + +Please be aware that the local environment also needs access to the Azure subscription, so you must have Contributor access on the Azure ML Workspace. + +In order to configure the project locally, create a copy of `.env.example` in the root directory and name it `.env`. Fill out all missing values and adjust the existing ones to suit your requirements. + +### Installation + +[Install the Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli). The Azure CLI will be used to log you in interactively. + +Install [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). + +Install the required Python modules. [`install_requirements.sh`](https://github.com/microsoft/MLOpsPython/blob/master/environment_setup/install_requirements.sh) creates and activates a new conda environment with the required Python modules. + +``` +. environment_setup/install_requirements.sh +``` + +### Running local code + +To run your local ML pipeline code on Azure ML, run a command such as the following (in bash, all on one line): + +``` +export BUILD_BUILDID=$(uuidgen); python ml_service/pipelines/diabetes_regression_build_train_pipeline.py && python ml_service/pipelines/run_train_pipeline.py +``` + +BUILD_BUILDID is a variable used to uniquely identify the ML pipeline between the +`diabetes_regression_build_train_pipeline.py` and `run_train_pipeline.py` scripts. In Azure DevOps it is +set to the current build number. In a local environment, we can use a command such as +`uuidgen` to set a different random identifier on each run, ensuring there are +no collisions. diff --git a/docs/getting_started.md b/docs/getting_started.md index ffe175b9..4ba694d7 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -1,266 +1,464 @@ -## Getting Started with this Repo - -### 1. Get the source code -- Either clone the repository to your workspace and create your own repo with the code in GitHub. -- An easier way is to just fork the project, so you have the repository under your username on GitHub itself. - - -### 2. Create Azure DevOps account -We use Azure DevOps for running our build(CI), retraining trigger and release (CD) pipelines. If you don't already have Azure DevOps account, create one by following the instructions [here](https://docs.microsoft.com/en-us/azure/devops/organizations/accounts/create-organization?view=azure-devops) - -If you already have Azure DevOps account, create a [new project](https://docs.microsoft.com/en-us/azure/devops/organizations/projects/create-project?view=azure-devops). - - -### 3. Create Service Principal to Login to Azure - -To create service principal, register an application entity in Azure Active Directory (Azure AD) and grant it the Contributor or Owner role of the subscription or the resource group where the web service belongs to.
See [how to create service principal](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal) and assign permissions to manage Azure resource. -Please make note of the following values after creating a service principal, we will need them in subsequent steps -- Application (client) ID -- Directory (tenant) ID -- Application Secret - - -**Note:** You must have sufficient permissions to register an application with your Azure AD tenant, and assign the application to a role in your Azure subscription. Contact your subscription administrator if you don't have the permissions. Normally a subscription admin can create a Service principal and can provide you the details. - - -### 4. Create a Variable Group - -We make use of variable group inside Azure DevOps to store variables and their values that we want to make available across multiple pipelines. You can either store the values directly in [Azure DevOps](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#create-a-variable-group) or connect to an Azure Key Vault in your subscription. Please refer to the documentation [here](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#create-a-variable-group) to learn more about how to create a variable group and [link](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#use-a-variable-group) it to your pipeline. Click on **Library** in the **Pipelines** section as indicated below: - -![library_variable groups](./images/library_variable_groups.png) - -Please name your variable group **``devopsforai-aml-vg``** as we are using this name within our build yaml file. - -The variable group should contain the following variables: - -| Variable Name | Suggested Value | -| --------------------------- | ---------------------------- | -| AML_COMPUTE_CLUSTER_CPU_SKU | STANDARD_DS2_V2 | -| AML_COMPUTE_CLUSTER_NAME | train-cluster | -| BASE_NAME | [unique base name] | -| EVALUATE_SCRIPT_PATH | evaluate/evaluate_model.py | -| EXPERIMENT_NAME | mlopspython | -| LOCATION | centralus | -| MODEL_NAME | sklearn_regression_model.pkl | -| REGISTER_SCRIPT_PATH | register/register_model.py | -| SOURCES_DIR_TRAIN | code | -| SP_APP_ID | | -| SP_APP_SECRET | | -| SUBSCRIPTION_ID | | -| TENANT_ID | | -| TRAIN_SCRIPT_PATH | training/train.py | -| TRAINING_PIPELINE_NAME | training-pipeline | - -Mark **SP_APP_SECRET** variable as a secret one. - -**Note:** The BASE_NAME parameter is used throughout the solution for naming Azure resources. When the solution is used in a shared subscription, there can be naming collisions with resources that require unique names like azure blob storage and registry DNS naming. Make sure to give a unique value to the BASE_NAME variable (e.g. MyUniqueML), so that the created resources will have unique names (e.g. MyUniqueML-AML-RG, MyUniqueML-AML-WS, etc.). The length of the BASE_NAME value should not exceed 10 characters. - -Make sure to select the **Allow access to all pipelines** checkbox in the variable group configuration. - -Up until now you should have: -- Forked (or cloned) the repo -- Created a devops account or use an existing one -- Got service principal details and subscription id -- A variable group with all configuration values - -### 5. 
Create resources - -The easiest way to create all required resources (Resource Group, ML Workspace, Container Registry, Storage Account, etc.) is to leverage an "Infrastructure as Code" [pipeline coming in this repository](../environment_setup/iac-create-environment.yml). This **IaC** pipeline takes care of all required resources basing on these [ARM templates](../environment_setup/arm-templates/cloud-environment.json). The pipeline requires an **Azure Resource Manager** service connection: - -![create service connection](./images/create-rm-service-connection.png) - -Give the connection name **``AzureResourceConnection``** as it is referred by the pipeline definition. Leave the **``Resource Group``** field empty. - -In your DevOps project create a build pipeline from your forked **GitHub** repository: - -![build connnect step](./images/build-connect.png) - -Refer to an **Existing Azure Pipelines YAML file**: - -![configure step](./images/select-iac-pipeline.png) - -Having done that, run the pipeline: - -![iac run](./images/run-iac-pipeline.png) - -Check out created resources in the [Azure Portal](portal.azure.com): - -![created resources](./images/created-resources.png) - -Alternatively, you can also use a [cleaning pipeline](../environment_setup/iac-remove-environment.yml) that removes resources created for this project or you can just delete a resource group in the [Azure Portal](portal.azure.com). - -Once this resource group is created, be sure that the Service Principal you have created has access to this resource group. - -### 6. Set up Build Pipeline - -In your [Azure DevOps](https://dev.azure.com) project create and run a new build pipeline refereing to [azdo-ci-build-train.yml](../.pipelines/azdo-ci-build-train.yml) pipeline in your forked **GitHub** repository: - -![configure ci build pipeline](./images/ci-build-pipeline-configure.png) - -Name the pipeline **ci-build**. Once the pipline is finished, explore the execution logs: - -![ci build logs](./images/ci-build-logs.png) - -and checkout a published training pipeline in the **mlops-AML-WS** workspace in [Azure Portal](https://ms.portal.azure.com/): - -![training pipeline](./images/training-pipeline.png) - - -Great, you now have the build pipeline setup, you can either manually trigger it or it gets automatically triggered everytime there is a change in the master branch. The pipeline performs linting, unit testing, builds and publishes an **ML Training Pipeline** in an **ML Workspace** - -### 7. Train the Model - -The next step is to invoke the training pipeline created in the previous step. It can be done with a **Release Pipeline**. Click on the Pipelines/Releases menu, and then **New pipeline**, and then click on "Empty Job" on the "Select a template" window that pops to the right: - -![invoke training pipeline](./images/invoke-training-pipeline.png) - -An artifact of this pipeline will be the result of the build pipeline **ci-buid**: - -![artifact invoke pipeline](./images/artifact-invoke-pipeline.png) - -Configure a pipeline to see values from the previously defined variable group **devopsforai-aml-vg**. Click on the "Variable groups", and to the right, click on "Link variable group". 
From there, pick the **devopsforai-aml-vg** variable group we created in an earlier step, choose "Release" as a variable group scope, and click on "Link": - -![retrain pipeline vg](./images/retrain-pipeline-vg.png) - -Rename the default "Stage 1" to **Invoke Training Pipeline** and make sure that the **Agent Specification** is **ubuntu-16.04** under the Agent Job: - -![agent specification](./images/agent-specification.png) - -Add a **Command Line Script** step, rename it to **Run Training Pipeline** with the following script: - -```bash -docker run -v $(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines/ml_service/pipelines:/pipelines \ - -w=/pipelines -e MODEL_NAME=$MODEL_NAME -e EXPERIMENT_NAME=$EXPERIMENT_NAME \ - -e TENANT_ID=$TENANT_ID -e SP_APP_ID=$SP_APP_ID -e SP_APP_SECRET=$(SP_APP_SECRET) \ - -e SUBSCRIPTION_ID=$SUBSCRIPTION_ID -e RELEASE_RELEASEID=$RELEASE_RELEASEID \ - -e BUILD_BUILDID=$BUILD_BUILDID -e BASE_NAME=$BASE_NAME \ -mcr.microsoft.com/mlops/python:latest python run_train_pipeline.py -``` - -as in the screen shot below, leaving all other fields to their default value: - -![Run Training Pipeline Task](./images/run_training_pipeline_task.png) - -Now, add the automation to trigger a run of this pipeline whenever the **ci_build** build is completed, click on the lightning bolt icon on the top right of the **\_ci-build** artifact is selected, and enable the automatic release : - -![automate_infoke_training_pipeline](./images/automate_infoke_training_pipeline.png) - -This release pipeline should now be automatically triggered (continuous deployment) whenever a new **ML training pipeline** is published by the **ci-build builder pipeline**. It can also be triggered manually or configured to run on a scheduled basis. Create a new release to trigger the pipeline manually by clicking on the "Create release" button on the top right of your screen, when selecting this new build pipeline: - -![create release](./images/create-release.png) - -Leave the fields empty and click on "create". Once the release pipeline is completed, check out in the **ML Workspace** that the training pipeline is running: - -![running training pipeline](./images/running-training-pipeline.png) - -The training pipeline will train, evaluate and register a new model. Wait until it is fininshed and make sure there is a new model in the **ML Workspace**: - -![trained model](./images/trained-model.png) - -Good! Now we have a trained model. - -### 8. Deploy the Model - -The final step is to deploy the model across environments with a release pipeline. There will be a **``QA``** environment running on [Azure Container Instances](https://azure.microsoft.com/en-us/services/container-instances/) and a **``Prod``** environment running on [Azure Kubernetes Service](https://azure.microsoft.com/en-us/services/kubernetes-service). This is the final picture of what your release pipeline should look like: - -![deploy model](./images/deploy-model.png) - - -This pipeline leverages the **Azure Machine Learning** extension that should be installed in your organization from the [marketplace](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml). - -The pipeline consumes two artifacts: the result of the **Build Pipeline** as it contains configuration files and the **model** trained and registered by the ML training pipeline. - -Add the **\_ci-build** artifact using the same process as what we did in the previous step. 
- -In order to configure a model artifact there should be a service connection to **mlops-AML-WS** workspace. To get there, go to the project settings (by clicking on the cog wheel to the bottom left of the screen), and then click on **Service connections** under the **Pipelines** section: - -**Note:** Creating service connection using Azure Machine Learning extension requires 'Owner' or 'User Access Administrator' permissions on the Workspace. - -![workspace connection](./images/workspace-connection.png) - -Add an artifact to the pipeline and select **AzureML Model Artifact** source type. Select the **Service Endpoint** and **Model Names** from the drop down lists. **Service Endpoint** refers to the **Service connection** created in the previous step: - -![model artifact](./images/model-artifact.png) - -Go to the new **Releases Pipelines** section, and click new to create a new release pipeline. A first stage is automatically created and choose **start with an Empty job**. Name the stage **QA (ACI)** and add a single task to the job **Azure ML Model Deploy**. Make sure that the Agent Specification is ubuntu-16.04 under the Agent Job: - -![deploy aci](./images/deploy-aci.png) - -Specify task parameters as it is shown in the table below: - - -| Parameter | Value | -| ----------------------------- | ---------------------------------------------------------------------------------------------------- | -| Display Name | Azure ML Model Deploy | -| Azure ML Workspace | mlops-AML-WS | -| Inference config Path | `$(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines/code/scoring/inference_config.yml` | -| Model Deployment Target | Azure Container Instance | -| Deployment Name | mlopspython-aci | -| Deployment Configuration file | `$(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines/code/scoring/deployment_config_aci.yml` | -| Overwrite existing deployment | X | - - -In a similar way create a stage **Prod (AKS)** and add a single task to the job **Azure ML Model Deploy**. Make sure that the Agent Specification is ubuntu-16.04 under the Agent Job: - -![deploy aks](./images/deploy-aks.png) - -Specify task parameters as it is shown in the table below: - -| Parameter | Value | -| --------------------------------- | ---------------------------------------------------------------------------------------------------- | -| Display Name | Azure ML Model Deploy | -| Azure ML Workspace | mlops-AML-WS | -| Inference config Path | `$(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines/code/scoring/inference_config.yml` | -| Model Deployment Target | Azure Kubernetes Service | -| Select AKS Cluster for Deployment | YOUR_DEPLOYMENT_K8S_CLUSTER | -| Deployment Name | mlopspython-aks | -| Deployment Configuration file | `$(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines/code/scoring/deployment_config_aks.yml` | -| Overwrite existing deployment | X | - -Similarly to the **Invoke Training Pipeline** release pipeline, previously created, in order to trigger a coutinuous integration, click on the lightning bolt icon, make sure the **Continuous deployment trigger** is checked and save the trigger: - -![Automate Deploy Model Pipeline](./images/automate_deploy_model_pipeline.png) - -**Note:** Creating of a Kubernetes cluster on AKS is out of scope of this tutorial, so you should take care of it on your own. - -**Deploy trained model to Azure Web App for containers** - -Note: This is an optional step and can be used only if you are deploying your scoring service on Azure Web Apps. 
- -[Create Image Script](../ml_service/util/create_scoring_image.py) -can be used to create a scoring image from the release pipeline. Image created by this script will be registered under Azure Container Registry (ACR) instance that belongs to Azure Machine Learning Service. Any dependencies that scoring file depends on can also be packaged with the container with Image config. To learn more on how to create a container with AML SDK click [here](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.image.image.image?view=azure-ml-py#create-workspace--name--models--image-config-). - -Below is release pipeline with two tasks one to create an image using the above script and second is the deploy the image to Web App for containers -![release_webapp](./images/release-webapp-pipeline.PNG) - -For the bash script task to invoke the [Create Image Script](../ml_service/util/create_scoring_image.py), specify the following task parameters: - -| Parameter | Value | -| ------------------ | --------------------------------------------------------------------------------------------------- | -| Display Name | Create Scoring Image | -| Script | python3 $(System.DefaultWorkingDirectory)/\_MLOpsPythonRepo/ml_service/util/create_scoring_image.py | - -Finally -![release_createimage](./images/release-task-createimage.PNG) - -Finally for the Azure WebApp on Container Task, specify the following task parameters as it is shown in the table below: - - -| Parameter | Value | -| ------------------ | --------------------------------------------------------------------------------------------------- | -| Azure subscription | Subscription used to deploy Web App | -| App name | Web App for Containers name | -| Image name | Specify the fully qualified container image name. For example, 'myregistry.azurecr.io/nginx:latest' | - -![release_webapp](./images/release-task-webappdeploy.PNG) - - -Save the pipeline and create a release to trigger it manually. To create the trigger, click on the "Create release" button on the top right of your screen, leave the fields blank and click on **Create** at the bottom of the screen. Once the pipeline execution is finished, check out deployments in the **mlops-AML-WS** workspace. - - - -Congratulations! You have three pipelines set up end to end: - - Build pipeline: triggered on code change to master branch on GitHub, performs linting, unit testing and publishing a training pipeline - - Release Trigger pipeline: runs a published training pipeline to train, evaluate and register a model - - Release Deployment pipeline: deploys a model to QA (ACI) and Prod (AKS) environments - +# Getting Started with MLOpsPython + +This guide shows how to get MLOpsPython working with a sample ML project **_diabetes_regression_**. The project creates a linear regression model to predict diabetes and has CI/CD DevOps practices enabled for model training and serving when these steps are completed in this getting started guide. + +If you would like to bring your own model code to use this template structure, follow the [custom model](custom_model.md) guide. We recommend completing this getting started guide with the diabetes model through ACI deployment first to ensure everything is working in your environment before converting the template to use your own model code. 
+
+- [Setting up Azure DevOps](#setting-up-azure-devops)
+  - [Install the Azure Machine Learning extension](#install-the-azure-machine-learning-extension)
+- [Get the code](#get-the-code)
+- [Create a Variable Group for your Pipeline](#create-a-variable-group-for-your-pipeline)
+  - [Variable Descriptions](#variable-descriptions)
+- [Provisioning resources using Azure Pipelines](#provisioning-resources-using-azure-pipelines)
+  - [Create an Azure DevOps Service Connection for the Azure Resource Manager](#create-an-azure-devops-service-connection-for-the-azure-resource-manager)
+  - [Create the IaC Pipeline](#create-the-iac-pipeline)
+- [Create an Azure DevOps Service Connection for the Azure ML Workspace](#create-an-azure-devops-service-connection-for-the-azure-ml-workspace)
+- [Set up Build, Release Trigger, and Release Multi-Stage Pipelines](#set-up-build-release-trigger-and-release-multi-stage-pipelines)
+  - [Set up the Model CI, Training, Evaluation, and Registration Pipeline](#set-up-the-model-ci-training-evaluation-and-registration-pipeline)
+  - [Set up the Release Deployment and/or Batch Scoring Pipelines](#set-up-the-release-deployment-andor-batch-scoring-pipelines)
+- [Further Exploration](#further-exploration)
+  - [Deploy the model to Azure Kubernetes Service](#deploy-the-model-to-azure-kubernetes-service)
+    - [Web Service Authentication on Azure Kubernetes Service](#web-service-authentication-on-azure-kubernetes-service)
+  - [Deploy the model to Azure App Service (Azure Web App for containers)](#deploy-the-model-to-azure-app-service-azure-web-app-for-containers)
+  - [Example pipelines using R](#example-pipelines-using-r)
+  - [Observability and Monitoring](#observability-and-monitoring)
+  - [Clean up the example resources](#clean-up-the-example-resources)
+- [Next Steps: Integrating your project](#next-steps-integrating-your-project)
+  - [Additional Variables and Configuration](#additional-variables-and-configuration)
+    - [More variable options](#more-variable-options)
+    - [Local configuration](#local-configuration)
+
+## Setting up Azure DevOps
+
+You'll use Azure DevOps for running the multi-stage pipeline with build, model training, and scoring service release stages. If you don't already have an Azure DevOps organization, create one by following the instructions at [Quickstart: Create an organization or project collection](https://docs.microsoft.com/en-us/azure/devops/organizations/accounts/create-organization?view=azure-devops).
+
+If you already have an Azure DevOps organization, create a new project using the guide at [Create a project in Azure DevOps and TFS](https://docs.microsoft.com/en-us/azure/devops/organizations/projects/create-project?view=azure-devops).
+
+### Install the Azure Machine Learning extension
+
+Install the **Azure Machine Learning** extension to your Azure DevOps organization from the [Visual Studio Marketplace](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml) by clicking "Get it free" and following the steps. The UI will tell you if you try to add it and it's already installed.
+
+This extension contains the Azure ML pipeline tasks and adds the ability to create Azure ML Workspace service connections. The documentation page on the marketplace includes detailed instructions with screenshots on what capabilities it includes.
+
+## Get the code
+
+We recommend using the [repository template](https://github.com/microsoft/MLOpsPython/generate), which effectively forks this repository to your own GitHub location and squashes the history. You can use the resulting repository for this guide and for your own experimentation.
+
+## Create a Variable Group for your Pipeline
+
+MLOpsPython requires some variables to be set before you can run any pipelines. You'll need to create a _variable group_ in Azure DevOps to store values that are reused across multiple pipelines or pipeline stages. Either store the values directly in [Azure DevOps](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#create-a-variable-group) or connect to an Azure Key Vault in your subscription. Check out the [Add & use variable groups](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=yaml#use-a-variable-group) documentation to learn more about how to create a variable group and link it to your pipeline.
+
+Navigate to **Library** in the **Pipelines** section as indicated below:
+
+![Library Variable Groups](./images/library_variable_groups.png)
+
+Create a variable group named **`devopsforai-aml-vg`**. The YAML pipeline definitions in this repository refer to this variable group by name.
+
+The variable group should contain the following required variables. **Azure resources that don't exist yet will be created in the [Provisioning resources using Azure Pipelines](#provisioning-resources-using-azure-pipelines) step below.**
+
+| Variable Name            | Suggested Value           | Short description                                                                                                            |
+| ------------------------ | ------------------------- | ---------------------------------------------------------------------------------------------------------------------------- |
+| BASE_NAME                | [your project name]       | Unique naming prefix for created resources - max 10 chars, letters and numbers only                                           |
+| LOCATION                 | centralus                 | [Azure location](https://azure.microsoft.com/en-us/global-infrastructure/locations/), no spaces. You can list all the region codes by running `az account list-locations -o table` in the Azure CLI |
+| RESOURCE_GROUP           | mlops-RG                  | Azure Resource Group name                                                                                                      |
+| WORKSPACE_NAME           | mlops-AML-WS              | Azure ML Workspace name                                                                                                        |
+| AZURE_RM_SVC_CONNECTION  | azure-resource-connection | [Azure Resource Manager Service Connection](#create-an-azure-devops-service-connection-for-the-azure-resource-manager) name   |
+| WORKSPACE_SVC_CONNECTION | aml-workspace-connection  | [Azure ML Workspace Service Connection](#create-an-azure-devops-service-connection-for-the-azure-ml-workspace) name           |
+| ACI_DEPLOYMENT_NAME      | mlops-aci                 | [Azure Container Instances](https://azure.microsoft.com/en-us/services/container-instances/) name                             |
+
+Make sure you select the **Allow access to all pipelines** checkbox in the variable group configuration. To do this, first **Save** the variable group, then click **Pipeline Permissions**, then the button with the 3 vertical dots, and finally click the **Open access** button.
+
+More variables are available for further tweaking, but the above variables are all you need to get started with this example. For more information, see the [Additional Variables and Configuration](#additional-variables-and-configuration) section.
+
+### Variable Descriptions
+
+**BASE_NAME** is used as a prefix for naming Azure resources and should be unique.
+When sharing an Azure subscription, the prefix allows you to avoid naming collisions for resources that require unique names, for example, Azure Blob Storage and Registry DNS. Make sure to set BASE_NAME to a unique name so that created resources will have unique names, for example, MyUniqueMLamlcr, MyUniqueML-AML-KV, and so on. The length of the BASE_NAME value shouldn't exceed 10 characters and must contain letters and numbers only.
+
+**LOCATION** is the name of the [Azure location](https://azure.microsoft.com/en-us/global-infrastructure/locations/) for your resources. There should be no spaces in the name. For example, centralus, westus, northeurope. You can list all the region codes by running `az account list-locations -o table` in the Azure CLI.
+
+**RESOURCE_GROUP** is used as the name for the resource group that will hold the Azure resources for the solution. If providing an existing Azure ML Workspace, set this value to the corresponding resource group name.
+
+**WORKSPACE_NAME** is used for creating the Azure Machine Learning Workspace. *You can provide an existing Azure ML Workspace if you have one, but you will run into problems if it was provisioned manually and the naming of the associated storage account doesn't follow the convention used in this repo -- the environment provisioning will try to associate the workspace with a new Storage Account, and this is not supported. To avoid these problems, specify a new, unique workspace name.*
+
+**AZURE_RM_SVC_CONNECTION** is used by the [Azure Pipeline](../environment_setup/iac-create-environment-pipeline.yml) in Azure DevOps that creates the Azure ML workspace and associated resources through Azure Resource Manager. You'll create the connection in a [step below](#create-an-azure-devops-service-connection-for-the-azure-resource-manager).
+
+**WORKSPACE_SVC_CONNECTION** is used to reference a [service connection for the Azure ML workspace](#create-an-azure-devops-service-connection-for-the-azure-ml-workspace). You'll create the connection after [provisioning the workspace](#provisioning-resources-using-azure-pipelines) in the [Create an Azure DevOps Service Connection for the Azure ML Workspace](#create-an-azure-devops-service-connection-for-the-azure-ml-workspace) section below.
+
+**ACI_DEPLOYMENT_NAME** is used for naming the scoring service during deployment to [Azure Container Instances](https://azure.microsoft.com/en-us/services/container-instances/).
+
+
+## Provisioning resources using Azure Pipelines
+
+The easiest way to create all required Azure resources (Resource Group, Azure ML Workspace, Container Registry, and others) is to use the **Infrastructure as Code (IaC)** [pipeline with ARM templates](../environment_setup/iac-create-environment-pipeline-arm.yml) or the [pipeline with Terraform templates](../environment_setup/iac-create-environment-pipeline-tf.yml). The pipeline takes care of setting up all required resources based on these [Azure Resource Manager templates](../environment_setup/arm-templates/cloud-environment.json), or based on these [Terraform templates](../environment_setup/tf-templates).
+
+**Note:** The Azure Blob storage account required for batch scoring is optional, so the resource provisioning pipelines mentioned above don't create it automatically; if you want to use batch scoring, create it manually before use.
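+
+If you do plan to use batch scoring, that manual preparation can be scripted. Here's a minimal sketch, assuming the storage account itself already exists and the `azure-storage-blob` package is installed; the connection string environment variable is an illustrative name, and the container and file names mirror the sample's `SCORING_DATASTORE_*` defaults:
+
+```python
+# Sketch: one-time preparation of the optional batch scoring storage account.
+import os
+
+from azure.core.exceptions import ResourceExistsError
+from azure.storage.blob import BlobServiceClient
+
+blob_service = BlobServiceClient.from_connection_string(
+    os.environ["SCORING_STORAGE_CONNECTION_STRING"]  # illustrative variable name
+)
+
+# Container names mirror the SCORING_DATASTORE_* defaults used by the sample.
+for container in ("input", "output"):
+    try:
+        blob_service.create_container(container)
+    except ResourceExistsError:
+        pass  # the container was already created on a previous run
+
+# Upload the input file the batch scoring pipeline will read.
+with open("diabetes_scoring_input.csv", "rb") as data:
+    blob_service.get_blob_client(
+        container="input", blob="diabetes_scoring_input.csv"
+    ).upload_blob(data, overwrite=True)
+```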
+
+### Create an Azure DevOps Service Connection for the Azure Resource Manager
+
+The [IaC provisioning pipeline](../environment_setup/iac-create-environment-pipeline.yml) requires an **Azure Resource Manager** [service connection](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml#create-a-service-connection). To create one, in Azure DevOps select **Project Settings**, then **Service Connections**, and create a new one, where:
+
+- Type is **Azure Resource Manager**
+- Authentication method is **Service principal (automatic)**
+- Scope level is **Subscription**
+- Leave **`Resource Group`** empty after selecting your subscription in the dropdown
+- Use the same **`Service Connection Name`** that you used in the variable group you created
+- Select **Grant access permission to all pipelines**
+
+![Create service connection](./images/create-rm-service-connection.png)
+
+**Note:** Creating an Azure Resource Manager service connection with subscription scope requires 'Owner' or 'User Access Administrator' permissions on the subscription.
+You'll also need sufficient permissions to register an application with your Azure AD tenant, or you can get the ID and secret of a service principal from your Azure AD Administrator. That principal must have 'Contributor' permissions on the subscription.
+
+### Create the IaC Pipeline
+
+In your Azure DevOps project, create a build pipeline from your forked repository:
+
+![Build connect step](./images/build-connect.png)
+
+If you are using GitHub, after picking the option above, you'll be asked to authorize with GitHub and select the repo you forked. Then you'll have to select your forked repository on GitHub under the **Repository Access** section, and click **Approve and Install**.
+
+After the above, and when you're redirected back to Azure DevOps, select the **Existing Azure Pipelines YAML file** option and set the path to [/environment_setup/iac-create-environment-pipeline-arm.yml](../environment_setup/iac-create-environment-pipeline-arm.yml) or to [/environment_setup/iac-create-environment-pipeline-tf.yml](../environment_setup/iac-create-environment-pipeline-tf.yml), depending on whether you want to deploy your infrastructure using ARM templates or Terraform:
+
+![Configure step](./images/select-iac-pipeline.png)
+
+If you decide to use Terraform, make sure the ['Terraform Build & Release Tasks' extension from Charles Zipp](https://marketplace.visualstudio.com/items?itemName=charleszipp.azure-pipelines-tasks-terraform) is installed.
+
+Having done that, run the pipeline:
+
+![IaC run](./images/run-iac-pipeline.png)
+
+Check that the newly created resources appear in the [Azure Portal](https://portal.azure.com):
+
+![Created resources](./images/created-resources.png)
+
+**Note**: If you run into errors, one good thing to check is the values you used for the variables in the variable group. If you end up running the pipeline multiple times, you may also run into errors and need to delete the previously created Azure services before re-running the pipeline -- these include a resource group, a Key Vault, a Storage Account, a Container Registry, an Application Insights instance, and a Machine Learning workspace.
+
+## Create an Azure DevOps Service Connection for the Azure ML Workspace
+
+At this point, you should have an Azure ML Workspace created. Similar to the Azure Resource Manager service connection, you need to create an additional one for the Azure ML Workspace.
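+
+Before creating that connection, you can optionally confirm from code that the workspace was provisioned as expected. A minimal sketch with the `azureml-core` SDK, assuming you've run `az login` and substituting your own subscription ID; the workspace and resource group names are the suggested variable group values:
+
+```python
+# Sketch: verify the IaC pipeline provisioned the Azure ML Workspace.
+from azureml.core import Workspace
+
+ws = Workspace.get(
+    name="mlops-AML-WS",        # WORKSPACE_NAME from the variable group
+    subscription_id="<your-subscription-id>",  # placeholder
+    resource_group="mlops-RG",  # RESOURCE_GROUP from the variable group
+)
+print(ws.name, ws.location, ws.resource_group)
+```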
+ +Create a new service connection to your Azure ML Workspace using the [Machine Learning Extension](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml) instructions to enable executing the Azure ML training pipeline. The connection name needs to match `WORKSPACE_SVC_CONNECTION` that you set in the variable group above (e.g., 'aml-workspace-connection'). + +![Created resources](./images/ml-ws-svc-connection.png) + +**Note:** Similar to the Azure Resource Manager service connection you created earlier, creating a service connection with Azure Machine Learning workspace scope requires 'Owner' or 'User Access Administrator' permissions on the Workspace. +You'll need sufficient permissions to register an application with your Azure AD tenant, or you can get the ID and secret of a service principal from your Azure AD Administrator. That principal must have Contributor permissions on the Azure ML Workspace. + +## Set up Build, Release Trigger, and Release Multi-Stage Pipelines + +Now that you've provisioned all the required Azure resources and service connections, you can set up the pipelines for training (Continuous Integration - **CI**) and deploying (Continuous Deployment - **CD**) your machine learning model to production. Additionally, you can set up a pipeline for batch scoring. + +1. **Model CI, training, evaluation, and registration** - triggered on code changes to master branch on GitHub. Runs linting, unit tests, code coverage, and publishes and runs the training pipeline. If a new model is registered after evaluation, it creates a build artifact containing the JSON metadata of the model. Definition: [diabetes_regression-ci.yml](../.pipelines/diabetes_regression-ci.yml). +1. **Release deployment** - consumes the artifact of the previous pipeline and deploys a model to either [Azure Container Instances (ACI)](https://azure.microsoft.com/en-us/services/container-instances/), [Azure Kubernetes Service (AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service), or [Azure App Service](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-app-service) environments. See [Further Exploration](#further-exploration) for other deployment types. Definition: [diabetes_regression-cd.yml](../.pipelines/diabetes_regression-cd.yml). + 1. **Note:** Edit the pipeline definition to remove unused stages. For example, if you're deploying to Azure Container Instances and Azure Kubernetes Service only, you'll need to delete the unused `Deploy_Webapp` stage. +1. **Batch Scoring Code Continuous Integration** - consumes the artifact of the model training pipeline. Runs linting, unit tests, code coverage, publishes a batch scoring pipeline, and invokes the published batch scoring pipeline to score a model. + +These pipelines use a Docker container on the Azure Pipelines agents to accomplish the pipeline steps. The container image ***mcr.microsoft.com/mlops/python:latest*** is built with [this Dockerfile](../environment_setup/Dockerfile) and has all the necessary dependencies installed for MLOpsPython and ***diabetes_regression***. This image is an example of a custom Docker image with a pre-baked environment. The environment is guaranteed to be the same on any building agent, VM, or local machine. **In your project, you'll want to build your own Docker image that only contains the dependencies and tools required for your use case. 
+Your image will probably be smaller and faster, and it will be maintained by your team.**
+
+### Set up the Model CI, training, evaluation, and registration pipeline
+
+In your Azure DevOps project, create and run a new build pipeline based on the [.pipelines/diabetes_regression-ci.yml](../.pipelines/diabetes_regression-ci.yml)
+pipeline definition in your forked repository.
+
+If you plan to use the release deployment pipeline (in the next section), you will need to rename this pipeline to `Model-Train-Register-CI`.
+
+**Note**: *To rename your pipeline after you've saved it, click **Pipelines** on the left menu in Azure DevOps, then **All** to see all the pipelines, hover over the name of the new pipeline, click the menu with the 3 vertical dots that appears, and pick **"Rename/move pipeline"**.*
+
+Start a run of the pipeline if you haven't already, and once the pipeline is finished, check the execution result. Note that the run can take 20 minutes, with time mostly spent in the **Trigger ML Training Pipeline > Invoke ML Pipeline** step. You can track the execution of the AML pipeline by opening the AML Workspace user interface. Screenshots are below:
+
+![Build](./images/model-train-register.png)
+
+And the pipeline artifacts:
+
+![Build](./images/model-train-register-artifacts.png)
+
+Also check the published training pipeline in your newly created AML workspace in [Azure Machine Learning Studio](https://ml.azure.com/):
+
+![Training pipeline](./images/training-pipeline.png)
+
+Great, you now have the build pipeline for training set up, which automatically triggers every time there's a change in the master branch!
+
+After the pipeline is finished, you'll also see a new model in the **AML Workspace** model registry section:
+
+![Trained model](./images/trained-model.png)
+
+To disable the automatic trigger of the training pipeline, change the `auto-trigger-training` variable in the `.pipelines/diabetes_regression-ci.yml` pipeline definition to `false`. You can also override the variable at runtime when executing the pipeline.
+
+The pipeline stages are summarized below:
+
+#### Model CI
+
+- Linting (code quality analysis)
+- Unit tests and code coverage analysis
+- Build and publish _ML Training Pipeline_ in an _ML Workspace_
+
+#### Train model
+
+- Determine the ID of the _ML Training Pipeline_ published in the previous stage.
+- Trigger the _ML Training Pipeline_ and wait for it to complete.
+  - This is an **agentless** job. The CI pipeline can wait for ML pipeline completion for hours or even days without using agent resources.
+- Determine if a new model was registered by the _ML Training Pipeline_.
+  - If the model evaluation step of the AML Pipeline determines that the new model doesn't perform any better than the previous one, the new model won't be registered and the _ML Training Pipeline_ will be **canceled**. In this case, you'll see a message in the 'Train Model' job under the 'Determine if evaluation succeeded and new model is registered' step saying '**Model was not registered for this run.**'
+  - See [evaluate_model.py](../diabetes_regression/evaluate/evaluate_model.py#L118) for the evaluation logic. This is a simplified test that just looks at MSE to decide whether or not to register a new model. A more realistic verification would also do some error analysis and verify the inferences/error distribution against a test dataset, for example.
+  - **Note**: *While it's possible to do an evaluation step as part of the ADO pipeline, this evaluation is logically part of the work done by data scientists, and as such the recommendation is to do this step as part of the AML pipeline rather than the ADO pipeline.*
+  - See [Additional Variables and Configuration](#additional-variables-and-configuration) for configuring this and other behavior.
+
+#### Create pipeline artifact
+
+- Get the info about the registered model
+- Create an Azure DevOps pipeline artifact called `model` that contains a `model.json` file containing the model information, for example:
+
+```json
+{
+  "createdTime": "2021-12-14T13:03:24.494748+00:00",
+  "framework": "Custom",
+  "frameworkVersion": null,
+  "id": "diabetes_regression_model.pkl:1",
+  "name": "diabetes_regression_model.pkl",
+  "version": 1
+}
+```
+
+- Here's [more information on Azure DevOps Artifacts](https://docs.microsoft.com/en-us/azure/devops/pipelines/artifacts/build-artifacts?view=azure-devops&tabs=yaml#explore-download-and-deploy-your-artifacts) and where to find them on the ADO user interface.
+
+### Set up the Release Deployment and/or Batch Scoring pipelines
+
+---
+**PRE-REQUISITES**
+
+In order to use these pipelines:
+
+1. Follow the steps to set up the Model CI, training, evaluation, and registration pipeline.
+1. You **must** rename your model CI/train/eval/register pipeline to `Model-Train-Register-CI`.
+
+These pipelines rely on the model CI pipeline and reference it by name.
+
+If you would like to change the name of your model CI pipeline, you must edit this section of YAML in both the CD and batch scoring pipeline definitions, changing `source: Model-Train-Register-CI` to your own name.
+
+```yaml
+trigger: none
+resources:
+  containers:
+  - container: mlops
+    image: mcr.microsoft.com/mlops/python:latest
+  pipelines:
+  - pipeline: model-train-ci
+    source: Model-Train-Register-CI # Name of the triggering pipeline
+    trigger:
+      branches:
+        include:
+        - master
+```
+
+---
+
+The release deployment and batch scoring pipelines have the following behaviors:
+
+- The pipeline will **automatically trigger** on completion of the `Model-Train-Register-CI` pipeline for the master branch.
+- The pipeline will default to using the latest successful build of the `Model-Train-Register-CI` pipeline. It will deploy the model produced by that build.
+- You can specify a `Model-Train-Register-CI` build ID when running the pipeline manually. You can find this in the URL of the build, and the model registered from that build will also be tagged with the build ID. This is useful to skip model training and registration, and deploy/score a model successfully registered by a `Model-Train-Register-CI` build.
+  - For example, if you navigate to a specific run of your CI pipeline, the URL should be something like `https://dev.azure.com/yourOrgName/yourProjectName/_build/results?buildId=653&view=results`. **653** is the build ID in this case. See the second screenshot below to verify where this number would be used.
+
+### Set up the Release Deployment pipeline
+
+In your Azure DevOps project, create and run a new **build** pipeline based on the [.pipelines/diabetes_regression-cd.yml](../.pipelines/diabetes_regression-cd.yml)
+pipeline definition in your forked repository. It is recommended you rename this pipeline to something like `Model-Deploy-CD` for clarity.
+
+**Note**: *While Azure DevOps supports both Build and Release pipelines, when using YAML you don't usually need to use Release pipelines.
+This repository uses Build pipelines only.*
+
+Your first run will use the latest model created by the `Model-Train-Register-CI` pipeline.
+
+Once the pipeline is finished, check the execution result:
+
+![Build](./images/model-deploy-result.png)
+
+To specify a particular build's model, set the `Model Train CI Build Id` parameter to the build ID you would like to use:
+
+![Build](./images/model-deploy-configure.png)
+
+Once your pipeline run begins, you can see the model name and version downloaded from the `Model-Train-Register-CI` pipeline. The run time will typically be 5-10 minutes.
+
+![Build](./images/model-deploy-get-artifact-logs.png)
+
+The pipeline has the following stage:
+
+#### Deploy to ACI
+
+- Deploy the model to the QA environment in [Azure Container Instances](https://azure.microsoft.com/en-us/services/container-instances/).
+- Smoke test
+  - The test sends a sample query to the scoring web service and verifies that it returns the expected response. Have a look at the [smoke test code](../ml_service/util/smoke_test_scoring_service.py) for an example.
+
+- You can verify that an ACI instance was created in the same resource group you specified:
+
+![Created Resources](./images/aci-in-azure-portal.png)
+
+### Set up the Batch Scoring pipeline
+
+In your Azure DevOps project, create and run a new build pipeline based on the [.pipelines/diabetes_regression-batchscoring-ci.yml](../.pipelines/diabetes_regression-batchscoring-ci.yml)
+pipeline definition in your forked repository. Rename this pipeline to `Batch-Scoring`.
+
+Once the pipeline is finished, check the execution result:
+
+![Build](./images/batchscoring-ci-result.png)
+
+Also check the published batch scoring pipeline in your AML workspace in the [Azure Portal](https://portal.azure.com/):
+
+![Batch scoring pipeline](./images/batchscoring-pipeline.png)
+
+Great, you now have the build pipeline set up for batch scoring, which automatically triggers every time there's a change in the master branch!
+
+The pipeline stages are described below in detail -- and you must do further configuration to actually see the batch inferences:
+
+#### Batch Scoring CI
+
+- Linting (code quality analysis)
+- Unit tests and code coverage analysis
+- Build and publish *ML Batch Scoring Pipeline* in an *AML Workspace*
+
+#### Batch Score model
+
+- Determine the model to be used based on the model name (required), model version, and model tag name and value, which are bound as pipeline parameters.
+  - If run via Azure DevOps pipeline, the batch scoring pipeline will take the model name and version from the `Model-Train-Register-CI` build used as input.
+  - If run locally without the model version, the batch scoring pipeline will use the model's latest version.
+- Trigger the *ML Batch Scoring Pipeline* and wait for it to complete.
+  - This is an **agentless** job. The CI pipeline can wait for ML pipeline completion for hours or even days without using agent resources.
+- Create an Azure ML pipeline with two steps, defined by the code in `ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py`:
+  - `scoringstep` - a **`ParallelRunStep`** that executes the code in `diabetes_regression/scoring/parallel_batchscore.py` with several different batches of the data to be scored.
+  - `scorecopystep` - a **`PythonScriptStep`** that copies the output inferences from Azure ML's internal storage to a target location in another storage account.
+  - If you run the instructions as defined above with no changes to variables, this step will **not** be executed. You'll see a message in the logs for the corresponding step saying `Missing Parameters`. In this case, you'll be able to find the file with the inferences in the same Storage Account associated with Azure ML, in a location similar to `azureml-blobstore-SomeGuid\azureml\SomeOtherGuid\defaultoutput\parallel_run_step.txt`. One way to find the right path is this:
+    - Open your experiment in Azure ML (by default called `mlopspython`).
+    - Open the run that you want to look at (named something like `neat_morning_qc10dzjy` or similar).
+    - In the graphical pipeline view with 2 steps, click the button to open the details tab: `Show run overview`.
+    - You'll see two steps (corresponding to `scoringstep` and `scorecopystep` as described above).
+    - Click the step with the older "Submitted time".
+    - Click "Output + logs" at the top, and you'll see something like the following:
+    ![Outputs of `scoringstep`](./images/batch-child-run-scoringstep.png)
+    - The `defaultoutput` file will have JSON content with the path to a file called `parallel_run_step.txt` containing the inferences.
+
+To properly configure this step for your own custom scoring data, you must follow the instructions in [Configure Custom Batch Scoring](custom_model.md#Configure-Custom-Batch-Scoring), which let you specify both the location of the files to score (via the `SCORING_DATASTORE_INPUT_*` configuration variables) and where to store the inferences (via the `SCORING_DATASTORE_OUTPUT_*` configuration variables).
+
+## Further Exploration
+
+You should now have a working set of pipelines that can get you started with MLOpsPython. Below are some additional features offered that might suit your scenario.
+
+### Deploy the model to Azure Kubernetes Service
+
+MLOpsPython can also deploy to [Azure Kubernetes Service](https://azure.microsoft.com/en-us/services/kubernetes-service).
+
+Creating a cluster on Azure Kubernetes Service is out of scope for this tutorial, but you can find setup information on the [Quickstart: Deploy an Azure Kubernetes Service (AKS) cluster using the Azure portal](https://docs.microsoft.com/en-us/azure/aks/kubernetes-walkthrough-portal#create-an-aks-cluster) page.
+
+> **_Note_**
+>
+> If your target deployment environment is a Kubernetes cluster and you want to implement Canary and/or A/B testing deployment strategies, check out this [tutorial](./canary_ab_deployment.md).
+
+Keep the Azure Container Instances deployment active because it's a lightweight way to validate changes before deploying to Azure Kubernetes Service.
+
+In the Variables tab, edit your variable group (`devopsforai-aml-vg`). In the variable group definition, add these variables:
+
+| Variable Name       | Suggested Value | Description |
+| ------------------- | --------------- | ----------- |
+| AKS_COMPUTE_NAME    | aks             | The Compute name of the inference cluster, created in the Azure ML Workspace (ml.azure.com). This connection has to be created manually before setting the value! |
+| AKS_DEPLOYMENT_NAME | mlops-aks       | The name of the deployed AKS cluster in your subscription. |
+
+After successfully deploying to Azure Container Instances, the next stage will deploy the model to Kubernetes and run a smoke test.
+
+Set **AKS_COMPUTE_NAME** to the _Compute name_ of the Inference Cluster that references the Azure Kubernetes Service cluster in your Azure ML Workspace.
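+
+If you'd rather create that Inference Cluster attachment from code than through the ml.azure.com UI, a minimal `azureml-core` sketch for attaching an existing AKS cluster follows; the resource group and cluster name placeholders are illustrative:
+
+```python
+# Sketch: attach an existing AKS cluster to the workspace as an
+# inference cluster named 'aks' (the suggested AKS_COMPUTE_NAME value).
+from azureml.core import Workspace
+from azureml.core.compute import AksCompute, ComputeTarget
+
+ws = Workspace.from_config()
+
+attach_config = AksCompute.attach_configuration(
+    resource_group="<aks-resource-group>",       # placeholder
+    cluster_name="<existing-aks-cluster-name>",  # placeholder
+)
+aks_target = ComputeTarget.attach(ws, name="aks", attach_configuration=attach_config)
+aks_target.wait_for_completion(show_output=True)
+```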
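+
+Once a deployment is live, the smoke test mentioned above sends a sample query to the scoring web service. A simplified sketch of the same idea is below -- the URI, key, and input row are illustrative, and the authorization header applies to AKS's default key-based authentication (described in the next section); see the repository's [smoke test code](../ml_service/util/smoke_test_scoring_service.py) for the payload the sample scorer actually expects:
+
+```python
+# Sketch: send a sample query to the deployed scoring web service.
+import json
+
+import requests
+
+scoring_uri = "http://<your-endpoint>/score"  # placeholder
+headers = {
+    "Content-Type": "application/json",
+    # Key-based auth for AKS; an unauthenticated ACI endpoint doesn't need this.
+    "Authorization": "Bearer <your-service-key>",  # placeholder
+}
+payload = {"data": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 0]]}  # illustrative input row
+
+response = requests.post(scoring_uri, data=json.dumps(payload), headers=headers)
+response.raise_for_status()
+print("Predictions:", response.json())
+```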
+
+![build](./images/multi-stage-aci-aks.png)
+
+Consider enabling [manual approvals](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/approvals) before the deployment stages.
+
+#### Web Service Authentication on Azure Kubernetes Service
+
+When deploying to Azure Kubernetes Service, key-based authentication is enabled by default. You can also enable token-based authentication. Token-based authentication requires clients to use an Azure Active Directory account to request an authentication token, which is used to make requests to the deployed service. For more details on how to authenticate with an ML web service deployed on AKS, see the [Smoke Test](../ml_service/util/smoke_test_scoring_service.py) code or the Azure documentation on [web service authentication](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-azure-kubernetes-service#web-service-authentication).
+
+### Deploy the model to Azure App Service (Azure Web App for containers)
+
+If you want to deploy your scoring service as an [Azure App Service](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-deploy-app-service) instead of Azure Container Instances or Azure Kubernetes Service, follow these additional steps.
+
+- First, you'll need to create an App Service Plan using Linux. The simplest way is to run this from your Azure CLI: `az appservice plan create --name nameOfAppServicePlan --resource-group nameOfYourResourceGroup --sku B1 --is-linux`.
+
+- Second, you'll need to create a webapp in this App Service Plan, and configure it to run a specific container. Since there is currently no UI in the Azure Portal to do this, it has to be done from the command line. We'll come back to this.
+
+- In the Variables tab, edit your variable group (`devopsforai-aml-vg`) and add a variable:
+
+  | Variable Name          | Suggested Value        |
+  | ---------------------- | ---------------------- |
+  | WEBAPP_DEPLOYMENT_NAME | _name of your web app_ |
+
+  Set **WEBAPP_DEPLOYMENT_NAME** to the name of your Azure Web App. You have not yet created this webapp, so just use the name you're planning on giving it.
+
+- Delete the **ACI_DEPLOYMENT_NAME** variable and any AKS-related variables.
+
+- Next, you'll need to run your `Model-Deploy-CD` pipeline.
+
+  - The pipeline uses the [Azure ML CLI](../.pipelines/diabetes_regression-package-model-template.yml) to create a scoring image. The image will be registered under an Azure Container Registry instance that belongs to the Azure Machine Learning Service. Any dependencies that the scoring file depends on can also be packaged with the container with an image config. Learn more about how to create a container using the Azure ML SDK with the [Image class](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.image.image.image?view=azure-ml-py#create-workspace--name--models--image-config-) API documentation.
+
+  - This pipeline will **fail** on the `Azure Web App on Container Deploy` step, with an error saying the webapp doesn't exist yet. This is expected. Go to the next step.
+
+- If you want to confirm that the scoring image has been created, open the Azure Container Registry mentioned above, which will be in the Resource Group of the Azure ML workspace, and look for the repositories.
+You'll find one called `package`, which was created by the CD pipeline:
+
+  ![Azure Container Registry repository list](./images/container-registry-webapp-image.png)
+
+- Note down the Login Server name of your Azure Container Registry. It'll be something like `YourAcrName.azurecr.io`.
+
+- Going back to the second step above, you can now create a Web App in your App Service Plan using this scoring image, but with the `latest` tag. The easiest way to do this is to run this in the Azure CLI: `az webapp create --resource-group yourResourceGroup --plan nameOfAppServicePlan --name nameOfWebApp --deployment-container-image-name YourAcrName.azurecr.io/package:latest`
+  - Here, `nameOfWebApp` is the same name you put in your Azure DevOps `WEBAPP_DEPLOYMENT_NAME` variable.
+
+From now on, whenever you run the CD pipeline, it will update the image in the container registry, and the WebApp will automatically pick up the new image. CD pipeline runs will now succeed.
+
+![build](./images/ADO-CD-pipeline-to-webapp.png)
+
+To confirm, you can open the App Service Plan, open your new WebApp, and open the **Deployment Center**, where you'll see something like:
+
+![WebApp Deployment Center page](./images/appservice-webapp-deploymentcenter.png)
+
+If you run into problems, you may have to make sure your webapp has the credentials to pull the image from the Azure Container Registry created by the Infrastructure as Code pipeline. Instructions can be found on the [Configure registry credentials in web app](https://docs.microsoft.com/en-us/azure/devops/pipelines/targets/webapp-on-container-linux?view=azure-devops&tabs=dotnet-core%2Cyaml#configure-registry-credentials-in-web-app) page.
+
+### Example pipelines using R
+
+The build pipeline also supports building and publishing Azure ML pipelines using R to train a model. You can enable it by changing the `build-train-script` pipeline variable to either of the following values:
+
+- `diabetes_regression_build_train_pipeline_with_r.py` to train a model with R on Azure ML Compute. You'll also need to uncomment (include) the `r-essentials` Conda packages in the environment definition YAML `diabetes_regression/conda_dependencies.yml`.
+- `diabetes_regression_build_train_pipeline_with_r_on_dbricks.py` to train a model with R on Databricks. You'll need to manually create a Databricks cluster and attach it to the Azure ML Workspace as a compute resource. Set the DB_CLUSTER_ID and DATABRICKS_COMPUTE_NAME variables in your variable group.
+
+Example ML pipelines using R have a single step to train a model. They don't demonstrate how to evaluate and register a model. The evaluation and registering techniques are shown only in the Python implementation.
+
+### Observability and Monitoring
+
+You can explore aspects of model observability in the solution, such as:
+
+- **Logging**: Navigate to the Application Insights instance linked to the Azure ML Portal, then go to the Logs (Analytics) pane. The following sample query correlates HTTP requests with custom logs generated in `score.py`. This can be used, for example, to analyze query duration vs.
scoring batch size: + + ```sql + let Traceinfo=traces + | extend d=parse_json(tostring(customDimensions.Content)) + | project workspace=customDimensions.["Workspace Name"], + service=customDimensions.["Service Name"], + NumberOfPredictions=tostring(d.NumberOfPredictions), + id=tostring(d.RequestId), + TraceParent=tostring(d.TraceParent); + requests + | project timestamp, id, success, resultCode, duration + | join kind=fullouter Traceinfo on id + | project-away id1 + ``` + +- **Distributed tracing**: The smoke test client code sets an HTTP `traceparent` header (per the [W3C Trace Context proposed specification](https://www.w3.org/TR/trace-context-1)), and the `score.py` code logs the header. The query above shows how to surface this value. You can adapt it to your tracing framework. +- **Monitoring**: You can use [Azure Monitor for containers](https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-overview) to monitor the Azure ML scoring containers' performance. + +### Clean up the example resources + +To remove the resources created for this project, use the [/environment_setup/iac-remove-environment-pipeline.yml](../environment_setup/iac-remove-environment-pipeline.yml) definition or you can just delete the resource group in the [Azure Portal](https://portal.azure.com). + +## Next Steps: Integrating your project + +- The [custom model](custom_model.md) guide includes information on bringing your own code to this repository template. +- We recommend using a [custom container](custom_model.md#customize-the-build-agent-environment) to manage your pipeline environment and dependencies. The container provided with the getting started guide may not be suitable or up to date with your project needs. +- Consider using [Azure Pipelines self-hosted agents](https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/agents?view=azure-devops&tabs=browser#install) to speed up your Azure ML pipeline execution. The Docker container image for the Azure ML pipeline is sizable, and having it cached on the agent between runs can trim several minutes from your runs. Additionally, for secure deployments of Azure Machine Learning, you'll probably need to have a self-hosted agent in a Virtual Network. + +### Additional Variables and Configuration + +#### More variable options + +There are more variables used in the project. They're defined in two places: one for local execution and one for using Azure DevOps Pipelines. + +For using Azure Pipelines, all other variables are stored in the file `.pipelines/diabetes_regression-variables-template.yml`. Using the default values as a starting point, adjust the variables to suit your requirements. + +In the `diabetes_regression` folder, you'll also find the `parameters.json` file that we recommend using to provide parameters for training, evaluation, and scoring scripts. The sample parameter that `diabetes_regression` uses is the ridge regression [_alpha_ hyperparameter](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html). We don't provide any serializers for this config file. + +#### Local configuration + +For instructions on how to set up a local development environment, refer to the [Development environment setup instructions](development_setup.md). 
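+
+Finally, to see how that configuration feeds training code, here's a minimal sketch of reading `parameters.json` and using the _alpha_ hyperparameter with scikit-learn. The `{"training": {"alpha": ...}}` shape is an assumption here -- check the file in your copy of the repository:
+
+```python
+# Sketch: pick up the ridge regression alpha from parameters.json.
+import json
+
+from sklearn.datasets import load_diabetes
+from sklearn.linear_model import Ridge
+
+with open("diabetes_regression/parameters.json") as f:
+    params = json.load(f).get("training", {})  # assumed file layout
+
+X, y = load_diabetes(return_X_y=True)
+model = Ridge(alpha=params.get("alpha", 1.0))  # fall back to sklearn's default
+model.fit(X, y)
+print("R^2 on training data:", model.score(X, y))
+```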
diff --git a/docs/images/ADO-CD-pipeline-to-webapp.png b/docs/images/ADO-CD-pipeline-to-webapp.png new file mode 100644 index 00000000..aac8c9ee Binary files /dev/null and b/docs/images/ADO-CD-pipeline-to-webapp.png differ diff --git a/docs/images/Architecture_DevOps_AI.png b/docs/images/Architecture_DevOps_AI.png deleted file mode 100644 index 9aac5dd4..00000000 Binary files a/docs/images/Architecture_DevOps_AI.png and /dev/null differ diff --git a/docs/images/EditPipeline1.png b/docs/images/EditPipeline1.png deleted file mode 100644 index b2e60c60..00000000 Binary files a/docs/images/EditPipeline1.png and /dev/null differ diff --git a/docs/images/EditPipeline2.png b/docs/images/EditPipeline2.png deleted file mode 100644 index df91ad2d..00000000 Binary files a/docs/images/EditPipeline2.png and /dev/null differ diff --git a/docs/images/EditPipeline3.png b/docs/images/EditPipeline3.png deleted file mode 100644 index 47a114ae..00000000 Binary files a/docs/images/EditPipeline3.png and /dev/null differ diff --git a/docs/images/EditPipeline4.png b/docs/images/EditPipeline4.png deleted file mode 100644 index e90ddd76..00000000 Binary files a/docs/images/EditPipeline4.png and /dev/null differ diff --git a/docs/images/EditPipeline5.png b/docs/images/EditPipeline5.png deleted file mode 100644 index e5f77898..00000000 Binary files a/docs/images/EditPipeline5.png and /dev/null differ diff --git a/docs/images/EditPipeline6.png b/docs/images/EditPipeline6.png deleted file mode 100644 index bdcf6ab2..00000000 Binary files a/docs/images/EditPipeline6.png and /dev/null differ diff --git a/docs/images/EditPipeline7.png b/docs/images/EditPipeline7.png deleted file mode 100644 index aff974bd..00000000 Binary files a/docs/images/EditPipeline7.png and /dev/null differ diff --git a/docs/images/EditPipeline8.png b/docs/images/EditPipeline8.png deleted file mode 100644 index 396dc084..00000000 Binary files a/docs/images/EditPipeline8.png and /dev/null differ diff --git a/docs/images/Install_Azure_Pipeline.png b/docs/images/Install_Azure_Pipeline.png deleted file mode 100644 index cd1de310..00000000 Binary files a/docs/images/Install_Azure_Pipeline.png and /dev/null differ diff --git a/docs/images/aci-in-azure-portal.png b/docs/images/aci-in-azure-portal.png new file mode 100644 index 00000000..e7bfa8cd Binary files /dev/null and b/docs/images/aci-in-azure-portal.png differ diff --git a/docs/images/agent-specification.png b/docs/images/agent-specification.png deleted file mode 100644 index c71c3b68..00000000 Binary files a/docs/images/agent-specification.png and /dev/null differ diff --git a/docs/images/appservice-webapp-deploymentcenter.png b/docs/images/appservice-webapp-deploymentcenter.png new file mode 100644 index 00000000..b79ff615 Binary files /dev/null and b/docs/images/appservice-webapp-deploymentcenter.png differ diff --git a/docs/images/artifact-invoke-pipeline.png b/docs/images/artifact-invoke-pipeline.png deleted file mode 100644 index 2a6dcebf..00000000 Binary files a/docs/images/artifact-invoke-pipeline.png and /dev/null differ diff --git a/docs/images/automate_deploy_model_pipeline.png b/docs/images/automate_deploy_model_pipeline.png deleted file mode 100644 index 35c7f54e..00000000 Binary files a/docs/images/automate_deploy_model_pipeline.png and /dev/null differ diff --git a/docs/images/automate_infoke_training_pipeline.png b/docs/images/automate_infoke_training_pipeline.png deleted file mode 100644 index 875d1410..00000000 Binary files a/docs/images/automate_infoke_training_pipeline.png and 
/dev/null differ diff --git a/docs/images/batch-child-run-scoringstep.png b/docs/images/batch-child-run-scoringstep.png new file mode 100644 index 00000000..6b87f52d Binary files /dev/null and b/docs/images/batch-child-run-scoringstep.png differ diff --git a/docs/images/batchscoring-ci-result.png b/docs/images/batchscoring-ci-result.png new file mode 100644 index 00000000..d07d41a8 Binary files /dev/null and b/docs/images/batchscoring-ci-result.png differ diff --git a/docs/images/batchscoring-pipeline.png b/docs/images/batchscoring-pipeline.png new file mode 100644 index 00000000..2b79fe03 Binary files /dev/null and b/docs/images/batchscoring-pipeline.png differ diff --git a/docs/images/build-createpipeline.png b/docs/images/build-createpipeline.png deleted file mode 100644 index 6258895a..00000000 Binary files a/docs/images/build-createpipeline.png and /dev/null differ diff --git a/docs/images/build-createpipeline1.png b/docs/images/build-createpipeline1.png deleted file mode 100644 index 2fa77203..00000000 Binary files a/docs/images/build-createpipeline1.png and /dev/null differ diff --git a/docs/images/build-run.png b/docs/images/build-run.png deleted file mode 100644 index de79438b..00000000 Binary files a/docs/images/build-run.png and /dev/null differ diff --git a/docs/images/build-selectrepo.png b/docs/images/build-selectrepo.png deleted file mode 100644 index a78c96d9..00000000 Binary files a/docs/images/build-selectrepo.png and /dev/null differ diff --git a/docs/images/ci-build-logs.png b/docs/images/ci-build-logs.png deleted file mode 100644 index 726f70ac..00000000 Binary files a/docs/images/ci-build-logs.png and /dev/null differ diff --git a/docs/images/ci-build-pipeline-configure.png b/docs/images/ci-build-pipeline-configure.png index d593d1dc..62953b53 100644 Binary files a/docs/images/ci-build-pipeline-configure.png and b/docs/images/ci-build-pipeline-configure.png differ diff --git a/docs/images/container-registry-webapp-image.png b/docs/images/container-registry-webapp-image.png new file mode 100644 index 00000000..4ec09f8f Binary files /dev/null and b/docs/images/container-registry-webapp-image.png differ diff --git a/docs/images/create-release.png b/docs/images/create-release.png deleted file mode 100644 index 15069b5d..00000000 Binary files a/docs/images/create-release.png and /dev/null differ diff --git a/docs/images/create-rm-service-connection.png b/docs/images/create-rm-service-connection.png index 629d3c2a..e677636a 100644 Binary files a/docs/images/create-rm-service-connection.png and b/docs/images/create-rm-service-connection.png differ diff --git a/docs/images/custom-container-variables.png b/docs/images/custom-container-variables.png new file mode 100644 index 00000000..24a6a92a Binary files /dev/null and b/docs/images/custom-container-variables.png differ diff --git a/docs/images/deploy-model.png b/docs/images/deploy-model.png deleted file mode 100644 index 8a4cbd06..00000000 Binary files a/docs/images/deploy-model.png and /dev/null differ diff --git a/docs/images/invoke-training-pipeline.png b/docs/images/invoke-training-pipeline.png deleted file mode 100644 index 21619ae3..00000000 Binary files a/docs/images/invoke-training-pipeline.png and /dev/null differ diff --git a/docs/images/main-flow.png b/docs/images/main-flow.png deleted file mode 100644 index a49f7440..00000000 Binary files a/docs/images/main-flow.png and /dev/null differ diff --git a/docs/images/ml-ws-svc-connection.png b/docs/images/ml-ws-svc-connection.png new file mode 100644 index 
00000000..baf52e1f Binary files /dev/null and b/docs/images/ml-ws-svc-connection.png differ diff --git a/docs/images/model-deploy-configure.png b/docs/images/model-deploy-configure.png new file mode 100644 index 00000000..fcd87750 Binary files /dev/null and b/docs/images/model-deploy-configure.png differ diff --git a/docs/images/model-deploy-get-artifact-logs.png b/docs/images/model-deploy-get-artifact-logs.png new file mode 100644 index 00000000..2249a8d3 Binary files /dev/null and b/docs/images/model-deploy-get-artifact-logs.png differ diff --git a/docs/images/model-deploy-result.png b/docs/images/model-deploy-result.png new file mode 100644 index 00000000..cd3d166e Binary files /dev/null and b/docs/images/model-deploy-result.png differ diff --git a/docs/images/model-train-register-artifacts.png b/docs/images/model-train-register-artifacts.png new file mode 100644 index 00000000..0d3eed26 Binary files /dev/null and b/docs/images/model-train-register-artifacts.png differ diff --git a/docs/images/model-train-register.png b/docs/images/model-train-register.png new file mode 100644 index 00000000..5ce4ef41 Binary files /dev/null and b/docs/images/model-train-register.png differ diff --git a/docs/images/multi-stage-aci-aks.png b/docs/images/multi-stage-aci-aks.png new file mode 100644 index 00000000..0307fbf6 Binary files /dev/null and b/docs/images/multi-stage-aci-aks.png differ diff --git a/docs/images/multi-stage-aci.png b/docs/images/multi-stage-aci.png new file mode 100644 index 00000000..a96f3195 Binary files /dev/null and b/docs/images/multi-stage-aci.png differ diff --git a/docs/images/multi-stage-webapp.png b/docs/images/multi-stage-webapp.png new file mode 100644 index 00000000..e6d60ce1 Binary files /dev/null and b/docs/images/multi-stage-webapp.png differ diff --git a/docs/images/new-build-pipeline.png b/docs/images/new-build-pipeline.png deleted file mode 100644 index 01229f1f..00000000 Binary files a/docs/images/new-build-pipeline.png and /dev/null differ diff --git a/docs/images/new-build-pipeline1.png b/docs/images/new-build-pipeline1.png deleted file mode 100644 index c6fa88ea..00000000 Binary files a/docs/images/new-build-pipeline1.png and /dev/null differ diff --git a/docs/images/postmane.png b/docs/images/postmane.png deleted file mode 100644 index d2b8a49d..00000000 Binary files a/docs/images/postmane.png and /dev/null differ diff --git a/docs/images/release-create.png b/docs/images/release-create.png deleted file mode 100644 index 46f20042..00000000 Binary files a/docs/images/release-create.png and /dev/null differ diff --git a/docs/images/release-createarelease.png b/docs/images/release-createarelease.png deleted file mode 100644 index 740f4a81..00000000 Binary files a/docs/images/release-createarelease.png and /dev/null differ diff --git a/docs/images/release-deployment-service-conn.png b/docs/images/release-deployment-service-conn.png deleted file mode 100644 index 81b402c9..00000000 Binary files a/docs/images/release-deployment-service-conn.png and /dev/null differ diff --git a/docs/images/release-deployment.png b/docs/images/release-deployment.png deleted file mode 100644 index 0fdebc0e..00000000 Binary files a/docs/images/release-deployment.png and /dev/null differ diff --git a/docs/images/release-deploymentacr.png b/docs/images/release-deploymentacr.png deleted file mode 100644 index c179b08c..00000000 Binary files a/docs/images/release-deploymentacr.png and /dev/null differ diff --git a/docs/images/release-deploymentcitrigger.png 
b/docs/images/release-deploymentcitrigger.png deleted file mode 100644 index f8661db8..00000000 Binary files a/docs/images/release-deploymentcitrigger.png and /dev/null differ diff --git a/docs/images/release-deploymentprodagent.png b/docs/images/release-deploymentprodagent.png deleted file mode 100644 index b2ad0274..00000000 Binary files a/docs/images/release-deploymentprodagent.png and /dev/null differ diff --git a/docs/images/release-deploymentprodtrigger.png b/docs/images/release-deploymentprodtrigger.png deleted file mode 100644 index 31c60450..00000000 Binary files a/docs/images/release-deploymentprodtrigger.png and /dev/null differ diff --git a/docs/images/release-deploymentqaagent.png b/docs/images/release-deploymentqaagent.png deleted file mode 100644 index e7d8999f..00000000 Binary files a/docs/images/release-deploymentqaagent.png and /dev/null differ diff --git a/docs/images/release-empty-job.png b/docs/images/release-empty-job.png deleted file mode 100644 index 7980b89a..00000000 Binary files a/docs/images/release-empty-job.png and /dev/null differ diff --git a/docs/images/release-envtask-scriptpath.png b/docs/images/release-envtask-scriptpath.png deleted file mode 100644 index 9524af0c..00000000 Binary files a/docs/images/release-envtask-scriptpath.png and /dev/null differ diff --git a/docs/images/release-envtask.png b/docs/images/release-envtask.png deleted file mode 100644 index a90f9d79..00000000 Binary files a/docs/images/release-envtask.png and /dev/null differ diff --git a/docs/images/release-import.png b/docs/images/release-import.png deleted file mode 100644 index 01533427..00000000 Binary files a/docs/images/release-import.png and /dev/null differ diff --git a/docs/images/release-link-vg.png b/docs/images/release-link-vg.png deleted file mode 100644 index 95e981a4..00000000 Binary files a/docs/images/release-link-vg.png and /dev/null differ diff --git a/docs/images/release-new-pipeline.png b/docs/images/release-new-pipeline.png deleted file mode 100644 index 8b245095..00000000 Binary files a/docs/images/release-new-pipeline.png and /dev/null differ diff --git a/docs/images/release-retrainingagent.png b/docs/images/release-retrainingagent.png deleted file mode 100644 index 0f000d8a..00000000 Binary files a/docs/images/release-retrainingagent.png and /dev/null differ diff --git a/docs/images/release-retrainingartifact.png b/docs/images/release-retrainingartifact.png deleted file mode 100644 index 16b23515..00000000 Binary files a/docs/images/release-retrainingartifact.png and /dev/null differ diff --git a/docs/images/release-retrainingartifactsuccess.png b/docs/images/release-retrainingartifactsuccess.png deleted file mode 100644 index 36f3b5b1..00000000 Binary files a/docs/images/release-retrainingartifactsuccess.png and /dev/null differ diff --git a/docs/images/release-retrainingpipeline.png b/docs/images/release-retrainingpipeline.png deleted file mode 100644 index a48313d9..00000000 Binary files a/docs/images/release-retrainingpipeline.png and /dev/null differ diff --git a/docs/images/release-retrainingtrigger.png b/docs/images/release-retrainingtrigger.png deleted file mode 100644 index 4786a9d2..00000000 Binary files a/docs/images/release-retrainingtrigger.png and /dev/null differ diff --git a/docs/images/release-retrainingtrigger1.png b/docs/images/release-retrainingtrigger1.png deleted file mode 100644 index b911996e..00000000 Binary files a/docs/images/release-retrainingtrigger1.png and /dev/null differ diff --git a/docs/images/release-retraintask.png 
b/docs/images/release-retraintask.png deleted file mode 100644 index 062d660b..00000000 Binary files a/docs/images/release-retraintask.png and /dev/null differ diff --git a/docs/images/release-save-empty.png b/docs/images/release-save-empty.png deleted file mode 100644 index 556ed9b6..00000000 Binary files a/docs/images/release-save-empty.png and /dev/null differ diff --git a/docs/images/release-task-createimage.PNG b/docs/images/release-task-createimage.PNG index e24ab5ce..8224db18 100644 Binary files a/docs/images/release-task-createimage.PNG and b/docs/images/release-task-createimage.PNG differ diff --git a/docs/images/release-webapp-pipeline.PNG b/docs/images/release-webapp-pipeline.PNG index 63d8e1c0..10ffddff 100644 Binary files a/docs/images/release-webapp-pipeline.PNG and b/docs/images/release-webapp-pipeline.PNG differ diff --git a/docs/images/release-workingdir.png b/docs/images/release-workingdir.png deleted file mode 100644 index 7e817104..00000000 Binary files a/docs/images/release-workingdir.png and /dev/null differ diff --git a/docs/images/retrain-pipeline-vg.png b/docs/images/retrain-pipeline-vg.png deleted file mode 100644 index 4aa30e9f..00000000 Binary files a/docs/images/retrain-pipeline-vg.png and /dev/null differ diff --git a/docs/images/run-iac-pipeline.png b/docs/images/run-iac-pipeline.png index 15771246..f2549da8 100644 Binary files a/docs/images/run-iac-pipeline.png and b/docs/images/run-iac-pipeline.png differ diff --git a/docs/images/run_training_pipeline_task.png b/docs/images/run_training_pipeline_task.png deleted file mode 100644 index e0455807..00000000 Binary files a/docs/images/run_training_pipeline_task.png and /dev/null differ diff --git a/docs/images/running-training-pipeline.png b/docs/images/running-training-pipeline.png deleted file mode 100644 index 0d3af93e..00000000 Binary files a/docs/images/running-training-pipeline.png and /dev/null differ diff --git a/docs/images/scoring_image.png b/docs/images/scoring_image.png new file mode 100644 index 00000000..ecb1c245 Binary files /dev/null and b/docs/images/scoring_image.png differ diff --git a/docs/images/select-iac-pipeline.png b/docs/images/select-iac-pipeline.png index e165ccc8..695b041f 100644 Binary files a/docs/images/select-iac-pipeline.png and b/docs/images/select-iac-pipeline.png differ diff --git a/docs/images/service-connection-add.png b/docs/images/service-connection-add.png deleted file mode 100644 index b9663f49..00000000 Binary files a/docs/images/service-connection-add.png and /dev/null differ diff --git a/docs/images/service-connection.png b/docs/images/service-connection.png deleted file mode 100644 index 7f3c4f6c..00000000 Binary files a/docs/images/service-connection.png and /dev/null differ diff --git a/docs/images/trained-model.png b/docs/images/trained-model.png index 3753fd7d..5bea4fe2 100644 Binary files a/docs/images/trained-model.png and b/docs/images/trained-model.png differ diff --git a/docs/images/training-pipeline.png b/docs/images/training-pipeline.png index cbdaf048..48854513 100644 Binary files a/docs/images/training-pipeline.png and b/docs/images/training-pipeline.png differ diff --git a/docs/images/workspace-connection.png b/docs/images/workspace-connection.png deleted file mode 100644 index 570a724e..00000000 Binary files a/docs/images/workspace-connection.png and /dev/null differ diff --git a/environment_setup/Dockerfile b/environment_setup/Dockerfile index b6b3be6a..0dfa36b6 100644 --- a/environment_setup/Dockerfile +++ b/environment_setup/Dockerfile @@ -1,13 
+1,17 @@ FROM conda/miniconda3 LABEL org.label-schema.vendor = "Microsoft" \ - org.label-schema.url = "https://hub.docker.com/r/microsoft/mlopspython" \ - org.label-schema.vcs-url = "https://github.com/microsoft/MLOpsPython" + org.label-schema.url = "https://hub.docker.com/r/microsoft/mlopspython" \ + org.label-schema.vcs-url = "https://github.com/microsoft/MLOpsPython" - +COPY diabetes_regression/ci_dependencies.yml /setup/ -COPY environment_setup/requirements.txt /setup/ - -RUN apt-get update && apt-get install gcc -y && pip install --upgrade -r /setup/requirements.txt +# activate environment +ENV PATH /usr/local/envs/mlopspython_ci/bin:$PATH -CMD ["python"] \ No newline at end of file +RUN conda update -n base -c defaults conda && \ + conda install python=3.7.5 && \ + conda env create -f /setup/ci_dependencies.yml && \ + /bin/bash -c "source activate mlopspython_ci" && \ + az --version && \ + chmod -R 777 /usr/local/envs/mlopspython_ci/lib/python3.7 diff --git a/environment_setup/arm-templates/cloud-environment.json b/environment_setup/arm-templates/cloud-environment.json index 590a4aed..5f102747 100644 --- a/environment_setup/arm-templates/cloud-environment.json +++ b/environment_setup/arm-templates/cloud-environment.json @@ -13,29 +13,49 @@ "location": { "type": "string", "defaultValue": "eastus", + "metadata": { + "description": "Specifies the location for all resources." + } + }, + "workspace": { + "type": "string" + }, + "storageAccount": { + "type": "string", + "defaultValue": "[concat(toLower(parameters('baseName')), 'amlsa')]" + }, + "keyvault": { + "type": "string", + "defaultValue": "[concat(parameters('baseName'),'-AML-KV')]" + }, + "appInsights": { + "type": "string", + "defaultValue": "[concat(parameters('baseName'),'-AML-AI')]" + }, + "acr": { + "type": "string", + "defaultValue": "[concat(toLower(parameters('baseName')),'amlcr')]" + }, + "sku": { + "type": "string", + "defaultValue": "basic", "allowedValues": [ - "eastus", - "eastus2", - "southcentralus", - "southeastasia", - "westcentralus", - "westeurope", - "westus2", - "centralus" + "basic", + "enterprise" ], "metadata": { - "description": "Specifies the location for all resources." + "description": "Specifies the sku, also referred to as the 'edition' of the Azure Machine Learning workspace."
} } }, "variables": { - "amlWorkspaceName": "[concat(parameters('baseName'),'-AML-WS')]", - "storageAccountName": "[concat(toLower(parameters('baseName')), 'amlsa')]", + "amlWorkspaceName": "[parameters('workspace')]", + "storageAccountName": "[parameters('storageAccount')]", "storageAccountType": "Standard_LRS", - "keyVaultName": "[concat(parameters('baseName'),'-AML-KV')]", + "keyVaultName": "[parameters('keyvault')]", "tenantId": "[subscription().tenantId]", - "applicationInsightsName": "[concat(parameters('baseName'),'-AML-AI')]", - "containerRegistryName": "[concat(toLower(parameters('baseName')),'amlcr')]" + "applicationInsightsName": "[parameters('appInsights')]", + "containerRegistryName": "[parameters('acr')]" }, "resources": [ { @@ -73,7 +93,8 @@ "name": "standard", "family": "A" }, - "accessPolicies": [] + "accessPolicies": [ + ] } }, { @@ -112,6 +133,10 @@ "identity": { "type": "systemAssigned" }, + "sku": { + "tier": "[parameters('sku')]", + "name": "[parameters('sku')]" + }, "properties": { "friendlyName": "[variables('amlWorkspaceName')]", "keyVault": "[resourceId('Microsoft.KeyVault/vaults',variables('keyVaultName'))]", @@ -119,6 +144,6 @@ "containerRegistry": "[resourceId('Microsoft.ContainerRegistry/registries',variables('containerRegistryName'))]", "storageAccount": "[resourceId('Microsoft.Storage/storageAccounts/',variables('storageAccountName'))]" } - } + } ] } \ No newline at end of file diff --git a/environment_setup/docker-image-pipeline.yml b/environment_setup/docker-image-pipeline.yml index a97f8207..9f7361ac 100644 --- a/environment_setup/docker-image-pipeline.yml +++ b/environment_setup/docker-image-pipeline.yml @@ -1,8 +1,9 @@ +# Pipeline that builds and pushes the microsoft/mlopspython image. resources: - repo: self -queue: - name: Hosted Ubuntu 1604 +pool: + vmImage: 'ubuntu-latest' trigger: branches: @@ -11,12 +12,12 @@ trigger: paths: include: - - environment_setup/* + - environment_setup/Dockerfile variables: containerRegistry: $[coalesce(variables['acrServiceConnection'], 'acrconnection')] - imageName: $[coalesce(variables['agentImageName'], 'public/mlops/python')] - + imageName: $[coalesce(variables['agentImageName'], 'public/mlops/python')] + steps: - task: Docker@2 displayName: Build and Push @@ -24,6 +25,10 @@ steps: command: buildAndPush containerRegistry: '$(containerRegistry)' repository: '$(imageName)' - tags: 'latest' + tags: | + ${{format('build-{0}', '$(Build.BuildNumber)')}} + ${{format('amlsdk-{0}', '$(amlsdkversion)')}} + ${{format('release-{0}', '$(githubrelease)')}} + latest buildContext: '$(Build.SourcesDirectory)' dockerFile: '$(Build.SourcesDirectory)/environment_setup/Dockerfile' diff --git a/environment_setup/iac-create-environment-pipeline-arm.yml b/environment_setup/iac-create-environment-pipeline-arm.yml new file mode 100644 index 00000000..0b9f474c --- /dev/null +++ b/environment_setup/iac-create-environment-pipeline-arm.yml @@ -0,0 +1,36 @@ +# CI/PR Pipeline that deploys an ARM template to create or update the resources needed by the other pipelines. 
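The updated cloud-environment.json derives dependent resource names from baseName whenever the optional parameters are left at their defaults. A minimal Python sketch of that naming convention (inferred from the template's concat()/toLower() expressions above; not code from the repo) makes it easy to predict what the deployment will create:

```python
# Sketch: mirror cloud-environment.json's default resource naming so names
# can be checked (length, lowercase-only storage/ACR rules) before deploying.
# The convention is assumed from the ARM template defaults shown above.
def default_resource_names(base_name: str, workspace: str) -> dict:
    return {
        "workspace": workspace,  # passed in explicitly via -workspace
        "storageAccount": base_name.lower() + "amlsa",
        "keyvault": base_name + "-AML-KV",
        "appInsights": base_name + "-AML-AI",
        "acr": base_name.lower() + "amlcr",
    }


print(default_resource_names("mlops", "mlops-aml-ws"))
```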
+trigger: + branches: + include: + - master + paths: + include: + - environment_setup/arm-templates/* +pr: + branches: + include: + - master + paths: + include: + - environment_setup/arm-templates/* + +pool: + vmImage: "ubuntu-latest" + +variables: + - group: devopsforai-aml-vg + - name: WORKSPACE_SKU # https://docs.microsoft.com/en-us/azure/machine-learning/overview-what-is-azure-ml#sku + value: basic + +steps: + - task: AzureResourceGroupDeployment@2 + inputs: + azureSubscription: "$(AZURE_RM_SVC_CONNECTION)" + action: "Create Or Update Resource Group" + resourceGroupName: "$(RESOURCE_GROUP)" + location: $(LOCATION) + templateLocation: "Linked artifact" + csmFile: "$(Build.SourcesDirectory)/environment_setup/arm-templates/cloud-environment.json" + overrideParameters: "-baseName $(BASE_NAME) -location $(LOCATION) -workspace $(WORKSPACE_NAME) -sku $(WORKSPACE_SKU)" + deploymentMode: "Incremental" + displayName: "Deploy MLOps resources to Azure" diff --git a/environment_setup/iac-create-environment-pipeline-tf.yml b/environment_setup/iac-create-environment-pipeline-tf.yml new file mode 100644 index 00000000..ef184546 --- /dev/null +++ b/environment_setup/iac-create-environment-pipeline-tf.yml @@ -0,0 +1,72 @@ +# CI/PR Pipeline that deploys a Terraform template to create or update the resources needed by the other pipelines. +trigger: + branches: + include: + - master + paths: + include: + - environment_setup/tf-templates/* +pr: + branches: + include: + - master + paths: + include: + - environment_setup/tf-templates/* + +pool: + vmImage: 'ubuntu-latest' + +variables: +- group: devopsforai-aml-vg + +steps: +- task: charleszipp.azure-pipelines-tasks-terraform.azure-pipelines-tasks-terraform-installer.TerraformInstaller@0 + displayName: 'Use Terraform 0.12.24' + inputs: + terraformVersion: 0.12.24 + +- task: charleszipp.azure-pipelines-tasks-terraform.azure-pipelines-tasks-terraform-cli.TerraformCLI@0 + displayName: 'TF init - Deploy MLOps resources to Azure' + inputs: + command: init + commandOptions: '-backend=true -backend-config=$(Build.SourcesDirectory)/environment_setup/tf-templates/backend.tf' + workingDirectory: '$(Build.SourcesDirectory)/environment_setup/tf-templates' + backendType: azurerm + backendServiceArm: $(AZURE_RM_SVC_CONNECTION) + ensureBackend: true + backendAzureRmResourceGroupLocation: $(LOCATION) + backendAzureRmResourceGroupName: $(RESOURCE_GROUP) + backendAzureRmStorageAccountName: '$(BASE_NAME)statestor' + backendAzureRmStorageAccountSku: 'Standard_LRS' + backendAzureRmContainerName: 'tfstate-cont' + backendAzureRmKey: 'mlopsinfra.tfstate' + +- task: charleszipp.azure-pipelines-tasks-terraform.azure-pipelines-tasks-terraform-cli.TerraformCLI@0 + displayName: 'TF validate - Deploy MLOps resources to Azure' + inputs: + command: validate + workingDirectory: '$(Build.SourcesDirectory)/environment_setup/tf-templates' + +- task: charleszipp.azure-pipelines-tasks-terraform.azure-pipelines-tasks-terraform-cli.TerraformCLI@0 + displayName: 'TF plan - Deploy MLOps resources to Azure' + inputs: + command: plan + workingDirectory: '$(Build.SourcesDirectory)/environment_setup/tf-templates' + environmentServiceName: $(AZURE_RM_SVC_CONNECTION) + env: + TF_VAR_BASE_NAME: $(BASE_NAME) + TF_VAR_RESOURCE_GROUP: $(RESOURCE_GROUP) + TF_VAR_WORKSPACE_NAME: $(WORKSPACE_NAME) + +- task: charleszipp.azure-pipelines-tasks-terraform.azure-pipelines-tasks-terraform-cli.TerraformCLI@0 + displayName: 'TF apply - Deploy MLOps resources to Azure' + inputs: + command: apply + workingDirectory:
'$(Build.SourcesDirectory)/environment_setup/tf-templates' + environmentServiceName: $(AZURE_RM_SVC_CONNECTION) + env: + TF_VAR_BASE_NAME: $(BASE_NAME) + TF_VAR_RESOURCE_GROUP: $(RESOURCE_GROUP) + TF_VAR_WORKSPACE_NAME: $(WORKSPACE_NAME) + diff --git a/environment_setup/iac-create-environment.yml b/environment_setup/iac-create-environment.yml deleted file mode 100644 index 4a930c50..00000000 --- a/environment_setup/iac-create-environment.yml +++ /dev/null @@ -1,29 +0,0 @@ -trigger: - branches: - include: - - master - paths: - include: - - environment_setup/arm-templates/* - -pool: - vmImage: 'ubuntu-latest' - -variables: -- group: devopsforai-aml-vg - - -steps: -- task: AzureResourceGroupDeployment@2 - inputs: - azureSubscription: 'AzureResourceConnection' - action: 'Create Or Update Resource Group' - resourceGroupName: '$(BASE_NAME)-AML-RG' - location: $(LOCATION) - templateLocation: 'Linked artifact' - csmFile: '$(Build.SourcesDirectory)/environment_setup/arm-templates/cloud-environment.json' - overrideParameters: '-baseName $(BASE_NAME) -location $(LOCATION)' - deploymentMode: 'Incremental' - displayName: 'Deploy MLOps resources to Azure' - - \ No newline at end of file diff --git a/environment_setup/iac-remove-environment.yml b/environment_setup/iac-remove-environment-pipeline.yml similarity index 52% rename from environment_setup/iac-remove-environment.yml rename to environment_setup/iac-remove-environment-pipeline.yml index 4ca8b04e..39ff9e7a 100644 --- a/environment_setup/iac-remove-environment.yml +++ b/environment_setup/iac-remove-environment-pipeline.yml @@ -1,10 +1,6 @@ -trigger: - branches: - include: - - master - paths: - include: - - environment_setup/arm-templates/* +# Pipeline that removes the resources created by the IaC Create Environment pipeline. +pr: none +trigger: none pool: vmImage: 'ubuntu-latest' @@ -16,9 +12,9 @@ variables: steps: - task: AzureResourceGroupDeployment@2 inputs: - azureSubscription: 'AzureResourceConnection' + azureSubscription: '$(AZURE_RM_SVC_CONNECTION)' action: 'DeleteRG' - resourceGroupName: '$(BASE_NAME)-AML-RG' + resourceGroupName: '$(RESOURCE_GROUP)' location: $(LOCATION) displayName: 'Delete resources in Azure' diff --git a/environment_setup/install_requirements.sh b/environment_setup/install_requirements.sh old mode 100644 new mode 100755 index 1bdd081d..989e8b1e --- a/environment_setup/install_requirements.sh +++ b/environment_setup/install_requirements.sh @@ -24,8 +24,8 @@ # ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
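To run the same Terraform flow locally, the TF_VAR_* variables that the plan/apply tasks above export must be present in the shell. A minimal sketch using python-dotenv against a local .env (variable names are taken from the pipeline; the script itself is an assumption, not part of the repo):

```python
# Sketch: load a local .env (as in .env.example) and export the TF_VAR_*
# variables the TF plan/apply tasks above pass, so `terraform plan` can be
# run locally with the same inputs. Assumes python-dotenv is installed.
import os

from dotenv import load_dotenv

load_dotenv()
for name in ("BASE_NAME", "RESOURCE_GROUP", "WORKSPACE_NAME"):
    os.environ["TF_VAR_" + name] = os.environ.get(name, "")
```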
+set -eux -python --version -pip install azure-cli==2.0.46 -pip install --upgrade azureml-sdk[cli] -pip install -r requirements.txt \ No newline at end of file +conda env create -f diabetes_regression/ci_dependencies.yml + +conda activate mlopspython_ci diff --git a/environment_setup/requirements.txt b/environment_setup/requirements.txt deleted file mode 100644 index 23880c0c..00000000 --- a/environment_setup/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -pytest==4.3.0 -requests>=2.22 -azureml-sdk>=1.0 -python-dotenv>=0.10.3 -flake8 -flake8_formatter_junit_xml -azure-cli==2.0.71 \ No newline at end of file diff --git a/environment_setup/tf-templates/backend.tf b/environment_setup/tf-templates/backend.tf new file mode 100644 index 00000000..0aec0499 --- /dev/null +++ b/environment_setup/tf-templates/backend.tf @@ -0,0 +1,4 @@ +terraform { + backend "azurerm" { + } +} diff --git a/environment_setup/tf-templates/main.tf b/environment_setup/tf-templates/main.tf new file mode 100644 index 00000000..c57a5a84 --- /dev/null +++ b/environment_setup/tf-templates/main.tf @@ -0,0 +1,71 @@ +provider "azurerm" { + version = "=2.3.0" + features {} +} + +variable BASE_NAME {} +variable RESOURCE_GROUP {} +variable WORKSPACE_NAME {} + +#-------------------------------------------------------------------------------- + +#Set the already-existing resource group +data "azurerm_resource_group" "amlrg" { + name = var.RESOURCE_GROUP +} + +#Set client config for a.o. tenant id +data "azurerm_client_config" "currentconfig" { +} + +#-------------------------------------------------------------------------------- + +# Storage account for AML Service +resource "azurerm_storage_account" "amlstor" { + name = "${var.BASE_NAME}amlsa" + location = data.azurerm_resource_group.amlrg.location + resource_group_name = data.azurerm_resource_group.amlrg.name + account_tier = "Standard" + account_replication_type = "LRS" +} + +# Keyvault for AML Service +resource "azurerm_key_vault" "amlkv" { + name = "${var.BASE_NAME}-AML-KV" + location = data.azurerm_resource_group.amlrg.location + resource_group_name = data.azurerm_resource_group.amlrg.name + tenant_id = data.azurerm_client_config.currentconfig.tenant_id + sku_name = "standard" +} + +# App Insights for AML Service +resource "azurerm_application_insights" "amlai" { + name = "${var.BASE_NAME}-AML-AI" + location = data.azurerm_resource_group.amlrg.location + resource_group_name = data.azurerm_resource_group.amlrg.name + application_type = "web" +} + +# Container registry for AML Service +resource "azurerm_container_registry" "amlacr" { + name = "${var.BASE_NAME}amlcr" + resource_group_name = data.azurerm_resource_group.amlrg.name + location = data.azurerm_resource_group.amlrg.location + sku = "Standard" + admin_enabled = true +} + +# ML Workspace for AML Service, depending on the storage account, Keyvault, App Insights and ACR. 
+resource "azurerm_machine_learning_workspace" "amlws" { + name = var.WORKSPACE_NAME + location = data.azurerm_resource_group.amlrg.location + resource_group_name = data.azurerm_resource_group.amlrg.name + application_insights_id = azurerm_application_insights.amlai.id + key_vault_id = azurerm_key_vault.amlkv.id + storage_account_id = azurerm_storage_account.amlstor.id + container_registry_id = azurerm_container_registry.amlacr.id + + identity { + type = "SystemAssigned" + } +} diff --git a/experimentation/Diabetes Ridge Regression Experimentation Pipeline.ipynb b/experimentation/Diabetes Ridge Regression Experimentation Pipeline.ipynb new file mode 100644 index 00000000..8b04a5c5 --- /dev/null +++ b/experimentation/Diabetes Ridge Regression Experimentation Pipeline.ipynb @@ -0,0 +1,353 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Experiment with parameters for a Ridge Regression Model on the Diabetes Dataset in an Azure ML Pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook is for experimenting with different parameters to train a ridge regression model on the Diabetes dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Change out of the experimentation directory\n", + "%cd .." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.core\n", + "from azureml.core import Workspace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the workspace from the saved config file\n", + "ws = Workspace.from_config()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os, shutil\n", + "\n", + "# Create a folder for the experiment files\n", + "training_folder = 'diabetes-training'\n", + "os.makedirs(training_folder, exist_ok=True)\n", + "\n", + "# Copy the data file into the experiment folder\n", + "shutil.copy('data/diabetes.csv', os.path.join(training_folder, \"diabetes.csv\"))\n", + "\n", + "# Copy the train functions into the experiment folder\n", + "shutil.copy('diabetes_regression/training/train.py', os.path.join(training_folder, \"train.py\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile $training_folder/parameters.json\n", + "{\n", + " \"training\":\n", + " {\n", + " \"alpha\": 0.3\n", + " },\n", + " \"evaluation\":\n", + " {\n", + "\n", + " },\n", + " \"scoring\":\n", + " {\n", + " \n", + " }\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile $training_folder/diabetes_training.py\n", + "# Import libraries\n", + "from azureml.core import Run\n", + "import pandas as pd\n", + "import shutil\n", + "import joblib\n", + "\n", + "from train import split_data, train_model\n", + "\n", + "# Get parameters\n", + "parser = argparse.ArgumentParser()\n", + "parser.add_argument('--output_folder', type=str, dest='output_folder', default=\"diabetes_model\", help='output folder')\n", + "args = parser.parse_args()\n", + "output_folder = args.output_folder\n", + "\n", + "# Get the experiment run context\n", + "run = Run.get_context()\n", + "\n", + "# load the diabetes dataset\n", + "print(\"Loading Data...\")\n", + "train_df = pd.read_csv('diabetes.csv')\n", + "\n", + 
"data = split_data(train_df)\n", + "\n", + "# Specify the parameters to test\n", + "with open(\"parameters.json\") as f:\n", + " pars = json.load(f)\n", + " train_args = pars[\"training\"]\n", + "\n", + "# Log parameters\n", + "for k, v in train_args.items():\n", + " run.log(k, v)\n", + "\n", + "model, metrics = train_model(data, train_args)\n", + "\n", + "# Log metrics\n", + "for k, v in metrics.items():\n", + " run.log(k, v)\n", + "\n", + "# Save the parameters file to the outputs folder\n", + "os.makedirs(output_folder, exist_ok=True)\n", + "shutil.copy('parameters.json', os.path.join(output_folder, 'parameters.json'))\n", + "joblib.dump(value=model, filename= output_folder + \"/model.pkl\")\n", + " \n", + "run.complete()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile $training_folder/register_diabetes.py\n", + "# Import libraries\n", + "import argparse\n", + "import joblib\n", + "from azureml.core import Workspace, Model, Run\n", + "\n", + "# Get parameters\n", + "parser = argparse.ArgumentParser()\n", + "parser.add_argument('--model_folder', type=str, dest='model_folder', default=\"diabetes_model\", help='model location')\n", + "args = parser.parse_args()\n", + "model_folder = args.model_folder\n", + "\n", + "# Get the experiment run context\n", + "run = Run.get_context()\n", + "\n", + "# load the model\n", + "print(\"Loading model from \" + model_folder)\n", + "model_file = model_folder + \"/model.pkl\"\n", + "model = joblib.load(model_file)\n", + "\n", + "Model.register(workspace=run.experiment.workspace,\n", + " model_path = model_file,\n", + " model_name = 'diabetes_model',\n", + " tags={'Training context':'Pipeline'})\n", + "\n", + "run.complete()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.compute import ComputeTarget, AmlCompute\n", + "from azureml.core.compute_target import ComputeTargetException\n", + "\n", + "cluster_name = \"aml-cluster\"\n", + "\n", + "# Verify that cluster exists\n", + "try:\n", + " pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)\n", + " print('Found existing cluster, use it.')\n", + "except ComputeTargetException:\n", + " # If not, create it\n", + " compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',\n", + " max_nodes=4,\n", + " idle_seconds_before_scaledown=1800)\n", + " pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)\n", + "\n", + "pipeline_cluster.wait_for_completion(show_output=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Environment\n", + "from azureml.core.conda_dependencies import CondaDependencies\n", + "from azureml.core.runconfig import RunConfiguration\n", + "\n", + "# Create a Python environment for the experiment\n", + "diabetes_env = Environment(\"diabetes-pipeline-env\")\n", + "diabetes_env.python.user_managed_dependencies = False # Let Azure ML manage dependencies\n", + "diabetes_env.docker.enabled = True # Use a docker container\n", + "\n", + "# Create a set of package dependencies\n", + "diabetes_packages = CondaDependencies.create(conda_packages=['scikit-learn','pandas'],\n", + " pip_packages=['azureml-sdk'])\n", + "\n", + "# Add the dependencies to the environment\n", + "diabetes_env.python.conda_dependencies = diabetes_packages\n", + "\n", + "# Register the environment (just in case you want to 
use it again)\n", + "diabetes_env.register(workspace=ws)\n", + "registered_env = Environment.get(ws, 'diabetes-pipeline-env')\n", + "\n", + "# Create a new runconfig object for the pipeline\n", + "pipeline_run_config = RunConfiguration()\n", + "\n", + "# Use the compute you created above. \n", + "pipeline_run_config.target = pipeline_cluster\n", + "\n", + "# Assign the environment to the run configuration\n", + "pipeline_run_config.environment = registered_env\n", + "\n", + "print (\"Run configuration created.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.pipeline.core import PipelineData\n", + "from azureml.pipeline.steps import PythonScriptStep, EstimatorStep\n", + "from azureml.train.estimator import Estimator\n", + "\n", + "# Get the training dataset\n", + "#diabetes_ds = ws.datasets.get(\"diabetes dataset\")\n", + "\n", + "# Create a PipelineData (temporary Data Reference) for the model folder\n", + "model_folder = PipelineData(\"model_folder\", datastore=ws.get_default_datastore())\n", + "\n", + "estimator = Estimator(source_directory=training_folder,\n", + " compute_target = pipeline_cluster,\n", + " environment_definition=pipeline_run_config.environment,\n", + " entry_script='diabetes_training.py')\n", + "\n", + "# Step 1, run the estimator to train the model\n", + "train_step = EstimatorStep(name = \"Train Model\",\n", + " estimator=estimator, \n", + " estimator_entry_script_arguments=['--output_folder', model_folder],\n", + " outputs=[model_folder],\n", + " compute_target = pipeline_cluster,\n", + " allow_reuse = True)\n", + "\n", + "# Step 2, run the model registration script\n", + "register_step = PythonScriptStep(name = \"Register Model\",\n", + " source_directory = training_folder,\n", + " script_name = \"register_diabetes.py\",\n", + " arguments = ['--model_folder', model_folder],\n", + " inputs=[model_folder],\n", + " compute_target = pipeline_cluster,\n", + " runconfig = pipeline_run_config,\n", + " allow_reuse = True)\n", + "\n", + "print(\"Pipeline steps defined\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Experiment\n", + "from azureml.pipeline.core import Pipeline\n", + "from azureml.widgets import RunDetails\n", + "\n", + "# Construct the pipeline\n", + "pipeline_steps = [train_step, register_step]\n", + "pipeline = Pipeline(workspace = ws, steps=pipeline_steps)\n", + "print(\"Pipeline is built.\")\n", + "\n", + "# Create an experiment and run the pipeline\n", + "experiment = Experiment(workspace = ws, name = 'diabetes-training-pipeline')\n", + "pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)\n", + "print(\"Pipeline submitted for execution.\")\n", + "\n", + "RunDetails(pipeline_run).show()\n", + "pipeline_run.wait_for_completion()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Model\n", + "\n", + "for model in Model.list(ws):\n", + " print(model.name, 'version:', model.version)\n", + " for tag_name in model.tags:\n", + " tag = model.tags[tag_name]\n", + " print ('\\t',tag_name, ':', tag)\n", + " for prop_name in model.properties:\n", + " prop = model.properties[prop_name]\n", + " print ('\\t',prop_name, ':', prop)\n", + " print('\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + 
"kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/experimentation/Diabetes Ridge Regression Parameter Experimentation.ipynb b/experimentation/Diabetes Ridge Regression Parameter Experimentation.ipynb new file mode 100644 index 00000000..aab5e052 --- /dev/null +++ b/experimentation/Diabetes Ridge Regression Parameter Experimentation.ipynb @@ -0,0 +1,211 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Experiment with parameters for a Ridge Regression Model on the Diabetes Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook is for experimenting with different parameters to train a ridge regression model on the Diabetes dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Change out of the experimentation directory\n", + "%cd .." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.core\n", + "from azureml.core import Workspace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the workspace from the saved config file\n", + "ws = Workspace.from_config()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os, shutil\n", + "\n", + "# Create a folder for the experiment files\n", + "training_folder = 'diabetes-training'\n", + "os.makedirs(training_folder, exist_ok=True)\n", + "\n", + "# Copy the data file into the experiment folder\n", + "shutil.copy('data/diabetes.csv', os.path.join(training_folder, \"diabetes.csv\"))\n", + "\n", + "# Copy the train functions into the experiment folder\n", + "shutil.copy('diabetes_regression/training/train.py', os.path.join(training_folder, \"train.py\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile $training_folder/parameters.json\n", + "{\n", + " \"training\":\n", + " {\n", + " \"alpha\": 0.3\n", + " },\n", + " \"evaluation\":\n", + " {\n", + "\n", + " },\n", + " \"scoring\":\n", + " {\n", + " \n", + " }\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile $training_folder/diabetes_training.py\n", + "# Import libraries\n", + "from azureml.core import Run\n", + "import json\n", + "import os\n", + "import pandas as pd\n", + "import shutil\n", + "\n", + "from train import split_data, train_model\n", + "\n", + "# Get the experiment run context\n", + "run = Run.get_context()\n", + "\n", + "# load the diabetes dataset\n", + "print(\"Loading Data...\")\n", + "train_df = pd.read_csv('diabetes.csv')\n", + "\n", + "data = split_data(train_df)\n", + "\n", + "# Specify the parameters to test\n", + "with open(\"parameters.json\") as f:\n", + " pars = json.load(f)\n", + " train_args = pars[\"training\"]\n", + "\n", + "# Log parameters\n", + "for k, v in train_args.items():\n", + " run.log(k, v)\n", + "\n", + "model, metrics = train_model(data, train_args)\n", + "\n", + "# Log 
metrics\n", + "for k, v in metrics.items():\n", + " run.log(k, v)\n", + "\n", + "# Save the parameters file to the outputs folder\n", + "os.makedirs('outputs', exist_ok=True)\n", + "shutil.copy('parameters.json', os.path.join('outputs', 'parameters.json'))\n", + " \n", + "run.complete()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.estimator import Estimator\n", + "from azureml.core import Experiment\n", + "\n", + "# Create an estimator\n", + "estimator = Estimator(source_directory=training_folder,\n", + " entry_script='diabetes_training.py',\n", + " compute_target='local',\n", + " conda_packages=['scikit-learn']\n", + " )\n", + "\n", + "# Create an experiment\n", + "experiment_name = 'diabetes-training'\n", + "experiment = Experiment(workspace = ws, name = experiment_name)\n", + "\n", + "# Run the experiment based on the estimator\n", + "run = experiment.submit(config=estimator)\n", + "run.wait_for_completion(show_output=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics = run.get_metrics()\n", + "for k, v in metrics.items():\n", + " print(k, v)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for file in run.get_file_names():\n", + " print(file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.6.10 64-bit ('OH3': conda)", + "language": "python", + "name": "python361064bitoh3conda5f7beeba8c1d407187c86667ecfb684f" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/experimentation/Diabetes Ridge Regression Scoring.ipynb b/experimentation/Diabetes Ridge Regression Scoring.ipynb new file mode 100644 index 00000000..9ac340ed --- /dev/null +++ b/experimentation/Diabetes Ridge Regression Scoring.ipynb @@ -0,0 +1,114 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Score Data with a Ridge Regression Model Trained on the Diabetes Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook loads the model trained in the Diabetes Ridge Regression Training notebook, prepares the data, and scores the data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import numpy\n", + "from azureml.core.model import Model\n", + "import joblib" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Model" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "model_path = Model.get_model_path(model_name=\"sklearn_regression_model.pkl\")\n", + "model = joblib.load(model_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "raw_data = '{\"data\":[[1,2,3,4,5,6,7,8,9,10],[10,9,8,7,6,5,4,3,2,1]]}'\n", + "\n", + "data = json.loads(raw_data)[\"data\"]\n", + "data = numpy.array(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Score Data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test result: {'result': [5113.099642122813, 3713.6329271385353]}\n" + ] + } + ], + "source": [ + "request_headers = {}\n", + "\n", + "result = model.predict(data)\n", + "print(\"Test result: \", {\"result\": result.tolist()})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (storedna)", + "language": "python", + "name": "storedna" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/experimentation/Diabetes Ridge Regression Training.ipynb b/experimentation/Diabetes Ridge Regression Training.ipynb new file mode 100644 index 00000000..fa192115 --- /dev/null +++ b/experimentation/Diabetes Ridge Regression Training.ipynb @@ -0,0 +1,401 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train a Ridge Regression Model on the Diabetes Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook loads the Diabetes dataset from sklearn, splits the data into training and validation sets, trains a Ridge regression model, validates the model on the validation set, and saves the model." 
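The split/train/validate cells below contain the same logic that the two notebooks above import as split_data and train_model from diabetes_regression/training/train.py. A sketch of those helpers, with bodies inferred from how the notebooks call them (not copied from the repo file):

```python
# Sketch of the helpers the pipeline notebooks import from
# diabetes_regression/training/train.py; signatures and bodies are inferred
# from the calling code, not taken verbatim from the repo.
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


def split_data(df):
    """Split a dataframe with target column Y into train/test dicts."""
    X = df.drop('Y', axis=1).values
    y = df['Y'].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    return {"train": {"X": X_train, "y": y_train},
            "test": {"X": X_test, "y": y_test}}


def train_model(data, args):
    """Train a Ridge model with the given hyperparameters; return metrics."""
    model = Ridge(**args)
    model.fit(data["train"]["X"], data["train"]["y"])
    preds = model.predict(data["test"]["X"])
    return model, {"mse": mean_squared_error(data["test"]["y"], preds)}
```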
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_diabetes\n", + "from sklearn.linear_model import Ridge\n", + "from sklearn.metrics import mean_squared_error\n", + "from sklearn.model_selection import train_test_split\n", + "import joblib\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "sample_data = load_diabetes()\n", + "\n", + "df = pd.DataFrame(\n", + " data=sample_data.data,\n", + " columns=sample_data.feature_names)\n", + "df['Y'] = sample_data.target" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(442, 10)\n" + ] + } + ], + "source": [ + "print(df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agesexbmibps1s2s3s4s5s6Y
count4.420000e+024.420000e+024.420000e+024.420000e+024.420000e+024.420000e+024.420000e+024.420000e+024.420000e+024.420000e+02442.000000
mean-3.634285e-161.308343e-16-8.045349e-161.281655e-16-8.835316e-171.327024e-16-4.574646e-163.777301e-16-3.830854e-16-3.412882e-16152.133484
std4.761905e-024.761905e-024.761905e-024.761905e-024.761905e-024.761905e-024.761905e-024.761905e-024.761905e-024.761905e-0277.093005
min-1.072256e-01-4.464164e-02-9.027530e-02-1.123996e-01-1.267807e-01-1.156131e-01-1.023071e-01-7.639450e-02-1.260974e-01-1.377672e-0125.000000
25%-3.729927e-02-4.464164e-02-3.422907e-02-3.665645e-02-3.424784e-02-3.035840e-02-3.511716e-02-3.949338e-02-3.324879e-02-3.317903e-0287.000000
50%5.383060e-03-4.464164e-02-7.283766e-03-5.670611e-03-4.320866e-03-3.819065e-03-6.584468e-03-2.592262e-03-1.947634e-03-1.077698e-03140.500000
75%3.807591e-025.068012e-023.124802e-023.564384e-022.835801e-022.984439e-022.931150e-023.430886e-023.243323e-022.791705e-02211.500000
max1.107267e-015.068012e-021.705552e-011.320442e-011.539137e-011.987880e-011.811791e-011.852344e-011.335990e-011.356118e-01346.000000
\n", + "
" + ], + "text/plain": [ + " age sex bmi bp s1 \\\n", + "count 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 \n", + "mean -3.634285e-16 1.308343e-16 -8.045349e-16 1.281655e-16 -8.835316e-17 \n", + "std 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 \n", + "min -1.072256e-01 -4.464164e-02 -9.027530e-02 -1.123996e-01 -1.267807e-01 \n", + "25% -3.729927e-02 -4.464164e-02 -3.422907e-02 -3.665645e-02 -3.424784e-02 \n", + "50% 5.383060e-03 -4.464164e-02 -7.283766e-03 -5.670611e-03 -4.320866e-03 \n", + "75% 3.807591e-02 5.068012e-02 3.124802e-02 3.564384e-02 2.835801e-02 \n", + "max 1.107267e-01 5.068012e-02 1.705552e-01 1.320442e-01 1.539137e-01 \n", + "\n", + " s2 s3 s4 s5 s6 \\\n", + "count 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 \n", + "mean 1.327024e-16 -4.574646e-16 3.777301e-16 -3.830854e-16 -3.412882e-16 \n", + "std 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 \n", + "min -1.156131e-01 -1.023071e-01 -7.639450e-02 -1.260974e-01 -1.377672e-01 \n", + "25% -3.035840e-02 -3.511716e-02 -3.949338e-02 -3.324879e-02 -3.317903e-02 \n", + "50% -3.819065e-03 -6.584468e-03 -2.592262e-03 -1.947634e-03 -1.077698e-03 \n", + "75% 2.984439e-02 2.931150e-02 3.430886e-02 3.243323e-02 2.791705e-02 \n", + "max 1.987880e-01 1.811791e-01 1.852344e-01 1.335990e-01 1.356118e-01 \n", + "\n", + " Y \n", + "count 442.000000 \n", + "mean 152.133484 \n", + "std 77.093005 \n", + "min 25.000000 \n", + "25% 87.000000 \n", + "50% 140.500000 \n", + "75% 211.500000 \n", + "max 346.000000 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# All data in a single dataframe\n", + "df.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Split Data into Training and Validation Sets" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "X = df.drop('Y', axis=1).values\n", + "y = df['Y'].values\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=0)\n", + "data = {\"train\": {\"X\": X_train, \"y\": y_train},\n", + " \"test\": {\"X\": X_test, \"y\": y_test}}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train Model on Training Set" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,\n", + " normalize=False, random_state=None, solver='auto', tol=0.001)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# experiment parameters\n", + "args = {\n", + " \"alpha\": 0.5\n", + "}\n", + "\n", + "reg_model = Ridge(**args)\n", + "reg_model.fit(data[\"train\"][\"X\"], data[\"train\"][\"y\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Validate Model on Validation Set" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'mse': 3298.9096058070622}\n" + ] + } + ], + "source": [ + "preds = reg_model.predict(data[\"test\"][\"X\"])\n", + "mse = mean_squared_error(preds, y_test)\n", + "metrics = {\"mse\": mse}\n", + "print(metrics)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save Model" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['sklearn_regression_model.pkl']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_name = \"sklearn_regression_model.pkl\"\n", + "\n", + "joblib.dump(value=reg, filename=model_name)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/ml_service/pipelines/__init__.py b/ml_service/pipelines/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ml_service/pipelines/build_train_pipeline.py b/ml_service/pipelines/build_train_pipeline.py deleted file mode 100644 index cd65ff83..00000000 --- a/ml_service/pipelines/build_train_pipeline.py +++ /dev/null @@ -1,128 +0,0 @@ -from azureml.pipeline.core.graph import PipelineParameter -from azureml.pipeline.steps import PythonScriptStep -from azureml.pipeline.core import Pipeline # , PipelineData -from azureml.core.runconfig import RunConfiguration, CondaDependencies -# from azureml.core import Datastore -import os -import sys -from dotenv import load_dotenv -sys.path.append(os.path.abspath("./ml_service/util")) # NOQA: E402 -from workspace import get_workspace -from attach_compute import get_compute - - -def main(): - load_dotenv() - workspace_name = os.environ.get("BASE_NAME")+"-AML-WS" - resource_group = os.environ.get("BASE_NAME")+"-AML-RG" - subscription_id = os.environ.get("SUBSCRIPTION_ID") - tenant_id = os.environ.get("TENANT_ID") - app_id = os.environ.get("SP_APP_ID") - app_secret = os.environ.get("SP_APP_SECRET") - sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN") - train_script_path = os.environ.get("TRAIN_SCRIPT_PATH") - evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH") - # register_script_path = os.environ.get("REGISTER_SCRIPT_PATH") - vm_size = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU") - compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME") - model_name = os.environ.get("MODEL_NAME") - build_id = os.environ.get("BUILD_BUILDID") - pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME") - - # Get Azure machine learning workspace - aml_workspace = get_workspace( - workspace_name, - resource_group, - subscription_id, - tenant_id, - app_id, - app_secret) - print(aml_workspace) - - # Get Azure machine learning cluster - aml_compute = get_compute( - aml_workspace, - compute_name, - vm_size) - if aml_compute is not None: - print(aml_compute) - - run_config = RunConfiguration(conda_dependencies=CondaDependencies.create( - conda_packages=['numpy', 'pandas', - 'scikit-learn', 'tensorflow', 'keras'], - pip_packages=['azure', 'azureml-core', - 'azure-storage', - 'azure-storage-blob']) - ) - run_config.environment.docker.enabled = True - - model_name = PipelineParameter( - name="model_name", default_value=model_name) - release_id = PipelineParameter( - name="release_id", default_value="0" - ) - - train_step = PythonScriptStep( - name="Train Model", - script_name=train_script_path, - compute_target=aml_compute, - source_directory=sources_directory_train, - arguments=[ - "--release_id", release_id, - "--model_name", model_name, - ], - runconfig=run_config, - allow_reuse=False, - ) - 
print("Step Train created") - - evaluate_step = PythonScriptStep( - name="Evaluate Model ", - script_name=evaluate_script_path, - compute_target=aml_compute, - source_directory=sources_directory_train, - arguments=[ - "--release_id", release_id, - "--model_name", model_name, - ], - runconfig=run_config, - allow_reuse=False, - ) - print("Step Evaluate created") - - # Currently, the Evaluate step will automatically register - # the model if it performs better. This step is based on a - # previous version of the repo which utilized JSON files to - # track evaluation results. - - # register_model_step = PythonScriptStep( - # name="Register New Trained Model", - # script_name=register_script_path, - # compute_target=aml_compute, - # source_directory=sources_directory_train, - # arguments=[ - # "--release_id", release_id, - # "--model_name", model_name, - # ], - # runconfig=run_config, - # allow_reuse=False, - # ) - # print("Step register model created") - - evaluate_step.run_after(train_step) - # register_model_step.run_after(evaluate_step) - steps = [evaluate_step] - - train_pipeline = Pipeline(workspace=aml_workspace, steps=steps) - train_pipeline.validate() - published_pipeline = train_pipeline.publish( - name=pipeline_name, - description="Model training/retraining pipeline", - version=build_id - ) - print(f'Published pipeline: {published_pipeline.name}') - print(f'for build {published_pipeline.version}') - - -if __name__ == '__main__': - main() diff --git a/ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py b/ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py new file mode 100644 index 00000000..5a0f0125 --- /dev/null +++ b/ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py @@ -0,0 +1,428 @@ +""" +Copyright (C) Microsoft Corporation. All rights reserved.​ + ​ +Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, +royalty-free right to use, copy, and modify the software code provided by us +("Software Code"). You may not sublicense the Software Code or any use of it +(except to your affiliates and to vendors to perform work on your behalf) +through distribution, network access, service agreement, lease, rental, or +otherwise. This license does not purport to express any claim of ownership over +data you may have shared with Microsoft in the creation of the Software Code. +Unless applicable law gives you more rights, Microsoft reserves all other +rights not expressly granted herein, whether by implication, estoppel or +otherwise. ​ + ​ +THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+""" +import os +from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep +from ml_service.util.manage_environment import get_environment +from ml_service.pipelines.load_sample_data import create_sample_data_csv +from ml_service.util.env_variables import Env +from ml_service.util.attach_compute import get_compute +from azureml.core import ( + Workspace, + Dataset, + Datastore, + RunConfiguration, +) +from azureml.pipeline.core import Pipeline, PipelineData, PipelineParameter +from azureml.core.compute import ComputeTarget +from azureml.data.datapath import DataPath +from azureml.pipeline.steps import PythonScriptStep +from typing import Tuple + + +def get_or_create_datastore( + datastorename: str, ws: Workspace, env: Env, input: bool = True +) -> Datastore: + """ + Obtains a datastore with matching name. Creates it if none exists. + + :param datastorename: Name of the datastore + :param ws: Current AML Workspace + :param env: Environment variables + :param input: Datastore points to the input container if + this is True(default) or the output storage container otherwise + + :returns: Datastore + + :raises: ValueError + """ + if datastorename is None: + raise ValueError("Datastore name is required.") + + containername = ( + env.scoring_datastore_input_container + if input + else env.scoring_datastore_output_container + ) + + if datastorename in ws.datastores: + + datastore = ws.datastores[datastorename] + + # the datastore is not registered but we have all details to register it + elif ( + env.scoring_datastore_access_key is not None + and containername is not None # NOQA: E501 + ): # NOQA:E501 + + datastore = Datastore.register_azure_blob_container( + workspace=ws, + datastore_name=datastorename, + account_name=env.scoring_datastore_storage_name, + account_key=env.scoring_datastore_access_key, + container_name=containername, + ) + else: + raise ValueError( + "No existing datastore named {} nor was enough information supplied to create one.".format( # NOQA: E501 + datastorename + ) + ) + + return datastore + + +def get_input_dataset(ws: Workspace, ds: Datastore, env: Env) -> Dataset: + """ + Gets an input dataset wrapped around an input data file. The input + data file is assumed to exist in the supplied datastore. + + + :param ws: AML Workspace + :param ds: Datastore containing the data file + :param env: Environment variables + + :returns: Input Dataset + """ + + scoringinputds = Dataset.Tabular.from_delimited_files( + path=DataPath(ds, env.scoring_datastore_input_filename) + ) + + scoringinputds = scoringinputds.register( + ws, + name=env.scoring_dataset_name, + tags={"purpose": "scoring input", "format": "csv"}, + create_new_version=True, + ).as_named_input(env.scoring_dataset_name) + + return scoringinputds + + +def get_fallback_input_dataset(ws: Workspace, env: Env) -> Dataset: + """ + Called when an input datastore does not exist or no input data file exists + at that location. Create a sample dataset using the diabetes dataset from + scikit-learn. Useful when debugging this code in the absence of the input + data location Azure blob. + + + :param ws: AML Workspace + :param env: Environment Variables + + :returns: Fallback input dataset + + :raises: FileNotFoundError + """ + # This call creates an example CSV from sklearn sample data. If you + # have already bootstrapped your project, you can comment this line + # out and use your own CSV. 
+ create_sample_data_csv( + file_name=env.scoring_datastore_input_filename, for_scoring=True + ) + + if not os.path.exists(env.scoring_datastore_input_filename): + error_message = ( + "Could not find CSV dataset for scoring at {}. " + "No alternate data store location was provided either." + ).format(env.scoring_datastore_input_filename) # NOQA: E501 + + raise FileNotFoundError(error_message) + + # upload the input data to the workspace default datastore + default_datastore = ws.get_default_datastore() + scoreinputdataref = default_datastore.upload_files( + [env.scoring_datastore_input_filename], + target_path="scoringinput", + overwrite=False, + ) + + scoringinputds = ( + Dataset.Tabular.from_delimited_files(scoreinputdataref) + .register(ws, env.scoring_dataset_name, create_new_version=True) + .as_named_input(env.scoring_dataset_name) + ) + + return scoringinputds + + +def get_output_location( + ws: Workspace, env: Env, outputdatastore: Datastore = None +) -> PipelineData: + """ + Returns a Datastore wrapped as a PipelineData instance suitable + for passing into a pipeline step. Represents the location where + the scoring output should be written. Uses the default workspace + blob store if no output datastore is supplied. + + + :param ws: AML Workspace + :param env: Environment Variables + :param outputdatastore: AML Datastore, optional, default is None + + :returns: PipelineData wrapping the output datastore + """ + + if outputdatastore is None: + output_loc = PipelineData( + name="defaultoutput", datastore=ws.get_default_datastore() + ) + else: + output_loc = PipelineData( + name=outputdatastore.name, datastore=outputdatastore + ) # NOQA: E501 + + return output_loc + + +def get_inputds_outputloc( + ws: Workspace, env: Env +) -> Tuple[Dataset, PipelineData]: # NOQA: E501 + """ + Prepare the input and output for the scoring step. Input is a tabular + dataset wrapped around the scoring data. Output is PipelineData + representing a location to write the scores down. + + :param ws: AML Workspace + :param env: Environment Variables + + :returns: Input dataset and output location + """ + + if env.scoring_datastore_storage_name is None: + # fall back to default + scoringinputds = get_fallback_input_dataset(ws, env) + output_loc = get_output_location(ws, env) + else: + inputdatastore = get_or_create_datastore( + "{}_in".format(env.scoring_datastore_storage_name), ws, env + ) + outputdatastore = get_or_create_datastore( + "{}_out".format(env.scoring_datastore_storage_name), + ws, + env, + input=False, # NOQA: E501 + ) + scoringinputds = get_input_dataset(ws, inputdatastore, env) + output_loc = get_output_location(ws, env, outputdatastore) + + return (scoringinputds, output_loc) + + +def get_run_configs( + ws: Workspace, computetarget: ComputeTarget, env: Env +) -> Tuple[ParallelRunConfig, RunConfiguration]: + """ + Creates the necessary run configurations required by the + pipeline to enable parallelized scoring.
+ + :param ws: AML Workspace + :param computetarget: AML Compute target + :param env: Environment Variables + + :returns: Tuple[Scoring Run configuration, Score copy run configuration] + """ + + # get a conda environment for scoring + environment = get_environment( + ws, + env.aml_env_name_scoring, + conda_dependencies_file=env.aml_env_score_conda_dep_file, + enable_docker=True, + use_gpu=env.use_gpu_for_scoring, + create_new=env.rebuild_env_scoring, + ) + + score_run_config = ParallelRunConfig( + entry_script=env.batchscore_script_path, + source_directory=env.sources_directory_train, + error_threshold=10, + output_action="append_row", + compute_target=computetarget, + node_count=env.max_nodes_scoring, + environment=environment, + run_invocation_timeout=300, + ) + + copy_run_config = RunConfiguration() + copy_run_config.environment = get_environment( + ws, + env.aml_env_name_score_copy, + conda_dependencies_file=env.aml_env_scorecopy_conda_dep_file, + enable_docker=True, + use_gpu=env.use_gpu_for_scoring, + create_new=env.rebuild_env_scoring, + ) + return (score_run_config, copy_run_config) + + +def get_scoring_pipeline( + scoring_dataset: Dataset, + output_loc: PipelineData, + score_run_config: ParallelRunConfig, + copy_run_config: RunConfiguration, + computetarget: ComputeTarget, + ws: Workspace, + env: Env, +) -> Pipeline: + """ + Creates the scoring pipeline. + + :param scoring_dataset: Data to score + :param output_loc: Location to save the scoring results + :param score_run_config: Parallel Run configuration to support + parallelized scoring + :param copy_run_config: Script Run configuration to support + score copying + :param computetarget: AML Compute target + :param ws: AML Workspace + :param env: Environment Variables + + :returns: Scoring pipeline instance + """ + # To help filter the model make the model name, model version and a + # tag/value pair bindable parameters so that they can be passed to + # the pipeline when invoked either over REST or via the AML SDK. 
+ model_name_param = PipelineParameter( + "model_name", default_value=" " + ) # NOQA: E501 + model_version_param = PipelineParameter( + "model_version", default_value=" " + ) # NOQA: E501 + model_tag_name_param = PipelineParameter( + "model_tag_name", default_value=" " + ) # NOQA: E501 + model_tag_value_param = PipelineParameter( + "model_tag_value", default_value=" " + ) # NOQA: E501 + + scoring_step = ParallelRunStep( + name="scoringstep", + inputs=[scoring_dataset], + output=output_loc, + arguments=[ + "--model_name", + model_name_param, + "--model_version", + model_version_param, + "--model_tag_name", + model_tag_name_param, + "--model_tag_value", + model_tag_value_param, + ], + parallel_run_config=score_run_config, + allow_reuse=False, + ) + + copying_step = PythonScriptStep( + name="scorecopystep", + script_name=env.batchscore_copy_script_path, + source_directory=env.sources_directory_train, + arguments=[ + "--output_path", + output_loc, + "--scoring_output_filename", + env.scoring_datastore_output_filename + if env.scoring_datastore_output_filename is not None + else "", + "--scoring_datastore", + env.scoring_datastore_storage_name + if env.scoring_datastore_storage_name is not None + else "", + "--score_container", + env.scoring_datastore_output_container + if env.scoring_datastore_output_container is not None + else "", + "--scoring_datastore_key", + env.scoring_datastore_access_key + if env.scoring_datastore_access_key is not None + else "", + ], + inputs=[output_loc], + allow_reuse=False, + compute_target=computetarget, + runconfig=copy_run_config, + ) + return Pipeline(workspace=ws, steps=[scoring_step, copying_step]) + + +def build_batchscore_pipeline(): + """ + Main method that builds and publishes a scoring pipeline. + """ + + try: + env = Env() + + # Get Azure machine learning workspace + aml_workspace = Workspace.get( + name=env.workspace_name, + subscription_id=env.subscription_id, + resource_group=env.resource_group, + ) + + # Get Azure machine learning cluster + aml_compute_score = get_compute( + aml_workspace, + env.compute_name_scoring, + env.vm_size_scoring, + for_batch_scoring=True, + ) + + input_dataset, output_location = get_inputds_outputloc( + aml_workspace, env + ) # NOQA: E501 + + scoring_runconfig, score_copy_runconfig = get_run_configs( + aml_workspace, aml_compute_score, env + ) + + scoring_pipeline = get_scoring_pipeline( + input_dataset, + output_location, + scoring_runconfig, + score_copy_runconfig, + aml_compute_score, + aml_workspace, + env, + ) + + published_pipeline = scoring_pipeline.publish( + name=env.scoring_pipeline_name, + description="Diabetes Batch Scoring Pipeline", + ) + pipeline_id_string = "##vso[task.setvariable variable=pipeline_id;isOutput=true]{}".format( # NOQA: E501 + published_pipeline.id + ) + print(pipeline_id_string) + except Exception as e: + print(e) + exit(1) + + +if __name__ == "__main__": + build_batchscore_pipeline() diff --git a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py new file mode 100644 index 00000000..03937186 --- /dev/null +++ b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py @@ -0,0 +1,180 @@ +from azureml.pipeline.core.graph import PipelineParameter +from azureml.pipeline.steps import PythonScriptStep +from azureml.pipeline.core import Pipeline, PipelineData +from azureml.core import Workspace, Dataset, Datastore +from azureml.core.runconfig import RunConfiguration +from ml_service.pipelines.load_sample_data 
import create_sample_data_csv +from ml_service.util.attach_compute import get_compute +from ml_service.util.env_variables import Env +from ml_service.util.manage_environment import get_environment +import os + + +def main(): + e = Env() + # Get Azure machine learning workspace + aml_workspace = Workspace.get( + name=e.workspace_name, + subscription_id=e.subscription_id, + resource_group=e.resource_group, + ) + print("get_workspace:") + print(aml_workspace) + + # Get Azure machine learning cluster + aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size) + if aml_compute is not None: + print("aml_compute:") + print(aml_compute) + + # Create a reusable Azure ML environment + environment = get_environment( + aml_workspace, + e.aml_env_name, + conda_dependencies_file=e.aml_env_train_conda_dep_file, + create_new=e.rebuild_env, + ) # + run_config = RunConfiguration() + run_config.environment = environment + + if e.datastore_name: + datastore_name = e.datastore_name + else: + datastore_name = aml_workspace.get_default_datastore().name + run_config.environment.environment_variables[ + "DATASTORE_NAME" + ] = datastore_name # NOQA: E501 + + model_name_param = PipelineParameter(name="model_name", default_value=e.model_name) # NOQA: E501 + dataset_version_param = PipelineParameter( + name="dataset_version", default_value=e.dataset_version + ) + data_file_path_param = PipelineParameter( + name="data_file_path", default_value="none" + ) + caller_run_id_param = PipelineParameter(name="caller_run_id", default_value="none") # NOQA: E501 + + # Get dataset name + dataset_name = e.dataset_name + + # Check to see if dataset exists + if dataset_name not in aml_workspace.datasets: + # This call creates an example CSV from sklearn sample data. If you + # have already bootstrapped your project, you can comment this line + # out and use your own CSV. + create_sample_data_csv() + + # Use a CSV to read in the data set. + file_name = "diabetes.csv" + + if not os.path.exists(file_name): + raise Exception( + 'Could not find CSV dataset at "%s". If you have bootstrapped your project, you will need to provide a CSV.' 
# NOQA: E501
+                % file_name
+            )  # NOQA: E501
+
+        # Upload file to default datastore in workspace
+        datastore = Datastore.get(aml_workspace, datastore_name)
+        target_path = "training-data/"
+        datastore.upload_files(
+            files=[file_name],
+            target_path=target_path,
+            overwrite=True,
+            show_progress=False,
+        )
+
+        # Register dataset
+        path_on_datastore = os.path.join(target_path, file_name)
+        dataset = Dataset.Tabular.from_delimited_files(
+            path=(datastore, path_on_datastore)
+        )
+        dataset = dataset.register(
+            workspace=aml_workspace,
+            name=dataset_name,
+            description="diabetes training data",
+            tags={"format": "CSV"},
+            create_new_version=True,
+        )
+
+    # Create a PipelineData to pass data between steps
+    pipeline_data = PipelineData(
+        "pipeline_data", datastore=aml_workspace.get_default_datastore()
+    )
+
+    train_step = PythonScriptStep(
+        name="Train Model",
+        script_name=e.train_script_path,
+        compute_target=aml_compute,
+        source_directory=e.sources_directory_train,
+        outputs=[pipeline_data],
+        arguments=[
+            "--model_name",
+            model_name_param,
+            "--step_output",
+            pipeline_data,
+            "--dataset_version",
+            dataset_version_param,
+            "--data_file_path",
+            data_file_path_param,
+            "--caller_run_id",
+            caller_run_id_param,
+            "--dataset_name",
+            dataset_name,
+        ],
+        runconfig=run_config,
+        allow_reuse=True,
+    )
+    print("Step Train created")
+
+    evaluate_step = PythonScriptStep(
+        name="Evaluate Model",
+        script_name=e.evaluate_script_path,
+        compute_target=aml_compute,
+        source_directory=e.sources_directory_train,
+        arguments=[
+            "--model_name",
+            model_name_param,
+            "--allow_run_cancel",
+            e.allow_run_cancel,
+        ],
+        runconfig=run_config,
+        allow_reuse=False,
+    )
+    print("Step Evaluate created")
+
+    register_step = PythonScriptStep(
+        name="Register Model",
+        script_name=e.register_script_path,
+        compute_target=aml_compute,
+        source_directory=e.sources_directory_train,
+        inputs=[pipeline_data],
+        arguments=["--model_name", model_name_param, "--step_input", pipeline_data],  # NOQA: E501
+        runconfig=run_config,
+        allow_reuse=False,
+    )
+    print("Step Register created")
+    # Check run_evaluation flag to include or exclude evaluation step.
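+    # RUN_EVALUATION arrives from the environment as the string "true" or
+    # "false" (see .env.example), hence the lower-cased string comparison
+    # below rather than a boolean test.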
+    if (e.run_evaluation).lower() == "true":
+        print("Include evaluation step before register step.")
+        evaluate_step.run_after(train_step)
+        register_step.run_after(evaluate_step)
+        steps = [train_step, evaluate_step, register_step]
+    else:
+        print("Exclude evaluation step and directly run register step.")
+        register_step.run_after(train_step)
+        steps = [train_step, register_step]
+
+    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
+    train_pipeline.validate()
+    published_pipeline = train_pipeline.publish(
+        name=e.pipeline_name,
+        description="Model training/retraining pipeline",
+        version=e.build_id,
+    )
+    print(f"Published pipeline: {published_pipeline.name}")
+    print(f"for build {published_pipeline.version}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py b/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py
new file mode 100644
index 00000000..254f22eb
--- /dev/null
+++ b/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py
@@ -0,0 +1,63 @@
+from azureml.pipeline.steps import PythonScriptStep
+from azureml.pipeline.core import Pipeline
+from azureml.core import Workspace
+from azureml.core.runconfig import RunConfiguration
+from ml_service.util.attach_compute import get_compute
+from ml_service.util.env_variables import Env
+from ml_service.util.manage_environment import get_environment
+
+
+def main():
+    e = Env()
+    # Get Azure machine learning workspace
+    aml_workspace = Workspace.get(
+        name=e.workspace_name,
+        subscription_id=e.subscription_id,
+        resource_group=e.resource_group,
+    )
+    print("get_workspace:")
+    print(aml_workspace)
+
+    # Get Azure machine learning cluster
+    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
+    if aml_compute is not None:
+        print("aml_compute:")
+        print(aml_compute)
+
+    # Create a reusable Azure ML environment
+    # Make sure to include `r-essentials`
+    # in diabetes_regression/conda_dependencies.yml
+    environment = get_environment(
+        aml_workspace,
+        e.aml_env_name,
+        conda_dependencies_file=e.aml_env_train_conda_dep_file,
+        create_new=e.rebuild_env,
+    )  # NOQA: E501
+    run_config = RunConfiguration()
+    run_config.environment = environment
+
+    train_step = PythonScriptStep(
+        name="Train Model",
+        script_name="train_with_r.py",
+        compute_target=aml_compute,
+        source_directory="diabetes_regression/training/R",
+        runconfig=run_config,
+        allow_reuse=False,
+    )
+    print("Step Train created")
+
+    steps = [train_step]
+
+    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
+    train_pipeline.validate()
+    published_pipeline = train_pipeline.publish(
+        name=e.pipeline_name,
+        description="Model training/retraining pipeline",
+        version=e.build_id,
+    )
+    print(f"Published pipeline: {published_pipeline.name}")
+    print(f"for build {published_pipeline.version}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r_on_dbricks.py b/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r_on_dbricks.py
new file mode 100644
index 00000000..ae607b3b
--- /dev/null
+++ b/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r_on_dbricks.py
@@ -0,0 +1,55 @@
+from azureml.pipeline.core import Pipeline
+from azureml.core import Workspace
+from ml_service.util.attach_compute import get_compute
+from azureml.pipeline.steps import DatabricksStep
+from ml_service.util.env_variables import Env
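+
+# Assumes DB_CLUSTER_ID (see .env.example) identifies an existing Databricks
+# cluster attached to the workspace; the DatabricksStep below targets that
+# cluster rather than provisioning a new one.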
+ + +def main(): + e = Env() + # Get Azure machine learning workspace + aml_workspace = Workspace.get( + name=e.workspace_name, + subscription_id=e.subscription_id, + resource_group=e.resource_group + ) + print("get_workspace:") + print(aml_workspace) + + # Get Azure machine learning cluster + aml_compute = get_compute( + aml_workspace, + e.compute_name, + e.vm_size) + if aml_compute is not None: + print("aml_compute:") + print(aml_compute) + + train_step = DatabricksStep( + name="DBPythonInLocalMachine", + num_workers=1, + python_script_name="train_with_r_on_databricks.py", + source_directory="diabetes_regression/training/R", + run_name='DB_Python_R_demo', + existing_cluster_id=e.db_cluster_id, + compute_target=aml_compute, + allow_reuse=False + ) + + print("Step Train created") + + steps = [train_step] + + train_pipeline = Pipeline(workspace=aml_workspace, steps=steps) + train_pipeline.validate() + published_pipeline = train_pipeline.publish( + name=e.pipeline_name + "_with_R_on_DB", + description="Model training/retraining pipeline", + version=e.build_id + ) + print(f'Published pipeline: {published_pipeline.name}') + print(f'for build {published_pipeline.version}') + + +if __name__ == '__main__': + main() diff --git a/ml_service/pipelines/load_sample_data.py b/ml_service/pipelines/load_sample_data.py new file mode 100644 index 00000000..304a8e7b --- /dev/null +++ b/ml_service/pipelines/load_sample_data.py @@ -0,0 +1,18 @@ + +import pandas as pd +from sklearn.datasets import load_diabetes + + +# Loads the diabetes sample data from sklearn and produces a csv file that can +# be used by the build/train pipeline script. +def create_sample_data_csv(file_name: str = "diabetes.csv", + for_scoring: bool = False): + sample_data = load_diabetes() + df = pd.DataFrame( + data=sample_data.data, + columns=sample_data.feature_names) + if not for_scoring: + df['Y'] = sample_data.target + # Hard code to diabetes so we fail fast if the project has been + # bootstrapped. + df.to_csv(file_name, index=False) diff --git a/ml_service/pipelines/run_parallel_batchscore_pipeline.py b/ml_service/pipelines/run_parallel_batchscore_pipeline.py new file mode 100644 index 00000000..c046eb9c --- /dev/null +++ b/ml_service/pipelines/run_parallel_batchscore_pipeline.py @@ -0,0 +1,134 @@ +""" +Copyright (C) Microsoft Corporation. All rights reserved.​ + ​ +Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, +royalty-free right to use, copy, and modify the software code provided by us +("Software Code"). You may not sublicense the Software Code or any use of it +(except to your affiliates and to vendors to perform work on your behalf) +through distribution, network access, service agreement, lease, rental, or +otherwise. This license does not purport to express any claim of ownership over +data you may have shared with Microsoft in the creation of the Software Code. +Unless applicable law gives you more rights, Microsoft reserves all other +rights not expressly granted herein, whether by implication, estoppel or +otherwise. ​ + ​ +THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+"""
+
+from azure.storage.blob import ContainerClient
+from ml_service.util.env_variables import Env
+from azureml.core import Experiment, Workspace
+from azureml.pipeline.core import PublishedPipeline
+import argparse
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--pipeline_id", type=str, default=None)
+    return parser.parse_args()
+
+
+def get_pipeline(pipeline_id, ws: Workspace, env: Env):
+    if pipeline_id is not None:
+        scoringpipeline = PublishedPipeline.get(ws, pipeline_id)
+    else:
+        pipelines = PublishedPipeline.list(ws)
+        scoringpipelinelist = [
+            pl for pl in pipelines if pl.name == env.scoring_pipeline_name
+        ]  # noqa E501
+
+        if len(scoringpipelinelist) == 0:
+            raise Exception(
+                "No pipeline found matching name: {}".format(env.scoring_pipeline_name)  # NOQA: E501
+            )
+        else:
+            # latest published
+            scoringpipeline = scoringpipelinelist[0]
+
+    return scoringpipeline
+
+
+def copy_output(step_id: str, env: Env):
+    accounturl = "https://{}.blob.core.windows.net".format(
+        env.scoring_datastore_storage_name
+    )
+
+    srcblobname = "azureml/{}/{}_out/parallel_run_step.txt".format(
+        step_id, env.scoring_datastore_storage_name
+    )
+
+    srcbloburl = "{}/{}/{}".format(
+        accounturl, env.scoring_datastore_output_container, srcblobname
+    )
+
+    containerclient = ContainerClient(
+        accounturl,
+        env.scoring_datastore_output_container,
+        env.scoring_datastore_access_key,
+    )
+    srcblobproperties = containerclient.get_blob_client(
+        srcblobname
+    ).get_blob_properties()  # noqa E501
+
+    destfolder = srcblobproperties.last_modified.date().isoformat()
+    filetime = (
+        srcblobproperties.last_modified.time()
+        .isoformat("milliseconds")
+        .replace(":", "_")
+        .replace(".", "_")
+    )  # noqa E501
+    destfilenameparts = env.scoring_datastore_output_filename.split(".")
+    destblobname = "{}/{}_{}.{}".format(
+        destfolder, destfilenameparts[0], filetime, destfilenameparts[1]
+    )
+
+    destblobclient = containerclient.get_blob_client(destblobname)
+    destblobclient.start_copy_from_url(srcbloburl)
+
+
+def run_batchscore_pipeline():
+    try:
+        env = Env()
+
+        args = parse_args()
+
+        aml_workspace = Workspace.get(
+            name=env.workspace_name,
+            subscription_id=env.subscription_id,
+            resource_group=env.resource_group,
+        )
+
+        scoringpipeline = get_pipeline(args.pipeline_id, aml_workspace, env)
+
+        experiment = Experiment(workspace=aml_workspace, name=env.experiment_name)  # NOQA: E501
+
+        run = experiment.submit(
+            scoringpipeline,
+            pipeline_parameters={
+                "model_name": env.model_name,
+                "model_version": env.model_version,
+                "model_tag_name": " ",
+                "model_tag_value": " ",
+            },
+        )
+
+        run.wait_for_completion(show_output=True)
+
+        if run.get_status() == "Finished":
+            copy_output(list(run.get_steps())[0].id, env)
+
+    except Exception as ex:
+        print("Error: {}".format(ex))
+        # Fail the invoking job instead of silently swallowing the error.
+        exit(1)
+
+
+if __name__ == "__main__":
+    run_batchscore_pipeline()
diff --git a/ml_service/pipelines/run_train_pipeline.py b/ml_service/pipelines/run_train_pipeline.py
index 11252a88..b68b9a15 100644
---
a/ml_service/pipelines/run_train_pipeline.py +++ b/ml_service/pipelines/run_train_pipeline.py @@ -1,61 +1,72 @@ -import os from azureml.pipeline.core import PublishedPipeline -from azureml.core import Workspace -from azureml.core.authentication import ServicePrincipalAuthentication -from dotenv import load_dotenv +from azureml.core import Experiment, Workspace +import argparse +from ml_service.util.env_variables import Env def main(): - load_dotenv() - workspace_name = os.environ.get("BASE_NAME")+"-AML-WS" - resource_group = os.environ.get("BASE_NAME")+"-AML-RG" - subscription_id = os.environ.get("SUBSCRIPTION_ID") - tenant_id = os.environ.get("TENANT_ID") - experiment_name = os.environ.get("EXPERIMENT_NAME") - model_name = os.environ.get("MODEL_NAME") - app_id = os.environ.get('SP_APP_ID') - app_secret = os.environ.get('SP_APP_SECRET') - release_id = os.environ.get('RELEASE_RELEASEID') - build_id = os.environ.get('BUILD_BUILDID') - - service_principal = ServicePrincipalAuthentication( - tenant_id=tenant_id, - service_principal_id=app_id, - service_principal_password=app_secret) + + parser = argparse.ArgumentParser("register") + parser.add_argument( + "--output_pipeline_id_file", + type=str, + default="pipeline_id.txt", + help="Name of a file to write pipeline ID to" + ) + parser.add_argument( + "--skip_train_execution", + action="store_true", + help=("Do not trigger the execution. " + "Use this in Azure DevOps when using a server job to trigger") + ) + args = parser.parse_args() + + e = Env() aml_workspace = Workspace.get( - name=workspace_name, - subscription_id=subscription_id, - resource_group=resource_group, - auth=service_principal - ) + name=e.workspace_name, + subscription_id=e.subscription_id, + resource_group=e.resource_group + ) # Find the pipeline that was published by the specified build ID pipelines = PublishedPipeline.list(aml_workspace) matched_pipes = [] for p in pipelines: - if p.version == build_id: - matched_pipes.append(p) + if p.name == e.pipeline_name: + if p.version == e.build_id: + matched_pipes.append(p) if(len(matched_pipes) > 1): published_pipeline = None - raise Exception(f"Multiple active pipelines are published for build {build_id}.") # NOQA: E501 + raise Exception(f"Multiple active pipelines are published for build {e.build_id}.") # NOQA: E501 elif(len(matched_pipes) == 0): published_pipeline = None - raise KeyError(f"Unable to find a published pipeline for this build {build_id}") # NOQA: E501 + raise KeyError(f"Unable to find a published pipeline for this build {e.build_id}") # NOQA: E501 else: published_pipeline = matched_pipes[0] + print("published pipeline id is", published_pipeline.id) - pipeline_parameters = {"model_name": model_name, "release_id": release_id} + # Save the Pipeline ID for other AzDO jobs after script is complete + if args.output_pipeline_id_file is not None: + with open(args.output_pipeline_id_file, "w") as out_file: + out_file.write(published_pipeline.id) - response = published_pipeline.submit( - aml_workspace, - experiment_name, - pipeline_parameters) + if(args.skip_train_execution is False): + pipeline_parameters = {"model_name": e.model_name} + tags = {"BuildId": e.build_id} + if (e.build_uri is not None): + tags["BuildUri"] = e.build_uri + experiment = Experiment( + workspace=aml_workspace, + name=e.experiment_name) + run = experiment.submit( + published_pipeline, + tags=tags, + pipeline_parameters=pipeline_parameters) - run_id = response.id - print("Pipeline run initiated ", run_id) + print("Pipeline run initiated ", run.id) if 
__name__ == "__main__": diff --git a/ml_service/util/__init__.py b/ml_service/util/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ml_service/util/attach_compute.py b/ml_service/util/attach_compute.py index 7a34cd38..cf8c07a6 100644 --- a/ml_service/util/attach_compute.py +++ b/ml_service/util/attach_compute.py @@ -1,47 +1,39 @@ -import os -from dotenv import load_dotenv + +import traceback from azureml.core import Workspace from azureml.core.compute import AmlCompute from azureml.core.compute import ComputeTarget from azureml.exceptions import ComputeTargetException +from ml_service.util.env_variables import Env -def get_compute( - workspace: Workspace, - compute_name: str, - vm_size: str -): - # Load the environment variables from .env in case this script - # is called outside an existing process - load_dotenv() - # Verify that cluster does not exist already +def get_compute(workspace: Workspace, compute_name: str, vm_size: str, for_batch_scoring: bool = False): # NOQA E501 try: if compute_name in workspace.compute_targets: compute_target = workspace.compute_targets[compute_name] if compute_target and type(compute_target) is AmlCompute: - print('Found existing compute target ' + compute_name - + ' so using it.') + print("Found existing compute target " + compute_name + " so using it.") # NOQA else: + e = Env() compute_config = AmlCompute.provisioning_configuration( vm_size=vm_size, - vm_priority=os.environ.get("AML_CLUSTER_PRIORITY", - 'lowpriority'), - min_nodes=int(os.environ.get("AML_CLUSTER_MIN_NODES", 0)), - max_nodes=int(os.environ.get("AML_CLUSTER_MAX_NODES", 4)), + vm_priority=e.vm_priority if not for_batch_scoring else e.vm_priority_scoring, # NOQA E501 + min_nodes=e.min_nodes if not for_batch_scoring else e.min_nodes_scoring, # NOQA E501 + max_nodes=e.max_nodes if not for_batch_scoring else e.max_nodes_scoring, # NOQA E501 idle_seconds_before_scaledown="300" # #Uncomment the below lines for VNet support # vnet_resourcegroup_name=vnet_resourcegroup_name, # vnet_name=vnet_name, # subnet_name=subnet_name ) - compute_target = ComputeTarget.create(workspace, compute_name, - compute_config) + compute_target = ComputeTarget.create( + workspace, compute_name, compute_config + ) compute_target.wait_for_completion( - show_output=True, - min_node_count=None, - timeout_in_minutes=10) + show_output=True, min_node_count=None, timeout_in_minutes=10 + ) return compute_target - except ComputeTargetException as e: - print(e) - print('An error occurred trying to provision compute.') - exit() + except ComputeTargetException: + traceback.print_exc() + print("An error occurred trying to provision compute.") + exit(1) diff --git a/ml_service/util/create_scoring_image.py b/ml_service/util/create_scoring_image.py index 08ae49b5..378cb3b4 100644 --- a/ml_service/util/create_scoring_image.py +++ b/ml_service/util/create_scoring_image.py @@ -1,61 +1,59 @@ import os +import argparse from azureml.core import Workspace -from azureml.core.image import ContainerImage, Image -from azureml.core.model import Model -from dotenv import load_dotenv -from azureml.core.authentication import ServicePrincipalAuthentication +from azureml.core.environment import Environment +from azureml.core.model import Model, InferenceConfig +import shutil +from ml_service.util.env_variables import Env -load_dotenv() - -TENANT_ID = os.environ.get('TENANT_ID') -APP_ID = os.environ.get('SP_APP_ID') -APP_SECRET = os.environ.get('SP_APP_SECRET') -WORKSPACE_NAME = os.environ.get("BASE_NAME")+"-AML-WS" -SUBSCRIPTION_ID 
= os.environ.get('SUBSCRIPTION_ID') -RESOURCE_GROUP = os.environ.get("BASE_NAME")+"-AML-RG" -MODEL_NAME = os.environ.get('MODEL_NAME') -MODEL_VERSION = os.environ.get('MODEL_VERSION') -IMAGE_NAME = os.environ.get('IMAGE_NAME') - - -SP_AUTH = ServicePrincipalAuthentication( - tenant_id=TENANT_ID, - service_principal_id=APP_ID, - service_principal_password=APP_SECRET) +e = Env() +# Get Azure machine learning workspace ws = Workspace.get( - WORKSPACE_NAME, - SP_AUTH, - SUBSCRIPTION_ID, - RESOURCE_GROUP -) - - -model = Model(ws, name=MODEL_NAME, version=MODEL_VERSION) -os.chdir("./code/scoring") - -image_config = ContainerImage.image_configuration( - execution_script="score.py", - runtime="python", - conda_file="conda_dependencies.yml", - description="Image with ridge regression model", - tags={"area": "diabetes", "type": "regression"}, -) - -image = Image.create( - name=IMAGE_NAME, models=[model], image_config=image_config, workspace=ws + name=e.workspace_name, + subscription_id=e.subscription_id, + resource_group=e.resource_group ) -image.wait_for_creation(show_output=True) - -if image.creation_state != "Succeeded": - raise Exception("Image creation status: {image.creation_state}") - -print("{}(v.{} [{}]) stored at {} with build log {}".format( - image.name, - image.version, - image.creation_state, - image.image_location, - image.image_build_log_uri, -) +parser = argparse.ArgumentParser("create scoring image") +parser.add_argument( + "--output_image_location_file", + type=str, + help=("Name of a file to write image location to, " + "in format REGISTRY.azurecr.io/IMAGE_NAME:IMAGE_VERSION") ) +args = parser.parse_args() + +model = Model(ws, name=e.model_name, version=e.model_version) +sources_dir = e.sources_directory_train +if (sources_dir is None): + sources_dir = 'diabetes_regression' +score_script = os.path.join(".", sources_dir, e.score_script) +score_file = os.path.basename(score_script) +path_to_scoring = os.path.dirname(score_script) +cwd = os.getcwd() +# Copy conda_dependencies.yml into scoring as this method does not accept relative paths. 
# NOQA: E501
+shutil.copy(os.path.join(".", sources_dir,
+            "conda_dependencies.yml"), path_to_scoring)
+os.chdir(path_to_scoring)
+
+scoring_env = Environment.from_conda_specification(name="scoringenv", file_path="conda_dependencies.yml")  # NOQA: E501
+inference_config = InferenceConfig(
+    entry_script=score_file, environment=scoring_env)
+package = Model.package(ws, [model], inference_config)
+package.wait_for_creation(show_output=True)
+# Display the package location/ACR path
+print(package.location)
+
+os.chdir(cwd)
+
+if package.state != "Succeeded":
+    raise Exception(f"Image creation status: {package.state}")
+
+print("Package stored at {} with build log {}".format(package.location, package.package_build_log_uri))  # NOQA: E501
+
+# Save the Image Location for other AzDO jobs after script is complete
+if args.output_image_location_file is not None:
+    print("Writing image location to %s" % args.output_image_location_file)
+    with open(args.output_image_location_file, "w") as out_file:
+        out_file.write(str(package.location))
diff --git a/ml_service/util/create_scoring_image.sh b/ml_service/util/create_scoring_image.sh
new file mode 100644
index 00000000..1651b73e
--- /dev/null
+++ b/ml_service/util/create_scoring_image.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+set -euo pipefail # strict mode, fail on error
+set -x # verbose
+
+docker run \
+  --rm \
+  -t \
+  -v $PWD:/mlops \
+  -v ${AZURE_CONFIG_DIR:-$HOME/.azure}:/root/.azure \
+  -e SUBSCRIPTION_ID=$(az account show --query id -o tsv) \
+  -e RESOURCE_GROUP=$RESOURCE_GROUP \
+  -e WORKSPACE_NAME=$WORKSPACE_NAME \
+  -e MODEL_NAME=$MODEL_NAME \
+  -e IMAGE_NAME=$IMAGE_NAME \
+  mcr.microsoft.com/mlops/python:latest \
+  bash -c "cd /mlops/ && python ml_service/util/create_scoring_image.py"
diff --git a/ml_service/util/env_variables.py b/ml_service/util/env_variables.py
new file mode 100644
index 00000000..753c152d
--- /dev/null
+++ b/ml_service/util/env_variables.py
@@ -0,0 +1,126 @@
+"""Env dataclass to load and hold all environment variables
+"""
+from dataclasses import dataclass
+import os
+from typing import Optional
+
+from dotenv import load_dotenv
+
+
+@dataclass(frozen=True)
+class Env:
+    """Loads all environment variables into a predefined set of properties
+    """
+
+    # to load .env file into environment variables for local execution
+    load_dotenv()
+    workspace_name: Optional[str] = os.environ.get("WORKSPACE_NAME")
+    resource_group: Optional[str] = os.environ.get("RESOURCE_GROUP")
+    subscription_id: Optional[str] = os.environ.get("SUBSCRIPTION_ID")
+    tenant_id: Optional[str] = os.environ.get("TENANT_ID")
+    app_id: Optional[str] = os.environ.get("SP_APP_ID")
+    app_secret: Optional[str] = os.environ.get("SP_APP_SECRET")
+    vm_size: Optional[str] = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU")
+    compute_name: Optional[str] = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
+    vm_priority: Optional[str] = os.environ.get(
+        "AML_CLUSTER_PRIORITY", "lowpriority"
+    )  # NOQA: E501
+    min_nodes: int = int(os.environ.get("AML_CLUSTER_MIN_NODES", 0))
+    max_nodes: int = int(os.environ.get("AML_CLUSTER_MAX_NODES", 4))
+    build_id: Optional[str] = os.environ.get("BUILD_BUILDID")
+    pipeline_name: Optional[str] = os.environ.get("TRAINING_PIPELINE_NAME")
+    sources_directory_train: Optional[str] = os.environ.get(
+        "SOURCES_DIR_TRAIN"
+    )  # NOQA: E501
+    train_script_path: Optional[str] = os.environ.get("TRAIN_SCRIPT_PATH")
+    evaluate_script_path: Optional[str] = os.environ.get(
+        "EVALUATE_SCRIPT_PATH"
+    )  # NOQA: E501
+    register_script_path: Optional[str] =
os.environ.get( + "REGISTER_SCRIPT_PATH" + ) # NOQA: E501 + model_name: Optional[str] = os.environ.get("MODEL_NAME") + experiment_name: Optional[str] = os.environ.get("EXPERIMENT_NAME") + model_version: Optional[str] = os.environ.get("MODEL_VERSION") + image_name: Optional[str] = os.environ.get("IMAGE_NAME") + db_cluster_id: Optional[str] = os.environ.get("DB_CLUSTER_ID") + score_script: Optional[str] = os.environ.get("SCORE_SCRIPT") + build_uri: Optional[str] = os.environ.get("BUILD_URI") + dataset_name: Optional[str] = os.environ.get("DATASET_NAME") + datastore_name: Optional[str] = os.environ.get("DATASTORE_NAME") + dataset_version: Optional[str] = os.environ.get("DATASET_VERSION") + run_evaluation: Optional[str] = os.environ.get("RUN_EVALUATION", "true") + allow_run_cancel: Optional[str] = os.environ.get( + "ALLOW_RUN_CANCEL", "true" + ) # NOQA: E501 + aml_env_name: Optional[str] = os.environ.get("AML_ENV_NAME") + aml_env_train_conda_dep_file: Optional[str] = os.environ.get( + "AML_ENV_TRAIN_CONDA_DEP_FILE", "conda_dependencies.yml" + ) + rebuild_env: Optional[bool] = os.environ.get( + "AML_REBUILD_ENVIRONMENT", "false" + ).lower().strip() == "true" + + use_gpu_for_scoring: Optional[bool] = os.environ.get( + "USE_GPU_FOR_SCORING", "false" + ).lower().strip() == "true" + aml_env_score_conda_dep_file: Optional[str] = os.environ.get( + "AML_ENV_SCORE_CONDA_DEP_FILE", "conda_dependencies_scoring.yml" + ) + aml_env_scorecopy_conda_dep_file: Optional[str] = os.environ.get( + "AML_ENV_SCORECOPY_CONDA_DEP_FILE", "conda_dependencies_scorecopy.yml" + ) + vm_size_scoring: Optional[str] = os.environ.get( + "AML_COMPUTE_CLUSTER_CPU_SKU_SCORING" + ) + compute_name_scoring: Optional[str] = os.environ.get( + "AML_COMPUTE_CLUSTER_NAME_SCORING" + ) + vm_priority_scoring: Optional[str] = os.environ.get( + "AML_CLUSTER_PRIORITY_SCORING", "lowpriority" + ) + min_nodes_scoring: int = int( + os.environ.get("AML_CLUSTER_MIN_NODES_SCORING", 0) + ) # NOQA: E501 + max_nodes_scoring: int = int( + os.environ.get("AML_CLUSTER_MAX_NODES_SCORING", 4) + ) # NOQA: E501 + rebuild_env_scoring: Optional[bool] = os.environ.get( + "AML_REBUILD_ENVIRONMENT_SCORING", "false" + ).lower().strip() == "true" + scoring_datastore_storage_name: Optional[str] = os.environ.get( + "SCORING_DATASTORE_STORAGE_NAME" + ) + scoring_datastore_access_key: Optional[str] = os.environ.get( + "SCORING_DATASTORE_ACCESS_KEY" + ) + scoring_datastore_input_container: Optional[str] = os.environ.get( + "SCORING_DATASTORE_INPUT_CONTAINER" + ) + scoring_datastore_input_filename: Optional[str] = os.environ.get( + "SCORING_DATASTORE_INPUT_FILENAME" + ) + scoring_datastore_output_container: Optional[str] = os.environ.get( + "SCORING_DATASTORE_OUTPUT_CONTAINER" + ) + scoring_datastore_output_filename: Optional[str] = os.environ.get( + "SCORING_DATASTORE_OUTPUT_FILENAME" + ) + scoring_dataset_name: Optional[str] = os.environ.get( + "SCORING_DATASET_NAME" + ) # NOQA: E501 + scoring_pipeline_name: Optional[str] = os.environ.get( + "SCORING_PIPELINE_NAME" + ) # NOQA: E501 + aml_env_name_scoring: Optional[str] = os.environ.get( + "AML_ENV_NAME_SCORING" + ) # NOQA: E501 + aml_env_name_score_copy: Optional[str] = os.environ.get( + "AML_ENV_NAME_SCORE_COPY" + ) # NOQA: E501 + batchscore_script_path: Optional[str] = os.environ.get( + "BATCHSCORE_SCRIPT_PATH" + ) # NOQA: E501 + batchscore_copy_script_path: Optional[str] = os.environ.get( + "BATCHSCORE_COPY_SCRIPT_PATH" + ) # NOQA: E501 diff --git a/ml_service/util/manage_environment.py 
b/ml_service/util/manage_environment.py new file mode 100644 index 00000000..b61c97fe --- /dev/null +++ b/ml_service/util/manage_environment.py @@ -0,0 +1,41 @@ + +import os +import traceback +from azureml.core import Workspace, Environment +from ml_service.util.env_variables import Env +from azureml.core.runconfig import DEFAULT_CPU_IMAGE, DEFAULT_GPU_IMAGE + + +def get_environment( + workspace: Workspace, + environment_name: str, + conda_dependencies_file: str, + create_new: bool = False, + enable_docker: bool = None, + use_gpu: bool = False +): + try: + e = Env() + environments = Environment.list(workspace=workspace) + restored_environment = None + for env in environments: + if env == environment_name: + restored_environment = environments[environment_name] + + if restored_environment is None or create_new: + new_env = Environment.from_conda_specification( + environment_name, + os.path.join(e.sources_directory_train, conda_dependencies_file), # NOQA: E501 + ) # NOQA: E501 + restored_environment = new_env + if enable_docker is not None: + restored_environment.docker.enabled = enable_docker + restored_environment.docker.base_image = DEFAULT_GPU_IMAGE if use_gpu else DEFAULT_CPU_IMAGE # NOQA: E501 + restored_environment.register(workspace) + + if restored_environment is not None: + print(restored_environment) + return restored_environment + except Exception: + traceback.print_exc() + exit(1) diff --git a/ml_service/util/register_model.py b/ml_service/util/register_model.py deleted file mode 100644 index ea26a997..00000000 --- a/ml_service/util/register_model.py +++ /dev/null @@ -1,49 +0,0 @@ -import sys -import os -import os.path -from dotenv import load_dotenv -from azureml.core import Workspace -from azureml.core.model import Model -from azureml.core.authentication import ServicePrincipalAuthentication - -# Load the environment variables from .env in case this script -# is called outside an existing process -load_dotenv() - -TENANT_ID = os.environ.get('TENANT_ID') -APP_ID = os.environ.get('SP_APP_ID') -APP_SECRET = os.environ.get('SP_APP_SECRET') -MODEL_PATH = os.environ.get('MODEL_PATH') -MODEL_NAME = os.environ.get('MODEL_NAME') -WORKSPACE_NAME = os.environ.get("BASE_NAME")+"-AML-WS" -SUBSCRIPTION_ID = os.environ.get('SUBSCRIPTION_ID') -RESOURCE_GROUP = os.environ.get("BASE_NAME")+"-AML-RG" - - -if os.path.isfile(MODEL_PATH) is False: - print("The given model path %s is invalid" % (MODEL_PATH)) - sys.exit(1) - -SP_AUTH = ServicePrincipalAuthentication( - tenant_id=TENANT_ID, - service_principal_id=APP_ID, - service_principal_password=APP_SECRET) - -WORKSPACE = Workspace.get( - WORKSPACE_NAME, - SP_AUTH, - SUBSCRIPTION_ID, - RESOURCE_GROUP -) - -try: - MODEL = Model.register( - model_path=MODEL_PATH, - model_name=MODEL_NAME, - description="Forecasting Model", - workspace=WORKSPACE) - - print("Model registered successfully. 
ID: " + MODEL.id) -except Exception as caught_error: - print("Error while registering the model: " + str(caught_error)) - sys.exit(1) diff --git a/ml_service/util/smoke_test_scoring_service.py b/ml_service/util/smoke_test_scoring_service.py new file mode 100644 index 00000000..0fa34b1e --- /dev/null +++ b/ml_service/util/smoke_test_scoring_service.py @@ -0,0 +1,91 @@ +import argparse +import requests +import time +from azureml.core import Workspace +from azureml.core.webservice import AksWebservice, AciWebservice +from ml_service.util.env_variables import Env +import secrets + + +input = {"data": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]]} +output_len = 2 + + +def call_web_service(e, service_type, service_name): + aml_workspace = Workspace.get( + name=e.workspace_name, + subscription_id=e.subscription_id, + resource_group=e.resource_group + ) + print("Fetching service") + headers = {} + if service_type == "ACI": + service = AciWebservice(aml_workspace, service_name) + else: + service = AksWebservice(aml_workspace, service_name) + if service.auth_enabled: + service_keys = service.get_keys() + headers['Authorization'] = 'Bearer ' + service_keys[0] + print("Testing service") + print(". url: %s" % service.scoring_uri) + output = call_web_app(service.scoring_uri, headers) + + return output + + +def call_web_app(url, headers): + + # Generate an HTTP 'traceparent' distributed tracing header + # (per the W3C Trace Context proposed specification). + headers['traceparent'] = "00-{0}-{1}-00".format( + secrets.token_hex(16), secrets.token_hex(8)) + + retries = 600 + for i in range(retries): + try: + response = requests.post( + url, json=input, headers=headers) + response.raise_for_status() + return response.json() + except requests.exceptions.HTTPError as e: + if i == retries - 1: + raise e + print(e) + print("Retrying...") + time.sleep(1) + + +def main(): + + parser = argparse.ArgumentParser("smoke_test_scoring_service.py") + + parser.add_argument( + "--type", + type=str, + choices=["AKS", "ACI", "Webapp"], + required=True, + help="type of service" + ) + parser.add_argument( + "--service", + type=str, + required=True, + help="Name of the image to test" + ) + args = parser.parse_args() + + e = Env() + if args.type == "Webapp": + output = call_web_app(args.service, {}) + else: + output = call_web_service(e, args.type, args.service) + print("Verifying service output") + + assert "result" in output + assert len(output["result"]) == output_len + print("Smoke test successful.") + + +if __name__ == '__main__': + main() diff --git a/ml_service/util/workspace.py b/ml_service/util/workspace.py deleted file mode 100644 index 08d1f67d..00000000 --- a/ml_service/util/workspace.py +++ /dev/null @@ -1,29 +0,0 @@ -import sys -from azureml.core import Workspace -from azureml.core.authentication import ServicePrincipalAuthentication - - -def get_workspace( - name: str, - resource_group: str, - subscription_id: str, - tenant_id: str, - app_id: str, - app_secret: str): - service_principal = ServicePrincipalAuthentication( - tenant_id=tenant_id, - service_principal_id=app_id, - service_principal_password=app_secret) - - try: - aml_workspace = Workspace.get( - name=name, - subscription_id=subscription_id, - resource_group=resource_group, - auth=service_principal) - - return aml_workspace - except Exception as caught_exception: - print("Error while retrieving Workspace...") - print(str(caught_exception)) - sys.exit(1) diff --git a/tests/unit/code_test.py b/tests/unit/code_test.py deleted file mode 
100644 index b22b186c..00000000 --- a/tests/unit/code_test.py +++ /dev/null @@ -1,25 +0,0 @@ -import sys -import os -sys.path.append(os.path.abspath("./ml_service/util")) # NOQA: E402 -from workspace import get_workspace - - -# Just an example of a unit test against -# a utility function common_scoring.next_saturday -def test_get_workspace(): - workspace_name = os.environ.get("BASE_NAME")+"-AML-WS" - resource_group = os.environ.get("BASE_NAME")+"-AML-RG" - subscription_id = os.environ.get("SUBSCRIPTION_ID") - tenant_id = os.environ.get("TENANT_ID") - app_id = os.environ.get("SP_APP_ID") - app_secret = os.environ.get("SP_APP_SECRET") - - aml_workspace = get_workspace( - workspace_name, - resource_group, - subscription_id, - tenant_id, - app_id, - app_secret) - - assert aml_workspace.name == workspace_name
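
Note: deleting tests/unit/code_test.py (together with ml_service/util/workspace.py) leaves the new ml_service utilities without unit coverage. A minimal replacement sketch against the new Env dataclass follows; the file name (e.g. tests/unit/env_variables_test.py) and test names are illustrative suggestions rather than part of this diff, and it assumes pytest is available in the CI environment:

import dataclasses

import pytest

from ml_service.util.env_variables import Env


def test_env_coerces_cluster_sizes_to_int():
    # AML_CLUSTER_MIN_NODES / AML_CLUSTER_MAX_NODES are read as strings
    # from the environment and coerced to int when the class is defined.
    e = Env()
    assert isinstance(e.min_nodes, int)
    assert isinstance(e.max_nodes, int)


def test_env_parses_rebuild_flag_to_bool():
    # AML_REBUILD_ENVIRONMENT ("true"/"false") becomes a real boolean.
    e = Env()
    assert isinstance(e.rebuild_env, bool)


def test_env_is_immutable():
    # Env is a frozen dataclass, so accidental mutation fails loudly.
    e = Env()
    with pytest.raises(dataclasses.FrozenInstanceError):
        e.model_name = "other-model"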