diff --git a/.pipelines/diabetes_regression-variables-template.yml b/.pipelines/diabetes_regression-variables-template.yml index 6d4d9797..fdccb3b7 100644 --- a/.pipelines/diabetes_regression-variables-template.yml +++ b/.pipelines/diabetes_regression-variables-template.yml @@ -7,7 +7,7 @@ variables: value: diabetes_regression # The path to the model training script under SOURCES_DIR_TRAIN - name: TRAIN_SCRIPT_PATH - value: training/train.py + value: training/train_aml.py # The path to the model evaluation script under SOURCES_DIR_TRAIN - name: EVALUATE_SCRIPT_PATH value: evaluate/evaluate_model.py diff --git a/diabetes_regression/training/test_train.py b/diabetes_regression/training/test_train.py index 155d367a..d121ecbc 100644 --- a/diabetes_regression/training/test_train.py +++ b/diabetes_regression/training/test_train.py @@ -1,27 +1,32 @@ import numpy as np -from azureml.core.run import Run -from unittest.mock import Mock -from diabetes_regression.training.train import train_model +from diabetes_regression.training.train import train_model, get_model_metrics def test_train_model(): X_train = np.array([1, 2, 3, 4, 5, 6]).reshape(-1, 1) y_train = np.array([10, 9, 8, 8, 6, 5]) + data = {"train": {"X": X_train, "y": y_train}} + + reg_model = train_model(data, {"alpha": 1.2}) + + preds = reg_model.predict([[1], [2]]) + np.testing.assert_almost_equal(preds, [9.93939393939394, 9.03030303030303]) + + +def test_get_model_metrics(): + + class MockModel: + + @staticmethod + def predict(data): + return ([8.12121212, 7.21212121]) + X_test = np.array([3, 4]).reshape(-1, 1) y_test = np.array([8, 7]) - data = {"train": {"X": X_train, "y": y_train}, - "test": {"X": X_test, "y": y_test}} + data = {"test": {"X": X_test, "y": y_test}} - run = Mock(Run) - reg = train_model(run, data, alpha=1.2) + metrics = get_model_metrics(MockModel(), data) - _, call2 = run.log.call_args_list - nameValue, descriptionDict = call2 - name, value = nameValue - description = descriptionDict['description'] - assert (name == 'mse') - np.testing.assert_almost_equal(value, 0.029843893480257067) - assert (description == 'Mean squared error metric') - - preds = reg.predict([[1], [2]]) - np.testing.assert_equal(preds, [9.93939393939394, 9.03030303030303]) + assert 'mse' in metrics + mse = metrics['mse'] + np.testing.assert_almost_equal(mse, 0.029843893480257067) diff --git a/diabetes_regression/training/train.py b/diabetes_regression/training/train.py index 66dbc20f..22258042 100644 --- a/diabetes_regression/training/train.py +++ b/diabetes_regression/training/train.py @@ -23,137 +23,16 @@ ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
""" -from azureml.core.run import Run + import os -import argparse +import pandas as pd from sklearn.linear_model import Ridge from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split -import joblib -import json -from azureml.core import Dataset, Datastore, Workspace - - -def register_dataset( - aml_workspace: Workspace, - dataset_name: str, - datastore_name: str, - file_path: str -) -> Dataset: - datastore = Datastore.get(aml_workspace, datastore_name) - dataset = Dataset.Tabular.from_delimited_files(path=(datastore, file_path)) - dataset = dataset.register(workspace=aml_workspace, - name=dataset_name, - create_new_version=True) - - return dataset - - -def train_model(run, data, alpha): - run.log("alpha", alpha) - run.parent.log("alpha", alpha) - reg = Ridge(alpha=alpha) - reg.fit(data["train"]["X"], data["train"]["y"]) - preds = reg.predict(data["test"]["X"]) - run.log("mse", mean_squared_error( - preds, data["test"]["y"]), description="Mean squared error metric") - run.parent.log("mse", mean_squared_error( - preds, data["test"]["y"]), description="Mean squared error metric") - return reg - -def main(): - print("Running train.py") - parser = argparse.ArgumentParser("train") - - parser.add_argument( - "--model_name", - type=str, - help="Name of the Model", - default="sklearn_regression_model.pkl", - ) - - parser.add_argument( - "--step_output", - type=str, - help=("output for passing data to next step") - ) - - parser.add_argument( - "--dataset_version", - type=str, - help=("dataset version") - ) - - parser.add_argument( - "--data_file_path", - type=str, - help=("data file path, if specified,\ - a new version of the dataset will be registered") - ) - - parser.add_argument( - "--caller_run_id", - type=str, - help=("caller run id, for example ADF pipeline run id") - ) - - parser.add_argument( - "--dataset_name", - type=str, - help=("Dataset name. 
Dataset must be passed by name\ - to always get the desired dataset version\ - rather than the one used while the pipeline creation") - ) - - args = parser.parse_args() - - print("Argument [model_name]: %s" % args.model_name) - print("Argument [step_output]: %s" % args.step_output) - print("Argument [dataset_version]: %s" % args.dataset_version) - print("Argument [data_file_path]: %s" % args.data_file_path) - print("Argument [caller_run_id]: %s" % args.caller_run_id) - print("Argument [dataset_name]: %s" % args.dataset_name) - - model_name = args.model_name - step_output_path = args.step_output - dataset_version = args.dataset_version - data_file_path = args.data_file_path - dataset_name = args.dataset_name - - print("Getting training parameters") - - with open("config.json") as f: - pars = json.load(f) - try: - alpha = pars["training"]["alpha"] - except KeyError: - alpha = 0.5 - - print("Parameter alpha: %s" % alpha) - - run = Run.get_context() - - # Get the dataset - if (dataset_name): - if (data_file_path == 'none'): - dataset = Dataset.get_by_name(run.experiment.workspace, dataset_name, dataset_version) # NOQA: E402, E501 - else: - dataset = register_dataset(run.experiment.workspace, - dataset_name, - os.environ.get("DATASTORE_NAME"), - data_file_path) - else: - e = ("No dataset provided") - print(e) - raise Exception(e) - - # Link dataset to the step run so it is trackable in the UI - run.input_datasets['training_data'] = dataset - run.parent.tag("dataset_id", value=dataset.id) - - df = dataset.to_pandas_dataframe() +# Split the dataframe into test and train data +def split_data(df): X = df.drop('Y', axis=1).values y = df['Y'].values @@ -161,23 +40,44 @@ def main(): X, y, test_size=0.2, random_state=0) data = {"train": {"X": X_train, "y": y_train}, "test": {"X": X_test, "y": y_test}} + return data + + +# Train the model, return the model +def train_model(data, ridge_args): + reg_model = Ridge(**ridge_args) + reg_model.fit(data["train"]["X"], data["train"]["y"]) + return reg_model + + +# Evaluate the metrics for the model +def get_model_metrics(model, data): + preds = model.predict(data["test"]["X"]) + mse = mean_squared_error(preds, data["test"]["y"]) + metrics = {"mse": mse} + return metrics + + +def main(): + print("Running train.py") - reg = train_model(run, data, alpha) + # Define training parameters + ridge_args = {"alpha": 0.5} - # Pass model file to next step - os.makedirs(step_output_path, exist_ok=True) - model_output_path = os.path.join(step_output_path, model_name) - joblib.dump(value=reg, filename=model_output_path) + # Load the training data as dataframe + data_dir = "data" + data_file = os.path.join(data_dir, 'diabetes.csv') + train_df = pd.read_csv(data_file) - # Also upload model file to run outputs for history - os.makedirs('outputs', exist_ok=True) - output_path = os.path.join('outputs', model_name) - joblib.dump(value=reg, filename=output_path) + data = split_data(train_df) - run.tag("run_type", value="train") - print(f"tags now present for run: {run.tags}") + # Train the model + model = train_model(data, ridge_args) - run.complete() + # Log the metrics for the model + metrics = get_model_metrics(model, data) + for (k, v) in metrics.items(): + print(f"{k}: {v}") if __name__ == '__main__': diff --git a/diabetes_regression/training/train_aml.py b/diabetes_regression/training/train_aml.py new file mode 100644 index 00000000..5bf76cb4 --- /dev/null +++ b/diabetes_regression/training/train_aml.py @@ -0,0 +1,175 @@ +""" +Copyright (C) Microsoft Corporation. 
All rights reserved. + +Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, +royalty-free right to use, copy, and modify the software code provided by us +("Software Code"). You may not sublicense the Software Code or any use of it +(except to your affiliates and to vendors to perform work on your behalf) +through distribution, network access, service agreement, lease, rental, or +otherwise. This license does not purport to express any claim of ownership over +data you may have shared with Microsoft in the creation of the Software Code. +Unless applicable law gives you more rights, Microsoft reserves all other +rights not expressly granted herein, whether by implication, estoppel or +otherwise. + +THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +""" +from azureml.core import Dataset, Datastore, Run, Workspace +import os +import argparse +import joblib +import json +from train import split_data, train_model, get_model_metrics + + +def register_dataset( + aml_workspace: Workspace, + dataset_name: str, + datastore_name: str, + file_path: str +) -> Dataset: + datastore = Datastore.get(aml_workspace, datastore_name) + dataset = Dataset.Tabular.from_delimited_files(path=(datastore, file_path)) + dataset = dataset.register(workspace=aml_workspace, + name=dataset_name, + create_new_version=True) + + return dataset + + +def main(): + print("Running train_aml.py") + + parser = argparse.ArgumentParser("train") + parser.add_argument( + "--model_name", + type=str, + help="Name of the Model", + default="sklearn_regression_model.pkl", + ) + + parser.add_argument( + "--step_output", + type=str, + help=("output for passing data to next step") + ) + + parser.add_argument( + "--dataset_version", + type=str, + help=("dataset version") + ) + + parser.add_argument( + "--data_file_path", + type=str, + help=("data file path, if specified,\ + a new version of the dataset will be registered") + ) + + parser.add_argument( + "--caller_run_id", + type=str, + help=("caller run id, for example ADF pipeline run id") + ) + + parser.add_argument( + "--dataset_name", + type=str, + help=("Dataset name.
Dataset must be passed by name\ + to always get the desired dataset version\ + rather than the one used while the pipeline creation") + ) + + args = parser.parse_args() + + print("Argument [model_name]: %s" % args.model_name) + print("Argument [step_output]: %s" % args.step_output) + print("Argument [dataset_version]: %s" % args.dataset_version) + print("Argument [data_file_path]: %s" % args.data_file_path) + print("Argument [caller_run_id]: %s" % args.caller_run_id) + print("Argument [dataset_name]: %s" % args.dataset_name) + + model_name = args.model_name + step_output_path = args.step_output + dataset_version = args.dataset_version + data_file_path = args.data_file_path + dataset_name = args.dataset_name + + run = Run.get_context() + + print("Getting training parameters") + + # Load the training parameters from the config file + with open("config.json") as f: + pars = json.load(f) + try: + train_args = pars["training"] + except KeyError: + print("Could not load training values from file") + train_args = {} + + # Log the training parameters + print(f"Parameters: {train_args}") + for (k, v) in train_args.items(): + run.log(k, v) + run.parent.log(k, v) + + # Get the dataset + if (dataset_name): + if (data_file_path == 'none'): + dataset = Dataset.get_by_name(run.experiment.workspace, dataset_name, dataset_version) # NOQA: E402, E501 + else: + dataset = register_dataset(run.experiment.workspace, + dataset_name, + os.environ.get("DATASTORE_NAME"), + data_file_path) + else: + e = ("No dataset provided") + print(e) + raise Exception(e) + + # Link dataset to the step run so it is trackable in the UI + run.input_datasets['training_data'] = dataset + run.parent.tag("dataset_id", value=dataset.id) + + # Split the data into test/train + df = dataset.to_pandas_dataframe() + data = split_data(df) + + # Train the model + model = train_model(data, train_args) + + # Evaluate and log the metrics returned from the train function + metrics = get_model_metrics(model, data) + for (k, v) in metrics.items(): + run.log(k, v) + run.parent.log(k, v) + + # Pass model file to next step + os.makedirs(step_output_path, exist_ok=True) + model_output_path = os.path.join(step_output_path, model_name) + joblib.dump(value=model, filename=model_output_path) + + # Also upload model file to run outputs for history + os.makedirs('outputs', exist_ok=True) + output_path = os.path.join('outputs', model_name) + joblib.dump(value=model, filename=output_path) + + run.tag("run_type", value="train") + print(f"tags now present for run: {run.tags}") + + run.complete() + + +if __name__ == '__main__': + main() diff --git a/experimentation/Diabetes Ridge Regression Experimentation Pipeline.ipynb b/experimentation/Diabetes Ridge Regression Experimentation Pipeline.ipynb new file mode 100644 index 00000000..8b04a5c5 --- /dev/null +++ b/experimentation/Diabetes Ridge Regression Experimentation Pipeline.ipynb @@ -0,0 +1,353 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Experiment with parameters for a Ridge Regression Model on the Diabetes Dataset in an Azure ML Pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook is for experimenting with different parameters to train a ridge regression model on the Diabetes dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Change out of the experimentation directory\n", + "%cd .." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.core\n", + "from azureml.core import Workspace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the workspace from the saved config file\n", + "ws = Workspace.from_config()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os, shutil\n", + "\n", + "# Create a folder for the experiment files\n", + "training_folder = 'diabetes-training'\n", + "os.makedirs(training_folder, exist_ok=True)\n", + "\n", + "# Copy the data file into the experiment folder\n", + "shutil.copy('data/diabetes.csv', os.path.join(training_folder, \"diabetes.csv\"))\n", + "\n", + "# Copy the train functions into the experiment folder\n", + "shutil.copy('diabetes_regression/training/train.py', os.path.join(training_folder, \"train.py\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile $training_folder/parameters.json\n", + "{\n", + " \"training\":\n", + " {\n", + " \"alpha\": 0.3\n", + " },\n", + " \"evaluation\":\n", + " {\n", + "\n", + " },\n", + " \"scoring\":\n", + " {\n", + " \n", + " }\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile $training_folder/diabetes_training.py\n", + "# Import libraries\n", + "from azureml.core import Run\n", + "import pandas as pd\n", + "import shutil\n", + "import joblib\n", + "\n", + "from train import split_data, train_model\n", + "\n", + "# Get parameters\n", + "parser = argparse.ArgumentParser()\n", + "parser.add_argument('--output_folder', type=str, dest='output_folder', default=\"diabetes_model\", help='output folder')\n", + "args = parser.parse_args()\n", + "output_folder = args.output_folder\n", + "\n", + "# Get the experiment run context\n", + "run = Run.get_context()\n", + "\n", + "# load the diabetes dataset\n", + "print(\"Loading Data...\")\n", + "train_df = pd.read_csv('diabetes.csv')\n", + "\n", + "data = split_data(train_df)\n", + "\n", + "# Specify the parameters to test\n", + "with open(\"parameters.json\") as f:\n", + " pars = json.load(f)\n", + " train_args = pars[\"training\"]\n", + "\n", + "# Log parameters\n", + "for k, v in train_args.items():\n", + " run.log(k, v)\n", + "\n", + "model, metrics = train_model(data, train_args)\n", + "\n", + "# Log metrics\n", + "for k, v in metrics.items():\n", + " run.log(k, v)\n", + "\n", + "# Save the parameters file to the outputs folder\n", + "os.makedirs(output_folder, exist_ok=True)\n", + "shutil.copy('parameters.json', os.path.join(output_folder, 'parameters.json'))\n", + "joblib.dump(value=model, filename= output_folder + \"/model.pkl\")\n", + " \n", + "run.complete()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile $training_folder/register_diabetes.py\n", + "# Import libraries\n", + "import argparse\n", + "import joblib\n", + "from azureml.core import Workspace, Model, Run\n", + "\n", + "# Get parameters\n", + "parser = argparse.ArgumentParser()\n", + "parser.add_argument('--model_folder', type=str, dest='model_folder', default=\"diabetes_model\", help='model location')\n", + "args = parser.parse_args()\n", + "model_folder = args.model_folder\n", + "\n", + "# Get the experiment run context\n", + "run = 
Run.get_context()\n", + "\n", + "# load the model\n", + "print(\"Loading model from \" + model_folder)\n", + "model_file = model_folder + \"/model.pkl\"\n", + "model = joblib.load(model_file)\n", + "\n", + "Model.register(workspace=run.experiment.workspace,\n", + " model_path = model_file,\n", + " model_name = 'diabetes_model',\n", + " tags={'Training context':'Pipeline'})\n", + "\n", + "run.complete()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.compute import ComputeTarget, AmlCompute\n", + "from azureml.core.compute_target import ComputeTargetException\n", + "\n", + "cluster_name = \"aml-cluster\"\n", + "\n", + "# Verify that cluster exists\n", + "try:\n", + " pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)\n", + " print('Found existing cluster, use it.')\n", + "except ComputeTargetException:\n", + " # If not, create it\n", + " compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',\n", + " max_nodes=4,\n", + " idle_seconds_before_scaledown=1800)\n", + " pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)\n", + "\n", + "pipeline_cluster.wait_for_completion(show_output=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Environment\n", + "from azureml.core.conda_dependencies import CondaDependencies\n", + "from azureml.core.runconfig import RunConfiguration\n", + "\n", + "# Create a Python environment for the experiment\n", + "diabetes_env = Environment(\"diabetes-pipeline-env\")\n", + "diabetes_env.python.user_managed_dependencies = False # Let Azure ML manage dependencies\n", + "diabetes_env.docker.enabled = True # Use a docker container\n", + "\n", + "# Create a set of package dependencies\n", + "diabetes_packages = CondaDependencies.create(conda_packages=['scikit-learn','pandas'],\n", + " pip_packages=['azureml-sdk'])\n", + "\n", + "# Add the dependencies to the environment\n", + "diabetes_env.python.conda_dependencies = diabetes_packages\n", + "\n", + "# Register the environment (just in case you want to use it again)\n", + "diabetes_env.register(workspace=ws)\n", + "registered_env = Environment.get(ws, 'diabetes-pipeline-env')\n", + "\n", + "# Create a new runconfig object for the pipeline\n", + "pipeline_run_config = RunConfiguration()\n", + "\n", + "# Use the compute you created above. 
\n", + "pipeline_run_config.target = pipeline_cluster\n", + "\n", + "# Assign the environment to the run configuration\n", + "pipeline_run_config.environment = registered_env\n", + "\n", + "print (\"Run configuration created.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.pipeline.core import PipelineData\n", + "from azureml.pipeline.steps import PythonScriptStep, EstimatorStep\n", + "from azureml.train.estimator import Estimator\n", + "\n", + "# Get the training dataset\n", + "#diabetes_ds = ws.datasets.get(\"diabetes dataset\")\n", + "\n", + "# Create a PipelineData (temporary Data Reference) for the model folder\n", + "model_folder = PipelineData(\"model_folder\", datastore=ws.get_default_datastore())\n", + "\n", + "estimator = Estimator(source_directory=training_folder,\n", + " compute_target = pipeline_cluster,\n", + " environment_definition=pipeline_run_config.environment,\n", + " entry_script='diabetes_training.py')\n", + "\n", + "# Step 1, run the estimator to train the model\n", + "train_step = EstimatorStep(name = \"Train Model\",\n", + " estimator=estimator, \n", + " estimator_entry_script_arguments=['--output_folder', model_folder],\n", + " outputs=[model_folder],\n", + " compute_target = pipeline_cluster,\n", + " allow_reuse = True)\n", + "\n", + "# Step 2, run the model registration script\n", + "register_step = PythonScriptStep(name = \"Register Model\",\n", + " source_directory = training_folder,\n", + " script_name = \"register_diabetes.py\",\n", + " arguments = ['--model_folder', model_folder],\n", + " inputs=[model_folder],\n", + " compute_target = pipeline_cluster,\n", + " runconfig = pipeline_run_config,\n", + " allow_reuse = True)\n", + "\n", + "print(\"Pipeline steps defined\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Experiment\n", + "from azureml.pipeline.core import Pipeline\n", + "from azureml.widgets import RunDetails\n", + "\n", + "# Construct the pipeline\n", + "pipeline_steps = [train_step, register_step]\n", + "pipeline = Pipeline(workspace = ws, steps=pipeline_steps)\n", + "print(\"Pipeline is built.\")\n", + "\n", + "# Create an experiment and run the pipeline\n", + "experiment = Experiment(workspace = ws, name = 'diabetes-training-pipeline')\n", + "pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)\n", + "print(\"Pipeline submitted for execution.\")\n", + "\n", + "RunDetails(pipeline_run).show()\n", + "pipeline_run.wait_for_completion()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Model\n", + "\n", + "for model in Model.list(ws):\n", + " print(model.name, 'version:', model.version)\n", + " for tag_name in model.tags:\n", + " tag = model.tags[tag_name]\n", + " print ('\\t',tag_name, ':', tag)\n", + " for prop_name in model.properties:\n", + " prop = model.properties[prop_name]\n", + " print ('\\t',prop_name, ':', prop)\n", + " print('\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", 
+ "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/experimentation/Diabetes Ridge Regression Parameter Experimentation.ipynb b/experimentation/Diabetes Ridge Regression Parameter Experimentation.ipynb new file mode 100644 index 00000000..aab5e052 --- /dev/null +++ b/experimentation/Diabetes Ridge Regression Parameter Experimentation.ipynb @@ -0,0 +1,211 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Experiment with parameters for a Ridge Regression Model on the Diabetes Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook is for experimenting with different parameters to train a ridge regression model on the Diabetes dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Change out of the experimentation directory\n", + "%cd .." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.core\n", + "from azureml.core import Workspace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the workspace from the saved config file\n", + "ws = Workspace.from_config()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os, shutil\n", + "\n", + "# Create a folder for the experiment files\n", + "training_folder = 'diabetes-training'\n", + "os.makedirs(training_folder, exist_ok=True)\n", + "\n", + "# Copy the data file into the experiment folder\n", + "shutil.copy('data/diabetes.csv', os.path.join(training_folder, \"diabetes.csv\"))\n", + "\n", + "# Copy the train functions into the experiment folder\n", + "shutil.copy('diabetes_regression/training/train.py', os.path.join(training_folder, \"train.py\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile $training_folder/parameters.json\n", + "{\n", + " \"training\":\n", + " {\n", + " \"alpha\": 0.3\n", + " },\n", + " \"evaluation\":\n", + " {\n", + "\n", + " },\n", + " \"scoring\":\n", + " {\n", + " \n", + " }\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile $training_folder/diabetes_training.py\n", + "# Import libraries\n", + "from azureml.core import Run\n", + "import json\n", + "import os\n", + "import pandas as pd\n", + "import shutil\n", + "\n", + "from train import split_data, train_model\n", + "\n", + "# Get the experiment run context\n", + "run = Run.get_context()\n", + "\n", + "# load the diabetes dataset\n", + "print(\"Loading Data...\")\n", + "train_df = pd.read_csv('diabetes.csv')\n", + "\n", + "data = split_data(train_df)\n", + "\n", + "# Specify the parameters to test\n", + "with open(\"parameters.json\") as f:\n", + " pars = json.load(f)\n", + " train_args = pars[\"training\"]\n", + "\n", + "# Log parameters\n", + "for k, v in train_args.items():\n", + " run.log(k, v)\n", + "\n", + "model, metrics = train_model(data, train_args)\n", + "\n", + "# Log metrics\n", + "for k, v in metrics.items():\n", + " run.log(k, v)\n", + "\n", + "# Save the parameters file to the outputs folder\n", + "os.makedirs('outputs', exist_ok=True)\n", + "shutil.copy('parameters.json', os.path.join('outputs', 'parameters.json'))\n", + " \n", + "run.complete()" + ] + }, + { 
+ "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.estimator import Estimator\n", + "from azureml.core import Experiment\n", + "\n", + "# Create an estimator\n", + "estimator = Estimator(source_directory=training_folder,\n", + " entry_script='diabetes_training.py',\n", + " compute_target='local',\n", + " conda_packages=['scikit-learn']\n", + " )\n", + "\n", + "# Create an experiment\n", + "experiment_name = 'diabetes-training'\n", + "experiment = Experiment(workspace = ws, name = experiment_name)\n", + "\n", + "# Run the experiment based on the estimator\n", + "run = experiment.submit(config=estimator)\n", + "run.wait_for_completion(show_output=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics = run.get_metrics()\n", + "for k, v in metrics.items():\n", + " print(k, v)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for file in run.get_file_names():\n", + " print(file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.6.10 64-bit ('OH3': conda)", + "language": "python", + "name": "python361064bitoh3conda5f7beeba8c1d407187c86667ecfb684f" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}