From cc598422c21067cc2b664eb4a450e5b3e4f4ce0c Mon Sep 17 00:00:00 2001 From: Chansung Park Date: Thu, 16 Jun 2022 00:15:47 +0000 Subject: [PATCH 1/7] add wandb dependency --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 7ca652f..971964a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ dvc[gdrive]==2.10.2 +wandb==0.12.18 tensorflow==2.8 typer==0.4.1 docopt==0.6.2 From 73bf38005a9ea9fb1c4ea4df058746469e13ec55 Mon Sep 17 00:00:00 2001 From: Chansung Park Date: Thu, 16 Jun 2022 00:29:06 +0000 Subject: [PATCH 2/7] update to adopt wandb api --- pipeline/train.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/pipeline/train.py b/pipeline/train.py index df521ff..faea520 100644 --- a/pipeline/train.py +++ b/pipeline/train.py @@ -10,9 +10,11 @@ import tensorflow as tf from tensorflow.keras.applications import resnet50 -from dvclive.keras import DvcLiveCallback import modeling +import wandb +from wandb.keras import WandbCallback + if len(sys.argv) != 2: sys.stderr.write("Arguments error. 
Usage:\n") sys.stderr.write("\tpython prepare.py data-file\n") @@ -56,14 +58,22 @@ def make_tarfile(output_filename, source_dir): with tarfile.open(output_filename, "w:gz") as tar: tar.add(source_dir, arcname=os.path.basename(source_dir)) -def run_train(): +def run_train(project_name, + wandb_key): + wandb.login( + anonymous="never", + key=wandb_key + ) + wandb_run = wandb.init(project=project_name, + config=params) + train_size = params['train_size'] train_step_size = train_size // params['batch_size'] train_ds = _read_dataset(params['epoch'], params['batch_size'], train) test_ds = _read_dataset(params['epoch'], params['batch_size'], test) - dvcCallback = DvcLiveCallback() + wandbCallback = WandbCallback() m = modeling._build_keras_model() m = modeling._compile(m, float(params['lr'])) @@ -73,7 +83,7 @@ def run_train(): epochs=params['epoch'], steps_per_epoch=train_step_size, validation_data=test_ds, - callbacks=[dvcCallback]) + callbacks=[wandbCallback]) m.save(output, save_format='tf', @@ -81,4 +91,7 @@ def run_train(): make_tarfile(f'{output}.tar.gz', output) -run_train() \ No newline at end of file +project_name = os.environ["WANDB_PROJECT"] +wandb_key = os.environment["WANDB_API_KEY"] + +run_train(project_name, wandb_key) \ No newline at end of file From f769a378f0abda2cf367472387108d6b69a8a0a2 Mon Sep 17 00:00:00 2001 From: Chansung Park Date: Sat, 25 Jun 2022 17:28:48 +0000 Subject: [PATCH 3/7] add wandb run name --- pipeline/train.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pipeline/train.py b/pipeline/train.py index faea520..cbb0830 100644 --- a/pipeline/train.py +++ b/pipeline/train.py @@ -58,14 +58,18 @@ def make_tarfile(output_filename, source_dir): with tarfile.open(output_filename, "w:gz") as tar: tar.add(source_dir, arcname=os.path.basename(source_dir)) -def run_train(project_name, - wandb_key): +def run_train(): + project_name = os.environ["WANDB_PROJECT"] + wandb_key = os.environ["WANDB_API_KEY"] + 
wandb_run_name = os.environ["WANDB_RUN_NAME"] + wandb.login( anonymous="never", key=wandb_key ) - wandb_run = wandb.init(project=project_name, - config=params) + _ = wandb.init(project=project_name, + config=params, + name=wandb_run_name) train_size = params['train_size'] train_step_size = train_size // params['batch_size'] @@ -73,7 +77,8 @@ def run_train(project_name, train_ds = _read_dataset(params['epoch'], params['batch_size'], train) test_ds = _read_dataset(params['epoch'], params['batch_size'], test) - wandbCallback = WandbCallback() + wandbCallback = WandbCallback(training_data=train_ds, + log_weights=(True), log_gradients=(True)) m = modeling._build_keras_model() m = modeling._compile(m, float(params['lr'])) @@ -91,7 +96,4 @@ def run_train(project_name, make_tarfile(f'{output}.tar.gz', output) -project_name = os.environ["WANDB_PROJECT"] -wandb_key = os.environment["WANDB_API_KEY"] - -run_train(project_name, wandb_key) \ No newline at end of file +run_train() \ No newline at end of file From c0f5d79ae4f613c380c135796d5116bbae8d1c1a Mon Sep 17 00:00:00 2001 From: Chansung Park Date: Sat, 25 Jun 2022 17:30:05 +0000 Subject: [PATCH 4/7] update according to WANDB setup --- README.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/README.md b/README.md index 22a2cff..ae29d96 100644 --- a/README.md +++ b/README.md @@ -29,10 +29,6 @@ This project shows how to realize MLOps in Git/GitHub. 
In order to achieve this $ dvc stage add -n train \ -p train.train_size,train.batch_size,train.epoch,train.lr \ -d pipeline/modeling.py -d pipeline/train.py -d data \ - --plots-no-cache dvclive/scalars/loss.tsv \ - --plots-no-cache dvclive/scalars/sparse_categorical_accuracy.tsv \ - --plots-no-cache dvclive/scalars/val_loss.tsv \ - --plots-no-cache dvclive/scalars/val_sparse_categorical_accuracy.tsv \ -o outputs/model \ python pipeline/train.py outputs/model ``` @@ -41,7 +37,6 @@ $ dvc stage add -n train \ $ dvc stage add -n evaluate \ -p evaluate.test,evaluate.batch_size \ -d pipeline/evaluate.py -d data/test -d outputs/model \ - -M outputs/metrics.json \ python pipeline/evaluate.py outputs/model ``` 11. Update `params.yaml` as you need. From 90f3097853dc616331ec43f37d2df7d345b60e09 Mon Sep 17 00:00:00 2001 From: Chansung Park Date: Sat, 25 Jun 2022 17:37:08 +0000 Subject: [PATCH 5/7] leave a note to set W&B specific GH secrets --- README.md | 53 ++++++++++++++++------------------------------------- 1 file changed, 16 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index ae29d96..7d65432 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,11 @@ This project shows how to realize MLOps in Git/GitHub. In order to achieve this 4. Run `dvc add [ADDED FILE OR DIRECTORY]` to track your data with DVC 5. Run `dvc remote add -d gdrive_storage gdrive://[ID of specific folder in gdrive]` to add Google Drive as the remote data storage 6. Run `dvc push`, then URL to auth is provided. Copy and paste it to the browser, and autheticate -7. Copy the content of `.dvc/tmp/gdrive-user-credentials.json` and put it as in [GitHub Secret](https://docs.github.com/en/actions/security-guides/encrypted-secrets#creating-encrypted-secrets-for-a-repository) with the name of `GDRIVE_CREDENTIALS` -8. Run `git add . && git commit -m "initial commit" && git push origin main` to keep the initial setup -9. Write your own pipeline under `pipeline` directory. 
Codes for basic image classification in TensorFlow are provided initially. -10. Run the following `dvc stage add` for training stage +7. Copy the content of `.dvc/tmp/gdrive-user-credentials.json` and put it as in [GitHub Secret](https://docs.github.com/en/actions/security-guides/encrypted-secrets#creating-encrypted-secrets-for-a-repository) with the name of `GDRIVE_CREDENTIAL` +8. Add W&B PROJECT NAME and API KEY to GitHub Secret as `WANDB_PROJECT` and `WANDB_API_KEY` respectively. +9. Run `git add . && git commit -m "initial commit" && git push origin main` to keep the initial setup +10. Write your own pipeline under `pipeline` directory. Codes for basic image classification in TensorFlow are provided initially. +11. Run the following `dvc stage add` for training stage ```bash $ dvc stage add -n train \ -p train.train_size,train.batch_size,train.epoch,train.lr \ @@ -32,23 +33,23 @@ $ dvc stage add -n train \ -o outputs/model \ python pipeline/train.py outputs/model ``` -10. Run the following `dvc stage add` for evaluate stage +12. Run the following `dvc stage add` for evaluate stage ```bash $ dvc stage add -n evaluate \ -p evaluate.test,evaluate.batch_size \ -d pipeline/evaluate.py -d data/test -d outputs/model \ python pipeline/evaluate.py outputs/model ``` -11. Update `params.yaml` as you need. -12. Run `git add . && git commit -m "add initial pipeline setup" && git push origin main` -13. Run `dvc repro` to run the pipeline initially -14. Run `dvc add outputs/model.tar.gz` to add compressed version of model -15. Run `dvc push outputs/model.tar.gz` -16. Run `echo "/pipeline/__pycache__" >> .gitignore` to ignore unnecessary directory -17. Run `git add . && git commit -m "add initial pipeline run" && git push origin main` -18. Add access token and user email of [JarvisLabs.ai](https://jarvislabs.ai/) to GitHub Secret as `JARVISLABS_ACCESS_TOKEN` and `JARVISLABS_USER_EMAIL` -19. Add GitHub access token to GitHub Secret as `GH_ACCESS_TOKEN` -20. 
Create a PR and write `#train` as in comment (you have to be the onwer of the repo) +13. Update `params.yaml` as you need. +14. Run `git add . && git commit -m "add initial pipeline setup" && git push origin main` +15. Run `dvc repro` to run the pipeline initially +16. Run `dvc add outputs/model.tar.gz` to add compressed version of model +17. Run `dvc push outputs/model.tar.gz` +18. Run `echo "/pipeline/__pycache__" >> .gitignore` to ignore unnecessary directory +19. Run `git add . && git commit -m "add initial pipeline run" && git push origin main` +20. Add access token and user email of [JarvisLabs.ai](https://jarvislabs.ai/) to GitHub Secret as `JARVISLABS_ACCESS_TOKEN` and `JARVISLABS_USER_EMAIL` +21. Add GitHub access token to GitHub Secret as `GH_ACCESS_TOKEN` +22. Create a PR and write `#train` as in comment (you have to be the owner of the repo) ### HuggingFace Integration Setup - GitHub Action assumes your model is archieved as `model.tar.gz` under `outputs` directory - Algo GitHub Action assumes your HuggingFace Space app is written in [Gradio](https://gradio.app/) under `hf-space` directory. You need to change [`app_template.py`](https://github.com/codingpot/git-mlops/blob/main/hf-space/app_template.py) as you need(you shouldn't remove any environment variables in the file). 
-## TODO - -- [X] Write solid steps to reproduce this repo for other tasks -- [X] Deploy experimental model to [HF Space](https://huggingface.co/spaces) -- [ ] Deploy current model to [GKE](https://cloud.google.com/kubernetes-engine) with [auto TFServing deployment project](https://github.com/deep-diver/ml-deployment-k8s-tfserving) -- [ ] Add more cloud providers offering GPU VMs - - [X] [JarvisLabs.ai](https://jarvislabs.ai/) - - [ ] [DataCrunch.io](https://datacrunch.io/) - - [ ] [GCP Vertex AI Training](https://cloud.google.com/vertex-ai#section-9) -- [ ] Integrate more managed services for management - - [ ] [W&B Artifact](https://wandb.ai/site) for dataset/model versioning and experiment tracking - - [ ] [HugginfFace](https://huggingface.co) for dataset/model versioning -- [ ] Integrate more managed services for deployment - - [ ] [AKS](https://docs.microsoft.com/en-us/azure/aks/) - - [ ] [EKS](https://aws.amazon.com/ko/eks/) - - [ ] [App Engine](https://cloud.google.com/appengine/) - - [ ] [AWS Lambda](https://aws.amazon.com/ko/lambda/) -- [ ] Add more example codebase (pipeline) - - [ ] TensorFlow based Object Detection - - [ ] PyTorch based Image Classification - - [ ] HuggingFace Transformers - ## Brief description of each tools - **DVC(Data Version Control)**: Manages data in somewhere else(i.e. cloud storage) while keeping the version and remote information in metadata file in Git repository. From 1436c39b998c9f6beece9f4cc8a2c484bf176875 Mon Sep 17 00:00:00 2001 From: Chansung Park Date: Sat, 25 Jun 2022 18:10:29 +0000 Subject: [PATCH 6/7] fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7d65432..703d862 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ $ dvc stage add -n evaluate \ 2. Add username of HugginfFace to GitHub Secret as `HF_USER_ID` 3. 
Write `#deploy-hf` in comment of PR you want to deploy to HuggingFace Space - GitHub Action assumes your model is archieved as `model.tar.gz` under `outputs` directory - - Algo GitHub Action assumes your HuggingFace Space app is written in [Gradio](https://gradio.app/) under `hf-space` directory. You need to change [`app_template.py`](https://github.com/codingpot/git-mlops/blob/main/hf-space/app_template.py) as you need(you shouldn't remove any environment variables in the file). + - GitHub Action assumes your HuggingFace Space app is written in [Gradio](https://gradio.app/) under `hf-space` directory. You need to change [`app_template.py`](https://github.com/codingpot/git-mlops/blob/main/hf-space/app_template.py) as you need(you shouldn't remove any environment variables in the file). ## Brief description of each tools From 2e1519f12fc7caffba07b7cc634929e39c428f93 Mon Sep 17 00:00:00 2001 From: Chansung Park Date: Sat, 25 Jun 2022 18:12:13 +0000 Subject: [PATCH 7/7] update shell script to support W&B --- scripts/experiments.sh | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/scripts/experiments.sh b/scripts/experiments.sh index 56653e7..7590114 100644 --- a/scripts/experiments.sh +++ b/scripts/experiments.sh @@ -12,6 +12,10 @@ export GH_TOKEN='$GH_ACCESS_TOKEN' git config --global user.name "chansung" git config --global user.email "deep.diver.csp@gmail.com" +# set W&B specific keys +export WANDB_PROJECT='$WANDB_PROJECT' +export WANDB_API_KEY='$WANDB_API_KEY' + # move to the repo git clone https://github.com/codingpot/git-mlops.git @@ -29,30 +33,19 @@ echo '$GDRIVE_CREDENTIAL' > .dvc/tmp/gdrive-user-credentials.json # pull data dvc pull -exp_names=("base") -dvc exp run - -dvc exp show > exp_results.txt -exp_id_strings=`grep -oe "exp-[a-z0-9]\+" exp_results.txt` -exp_ids=($exp_id_strings) -cur_branch=$(git branch | sed -n -e 's/^\* \(.*\)/\1/p') +export WANDB_RUN_NAME=$CUR_BRANCH +dvc repro exp_result=$(dvc exp 
show --only-changed --md) -gh pr comment $CUR_PR_ID --body "$exp_result" +wandb_url="https://wandb.ai/codingpot/git-mlops" +gh pr comment $CUR_PR_ID --body "[Visit W&B Log Page for this Pull Request]($wandb_url)" git reset --hard -for idx in ${!exp_names[@]} -do - echo ${exp_ids[$idx]} - echo ${exp_names[$idx]} - dvc exp branch ${exp_ids[$idx]} ${exp_names[$idx]} - dvc add outputs/model.tar.gz - dvc push outputs/model.tar.gz - git branch -m ${exp_names[$idx]} exp-$cur_branch-${exp_names[$idx]} - git checkout exp-$cur_branch-${exp_names[$idx]} - git push origin exp-$cur_branch-${exp_names[$idx]} - git checkout $CUR_BRANCH -done + +echo ${exp_ids[$idx]} +echo ${exp_names[$idx]} +dvc add outputs/model.tar.gz +dvc push outputs/model.tar.gz VM_ID=$(tail -n 2 /home/.jarviscloud/jarvisconfig | head -n 1) python clouds/jarvislabs.py vm destroy $CLOUD_AT $CLOUD_ID $VM_ID