From d225ab92c4ae8746221f59b910ac0e2ecd1cecb9 Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Tue, 7 Oct 2025 19:01:30 +0530 Subject: [PATCH 01/18] Update build.py to use sudo docker build --- scripts/fastpull/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/fastpull/build.py b/scripts/fastpull/build.py index 4a2d675..1ce3802 100644 --- a/scripts/fastpull/build.py +++ b/scripts/fastpull/build.py @@ -322,7 +322,7 @@ def build_and_push_docker(args) -> bool: # Build cmd = [ - 'docker', 'build', + 'sudo', 'docker', 'build', '-t', args.repository_url, '-f', os.path.join(args.dockerfile_path, args.dockerfile) ] From ad8cda106f2a15a3afeccf327bccaf5a82641996 Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Tue, 7 Oct 2025 20:05:43 +0530 Subject: [PATCH 02/18] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0aff12d..f108bc7 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,6 @@ Build and push from your Dockerfile: > [!NOTE] > - We support --registry gar, --registry ecr, --registry dockerhub > - For ``, you can use any name that's convenient, ex: `v1`, `latest` -> - FOR `[--FLAGS]` you can use any docker compatible flags, ex. `--gpus all`, `-p PORT:PORT`, `-v ` ```bash @@ -132,8 +131,10 @@ fastpull run --benchmark-mode readiness --readiness-endpoint localhost:/ [!NOTE] -> - When running for Readiness, you must publish the right port ex. -p 8000: 8000 and use --readiness-endpoint localhost:8000/health +> - When running for Readiness, you must publish the right port ex. `-p 8000:8000` and use `--readiness-endpoint localhost:8000/health` > - Use --mode normal to run normal docker, running without this flag runs with fastpull optimisations +> - For `[--FLAGS]` you can use any docker compatible flags, ex. 
`--gpus all`, `-p PORT:PORT`, `-v ` +> - If using GPUs, make sure you add `--gpus all` as a fastpull run flag #### Cleaning after a run From 0f6c392865ab92fd266f173a06eff0c1e985867e Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Tue, 7 Oct 2025 20:22:05 +0530 Subject: [PATCH 03/18] fix benchmark endpoint, now we can provide with and without http:// --- scripts/fastpull/benchmark.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/fastpull/benchmark.py b/scripts/fastpull/benchmark.py index 2979423..f79d228 100644 --- a/scripts/fastpull/benchmark.py +++ b/scripts/fastpull/benchmark.py @@ -89,12 +89,17 @@ def wait_for_readiness(self, timeout: int = 600, poll_interval: int = 2): if self.benchmark_mode != 'readiness' or not self.readiness_endpoint: return True - print(f"Polling {self.readiness_endpoint} for readiness...") + # Ensure endpoint has protocol prefix + endpoint = self.readiness_endpoint + if not endpoint.startswith(('http://', 'https://')): + endpoint = f'http://{endpoint}' + + print(f"Polling {endpoint} for readiness...") end_time = time.time() + timeout while time.time() < end_time: try: - response = urlopen(self.readiness_endpoint, timeout=5) + response = urlopen(endpoint, timeout=5) if response.getcode() == 200: elapsed = time.time() - self.start_time self.metrics['readiness_time'] = elapsed From 380c1845a7fffe11f0eefcc83d862493d70d87f1 Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Tue, 7 Oct 2025 21:09:22 +0530 Subject: [PATCH 04/18] fix build docker --- scripts/fastpull/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/fastpull/build.py b/scripts/fastpull/build.py index 1ce3802..4a68cc5 100644 --- a/scripts/fastpull/build.py +++ b/scripts/fastpull/build.py @@ -346,7 +346,7 @@ def build_and_push_docker(args) -> bool: # Push print(f"[Docker] Pushing {args.repository_url}...") try: - subprocess.run(['docker', 'push', args.repository_url], check=True) + subprocess.run(['sudo', 
'docker', 'push', args.repository_url], check=True) print(f"[Docker] ✓ Pushed {args.repository_url}") return True except subprocess.CalledProcessError: From 17e334f2f36da7e8b9d10cef3fe9c2b7b815759a Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Tue, 7 Oct 2025 21:16:18 +0530 Subject: [PATCH 05/18] fix build issue --- scripts/fastpull/build.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/scripts/fastpull/build.py b/scripts/fastpull/build.py index 4a68cc5..0fa8821 100644 --- a/scripts/fastpull/build.py +++ b/scripts/fastpull/build.py @@ -125,12 +125,6 @@ def build_command(args): print(f"Error: Invalid format '{fmt}'. Valid: {', '.join(valid_formats)}") sys.exit(1) - # Authenticate with registry - print(f"\nAuthenticating with {args.registry}...") - if not authenticate_registry(args): - print("Error: Authentication failed") - sys.exit(1) - # Determine build mode if args.dockerfile_path: # Mode 1: Build from Dockerfile From 1ffaa59f54c59c560a57f2d5f41e68bed7a83e4a Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Wed, 8 Oct 2025 17:12:39 +0530 Subject: [PATCH 06/18] Improve readme buttons --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f108bc7..e987d22 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,8 @@
# Start massive AI/ML container images 10x faster with lazy-loading snapshotter - - - +[![Join Slack](https://img.shields.io/badge/Join_Slack-2EB67D?style=for-the-badge&logo=slack&logoColor=white)](https://join.slack.com/t/tensorfusecommunity/shared_invite/zt-30r6ik3dz-Rf7nS76vWKOu6DoKh5Cs5w) +[![Read our Blog](https://img.shields.io/badge/Read_our_Blog-ff9800?style=for-the-badge&logo=RSS&logoColor=white)](https://tensorfuse.io/docs/blogs/blog) [Installation](#install-fastpull-on-a-vm) • [Results](#understanding-test-results) • [Detailed Usage](docs/fastpull.md) From aee9cbbfd52dc5f604570523084f475863fc7c2c Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Thu, 23 Oct 2025 19:56:16 +0530 Subject: [PATCH 07/18] Add support for Kubernetes clusters --- README.md | 104 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 80 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index e987d22..b2152c8 100644 --- a/README.md +++ b/README.md @@ -28,21 +28,22 @@ AI/ML container images like CUDA, vLLM, and sglang are large (10 GB+). Tradition #### The Solution -Fastpull uses lazy-loading to pull only the files needed to start the container, then fetches remaining layers on demand. This accelerates start times by 10x. See the results below: +Fastpull uses lazy-loading to pull only the files needed to start the container, then fetches remaining layers on demand. This accelerates start times by 10x. See the results below:
benchmark
+You can now: +- [Install Fastpull on a VM](#install-fastpull-on-a-vm) +- [Install Fastpull on Kubernetes](#install-fastpull-on-a-kubernetes-cluster) + For more information, check out the [fastpull blog release](https://tensorfuse.io/docs/blogs/reducing_gpu_cold_start). --- ## Install fastpull on a VM -> [!NOTE] -> For Kubernetes installation, [contact us](mailto:agam@tensorfuse.io) for early access to our helm chart. - ### Prerequisites - VM Image: Works on Debian 12+, Ubuntu, AL2023 VMs with GPU, mileage on other AMIs may vary. @@ -62,27 +63,27 @@ You should see: **"✅ Fastpull installed successfully on your VM"** **2. Run containers** -Fastpull requires your images to be in a special format. You can either choose from our template of pre-built images like vLLM, TensorRT, and SGlang or build your own using a Dockerfile. +Fastpull requires your images to be in a special format. You can either choose from our template of pre-built images like vLLM, TensorRT, and SGlang or build your own using a Dockerfile. -Option A: Use pre-built images +#### Use pre-built images Test with vLLM, TensorRT, or Sglang: ```bash fastpull quickstart tensorrt -fastpull quickstart vllm -fastpull quickstart sglang +fastpull quickstart vllm +fastpull quickstart sglang ``` Each of these will run two times, once with fastpull optimisations, and one the way docker runs it -After the quickstart runs are complete, we also run `fastpull clean --all` which cleans up the downloaded images. +After the quickstart runs are complete, we also run `fastpull clean --all` which cleans up the downloaded images. 
-Option B: Build custom images +#### Build custom images -First, authenticate with your registry -For ECR: +First, authenticate with your registry +For ECR: ``` -aws configure; +aws configure; aws ecr get-login-password --region us-east-1 | sudo nerdctl login --username AWS --password-stdin ACCOUNT_ID.dkr.ecr.us-east-1.amazonaws.com ``` @@ -92,33 +93,33 @@ For GAR: gcloud auth login; gcloud auth print-access-token | sudo nerdctl login -docker.pkg.dev --username oauth2accesstoken --password-stdin ``` -For Dockerhub: +For Dockerhub: ``` sudo docker login ``` Build and push from your Dockerfile: -> [!NOTE] -> - We support --registry gar, --registry ecr, --registry dockerhub +> [!NOTE] +> - We support --registry gar, --registry ecr, --registry dockerhub > - For ``, you can use any name that's convenient, ex: `v1`, `latest` ```bash # Build and push image -fastpull build --registry --dockerfile-path --repository-url : +fastpull build --registry --dockerfile-path --repository-url : ``` -## Benchmarking with Fastpull +### Benchmarking with Fastpull -To get the run time for your container, you can use either: +To get the run time for your container, you can use either: Completion Time Use if the workload has a defined end point ``` -fastpull run --benchmark-mode completion [--FLAGS] : -fastpull run --benchmark-mode completion --mode normal [--FLAGS] : +fastpull run --benchmark-mode completion [--FLAGS] : +fastpull run --benchmark-mode completion --mode normal [--FLAGS] : ``` Server Endpoint Readiness Time @@ -142,9 +143,7 @@ To get the right cold start numbers, run the clean command after each run: fastpull clean --all ``` ---- - -## Understanding Test Results +### Understanding Test Results Results show the startup and completion/readiness times: @@ -160,6 +159,63 @@ Total Elapsed Time: 329.367s ================================================== ``` +--- + +## Install fastpull on a Kubernetes Cluster + +### Prerequisites +- Tested on GKE +- Tested with COS Operating System 
for the nodes + +### Installation Steps +1. In your K8s cluster, create a GPU Nodepoool +2. Install Nvidia GPU drivers. For COS: +```bash +kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml +``` +3. Install containerd config updater daemonset: `kubectl apply -f https://raw.githubusercontent.com/tensorfuse/nydus-gke/main/containerd-daemonset.yaml` +4. Install the [Helm Chart](oci://registry-1.docker.io/tensorfuse/nydus-snapshotter). For COS: +```bash +helm upgrade --install nydus-snapshotter oci://registry-1.docker.io/tensorfuse/nydus-snapshotter \ +--version 0.0.10-gke-helm \ +--create-namespace \ +--namespace nydus-snapshotter \ +--set 'tolerations[0].key=nvidia.com/gpu' \ +--set 'tolerations[0].operator=Equal' \ +--set 'tolerations[0].value=present' \ +--set 'tolerations[0].effect=NoSchedule' \ +--set 'affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key=cloud.google.com/gke-accelerator' \ +--set 'affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator=Exists' +``` +5. Create a fastpull lazy loading image. On a Pod or a standalone VM, [install fastpull][#install-fastpull] and [build your image](#build-custom-images) +6. Create the pod spec for image we created. For COS, use a pod spec like this: +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: gpu-test-a100-nydus +spec: + tolerations: + - operator: Exists + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-tesla-a100 # Use your GPU Type + runtimeClassName: runc-nydus + containers: + - name: debug-container + image: IMAGE_PATH:TAG + resources: + limits: + nvidia.com/gpu: 1 + env: + - name: LD_LIBRARY_PATH + value: /usr/local/cuda/lib64:/usr/local/nvidia/lib64 +``` +7. Run a pod with this spec: +```bash +kubectl apply -f .yaml +``` + + ---
From 785dcf801db2415b70746452e48f1aef0c03e6b4 Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Thu, 23 Oct 2025 19:57:42 +0530 Subject: [PATCH 08/18] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b2152c8..038b284 100644 --- a/README.md +++ b/README.md @@ -167,7 +167,7 @@ Total Elapsed Time: 329.367s - Tested on GKE - Tested with COS Operating System for the nodes -### Installation Steps +### Installation 1. In your K8s cluster, create a GPU Nodepoool 2. Install Nvidia GPU drivers. For COS: ```bash @@ -187,7 +187,7 @@ helm upgrade --install nydus-snapshotter oci://registry-1.docker.io/tensorfuse/n --set 'affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key=cloud.google.com/gke-accelerator' \ --set 'affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator=Exists' ``` -5. Create a fastpull lazy loading image. On a Pod or a standalone VM, [install fastpull][#install-fastpull] and [build your image](#build-custom-images) +5. Create a fastpull lazy loading image. On a Pod or a standalone VM, [install fastpull][#installation-steps] and [build your image](#build-custom-images) 6. Create the pod spec for image we created. For COS, use a pod spec like this: ```yaml apiVersion: v1 From 3bffa251c01d9eed3bd666767172d25148807f74 Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Thu, 23 Oct 2025 20:03:53 +0530 Subject: [PATCH 09/18] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 038b284..3460134 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,7 @@ Total Elapsed Time: 329.367s - Tested with COS Operating System for the nodes ### Installation -1. In your K8s cluster, create a GPU Nodepoool +1. In your K8s cluster, create a GPU Nodepoool. 
For GKE, ensure Workload Identity is enabled on your cluster 2. Install Nvidia GPU drivers. For COS: ```bash kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml @@ -187,7 +187,7 @@ helm upgrade --install nydus-snapshotter oci://registry-1.docker.io/tensorfuse/n --set 'affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key=cloud.google.com/gke-accelerator' \ --set 'affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator=Exists' ``` -5. Create a fastpull lazy loading image. On a Pod or a standalone VM, [install fastpull][#installation-steps] and [build your image](#build-custom-images) +5. On a Pod or a standalone VM, [install fastpull](#installation-steps) and [build your image](#build-custom-images) 6. Create the pod spec for image we created. For COS, use a pod spec like this: ```yaml apiVersion: v1 From 36b4b11273deb45ed5246efc4381866fee5078f7 Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Fri, 24 Oct 2025 10:29:10 +0530 Subject: [PATCH 10/18] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3460134..3dc028a 100644 --- a/README.md +++ b/README.md @@ -193,7 +193,7 @@ helm upgrade --install nydus-snapshotter oci://registry-1.docker.io/tensorfuse/n apiVersion: v1 kind: Pod metadata: - name: gpu-test-a100-nydus + name: gpu-test-a100-fastpull spec: tolerations: - operator: Exists From 0a45a022f5aac0ade46e8c08d0fe682e71581668 Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Fri, 24 Oct 2025 21:16:06 +0530 Subject: [PATCH 11/18] Update readme --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 3dc028a..6923578 100644 --- a/README.md +++ b/README.md @@ -173,13 +173,13 @@ Total Elapsed Time: 
329.367s ```bash kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml ``` -3. Install containerd config updater daemonset: `kubectl apply -f https://raw.githubusercontent.com/tensorfuse/nydus-gke/main/containerd-daemonset.yaml` -4. Install the [Helm Chart](oci://registry-1.docker.io/tensorfuse/nydus-snapshotter). For COS: +3. Install containerd config updater daemonset: `kubectl apply -f https://raw.githubusercontent.com/tensorfuse/fastpull-gke/main/containerd-daemonset.yaml` +4. Install the [Helm Chart](https://hub.docker.com/repository/docker/tensorfuse/fastpull-snapshotter/general). For COS: ```bash -helm upgrade --install nydus-snapshotter oci://registry-1.docker.io/tensorfuse/nydus-snapshotter \ +helm upgrade --install fastpull-snapshotter oci://registry-1.docker.io/tensorfuse/fastpull-snapshotter \ --version 0.0.10-gke-helm \ --create-namespace \ ---namespace nydus-snapshotter \ +--namespace fastpull-snapshotter \ --set 'tolerations[0].key=nvidia.com/gpu' \ --set 'tolerations[0].operator=Equal' \ --set 'tolerations[0].value=present' \ @@ -199,7 +199,7 @@ spec: - operator: Exists nodeSelector: cloud.google.com/gke-accelerator: nvidia-tesla-a100 # Use your GPU Type - runtimeClassName: runc-nydus + runtimeClassName: runc-fastpull containers: - name: debug-container image: IMAGE_PATH:TAG From 5f50e137e033f9206724f69f726438aaef7bd3a1 Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Fri, 24 Oct 2025 21:24:51 +0530 Subject: [PATCH 12/18] minor fix to build and run --- README.md | 5 +++-- scripts/fastpull/build.py | 4 ++-- scripts/fastpull/run.py | 10 +++++----- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 6923578..3694662 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,7 @@ Build and push from your Dockerfile: > [!NOTE] > - We support --registry gar, --registry ecr, --registry dockerhub > - For ``, 
you can use any name that's convenient, ex: `v1`, `latest` +> - 2 images are created, one is the overlayfs with tag:`` and another is the fastpull image with tag: `-fastpull` ```bash @@ -187,7 +188,7 @@ helm upgrade --install fastpull-snapshotter oci://registry-1.docker.io/tensorfus --set 'affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key=cloud.google.com/gke-accelerator' \ --set 'affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator=Exists' ``` -5. On a Pod or a standalone VM, [install fastpull](#installation-steps) and [build your image](#build-custom-images) +5. On a standalone VM, preferably using Ubuntu os, [install fastpull](#installation-steps) and [build your image](#build-custom-images) 6. Create the pod spec for image we created. For COS, use a pod spec like this: ```yaml apiVersion: v1 @@ -202,7 +203,7 @@ spec: runtimeClassName: runc-fastpull containers: - name: debug-container - image: IMAGE_PATH:TAG + image: IMAGE_PATH:-fastpull # USE FASTPULL IMAGE resources: limits: nvidia.com/gpu: 1 diff --git a/scripts/fastpull/build.py b/scripts/fastpull/build.py index 0fa8821..7418d17 100644 --- a/scripts/fastpull/build.py +++ b/scripts/fastpull/build.py @@ -264,7 +264,7 @@ def build_from_dockerfile(args, formats: List[str]): # Convert to other formats if 'nydus' in formats: - nydus_image = f"{args.repository_url.rsplit(':', 1)[0]}:{args.repository_url.rsplit(':', 1)[1]}-nydus" + nydus_image = f"{args.repository_url.rsplit(':', 1)[0]}:{args.repository_url.rsplit(':', 1)[1]}-fastpull" if convert_to_nydus(args.repository_url, nydus_image): built_images.append(nydus_image) @@ -292,7 +292,7 @@ def convert_existing_image(args, formats: List[str]): # Convert to requested formats if 'nydus' in formats: - nydus_image = f"{args.repository_url.rsplit(':', 1)[0]}:{args.repository_url.rsplit(':', 1)[1]}-nydus" + nydus_image = 
f"{args.repository_url.rsplit(':', 1)[0]}:{args.repository_url.rsplit(':', 1)[1]}-fastpull" if convert_to_nydus(args.repository_url, nydus_image): built_images.append(nydus_image) diff --git a/scripts/fastpull/run.py b/scripts/fastpull/run.py index 71a8f67..3cfb0cb 100644 --- a/scripts/fastpull/run.py +++ b/scripts/fastpull/run.py @@ -26,7 +26,7 @@ def add_parser(subparsers): '--mode', choices=['nydus', 'normal'], default='nydus', - help='Run mode: nydus (default, adds -nydus suffix) or normal (overlayfs, no suffix)' + help='Run mode: nydus (default, adds -fastpull suffix) or normal (overlayfs, no suffix)' ) # Benchmarking arguments @@ -81,13 +81,13 @@ def run_command(args): # Determine snapshotter and modify image tag based on mode if args.mode == 'nydus': args.snapshotter = 'nydus' - # Add -nydus suffix to image tag if not already present + # Add -fastpull suffix to image tag if not already present if ':' in args.image: base, tag = args.image.rsplit(':', 1) - if not tag.endswith('-nydus'): - args.image = f"{base}:{tag}-nydus" + if not tag.endswith('-fastpull'): + args.image = f"{base}:{tag}-fastpull" else: - args.image = f"{args.image}:latest-nydus" + args.image = f"{args.image}:latest-fastpull" else: # normal mode args.snapshotter = 'overlayfs' # Use image as-is for normal mode From 0772ec151b31b7485335d369108b13b4b2453cac Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Fri, 24 Oct 2025 21:57:11 +0530 Subject: [PATCH 13/18] Update README.md Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3694662..a92ad5f 100644 --- a/README.md +++ b/README.md @@ -209,7 +209,7 @@ spec: nvidia.com/gpu: 1 env: - name: LD_LIBRARY_PATH - value: /usr/local/cuda/lib64:/usr/local/nvidia/lib64 + value: /usr/local/cuda/lib64:/usr/local/nvidia/lib64 # NOTE: This path may vary depending on the base image ``` 7. 
Run a pod with this spec: ```bash From e2b156960b40d9c1cf3698d2b75edd3481a1b65c Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Fri, 24 Oct 2025 21:57:22 +0530 Subject: [PATCH 14/18] Update README.md Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a92ad5f..c773f8a 100644 --- a/README.md +++ b/README.md @@ -169,7 +169,7 @@ Total Elapsed Time: 329.367s - Tested with COS Operating System for the nodes ### Installation -1. In your K8s cluster, create a GPU Nodepoool. For GKE, ensure Workload Identity is enabled on your cluster +1. In your K8s cluster, create a GPU Nodepool. For GKE, ensure Workload Identity is enabled on your cluster 2. Install Nvidia GPU drivers. For COS: ```bash kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml From 6fa1ae2af934c9ec22c26b51de260f7d332592dd Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Fri, 24 Oct 2025 21:59:11 +0530 Subject: [PATCH 15/18] update pyproject --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 544dd42..17a98c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,9 +10,9 @@ readme = "README.md" requires-python = ">=3.7" license = {text = "MIT"} authors = [ - {name = "TensorFuse", email = "team@tensorfuse.ai"} + {name = "TensorFuse", email = "saurabh@tensorfuse.io"} ] -keywords = ["containers", "docker", "nydus", "snapshotter", "ml", "ai"] +keywords = ["containers", "docker", "fastpull", "snapshotter", "ml", "ai"] classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", From f5219607397a657ba0dcfc508e8847e5252b8599 Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Wed, 5 Nov 2025 19:17:11 +0530 Subject: [PATCH 16/18] Adds builder in 
containers --- README.md | 10 ++- scripts/builder/Dockerfile | 56 +++++++++++++ scripts/builder/README.md | 156 +++++++++++++++++++++++++++++++++++++ scripts/builder/build.sh | 72 +++++++++++++++++ 4 files changed, 293 insertions(+), 1 deletion(-) create mode 100644 scripts/builder/Dockerfile create mode 100644 scripts/builder/README.md create mode 100644 scripts/builder/build.sh diff --git a/README.md b/README.md index c773f8a..82f293b 100644 --- a/README.md +++ b/README.md @@ -188,7 +188,15 @@ helm upgrade --install fastpull-snapshotter oci://registry-1.docker.io/tensorfus --set 'affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key=cloud.google.com/gke-accelerator' \ --set 'affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator=Exists' ``` -5. On a standalone VM, preferably using Ubuntu os, [install fastpull](#installation-steps) and [build your image](#build-custom-images) +5. Build your images. Authenticate to your registry, then build: +```bash +docker run --rm --privileged \ + -v /path/to/dockerfile-dir:/workspace:ro \ + -v ~/.docker/config.json:/root/.docker/config.json:ro \ + tensorfuse/fastpull-builder:latest \ + REGISTRY/REPO/IMAGE:TAG +``` +This creates `IMAGE:TAG` (normal) and `IMAGE:TAG-fastpull` (fastpull-optimized). Use the `-fastpull` tag in your pod spec. See [builder documentation](scripts/builder/README.md) for details. 6. Create the pod spec for image we created. 
For COS, use a pod spec like this: ```yaml apiVersion: v1 diff --git a/scripts/builder/Dockerfile b/scripts/builder/Dockerfile new file mode 100644 index 0000000..f90e9d5 --- /dev/null +++ b/scripts/builder/Dockerfile @@ -0,0 +1,56 @@ +# Build stage: Compile buildkit with Nydus support +FROM golang:1.21-alpine AS buildkit-builder + +# Install build dependencies +RUN apk add --no-cache git make + +# Clone nydusaccelerator/buildkit fork +ARG BUILDKIT_VERSION=nydus-compression-type-enhance +RUN git clone --depth 1 --branch ${BUILDKIT_VERSION} \ + https://github.com/nydusaccelerator/buildkit.git /buildkit + +WORKDIR /buildkit + +# Build buildkitd and buildctl with Nydus support +RUN go build -tags=nydus -o ./bin/buildkitd ./cmd/buildkitd && \ + go build -o ./bin/buildctl ./cmd/buildctl + +# Runtime stage +FROM alpine:latest + +# Copy buildkit binaries with Nydus support +COPY --from=buildkit-builder /buildkit/bin/buildctl /usr/bin/buildctl +COPY --from=buildkit-builder /buildkit/bin/buildkitd /usr/bin/buildkitd + +# Copy buildctl-daemonless.sh wrapper from moby/buildkit repo +ADD https://raw.githubusercontent.com/moby/buildkit/master/examples/buildctl-daemonless/buildctl-daemonless.sh /usr/bin/buildctl-daemonless.sh +RUN chmod +x /usr/bin/buildctl-daemonless.sh + +# Install runtime dependencies +RUN apk add --no-cache \ + ca-certificates \ + curl \ + wget \ + iptables \ + fuse-overlayfs \ + containerd + +# Install nydus-image binary (v2.3.6) +ARG NYDUS_VERSION=v2.3.6 +RUN wget -O /tmp/nydus.tgz \ + "https://github.com/dragonflyoss/nydus/releases/download/${NYDUS_VERSION}/nydus-static-${NYDUS_VERSION}-linux-amd64.tgz" \ + && tar -xzf /tmp/nydus.tgz -C /tmp \ + && mv /tmp/nydus-static/nydus-image /usr/bin/nydus-image \ + && chmod +x /usr/bin/nydus-image \ + && rm -rf /tmp/nydus.tgz /tmp/nydus-static + +# Set NYDUS_BUILDER environment variable (required for buildkit) +ENV NYDUS_BUILDER=/usr/bin/nydus-image + +# Copy build script +COPY build.sh /usr/local/bin/build.sh +RUN 
chmod +x /usr/local/bin/build.sh + +WORKDIR /workspace + +ENTRYPOINT ["/usr/local/bin/build.sh"] diff --git a/scripts/builder/README.md b/scripts/builder/README.md new file mode 100644 index 0000000..450d4f0 --- /dev/null +++ b/scripts/builder/README.md @@ -0,0 +1,156 @@ +# Container-Based Image Builder + +Builds container images using `buildctl` in a containerized environment. Produces both normal OCI and Nydus-optimized images. + +## Features + +- **Registry-agnostic**: Works with AWS ECR, Google Artifact Registry, Docker Hub, or any OCI registry +- **No local dependencies**: All build tools run inside a container +- **Two image formats**: Builds both normal OCI and Nydus images in one go +- **Direct push**: Images pushed directly to registry via buildctl + +## Architecture + +``` +Host (authenticated) → Builder Container (buildctl + nydus-image) → Registry +``` + +- **Host**: Authenticates to registry, mounts build context and docker config +- **Builder Container**: Runs buildctl to build and push images +- **No Docker daemon dependency**: buildctl pushes directly to registries + +## Prerequisites + +1. **Docker** installed on host (no other dependencies needed!) +2. 
**Authenticated to your registry** before running: + +```bash +# AWS ECR +aws ecr get-login-password --region us-east-1 | \ + docker login --username AWS --password-stdin 123456789.dkr.ecr.us-east-1.amazonaws.com + +# Google Artifact Registry +gcloud auth configure-docker us-central1-docker.pkg.dev + +# Docker Hub +docker login +``` + +## Usage + +```bash +docker run --rm --privileged \ + -v /path/to/build-context:/workspace:ro \ + -v ~/.docker/config.json:/root/.docker/config.json:ro \ + tensorfuse/fastpull-builder:latest \ + +``` + +### Examples + +**AWS ECR:** +```bash +docker run --rm --privileged \ + -v ./my-app:/workspace:ro \ + -v ~/.docker/config.json:/root/.docker/config.json:ro \ + tensorfuse/fastpull-builder:latest \ + 123456789.dkr.ecr.us-east-1.amazonaws.com/my-app:latest +``` + +**Google Artifact Registry:** +```bash +docker run --rm --privileged \ + -v ./my-app:/workspace:ro \ + -v ~/.docker/config.json:/root/.docker/config.json:ro \ + tensorfuse/fastpull-builder:latest \ + us-central1-docker.pkg.dev/my-project/my-repo/my-app:v1.0 +``` + +**Docker Hub:** +```bash +docker run --rm --privileged \ + -v ./my-app:/workspace:ro \ + -v ~/.docker/config.json:/root/.docker/config.json:ro \ + tensorfuse/fastpull-builder:latest \ + docker.io/username/my-app:latest +``` + +**No tag (defaults to :latest):** +```bash +docker run --rm --privileged \ + -v ./my-app:/workspace:ro \ + -v ~/.docker/config.json:/root/.docker/config.json:ro \ + tensorfuse/fastpull-builder:latest \ + my-registry.com/my-app +``` + +**Custom Dockerfile:** +```bash +docker run --rm --privileged \ + -v ./my-app:/workspace:ro \ + -v ~/.docker/config.json:/root/.docker/config.json:ro \ + -e DOCKERFILE=Dockerfile.custom \ + tensorfuse/fastpull-builder:latest \ + my-registry.com/my-app:latest +``` + +## Output + +The script builds and pushes two images: +- `:` - Normal OCI image +- `:-fastpull` - Fastpull-optimized image + +## Files + +- `Dockerfile` - Builder container definition (builds from 
nydusaccelerator/buildkit fork) +- `build.sh` - Build script that runs inside container (entrypoint) +- `README.md` - This file + +## Technical Details + +### Buildkit with Nydus Support +The Dockerfile builds `buildkitd` and `buildctl` from the [nydusaccelerator/buildkit](https://github.com/nydusaccelerator/buildkit) fork with the `-tags=nydus` flag, which enables Nydus compression support. The standard moby/buildkit does not include this functionality. + +### Components +- **buildkitd/buildctl**: Compiled from nydusaccelerator/buildkit fork +- **nydus-image**: v2.3.6 binary (set via `NYDUS_BUILDER` env var) +- **buildctl-daemonless.sh**: Wrapper that starts a temporary buildkitd for each buildctl invocation + +## How It Works + +1. **Pull builder image**: Downloads `tensorfuse/fastpull-builder:latest` from Docker Hub +2. **Mount context**: Your build context is mounted read-only into `/workspace` +3. **Mount auth**: `~/.docker/config.json` is mounted for registry authentication +4. **Run buildctl**: Builds normal OCI image with `buildctl-daemonless.sh` +5. **Run buildctl again**: Builds Fastpull image with Nydus compression +6. 
**Direct push**: Both images pushed directly to registry + +## Troubleshooting + +**"Error: Docker config not found"** +- Run registry authentication command first (see Prerequisites) + +**"Error: Build context path does not exist"** +- Check that the host directory you mount at `/workspace` exists + +**"Error: Dockerfile not found"** +- Ensure Dockerfile exists in context directory +- Or specify a custom name with `-e DOCKERFILE=Dockerfile.custom` + +**Build fails with authentication error:** +- Re-authenticate to your registry +- Check that `~/.docker/config.json` contains valid credentials + +**"permission denied" errors:** +- Builder container runs with `--privileged` flag (required for buildkit) +- Ensure Docker is running with appropriate permissions + +## Comparison with Original build_push.py + +| Feature | Original | Container-Based | +|---------|----------|-----------------| +| Dependencies | Requires nerdctl, nydusify, soci, stargz locally | All tools in container | +| Registry | AWS ECR or GAR | Any OCI registry | +| Formats | normal, nydus, soci, estargz | normal, nydus | +| Push method | nerdctl/docker | buildctl (direct) | +| Portability | Requires snapshotter setup | Runs anywhere Docker runs | diff --git a/scripts/builder/build.sh b/scripts/builder/build.sh new file mode 100644 index 0000000..8858ccf --- /dev/null +++ b/scripts/builder/build.sh @@ -0,0 +1,72 @@ +#!/bin/sh +set -e + +# Usage: build.sh +# Example: build.sh my-registry.com/my-app:latest +# Example: build.sh my-registry.com/my-app (defaults to :latest) + +if [ $# -lt 1 ]; then + echo "Usage: $0 " + echo "Example: $0 123456789.dkr.ecr.us-east-1.amazonaws.com/my-app:v1.0" + echo "Example: $0 123456789.dkr.ecr.us-east-1.amazonaws.com/my-app (defaults to :latest)" + exit 1 +fi + +IMAGE_WITH_TAG="$1" +DOCKERFILE="${DOCKERFILE:-Dockerfile}" +CONTEXT_PATH="${CONTEXT_PATH:-/workspace}" + +# Parse image and tag (default to :latest if no tag provided) +if echo "$IMAGE_WITH_TAG" | grep -q ":"; then + 
IMAGE_NAME="${IMAGE_WITH_TAG%:*}" + TAG="${IMAGE_WITH_TAG##*:}" +else + IMAGE_NAME="$IMAGE_WITH_TAG" + TAG="latest" +fi + +FULL_IMAGE="${IMAGE_NAME}:${TAG}" +FULL_IMAGE_FASTPULL="${IMAGE_NAME}:${TAG}-fastpull" + +echo "==========================================" +echo "Building images for: ${IMAGE_NAME}" +echo "Tag: ${TAG}" +echo "Context: ${CONTEXT_PATH}" +echo "Dockerfile: ${DOCKERFILE}" +echo "==========================================" + +# Build normal OCI image +echo "" +echo ">>> Building normal OCI image: ${FULL_IMAGE}" +echo "" +time buildctl-daemonless.sh build \ + --frontend dockerfile.v0 \ + --local context="${CONTEXT_PATH}" \ + --local dockerfile="${CONTEXT_PATH}" \ + --opt filename="${DOCKERFILE}" \ + --output type=image,name="${FULL_IMAGE}",push=true + +echo "" +echo "✓ Normal OCI image built and pushed: ${FULL_IMAGE}" +echo "" + +# Build Fastpull image +echo "" +echo ">>> Building Fastpull image: ${FULL_IMAGE_FASTPULL}" +echo "" +time buildctl-daemonless.sh build \ + --frontend dockerfile.v0 \ + --local context="${CONTEXT_PATH}" \ + --local dockerfile="${CONTEXT_PATH}" \ + --opt filename="${DOCKERFILE}" \ + --output type=image,name="${FULL_IMAGE_FASTPULL}",push=true,compression=nydus,force-compression=true,oci-mediatypes=true + +echo "" +echo "✓ Fastpull image built and pushed: ${FULL_IMAGE_FASTPULL}" +echo "" + +echo "==========================================" +echo "✓ Build complete!" 
+echo " Normal: ${FULL_IMAGE}" +echo " Fastpull: ${FULL_IMAGE_FASTPULL}" +echo "==========================================" From c3a6848c564ebd6b1e7228803a7622c74efdfb38 Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Wed, 5 Nov 2025 19:26:02 +0530 Subject: [PATCH 17/18] Update README.md --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 82f293b..60e25b5 100644 --- a/README.md +++ b/README.md @@ -188,7 +188,13 @@ helm upgrade --install fastpull-snapshotter oci://registry-1.docker.io/tensorfus --set 'affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key=cloud.google.com/gke-accelerator' \ --set 'affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator=Exists' ``` -5. Build your images. Authenticate to your registry, then build: +5. Build your images, which can be done by two ways: + + a. On a standalone VM, preferably using Ubuntu os, [install fastpull](#installation-steps) and [build your image](#build-custom-images) + + b. Build in a container: + + First authenticate to your registry and ensure the ~/docker/config.json is updated, then build using our image: ```bash docker run --rm --privileged \ -v /path/to/dockerfile-dir:/workspace:ro \ @@ -197,6 +203,7 @@ docker run --rm --privileged \ REGISTRY/REPO/IMAGE:TAG ``` This creates `IMAGE:TAG` (normal) and `IMAGE:TAG-fastpull` (fastpull-optimized). Use the `-fastpull` tag in your pod spec. See [builder documentation](scripts/builder/README.md) for details. + 6. Create the pod spec for image we created. 
For COS, use a pod spec like this: ```yaml apiVersion: v1 From 7d2e347af3d8e0b14736fe319718e72a62c3b6c3 Mon Sep 17 00:00:00 2001 From: Saurabh Singh Date: Wed, 5 Nov 2025 19:29:36 +0530 Subject: [PATCH 18/18] Update README.md --- README.md | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 60e25b5..380bb7a 100644 --- a/README.md +++ b/README.md @@ -194,15 +194,24 @@ helm upgrade --install fastpull-snapshotter oci://registry-1.docker.io/tensorfus b. Build in a container: - First authenticate to your registry and ensure the ~/docker/config.json is updated, then build using our image: -```bash -docker run --rm --privileged \ - -v /path/to/dockerfile-dir:/workspace:ro \ - -v ~/.docker/config.json:/root/.docker/config.json:ro \ - tensorfuse/fastpull-builder:latest \ - REGISTRY/REPO/IMAGE:TAG -``` -This creates `IMAGE:TAG` (normal) and `IMAGE:TAG-fastpull` (fastpull-optimized). Use the `-fastpull` tag in your pod spec. See [builder documentation](scripts/builder/README.md) for details. + First authenticate to your registry so that `~/.docker/config.json` holds valid credentials: + ```bash + #for aws + aws configure + aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin ACCOUNT_ID.dkr.ecr.us-east-1.amazonaws.com + #for gcp + gcloud auth login + gcloud auth print-access-token | docker login REGION-docker.pkg.dev --username oauth2accesstoken --password-stdin + ``` + Then build using our image: + ```bash + docker run --rm --privileged \ + -v /path/to/dockerfile-dir:/workspace:ro \ + -v ~/.docker/config.json:/root/.docker/config.json:ro \ + tensorfuse/fastpull-builder:latest \ + REGISTRY/REPO/IMAGE:TAG + ``` + This creates `IMAGE:TAG` (normal) and `IMAGE:TAG-fastpull` (fastpull-optimized). Use the `-fastpull` tag in your pod spec. See [builder documentation](scripts/builder/README.md) for details. 6. Create the pod spec for image we created. 
For COS, use a pod spec like this: ```yaml