From 3c25a576a4add8d3ae63007bc2ece81127b93524 Mon Sep 17 00:00:00 2001
From: S1ro1
Date: Sun, 14 Sep 2025 12:51:55 +0200
Subject: [PATCH 1/7] Temporary fix, send only split log

---
 src/kernelbot/cogs/admin_cog.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/kernelbot/cogs/admin_cog.py b/src/kernelbot/cogs/admin_cog.py
index 5fe8aeba..238364bb 100644
--- a/src/kernelbot/cogs/admin_cog.py
+++ b/src/kernelbot/cogs/admin_cog.py
@@ -276,7 +276,11 @@ async def leaderboard_create_impl(  # noqa: C901
         forum_thread = await forum_channel.create_thread(
             name=leaderboard_name,
             content=self._leaderboard_opening_message(
-                leaderboard_name, date_value, definition.description
+                leaderboard_name,
+                date_value,
+                definition.description[:1500]
+                if len(definition.description) > 1500
+                else definition.description,
             ),
             auto_archive_duration=10080,  # 7 days
         )

From 36edd654cd307d907b84f740b856d2d691b1076d Mon Sep 17 00:00:00 2001
From: S1ro1
Date: Tue, 28 Oct 2025 11:48:53 -0700
Subject: [PATCH 2/7] Smoke test

---
 .github/workflows/nvidia-smoke.yml | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 .github/workflows/nvidia-smoke.yml

diff --git a/.github/workflows/nvidia-smoke.yml b/.github/workflows/nvidia-smoke.yml
new file mode 100644
index 00000000..fdb0f883
--- /dev/null
+++ b/.github/workflows/nvidia-smoke.yml
@@ -0,0 +1,24 @@
+name: gpu-smoke
+on:
+  workflow_dispatch: {}
+  push:
+    branches: [ main ]
+
+jobs:
+  smoke:
+    runs-on: [self-hosted, gpu]
+    timeout-minutes: 30
+    steps:
+      - uses: actions/checkout@v4
+      - name: Show GPU
+        run: |
+          nvidia-smi || true
+      - name: Torch sanity
+        run: |
+          python - <<'PY'
+          import torch
+          print("CUDA available:", torch.cuda.is_available())
+          print("device_count:", torch.cuda.device_count())
+          if torch.cuda.is_available():
+              print("device_0:", torch.cuda.get_device_name(0))
+          PY

From ecddedd1a26ce3f24db5ab686bd17ee61abb2551 Mon Sep 17 00:00:00 2001
From: S1ro1
Date: Tue, 28 Oct 2025 11:53:57 -0700
Subject: [PATCH 3/7] tmp

---
 .github/workflows/nvidia-smoke.yml | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/nvidia-smoke.yml b/.github/workflows/nvidia-smoke.yml
index fdb0f883..b3a3729d 100644
--- a/.github/workflows/nvidia-smoke.yml
+++ b/.github/workflows/nvidia-smoke.yml
@@ -1,24 +1,18 @@
 name: gpu-smoke
 on:
-  workflow_dispatch: {}
   push:
-    branches: [ main ]
-
+    branches: [ nvidia-gpu-runners ]
+  workflow_dispatch: {}
 jobs:
   smoke:
     runs-on: [self-hosted, gpu]
-    timeout-minutes: 30
     steps:
       - uses: actions/checkout@v4
-      - name: Show GPU
-        run: |
-          nvidia-smi || true
-      - name: Torch sanity
-        run: |
+      - run: nvidia-smi || true
+      - run: |
           python - <<'PY'
           import torch
-          print("CUDA available:", torch.cuda.is_available())
-          print("device_count:", torch.cuda.device_count())
+          print("cuda?", torch.cuda.is_available(), "count:", torch.cuda.device_count())
           if torch.cuda.is_available():
-              print("device_0:", torch.cuda.get_device_name(0))
+              print("name:", torch.cuda.get_device_name(0))
           PY

From 2139ca1919841359775687d88f97c1bc01a1981b Mon Sep 17 00:00:00 2001
From: S1ro1
Date: Tue, 28 Oct 2025 12:05:08 -0700
Subject: [PATCH 4/7] Smoke test

From c61afb37518475305d59afa6166382cbef0aaf6a Mon Sep 17 00:00:00 2001
From: S1ro1
Date: Tue, 28 Oct 2025 12:17:52 -0700
Subject: [PATCH 5/7] push

---
 .github/workflows/nvidia-smoke.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nvidia-smoke.yml b/.github/workflows/nvidia-smoke.yml
index b3a3729d..92638ed8 100644
--- a/.github/workflows/nvidia-smoke.yml
+++ b/.github/workflows/nvidia-smoke.yml
@@ -5,7 +5,7 @@ on:
   workflow_dispatch: {}
 jobs:
   smoke:
-    runs-on: [self-hosted, gpu]
+    runs-on: gpu-runners
     steps:
       - uses: actions/checkout@v4
       - run: nvidia-smi || true

From d559349cb3fe4277e4b6978696d7cd10acc1b94d Mon Sep 17 00:00:00 2001
From: S1ro1
Date: Thu, 30 Oct 2025 10:30:37 -0700
Subject: [PATCH 6/7] tmp

---
 .github/workflows/nvidia-smoke.yml | 54 ++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/nvidia-smoke.yml b/.github/workflows/nvidia-smoke.yml
index 92638ed8..7eca90a6 100644
--- a/.github/workflows/nvidia-smoke.yml
+++ b/.github/workflows/nvidia-smoke.yml
@@ -4,15 +4,47 @@ on:
     branches: [ nvidia-gpu-runners ]
   workflow_dispatch: {}
 jobs:
-  smoke:
-    runs-on: gpu-runners
+  gpu-test:
+    runs-on: [self-hosted, nvidia-docker-b200-8-x86-64]
+
     steps:
-      - uses: actions/checkout@v4
-      - run: nvidia-smi || true
-      - run: |
-          python - <<'PY'
-          import torch
-          print("cuda?", torch.cuda.is_available(), "count:", torch.cuda.device_count())
-          if torch.cuda.is_available():
-              print("name:", torch.cuda.get_device_name(0))
-          PY
+      - name: Checkout repo
+        uses: actions/checkout@v4
+
+      - name: Show GPU info
+        run: |
+          echo "===== nvidia-smi ====="
+          nvidia-smi || echo "nvidia-smi not available"
+          echo "======================"
+
+      - name: Run CUDA sanity test with PyTorch
+        run: |
+          python - << 'EOF'
+          import torch, time
+
+          print("PyTorch version:", torch.__version__)
+          print("CUDA available:", torch.cuda.is_available())
+          print("CUDA device count:", torch.cuda.device_count())
+
+          if not torch.cuda.is_available():
+              raise SystemExit("ERROR: CUDA not available on this runner ❌")
+
+          # list all visible GPUs
+          for i in range(torch.cuda.device_count()):
+              print(f"Device {i}: {torch.cuda.get_device_name(i)}")
+
+          # simple GPU compute test on cuda:0
+          device = torch.device("cuda:0")
+          a = torch.randn(4096, 4096, device=device)
+          b = torch.randn(4096, 4096, device=device)
+
+          torch.cuda.synchronize()
+          t0 = time.time()
+          c = a @ b
+          torch.cuda.synchronize()
+          t1 = time.time()
+
+          print("Matmul result shape:", tuple(c.shape))
+          print(f"Matmul took {t1 - t0:.3f} sec on GPU")
+          print("All good ✅")
+          EOF

From fce35b752e2dd6b34d720bb897ee3816c73e70ac Mon Sep 17 00:00:00 2001
From: S1ro1
Date: Thu, 30 Oct 2025 10:32:39 -0700
Subject: [PATCH 7/7] tmp

---
 .github/workflows/nvidia-smoke.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nvidia-smoke.yml b/.github/workflows/nvidia-smoke.yml
index 7eca90a6..1cfdbec6 100644
--- a/.github/workflows/nvidia-smoke.yml
+++ b/.github/workflows/nvidia-smoke.yml
@@ -19,7 +19,7 @@ jobs:
 
       - name: Run CUDA sanity test with PyTorch
         run: |
-          python - << 'EOF'
+          python3 - << 'EOF'
           import torch, time
 
           print("PyTorch version:", torch.__version__)