S-Lab, Nanyang Technological University¹, SenseTime Research²
✉Corresponding Author.
✉Corresponding Author.
🔥 More coming soon!
conda create -n uae python=3.10 -y
conda activate uae
pip install uv
uv pip install torch==2.2.0 torchvision==0.17.0 torchaudio --index-url https://download.pytorch.org/whl/cu121
uv pip install timm==0.9.16 accelerate==0.23.0 torchdiffeq==0.2.5 wandb
uv pip install "numpy<2" transformers einops omegaconf
uv pip install torchmetrics

python eval_uae.py \
--config unified_ae/configs/stage1_infer.yaml \
--checkpoint PATH_TO_WEIGHTS \
--imagenet-path PATH_TO_IMAGENET \
--coco-path PATH_TO_COCO \
--batch-size 16 \
--num-workers 8 \
--image-size 256 \
--freq-ratio 1.0 \
--log-file logs/uae_eval_metrics.txt

Expected Results:
ImageNet: PSNR=29.588 dB | SSIM=0.8789 | rFID=0.193
MS-COCO: PSNR=29.484 dB | SSIM=0.8846 | rFID=0.157

@misc{fan2025uae,
title={The Prism Hypothesis: Harmonizing Semantic and Pixel Representations via Unified Autoencoding},
author={Weichen Fan and Haiwen Diao and Quan Wang and Dahua Lin and Ziwei Liu},
year={2025},
eprint={2512.19693},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2512.19693},
}