Set up a local environment to run HuMo for human-centric video generation guided by text, image, and audio. This guide mirrors the configuration shown on the homepage.
conda create -n humo python=3.11
conda activate humo
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
pip install flash_attn==2.6.3
pip install -r requirements.txt
conda install -c conda-forge ffmpeg
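Optionally, confirm that the CUDA build of PyTorch and FlashAttention installed correctly before moving on. This quick check is not part of the official instructions, but it catches mismatched wheels early:

python -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"
python -c "import flash_attn; print(flash_attn.__version__)"

Both commands should run without import errors, and the first should print True on a machine whose driver supports the cu124 wheels.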
Prepare model weights locally. Typical components include a HuMo checkpoint, the Wan 2.1 VAE and text encoder, Whisper-large-v3 for audio, and an optional audio separator. Store them under a local weights directory.

huggingface-cli download Wan-AI/Wan2.1-T2V-1.3B --local-dir ./weights/Wan2.1-T2V-1.3B
huggingface-cli download bytedance-research/HuMo --local-dir ./weights/HuMo
huggingface-cli download openai/whisper-large-v3 --local-dir ./weights/whisper-large-v3
huggingface-cli download huangjackson/Kim_Vocal_2 --local-dir ./weights/audio_separator
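After the downloads finish, the weights directory should contain one folder per component; the exact files inside each folder come from the upstream repositories listed above.

./weights/
  HuMo/
  Wan2.1-T2V-1.3B/
  whisper-large-v3/
  audio_separator/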
Adjust generation settings in a YAML file. These defaults reflect common practice and match the homepage examples.

generation:
  frames: 97
  scale_a: 2.0
  scale_t: 7.5
  mode: "TIA"  # TA or TIA
  height: 720
  width: 1280

diffusion:
  timesteps:
    sampling:
      steps: 50

Run inference with the provided scripts once the weights and config are in place: infer_ta.sh for text- and audio-guided (TA) generation, or infer_tia.sh for text-, image-, and audio-guided (TIA) generation.

bash infer_ta.sh
bash infer_tia.sh
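Before launching either script, you can optionally check that an edited YAML parses cleanly, which catches indentation mistakes before a long generation run. The file name generate.yaml below is a placeholder for whichever config file the inference scripts read, and the check assumes PyYAML is available in the environment:

python -c "import yaml; cfg = yaml.safe_load(open('generate.yaml')); print(cfg['generation'], cfg['diffusion'])"

The printed dictionaries should match the frames, guidance scales, mode, resolution, and sampling steps you set above.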