diff --git a/README.md b/README.md
index cae09e4..7b6b40a 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
-
✨ ACE-Step ✨
-🎵 A Step Towards Music Generation Foundation Model 🎵
+ACE-Step
+A Step Towards Music Generation Foundation Model
Project |
Checkpoints |
@@ -8,7 +8,7 @@
---
-
+
## Table of Contents
@@ -113,37 +113,6 @@ Rather than building yet another end-to-end text-to-music pipeline, our vision i
- 🎵 Takes a vocal track and specified style as input to produce a complete vocal accompaniment
- 🎸 Creates full instrumental backing that complements the input vocals, making it easy to add professional-sounding accompaniment to any vocal recording
-## 💻 Installation
-
-### For MacOS/Linux
-
-```bash
-conda create -n ace_step python==3.10
-conda activate ace_step
-# Install other requirements
-pip install -r requirements.txt
-
-# Install ffmpeg
-conda install ffmpeg
-```
-
-### Windows
-
-```bash
-conda create -n ace_step python==3.10
-conda activate ace_step
-
-# Install PyTorch, TorchAudio, and TorchVision for Windows
-# replace cu121 with your CUDA version
-# replace torchvision and torchaudio with your version
-pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
-
-# Install other requirements
-pip install -r requirements_win.txt
-
-# Install ffmpeg
-conda install ffmpeg
-```
## 🖥️ Hardware Performance
@@ -152,11 +121,78 @@ We have evaluated ACE-Step across different hardware setups, yielding the follow
| Device | 27 Steps | 60 Steps |
|--------|----------|----------|
| NVIDIA A100 | 27.27x | 12.27x |
-| MacBook M2 Max | 2.27x | 1.03x |
| NVIDIA RTX 4090 | 34.48x | 15.63x |
+| NVIDIA RTX 3090 | 12.76x | 6.48x |
+| MacBook M2 Max | 2.27x | 1.03x |
We use RTF (Real-Time Factor) to measure the performance of ACE-Step. Higher values indicate faster generation speed. 27.27x means to generate 1 minute of music, it takes 2.2 seconds (60/27.27). The performance is measured on a single GPU with batch size 1 and 27 steps.
+
+## 💻 Installation
+
+### Prerequisites
+
+* Make sure you have Python installed. You can download it from [python.org](https://www.python.org/).
+* You will also need either Conda (recommended for complex dependencies) or ensure your Python installation includes `venv`.
+
+### Environment Setup
+
+It is highly recommended to use a virtual environment to manage project dependencies and avoid conflicts. Choose **one** of the following methods (Conda or venv):
+
+#### Option 1: Using Conda
+
+1. **Create the environment** named `ace_step` with Python 3.10:
+ ```bash
+ conda create -n ace_step python=3.10 -y
+ ```
+
+2. **Activate the environment:**
+ ```bash
+ conda activate ace_step
+ ```
+
+#### Option 2: Using venv
+
+1. **Ensure you are using the correct Python version** (Python 3.10 is recommended).
+
+2. **Create the virtual environment** (commonly named `venv`):
+ ```bash
+ python -m venv venv
+ ```
+
+3. **Activate the environment:**
+ * **On Windows (cmd.exe):**
+ ```bash
+ venv\Scripts\activate.bat
+ ```
+ * **On Windows (PowerShell):**
+ ```powershell
+ .\venv\Scripts\Activate.ps1
+ ```
+ *(If you encounter execution policy errors, you might need to run `Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope Process` first)*
+ * **On Linux / macOS (bash/zsh):**
+ ```bash
+ source venv/bin/activate
+ ```
+
+4. **Install dependencies** from the `requirements.txt` file:
+
+ for macOS/Linux users:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ for Windows users:
+ ```bash
+ # Install PyTorch, TorchAudio, and TorchVision for Windows
+    # replace cu126 with your CUDA version
+ # replace torchvision and torchaudio with your version
+ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
+
+ # then install other dependencies
+ pip install -r requirements_win.txt
+ ```
+
## 🚀 Usage

@@ -173,6 +209,8 @@ python app.py
python app.py --checkpoint_path /path/to/checkpoint --port 7865 --device_id 0 --share --bf16
```
+If you are on macOS, please use `--bf16 False` to avoid errors.
+
#### 🛠️ Command Line Arguments
- `--checkpoint_path`: Path to the model checkpoint (default: downloads automatically)
diff --git a/app.py b/app.py
index 27c7eb7..96f9b2f 100644
--- a/app.py
+++ b/app.py
@@ -3,8 +3,8 @@ parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", type=str, default="")
parser.add_argument("--port", type=int, default=7865)
parser.add_argument("--device_id", type=int, default=0)
-parser.add_argument("--share", action='store_true', default=False)
-parser.add_argument("--bf16", action='store_true', default=False)
+parser.add_argument("--share", type=lambda v: str(v).lower() in ("1", "true", "yes"), default=False)
+parser.add_argument("--bf16", type=lambda v: str(v).lower() in ("1", "true", "yes"), default=True)
args = parser.parse_args()
@@ -31,7 +31,6 @@ def main(args):
sample_data_func=data_sampler.sample,
)
demo.launch(
- server_name="0.0.0.0",
server_port=args.port,
share=args.share
)
diff --git a/fig/Logo_StepFun.png b/fig/Logo_StepFun.png
new file mode 100644
index 0000000..26a7f0c
Binary files /dev/null and b/fig/Logo_StepFun.png differ
diff --git a/fig/acestudio_logo.png b/fig/acestudio_logo.png
index abf4cb2..73501c7 100644
Binary files a/fig/acestudio_logo.png and b/fig/acestudio_logo.png differ
diff --git a/fig/orgnization_logos.png b/fig/orgnization_logos.png
index b5c9936..053f4e3 100644
Binary files a/fig/orgnization_logos.png and b/fig/orgnization_logos.png differ
diff --git a/fig/stepfun_logo.png b/fig/stepfun_logo.png
deleted file mode 100644
index 966e02a..0000000
Binary files a/fig/stepfun_logo.png and /dev/null differ
diff --git a/pipeline_ace_step.py b/pipeline_ace_step.py
index 6a1175b..4281721 100644
--- a/pipeline_ace_step.py
+++ b/pipeline_ace_step.py
@@ -158,7 +158,7 @@ class ACEStepPipeline:
])
self.lang_segment = lang_segment
self.lyric_tokenizer = VoiceBpeTokenizer()
- text_encoder_model = UMT5EncoderModel.from_pretrained(text_encoder_checkpoint_path).eval()
+ text_encoder_model = UMT5EncoderModel.from_pretrained(text_encoder_checkpoint_path, torch_dtype=self.dtype).eval()
text_encoder_model = text_encoder_model.to(device).to(self.dtype)
text_encoder_model.requires_grad_(False)
self.text_encoder_model = text_encoder_model
@@ -941,7 +941,7 @@ class ACEStepPipeline:
output_path_flac = f"{base_path}/output_{time.strftime('%Y%m%d%H%M%S')}_{idx}.{format}"
target_wav = target_wav.float()
- torchaudio.save(output_path_flac, target_wav, sample_rate=sample_rate, format=format, backend="ffmpeg", compression=torchaudio.io.CodecConfig(bit_rate=320000))
+ torchaudio.save(output_path_flac, target_wav, sample_rate=sample_rate, format=format)
return output_path_flac
def infer_latents(self, input_audio_path):