2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
*.egg-info
build
10 changes: 5 additions & 5 deletions Demo/Inference_LJSpeech.ipynb
@@ -65,9 +65,9 @@
"import librosa\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"from models import *\n",
"from utils import *\n",
"from text_utils import TextCleaner\n",
"from styletts2.models import *\n",
"from styletts2.utils import *\n",
"from styletts2.text_utils import TextCleaner\n",
"textclenaer = TextCleaner()\n",
"\n",
"%matplotlib inline"
@@ -160,7 +160,7 @@
"pitch_extractor = load_F0_models(F0_path)\n",
"\n",
"# load BERT model\n",
"from Utils.PLBERT.util import load_plbert\n",
"from styletts2.Utils.PLBERT.util import load_plbert\n",
"BERT_path = config.get('PLBERT_dir', False)\n",
"plbert = load_plbert(BERT_path)"
]
@@ -221,7 +221,7 @@
"metadata": {},
"outputs": [],
"source": [
"from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule"
"from styletts2.Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule"
]
},
{
16 changes: 8 additions & 8 deletions Demo/Inference_LibriTTS.ipynb
@@ -67,9 +67,9 @@
"import librosa\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"from models import *\n",
"from utils import *\n",
"from text_utils import TextCleaner\n",
"from styletts2.models import *\n",
"from styletts2.utils import *\n",
"from styletts2.text_utils import TextCleaner\n",
"textclenaer = TextCleaner()\n",
"\n",
"%matplotlib inline"
@@ -160,7 +160,7 @@
"pitch_extractor = load_F0_models(F0_path)\n",
"\n",
"# load BERT model\n",
"from Utils.PLBERT.util import load_plbert\n",
"from styletts2.Utils.PLBERT.util import load_plbert\n",
"BERT_path = config.get('PLBERT_dir', False)\n",
"plbert = load_plbert(BERT_path)"
]
@@ -222,7 +222,7 @@
"metadata": {},
"outputs": [],
"source": [
"from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule"
"from styletts2.Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule"
]
},
{
@@ -1133,9 +1133,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "NLP",
"display_name": "Python 3",
"language": "python",
"name": "nlp"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -1147,7 +1147,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
"version": "3.11.5"
}
},
"nbformat": 4,
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -0,0 +1 @@
recursive-include styletts2 *
34 changes: 34 additions & 0 deletions README.md
@@ -88,6 +88,40 @@ Please make sure you have the LibriTTS checkpoint downloaded and unzipped under
- **Out of memory after `joint_epoch`**: This is likely because your GPU RAM is not big enough for the SLM adversarial training run. You may skip it, but the quality could be worse. Setting `joint_epoch` to a number larger than `epochs` skips the SLM adversarial training (see the sketch below).
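
A minimal, hypothetical sketch of that workaround (the config path and loading via `yaml` are assumptions; the `epochs` and `joint_epoch` field names come from the tip above):

```python
# Hypothetical sketch (not part of the training scripts): set `joint_epoch` above
# `epochs` in the training config so the SLM adversarial stage is never reached.
# The config path is an assumption; point it at the config you actually train with.
import yaml

config_path = "Configs/config_ft.yml"

with open(config_path) as f:
    cfg = yaml.safe_load(f)

cfg["joint_epoch"] = cfg["epochs"] + 1  # joint_epoch > epochs, so the SLM stage is skipped

with open(config_path, "w") as f:
    yaml.safe_dump(cfg, f)
```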

## Inference

Quick start example:

```python
from styletts2 import TTS
import sounddevice as sd
import phonemizer

tts = TTS.load_model(
config_path="hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/config.yml",
checkpoint_path="hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth"
)

es_phonemizer = phonemizer.backend.EspeakBackend(
language='en-us',
preserve_punctuation=True,
with_stress=True
)

style = tts.compute_style('path/to/reference_speaker.wav')  # placeholder path: any short clip of the target voice

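# alpha/beta blend the reference style with the style predicted from the text
# (higher values follow the text more and sound less like the reference);
# diffusion_steps trades quality for speed; embedding_scale is the
# classifier-free guidance scale (higher tends to sound more expressive).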
wav, _ = tts.inference(
"This is a text! Hello world! How are you? What's your name?",
style,
phonemizer=es_phonemizer,
alpha=0.3,
beta=0.7,
diffusion_steps=10,
embedding_scale=2)

sd.play(wav, 24000)
sd.wait()
```
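
If you would rather save the result than play it back, the waveform can be written to disk with `scipy` (already listed in `install_requires`). A minimal sketch, assuming `wav` is the float array returned above and 24 kHz is the output sampling rate (the `output.wav` name is arbitrary):

```python
import numpy as np
from scipy.io import wavfile

# Save the synthesized waveform produced by the quick-start snippet above.
wavfile.write("output.wav", 24000, wav.astype(np.float32))
```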

Please refer to [Inference_LJSpeech.ipynb](https://github.com/yl4579/StyleTTS2/blob/main/Demo/Inference_LJSpeech.ipynb) (single-speaker) and [Inference_LibriTTS.ipynb](https://github.com/yl4579/StyleTTS2/blob/main/Demo/Inference_LibriTTS.ipynb) (multi-speaker) for details. For LibriTTS, you will also need to download [reference_audio.zip](https://huggingface.co/yl4579/StyleTTS2-LibriTTS/resolve/main/reference_audio.zip) and unzip it under the `Demo` folder before running the demo.

- The StyleTTS 2 model pretrained on the LJSpeech corpus (24 kHz) can be downloaded at [https://huggingface.co/yl4579/StyleTTS2-LJSpeech/tree/main](https://huggingface.co/yl4579/StyleTTS2-LJSpeech/tree/main).
22 changes: 22 additions & 0 deletions setup.py
@@ -0,0 +1,22 @@
from setuptools import setup, find_packages

setup(
name="styletts2",
version="0.0.1",
packages=find_packages(),
include_package_data=True,
install_requires=[
"cached_path",
"nltk",
"scipy",
"numpy",
"munch",
"librosa",
"sounddevice",
"einops",
"einops_exts",
"transformers",
"matplotlib",
"monotonic_align @ git+https://github.com/resemble-ai/monotonic_align.git",
]
)
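
With this `setup.py` and the `MANIFEST.in` above in place, a checkout of the repository can be installed with `pip install .` (or `pip install -e .` for development), which is what makes the `from styletts2 import TTS` import in the README example resolvable.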