2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
*.egg-info
build
10 changes: 5 additions & 5 deletions Demo/Inference_LJSpeech.ipynb
@@ -65,9 +65,9 @@
"import librosa\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"from models import *\n",
"from utils import *\n",
"from text_utils import TextCleaner\n",
"from styletts2.models import *\n",
"from styletts2.utils import *\n",
"from styletts2.text_utils import TextCleaner\n",
"textclenaer = TextCleaner()\n",
"\n",
"%matplotlib inline"
@@ -160,7 +160,7 @@
"pitch_extractor = load_F0_models(F0_path)\n",
"\n",
"# load BERT model\n",
"from Utils.PLBERT.util import load_plbert\n",
"from styletts2.Utils.PLBERT.util import load_plbert\n",
"BERT_path = config.get('PLBERT_dir', False)\n",
"plbert = load_plbert(BERT_path)"
]
@@ -221,7 +221,7 @@
"metadata": {},
"outputs": [],
"source": [
"from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule"
"from styletts2.Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule"
]
},
{
16 changes: 8 additions & 8 deletions Demo/Inference_LibriTTS.ipynb
@@ -67,9 +67,9 @@
"import librosa\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"from models import *\n",
"from utils import *\n",
"from text_utils import TextCleaner\n",
"from styletts2.models import *\n",
"from styletts2.utils import *\n",
"from styletts2.text_utils import TextCleaner\n",
"textclenaer = TextCleaner()\n",
"\n",
"%matplotlib inline"
@@ -160,7 +160,7 @@
"pitch_extractor = load_F0_models(F0_path)\n",
"\n",
"# load BERT model\n",
"from Utils.PLBERT.util import load_plbert\n",
"from styletts2.Utils.PLBERT.util import load_plbert\n",
"BERT_path = config.get('PLBERT_dir', False)\n",
"plbert = load_plbert(BERT_path)"
]
@@ -222,7 +222,7 @@
"metadata": {},
"outputs": [],
"source": [
"from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule"
"from styletts2.Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule"
]
},
{
@@ -1133,9 +1133,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "NLP",
"display_name": "Python 3",
"language": "python",
"name": "nlp"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -1147,7 +1147,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
"version": "3.11.5"
}
},
"nbformat": 4,
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -0,0 +1 @@
recursive-include styletts2 *
34 changes: 34 additions & 0 deletions README.md
@@ -88,6 +88,40 @@ Please make sure you have the LibriTTS checkpoint downloaded and unzipped under
- **Out of memory after `joint_epoch`**: This is likely because your GPU RAM is not big enough for the SLM adversarial training run. You may skip it, but the quality could be worse. Setting `joint_epoch` to a number larger than `epochs` skips the SLM adversarial training (see the sketch below).
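
A minimal, hypothetical sketch of that workaround (the config path and loading via `yaml` are assumptions; the `epochs` and `joint_epoch` field names come from the tip above):

```python
# Hypothetical sketch (not part of the training scripts): set `joint_epoch` above
# `epochs` in the training config so the SLM adversarial stage is never reached.
# The config path is an assumption; point it at the config you actually train with.
import yaml

config_path = "Configs/config_ft.yml"

with open(config_path) as f:
    cfg = yaml.safe_load(f)

cfg["joint_epoch"] = cfg["epochs"] + 1  # joint_epoch > epochs, so the SLM stage is skipped

with open(config_path, "w") as f:
    yaml.safe_dump(cfg, f)
```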

## Inference

Quick start example:

```python
from styletts2 import TTS
import sounddevice as sd
import phonemizer

tts = TTS.load_model(
config_path="hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/config.yml",
checkpoint_path="hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth"
)

es_phonemizer = phonemizer.backend.EspeakBackend(
language='en-us',
preserve_punctuation=True,
with_stress=True
)

style = tts.compute_style('path/to/reference_speaker.wav')  # placeholder path: any short clip of the target voice

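# alpha/beta blend the reference style with the style predicted from the text
# (higher values follow the text more and sound less like the reference);
# diffusion_steps trades quality for speed; embedding_scale is the
# classifier-free guidance scale (higher tends to sound more expressive).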
wav, _ = tts.inference(
"This is a text! Hello world! How are you? What's your name?",
style,
phonemizer=es_phonemizer,
alpha=0.3,
beta=0.7,
diffusion_steps=10,
embedding_scale=2)

sd.play(wav, 24000)
sd.wait()
```
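
If you would rather save the result than play it back, the waveform can be written to disk with `scipy` (already listed in `install_requires`). A minimal sketch, assuming `wav` is the float array returned above and 24 kHz is the output sampling rate (the `output.wav` name is arbitrary):

```python
import numpy as np
from scipy.io import wavfile

# Save the synthesized waveform produced by the quick-start snippet above.
wavfile.write("output.wav", 24000, wav.astype(np.float32))
```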

Please refer to [Inference_LJSpeech.ipynb](https://github.com/yl4579/StyleTTS2/blob/main/Demo/Inference_LJSpeech.ipynb) (single-speaker) and [Inference_LibriTTS.ipynb](https://github.com/yl4579/StyleTTS2/blob/main/Demo/Inference_LibriTTS.ipynb) (multi-speaker) for details. For LibriTTS, you will also need to download [reference_audio.zip](https://huggingface.co/yl4579/StyleTTS2-LibriTTS/resolve/main/reference_audio.zip) and unzip it under the `Demo` folder before running the demo.

- The StyleTTS 2 model pretrained on the LJSpeech corpus (24 kHz) can be downloaded at [https://huggingface.co/yl4579/StyleTTS2-LJSpeech/tree/main](https://huggingface.co/yl4579/StyleTTS2-LJSpeech/tree/main).
22 changes: 22 additions & 0 deletions setup.py
@@ -0,0 +1,22 @@
from setuptools import setup, find_packages

setup(
name="styletts2",
version="0.0.1",
packages=find_packages(),
include_package_data=True,
install_requires=[
"cached_path",
"nltk",
"scipy",
"numpy",
"munch",
"librosa",
"sounddevice",
"einops",
"einops_exts",
"transformers",
"matplotlib",
"monotonic_align @ git+https://github.com/resemble-ai/monotonic_align.git",
]
)
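
With this `setup.py` and the `MANIFEST.in` above in place, a checkout of the repository can be installed with `pip install .` (or `pip install -e .` for development), which is what makes the `from styletts2 import TTS` import in the README example resolvable.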