fix initial structure and update deps

osl-incubator · Mar 11, 2024 · d8c14e5 · d8c14e5
1 parent ffdf895
commit d8c14e5
Show file tree

Hide file tree

Showing 7 changed files with 1,261 additions and 1,127 deletions.
diff --git a/README.md b/README.md
@@ -38,9 +38,9 @@ specify [`gtts`](https://github.com/pndurette/gTTS) with the flag
 
 ```bash
 $ echo "Are you ready to join Link and Zelda in fighting off this unprecedented threat to Hyrule?" > /tmp/artbox/text.md
-$ artbox speech text-to-speech \
+$ artbox speech from-text \
     --title artbox \
-    --text-path /tmp/artbox/text.md \
+    --input-path /tmp/artbox/text.md \
     --output-path /tmp/artbox/speech.mp3 \
     --engine edge-tts
 ```
@@ -50,9 +50,9 @@ If you need to generate the audio for different language, you can use the flag
 
 ```bash
 $ echo "Bom dia, mundo!" > /tmp/artbox/text.md
-$ artbox speech text-to-speech \
+$ artbox speech from-text \
     --title artbox \
-    --text-path /tmp/artbox/text.md \
+    --input-path /tmp/artbox/text.md \
     --output-path /tmp/artbox/speech.mp3 \
     --lang pt
 ```
@@ -62,9 +62,9 @@ locale for that language, for example:
 
 ```bash
 $ echo "Are you ready to join Link and Zelda in fighting off this unprecedented threat to Hyrule?" > /tmp/artbox/text.md
-$ artbox speech text-to-speech \
+$ artbox speech from-text \
     --title artbox \
-    --text-path /tmp/artbox/text.md \
+    --input-path /tmp/artbox/text.md \
     --output-path /tmp/artbox/speech.mp3 \
     --engine edge-tts \
     --lang en-IN
@@ -75,9 +75,9 @@ and `--pitch`, for example:
 
 ```bash
 $ echo "Do you want some coffee?" > /tmp/artbox/text.md
-$ artbox speech text-to-speech \
+$ artbox speech from-text \
     --title artbox \
-    --text-path /tmp/artbox/text.md \
+    --input-path /tmp/artbox/text.md \
     --output-path /tmp/artbox/speech.mp3 \
     --engine edge-tts \
     --lang en \
@@ -86,6 +86,31 @@ $ artbox speech text-to-speech \
     --pitch -5Hz
 ```
 
+### Convert audio to text
+
+ArtBox uses `speechrecognition` to convert from audio to text. Currently, ArtBox
+only supports the `google` engine.
+
+For this example, let's first create our audio:
+
+```bash
+$ echo "Are you ready to join Link and Zelda in fighting off this unprecedented threat to Hyrule?" > /tmp/artbox/text.md
+$ artbox speech from-text \
+    --title artbox \
+    --input-path /tmp/artbox/text.md \
+    --output-path /tmp/artbox/speech.mp3 \
+    --engine edge-tts
+```
+
+Now we can convert it back to text:
+
+```bash
+$ artbox speech to-text \
+    --input-path /tmp/artbox/speech.mp3 \
+    --output-path /tmp/artbox/text-from-speech.md \
+    --lang en
+```
+
 ### Download a youtube video
 
 If you want to download videos from the youtube, you can use the following

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -31,7 +31,7 @@ gtts = ">=2.3.2"
 edge-tts = ">=6.1.8"
 numpy = ">=1.20"
 typer = ">=0.9.0"
-pytubefix = ">=1.13.3"
+pytubefix = ">=2"
 speechrecognition = ">=3.10"
 vosk = ">=0.3.45"
 google-cloud-speech = ">=2.24.1"
@@ -117,5 +117,6 @@ module = [
   "pydub",
   "pydub.generators",
   "pytubefix",
+  "speech_recognition",
 ]
 ignore_missing_imports = true
diff --git a/src/artbox/cli.py b/src/artbox/cli.py
@@ -5,7 +5,7 @@
 
 from artbox import __version__
 from artbox.sounds import Sound
-from artbox.speech import Speech
+from artbox.speech import SpeechFromText, SpeechToText
 from artbox.videos import Video, Youtube
 
 app = typer.Typer(
@@ -27,7 +27,7 @@
     help="Video processing commands for Artbox.",
     short_help="Video processing commands.",
 )
-app_voice = typer.Typer(
+app_speech = typer.Typer(
     name="speech",
     help="Speech processing commands for Artbox.",
     short_help="Speech processing commands.",
@@ -40,7 +40,7 @@
 
 app.add_typer(app_sound, name="sound")
 app.add_typer(app_video, name="video")
-app.add_typer(app_voice, name="speech")
+app.add_typer(app_speech, name="speech")
 app.add_typer(app_youtube, name="youtube")
 
 
@@ -65,14 +65,16 @@ def main(
         raise typer.Exit(0)
 
 
-@app_voice.command("text-to-speech")
-def voice_text_to_speech(
+@app_speech.command("from-text")
+def speech_from_text(
     title: Annotated[
         str, typer.Option("--title", help="Specify the name of the audio file")
     ] = "artbox",
-    text_path: Annotated[
+    input_path: Annotated[
         str,
-        typer.Option("--text-path", help="Specify the path of the text file"),
+        typer.Option(
+            "--input-path", help="Specify the path of the text file (txt)"
+        ),
     ] = "",
     output_path: Annotated[
         str,
@@ -109,7 +111,7 @@ def voice_text_to_speech(
     """Convert text to speech."""
     args_dict = {
         "title": title,
-        "text-path": text_path,
+        "input-path": input_path,
         "output-path": output_path,
         "engine": engine,
         "lang": lang,
@@ -118,8 +120,49 @@ def voice_text_to_speech(
         "pitch": pitch,
     }
 
-    runner = Speech(args_dict)
-    runner.text_to_speech()
+    runner = SpeechFromText(args_dict)
+    runner.convert()
+
+
+@app_speech.command("to-text")
+def speech_to_text(
+    input_path: Annotated[
+        str,
+        typer.Option(
+            "--input-path",
+            help="Specify the path of the audio file (mp3 or wav)",
+        ),
+    ] = "",
+    output_path: Annotated[
+        str,
+        typer.Option(
+            "--output-path", help="Specify the path to store the text file"
+        ),
+    ] = "",
+    engine: Annotated[
+        str,
+        typer.Option(
+            "--engine",
+            help="Choose the text-to-speech engine (Options: google)",
+        ),
+    ] = "google",
+    lang: Annotated[
+        str,
+        typer.Option(
+            "--lang", help="Choose the language for audio generation"
+        ),
+    ] = "en",
+) -> None:
+    """Convert speech to text."""
+    args_dict = {
+        "input-path": input_path,
+        "output-path": output_path,
+        "engine": engine,
+        "lang": lang,
+    }
+
+    runner = SpeechToText(args_dict)
+    runner.convert()
 
 
 @app_sound.command("notes-to-audio")

diff --git a/src/artbox/speech.py b/src/artbox/speech.py
@@ -7,7 +7,7 @@
 import os
 import random
 
-from abc import ABC, abstractmethod
+from abc import ABC
 from pathlib import Path
 
 import edge_tts
@@ -20,43 +20,54 @@
 from artbox.base import ArtBox
 
 
-class SpeechEngineBase(ArtBox, ABC):
+def convert_mp3_to_wav(input_path: str, output_path: str) -> None:
+    """Convert from mp3 to wav."""
+    sound = AudioSegment.from_mp3(input_path)
+    sound.export(output_path, format="wav")
+
+
+class Speech(ArtBox, ABC):
     """Set of methods for handing audio voices."""
 
-    @abstractmethod
-    def text_to_speech(self) -> None:
+
+class SpeechFromTextEngineBase(Speech):
+    """Set of methods for handling audio voices."""
+
+    def convert(self) -> None:
         """Convert text to audio speech."""
         ...
 
 
-class Speech(SpeechEngineBase):
+class SpeechFromText(Speech):
     """Speech class will run commands according to the selected engine."""
 
-    engine: SpeechEngineBase
+    engine: SpeechFromTextEngineBase
 
     def __init__(self, *args, **kwargs) -> None:
         """Initialize Speech class."""
         super().__init__(*args, **kwargs)
         engine = self.args.get("engine", "edge-tts")
 
         if engine == "edge-tts":
-            self.engine: SpeechEngineBase = SpeechEngineMSEdgeTTS(
+            self.engine: SpeechFromTextEngineBase = SpeechEngineMSEdgeTTS(
                 *args, **kwargs
             )
         elif engine == "gtts":
-            self.engine: SpeechEngineBase = SpeechEngineGTTS(*args, **kwargs)
+            self.engine: SpeechFromTextEngineBase = SpeechEngineGTTS(
+                *args, **kwargs
+            )
         else:
             raise Exception(f"Engine {engine} not found.")
 
-    def text_to_speech(self) -> None:
+    def convert(self) -> None:
         """Convert text to audio speech."""
-        return self.engine.text_to_speech()
+        return self.engine.convert()
 
 
-class SpeechEngineGTTS(SpeechEngineBase):
+class SpeechEngineGTTS(SpeechFromTextEngineBase):
     """Google-Text-To-Speech engine."""
 
-    def text_to_speech(self) -> None:
+    def convert(self) -> None:
         """Convert text to audio speech."""
         title: str = self.args.get("title", "")
         text_path: str = self.args.get("text-path", "")
@@ -75,13 +86,13 @@ def text_to_speech(self) -> None:
         tts.save(str(self.output_path))
 
 
-class SpeechEngineMSEdgeTTS(SpeechEngineBase):
+class SpeechEngineMSEdgeTTS(SpeechFromTextEngineBase):
     """Microsoft Edge Text-To-Speech engine."""
 
-    async def async_text_to_speech(self) -> None:
+    async def async_convert(self) -> None:
         """Convert text to audio speech in async mode."""
         title: str = self.args.get("title", "")
-        text_path: str = self.args.get("text-path", "")
+        text_path: str = self.args.get("input-path", "")
         lang: str = self.args.get("lang", "en")
         rate = self.args.get("rate", "+0%")
         volume = self.args.get("volume", "+0%")
@@ -102,7 +113,7 @@ async def async_text_to_speech(self) -> None:
 
         communicate = edge_tts.Communicate(
             text=text,
-            speech=random.choice(voice_options)["Name"],
+            voice=random.choice(voice_options)["Name"],
             rate=rate,
             volume=volume,
             pitch=pitch,
@@ -114,39 +125,52 @@ async def async_text_to_speech(self) -> None:
                 elif chunk["type"] == "WordBoundary":
                     print(f"WordBoundary: {chunk}")
 
-    def text_to_speech(self) -> None:
+    def convert(self) -> None:
         """Convert text to audio speech."""
         loop = asyncio.get_event_loop_policy().get_event_loop()
         try:
-            loop.run_until_complete(self.async_text_to_speech())
+            loop.run_until_complete(self.async_convert())
         finally:
             loop.close()
 
 
-def convert_mp3_to_wav(input_path: str, output_path: str) -> None:
-    sound = AudioSegment.from_mp3(input_path)
-    sound.export(output_path, format="wav")
+class SpeechToText(Speech):
+    """Speech to Text class."""
 
+    def convert(self) -> None:
+        """Recognize speech from an MP3 or WAV file, dispatching by extension."""
+        file_path: str = str(self.input_path)
+
+        if file_path.endswith("mp3"):
+            self.convert_from_mp3()
+            return
+
+        if file_path.endswith("wav"):
+            self.convert_from_wav()
+            return
+
+        raise Exception(
+            "The file format is not valid. Valid types are mp3 and wav."
+        )
 
-class SpeechToText(ArtBox):
     def convert_from_mp3(self) -> None:
         """Recognize speech from MP# using various engines options."""
         file_path: Path = self.input_path
 
         # Convert MP3 to WAV
         wav_path = str(file_path).replace(".mp3", ".wav")
-        convert_mp3_to_wav(file_path, wav_path)
+        convert_mp3_to_wav(str(file_path), wav_path)
 
-        self.input_path = wav_path
+        self.input_path = Path(wav_path)
         self.convert_from_wav()
 
         # Cleanup: Remove the WAV file
         os.remove(wav_path)
 
     def convert_from_wav(self) -> None:
         """Recognize speech from WAVE using various engines options."""
-        wav_path: str = self.input_path
-        output_path: str = self.output_path
+        wav_path: str = str(self.input_path)
+        output_path: str = str(self.output_path)
         language: str = self.args.get("lang", "en-US")
         engine: str = self.args.get("engine", "google")
 

diff --git a/tests/data/audios/speech.wav b/tests/data/audios/speech.wav