Skip to content

Commit

Permalink
fix initial structure and update deps
Browse files Browse the repository at this point in the history
  • Loading branch information
xmnlab committed Mar 11, 2024
1 parent ffdf895 commit d8c14e5
Show file tree
Hide file tree
Showing 7 changed files with 1,261 additions and 1,127 deletions.
41 changes: 33 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ specify [`gtts`](https://github.com/pndurette/gTTS) with the flag

```bash
$ echo "Are you ready to join Link and Zelda in fighting off this unprecedented threat to Hyrule?" > /tmp/artbox/text.md
$ artbox speech text-to-speech \
$ artbox speech from-text \
--title artbox \
--text-path /tmp/artbox/text.md \
--input-path /tmp/artbox/text.md \
--output-path /tmp/artbox/speech.mp3 \
--engine edge-tts
```
Expand All @@ -50,9 +50,9 @@ If you need to generate the audio for different language, you can use the flag

```bash
$ echo "Bom dia, mundo!" > /tmp/artbox/text.md
$ artbox speech text-to-speech \
$ artbox speech from-text \
--title artbox \
--text-path /tmp/artbox/text.md \
--input-path /tmp/artbox/text.md \
--output-path /tmp/artbox/speech.mp3 \
--lang pt
```
Expand All @@ -62,9 +62,9 @@ locale for that language, for example:

```bash
$ echo "Are you ready to join Link and Zelda in fighting off this unprecedented threat to Hyrule?" > /tmp/artbox/text.md
$ artbox speech text-to-speech \
$ artbox speech from-text \
--title artbox \
--text-path /tmp/artbox/text.md \
--input-path /tmp/artbox/text.md \
--output-path /tmp/artbox/speech.mp3 \
--engine edge-tts \
--lang en-IN
Expand All @@ -75,9 +75,9 @@ and `--pitch`, for example:

```bash
$ echo "Do you want some coffee?" > /tmp/artbox/text.md
$ artbox speech text-to-speech \
$ artbox speech from-text \
--title artbox \
--text-path /tmp/artbox/text.md \
--input-path /tmp/artbox/text.md \
--output-path /tmp/artbox/speech.mp3 \
--engine edge-tts \
--lang en \
Expand All @@ -86,6 +86,31 @@ $ artbox speech text-to-speech \
--pitch -5Hz
```

### Convert audio to text

ArtBox uses `speechrecognition` to convert audio to text. Currently, ArtBox
supports only the `google` engine.

For this example, let's first create our audio:

```bash
$ echo "Are you ready to join Link and Zelda in fighting off this unprecedented threat to Hyrule?" > /tmp/artbox/text.md
$ artbox speech from-text \
--title artbox \
--input-path /tmp/artbox/text.md \
--output-path /tmp/artbox/speech.mp3 \
--engine edge-tts
```

Now we can convert it back to text:

```bash
$ artbox speech to-text \
--input-path /tmp/artbox/speech.mp3 \
--output-path /tmp/artbox/text-from-speech.md \
--lang en
```

### Download a youtube video

If you want to download videos from YouTube, you can use the following
Expand Down
2,189 changes: 1,115 additions & 1,074 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ gtts = ">=2.3.2"
edge-tts = ">=6.1.8"
numpy = ">=1.20"
typer = ">=0.9.0"
pytubefix = ">=1.13.3"
pytubefix = ">=2"
speechrecognition = ">=3.10"
vosk = ">=0.3.45"
google-cloud-speech = ">=2.24.1"
Expand Down Expand Up @@ -117,5 +117,6 @@ module = [
"pydub",
"pydub.generators",
"pytubefix",
"speech_recognition",
]
ignore_missing_imports = true
63 changes: 53 additions & 10 deletions src/artbox/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from artbox import __version__
from artbox.sounds import Sound
from artbox.speech import Speech
from artbox.speech import SpeechFromText, SpeechToText
from artbox.videos import Video, Youtube

app = typer.Typer(
Expand All @@ -27,7 +27,7 @@
help="Video processing commands for Artbox.",
short_help="Video processing commands.",
)
app_voice = typer.Typer(
app_speech = typer.Typer(
name="speech",
help="Speech processing commands for Artbox.",
short_help="Speech processing commands.",
Expand All @@ -40,7 +40,7 @@

app.add_typer(app_sound, name="sound")
app.add_typer(app_video, name="video")
app.add_typer(app_voice, name="speech")
app.add_typer(app_speech, name="speech")
app.add_typer(app_youtube, name="youtube")


Expand All @@ -65,14 +65,16 @@ def main(
raise typer.Exit(0)


@app_voice.command("text-to-speech")
def voice_text_to_speech(
@app_speech.command("from-text")
def speech_from_text(
title: Annotated[
str, typer.Option("--title", help="Specify the name of the audio file")
] = "artbox",
text_path: Annotated[
input_path: Annotated[
str,
typer.Option("--text-path", help="Specify the path of the text file"),
typer.Option(
"--input-path", help="Specify the path of the text file (txt)"
),
] = "",
output_path: Annotated[
str,
Expand Down Expand Up @@ -109,7 +111,7 @@ def voice_text_to_speech(
"""Convert text to speech."""
args_dict = {
"title": title,
"text-path": text_path,
"input-path": input_path,
"output-path": output_path,
"engine": engine,
"lang": lang,
Expand All @@ -118,8 +120,49 @@ def voice_text_to_speech(
"pitch": pitch,
}

runner = Speech(args_dict)
runner.text_to_speech()
runner = SpeechFromText(args_dict)
runner.convert()


@app_speech.command("to-text")
def speech_to_text(
    input_path: Annotated[
        str,
        typer.Option(
            "--input-path",
            help="Specify the path of the audio file (mp3 or wav)",
        ),
    ] = "",
    output_path: Annotated[
        str,
        typer.Option(
            "--output-path", help="Specify the path to store the text file"
        ),
    ] = "",
    engine: Annotated[
        str,
        typer.Option(
            "--engine",
            help="Choose the speech-to-text engine (Options: google)",
        ),
    ] = "google",
    lang: Annotated[
        str,
        typer.Option(
            "--lang", help="Choose the language of the audio to transcribe"
        ),
    ] = "en",
) -> None:
    """Convert speech to text."""
    # NOTE: the docstring above doubles as the help text of the ``to-text``
    # command, so it must describe this direction (audio -> text).
    # The keys mirror the CLI flag names; the runner receives this dict
    # as its arguments.
    args_dict = {
        "input-path": input_path,
        "output-path": output_path,
        "engine": engine,
        "lang": lang,
    }

    runner = SpeechToText(args_dict)
    runner.convert()


@app_sound.command("notes-to-audio")
Expand Down
76 changes: 50 additions & 26 deletions src/artbox/speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import os
import random

from abc import ABC, abstractmethod
from abc import ABC
from pathlib import Path

import edge_tts
Expand All @@ -20,43 +20,54 @@
from artbox.base import ArtBox


class SpeechEngineBase(ArtBox, ABC):
def convert_mp3_to_wav(input_path: str, output_path: str) -> None:
    """Load the MP3 at ``input_path`` and export it as WAV.

    The WAV data is written to ``output_path``.
    """
    AudioSegment.from_mp3(input_path).export(output_path, format="wav")


class Speech(ArtBox, ABC):
    """Common base class for the speech converters in this module."""

@abstractmethod
def text_to_speech(self) -> None:

class SpeechFromTextEngineBase(Speech):
    """Base class for the text-to-speech engines.

    The engine classes in this module (``SpeechEngineGTTS`` and
    ``SpeechEngineMSEdgeTTS``) subclass it and override ``convert``.
    """

    def convert(self) -> None:
        """Convert text to audio speech."""
        # No-op placeholder; each engine provides the implementation.
        ...


class Speech(SpeechEngineBase):
class SpeechFromText(Speech):
"""Speech class will run commands according to the selected engine."""

engine: SpeechEngineBase
engine: SpeechFromTextEngineBase

def __init__(self, *args, **kwargs) -> None:
"""Initialize Speech class."""
super().__init__(*args, **kwargs)
engine = self.args.get("engine", "edge-tts")

if engine == "edge-tts":
self.engine: SpeechEngineBase = SpeechEngineMSEdgeTTS(
self.engine: SpeechFromTextEngineBase = SpeechEngineMSEdgeTTS(
*args, **kwargs
)
elif engine == "gtts":
self.engine: SpeechEngineBase = SpeechEngineGTTS(*args, **kwargs)
self.engine: SpeechFromTextEngineBase = SpeechEngineGTTS(
*args, **kwargs
)
else:
raise Exception(f"Engine {engine} not found.")

def text_to_speech(self) -> None:
def convert(self) -> None:
"""Convert text to audio speech."""
return self.engine.text_to_speech()
return self.engine.convert()


class SpeechEngineGTTS(SpeechEngineBase):
class SpeechEngineGTTS(SpeechFromTextEngineBase):
"""Google-Text-To-Speech engine."""

def text_to_speech(self) -> None:
def convert(self) -> None:
"""Convert text to audio speech."""
title: str = self.args.get("title", "")
text_path: str = self.args.get("text-path", "")
Expand All @@ -75,13 +86,13 @@ def text_to_speech(self) -> None:
tts.save(str(self.output_path))


class SpeechEngineMSEdgeTTS(SpeechEngineBase):
class SpeechEngineMSEdgeTTS(SpeechFromTextEngineBase):
"""Microsoft Edge Text-To-Speech engine."""

async def async_text_to_speech(self) -> None:
async def async_convert(self) -> None:
"""Convert text to audio speech in async mode."""
title: str = self.args.get("title", "")
text_path: str = self.args.get("text-path", "")
text_path: str = self.args.get("input-path", "")
lang: str = self.args.get("lang", "en")
rate = self.args.get("rate", "+0%")
volume = self.args.get("volume", "+0%")
Expand All @@ -102,7 +113,7 @@ async def async_text_to_speech(self) -> None:

communicate = edge_tts.Communicate(
text=text,
speech=random.choice(voice_options)["Name"],
voice=random.choice(voice_options)["Name"],
rate=rate,
volume=volume,
pitch=pitch,
Expand All @@ -114,39 +125,52 @@ async def async_text_to_speech(self) -> None:
elif chunk["type"] == "WordBoundary":
print(f"WordBoundary: {chunk}")

def text_to_speech(self) -> None:
def convert(self) -> None:
"""Convert text to audio speech."""
loop = asyncio.get_event_loop_policy().get_event_loop()
try:
loop.run_until_complete(self.async_text_to_speech())
loop.run_until_complete(self.async_convert())
finally:
loop.close()


def convert_mp3_to_wav(input_path: str, output_path: str) -> None:
sound = AudioSegment.from_mp3(input_path)
sound.export(output_path, format="wav")
class SpeechToText(Speech):
"""Speech to Text class."""

def convert(self) -> None:
    """Recognize speech from an MP3 or WAV file.

    Dispatches on the extension of ``self.input_path``: ``.mp3`` files
    go through ``convert_from_mp3`` and ``.wav`` files through
    ``convert_from_wav``.

    Raises
    ------
    Exception
        If the input file extension is neither ``.mp3`` nor ``.wav``.
    """
    # Compare the real file extension instead of the raw string tail, so
    # a name such as "notesmp3" is rejected rather than being routed to
    # the MP3 path. The match is kept case-sensitive on purpose: the MP3
    # path derives the temporary WAV name by replacing the literal
    # ".mp3", so only that exact spelling is safe to accept here.
    suffix = Path(str(self.input_path)).suffix

    if suffix == ".mp3":
        self.convert_from_mp3()
        return

    if suffix == ".wav":
        self.convert_from_wav()
        return

    raise Exception(
        "The file format is not valid. Valid types are mp3 and wav."
    )

class SpeechToText(ArtBox):
def convert_from_mp3(self) -> None:
"""Recognize speech from MP# using various engines options."""
file_path: Path = self.input_path

# Convert MP3 to WAV
wav_path = str(file_path).replace(".mp3", ".wav")
convert_mp3_to_wav(file_path, wav_path)
convert_mp3_to_wav(str(file_path), wav_path)

self.input_path = wav_path
self.input_path = Path(wav_path)
self.convert_from_wav()

# Cleanup: Remove the WAV file
os.remove(wav_path)

def convert_from_wav(self) -> None:
"""Recognize speech from WAVE using various engines options."""
wav_path: str = self.input_path
output_path: str = self.output_path
wav_path: str = str(self.input_path)
output_path: str = str(self.output_path)
language: str = self.args.get("lang", "en-US")
engine: str = self.args.get("engine", "google")

Expand Down
Binary file removed tests/data/audios/speech.wav
Binary file not shown.
Loading

0 comments on commit d8c14e5

Please sign in to comment.