From 37711f24be3158c6df7f0f353def8fb8f41aa261 Mon Sep 17 00:00:00 2001
From: Ivan Ogasawara <ivan.ogasawara@gmail.com>
Date: Fri, 2 Feb 2024 14:40:59 -0400
Subject: [PATCH] feat: Extract CC from youtube video (#24)

---
 README.md            | 23 ++++++++++-----
 docs/index.md        | 16 +++++++++++
 poetry.lock          | 13 ++++-----
 pyproject.toml       |  4 +--
 src/artbox/cli.py    | 66 ++++++++++++++++++++++++++++++++++++++++----
 src/artbox/videos.py | 44 ++++++++++++++++++++++++++++-
 src/artbox/voices.py |  8 ++++--
 7 files changed, 149 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 41629bc..33f3300 100644
--- a/README.md
+++ b/README.md
@@ -70,6 +70,22 @@ $ artbox voice text-to-speech \
     --lang en-IN
 ```
 
+Additionally, if you are using edge-tts, you can specify `--rate`, `--volume`,
+and `--pitch`, for example:
+
+```bash
+$ echo "Do you want some coffee?" > /tmp/artbox/text.md
+$ artbox voice text-to-speech \
+    --title artbox \
+    --text-path /tmp/artbox/text.md \
+    --output-path /tmp/artbox/voice.mp3 \
+    --engine edge-tts \
+    --lang en \
+    --rate +10% \
+    --volume -10% \
+    --pitch -5Hz
+```
+
 ### Download a youtube video
 
 If you want to download videos from the youtube, you can use the following
@@ -152,10 +168,3 @@ If you want to use Python to play your audio files, you can install `playsound`:
 ```bash
 $ pip wheel --use-pep517 "playsound (==1.3.0)"
 ```
-
-## Troubleshoot
-
-After installing with `poetry install`:
-
-- Patch `pytube` (ref: https://github.com/pytube/pytube/issues/1773):
-  `sed -i 's/(r"^$\\w+\\W")/(r"^\\w+\\W")/' $CONDA_PREFIX/lib/python3.*/site-packages/pytube/cipher.py`
diff --git a/docs/index.md b/docs/index.md
index 41629bc..2b78971 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -70,6 +70,22 @@ $ artbox voice text-to-speech \
     --lang en-IN
 ```
 
+Additionally, if you are using edge-tts, you can specify `--rate`, `--volume`,
+and `--pitch`, for example:
+
+```bash
+$ echo "Do you want some coffee?" > /tmp/artbox/text.md
+$ artbox voice text-to-speech \
+    --title artbox \
+    --text-path /tmp/artbox/text.md \
+    --output-path /tmp/artbox/voice.mp3 \
+    --engine edge-tts \
+    --lang en \
+    --rate +10% \
+    --volume -10% \
+    --pitch -5Hz
+```
+
 ### Download a youtube video
 
 If you want to download videos from the youtube, you can use the following
diff --git a/poetry.lock b/poetry.lock
index 7f8b49e..5a97ec4 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -3718,14 +3718,14 @@ files = [
 Levenshtein = "0.23.0"
 
 [[package]]
-name = "pytube"
-version = "15.0.0"
-description = "Python 3 library for downloading YouTube Videos."
+name = "pytubefix"
+version = "1.13.3"
+description = "Python3 library for downloading YouTube Videos."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "pytube-15.0.0-py3-none-any.whl", hash = "sha256:07b9904749e213485780d7eb606e5e5b8e4341aa4dccf699160876da00e12d78"},
-    {file = "pytube-15.0.0.tar.gz", hash = "sha256:076052efe76f390dfa24b1194ff821d4e86c17d41cb5562f3a276a8bcbfc9d1d"},
+    {file = "pytubefix-1.13.3-py3-none-any.whl", hash = "sha256:66a3a7cc4035961aeab0174a03d196d5a90797f40956b00c88f0204a6df2c62c"},
+    {file = "pytubefix-1.13.3.tar.gz", hash = "sha256:e58528907e6afefc502e31175c5d7c72c85d2952e87d20ae05c84197750046f5"},
 ]
 
 [[package]]
@@ -3802,7 +3802,6 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
-    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
     {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
     {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
     {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -5125,4 +5124,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = ">3.8.1,<3.12"
-content-hash = "178f6645fbd1d29010d5ec1d834fc7e6a9af3105f0524773cfb595f6f734b1e3"
+content-hash = "507bab25cdfcdb01af5c0d2869dfc46cfd3276f760ce0f702600b20140bf7d24"
diff --git a/pyproject.toml b/pyproject.toml
index 2e88e21..7ebd99b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,6 @@ exclude = [
 
 [tool.poetry.dependencies]
 python = ">3.8.1,<3.12"
-pytube = ">=15.0.0"
 pycairo = ">=1.24.0"
 pygobject = ">=3.44.1"
 openai = ">=1"
@@ -32,6 +31,7 @@ gtts = ">=2.3.2"
 edge-tts = ">=6.1.8"
 numpy = ">=1.20"
 typer = ">=0.9.0"
+pytubefix = ">=1.13.3"
 
 [tool.poetry.group.dev.dependencies]
 pytest = ">=7.3.2"
@@ -113,6 +113,6 @@ module = [
   "noisereduce",
   "pydub",
   "pydub.generators",
-  "pytube",
+  "pytubefix",
 ]
 ignore_missing_imports = true
diff --git a/src/artbox/cli.py b/src/artbox/cli.py
index 247624d..26e40a3 100644
--- a/src/artbox/cli.py
+++ b/src/artbox/cli.py
@@ -66,7 +66,7 @@ def main(
 
 
 @app_voice.command("text-to-speech")
-def text_to_speech(
+def voice_text_to_speech(
     title: Annotated[
         str, typer.Option("--title", help="Specify the name of the audio file")
     ] = "artbox",
@@ -93,6 +93,18 @@ def text_to_speech(
             "--lang", help="Choose the language for audio generation"
         ),
     ] = "en",
+    rate: Annotated[
+        str,
+        typer.Option("--rate", help="Decrease/Increase the rate level"),
+    ] = "+0%",
+    volume: Annotated[
+        str,
+        typer.Option("--volume", help="Decrease/Increase the volume level"),
+    ] = "+0%",
+    pitch: Annotated[
+        str,
+        typer.Option("--pitch", help="Decrease/Increase the pitch level"),
+    ] = "+0Hz",
 ) -> None:
     """Convert text to speech."""
     args_dict = {
@@ -101,6 +113,9 @@ def text_to_speech(
         "output-path": output_path,
         "engine": engine,
         "lang": lang,
+        "rate": rate,
+        "volume": volume,
+        "pitch": pitch,
     }
 
     runner = Voice(args_dict)
@@ -108,7 +123,7 @@ def text_to_speech(
 
 
 @app_sound.command("notes-to-audio")
-def notes_to_audio(
+def sound_notes_to_audio(
     input_path: Annotated[
         str,
         typer.Option(
@@ -138,7 +153,7 @@ def notes_to_audio(
 
 
 @app_video.command("remove-audio")
-def remove_audio(
+def video_remove_audio(
     input_path: Annotated[
         str,
         typer.Option(
@@ -163,7 +178,7 @@ def remove_audio(
 
 
 @app_video.command("extract-audio")
-def extract_audio(
+def video_extract_audio(
     input_path: Annotated[
         str,
         typer.Option(
@@ -189,7 +204,7 @@ def extract_audio(
 
 
 @app_video.command("combine-video-and-audio")
-def combine_audio_and_video(
+def video_combine_audio_and_video(
     video_path: Annotated[
         str,
         typer.Option(
@@ -222,7 +237,7 @@ def combine_audio_and_video(
 
 
 @app_youtube.command("download")
-def download_youtube_video(
+def youtube_download(
     url: Annotated[
         str,
         typer.Option(
@@ -252,3 +267,42 @@ def download_youtube_video(
 
     runner = Youtube(args_dict)
     runner.download()
+
+
+@app_youtube.command("cc")
+def youtube_cc(
+    url: Annotated[
+        str,
+        typer.Option(
+            "--url", help="Specify the URL of the YouTube video to download"
+        ),
+    ] = "",
+    output_path: Annotated[
+        str,
+        typer.Option(
+            "--output-path",
+            help=(
+                "Specify the path to store the downloaded video file "
+                "(.srt, .txt)"
+            ),
+        ),
+    ] = "/tmp/cc.txt",
+    lang: Annotated[
+        str,
+        typer.Option("--lang", help="Set the CC language to be downloaded"),
+    ] = "en",
+    format: Annotated[
+        str,
+        typer.Option("--format", help="Set the CC format (srt, text)"),
+    ] = "text",
+) -> None:
+    """Download youtube video CC."""
+    args_dict = {
+        "url": url,
+        "output-path": output_path,
+        "lang": lang,
+        "format": format,
+    }
+
+    runner = Youtube(args_dict)
+    runner.download_captions()
diff --git a/src/artbox/videos.py b/src/artbox/videos.py
index f1f1b09..a89ffae 100644
--- a/src/artbox/videos.py
+++ b/src/artbox/videos.py
@@ -6,7 +6,7 @@
 from abc import abstractmethod
 
 from moviepy.editor import AudioFileClip, VideoFileClip
-from pytube import YouTube as PyYouTube
+from pytubefix import YouTube as PyYouTube
 
 from artbox.base import ArtBox
 
@@ -20,6 +20,27 @@ def download(self):
         ...
 
 
+def _convert_srt_to_plain_text(srt_text: str) -> str:
+    """
+    Convert SRT text to plain text by removing timestamps and formatting.
+
+    Parameters
+    ----------
+    srt_text (str): Caption content in SRT format.
+
+    Returns
+    -------
+    str: The caption lines only, stripped and joined with newlines.
+    """
+    plain_text = []
+    # Skip lines that are part of SRT formatting (timestamps, etc.)
+    for line in srt_text.split("\n"):
+        if line.strip() and not line.strip().isdigit() and "-->" not in line:
+            plain_text.append(line.strip())
+
+    return "\n".join(plain_text)
+
+
 class Youtube(DownloadBase):
     """Set of tools for handing videos."""
 
@@ -52,6 +73,27 @@ def download(self):
         except Exception as e:
             print(f"Failed to download video: {e}")
 
+    def download_captions(self):
+        """Download the closed captions of a YouTube video."""
+        video_url = self.args.get("url", "")
+        lang = self.args.get("lang", "en")
+        format = self.args.get("format", "text")
+
+        yt = PyYouTube(video_url)
+        caption = yt.captions.get_by_language_code(f"a.{lang}")
+
+        if not caption:
+            print(f"No captions found for language {lang}.")
+            return
+
+        # Save the captions to a file
+        cc = caption.generate_srt_captions()
+        with open(str(self.output_path), "w") as f:
+            if format == "text":
+                cc = _convert_srt_to_plain_text(cc)
+            f.write(cc)
+        print("Captions downloaded successfully.")
+
 
 class Video(ArtBox):
     """Set of tools for handing videos."""
diff --git a/src/artbox/voices.py b/src/artbox/voices.py
index c9ecd7e..420386f 100644
--- a/src/artbox/voices.py
+++ b/src/artbox/voices.py
@@ -79,6 +79,9 @@ async def async_text_to_speech(self) -> None:
         title: str = self.args.get("title", "")
         text_path: str = self.args.get("text-path", "")
         lang: str = self.args.get("lang", "en")
+        rate = self.args.get("rate", "+0%")
+        volume = self.args.get("volume", "+0%")
+        pitch = self.args.get("pitch", "+0Hz")
 
         if not title:
             raise Exception("Argument `title` not given")
@@ -96,8 +99,9 @@ async def async_text_to_speech(self) -> None:
         communicate = edge_tts.Communicate(
             text=text,
             voice=random.choice(voice_options)["Name"],
-            rate="+5%",
-            volume="+0%",
+            rate=rate,
+            volume=volume,
+            pitch=pitch,
         )
         with open(self.output_path, "wb") as file:
             async for chunk in communicate.stream():