
Commit

Merge branch 'next'
Jeronymous committed Mar 22, 2024
2 parents 399c98d + 122964c commit c1f0ada
Showing 10 changed files with 51 additions and 234 deletions.
1 change: 1 addition & 0 deletions http_server/ingress.py
@@ -21,6 +21,7 @@
datefmt="%d/%m/%Y %H:%M:%S",
)
logger = logging.getLogger("__stt-standalone-worker__")
+ logger.setLevel(logging.INFO)

# If websocket streaming route is enabled
if os.environ.get("ENABLE_STREAMING", False) in [True, "true", 1]:
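Why the added logger.setLevel(logging.INFO) line matters: a named logger with no explicit level inherits the root logger's WARNING default, so logger.info(...) calls are silently dropped. A minimal sketch, not from the repository (the format string is illustrative; only the datefmt matches the snippet above):

    import logging

    logging.basicConfig(
        format="%(asctime)s %(levelname)s: %(message)s",  # illustrative format
        datefmt="%d/%m/%Y %H:%M:%S",                      # same datefmt as ingress.py
    )
    logger = logging.getLogger("__stt-standalone-worker__")

    logger.info("dropped")        # effective level is still WARNING, nothing is printed
    logger.setLevel(logging.INFO)
    logger.info("now visible")    # INFO records now reach the root handler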
6 changes: 3 additions & 3 deletions kaldi/Dockerfile
@@ -28,7 +28,7 @@ RUN git clone -b vosk --single-branch https://github.com/alphacep/kaldi /opt/kal
&& cd /opt/kaldi/tools \
&& sed -i 's:status=0:exit 0:g' extras/check_dependencies.sh \
&& sed -i 's:--enable-ngram-fsts:--enable-ngram-fsts --disable-bin:g' Makefile \
- && make -j $(nproc) openfst cub \
+ && make -j $(( $(nproc) < 8 ? $(nproc) : 8 )) openfst cub \
&& if [ "x$KALDI_MKL" != "x1" ] ; then \
extras/install_openblas_clapack.sh; \
else \
@@ -42,7 +42,7 @@ RUN git clone -b vosk --single-branch https://github.com/alphacep/kaldi /opt/kal
fi \
&& sed -i 's:-msse -msse2:-msse -msse2:g' kaldi.mk \
&& sed -i 's: -O1 : -O3 :g' kaldi.mk \
- && make -j $(nproc) online2 lm rnnlm
+ && make -j $(( $(nproc) < 8 ? $(nproc) : 8 )) online2 lm rnnlm

# Install python dependencies
COPY kaldi/requirements.txt ./
@@ -51,7 +51,7 @@ RUN pip install --no-cache-dir -r requirements.txt
# Install Custom Vosk API
RUN git clone --depth 1 https://github.com/alphacep/vosk-api /opt/vosk-api && cd /opt/vosk-api/python && \
cd /opt/vosk-api/src \
- && KALDI_MKL=$KALDI_MKL KALDI_ROOT=/opt/kaldi make -j $(nproc) \
+ && KALDI_MKL=$KALDI_MKL KALDI_ROOT=/opt/kaldi make -j $(( $(nproc) < 8 ? $(nproc) : 8 )) \
&& cd /opt/vosk-api/python \
&& python3 ./setup.py install

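For context (not part of the diff): the shell arithmetic $(( $(nproc) < 8 ? $(nproc) : 8 )) caps the number of parallel make jobs at 8, bounding peak memory use when building Kaldi on hosts with many cores. A hypothetical Python equivalent of the same calculation:

    import os

    def build_jobs(cap: int = 8) -> int:
        """Value for make -j: the CPU count, but never more than `cap` jobs."""
        return min(os.cpu_count() or 1, cap)

    print(build_jobs())  # e.g. 8 on a 16-core host, 4 on a 4-core host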
3 changes: 3 additions & 0 deletions kaldi/RELEASE.md
@@ -1,3 +1,6 @@
+ # 1.0.1
+ - Fix streaming mode (websocket) in linto-stt-kaldi
+
# 1.0.0
- First build of linto-stt-kaldi
- Based on 3.3.2 of linto-stt (https://github.com/linto-ai/linto-stt/blob/4361300a4463c90cec0bf3fa2975d7cc2ddf8d36/RELEASE.md)
2 changes: 0 additions & 2 deletions wait-for-it.sh
@@ -67,8 +67,6 @@ wait_for_wrapper()
return $WAITFORIT_RESULT
}

echo "NOCOMMIT wait-for-it $*"

# process arguments
while [[ $# -gt 0 ]]
do
4 changes: 2 additions & 2 deletions websocket/websocketserver.py
@@ -3,13 +3,13 @@

import websockets

- from stt.processing import model
+ from stt.processing import MODEL
from stt.processing.streaming import wssDecode


async def _fun_wrapper(ws):
"""Wrap wssDecode function to add STT Model reference"""
- return await wssDecode(ws, model)
+ return await wssDecode(ws, MODEL)


async def WSServer(port: int):
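The change above only renames the imported singleton (model to MODEL). For orientation, a hypothetical, self-contained sketch of the serving pattern this file relies on, assuming a websockets version that accepts one-argument handlers (as the one-argument _fun_wrapper suggests); none of this code is in the commit:

    import asyncio
    import websockets

    async def echo(ws):
        # Stand-in for _fun_wrapper: handle one client connection.
        async for message in ws:
            await ws.send(message)

    async def serve(port: int = 8001):
        async with websockets.serve(echo, "0.0.0.0", port):
            await asyncio.Future()  # run until cancelled

    if __name__ == "__main__":
        asyncio.run(serve())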
2 changes: 1 addition & 1 deletion whisper/Dockerfile.ctranslate2
@@ -1,4 +1,4 @@
- FROM ghcr.io/opennmt/ctranslate2:latest-ubuntu20.04-cuda11.2
+ FROM ghcr.io/opennmt/ctranslate2:latest-ubuntu20.04-cuda12.2
LABEL maintainer="[email protected], [email protected], [email protected]"

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ffmpeg git
7 changes: 6 additions & 1 deletion whisper/RELEASE.md
@@ -1,6 +1,11 @@
+ # 1.0.2
+ - ct2/faster_whisper: Upgrade faster_whisper and support recent distilled models
+ - ct2/faster_whisper: Fix possible gluing of different words together
+ - torch/whisper-timestamped: Upgrade whisper-timestamped and delegate model loading
+
# 1.0.1
- - support of model.safetensors
- ct2/faster_whisper: Information about used precision added in the logs
+ - torch/whisper-timestamped: support of model.safetensors

# 1.0.0
- First build of linto-stt-whisper
7 changes: 4 additions & 3 deletions whisper/requirements.ctranslate2.txt
@@ -7,9 +7,10 @@ gevent
gunicorn
lockfile
pyyaml>=5.4.1
+ regex
requests>=2.26.0
wavio>=0.0.4
websockets
- #faster_whisper==0.10.0
- # This is version faster_whisper==0.9.0 + prompt propagation + fix for large-v3
- git+https://github.com/linto-ai/faster-whisper.git@aad9e7508b528e79be2a9975ac79ef8317f02a6d
+ #faster_whisper==1.0.1
+ # This is version faster_whisper==1.0.1 + option for (persistent) prompt + fix for large-v3
+ git+https://github.com/linto-ai/faster-whisper.git
20 changes: 13 additions & 7 deletions whisper/stt/processing/decoding.py
@@ -1,6 +1,7 @@
import copy
import os
import time
+ import regex as re
from typing import Tuple, Union

import numpy as np
@@ -27,7 +28,7 @@
default_best_of = None
default_temperature = 0.0

- default_initial_prompt = os.environ.get("PROMPT", None)
+ default_prompt = os.environ.get("PROMPT", None)


def decode(
@@ -42,7 +43,7 @@ def decode(
condition_on_previous_text: bool = False,
no_speech_threshold: float = 0.6,
compression_ratio_threshold: float = 2.4,
- initial_prompt: str = default_initial_prompt,
+ prompt: str = default_prompt,
) -> dict:
if language is None:
language = get_language()
@@ -111,7 +112,7 @@ def decode_torch(
no_speech_threshold,
compression_ratio_threshold,
normalize_text_as_words=False,
- initial_prompt=None,
+ prompt=None,
):
"""Transcribe the audio data using Whisper with the defined model."""

@@ -127,7 +128,7 @@ def decode_torch(
no_speech_threshold=no_speech_threshold,
compression_ratio_threshold=compression_ratio_threshold,
vad=USE_VAD,
- initial_prompt=initial_prompt,
+ initial_prompt=prompt,
)

if alignment_model is None:
@@ -319,20 +320,22 @@ def checked_timestamps(start, end=None):
if segment.words:
for word in segment.words:
start, end = checked_timestamps(word.start, word.end)
- word_strip = word.word.strip()
+ word_string = word.word
+ word_strip = word_string.lstrip()
if (
glue_punctuations
and len(words)
and len(word_strip) > 1
and word_strip[0] in glue_punctuations
+ and (word_strip == word_string or not contains_alphanum(words[-1]["text"]) or not contains_alphanum(word_strip))
):
words[-1]["text"] += word.word.lstrip()
words[-1]["text"] += word_strip
words[-1]["confidence"].append(word.probability)
words[-1]["end"] = max(words[-1]["end"], end)
continue
words.append(
{
"text": word.word,
"text": word_string,
"confidence": [word.probability],
"start": start,
"end": end,
@@ -364,3 +367,6 @@ def checked_timestamps(start, end=None):
return format_whisper_timestamped_response(
transcription, remove_punctuation_from_words=remove_punctuation_from_words
)

+ def contains_alphanum(text: str) -> bool:
+     return re.search(r"[^\W\'\-_]", text)
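An illustration (not part of the commit) of what the new contains_alphanum helper detects: the pattern r"[^\W\'\-_]" matches any letter or digit while ignoring apostrophes, hyphens, and underscores, so purely punctuational tokens are the ones eligible for gluing onto the previous word. A small self-contained variant that returns a strict bool:

    import regex as re  # the diff adds "import regex as re" at the top of decoding.py

    def contains_alphanum(text: str) -> bool:
        # True if `text` contains at least one letter or digit;
        # apostrophes, hyphens and underscores do not count.
        return re.search(r"[^\W\'\-_]", text) is not None

    assert contains_alphanum("word")       # ordinary word
    assert contains_alphanum("l'autre")    # letters around an apostrophe
    assert not contains_alphanum("...")    # punctuation only: may be glued
    assert not contains_alphanum("'-_")    # excluded characters alone do not count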
(The diff for the remaining changed file is not shown.)
