
Commit

Merge branch 'next'
Jeronymous committed Mar 22, 2024
2 parents 399c98d + 122964c commit c1f0ada
Showing 10 changed files with 51 additions and 234 deletions.
1 change: 1 addition & 0 deletions http_server/ingress.py
@@ -21,6 +21,7 @@
datefmt="%d/%m/%Y %H:%M:%S",
)
logger = logging.getLogger("__stt-standalone-worker__")
+ logger.setLevel(logging.INFO)

# If websocket streaming route is enabled
if os.environ.get("ENABLE_STREAMING", False) in [True, "true", 1]:
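Why the added logger.setLevel(logging.INFO) line matters: a named logger with no explicit level inherits the root logger's WARNING default, so logger.info(...) calls are silently dropped. A minimal sketch, not from the repository (the format string is illustrative; only the datefmt matches the snippet above):

    import logging

    logging.basicConfig(
        format="%(asctime)s %(levelname)s: %(message)s",  # illustrative format
        datefmt="%d/%m/%Y %H:%M:%S",                      # same datefmt as ingress.py
    )
    logger = logging.getLogger("__stt-standalone-worker__")

    logger.info("dropped")        # effective level is still WARNING, nothing is printed
    logger.setLevel(logging.INFO)
    logger.info("now visible")    # INFO records now reach the root handler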
6 changes: 3 additions & 3 deletions kaldi/Dockerfile
@@ -28,7 +28,7 @@ RUN git clone -b vosk --single-branch https://github.com/alphacep/kaldi /opt/kal
&& cd /opt/kaldi/tools \
&& sed -i 's:status=0:exit 0:g' extras/check_dependencies.sh \
&& sed -i 's:--enable-ngram-fsts:--enable-ngram-fsts --disable-bin:g' Makefile \
- && make -j $(nproc) openfst cub \
+ && make -j $(( $(nproc) < 8 ? $(nproc) : 8 )) openfst cub \
&& if [ "x$KALDI_MKL" != "x1" ] ; then \
extras/install_openblas_clapack.sh; \
else \
@@ -42,7 +42,7 @@ RUN git clone -b vosk --single-branch https://github.com/alphacep/kaldi /opt/kal
fi \
&& sed -i 's:-msse -msse2:-msse -msse2:g' kaldi.mk \
&& sed -i 's: -O1 : -O3 :g' kaldi.mk \
- && make -j $(nproc) online2 lm rnnlm
+ && make -j $(( $(nproc) < 8 ? $(nproc) : 8 )) online2 lm rnnlm

# Install python dependencies
COPY kaldi/requirements.txt ./
@@ -51,7 +51,7 @@ RUN pip install --no-cache-dir -r requirements.txt
# Install Custom Vosk API
RUN git clone --depth 1 https://github.com/alphacep/vosk-api /opt/vosk-api && cd /opt/vosk-api/python && \
cd /opt/vosk-api/src \
- && KALDI_MKL=$KALDI_MKL KALDI_ROOT=/opt/kaldi make -j $(nproc) \
+ && KALDI_MKL=$KALDI_MKL KALDI_ROOT=/opt/kaldi make -j $(( $(nproc) < 8 ? $(nproc) : 8 )) \
&& cd /opt/vosk-api/python \
&& python3 ./setup.py install

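For context (not part of the diff): the shell arithmetic $(( $(nproc) < 8 ? $(nproc) : 8 )) caps the number of parallel make jobs at 8, bounding peak memory use when building Kaldi on hosts with many cores. A hypothetical Python equivalent of the same calculation:

    import os

    def build_jobs(cap: int = 8) -> int:
        """Value for make -j: the CPU count, but never more than `cap` jobs."""
        return min(os.cpu_count() or 1, cap)

    print(build_jobs())  # e.g. 8 on a 16-core host, 4 on a 4-core host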
3 changes: 3 additions & 0 deletions kaldi/RELEASE.md
@@ -1,3 +1,6 @@
+ # 1.0.1
+ - Fix streaming mode (websocket) in linto-stt-kaldi
+
# 1.0.0
- First build of linto-stt-kaldi
- Based on 3.3.2 of linto-stt (https://github.com/linto-ai/linto-stt/blob/4361300a4463c90cec0bf3fa2975d7cc2ddf8d36/RELEASE.md)
2 changes: 0 additions & 2 deletions wait-for-it.sh
@@ -67,8 +67,6 @@ wait_for_wrapper()
return $WAITFORIT_RESULT
}

echo "NOCOMMIT wait-for-it $*"

# process arguments
while [[ $# -gt 0 ]]
do
4 changes: 2 additions & 2 deletions websocket/websocketserver.py
@@ -3,13 +3,13 @@

import websockets

- from stt.processing import model
+ from stt.processing import MODEL
from stt.processing.streaming import wssDecode


async def _fun_wrapper(ws):
"""Wrap wssDecode function to add STT Model reference"""
- return await wssDecode(ws, model)
+ return await wssDecode(ws, MODEL)


async def WSServer(port: int):
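The change above only renames the imported singleton (model to MODEL). For orientation, a hypothetical, self-contained sketch of the serving pattern this file relies on, assuming a websockets version that accepts one-argument handlers (as the one-argument _fun_wrapper suggests); none of this code is in the commit:

    import asyncio
    import websockets

    async def echo(ws):
        # Stand-in for _fun_wrapper: handle one client connection.
        async for message in ws:
            await ws.send(message)

    async def serve(port: int = 8001):
        async with websockets.serve(echo, "0.0.0.0", port):
            await asyncio.Future()  # run until cancelled

    if __name__ == "__main__":
        asyncio.run(serve())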
2 changes: 1 addition & 1 deletion whisper/Dockerfile.ctranslate2
@@ -1,4 +1,4 @@
- FROM ghcr.io/opennmt/ctranslate2:latest-ubuntu20.04-cuda11.2
+ FROM ghcr.io/opennmt/ctranslate2:latest-ubuntu20.04-cuda12.2
LABEL maintainer="[email protected], [email protected], [email protected]"

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ffmpeg git
7 changes: 6 additions & 1 deletion whisper/RELEASE.md
@@ -1,6 +1,11 @@
+ # 1.0.2
+ - ct2/faster_whisper: Upgrade faster_whisper and support recent distilled models
+ - ct2/faster_whisper: Fix possible gluing of different words together
+ - torch/whisper-timestamped: Upgrade whisper-timestamped and delegate model loading
+
# 1.0.1
- - support of model.safetensors
- ct2/faster_whisper: Information about used precision added in the logs
+ - torch/whisper-timestamped: support of model.safetensors

# 1.0.0
- First build of linto-stt-whisper
7 changes: 4 additions & 3 deletions whisper/requirements.ctranslate2.txt
@@ -7,9 +7,10 @@ gevent
gunicorn
lockfile
pyyaml>=5.4.1
+ regex
requests>=2.26.0
wavio>=0.0.4
websockets
- #faster_whisper==0.10.0
- # This is version faster_whisper==0.9.0 + prompt propagation + fix for large-v3
- git+https://github.com/linto-ai/faster-whisper.git@aad9e7508b528e79be2a9975ac79ef8317f02a6d
+ #faster_whisper==1.0.1
+ # This is version faster_whisper==1.0.1 + option for (persistent) prompt + fix for large-v3
+ git+https://github.com/linto-ai/faster-whisper.git
20 changes: 13 additions & 7 deletions whisper/stt/processing/decoding.py
@@ -1,6 +1,7 @@
import copy
import os
import time
+ import regex as re
from typing import Tuple, Union

import numpy as np
@@ -27,7 +28,7 @@
default_best_of = None
default_temperature = 0.0

- default_initial_prompt = os.environ.get("PROMPT", None)
+ default_prompt = os.environ.get("PROMPT", None)


def decode(
@@ -42,7 +43,7 @@ def decode(
condition_on_previous_text: bool = False,
no_speech_threshold: float = 0.6,
compression_ratio_threshold: float = 2.4,
- initial_prompt: str = default_initial_prompt,
+ prompt: str = default_prompt,
) -> dict:
if language is None:
language = get_language()
@@ -111,7 +112,7 @@ def decode_torch(
no_speech_threshold,
compression_ratio_threshold,
normalize_text_as_words=False,
- initial_prompt=None,
+ prompt=None,
):
"""Transcribe the audio data using Whisper with the defined model."""

@@ -127,7 +128,7 @@ def decode_torch(
no_speech_threshold=no_speech_threshold,
compression_ratio_threshold=compression_ratio_threshold,
vad=USE_VAD,
- initial_prompt=initial_prompt,
+ initial_prompt=prompt,
)

if alignment_model is None:
@@ -319,20 +320,22 @@ def checked_timestamps(start, end=None):
if segment.words:
for word in segment.words:
start, end = checked_timestamps(word.start, word.end)
- word_strip = word.word.strip()
+ word_string = word.word
+ word_strip = word_string.lstrip()
if (
glue_punctuations
and len(words)
and len(word_strip) > 1
and word_strip[0] in glue_punctuations
+ and (word_strip == word_string or not contains_alphanum(words[-1]["text"]) or not contains_alphanum(word_strip))
):
words[-1]["text"] += word.word.lstrip()
words[-1]["text"] += word_strip
words[-1]["confidence"].append(word.probability)
words[-1]["end"] = max(words[-1]["end"], end)
continue
words.append(
{
"text": word.word,
"text": word_string,
"confidence": [word.probability],
"start": start,
"end": end,
@@ -364,3 +367,6 @@ def checked_timestamps(start, end=None):
return format_whisper_timestamped_response(
transcription, remove_punctuation_from_words=remove_punctuation_from_words
)

+ def contains_alphanum(text: str) -> bool:
+     return re.search(r"[^\W\'\-_]", text)
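An illustration (not part of the commit) of what the new contains_alphanum helper detects: the pattern r"[^\W\'\-_]" matches any letter or digit while ignoring apostrophes, hyphens, and underscores, so purely punctuational tokens are the ones eligible for gluing onto the previous word. A small self-contained variant that returns a strict bool:

    import regex as re  # the diff adds "import regex as re" at the top of decoding.py

    def contains_alphanum(text: str) -> bool:
        # True if `text` contains at least one letter or digit;
        # apostrophes, hyphens and underscores do not count.
        return re.search(r"[^\W\'\-_]", text) is not None

    assert contains_alphanum("word")       # ordinary word
    assert contains_alphanum("l'autre")    # letters around an apostrophe
    assert not contains_alphanum("...")    # punctuation only: may be glued
    assert not contains_alphanum("'-_")    # excluded characters alone do not count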
(The diff for the remaining changed file is not shown.)
