Skip to content

Commit

Permalink
feat: detect language using lingua, instead of trusting LLM (#518)
Browse files Browse the repository at this point in the history
* feat: detect language using lingua, instead of trusting LLM

Signed-off-by: Frost Ming <[email protected]>

* fix: cache the detector

Signed-off-by: Frost Ming <[email protected]>

* fix comment

Signed-off-by: Frost Ming <[email protected]>

---------

Signed-off-by: Frost Ming <[email protected]>
  • Loading branch information
frostming authored May 22, 2024
1 parent 82c57cc commit c82be6a
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 22 deletions.
76 changes: 75 additions & 1 deletion pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ dependencies = [
"groq>=0.5.0",
"pyyaml>=6.0.1",
"langchain-community>=0.0.38",
# lingua ships neither wheels for Python 3.13 nor an sdist
"lingua-language-detector>=2.0.2; python_version < \"3.13\"",
]
license = {text = "MIT"}
dynamic = ["version", "optional-dependencies"]
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ langchain-community==0.2.0
langchain-core==0.2.0
langchain-text-splitters==0.2.0
langsmith==0.1.45
lingua-language-detector==2.0.2; python_version < "3.13"
markdown-it-py==3.0.0
marshmallow==3.20.1
mdurl==0.1.2
Expand Down
23 changes: 22 additions & 1 deletion xiaogpt/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@
import re
import socket
from http.cookies import SimpleCookie
from typing import AsyncIterator
from typing import TYPE_CHECKING, AsyncIterator
from urllib.parse import urlparse

from requests.utils import cookiejar_from_dict

if TYPE_CHECKING:
from lingua import LanguageDetector


### HELP FUNCTION ###
def parse_cookie_string(cookie_string):
Expand Down Expand Up @@ -69,3 +72,21 @@ def get_hostname() -> str:
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
s.connect(("8.8.8.8", 80))
return s.getsockname()[0]


def _get_detector() -> LanguageDetector | None:
    """Build a lingua language detector covering all spoken languages.

    lingua is imported lazily inside the function so that this module still
    loads when the package is not installed.

    Returns:
        A detector built from ``LanguageDetectorBuilder`` for all spoken
        languages, or ``None`` if lingua cannot be imported.
    """
    try:
        from lingua import LanguageDetectorBuilder
    except ImportError:
        # lingua is unavailable; callers must handle the missing detector.
        detector = None
    else:
        builder = LanguageDetectorBuilder.from_all_spoken_languages()
        detector = builder.build()
    return detector


# Built once at import time and reused by detect_language();
# None when lingua could not be imported.
_detector = _get_detector()


def detect_language(text: str) -> str:
    """Return the lowercase ISO 639-1 code of the language detected in *text*.

    Args:
        text: The text whose language should be detected.

    Returns:
        The lowercased ISO 639-1 name reported by the module-level detector,
        or ``"zh"`` (Chinese) when no detector is available or the detector
        returns no language for *text*.
    """
    fallback = "zh"
    if _detector is None:
        # The lingua detector is not available, so default to Chinese.
        return fallback
    detected = _detector.detect_language_of(text)
    if detected is None:
        # The detector could not decide on a language for this text.
        return fallback
    return detected.iso_code_639_1.name.lower()
26 changes: 6 additions & 20 deletions xiaogpt/xiaogpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,7 @@
Config,
)
from xiaogpt.tts import TTS, MiTTS, TetosTTS
from xiaogpt.utils import (
parse_cookie_string,
)
from xiaogpt.utils import detect_language, parse_cookie_string

EOF = object()

Expand Down Expand Up @@ -390,11 +388,6 @@ async def run_forever(self):
query = f"{query}{self.config.prompt}"
# some model can not detect the language code, so we need to add it

if self.config.tts != "mi": # mi only say Chinese
query += (
",并用本段话的language code作为开头,用|分隔,如:en-US|你好……"
)

if self.config.mute_xiaoai:
await self.stop_if_xiaoai_is_playing()
else:
Expand All @@ -420,18 +413,11 @@ async def run_forever(self):
await self.wakeup_xiaoai()

async def speak(self, text_stream: AsyncIterator[str]) -> None:
text = await text_stream.__anext__()
# See if the first part contains language code(e.g. en-US|Hello world)
lang, _, first_chunk = text.rpartition("|")
if len(lang) > 7:
# It is not a legal language code, discard it
lang, first_chunk = "", text

lang = (
matches[0]
if (matches := re.findall(r"([a-z]{2}-[A-Z]{2})", lang))
else "zh-CN"
)
first_chunk = await text_stream.__anext__()
# Detect the language from the first chunk
# Add suffix '-' because tetos expects it to exist when selecting voices
# however, the nation code is never used.
lang = detect_language(first_chunk) + "-"

async def gen(): # reconstruct the generator
yield first_chunk
Expand Down

0 comments on commit c82be6a

Please sign in to comment.