From a87a4bd03ae6ab0f8c3c19f6fe8d6ede476b259e Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Wed, 13 Sep 2023 14:21:42 +0000 Subject: [PATCH] Migrated all speech synthesis and transcription over to using the new icespeak package --- README.md | 6 +- queries/atm.py | 2 +- queries/bus.py | 9 +- queries/flights.py | 11 +- queries/gpt.py | 4 +- queries/ja.py | 2 +- queries/news.py | 2 +- queries/petrol.py | 2 +- queries/places.py | 7 +- queries/rand.py | 2 +- queries/schedules.py | 2 +- queries/special.py | 2 +- queries/stats.py | 2 +- queries/sunpos.py | 3 +- queries/time.py | 2 +- queries/unit.py | 2 +- queries/userinfo.py | 10 +- queries/userloc.py | 2 +- queries/util/__init__.py | 2 +- queries/weather.py | 10 +- queries/wiki.py | 2 +- queries/words.py | 2 +- queries/yulelads.py | 2 +- requirements.txt | 8 +- routes/api.py | 75 +- search.py | 18 +- speak.py | 269 ------- speech/README.md | 8 - speech/__init__.py | 259 ------- speech/trans/__init__.py | 1091 --------------------------- speech/trans/num.py | 707 ----------------- speech/voices/aws_polly.py | 171 ----- speech/voices/azure.py | 300 -------- speech/voices/google.py | 108 --- speech/voices/tiro.py | 111 --- tests/test_queries.py | 2 +- tests/test_speech.py | 1016 ------------------------- speech/voices/__init__.py => tts.py | 50 +- utility.py | 1 + 39 files changed, 110 insertions(+), 4174 deletions(-) delete mode 100755 speak.py delete mode 100644 speech/README.md delete mode 100755 speech/__init__.py delete mode 100644 speech/trans/__init__.py delete mode 100755 speech/trans/num.py delete mode 100755 speech/voices/aws_polly.py delete mode 100755 speech/voices/azure.py delete mode 100755 speech/voices/google.py delete mode 100755 speech/voices/tiro.py delete mode 100755 tests/test_speech.py rename speech/voices/__init__.py => tts.py (53%) mode change 100755 => 100644 diff --git a/README.md b/README.md index 39e5138a..f358edc6 100644 --- a/README.md +++ b/README.md @@ -182,8 +182,8 @@ in [`queries/examples`](queries/examples). ## Running Greynir -Once you have followed the installation and setup instructions above, change to the -Greynir repository and activate the virtual environment: +Once you have followed the installation and setup instructions above, change +to the Greynir repository and activate the virtual environment: ```bash cd Greynir @@ -265,7 +265,7 @@ GNU GPLv3 or other compatible licenses. ## Acknowledgements -Greynir uses the official BÍN ([Beygingarlýsing íslensks nútímamáls](https://bin.arnastofnun.is)) +Greynir uses the BÍN ([Beygingarlýsing íslensks nútímamáls](https://bin.arnastofnun.is)) lexicon and database of Icelandic word forms to identify words and find their potential meanings and lemmas. The database is included in [BinPackage](https://github.com/mideind/BinPackage) in compressed form. diff --git a/queries/atm.py b/queries/atm.py index 84ffb60a..9abc41d1 100644 --- a/queries/atm.py +++ b/queries/atm.py @@ -43,7 +43,7 @@ AnswerTuple, LatLonTuple, ) -from speech.trans.num import number_to_text +from icespeak.transcribe.num import number_to_text from utility import QUERIES_RESOURCES_DIR # TODO: fetch ATM data from a web service instead of a local file every once in a while diff --git a/queries/bus.py b/queries/bus.py index 141783c4..f1058cc6 100755 --- a/queries/bus.py +++ b/queries/bus.py @@ -64,7 +64,8 @@ gen_answer, read_grammar_file, ) -from speech.trans import gssml, strip_markup +from icespeak import gssml +from icespeak.transcribe import strip_markup from settings import Settings from geo import in_iceland @@ -706,7 +707,11 @@ def assemble(x: Iterable[str]) -> str: voice_answer = assemble(va) return dict(answer=answer), answer, voice_answer -_ROUTE_SORT: Callable[[str], int] = lambda num: int("".join(i for i in num if i.isdecimal())) + +_ROUTE_SORT: Callable[[str], int] = lambda num: int( + "".join(i for i in num if i.isdecimal()) +) + def query_which_route(query: Query, result: Result): """Which routes stop at a given bus stop""" diff --git a/queries/flights.py b/queries/flights.py index bb6628b9..01522669 100755 --- a/queries/flights.py +++ b/queries/flights.py @@ -38,7 +38,7 @@ from queries.util import query_json_api, is_plural, read_grammar_file from tree import Result, Node from settings import changedlocale -from speech.trans import gssml +from icespeak import gssml from reynir import NounPhrase from geo import capitalize_placename, iceprep_for_placename, icelandic_city_name @@ -140,6 +140,7 @@ def QFlightsDepLoc(node: Node, params: QueryStateDict, result: Result) -> None: maxsize=2, ttl=_FLIGHTS_CACHE_TTL ) + # For type checking class FlightType(TypedDict, total=False): No: str @@ -219,7 +220,9 @@ def _filter_flight_data( """ flight_time: datetime flight: FlightType - now: datetime = datetime.now(timezone.utc) # Timezone aware datetime (don't change to datetime.utcnow()!) + now: datetime = datetime.now( + timezone.utc + ) # Timezone aware datetime (don't change to datetime.utcnow()!) matching_flights: FlightList = [] for flight in flights: @@ -388,7 +391,9 @@ def _process_result(result: Result) -> Dict[str, str]: from_date: datetime to_date: datetime - now = datetime.now(timezone.utc) # Timezone aware datetime, don't change to .utcnow()! + now = datetime.now( + timezone.utc + ) # Timezone aware datetime, don't change to .utcnow()! days: int = result.get("day_count", 5) # Check 5 days into future by default from_date = result.get("from_date", now) to_date = result.get("to_date", now + timedelta(days=days)) diff --git a/queries/gpt.py b/queries/gpt.py index 0278ed40..cbcc0471 100755 --- a/queries/gpt.py +++ b/queries/gpt.py @@ -54,8 +54,8 @@ from queries.currency import fetch_exchange_rates from queries.userloc import locality_and_country from settings import Settings -from speech.trans import gssml -from speech.trans.num import numbers_to_ordinal, years_to_text +from icespeak import gssml +from icespeak.transcribe.num import numbers_to_ordinal, years_to_text from queries.util.openai_gpt import ( OPENAI_KEY_PRESENT, jdump, diff --git a/queries/ja.py b/queries/ja.py index 5b0bfff1..79cebda6 100755 --- a/queries/ja.py +++ b/queries/ja.py @@ -36,7 +36,7 @@ from reynir.bindb import GreynirBin from queries.util import query_json_api, gen_answer, read_grammar_file -from speech.trans import gssml +from icespeak import gssml from queries import AnswerTuple, Query, QueryStateDict from tree import Result, Node diff --git a/queries/news.py b/queries/news.py index bd615091..19db96f6 100755 --- a/queries/news.py +++ b/queries/news.py @@ -34,7 +34,7 @@ import cachetools # type: ignore import random -from speech.trans import gssml +from icespeak import gssml from queries import Query, QueryStateDict, AnswerTuple from queries.util import gen_answer, query_json_api, read_grammar_file from tree import Result, Node diff --git a/queries/petrol.py b/queries/petrol.py index 163e81d1..d4d223ab 100755 --- a/queries/petrol.py +++ b/queries/petrol.py @@ -44,7 +44,7 @@ LatLonTuple, read_grammar_file, ) -from speech.trans import gssml +from icespeak import gssml _PETROL_QTYPE = "Petrol" diff --git a/queries/places.py b/queries/places.py index c5981476..7b122cd9 100755 --- a/queries/places.py +++ b/queries/places.py @@ -50,7 +50,7 @@ AnswerTuple, read_grammar_file, ) -from speech.trans import gssml +from icespeak import gssml from tree import Result, Node @@ -76,7 +76,10 @@ def help_text(lemma: str) -> str: one of the above lemmas is found in it""" return "Ég get svarað ef þú spyrð til dæmis: {0}?".format( random.choice( - ("Hvað er opið lengi á Forréttabarnum", "Hvenær lokar Bónus á Fiskislóð",) + ( + "Hvað er opið lengi á Forréttabarnum", + "Hvenær lokar Bónus á Fiskislóð", + ) ) ) diff --git a/queries/rand.py b/queries/rand.py index 3a8727a2..426b2d3c 100755 --- a/queries/rand.py +++ b/queries/rand.py @@ -31,7 +31,7 @@ from queries import Query, QueryStateDict, AnswerTuple from queries.util import gen_answer, read_grammar_file from queries.arithmetic import add_num, terminal_num -from speech.trans import gssml +from icespeak import gssml from tree import Result, Node diff --git a/queries/schedules.py b/queries/schedules.py index 25714f67..c54dd932 100755 --- a/queries/schedules.py +++ b/queries/schedules.py @@ -39,7 +39,7 @@ from tokenizer import split_into_sentences -from speech.trans import gssml +from icespeak import gssml from settings import changedlocale from queries.util import query_json_api, read_grammar_file diff --git a/queries/special.py b/queries/special.py index 6107bfbc..548e4796 100755 --- a/queries/special.py +++ b/queries/special.py @@ -37,7 +37,7 @@ from utility import icequote from queries import Query -from speech.trans import gssml +from icespeak import gssml # Type definitions AnswerEntry = Union[str, bool] diff --git a/queries/stats.py b/queries/stats.py index cc79e09f..39ccada4 100755 --- a/queries/stats.py +++ b/queries/stats.py @@ -35,7 +35,7 @@ from queries import Query from queries.util import gen_answer, natlang_seq, is_plural, iceformat_float from routes.people import top_persons -from speech.trans import gssml +from icespeak import gssml _STATS_QTYPE = "Stats" diff --git a/queries/sunpos.py b/queries/sunpos.py index d84c3247..c43f135c 100755 --- a/queries/sunpos.py +++ b/queries/sunpos.py @@ -58,7 +58,7 @@ ICE_PLACENAME_BLACKLIST, ) from iceaddr import placename_lookup -from speech.trans.num import numbers_to_ordinal, floats_to_text +from icespeak.transcribe.num import numbers_to_ordinal, floats_to_text # Indicate that this module wants to handle parse trees for queries, # as opposed to simple literal text strings @@ -495,7 +495,6 @@ def _answer_city_solar_data( def _get_answer(q: Query, result: Result) -> AnswerTuple: - qdate: datetime.date = result.get("date", datetime.date.today()) sun_pos: int = result.get("solar_position") diff --git a/queries/time.py b/queries/time.py index 4dde9d6c..fa1f8cd0 100755 --- a/queries/time.py +++ b/queries/time.py @@ -41,7 +41,7 @@ ) from queries import Query from queries.util import timezone4loc, gen_answer -from speech.trans import gssml +from icespeak import gssml from utility import icequote _TIME_QTYPE = "Time" diff --git a/queries/unit.py b/queries/unit.py index 4174bdee..7fa7ba98 100755 --- a/queries/unit.py +++ b/queries/unit.py @@ -37,7 +37,7 @@ from queries import Query, QueryStateDict, to_dative, to_accusative from queries.util import iceformat_float, parse_num, read_grammar_file, is_plural from tree import Result, Node -from speech.trans import gssml +from icespeak import gssml # Lemmas of keywords that could indicate that the user is trying to use this module TOPIC_LEMMAS = [ diff --git a/queries/userinfo.py b/queries/userinfo.py index ef1ad3b1..e7aa693d 100755 --- a/queries/userinfo.py +++ b/queries/userinfo.py @@ -35,7 +35,7 @@ from geo import icelandic_addr_info, iceprep_for_placename, iceprep_for_street from queries import ClientDataDict, Query from queries.util import gen_answer -from speech.trans.num import numbers_to_text +from icespeak.transcribe.num import numbers_to_text _USERINFO_QTYPE = "UserInfo" @@ -328,18 +328,18 @@ def _myaddris_handler(q: Query, ql: str) -> bool: return True -#def _whatsmynum_handler(q: Query, ql: str) -> bool: +# def _whatsmynum_handler(q: Query, ql: str) -> bool: # """Handle queries of the form "Hvað er símanúmerið mitt?""" # return False # -#_MY_PHONE_IS_REGEXES = ( +# _MY_PHONE_IS_REGEXES = ( # r"símanúmer mitt er (.+)$", # r"símanúmerið mitt er (.+)$", # r"ég er með símanúmer (.+)$", # r"ég er með símanúmerið (.+)$", -#) +# ) # -#_DUNNO_PHONE_NUM = "Ég veit ekki hvert símanúmer þitt er, en þú getur sagt mér það." +# _DUNNO_PHONE_NUM = "Ég veit ekki hvert símanúmer þitt er, en þú getur sagt mér það." _DEVICE_TYPE_QUERIES = frozenset( diff --git a/queries/userloc.py b/queries/userloc.py index 35340d49..54c19e8e 100755 --- a/queries/userloc.py +++ b/queries/userloc.py @@ -37,7 +37,7 @@ nom2dat, read_grammar_file, ) -from speech.trans.num import numbers_to_text +from icespeak.transcribe.num import numbers_to_text from tree import Result, Node from iceaddr import iceaddr_lookup, postcodes from geo import ( diff --git a/queries/util/__init__.py b/queries/util/__init__.py index 9c412206..12f1f5f7 100755 --- a/queries/util/__init__.py +++ b/queries/util/__init__.py @@ -48,7 +48,7 @@ from pytz import country_timezones from geo import country_name_for_isocode, iceprep_for_cc, LatLonTuple -from speech.trans.num import number_to_text, float_to_text +from icespeak.transcribe.num import number_to_text, float_to_text from reynir import NounPhrase from tree import Node from settings import changedlocale diff --git a/queries/weather.py b/queries/weather.py index 6a79020a..cc52c70a 100755 --- a/queries/weather.py +++ b/queries/weather.py @@ -68,7 +68,7 @@ from iceaddr import placename_lookup # type: ignore from iceweather import observation_for_closest, observation_for_station, forecast_text # type: ignore -from speech.trans import gssml +from icespeak import gssml _WEATHER_QTYPE = "Weather" @@ -351,9 +351,13 @@ def get_currweather_answer(query: Query, result: Result) -> AnswerTuple: return response, answer, voice -def gpt_query(q: Query, query: str, time: str, location: str) -> Dict[str, Union[str, int, float]]: +def gpt_query( + q: Query, query: str, time: str, location: str +) -> Dict[str, Union[str, int, float]]: """Return a string response for a GPT query""" - weather: Dict[str, Union[str, int, float]] = dict(temperature=random.randint(-10, 30), wind=random.randint(0, 20)) + weather: Dict[str, Union[str, int, float]] = dict( + temperature=random.randint(-10, 30), wind=random.randint(0, 20) + ) return weather diff --git a/queries/wiki.py b/queries/wiki.py index 31ff84a5..709fec50 100755 --- a/queries/wiki.py +++ b/queries/wiki.py @@ -37,7 +37,7 @@ from tree import Result, Node from utility import cap_first -from speech.trans import gssml +from icespeak import gssml from queries import Query, QueryStateDict, ContextDict from queries.util import query_json_api, gen_answer, read_grammar_file diff --git a/queries/words.py b/queries/words.py index 955d927a..cdcef49d 100755 --- a/queries/words.py +++ b/queries/words.py @@ -46,7 +46,7 @@ from queries import Query, AnswerTuple from queries.util import gen_answer from utility import icequote -from speech.trans import gssml +from icespeak import gssml _WORDTYPE_RX_NOM = "(?:orðið|nafnið|nafnorðið)" diff --git a/queries/yulelads.py b/queries/yulelads.py index 51f9b34f..9ad66576 100755 --- a/queries/yulelads.py +++ b/queries/yulelads.py @@ -31,7 +31,7 @@ from queries import Query, QueryStateDict from tree import Result, Node, TerminalNode from queries.util import read_grammar_file -from speech.trans.num import numbers_to_ordinal +from icespeak.transcribe.num import numbers_to_ordinal def help_text(lemma: str) -> str: diff --git a/requirements.txt b/requirements.txt index cf380b64..60e275e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,18 +21,12 @@ timezonefinder>=6.2.0 rjsmin>=1.2.1 python-youtube>=0.9.0 country-list>=1.0.0 -# AWS Polly text-to-speech -botocore==1.21.40 -boto3==1.18.40 -# Azure text-to-speech. -azure-cognitiveservices-speech>=1.28.0 -# Google text-to-speech -# google-cloud-texttospeech>=2.14.1 # For OpenAI GPT support openai>=0.27.6 # Ours reynir>=3.5.4 islenska>=0.4.8 +icespeak>=0.3.2 iceaddr>=0.5.5 iceweather>=0.2.3 cityloc>=0.1.1 diff --git a/routes/api.py b/routes/api.py index 91bff6c4..dc94aeb4 100755 --- a/routes/api.py +++ b/routes/api.py @@ -25,34 +25,29 @@ from typing import Dict, Any, Iterable, List, Optional, cast from datetime import datetime -import logging from flask import request, abort from flask.wrappers import Response, Request -from settings import Settings +from icespeak import GreynirSSMLParser, tts_to_file, TTSOptions, VOICES +from icespeak.settings import SETTINGS as TTS_SETTINGS +from icespeak.settings import TextFormats +from icespeak.tts import TTSOutput +from reynir.bintokenizer import TokenDict +from reynir.binparser import canonicalize_token +from settings import Settings from tnttagger import ifd_tag from db import SessionContext from db.models import ArticleTopic, Query, QueryClientData, Summary from geo import LatLonTuple from tree.util import TreeUtility -from reynir.bintokenizer import TokenDict -from reynir.binparser import canonicalize_token from article import Article as ArticleProxy from queries import process_query from queries import Query as QueryObject -from speech import ( - GreynirSSMLParser, - text_to_audio_url, - DEFAULT_VOICE, - SUPPORTED_VOICES, - RECOMMENDED_VOICES, - DEFAULT_VOICE_SPEED, -) -from speech.voices import voice_for_locale from queries.util.openai_gpt import summarize -from utility import read_txt_api_key, icelandic_asciify +from tts import voice_for_locale +from utility import read_txt_api_key, icelandic_asciify, TTS_AUDIO_DIR, RESOURCES_DIR from . import routes, better_jsonify, text_from_request, bool_from_request from . import MAX_URL_LENGTH, MAX_UUID_LENGTH @@ -349,8 +344,8 @@ def reparse_api(version: int = 1) -> Response: return better_jsonify(valid=True, result=tokens, register=register, stats=stats) -def file_url_to_host_url(url: str, r: Request) -> str: - """Convert a local file:// URL to a http(s):// URL.""" +def _audio_file_url_to_host_url(url: str, r: Request) -> str: + """Convert a local audio file:// URL to a http(s):// URL.""" if url.startswith("file://"): try: idx = url.index("static/audio/") # A bit hacky @@ -388,12 +383,12 @@ def query_api(version: int = 1) -> Response: # If voice is set, return a voice-friendly string voice = bool_from_request(request, "voice") # Request a particular voice - voice_id: str = icelandic_asciify(rv.get("voice_id", DEFAULT_VOICE)) + voice_id: str = icelandic_asciify(rv.get("voice_id", TTS_SETTINGS.DEFAULT_VOICE)) # Request a particular voice speed try: - voice_speed = float(rv.get("voice_speed", DEFAULT_VOICE_SPEED)) + voice_speed = float(rv.get("voice_speed", TTS_SETTINGS.DEFAULT_VOICE_SPEED)) except ValueError: - voice_speed = DEFAULT_VOICE_SPEED + voice_speed = TTS_SETTINGS.DEFAULT_VOICE_SPEED # If test is set to True, we # (1) add a synthetic location, if not given; and @@ -459,7 +454,7 @@ def query_api(version: int = 1) -> Response: authenticated=_has_valid_api_key(request), ) - # Get URL for response synthesized speech audio + # Get URL for synthesized speech audio if voice and "voice" in result: # If the result contains a "voice" key, return it v = result["voice"] @@ -477,9 +472,13 @@ def query_api(version: int = 1) -> Response: vid = voice_for_locale(result["voice_locale"]) result["voice_id"] = vid # Create audio data - url = text_to_audio_url(v, voice_id=vid, speed=voice_speed) + TTS_SETTINGS.AUDIO_DIR = TTS_AUDIO_DIR + TTS_SETTINGS.KEYS_DIR = RESOURCES_DIR + tts_options = TTSOptions(voice=vid, speed=voice_speed) + tts_output: TTSOutput = tts_to_file(v, tts_options=tts_options) + url = tts_output.file.as_uri() if url: - result["audio"] = file_url_to_host_url(url, request) + result["audio"] = _audio_file_url_to_host_url(url, request) response = cast(Optional[Dict[str, str]], result.get("response")) if response: if "sources" in response: @@ -572,24 +571,27 @@ def speech_api(version: int = 1) -> Response: if not text: return better_jsonify(**reply) - fmt = rv.get("format", "ssml") - if fmt not in ["text", "ssml"]: - fmt = "ssml" - voice_id = icelandic_asciify(rv.get("voice_id", DEFAULT_VOICE)) + fmt: str = rv.get("format", "ssml") + text_format: TextFormats = TextFormats.SSML + if fmt in TextFormats._member_names_: + text_format = TextFormats.SSML + + voice_id = icelandic_asciify(rv.get("voice_id", TTS_SETTINGS.DEFAULT_VOICE)) try: - voice_speed = float(rv.get("voice_speed", DEFAULT_VOICE_SPEED)) + voice_speed = float(rv.get("voice_speed", TTS_SETTINGS.DEFAULT_VOICE_SPEED)) except ValueError: - voice_speed = DEFAULT_VOICE_SPEED + voice_speed = TTS_SETTINGS.DEFAULT_VOICE_SPEED try: - url = text_to_audio_url( - text, - text_format=fmt, - voice_id=voice_id, - speed=voice_speed, + TTS_SETTINGS.AUDIO_DIR = TTS_AUDIO_DIR + TTS_SETTINGS.KEYS_DIR = RESOURCES_DIR + tts_options = TTSOptions( + voice=voice_id, speed=voice_speed, text_format=text_format ) + tts_output: TTSOutput = tts_to_file(text, tts_options=tts_options) + url = tts_output.file.as_uri() if url: - url = file_url_to_host_url(url, request) + url = _audio_file_url_to_host_url(url, request) except Exception: return better_jsonify(**reply) @@ -609,9 +611,8 @@ def voices_api(version: int = 1) -> Response: return better_jsonify( valid=True, - default=DEFAULT_VOICE, - supported=sorted(list(SUPPORTED_VOICES)), - recommended=sorted(list(RECOMMENDED_VOICES)), + default=TTS_SETTINGS.DEFAULT_VOICE, + supported=sorted(list(VOICES)), ) diff --git a/search.py b/search.py index f2b33d77..59231f71 100755 --- a/search.py +++ b/search.py @@ -75,7 +75,9 @@ def _connect(cls): cls.similarity_client = SimilarityClient() @classmethod - def list_similar_to_article(cls, session: Session, uuid: str, n: int) -> List[SimilarDict]: + def list_similar_to_article( + cls, session: Session, uuid: str, n: int + ) -> List[SimilarDict]: """List n articles that are similar to the article with the given id""" cls._connect() # Returns a list of tuples: (article_id, similarity) @@ -86,7 +88,9 @@ def list_similar_to_article(cls, session: Session, uuid: str, n: int) -> List[Si return cls.list_articles(session, articles, n) @classmethod - def list_similar_to_topic(cls, session: Session, topic_vector: List[float], n: int) -> List[SimilarDict]: + def list_similar_to_topic( + cls, session: Session, topic_vector: List[float], n: int + ) -> List[SimilarDict]: """List n articles that are similar to the given topic vector""" cls._connect() # Returns a list of tuples: (article_id, similarity) @@ -97,7 +101,9 @@ def list_similar_to_topic(cls, session: Session, topic_vector: List[float], n: i return cls.list_articles(session, articles, n) @classmethod - def list_similar_to_terms(cls, session: Session, terms: List[Tuple[str, str]], n: int) -> WeightsDict: + def list_similar_to_terms( + cls, session: Session, terms: List[Tuple[str, str]], n: int + ) -> WeightsDict: """List n articles that are similar to the given terms. The terms are expected to be a list of (stem, category) tuples.""" cls._connect() @@ -108,7 +114,9 @@ def list_similar_to_terms(cls, session: Session, terms: List[Tuple[str, str]], n articles: List[Tuple[str, float]] = result.get("articles", []) # Obtain the search term weights weights: List[float] = result.get("weights", []) - return WeightsDict(weights=weights, articles=cls.list_articles(session, articles, n)) + return WeightsDict( + weights=weights, articles=cls.list_articles(session, articles, n) + ) @classmethod def list_articles( @@ -131,6 +139,8 @@ def list_articles( # Similarity in percent spercent = 100.0 * similarity + assert sa.timestamp is not None # Silence type checker + def is_probably_same_as(last: SimilarDict) -> bool: """Return True if the current article is probably different from the one already described in the last object""" diff --git a/speak.py b/speak.py deleted file mode 100755 index e0eb887f..00000000 --- a/speak.py +++ /dev/null @@ -1,269 +0,0 @@ -#!/usr/bin/env python -""" - - Greynir: Natural language processing for Icelandic - - Copyright (C) 2023 Miðeind ehf. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. - - - Friendly command line interface for Icelandic speech synthesis. - Returns 0 on success, 1 on error. - - Run the following command for a list of options: - - ./speak.py --help - -""" - -from typing import Optional, cast, List - -import sys -import subprocess -import logging -from pathlib import Path -from shutil import which -from urllib.request import urlopen -import wave - -import requests - -from speech import ( - text_to_audio_url, - DEFAULT_VOICE, - SUPPORTED_VOICES, - DEFAULT_TEXT_FORMAT, - DEFAULT_AUDIO_FORMAT, - SUPPORTED_AUDIO_FORMATS, - SUPPORTED_TEXT_FORMATS, -) -from speech.voices import suffix_for_audiofmt -from utility import sanitize_filename - - -def _die(msg: str, exit_code: int = 1) -> None: - print(msg, file=sys.stderr) - sys.exit(exit_code) - - -_DATA_URI_PREFIX = "data:" - - -def _is_data_uri(s: str) -> bool: - """Returns whether a URL is a data URI (RFC2397). Tolerates upper/mixed case prefix.""" - return s[: len(_DATA_URI_PREFIX)].lower() == _DATA_URI_PREFIX - - -def _is_file_uri(s: str) -> bool: - """Returns whether a URL is a file URI (RFC8089).""" - return s.startswith("file://") - - -def _bytes4file_or_data_uri(uri: str) -> bytes: - """Returns bytes of file at file URI (RFC8089) or in data URI (RFC2397).""" - with urlopen(uri) as response: - return response.read() - - -def _fetch_audio_bytes(url: str) -> Optional[bytes]: - """Returns bytes of audio file at URL.""" - if _is_data_uri(url) or _is_file_uri(url): - return _bytes4file_or_data_uri(url) - - try: - r = requests.get(url, timeout=10) - if r.status_code != 200: - raise Exception( - f"Received HTTP status code {r.status_code} when fetching {url}" - ) - return r.content - except Exception as e: - logging.error(f"Error fetching audio file: {e}") - - -def _write_wav( - fn: str, data: bytes, num_channels=1, sample_width=2, sample_rate=16000 -) -> None: - """Write audio data to WAV file. Defaults to 16-bit mono 16 kHz PCM.""" - with wave.open(fn, "wb") as wav: - wav.setnchannels(num_channels) - wav.setsampwidth(sample_width) - wav.setframerate(sample_rate) - wav.writeframes(data) - - -def _play_audio_file(path: str) -> None: - """Play audio file at path via command line player. This only works - on systems with afplay (macOS), mpv, mpg123 or cmdmp3 installed.""" - - AFPLAY = "/usr/bin/afplay" # afplay is only present on macOS systems - MPV = which("mpv") # mpv is a cross-platform player - MPG123 = which("mpg123") # mpg123 is a cross-platform player - CMDMP3 = which("cmdmp3") # cmdmp3 is a Windows command line mp3 player - - cmd: Optional[List[str]] = None - if Path(AFPLAY).is_file(): - cmd = [AFPLAY, path] - elif MPV: - cmd = [MPV, path] - elif MPG123: - cmd = [MPG123, "--quiet", path] - elif CMDMP3: - cmd = [CMDMP3, path] - - if not cmd: - _die("Couldn't find suitable command line audio player.") - - print(f"Playing file '{path}'") - subprocess.run(cast(List[str], cmd)) - - -DEFAULT_TEXT = ["Góðan daginn og til hamingju með lífið."] - - -def main() -> None: - """Main program function.""" - from argparse import ArgumentParser - - parser = ArgumentParser() - - parser.add_argument( - "-v", - "--voice", - help="specify which voice to use", - default=DEFAULT_VOICE, - choices=list(SUPPORTED_VOICES), - ) - parser.add_argument( - "-l", - "--list-voices", - help="print list of supported voices", - action="store_true", - ) - parser.add_argument( - "-f", - "--audioformat", - help="select audio format", - default=DEFAULT_AUDIO_FORMAT, - choices=list(SUPPORTED_AUDIO_FORMATS), - ) - parser.add_argument( - "-s", - "--speed", - help="set speech speed", - default=1.0, - type=float, - ) - parser.add_argument( - "-t", - "--textformat", - help="set text format", - default=DEFAULT_TEXT_FORMAT, - choices=list(SUPPORTED_TEXT_FORMATS), - ) - parser.add_argument( - "-o", - "--override", - help="override default audio output filename", - default="", # Empty string means use default filename - ) - parser.add_argument( - "-w", "--wav", help="generate WAV file from PCM", action="store_true" - ) - parser.add_argument( - "-u", "--url", help="dump audio URL to stdout", action="store_true" - ) - parser.add_argument( - "-n", "--noplay", help="do not play resulting audio file", action="store_true" - ) - parser.add_argument( - "-r", "--remove", help="remove audio file after playing", action="store_true" - ) - parser.add_argument( - "text", - help="text to synthesize", - nargs="*", - default=DEFAULT_TEXT, - ) - - args = parser.parse_args() - - if args.list_voices: - for voice in SUPPORTED_VOICES: - print(voice) - sys.exit(0) - - if len(args.text) == 0: - _die("No text provided.") - text = " ".join(args.text).strip() - if not text: - _die("No text provided.") - - if args.wav and args.audioformat != "pcm": - _die("WAV output flag only supported for PCM format.") - - # Synthesize the text according to CLI options - url = text_to_audio_url( - text, - text_format=args.textformat, - audio_format=args.audioformat, - voice_id=args.voice, - speed=args.speed, - ) - if not url: - _die("Error generating speech synthesis URL.") - - # Command line flag specifies that we should just dump the URL to stdout - if args.url: - print(url) - sys.exit(0) - - # Download - urldesc = f"data URI ({len(url)} bytes)" if _is_data_uri(url) else url - print(f"Fetching {urldesc}") - data: Optional[bytes] = _fetch_audio_bytes(url) - if not data: - _die("Unable to fetch audio data.") - - assert data is not None # Silence typing complaints - - if args.override: - # Override default filename - fn = args.override - else: - # Generate file name - fn = sanitize_filename(text) - fn = f"{fn}.{suffix_for_audiofmt(args.audioformat)}" - - # Write audio data to file - print(f'Writing to file "{fn}".') - if args.wav: - _write_wav(fn, data) - else: - with open(fn, "wb") as f: - f.write(data) - - # Play audio file using command line tool (if available) - if not args.noplay: - _play_audio_file(fn) - - # Remove file after playing - if args.remove: - print(f'Deleting file "{fn}".') - Path(fn).unlink() - - -if __name__ == "__main__": - """Perform speech synthesis of Icelandic text via the command line.""" - main() diff --git a/speech/README.md b/speech/README.md deleted file mode 100644 index d0ba3f88..00000000 --- a/speech/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# Speech-To-Text voices - -Herein you will find Icelandic-language speech-synthesis code used by Greynir. Voice modules are -found in the `voices` directory. Each module declares the names of the voices it support and -implements the `text_to_audio_data` and `text_to_audio_url` functions. -Functions/methods for performing phonetic transcription are found in the `trans` directory, -along with `speech/__init__.py`. The function `gssml` marks portions of text -which get phonetically transcribed when parsed by `GreynirSSMLParser` from `__init__.py` diff --git a/speech/__init__.py b/speech/__init__.py deleted file mode 100755 index 0aa04dd7..00000000 --- a/speech/__init__.py +++ /dev/null @@ -1,259 +0,0 @@ -#!/usr/bin/env python -""" - - Greynir: Natural language processing for Icelandic - - Copyright (C) 2023 Miðeind ehf. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. - - - Icelandic speech synthesis via various Text-To-Speech services. - -""" - -from typing import ( - Any, - Deque, - Dict, - Iterable, - List, - Optional, - Tuple, - Type, -) -from types import ModuleType - -import logging -import importlib -from pathlib import Path -from inspect import isfunction, ismethod -from html.parser import HTMLParser -from collections import deque -from speech.trans import TRANSCRIBER_CLASS, DefaultTranscriber, TranscriptionMethod - -from utility import GREYNIR_ROOT_DIR, cap_first, modules_in_dir - - -# Text formats -# For details about SSML markup, see: -# https://developer.amazon.com/en-US/docs/alexa/custom-skills/speech-synthesis-markup-language-ssml-reference.html -# or: -# https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup -DEFAULT_TEXT_FORMAT = "ssml" -SUPPORTED_TEXT_FORMATS = frozenset(("text", "ssml")) -assert DEFAULT_TEXT_FORMAT in SUPPORTED_TEXT_FORMATS - -# Audio formats -DEFAULT_AUDIO_FORMAT = "mp3" -SUPPORTED_AUDIO_FORMATS = frozenset(("mp3", "ogg_vorbis", "pcm", "opus")) -assert DEFAULT_AUDIO_FORMAT in SUPPORTED_AUDIO_FORMATS - -VOICES_DIR = GREYNIR_ROOT_DIR / "speech" / "voices" -assert VOICES_DIR.is_dir() - - -def _load_voice_modules() -> Dict[str, ModuleType]: - """Dynamically load all voice modules, map - voice ID strings to the relevant modules.""" - - v2m: Dict[str, ModuleType] = {} - for modname in modules_in_dir(VOICES_DIR): - try: - # Try to import - m = importlib.import_module(modname) - voices: Iterable[str] = getattr(m, "VOICES") - if not voices: - continue # No voices declared, skip - for v in voices: - assert v not in v2m, f"Voice '{v}' already declared in module {v2m[v]}" - v2m[v] = m - except Exception as e: - logging.error(f"Error importing voice module {modname}: {e}") - - return v2m - - -VOICE_TO_MODULE = _load_voice_modules() -SUPPORTED_VOICES = frozenset(VOICE_TO_MODULE.keys()) -RECOMMENDED_VOICES = frozenset(("Gudrun", "Gunnar")) -DEFAULT_VOICE = "Gudrun" -DEFAULT_VOICE_SPEED = 1.0 - -assert DEFAULT_VOICE in SUPPORTED_VOICES -assert DEFAULT_VOICE in RECOMMENDED_VOICES - - -def _sanitize_args(args: Dict[str, Any]) -> Dict[str, Any]: - """Make sure arguments to speech synthesis functions are sane.""" - # Make sure we have a valid voice ID - voice_id = args["voice_id"].lower().capitalize() - if voice_id not in SUPPORTED_VOICES: - logging.warning( - f"Voice '{voice_id}' not in supported voices, reverting to default ({DEFAULT_VOICE})" - ) - args["voice_id"] = DEFAULT_VOICE - else: - args["voice_id"] = voice_id - - # Clamp speed to 50-200% range - args["speed"] = max(min(2.0, args["speed"]), 0.5) - - return args - - -def text_to_audio_data( - text: str, - text_format: str = DEFAULT_TEXT_FORMAT, - audio_format: str = DEFAULT_AUDIO_FORMAT, - voice_id: str = DEFAULT_VOICE, - speed: float = DEFAULT_VOICE_SPEED, -) -> bytes: - """Returns audio data for speech-synthesized text.""" - # Fall back to default voice if voice_id param invalid - if voice_id not in SUPPORTED_VOICES: - voice_id = DEFAULT_VOICE - # Create a copy of all function arguments - args = locals().copy() - # Find the module that provides this voice - module = VOICE_TO_MODULE.get(voice_id) - assert module is not None - # Get the function from the module - fn = getattr(module, "text_to_audio_data") - assert isfunction(fn) - # Call function in module, passing on the arguments - return fn(**_sanitize_args(args)) - - -def text_to_audio_url( - text: str, - text_format: str = DEFAULT_TEXT_FORMAT, - audio_format: str = DEFAULT_AUDIO_FORMAT, - voice_id: str = DEFAULT_VOICE, - speed: float = DEFAULT_VOICE_SPEED, -) -> str: - """Returns URL to audio of speech-synthesized text.""" - # Fall back to default voice if voice_id param invalid - if voice_id not in SUPPORTED_VOICES: - voice_id = DEFAULT_VOICE - # Create a copy of all function arguments - args = locals().copy() - # Find the module that provides this voice - module = VOICE_TO_MODULE.get(voice_id) - assert module is not None - # Get the function from the module - fn = getattr(module, "text_to_audio_url") - assert isfunction(fn) - # Call function in module, passing on the arguments - return fn(**_sanitize_args(args)) - - -class GreynirSSMLParser(HTMLParser): - """ - Parses voice strings containing tags and - calls transcription handlers corresponding to each tag's type attribute. - - Note: Removes any other markup tags from the text as that - can interfere with the voice engines. - - Example: - # Provide voice engine ID - gp = GreynirSSMLParser(voice_id) - # Transcribe voice string - voice_string = gp.transcribe(voice_string) - """ - - def __init__(self, voice_id: str = DEFAULT_VOICE) -> None: - """ - Initialize parser and setup transcription handlers - for the provided speech synthesis engine. - """ - super().__init__() - if voice_id not in SUPPORTED_VOICES: - logging.warning( - f"Voice '{voice_id}' not in supported voices, reverting to default ({DEFAULT_VOICE})" - ) - voice_id = DEFAULT_VOICE - # Find the module that provides this voice - module = VOICE_TO_MODULE[voice_id] - - # Fetch transcriber for this voice module, - # using DefaultTranscriber as fallback - self._handler: Type[DefaultTranscriber] = getattr( - module, TRANSCRIBER_CLASS, DefaultTranscriber - ) - - def transcribe(self, voice_string: str) -> str: - """Parse and return transcribed voice string.""" - # Prepare HTMLParser variables for parsing - # (in case this method is called more - # than once on a particular instance) - self.reset() - - # Set (or reset) variables used during parsing - self._str_stack: Deque[str] = deque() - self._str_stack.append("") - self._attr_stack: Deque[Dict[str, Optional[str]]] = deque() - - self.feed(voice_string) - self.close() - assert ( - len(self._str_stack) == 1 - ), "Error during parsing, are all markup tags correctly closed?" - return cap_first(self._str_stack[0]) - - # ---------------------------------------- - - def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]): - """Called when a tag is opened.""" - if tag == "greynir": - self._str_stack.append("") - self._attr_stack.append(dict(attrs)) - - def handle_data(self, data: str) -> None: - """Called when data is encountered.""" - # Append string data to current string in stack - self._str_stack[-1] += self._handler.danger_symbols(data) - - def handle_endtag(self, tag: str): - """Called when a tag is closed.""" - if tag == "greynir": - # Parse data inside the greynir tag we're closing - s: str = self._str_stack.pop() # String content - if self._attr_stack: - dattrs = self._attr_stack.pop() # Current tag attributes - t: Optional[str] = dattrs.pop("type") - assert t, f"Missing type attribute in tag around string: {s}" - # Fetch corresponding transcription method from handler - transf: TranscriptionMethod = getattr(self._handler, t) - assert ismethod(transf), f"{t} is not a transcription method." - # Transcriber classmethod found, transcribe text - s = transf(s, **dattrs) - # Add to our string stack - if self._str_stack: - self._str_stack[-1] += s - else: - self._str_stack.append(s) - - def handle_startendtag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]): - """Called when a empty tag is opened (and closed), e.g. ''.""" - if tag == "greynir": - dattrs = dict(attrs) - t: Optional[str] = dattrs.pop("type") - assert t, "Missing type attribute in tag" - transf: TranscriptionMethod = getattr(self._handler, t) - # If handler found, replace empty greynir tag with output, - # otherwise simply remove empty greynir tag - assert ismethod(transf), f"{t} is not a transcription method." - s: str = transf(**dattrs) - self._str_stack[-1] += s diff --git a/speech/trans/__init__.py b/speech/trans/__init__.py deleted file mode 100644 index d51a4e63..00000000 --- a/speech/trans/__init__.py +++ /dev/null @@ -1,1091 +0,0 @@ -#!/usr/bin/env python -""" - - Greynir: Natural language processing for Icelandic - - Copyright (C) 2023 Miðeind ehf. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. - - - This file contains phonetic transcription functionality - specifically intended for Icelandic speech synthesis engines. - -""" - -from typing import ( - Any, - Callable, - FrozenSet, - Iterable, - List, - Match, - Mapping, - Optional, - Tuple, - Union, - cast, -) - -import re -import itertools -from functools import lru_cache - -from tokenizer import Abbreviations - -# Ensure abbreviations have been loaded -Abbreviations.initialize() - -from tokenizer.definitions import HYPHENS -from islenska.basics import ALL_CASES, ALL_GENDERS, ALL_NUMBERS -from reynir.bindb import GreynirBin -from reynir.simpletree import SimpleTree -from reynir import Greynir, TOK, Tok - -from speech.trans.num import ( - CaseType, - GenderType, - NumberType, - digits_to_text, - float_to_text, - floats_to_text, - number_to_text, - numbers_to_text, - number_to_ordinal, - numbers_to_ordinal, - year_to_text, - years_to_text, - _ROMAN_NUMERALS, - roman_numeral_to_ordinal, -) - -# Each voice module in the directory `speech/voices` can define a -# 'Transcriber' class, as a subclass of 'DefaultTranscriber', in -# order to override transcription methods for a particular voice -TRANSCRIBER_CLASS = "Transcriber" - - -def strip_markup(text: str) -> str: - """Remove HTML/SSML tags from a string.""" - return re.sub(r"<.*?>", "", text) - - -def gssml(data: Any = None, *, type: str, **kwargs: Union[str, int, float]) -> str: - """ - Utility function, surrounds data with Greynir-specific - voice transcription tags. - E.g. '{data}' - or '' if data is None. - - Type specifies the type of handling needed when the tags are parsed. - The kwargs are then passed to the handler functions as appropriate. - - The greynir tags can be transcribed - in different ways depending on the voice engine used. - - Example: - gssml(43, type="number", gender="kk") -> '43' - """ - assert type and isinstance( - type, str - ), f"type keyword arg must be non-empty string in function gssml; data: {data}" - return ( - f'{data}" if data is not None else f" />") - ) - - -# Spell out how character names are pronounced in Icelandic -_CHAR_PRONUNCIATION: Mapping[str, str] = { - "a": "a", - "á": "á", - "b": "bé", - "c": "sé", - "d": "dé", - "ð": "eð", - "e": "e", - "é": "é", - "f": "eff", - "g": "gé", - "h": "há", - "i": "i", - "í": "í", - "j": "joð", - "k": "ká", - "l": "ell", - "m": "emm", - "n": "enn", - "o": "o", - "ó": "ó", - "p": "pé", - "q": "kú", - "r": "err", - "s": "ess", - "t": "té", - "u": "u", - "ú": "ú", - "v": "vaff", - "w": "tvöfalt vaff", - "x": "ex", - "y": "ufsilon", - "ý": "ufsilon í", - "þ": "þoddn", - "æ": "æ", - "ö": "ö", - "z": "seta", -} - -# Icelandic/English alphabet, uppercased -_ICE_ENG_ALPHA = "".join(c.upper() for c in _CHAR_PRONUNCIATION.keys()) - -# Matches e.g. "klukkan 14:30", "kl. 2:23:31", "02:15" -_TIME_REGEX = re.compile( - r"((?P(kl\.|klukkan)) )?(?P\d{1,2}):" - r"(?P\d\d)(:(?P\d\d))?", - flags=re.IGNORECASE, -) -_MONTH_ABBREVS = ( - "jan", - "feb", - "mar", - "apr", - "maí", - "jún", - "júl", - "ágú", - "sep", - "okt", - "nóv", - "des", -) -_MONTH_NAMES = ( - "janúar", - "febrúar", - "mars", - "apríl", - "maí", - "júní", - "júlí", - "ágúst", - "september", - "október", - "nóvember", - "desember", -) -_DATE_REGEXES = ( - # Matches e.g. "1986-03-07" - re.compile(r"(?P\d{1,4})-(?P\d{1,2})-(?P\d{1,2})"), - # Matches e.g. "1/4/2001" - re.compile(r"(?P\d{1,2})/(?P\d{1,2})/(?P\d{1,4})"), - # Matches e.g. "25. janúar 1999" or "25 des." - re.compile( - r"(?P\d{1,2})\.? ?" - r"(?P(jan(úar|\.)?|feb(rúar|\.)?|mar(s|\.)?|" - r"apr(íl|\.)?|maí\.?|jún(í|\.)?|" - r"júl(í|\.)?|ágú(st|\.)?|sep(tember|\.)?|" - r"okt(óber|\.)?|nóv(ember|\.)?|des(ember|\.)?))" # 'month' capture group ends - r"( (?P\d{1,4}))?", # Optional - flags=re.IGNORECASE, - ), -) - - -def _split_substring_types(t: str) -> Iterable[str]: - """ - Split text into alphabetic, decimal or - other character type substrings. - - Example: - list(_split_substring_types("hello world,123")) - -> ["hello", " ", "world", ",", "123"] - """ - f: Callable[[str], int] = lambda c: c.isalpha() + 2 * c.isdecimal() - return ("".join(g) for _, g in itertools.groupby(t, key=f)) - - -# Matches letter followed by period or -# 2-5 uppercase letters side-by-side not -# followed by another uppercase letter -# (e.g. matches "EUIPO" or "MSc", but not "TESTING") -_ABBREV_RE = re.compile( - rf"([{_ICE_ENG_ALPHA + _ICE_ENG_ALPHA.lower()}]\." - rf"|\b[{_ICE_ENG_ALPHA}]{{2,5}}(?![{_ICE_ENG_ALPHA}]))" -) - -# Terms common in sentences which refer to results from sports -_SPORTS_LEMMAS: FrozenSet[str] = frozenset(("leikur", "vinna", "tapa", "sigra")) - -_HYPHEN_SYMBOLS = frozenset(HYPHENS) - -_StrBool = Union[str, bool] -TranscriptionMethod = Callable[..., str] - - -def _empty_str(f: TranscriptionMethod) -> TranscriptionMethod: - """ - Decorator which returns an empty string - if the transcription method is called - with an empty string. - """ - - def _inner(cls: "DefaultTranscriber", txt: str, **kwargs: _StrBool): - if not txt: - return "" - return f(cls, txt, **kwargs) - - return _inner - - -def _bool_args(*bool_args: str) -> Callable[[TranscriptionMethod], TranscriptionMethod]: - """ - Returns a decorator which converts keyword arguments in bool_args - from strings into booleans before calling the decorated function. - - As GSSML is text-based, all function arguments come from strings. - Booleans also work when calling the methods directly, e.g. in testing. - """ - - def _decorator(f: TranscriptionMethod) -> TranscriptionMethod: - def _bool_translate(cls: "DefaultTranscriber", *args: str, **kwargs: str): - # Convert keyword arguments in bool_args from - # str to boolean before calling decorated function - newkwargs = { - key: (str(val) == "True" if key in bool_args else val) - for key, val in kwargs.items() - } - return f(cls, *args, **newkwargs) - - return _bool_translate - - return _decorator - - -class DefaultTranscriber: - """ - Class containing default phonetic transcription functions - for Icelandic speech synthesis. - """ - - # Singleton Greynir instance - _greynir: Optional[Greynir] = None - - # &,<,> cause speech synthesis errors, - # change these to text - _DANGER_SYMBOLS: Tuple[Tuple[str, str], ...] = ( - ("&", " og "), - ("<=", " minna eða jafnt og "), - ("<", " minna en "), - (">=", " stærra eða jafnt og "), - (">", " stærra en "), - ) - - @classmethod - @_empty_str - def danger_symbols(cls, txt: str) -> str: - """ - Takes in any text and replaces the symbols that - cause issues for the speech synthesis engine. - These symbols are &,<,>. - - Note: HTML charrefs (e.g. &) should be translated to their - unicode character before this function is called. - (GreynirSSMLParser does this automatically.) - """ - for symb, new in cls._DANGER_SYMBOLS: - txt = txt.replace(symb, new) - return txt - - @classmethod - @_empty_str - @_bool_args("one_hundred") - def number( - cls, - txt: str, - *, - case: CaseType = "nf", - gender: GenderType = "hk", - one_hundred: bool = False, - ) -> str: - """Voicify a number.""" - return number_to_text(txt, case=case, gender=gender, one_hundred=one_hundred) - - @classmethod - @_empty_str - @_bool_args("one_hundred") - def numbers( - cls, - txt: str, - *, - case: CaseType = "nf", - gender: GenderType = "hk", - one_hundred: bool = False, - ) -> str: - """Voicify text containing multiple numbers.""" - return numbers_to_text(txt, case=case, gender=gender, one_hundred=one_hundred) - - @classmethod - @_empty_str - @_bool_args("comma_null", "one_hundred") - def float( - cls, - txt: str, - *, - case: CaseType = "nf", - gender: GenderType = "hk", - one_hundred: bool = False, - comma_null: bool = False, - ) -> str: - """Voicify a float.""" - return float_to_text( - txt, - case=case, - gender=gender, - one_hundred=one_hundred, - comma_null=comma_null, - ) - - @classmethod - @_empty_str - @_bool_args("comma_null", "one_hundred") - def floats( - cls, - txt: str, - *, - case: CaseType = "nf", - gender: GenderType = "hk", - one_hundred: bool = False, - comma_null: bool = False, - ) -> str: - """Voicify text containing multiple floats.""" - return floats_to_text( - txt, - case=case, - gender=gender, - one_hundred=one_hundred, - comma_null=comma_null, - ) - - @classmethod - @_empty_str - def ordinal( - cls, - txt: str, - *, - case: CaseType = "nf", - gender: GenderType = "hk", - number: NumberType = "et", - ) -> str: - """Voicify an ordinal.""" - return number_to_ordinal(txt, case=case, gender=gender, number=number) - - @classmethod - @_empty_str - def ordinals( - cls, - txt: str, - *, - case: CaseType = "nf", - gender: GenderType = "hk", - number: NumberType = "et", - ) -> str: - """Voicify text containing multiple ordinals.""" - return numbers_to_ordinal(txt, case=case, gender=gender, number=number) - - @classmethod - @_empty_str - def digits(cls, txt: str) -> str: - """Spell out digits.""" - return digits_to_text(txt) - - @classmethod - @_empty_str - def phone(cls, txt: str) -> str: - """Spell out a phone number.""" - return cls.digits(txt) - - @classmethod - def timespan(cls, seconds: str) -> str: - """Voicify a span of time, specified in seconds.""" - # TODO: Replace time_period_desc in queries/util/__init__.py - raise NotImplementedError() - - @classmethod - def distance(cls, meters: str) -> str: - # TODO: Replace distance_desc in queries/util/__init__.py - raise NotImplementedError() - - @classmethod - @_empty_str - def time(cls, txt: str) -> str: - """Voicifies time of day.""" - - def _time_fmt(match: Match[str]) -> str: - gd = match.groupdict() - prefix: Optional[str] = gd["klukkan"] - h: int = int(gd["hour"]) - m: int = int(gd["minute"]) - s: Optional[int] = int(gd["second"]) if gd["second"] is not None else None - suffix: Optional[str] = None - - t: List[str] = [] - # If "klukkan" or "kl." at beginning of string, - # prepend "klukkan" - if prefix: - t.append("klukkan") - - # Hours - if h == 0 and m == 0: - # Call 00:00 "tólf á miðnætti" - h = 12 - suffix = "á miðnætti" - elif 0 <= h <= 5: - # Call 00:xx-0:5:xx "... um nótt" - suffix = "um nótt" - elif h == 12 and m == 0: - # Call 12:00 "tólf á hádegi" - suffix = "á hádegi" - t.append(number_to_text(h, case="nf", gender="hk")) - - # Minutes - if m > 0: - if m < 10: - # e.g. "þrettán núll fjögur" - t.append("núll") - t.append(number_to_text(m, case="nf", gender="hk")) - - # Seconds - if s is not None and s > 0: - if s < 10: - # e.g. "þrettán núll fjögur núll sex" - t.append("núll") - t.append(number_to_text(s, case="nf", gender="hk")) - - # Suffix for certain times of day to reduce ambiguity - if suffix: - t.append(suffix) - - return " ".join(t) - - return _TIME_REGEX.sub(_time_fmt, txt) - - @classmethod - @_empty_str - def date(cls, txt: str, case: CaseType = "nf") -> str: - """Voicifies a date""" - for r in _DATE_REGEXES: - match = r.search(txt) - if match: - # Found match - start, end = match.span() - gd = match.groupdict() - day = number_to_ordinal(gd["day"], gender="kk", case=case, number="et") - mon: str = gd["month"] - # Month names don't change in different declensions - month = ( - _MONTH_NAMES[int(mon) - 1] # DD/MM/YYYY specification - if mon.isdecimal() - else _MONTH_NAMES[_MONTH_ABBREVS.index(mon[:3])] # Non-decimal - ) - fmt_date = ( - f"{day} {month} {year_to_text(gd['year'])}" - if gd["year"] - else f"{day} {month}" - ) - # Only replace date part, leave rest of string intact - txt = txt[:start] + fmt_date + txt[end:] - break - return txt - - @classmethod - @_empty_str - def year(cls, txt: str) -> str: - """Voicify a year.""" - return year_to_text(txt) - - @classmethod - @_empty_str - def years(cls, txt: str) -> str: - """Voicify text containing multiple years.""" - return years_to_text(txt) - - # Pronunciation of character names in Icelandic - _CHAR_PRONUNCIATION: Mapping[str, str] = { - "a": "a", - "á": "á", - "b": "bé", - "c": "sé", - "d": "dé", - "ð": "eð", - "e": "e", - "é": "é", - "f": "eff", - "g": "gé", - "h": "há", - "i": "i", - "í": "í", - "j": "joð", - "k": "ká", - "l": "ell", - "m": "emm", - "n": "enn", - "o": "o", - "ó": "ó", - "p": "pé", - "q": "kú", - "r": "err", - "s": "ess", - "t": "té", - "u": "u", - "ú": "ú", - "v": "vaff", - "w": "tvöfaltvaff", - "x": "ex", - "y": "ufsilon", - "ý": "ufsilon í", - "þ": "þoddn", - "æ": "æ", - "ö": "ö", - "z": "seta", - } - # Pronunciation of some symbols - _PUNCT_PRONUNCIATION: Mapping[str, str] = { - " ": "bil", - "~": "tilda", - "`": "broddur", - "!": "upphrópunarmerki", - "@": "att merki", - "#": "myllumerki", - "$": "dollaramerki", - "%": "prósentumerki", - "^": "tvíbroddur", - "&": "og merki", - "*": "stjarna", - "(": "vinstri svigi", - ")": "hægri svigi", - "-": "bandstrik", - "_": "niðurstrik", - "=": "jafnt og merki", - "+": "plús", - "[": "vinstri hornklofi", - "{": "vinstri slaufusvigi", - "]": "hægri hornklofi", - "}": "hægri slaufusvigi", - "\\": "bakstrik", - "|": "pípumerki", - ";": "semíkomma", - ":": "tvípunktur", - "'": "úrfellingarkomma", - '"': "tvöföld gæsalöpp", - ",": "komma", - "<": "vinstri oddklofi", - ".": "punktur", - ">": "hægri oddklofi", - "/": "skástrik", - "?": "spurningarmerki", - # Less common symbols - "°": "gráðumerki", - "±": "plús-mínus merki", - "–": "stutt þankastrik", - "—": "þankastrik", - "…": "úrfellingarpunktar", - "™": "vörumerki", - "®": "skrásett vörumerki", - "©": "höfundarréttarmerki", - } - - @classmethod - @_empty_str - @_bool_args("literal") - def spell( - cls, - txt: str, - *, - pause_length: Optional[str] = None, - literal: bool = False, - ) -> str: - """ - Spell out a sequence of characters. - If literal is set, also pronounce spaces and punctuation symbols. - """ - pronounce: Callable[[str], str] = ( - lambda c: cls._CHAR_PRONUNCIATION.get(c.lower(), c) - if not c.isspace() - else "" - ) - if literal: - pronounce = lambda c: cls._CHAR_PRONUNCIATION.get( - c.lower(), cls._PUNCT_PRONUNCIATION.get(c, c) - ) - t = tuple(map(pronounce, txt)) - return ( - cls.vbreak(time="0.01s") - + cls.vbreak(time=pause_length or "0.02s").join(t) - + cls.vbreak(time="0.02s" if len(t) > 1 else "0.01s") - ) - - @classmethod - @_empty_str - def abbrev(cls, txt: str) -> str: - """Expand an abbreviation.""" - meanings = tuple( - filter( - lambda m: m.fl != "erl", # Only Icelandic abbrevs - Abbreviations.get_meaning(txt) or [], - ) - ) - if meanings: - # Abbreviation has at least one known meaning, expand it - return ( - cls.vbreak(time="0.01s") + meanings[0].stofn + cls.vbreak(time="0.05s") - ) - - # Fallbacks: - # - Spell out, if any letter is uppercase (e.g. "MSc") - if not txt.islower(): - return cls.spell(txt.replace(".", "")) - # - Give up and keep as-is for all-lowercase txt - # (e.g. "cand.med."), - return txt - - @classmethod - def amount(cls, txt: str) -> str: - # TODO - raise NotImplementedError() - - @classmethod - def currency(cls, txt: str) -> str: - # TODO - raise NotImplementedError() - - @classmethod - def measurement(cls, txt: str) -> str: - # TODO - raise NotImplementedError() - - @classmethod - @_empty_str - def molecule(cls, txt: str) -> str: - """Voicify the name of a molecule""" - return " ".join( - cls.number(x, gender="kk") if x.isdecimal() else cls.spell(x, literal=True) - for x in _split_substring_types(txt) - ) - - @classmethod - @_empty_str - def numalpha(cls, txt: str) -> str: - """Voicify a alphanumeric string, spelling each character.""" - return " ".join( - cls.digits(x) if x.isdecimal() else cls.spell(x) - for x in _split_substring_types(txt) - ) - - @classmethod - @_empty_str - def username(cls, txt: str) -> str: - """Voicify a username.""" - newtext: List[str] = [] - if txt.startswith("@"): - txt = txt[1:] - newtext.append("att") - for x in _split_substring_types(txt): - if x.isdecimal(): - if len(x) > 2: - # Spell out numbers of more than 2 digits - newtext.append(cls.digits(x)) - else: - newtext.append(cls.number(x)) - else: - if x.isalpha() and len(x) > 2: - # Alphabetic string, longer than 2 chars, pronounce as is - newtext.append(x) - else: - # Not recognized as number or Icelandic word, - # spell this literally (might include punctuation symbols) - newtext.append(cls.spell(x, literal=True)) - return " ".join(newtext) - - _DOMAIN_PRONUNCIATIONS: Mapping[str, str] = { - "is": "is", - "org": "org", - "net": "net", - "com": "komm", - "gmail": "gjé meil", - "hotmail": "hott meil", - "yahoo": "ja húú", - "outlook": "átlúkk", - } - - @classmethod - @_empty_str - def domain(cls, txt: str) -> str: - """Voicify a domain name.""" - newtext: List[str] = [] - for x in _split_substring_types(txt): - if x in cls._DOMAIN_PRONUNCIATIONS: - newtext.append(cls._DOMAIN_PRONUNCIATIONS[x]) - elif x.isdecimal(): - if len(x) > 2: - # Spell out numbers of more than 2 digits - newtext.append(cls.digits(x)) - else: - newtext.append(cls.number(x)) - else: - if x.isalpha() and len(x) > 2: - # Alphabetic string, longer than 2 chars, pronounce as is - newtext.append(x) - elif x == ".": - # Periods are common in domains/URLs, - # skip calling the spell method - newtext.append("punktur") - else: - # Short and/or non-alphabetic string - # (might consist of punctuation symbols) - # Spell this literally - newtext.append(cls.spell(x, literal=True)) - return " ".join(newtext) - - @classmethod - @_empty_str - def email(cls, txt: str) -> str: - """Voicify an email address.""" - user, at, domain = txt.partition("@") - return f"{cls.username(user)}{' hjá ' if at else ''}{cls.domain(domain)}" - - # Hardcoded pronounciations, - # should be overriden based on voice engine - _ENTITY_PRONUNCIATIONS: Mapping[str, str] = { - "ABBA": "ABBA", - "BOYS": "BOYS", - "BUGL": "BUGL", - "BYKO": "BYKO", - "CAVA": "CAVA", - "CERN": "CERN", - "CERT": "CERT", - "EFTA": "EFTA", - "ELKO": "ELKO", - "NATO": "NATO", - "NEW": "NEW", - "NOVA": "NOVA", - "PLAY": "PLAY", - "PLUS": "PLUS", - "RARIK": "RARIK", - "RIFF": "RIFF", - "RÚV": "RÚV", - "SAAB": "SAAB", - "SAAS": "SAAS", - "SHAH": "SHAH", - "SIRI": "SIRI", - "UENO": "UENO", - "YVES": "YVES", - } - - # These parts of a entity name aren't necessarily - # all uppercase or contain a period, - # but should be spelled out - _ENTITY_SPELL = frozenset( - ( - "GmbH", - "USS", - "Ltd", - "bs", - "ehf", - "h/f", - "hf", - "hses", - "hsf", - "ohf", - "s/f", - "ses", - "sf", - "slf", - "slhf", - "svf", - "vlf", - "vmf", - ) - ) - - @classmethod - @_empty_str - def entity(cls, txt: str) -> str: - """Voicify an entity name.""" - parts = txt.split() - with GreynirBin.get_db() as gbin: - for i, p in enumerate(parts): - if p in cls._ENTITY_PRONUNCIATIONS: - # Hardcoded pronunciation - parts[i] = cls._ENTITY_PRONUNCIATIONS[p] - continue - if p.isdecimal(): - # Number - parts[i] = cls.number(p) - continue - - spell_part = False - p_nodots = p.replace(".", "") - if p_nodots in cls._ENTITY_SPELL: - # We know this should be spelled out - spell_part = True - elif p_nodots.isupper(): - if gbin.lookup(p_nodots, auto_uppercase=True)[1]: - # Uppercase word has similar Icelandic word, - # pronounce it that way - parts[i] = p_nodots.capitalize() - continue - # No known Icelandic pronounciation, spell - spell_part = True - if spell_part: - # Spell out this part of the entity name - parts[i] = cls.spell(p_nodots) - return " ".join(parts) - - @classmethod - @_empty_str - @_bool_args("full_text") - @lru_cache(maxsize=50) # Caching, as this method could be slow - def generic(cls, txt: str, *, full_text: bool = False) -> str: - """ - Attempt to voicify some generic text. - Parses text and calls other transcription handlers - based on inferred meaning of words. - if full_text is set to True, - add paragraph and sentence markers. - """ - if cls._greynir is None: - cls._greynir = Greynir(no_sentence_start=True) - p_result = cls._greynir.parse(txt) - - def _ordinal(tok: Tok, term: Optional[SimpleTree]) -> str: - """Handles ordinals, e.g. '14.' or '2.'.""" - case, gender, number = "nf", "hk", "et" - if term is not None: - case = next(filter(lambda v: v in ALL_CASES, term.variants), "nf") - gender = next(filter(lambda v: v in ALL_GENDERS, term.variants), "hk") - if term is not None and term.index is not None: - leaves = tuple(term.root.leaves) - if len(leaves) > term.index + 1: - # Fetch the grammatical number of the following word - number = next( - filter( - lambda v: v in ALL_NUMBERS, - leaves[term.index + 1].variants, - ), - "et", - ) - return cls.ordinal(txt, case=case, gender=gender, number=number) - - def _number(tok: Tok, term: Optional[SimpleTree]) -> str: - """Handles numbers, e.g. '135', '17,86' or 'fjörutíu og þrír'.""" - if not tok.txt.replace(".", "").replace(",", "").isdecimal(): - # Don't modify non-decimal numbers - return tok.txt - case, gender = "nf", "hk" - if term is not None: - case = next(filter(lambda v: v in ALL_CASES, term.variants), "nf") - gender = next(filter(lambda v: v in ALL_GENDERS, term.variants), "hk") - if "," in txt: - return cls.float(txt, case=case, gender=gender) - else: - return cls.number(txt, case=case, gender=gender) - - def _percent(tok: Tok, term: Optional[SimpleTree]) -> str: - """Handles a percentage, e.g. '15,6%' or '40 prósent'.""" - gender = "hk" - n, cases, _ = cast(Tuple[float, Any, Any], tok.val) - if cases is None: - case = "nf" - else: - case = cases[0] - if n.is_integer(): - val = cls.number(n, case=case, gender=gender) - else: - val = cls.float(n, case=case, gender=gender) - if cases is None: - # Uses "%" or "‰" instead of "prósent" - # (permille value is converted to percentage by tokenizer) - percent = "prósent" - else: - # Uses "prósent" in some form, keep as is - percent = tok.txt.split(" ")[-1] - return f"{val} {percent}" - - def _numwletter(tok: Tok, term: Optional[SimpleTree]) -> str: - num = "".join(filter(lambda c: c.isdecimal(), tok.txt)) - return ( - cls.number(num, case="nf", gender="hk") - + " " - + cls.spell(tok.txt[len(num) + 1 :]) - ) - - # Map certain terminals directly to transcription functions - handler_map: Mapping[int, Callable[[Tok, Optional[SimpleTree]], str]] = { - TOK.ENTITY: lambda tok, term: cls.entity(tok.txt), - TOK.COMPANY: lambda tok, term: cls.entity(tok.txt), - TOK.PERSON: lambda tok, term: cls.person(tok.txt), - TOK.EMAIL: lambda tok, term: cls.email(tok.txt), - TOK.HASHTAG: lambda tok, term: f"myllumerki {tok.txt[1:]}", - TOK.TIME: lambda tok, term: cls.time(tok.txt), - TOK.YEAR: lambda tok, term: cls.years(tok.txt), - # TODO: Better handling of case for dates, - # accusative is common though - TOK.DATE: lambda tok, term: cls.date(tok.txt, case="þf"), - TOK.DATEABS: lambda tok, term: cls.date(tok.txt, case="þf"), - TOK.DATEREL: lambda tok, term: cls.date(tok.txt, case="þf"), - TOK.TIMESTAMP: lambda tok, term: cls.time(cls.date(tok.txt, case="þf")), - TOK.TIMESTAMPABS: lambda tok, term: cls.time(cls.date(tok.txt, case="þf")), - TOK.TIMESTAMPREL: lambda tok, term: cls.time(cls.date(tok.txt, case="þf")), - TOK.SSN: lambda tok, term: cls.digits(tok.txt), - TOK.TELNO: lambda tok, term: cls.digits(tok.txt), - TOK.SERIALNUMBER: lambda tok, term: cls.digits(tok.txt), - TOK.MOLECULE: lambda tok, term: cls.molecule(tok.txt), - TOK.USERNAME: lambda tok, term: cls.username(tok.txt), - TOK.DOMAIN: lambda tok, term: cls.domain(tok.txt), - TOK.URL: lambda tok, term: cls.domain(tok.txt), - # TOK.AMOUNT: lambda tok, term: tok.txt, - # TOK.CURRENCY: lambda tok, term: tok.txt, CURRENCY_SYMBOLS in tokenizer - # TOK.MEASUREMENT: lambda tok, term: tok.txt, SI_UNITS in tokenizer - TOK.NUMBER: _number, - TOK.NUMWLETTER: _numwletter, - TOK.ORDINAL: _ordinal, - TOK.PERCENT: _percent, - } - - parts: List[str] = [] - for s in p_result["sentences"]: - s_parts: List[str] = [] - # List of (token, terminal node) pairs. - # Terminal nodes can be None if the sentence wasn't parseable - tk_term_list = tuple( - zip(s.tokens, s.terminal_nodes or (None for _ in s.tokens)) - ) - for tok, term in tk_term_list: - txt = tok.txt - - if tok.kind in handler_map: - # Found a handler for this token type - s_parts.append(handler_map[tok.kind](tok, term)) - continue - - # Fallbacks if no handler found - if txt.isupper(): - # Fully uppercase string, - # might be part of an entity name - s_parts.append(cls.entity(txt)) - - elif _ABBREV_RE.match(txt) and ( - (term is not None and not _ABBREV_RE.match(term.lemma)) - or any(not _ABBREV_RE.match(m.stofn) for m in tok.meanings) - ): - # Probably an abbreviation such as "t.d." or "MSc" - s_parts.append(cls.abbrev(txt)) - - # Check whether this is a hyphen denoting a range - elif ( - txt in _HYPHEN_SYMBOLS - and term is not None - and term.parent is not None - # Check whether parent nonterminal has at least 3 children (might be a range) - and len(term.parent) >= 3 - ): - # Hyphen found, probably denoting a range - if s.lemmas is not None and _SPORTS_LEMMAS.isdisjoint(s.lemmas): - # Probably not the result from a sports match - # (as the sentence doesn't contain sports-related lemmas), - # so replace the range-denoting hyphen with 'til' - s_parts.append("til") - else: - # No transcribing happened - s_parts.append(txt) - - # Finished parsing a sentence - sent = " ".join(s_parts).strip() - parts.append(cls.sentence(sent) if full_text else sent) - - # Join sentences - para = " ".join(parts) - return cls.paragraph(para) if full_text else para - - _PERSON_PRONUNCIATION: Mapping[str, str] = { - "Jr": "djúníor", - "Jr.": "djúníor", - } - - @classmethod - @_empty_str - def person(cls, txt: str) -> str: - """Voicify the name of a person.""" - with GreynirBin.get_db() as gbin: - gender = cast(GenderType, gbin.lookup_name_gender(txt)) - parts = txt.split() - for i, p in enumerate(parts): - if p in cls._PERSON_PRONUNCIATION: - parts[i] = cls._PERSON_PRONUNCIATION[p] - continue - if "." in p: - # Contains period (e.g. 'Jak.' or 'Ólafsd.') - abbrs = next( - filter( - lambda m: m.ordfl == gender # Correct gender - # Icelandic abbrev - and m.fl != "erl" - # Uppercase first letter - and m.stofn[0].isupper() - # Expanded meaning must be longer - # (otherwise we just spell it, e.g. 'Th.' = 'Th.') - and len(m.stofn) > len(p), - Abbreviations.get_meaning(p) or [], - ), - None, - ) - if abbrs is not None: - # Replace with expanded version of part - parts[i] = abbrs.stofn - else: - # Spell this part - parts[i] = cls.spell(p.replace(".", "")) - if i + 2 >= len(parts) and all(l in _ROMAN_NUMERALS for l in parts[i]): - # Last or second to last part of name looks - # like an uppercase roman numeral, - # replace with ordinal - parts[i] = roman_numeral_to_ordinal(parts[i], gender=gender) - return " ".join(parts) - - _VBREAK_STRENGTHS = frozenset( - ("none", "x-weak", "weak", "medium", "strong", "x-strong") - ) - - @classmethod - def vbreak(cls, time: Optional[str] = None, strength: Optional[str] = None) -> str: - """Create a break in the voice/speech synthesis.""" - if time: - return f'' - if strength: - assert ( - strength in cls._VBREAK_STRENGTHS - ), f"Break strength {strength} is invalid." - return f'' - return f"" - - @classmethod - @_empty_str - def paragraph(cls, txt: str) -> str: - """Paragraph delimiter for speech synthesis.""" - return f"

{txt}

" - - @classmethod - @_empty_str - def sentence(cls, txt: str) -> str: - """Sentence delimiter for speech synthesis.""" - return f"{txt}" diff --git a/speech/trans/num.py b/speech/trans/num.py deleted file mode 100755 index 7af1d32a..00000000 --- a/speech/trans/num.py +++ /dev/null @@ -1,707 +0,0 @@ -""" - - Greynir: Natural language processing for Icelandic - - Copyright (C) 2023 Miðeind ehf. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. - - - This file contains various utility functions that convert - numbers to Icelandic text. - -""" - -from typing import Mapping, List, Optional, Tuple, Match, Callable, Union -from typing_extensions import Literal -import re - - -_SUB_20_NEUTRAL: Mapping[int, str] = { - 1: "eitt", - 2: "tvö", - 3: "þrjú", - 4: "fjögur", - 5: "fimm", - 6: "sex", - 7: "sjö", - 8: "átta", - 9: "níu", - 10: "tíu", - 11: "ellefu", - 12: "tólf", - 13: "þrettán", - 14: "fjórtán", - 15: "fimmtán", - 16: "sextán", - 17: "sautján", - 18: "átján", - 19: "nítján", -} - -_TENS_NEUTRAL: Mapping[int, str] = { - 20: "tuttugu", - 30: "þrjátíu", - 40: "fjörutíu", - 50: "fimmtíu", - 60: "sextíu", - 70: "sjötíu", - 80: "áttatíu", - 90: "níutíu", -} - -_DeclensionMapping = Mapping[str, Mapping[str, Mapping[str, str]]] -_NUM_NEUT_TO_DECL: _DeclensionMapping = { - "eitt": { - "kk": {"nf": "einn", "þf": "einn", "þgf": "einum", "ef": "eins"}, - "kvk": {"nf": "ein", "þf": "eina", "þgf": "einni", "ef": "einnar"}, - "hk": {"nf": "eitt", "þf": "eitt", "þgf": "einu", "ef": "eins"}, - }, - "tvö": { - "kk": {"nf": "tveir", "þf": "tvo", "þgf": "tveimur", "ef": "tveggja"}, - "kvk": {"nf": "tvær", "þf": "tvær", "þgf": "tveimur", "ef": "tveggja"}, - "hk": {"nf": "tvö", "þf": "tvö", "þgf": "tveimur", "ef": "tveggja"}, - }, - "þrjú": { - "kk": {"nf": "þrír", "þf": "þrjá", "þgf": "þremur", "ef": "þriggja"}, - "kvk": {"nf": "þrjár", "þf": "þrjár", "þgf": "þremur", "ef": "þriggja"}, - "hk": {"nf": "þrjú", "þf": "þrjú", "þgf": "þremur", "ef": "þriggja"}, - }, - "fjögur": { - "kk": {"nf": "fjórir", "þf": "fjóra", "þgf": "fjórum", "ef": "fjögurra"}, - "kvk": {"nf": "fjórar", "þf": "fjórar", "þgf": "fjórum", "ef": "fjögurra"}, - "hk": {"nf": "fjögur", "þf": "fjögur", "þgf": "fjórum", "ef": "fjögurra"}, - }, -} - -_LARGE_NUMBERS: Tuple[Tuple[int, str, str], ...] = ( - (10 ** 48, "oktilljón", "kvk"), - (10 ** 42, "septilljón", "kvk"), - (10 ** 36, "sextilljón", "kvk"), - (10 ** 30, "kvintilljón", "kvk"), - (10 ** 27, "kvaðrilljarð", "kk"), - (10 ** 24, "kvaðrilljón", "kvk"), - (10 ** 21, "trilljarð", "kk"), - (10 ** 18, "trilljón", "kvk"), - (10 ** 15, "billjarð", "kk"), - (10 ** 12, "billjón", "kvk"), - (10 ** 9, "milljarð", "kk"), - (10 ** 6, "milljón", "kvk"), -) - - -def number_to_neutral(n: int = 0, *, one_hundred: bool = False) -> str: - """ - Write integer out as neutral gender text in Icelandic. - Argument one_hundred specifies whether to add "eitt" before "hundrað". - Example: - number_to_neutral(1337) -> "eitt þúsund þrjú hundruð þrjátíu og sjö" - """ - n = int(n) - - if n == 0: - return "núll" - - text: List[str] = [] - # Make n positive while creating written number string - minus: str = "" - if n < 0: - minus = "mínus " - n = -n - - MILLION = 1000000 - THOUSAND = 1000 - - # Helper function to check whether a number should be prefixed with "og" - should_prepend_og: Callable[[int], bool] = ( - lambda x: x > 0 and int(str(x).rstrip("0")) < 20 - ) - - # Very large numbers - while n >= MILLION: - - large_num, isl_num, gender = 1, "", "" - for large_num, isl_num, gender in _LARGE_NUMBERS: - if large_num <= n: - break - - large_count, n = divmod(n, large_num) - - text.extend(number_to_neutral(large_count, one_hundred=True).split()) - - last = text[-1] - if gender == "kk": - # e.g. "milljarður" if last number ends with "eitt/einn" else "milljarðar" - isl_num += "ur" if last == "eitt" else "ar" - elif gender == "kvk": - # e.g. "milljón" if last number ends with "eitt/ein" else "milljónir" - if last != "eitt": - isl_num += "ir" - - if last in _NUM_NEUT_TO_DECL: - # Change "eitt" to "einn/ein" - text[-1] = _NUM_NEUT_TO_DECL[last][gender]["nf"] - - text.append(isl_num) - if should_prepend_og(n): - text.append("og") - - if THOUSAND <= n < MILLION: - thousands, n = divmod(n, THOUSAND) - - if thousands > 1: - text.extend(number_to_neutral(thousands, one_hundred=True).split()) - elif thousands == 1: - text.append("eitt") - - # Singular/Plural form of "þúsund" is the same - text.append("þúsund") - # Don't prepend 'og' in front of 110, 120, ..., 190 - if should_prepend_og(n) and n not in range(110, 200, 10): - text.append("og") - - if 100 <= n < THOUSAND: - hundreds, n = divmod(n, 100) - - if hundreds > 1: - text.extend(number_to_neutral(hundreds).split()) - # Note: don't need to fix singular here as e.g. - # 2100 gets interpreted as "tvö þúsund og eitt hundrað" - # instead of "tuttugu og eitt hundrað" - text.append("hundruð") - elif hundreds == 1: - if text or one_hundred: - # Add "eitt" before "hundrað" - # if not first number in text - # or if one_hundred is True - text.append("eitt") - text.append("hundrað") - - if should_prepend_og(n): - text.append("og") - - if 20 <= n < 100: - tens, digit = divmod(n, 10) - tens *= 10 - - text.append(_TENS_NEUTRAL[tens]) - if digit != 0: - text.append("og") - text.append(_SUB_20_NEUTRAL[digit]) - n = 0 - - if 0 < n < 20: - text.append(_SUB_20_NEUTRAL[n]) - n = 0 - - # Fix e.g. "milljónir milljarðar" -> "milljónir milljarða" - number_string: str = minus + re.sub( - r"(\S*(jónir|jarð[au]r?)) (\S*(jarð|jón))[ia]r", r"\1 \3a", " ".join(text) - ) - - return number_string - - -CaseType = Literal["nf", "þf", "þgf", "ef"] -GenderType = Literal["kk", "kvk", "hk"] -NumberType = Literal["et", "ft"] - - -def number_to_text( - n: Union[int, str], - *, - case: str = "nf", - gender: str = "hk", - one_hundred: bool = False -) -> str: - """ - Convert an integer into written Icelandic text in given case/gender. - Argument one_hundred specifies whether to add "eitt" before "hundrað". - Example: - 302 -> "þrjú hundruð og tvær" (gender="kvk") - 501 -> "fimm hundruð og einn" (gender="kk") - """ - if isinstance(n, str): - n = n.replace(".", "") - n = int(n) - nums = number_to_neutral(n, one_hundred=one_hundred).split() - - last = nums[-1] - if last in _NUM_NEUT_TO_DECL: - nums[-1] = _NUM_NEUT_TO_DECL[last][gender][case] - - return " ".join(nums) - - -def numbers_to_text( - s: str, - *, - regex: str = r"((? str: - """ - Converts numbers in string to Icelandic text. - (Can also be supplied with custom regex to match certain numbers) - Extra arguments specifies case/gender of number - and whether to add "eitt" before "hundrað". - """ - - def convert(m: Match[str]) -> str: - match = m.group(0) - n = int(match) - return number_to_text(n, case=case, gender=gender, one_hundred=one_hundred) - - return re.sub(regex, convert, s) - - -def float_to_text( - f: Union[float, str], - *, - case: str = "nf", - gender: str = "hk", - comma_null: bool = False, - one_hundred: bool = False -) -> str: - """ - Convert a float into written Icelandic text in given case/gender. - Argument one_hundred specifies whether to add "eitt" before "hundrað". - Example: - -0.02 -> "mínus núll komma núll tveir" (gender="kk") - """ - if isinstance(f, str): - if "," in f and "." in f: - # Remove Icelandic thousand markers - f = f.replace(".", "") - # Change Icelandic comma to period - f = f.replace(",", ".") - - f = float(f) - out_str: str = "" - # To prevent edge cases like -0.2 being translated to - # "núll komma tvö" instead of "mínus núll komma tvö" - if f < 0: - out_str = "mínus " - f = -f - - first, second = str(f).split(".") - - # Number before decimal point - out_str += number_to_text( - int(first), case=case, gender=gender, one_hundred=one_hundred - ) - - if not comma_null and second == "0": - # Skip "komma núll" if comma_null is False - return out_str - - out_str += " komma " - - if len(second.lstrip("0")) <= 2: - # e.g. 2,41 -> "tveimur komma fjörutíu og einum" - while second and second[0] == "0": - # e.g. 1,03 -> "einni komma núll tveimur" - # or 2,003 -> "tveimur komma núll núll þremur" - out_str += "núll " - second = second[1:] - if second: - out_str += number_to_text(int(second), case=case, gender=gender) - else: - if len(second) > 2: - # Only allow declension for two digits at most after decimal point - # Otherwise fall back to "nf" - case = "nf" - # Numbers after decimal point - for digit in second: - if digit == "0": - out_str += "núll " - else: - digit_str = _SUB_20_NEUTRAL.get(int(digit), "") - if digit_str in _NUM_NEUT_TO_DECL: - out_str += _NUM_NEUT_TO_DECL[digit_str][gender][case] - else: - out_str += digit_str - out_str += " " - - return out_str.rstrip() - - -def floats_to_text( - s: str, - *, - regex: str = r"((? str: - """ - Converts floats of the form '14.022,14', '0,42' (with Icelandic comma) - (or matching custom regex if provided) - in string to Icelandic text. - Extra arguments specifies case/gender of float, - whether to read after decimal point if fractional part is zero - and whether to add "eitt" before "hundrað". - """ - - def convert(m: Match[str]) -> str: - match = m.group(0) - n = float(match.replace(".", "").replace(",", ".")) - return float_to_text( - n, case=case, gender=gender, comma_null=comma_null, one_hundred=one_hundred - ) - - return re.sub(regex, convert, s) - - -def year_to_text(year: Union[int, str]) -> str: - """ - Write year as text in Icelandic. - Negative years automatically append "fyrir Krist" to the text. - """ - year = int(year) - suffix: str = "" - text: List[str] = [] - - if year < 0: - suffix = " fyrir Krist" - year = -year - - # People say e.g. "nítján hundruð þrjátíu og tvö" - # instead of "eitt þúsund níu hundruð þrjátíu og tvö" - # for years between 1100-2000 - if 1100 <= year < 2000: - hundreds, digits = divmod(year, 100) - - text.append(_SUB_20_NEUTRAL[hundreds]) - text.append("hundruð") - if digits > 0: - if digits in _SUB_20_NEUTRAL or digits in _TENS_NEUTRAL: - text.append("og") - text.append(number_to_neutral(digits)) - - # Other years are spoken like regular numbers - else: - text.append(number_to_neutral(year)) - - return " ".join(text) + suffix - - -def years_to_text( - s: str, *, regex: Optional[str] = None, allow_three_digits: bool = False -) -> str: - """ - Converts numbers in string matching the regex - to text as spoken Icelandic year. - """ - - if regex is None: - if allow_three_digits: - # Use a regex that matches 3-4 digit numbers but does a lookahead - # to not match numbers that are followed by a decimal point and a digit - regex = r"\b\d{3,4}(?![\.,]\d)\b" - else: - regex = r"\b\d{4}(?![\.,]\d)\b" - - def convert(m: Match[str]) -> str: - match = m.group(0) - n = int(match) - # Don't interpret numbers lower than 850 or higher than 2200 as years - return year_to_text(n) if 850 < n < 2200 else match - - return re.sub(regex, convert, s) - - -_SUB_20_NEUT_TO_ORDINAL: Mapping[str, str] = { - "eitt": "fyrst", - # 2 is a special case - "þrjú": "þriðj", - "fjögur": "fjórð", - "fimm": "fimmt", - "sex": "sjött", - "sjö": "sjöund", - "átta": "áttund", - "níu": "níund", - "tíu": "tíund", - "ellefu": "elleft", - "tólf": "tólft", - "þrettán": "þrettánd", - "fjórtán": "fjórtánd", - "fimmtán": "fimmtánd", - "sextán": "sextánd", - "sautján": "sautjánd", - "átján": "átjánd", - "nítján": "nítjánd", -} - -_ANNAR_TABLE: _DeclensionMapping = { - "et": { - "kk": {"nf": "annar", "þf": "annan", "þgf": "öðrum", "ef": "annars",}, - "kvk": {"nf": "önnur", "þf": "aðra", "þgf": "annarri", "ef": "annarrar",}, - "hk": {"nf": "annað", "þf": "annað", "þgf": "öðru", "ef": "annars",}, - }, - "ft": { - "kk": {"nf": "aðrir", "þf": "aðra", "þgf": "öðrum", "ef": "annarra",}, - "kvk": {"nf": "aðrar", "þf": "aðrar", "þgf": "öðrum", "ef": "annarra",}, - "hk": {"nf": "önnur", "þf": "önnur", "þgf": "öðrum", "ef": "annarra",}, - }, -} - -_SuffixMapping = Mapping[str, Mapping[str, str]] -_SUB_20_ORDINAL_SUFFIX: _SuffixMapping = { - "kk": {"nf": "i", "þf": "a", "þgf": "a", "ef": "a",}, - "kvk": {"nf": "a", "þf": "u", "þgf": "u", "ef": "u",}, - "hk": {"nf": "a", "þf": "a", "þgf": "a", "ef": "a",}, -} - -_TENS_NEUT_TO_ORDINAL: Mapping[str, str] = { - "tuttugu": "tuttug", - "þrjátíu": "þrítug", - "fjörutíu": "fertug", - "fimmtíu": "fimmtug", - "sextíu": "sextug", - "sjötíu": "sjötug", - "áttatíu": "átttug", - "níutíu": "nítug", -} - -_LARGE_ORDINAL_SUFFIX: _SuffixMapping = { - "kk": {"nf": "asti", "þf": "asta", "þgf": "asta", "ef": "asta",}, - "kvk": {"nf": "asta", "þf": "ustu", "þgf": "ustu", "ef": "ustu",}, - "hk": {"nf": "asta", "þf": "asta", "þgf": "asta", "ef": "asta",}, -} - - -def _num_to_ordinal( - word: str, - case: str = "nf", - gender: str = "kk", - number: str = "et", -) -> str: - """ - Helper function. Changes one part of a number (in written form) to ordinal form - in correct case, gender and number. - Example: - "hundruð" -> "hundraðasti" (default args) - "tvö" -> "aðrar" (þf, kvk, ft) - """ - if word == "núll": - word = "núllt" + _SUB_20_ORDINAL_SUFFIX[gender][case] - - elif word == "tvö": - word = _ANNAR_TABLE[number][gender][case] - - elif word in _SUB_20_NEUT_TO_ORDINAL: - word = _SUB_20_NEUT_TO_ORDINAL.get(word, word) - if number == "ft": - word += "u" - else: - word += _SUB_20_ORDINAL_SUFFIX[gender][case] - - elif word in _TENS_NEUT_TO_ORDINAL: - word = _TENS_NEUT_TO_ORDINAL.get(word, word) - if number == "ft": - word += "ustu" - else: - word += _LARGE_ORDINAL_SUFFIX[gender][case] - - elif word.startswith("hundr"): - if number == "ft" or (gender == "kvk" and case != "nf"): - word = "hundruðustu" - else: - word = "hundrað" + _LARGE_ORDINAL_SUFFIX[gender][case] - - elif word == "þúsund": - if number == "ft" or (gender == "kvk" and case != "nf"): - word = "þúsundustu" - else: - word = "þúsund" + _LARGE_ORDINAL_SUFFIX[gender][case] - - elif "jón" in word: - if number == "ft": - word = re.sub(r"(\S*jón)\S*", r"\1ustu", word) - else: - word = re.sub( - r"(\S*jón)\S*", r"\1" + _LARGE_ORDINAL_SUFFIX[gender][case], word - ) - - elif "jarð" in word: - if number == "ft" or (gender == "kvk" and case != "nf"): - word = re.sub(r"(\S*)jarð\S*", r"\1jörðustu", word) - else: - word = re.sub( - r"(\S*jarð)\S*", r"\1" + _LARGE_ORDINAL_SUFFIX[gender][case], word - ) - - return word - - -def neutral_text_to_ordinal( - s: str, - *, - case: str = "nf", - gender: str = "kk", - number: str = "et" -) -> str: - """ - Takes Icelandic text representation of number - and returns it as an ordinal in specified case (nf, þf, þgf, ef), - gender (kk, kvk, hk) and number (et, ft). - """ - if len(s) == 0: - return s - - ordinal: List[str] = s.split() - - # Change last word to ordinal - ordinal[-1] = _num_to_ordinal(ordinal[-1], case, gender, number) - - if len(ordinal) > 1: - # Change e.g. "tvö þúsund og fyrsti" -> "tvö þúsundasti og fyrsti" - if ordinal[-2] == "og" and len(ordinal) >= 3: - # Check that last number in text isn't a large ordinal - # e.g. "sextugustu", "hundraðasti" or "þúsundasta" - if not re.search(r"[au]st[iau]$", ordinal[-1]): - ordinal[-3] = _num_to_ordinal(ordinal[-3], case, gender, number) - - ordinal_str: str = " ".join(ordinal) - - # Change e.g. - # "eitt hundraðasti" -> "hundraðasti" - # "ein milljónasta og fyrsta" -> "milljónasta og fyrsta" - ordinal_str = re.sub(r"^(einn?|eitt) ((\S*)([au]st[iau]))", r"\2", ordinal_str) - - return ordinal_str - - -def number_to_ordinal( - n: Union[int, str], - *, - case: str = "nf", - gender: str = "kk", - number: str = "et" -) -> str: - """ - Takes number and returns it as an ordinal - in specified case (nf, þf, þgf, ef), - gender (kk, kvk, hk) and number (et, ft). - """ - if isinstance(n, str): - n = int(n.rstrip(".")) - return neutral_text_to_ordinal( - number_to_neutral(n), case=case, gender=gender, number=number - ) - - -def numbers_to_ordinal( - s: str, - *, - regex: Optional[str] = None, - case: str = "nf", - gender: str = "kk", - number: str = "et" -) -> str: - """ - Converts ordinals of the form '2.', '101.' - (or matching regex if provided) - in string to Icelandic text. - Extra arguments specify case, gender and number. - """ - - if regex is None: - # Match ordinals of the form '2.', '101.' - regex = r"((? str: - match = m.group(0) - n = int(match.strip(".")) - return number_to_ordinal(n, case=case, gender=gender, number=number) - - return re.sub(regex, convert, s) - - -_DIGITS_TO_KK: Mapping[str, str] = { - "0": "núll", - "1": "einn", - "2": "tveir", - "3": "þrír", - "4": "fjórir", - "5": "fimm", - "6": "sex", - "7": "sjö", - "8": "átta", - "9": "níu", -} - - -def digits_to_text(s: str, *, regex: str = r"\b\d+") -> str: - """ - Converts digits in string to Icelandic text. - Useful for phone numbers, social security numbers and such. - Can also supply custom regex to match only certain numbers. - Examples: - "5885522" -> "fimm átta átta fimm fimm tveir tveir" - "Síminn minn er 581-2345" -> "Síminn minn er fimm átta einn-tveir þrír fjórir fimm" - """ - - def convert(m: Match[str]) -> str: - match = m.group(0).replace("-", "") - return "".join( - _DIGITS_TO_KK[letter] + " " if letter.isdecimal() else letter - for letter in match - ).rstrip() - - return re.sub(regex, convert, s) - - -_ROMAN_NUMERALS: Mapping[str, int] = { - "I": 1, - "V": 5, - "X": 10, - "L": 50, - "C": 100, - "D": 500, - "M": 1000, -} - - -def _roman_numeral_to_int(n: str) -> int: - """ - Helper function, changes a correct roman numeral to an integer. - Source: https://stackoverflow.com/a/52426119 - """ - nums = [_ROMAN_NUMERALS[i] for i in n.upper() if i in _ROMAN_NUMERALS] - return sum( - val if val >= nums[min(i + 1, len(n) - 1)] else -val - for i, val in enumerate(nums) - ) - - -def roman_numeral_to_ordinal( - n: str, - *, - case: str = "nf", - gender: str = "kk", - number: str = "et" -): - """ - Change a roman numeral into a written Icelandic ordinal. - Example: - "III" -> "þriðji" - "MMXXII" -> "tvö þúsund tuttugasti og annar" - """ - return number_to_ordinal( - _roman_numeral_to_int(n), case=case, gender=gender, number=number, - ) diff --git a/speech/voices/aws_polly.py b/speech/voices/aws_polly.py deleted file mode 100755 index e943e28a..00000000 --- a/speech/voices/aws_polly.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python -""" - - Greynir: Natural language processing for Icelandic - - Copyright (C) 2023 Miðeind ehf. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. - - - Icelandic-language text to speech via Amazon Polly. - -""" - -from typing import Optional, Any, cast - -import json -import logging -from threading import Lock - -import requests -import cachetools -import boto3 # type: ignore -from botocore.exceptions import ClientError # type: ignore - -from utility import RESOURCES_DIR - - -NAME = "Amazon Polly" -VOICES = frozenset(("Karl", "Dora")) -AUDIO_FORMATS = frozenset(("mp3", "pcm", "ogg_vorbis")) - -# The AWS Polly API access keys -# You must obtain your own keys if you want to use this code -# JSON format is the following: -# { -# "aws_access_key_id": "my_key", -# "aws_secret_access_key": "my_secret", -# "region_name": "my_region" -# } -# -_AWS_KEYFILE_NAME = "AWSPollyServerKey.json" -_AWS_API_KEYS_PATH = str(RESOURCES_DIR / _AWS_KEYFILE_NAME) - - -_aws_api_client: Optional[boto3.Session] = None -_aws_api_client_lock = Lock() - - -def _initialize_aws_client() -> Optional[boto3.Session]: - """Set up AWS Polly client.""" - global _api_client - - # Make sure that only one thread is messing with the global variable - with _aws_api_client_lock: - if _aws_api_client is None: - # Read AWS Polly API keys from file - aws_config = {} - try: - with open(_AWS_API_KEYS_PATH) as json_file: - aws_config = json.load(json_file) - except Exception as e: - logging.warning(f"Unable to read AWS Polly credentials: {e}") - return None - _api_client = boto3.Session(**aws_config).client("polly") - # Return client instance - return _api_client # type: ignore - - -# Time to live (in seconds) for synthesized text URL caching -# Add a safe 30 second margin to ensure that clients are never provided with an -# audio URL that is just about to expire and might do so before playback starts. -_AWS_URL_TTL = 600 # 10 mins in seconds -_AWS_CACHE_TTL = _AWS_URL_TTL - 30 # seconds -_AWS_CACHE_MAXITEMS = 30 - - -@cachetools.cached(cachetools.TTLCache(_AWS_CACHE_MAXITEMS, _AWS_CACHE_TTL)) -def text_to_audio_url( - text: str, - text_format: str, - audio_format: str, - voice_id: Optional[str], - speed: float = 1.0, -) -> Optional[str]: - """Returns Amazon Polly URL to audio file with speech-synthesized text.""" - - assert voice_id in VOICES - assert audio_format in AUDIO_FORMATS - - # Set up client lazily - client = _initialize_aws_client() - if client is None: - logging.warning("Unable to instantiate AWS client") - return None - - if audio_format not in AUDIO_FORMATS: - logging.warn( - f"Unsupported audio format for Amazon Polly speech synthesis: {audio_format}." - " Falling back to mp3" - ) - audio_format = "mp3" - - # Special preprocessing for SSML markup - if text_format == "ssml": - # Adjust voice speed as appropriate - if speed != 1.0: - perc = int(speed * 100) - text = f'{text}' - # Wrap text in the required tag - if not text.startswith(""): - text = f"{text}" - - # Configure query string parameters for AWS request - params = { - # The text to synthesize - "Text": text, - # mp3 | ogg_vorbis | pcm - "OutputFormat": audio_format, - # Dora or Karl - "VoiceId": voice_id, - # Valid values for mp3 and ogg_vorbis are "8000", "16000", and "22050". - # The default value is "22050". - "SampleRate": "16000", - # Either "text" or "ssml" - "TextType": text_format, - # Only required for bilingual voices - # "LanguageCode": "is-IS" - } - - try: - url = cast(Any, client).generate_presigned_url( - ClientMethod="synthesize_speech", - Params=params, - ExpiresIn=_AWS_URL_TTL, - HttpMethod="GET", - ) - except ClientError as e: - logging.error(e) - return None - - return url - - -def text_to_audio_data( - text: str, - text_format: str, - audio_format: str, - voice_id: str, - speed: float, -) -> Optional[bytes]: - """Returns audio data for speech-synthesized text.""" - url = text_to_audio_url(**locals()) - if not url: - return None - try: - r = requests.get(url, timeout=10) - return r.content - except Exception as e: - logging.error(f"Error fetching URL {url}: {e}") - return None diff --git a/speech/voices/azure.py b/speech/voices/azure.py deleted file mode 100755 index fa23c4be..00000000 --- a/speech/voices/azure.py +++ /dev/null @@ -1,300 +0,0 @@ -#!/usr/bin/env python -""" - - Greynir: Natural language processing for Icelandic - - Copyright (C) 2023 Miðeind ehf. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. - - - Icelandic-language text to speech via the MS Azure Speech API. - -""" - -from typing import Optional, Tuple - -import logging -import json -import uuid -import pathlib - -import azure.cognitiveservices.speech as speechsdk - -from . import AUDIO_SCRATCH_DIR -from utility import RESOURCES_DIR -from speech.trans import DefaultTranscriber, strip_markup -from speech.voices import suffix_for_audiofmt - - -NAME = "Azure Cognitive Services" -AUDIO_FORMATS = frozenset(("mp3", "pcm", "opus")) -_VOICE_TO_ID = { - # Icelandic - "Gudrun": "is-IS-GudrunNeural", - "Gunnar": "is-IS-GunnarNeural", - # English (UK) - "Abbi": "en-GB-AbbiNeural", - "Alfie": "en-GB-AlfieNeural", - # English (US) - "Jenny": "en-US-JennyNeural", - "Brandon": "en-US-BrandonNeural", - # French - "Brigitte": "fr-FR-BrigitteNeural", - "Alain": "fr-FR-AlainNeural", - # German - "Amala": "de-DE-AmalaNeural", - # Danish - "Christel": "da-DK-ChristelNeural", - "Jeppe": "da-DK-JeppeNeural", - # Swedish - "Sofie": "sv-SE-SofieNeural", - "Mattias": "sv-SE-MattiasNeural", - # Norwegian - "Finn": "nb-NO-FinnNeural", - "Iselin": "nb-NO-IselinNeural", - # Spanish - "Abril": "es-ES-AbrilNeural", - "Alvaro": "es-ES-AlvaroNeural", - # Polish - "Agnieszka": "pl-PL-AgnieszkaNeural", - "Marek": "pl-PL-MarekNeural", - # Many more voices available, see: - # https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support -} -VOICES = frozenset(_VOICE_TO_ID.keys()) -_DEFAULT_VOICE_ID = "is-IS-GudrunNeural" - -# The Azure Speech API access key -# You must obtain your own key if you want to use this code -# JSON format is the following: -# { -# "key": ""my_key", -# "region": "my_region", -# } -# -_AZURE_KEYFILE_NAME = "AzureSpeechServerKey.json" - -_AZURE_API_KEY_PATH = str(RESOURCES_DIR / _AZURE_KEYFILE_NAME) - -_AZURE_API_KEY = "" -_AZURE_API_REGION = "" - - -def _azure_api_key() -> Tuple[str, str]: - """Lazy-load API key and region from JSON and return as tuple.""" - global _AZURE_API_KEY - global _AZURE_API_REGION - - if _AZURE_API_KEY and _AZURE_API_REGION: - return (_AZURE_API_KEY, _AZURE_API_REGION) - - try: - with open(_AZURE_API_KEY_PATH) as json_file: - js = json.load(json_file) - _AZURE_API_KEY = js["key"] - _AZURE_API_REGION = js["region"] - except Exception as e: - logging.warning(f"Unable to read Azure Speech API credentials: {e}") - - return (_AZURE_API_KEY, _AZURE_API_REGION) - - -def _synthesize_text( - text: str, - text_format: str, - audio_format: str, - voice_id: str, - speed: float = 1.0, - **kwargs, -) -> Optional[str]: - """Synthesizes text via Azure and returns path to generated audio file.""" - - if audio_format not in AUDIO_FORMATS: - logging.warn( - f"Unsupported audio format for Azure speech synthesis: {audio_format}." - " Falling back to mp3" - ) - audio_format = "mp3" - - # Audio format enums for Azure Speech API - # https://learn.microsoft.com/en-us/javascript/api/microsoft-cognitiveservices-speech-sdk/speechsynthesisoutputformat - aof = speechsdk.SpeechSynthesisOutputFormat - fmt2enum = { - "mp3": aof.Audio16Khz32KBitRateMonoMp3, - "pcm": aof.Raw16Khz16BitMonoPcm, - "opus": aof.Ogg16Khz16BitMonoOpus, - } - - try: - # Configure speech synthesis - (key, region) = _azure_api_key() - speech_config = speechsdk.SpeechConfig(subscription=key, region=region) - azure_voice_id = _VOICE_TO_ID.get(voice_id) or _DEFAULT_VOICE_ID - speech_config.speech_synthesis_voice_name = azure_voice_id - fmt = fmt2enum.get(audio_format, aof.Audio16Khz32KBitRateMonoMp3) - speech_config.set_speech_synthesis_output_format(fmt) - - # Generate a unique filename for the audio output file - suffix = suffix_for_audiofmt(audio_format) - out_fn: str = str(AUDIO_SCRATCH_DIR / f"{uuid.uuid4()}.{suffix}") - audio_config = speechsdk.audio.AudioOutputConfig(filename=out_fn) # type: ignore - - # Init synthesizer - synthesizer = speechsdk.SpeechSynthesizer( - speech_config=speech_config, audio_config=audio_config - ) - - # Azure Speech API supports SSML but the notation is a bit different from Amazon Polly's - # See https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup - if text_format == "ssml": - # Adjust speed - if speed != 1.0: - text = f'{text}' - # Wrap text in the required and tags - text = f""" - - - {text} - - """.strip() - speak_fn = synthesizer.speak_ssml - else: - # We're not sending SSML so strip any markup from text - text = strip_markup(text) - speak_fn = synthesizer.speak_text - - # Feed text into speech synthesizer - result = speak_fn(text) - - # Check result - if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted: - # Return path to generated audio file - assert pathlib.Path(out_fn).exists() - return out_fn - elif result.reason == speechsdk.ResultReason.Canceled: - cancellation_details = result.cancellation_details - logging.error(f"Speech synthesis canceled: {cancellation_details.reason}") - if cancellation_details.reason == speechsdk.CancellationReason.Error: - logging.error(f"Azure TTS error: {cancellation_details.error_details}") - except Exception as e: - logging.error(f"Error communicating with Azure Speech API: {e}") - - -def text_to_audio_data( - text: str, - text_format: str, - audio_format: str, - voice_id: str, - speed: float = 1.0, -) -> Optional[bytes]: - """Feeds text to Azure Speech API and returns audio data received from server.""" - audio_file_path = _synthesize_text(**locals()) - if audio_file_path: - try: - # Read audio data from file and return it - with open(audio_file_path, "rb") as f: - audio_data = f.read() - return audio_data - except Exception as e: - logging.error( - f"Azure: Error reading synthesized audio file {audio_file_path}: {e}" - ) - return None - - -def text_to_audio_url( - text: str, - text_format: str, - audio_format: str, - voice_id: str, - speed: float = 1.0, -) -> Optional[str]: - """Returns URL for speech-synthesized text.""" - - audio_file_path = _synthesize_text(**locals()) - if audio_file_path: - # Generate and return file:// URL to audio file - url = pathlib.Path(audio_file_path).as_uri() - return url - return None - - # Old method returned data URI - # data = text_to_audio_data(**locals()) - # if not data: - # return None - # # Generate Data URI from the bytes received - # mime_type = mimetype_for_audiofmt(audio_format) - # data_uri = generate_data_uri(data, mime_type=mime_type) - # return data_uri - - -class Transcriber(DefaultTranscriber): - """ - Transcription handler class, - specific to the Azure voice engine. - """ - - # Override some character pronunciations during - # transcription (custom for this voice) - _CHAR_PRONUNCIATION = { - **DefaultTranscriber._CHAR_PRONUNCIATION, - "b": "bjé", - "c": "sjé", - "d": "djé", - "ð": "eeð", - "e": "eeh", - "é": "jé", - "g": "gjéé", - "i": "ii", - "j": "íoð", - "o": "úa", - "ó": "oú", - "u": "uu", - "r": "errr", - "t": "tjéé", - "ú": "úúu", - "ý": "ufsilon íí", - "þ": "þodn", - "æ": "æí", - "ö": "öö", - } - - # Weird entity pronunciations can be added here - # when they're encountered - _ENTITY_PRONUNCIATIONS = { - **DefaultTranscriber._ENTITY_PRONUNCIATIONS, - "BYKO": "Býkó", - "ELKO": "Elkó", - "FIDE": "fídeh", - "FIFA": "fííffah", - "GIRL": "görl", - "LEGO": "llegó", - "MIT": "emm æí tíí", - "NEW": "njúú", - "NOVA": "Nóva", - "PLUS": "plöss", - "SHAH": "Sjah", - "TIME": "tæm", - "UEFA": "júei fa", - "UENO": "júeenó", - "UKIP": "júkipp", - "VISA": "vísa", - "XBOX": "ex box", - } - - # Override some weird name pronunciations - _PERSON_PRONUNCIATION = { - "Joe": "Djó", - "Biden": "Bæden", - } diff --git a/speech/voices/google.py b/speech/voices/google.py deleted file mode 100755 index cd50994d..00000000 --- a/speech/voices/google.py +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env python -""" - - Greynir: Natural language processing for Icelandic - - Copyright (C) 2023 Miðeind ehf. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. - - - Icelandic-language text to speech via the Google Cloud API. - -""" - -NAME = "Google" -VOICES = frozenset(()) -AUDIO_FORMATS = frozenset(("mp3")) - - -# from typing import Optional - -# import logging -# import uuid -# from pathlib import Path - -# from google.cloud import texttospeech - -# from . import AUDIO_SCRATCH_DIR, suffix_for_audiofmt - - -# NAME = "Google" -# VOICES = frozenset(("Anna",)) -# AUDIO_FORMATS = frozenset(("mp3")) - - -# def text_to_audio_data( -# text: str, -# text_format: str, -# audio_format: str, -# voice_id: str, -# speed: float = 1.0, -# ) -> Optional[bytes]: -# """Feeds text to Google's TTS API and returns audio data received from server.""" - -# # Instantiates a client -# client = texttospeech.TextToSpeechClient() - -# # Set the text input to be synthesized -# synthesis_input = texttospeech.SynthesisInput(text=text) - -# # Build the voice request, select the language code -# # and the SSML voice gender. -# voice = texttospeech.VoiceSelectionParams( -# language_code="is-IS", ssml_gender=texttospeech.SsmlVoiceGender.FEMALE -# ) - -# # Select the type of audio file you want returned. -# # We only support mp3 for now. -# audio_config = texttospeech.AudioConfig( -# audio_encoding=texttospeech.AudioEncoding.MP3 -# ) - -# try: -# # Perform the text-to-speech request on the text input -# # with the selected voice parameters and audio file type. -# response = client.synthesize_speech( -# input=synthesis_input, voice=voice, audio_config=audio_config -# ) -# return response.audio_content -# except Exception as e: -# logging.error(f"Error communicating with Google Cloud STT API: {e}") - - -# def text_to_audio_url( -# text: str, -# text_format: str, -# audio_format: str, -# voice_id: str, -# speed: float = 1.0, -# ) -> Optional[str]: -# """Returns URL for speech-synthesized text.""" - -# data = text_to_audio_data(**locals()) -# if not data: -# return None - -# suffix = suffix_for_audiofmt(audio_format) -# out_fn: str = str(AUDIO_SCRATCH_DIR / f"{uuid.uuid4()}.{suffix}") -# try: -# with open(out_fn, "wb") as f: -# f.write(data) -# except Exception as e: -# logging.error(f"Error writing audio file {out_fn}: {e}") -# return None - -# # Generate and return file:// URL to audio file -# url = Path(out_fn).as_uri() -# return url diff --git a/speech/voices/tiro.py b/speech/voices/tiro.py deleted file mode 100755 index 42404a38..00000000 --- a/speech/voices/tiro.py +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env python -""" - - Greynir: Natural language processing for Icelandic - - Copyright (C) 2023 Miðeind ehf. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. - - - Icelandic-language text to speech via Tiro's text to speech API. - -""" - -from typing import Optional - -import logging -import uuid -from pathlib import Path - -import requests - -from . import AUDIO_SCRATCH_DIR, suffix_for_audiofmt -from speech.trans import strip_markup - -NAME = "Tiro" -VOICES = frozenset(("Alfur", "Dilja", "Bjartur", "Rosa", "Alfur_v2", "Dilja_v2")) -AUDIO_FORMATS = frozenset(("mp3", "pcm", "ogg_vorbis")) - - -_TIRO_TTS_URL = "https://tts.tiro.is/v0/speech" - - -def text_to_audio_data( - text: str, - text_format: str, - audio_format: str, - voice_id: str, - speed: float = 1.0, -) -> Optional[bytes]: - """Feeds text to Tiro's TTS API and returns audio data received from server.""" - - # Tiro's API supports a subset of SSML tags - # See https://tts.tiro.is/#tag/speech/paths/~1v0~1speech/post - # However, for now, we just strip all markup - text = strip_markup(text) - text_format = "text" - - if audio_format not in AUDIO_FORMATS: - logging.warn( - f"Unsupported audio format for Tiro speech synthesis: {audio_format}." - " Falling back to mp3" - ) - audio_format = "mp3" - - jdict = { - "Engine": "standard", - "LanguageCode": "is-IS", - "OutputFormat": audio_format, - "SampleRate": "16000", - "Text": text, - "TextType": text_format, - "VoiceId": voice_id, - } - - try: - r = requests.post(_TIRO_TTS_URL, json=jdict, timeout=10) - if r.status_code != 200: - raise Exception( - f"Received HTTP status code {r.status_code} from {NAME} server" - ) - return r.content - except Exception as e: - logging.error(f"Error communicating with Tiro API at {_TIRO_TTS_URL}: {e}") - - -def text_to_audio_url( - text: str, - text_format: str, - audio_format: str, - voice_id: str, - speed: float = 1.0, -) -> Optional[str]: - """Returns URL for speech-synthesized text.""" - - data = text_to_audio_data(**locals()) - if not data: - return None - - suffix = suffix_for_audiofmt(audio_format) - out_fn: str = str(AUDIO_SCRATCH_DIR / f"{uuid.uuid4()}.{suffix}") - try: - with open(out_fn, "wb") as f: - f.write(data) - except Exception as e: - logging.error(f"Error writing audio file {out_fn}: {e}") - return None - - # Generate and return file:// URL to audio file - url = Path(out_fn).as_uri() - return url diff --git a/tests/test_queries.py b/tests/test_queries.py index cab7263d..93b8376d 100755 --- a/tests/test_queries.py +++ b/tests/test_queries.py @@ -47,7 +47,7 @@ from db.models import Query, QueryClientData # , QueryLog from queries import ResponseDict from utility import read_txt_api_key -from speech.trans import strip_markup +from icespeak.transcribe import strip_markup from utility import QUERIES_RESOURCES_DIR diff --git a/tests/test_speech.py b/tests/test_speech.py deleted file mode 100755 index 8ba3dc37..00000000 --- a/tests/test_speech.py +++ /dev/null @@ -1,1016 +0,0 @@ -""" - - Greynir: Natural language processing for Icelandic - - Copyright (C) 2023 Miðeind ehf. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see http://www.gnu.org/licenses/. - - - Tests for speech-synthesis-related code in the Greynir repo. - -""" -from typing import Callable - -import os -import re -import sys -import datetime -import pytest -from pathlib import Path -from itertools import product - -import requests - -# Shenanigans to enable Pytest to discover modules in the -# main workspace directory (the parent of /tests) -basepath, _ = os.path.split(os.path.realpath(__file__)) -mainpath = os.path.join(basepath, "..") -if mainpath not in sys.path: - sys.path.insert(0, mainpath) - -from speech import text_to_audio_url -from speech.trans import DefaultTranscriber as DT -from utility import read_json_api_key - -# TODO: remove these tests once icespeak is released - - -def has_azure_api_key() -> bool: - return read_json_api_key("AzureSpeechServerKey") != {} - - -def has_aws_api_key() -> bool: - return read_json_api_key("AWSPollyServerKey") != {} - - -def test_voices_utils(): - """Test utility functions in speech.voices.""" - from speech.trans import strip_markup - from speech.voices import ( - mimetype_for_audiofmt, - suffix_for_audiofmt, - generate_data_uri, - ) - - assert mimetype_for_audiofmt("mp3") == "audio/mpeg" - assert mimetype_for_audiofmt("blergh") == "application/octet-stream" - - assert suffix_for_audiofmt("mp3") == "mp3" - assert suffix_for_audiofmt("blergh") == "data" - - assert strip_markup("hello") == "hello" - assert strip_markup("hello") == "hello" - assert strip_markup("hello") == "hello" - assert strip_markup("hello") == "hello" - - assert ( - generate_data_uri(b"hello") == "data:application/octet-stream;base64,aGVsbG8=" - ) - assert ( - generate_data_uri(b"hello", mime_type="text/plain") - == "data:text/plain;base64,aGVsbG8=" - ) - - -@pytest.mark.skipif(not has_aws_api_key(), reason="No AWS Polly API key found") -def test_speech_synthesis_aws(): - """Test basic speech synthesis functionality with AWS Polly.""" - - _TEXT = "Prufa" - _MIN_AUDIO_SIZE = 1000 - - # Test AWS Polly - url = text_to_audio_url( - text=_TEXT, - text_format="text", - audio_format="mp3", - voice_id="Dora", - ) - assert url and url.startswith("http") - r = requests.get(url, timeout=10) - assert r.headers.get("Content-Type") == "audio/mpeg", "Expected MP3 audio data" - assert len(r.content) > _MIN_AUDIO_SIZE, "Expected longer audio data" - - -@pytest.mark.skipif( - not has_azure_api_key(), - reason="No Azure Speech API key found", -) -def test_speech_synthesis_azure(): - """ - Test basic speech synthesis functionality with Azure Cognitive Services. - """ - - _TEXT = "Prufa" - _MIN_AUDIO_SIZE = 1000 - - # Test Azure Cognitive Services - url = text_to_audio_url( - text=_TEXT, - text_format="text", - audio_format="mp3", - voice_id="Gudrun", - ) - assert url and url.startswith("file://") and url.endswith(".mp3") - path_str = url[7:] - path = Path(path_str) - assert path.is_file(), "Expected audio file to exist" - assert path.stat().st_size > _MIN_AUDIO_SIZE, "Expected longer audio data" - path.unlink() - - -def test_gssml(): - from speech.trans import gssml - - gv = gssml("5", type="number") - assert gv == '5' - gv = gssml(type="vbreak") - assert gv == '' - gv = gssml(type="vbreak", strength="medium") - assert gv == '' - gv = gssml("whatever", type="misc", a="1", b=3, c=4.5) - assert gv == 'whatever' - try: - gssml("something", no_type_arg="hello") # type: ignore - assert False, "gssml should raise error if no type arg specified" - except: - pass - - -def test_greynirssmlparser(): - from speech import GreynirSSMLParser, DEFAULT_VOICE, SUPPORTED_VOICES - from speech.trans import gssml - - gp = GreynirSSMLParser(DEFAULT_VOICE) - n = gp.transcribe(f"Ég vel töluna {gssml(244, type='number', gender='kk')}") - assert "tvö hundruð fjörutíu og fjórir" in n - n = gp.transcribe( - f"{gssml(type='vbreak')} {gssml(3, type='number', gender='kk', case='þf')}" - ) - assert "" in n and "þrjá" in n - - example_data = { - "number": "1", - "numbers": "1 2 3", - "float": "1.0", - "floats": "1.0 2.3", - "ordinal": "1", - "ordinals": "1., 3., 4.", - "phone": "5885522", - "time": "12:31", - "date": "2000-01-01", - "year": "1999", - "years": "1999, 2000 og 2021", - "abbrev": "t.d.", - "spell": "SÍBS", - "vbreak": None, - "email": "t@olvupostur.rugl", - "paragraph": "lítil efnisgrein", - "sentence": "lítil setning eða málsgrein?", - } - - for t, v in DT.__dict__.items(): - if t not in example_data: - continue - assert isinstance( - v, (staticmethod, classmethod) - ), "not valid transcription method name" - d = example_data[t] - if d is None: - # No data argument to gssml - r = f"hér er {gssml(type=t)} texti" - # Make sure gssml added tag - assert "" in r - else: - r = f"hér er {gssml(d, type=t)} texti" - # Make sure gssml added tags - assert " tags - assert ", - # nothing easy we can do to fix that - x = """ bla bla fad fda""" - n = gp.transcribe(x) - assert "&" not in n and "<" not in n and ">" not in n - assert len(n) > 0 - # We strip spaces from the names of endtags, - # but otherwise try to keep unrecognized tags unmodified - x = """""" - n = gp.transcribe(x) - assert "&" not in n and "<" not in n and ">" not in n - assert n == "" - - x = """ 4""" - n = gp.transcribe(x) - assert "&" not in n and n.count("<") == 1 and n.count(">") == 1 - assert n == """ fjórar""" - - x = """ 4""" - n = gp.transcribe(x) - assert "&" not in n and n.count("<") == 1 and n.count(">") == 1 - assert n == """ fjórar""" - - x = """ </4>""" - n = gp.transcribe(x) - assert "&" not in n and n.count("<") == 1 and n.count(">") == 1 - - # ------------------------- - # Test voice engine specific transcription - - assert "Dora" in SUPPORTED_VOICES - # Gudrun, the default voice, and Dora don't spell things the same - gp2 = GreynirSSMLParser("Dora") - alphabet = "aábcdðeéfghiíjklmnoópqrstuúvwxyýþæöz" - n1 = gp.transcribe(gssml(alphabet, type="spell")) - n2 = gp2.transcribe(gssml(alphabet, type="spell")) - assert n1 != n2 - - -def test_number_transcription() -> None: - """Test number handling functionality in queries""" - - from speech.trans.num import ( - number_to_neutral, - number_to_text, - numbers_to_text, - ) - - assert number_to_neutral(2) == "tvö" - assert number_to_neutral(1100) == "eitt þúsund og eitt hundrað" - assert ( - number_to_neutral(-42178249) - == "mínus fjörutíu og tvær milljónir eitt hundrað sjötíu og átta þúsund tvö hundruð fjörutíu og níu" - ) - assert number_to_neutral(241000000000) == "tvö hundruð fjörutíu og einn milljarður" - assert number_to_neutral(100000000) == "eitt hundrað milljónir" - assert number_to_neutral(1000001000) == "einn milljarður og eitt þúsund" - assert number_to_neutral(1000000011) == "einn milljarður og ellefu" - assert number_to_neutral(1001000000) == "einn milljarður og ein milljón" - assert number_to_neutral(1002000000) == "einn milljarður og tvær milljónir" - assert number_to_neutral(200000000000) == "tvö hundruð milljarðar" - assert ( - number_to_text(1000200200) - == "einn milljarður tvö hundruð þúsund og tvö hundruð" - ) - assert ( - number_to_neutral(10000000000000000000000000000000000000000000000000000000) - == "tíu milljónir oktilljóna" - ) - assert ( - number_to_neutral(1000000000000000000000000000000000000001000000000) - == "ein oktilljón og einn milljarður" - ) - assert ( - number_to_neutral(1000000000000000000000000000000000000003000000000) - == "ein oktilljón og þrír milljarðar" - ) - assert number_to_neutral(3000400000) == "þrír milljarðar og fjögur hundruð þúsund" - assert ( - number_to_neutral(2000000000000000000000000000000000100000000000000) - == "tvær oktilljónir og eitt hundrað billjónir" - ) - assert number_to_text(320) == "þrjú hundruð og tuttugu" - assert number_to_text(320000) == "þrjú hundruð og tuttugu þúsund" - assert ( - number_to_text(3202020202020) - == "þrjár billjónir tvö hundruð og tveir milljarðar tuttugu milljónir tvö hundruð og tvö þúsund og tuttugu" - ) - assert ( - number_to_text(320202020) - == "þrjú hundruð og tuttugu milljónir tvö hundruð og tvö þúsund og tuttugu" - ) - - assert number_to_text(101, gender="kk") == "hundrað og einn" - assert number_to_text(-102, gender="kvk") == "mínus hundrað og tvær" - assert ( - number_to_text(-102, gender="kvk", one_hundred=True) - == "mínus eitt hundrað og tvær" - ) - assert number_to_text(5, gender="kk") == "fimm" - assert number_to_text(10001, gender="kvk") == "tíu þúsund og ein" - assert ( - number_to_text(113305, gender="kk") - == "eitt hundrað og þrettán þúsund þrjú hundruð og fimm" - ) - assert number_to_text(400567, gender="hk") == number_to_neutral(400567) - assert ( - number_to_text(-11220024, gender="kvk") - == "mínus ellefu milljónir tvö hundruð og tuttugu þúsund tuttugu og fjórar" - ) - assert ( - number_to_text(19501180) - == "nítján milljónir fimm hundruð og eitt þúsund eitt hundrað og áttatíu" - ) - - assert numbers_to_text("135 og -16") == "hundrað þrjátíu og fimm og mínus sextán" - assert numbers_to_text("-55 manns") == "mínus fimmtíu og fimm manns" - assert numbers_to_text("Baugatangi 1, Reykjavík") == "Baugatangi eitt, Reykjavík" - assert numbers_to_text("Baugatangi 2, Reykjavík") == "Baugatangi tvö, Reykjavík" - assert numbers_to_text("Baugatangi 3, Reykjavík") == "Baugatangi þrjú, Reykjavík" - assert numbers_to_text("Baugatangi 4, Reykjavík") == "Baugatangi fjögur, Reykjavík" - assert numbers_to_text("Baugatangi 5, Reykjavík") == "Baugatangi fimm, Reykjavík" - assert numbers_to_text("Baugatangi 10, Reykjavík") == "Baugatangi tíu, Reykjavík" - assert numbers_to_text("Baugatangi 11, Reykjavík") == "Baugatangi ellefu, Reykjavík" - assert numbers_to_text("Baugatangi 12, Reykjavík") == "Baugatangi tólf, Reykjavík" - assert ( - numbers_to_text("Baugatangi 13, Reykjavík") == "Baugatangi þrettán, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 14, Reykjavík") == "Baugatangi fjórtán, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 15, Reykjavík") == "Baugatangi fimmtán, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 20, Reykjavík") == "Baugatangi tuttugu, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 21, Reykjavík") - == "Baugatangi tuttugu og eitt, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 22, Reykjavík") - == "Baugatangi tuttugu og tvö, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 23, Reykjavík") - == "Baugatangi tuttugu og þrjú, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 24, Reykjavík") - == "Baugatangi tuttugu og fjögur, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 25, Reykjavík") - == "Baugatangi tuttugu og fimm, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 100, Reykjavík") == "Baugatangi hundrað, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 101, Reykjavík") - == "Baugatangi hundrað og eitt, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 102, Reykjavík") - == "Baugatangi hundrað og tvö, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 103, Reykjavík") - == "Baugatangi hundrað og þrjú, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 104, Reykjavík") - == "Baugatangi hundrað og fjögur, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 105, Reykjavík") - == "Baugatangi hundrað og fimm, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 111, Reykjavík") - == "Baugatangi hundrað og ellefu, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 112, Reykjavík") - == "Baugatangi hundrað og tólf, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 113, Reykjavík") - == "Baugatangi hundrað og þrettán, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 114, Reykjavík") - == "Baugatangi hundrað og fjórtán, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 115, Reykjavík") - == "Baugatangi hundrað og fimmtán, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 121, Reykjavík") - == "Baugatangi hundrað tuttugu og eitt, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 174, Reykjavík") - == "Baugatangi hundrað sjötíu og fjögur, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 200, Reykjavík") - == "Baugatangi tvö hundruð, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 201, Reykjavík") - == "Baugatangi tvö hundruð og eitt, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 202, Reykjavík") - == "Baugatangi tvö hundruð og tvö, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 203, Reykjavík") - == "Baugatangi tvö hundruð og þrjú, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 204, Reykjavík") - == "Baugatangi tvö hundruð og fjögur, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 205, Reykjavík") - == "Baugatangi tvö hundruð og fimm, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 211, Reykjavík") - == "Baugatangi tvö hundruð og ellefu, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 212, Reykjavík") - == "Baugatangi tvö hundruð og tólf, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 213, Reykjavík") - == "Baugatangi tvö hundruð og þrettán, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 214, Reykjavík") - == "Baugatangi tvö hundruð og fjórtán, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 215, Reykjavík") - == "Baugatangi tvö hundruð og fimmtán, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 700, Reykjavík") - == "Baugatangi sjö hundruð, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 1-4, Reykjavík") - == "Baugatangi eitt-fjögur, Reykjavík" - ) - assert ( - numbers_to_text("Baugatangi 1-17, Reykjavík") - == "Baugatangi eitt-sautján, Reykjavík" - ) - - -def test_year_transcription() -> None: - """Test number to written year conversion.""" - - from speech.trans.num import year_to_text, years_to_text - - assert year_to_text(1999) == "nítján hundruð níutíu og níu" - assert year_to_text(2004) == "tvö þúsund og fjögur" - assert year_to_text(-501) == "fimm hundruð og eitt fyrir Krist" - assert year_to_text(1001) == "eitt þúsund og eitt" - assert year_to_text(57) == "fimmtíu og sjö" - assert year_to_text(2401) == "tvö þúsund fjögur hundruð og eitt" - - assert ( - years_to_text("Ég fæddist 1994") == "Ég fæddist nítján hundruð níutíu og fjögur" - ) - assert ( - years_to_text("Árið 1461 var borgin Sarajevo stofnuð") - == "Árið fjórtán hundruð sextíu og eitt var borgin Sarajevo stofnuð" - ) - assert ( - years_to_text("17. júlí 1210 lést Sverker II") - == "17. júlí tólf hundruð og tíu lést Sverker II" - ) - assert ( - years_to_text("2021, 2007 og 1999") - == "tvö þúsund tuttugu og eitt, tvö þúsund og sjö og nítján hundruð níutíu og níu" - ) - - -def test_ordinal_transcription() -> None: - """Test number to written ordinal conversion.""" - - from speech.trans.num import number_to_ordinal, numbers_to_ordinal - - assert number_to_ordinal(0) == "núllti" - assert number_to_ordinal(22, case="þgf", gender="kvk") == "tuttugustu og annarri" - assert number_to_ordinal(302, gender="kvk") == "þrjú hundraðasta og önnur" - assert number_to_ordinal(302, case="þgf", gender="hk") == "þrjú hundraðasta og öðru" - assert ( - number_to_ordinal(-302, case="þgf", gender="hk") - == "mínus þrjú hundraðasta og öðru" - ) - assert ( - number_to_ordinal(10202, case="þgf", gender="hk", number="ft") - == "tíu þúsund tvö hundruðustu og öðrum" - ) - assert ( - number_to_ordinal(1000000, case="þf", gender="kvk", number="et") - == "milljónustu" - ) - assert ( - number_to_ordinal(1000000002, case="þf", gender="kvk", number="et") - == "milljörðustu og aðra" - ) - - assert ( - numbers_to_ordinal("Ég lenti í 41. sæti.", case="þgf") - == "Ég lenti í fertugasta og fyrsta sæti." - ) - assert ( - numbers_to_ordinal("Ég lenti í -41. sæti.", case="þgf") - == "Ég lenti í mínus fertugasta og fyrsta sæti." - ) - assert numbers_to_ordinal("-4. sæti.", case="þgf") == "mínus fjórða sæti." - assert ( - numbers_to_ordinal("2. í röðinni var hæstur.") == "annar í röðinni var hæstur." - ) - assert ( - numbers_to_ordinal("1. konan lenti í 2. sæti.", regex=r"1\.", gender="kvk") - == "fyrsta konan lenti í 2. sæti." - ) - assert ( - numbers_to_ordinal("fyrsta konan lenti í 2. sæti.", gender="hk", case="þgf") - == "fyrsta konan lenti í öðru sæti." - ) - assert ( - numbers_to_ordinal("Ég var 10201. í röðinni.") - == "Ég var tíu þúsund tvö hundraðasti og fyrsti í röðinni." - ) - assert ( - numbers_to_ordinal( - "Björn sækist eftir 1. - 4. sæti í Norðvesturkjördæmi", case="þgf" - ).replace("-", "til") - == "Björn sækist eftir fyrsta til fjórða sæti í Norðvesturkjördæmi" - ) - assert ( - numbers_to_ordinal( - "Björn sækist eftir 1.-4. sæti í Norðvesturkjördæmi", case="þgf" - ).replace("-", " til ") - == "Björn sækist eftir fyrsta til fjórða sæti í Norðvesturkjördæmi" - ) - assert ( - numbers_to_ordinal("1.-4. sæti í Norðvesturkjördæmi", case="þgf").replace( - "-", " til " - ) - == "fyrsta til fjórða sæti í Norðvesturkjördæmi" - ) - - -def test_float_transcription() -> None: - """Test float to written text conversion.""" - - from speech.trans.num import float_to_text, floats_to_text - - assert float_to_text(-0.12) == "mínus núll komma tólf" - assert float_to_text(-0.1012) == "mínus núll komma eitt núll eitt tvö" - assert ( - float_to_text(-0.1012, gender="kk") == "mínus núll komma einn núll einn tveir" - ) - assert float_to_text(-21.12, gender="kk") == "mínus tuttugu og einn komma tólf" - assert ( - float_to_text(-21.123, gender="kk") - == "mínus tuttugu og einn komma einn tveir þrír" - ) - assert float_to_text(1.03, gender="kvk") == "ein komma núll þrjár" - assert float_to_text(2.0, gender="kvk", case="þgf") == "tveimur" - assert ( - float_to_text(2.0, gender="kvk", case="þgf", comma_null=True) - == "tveimur komma núll" - ) - assert ( - float_to_text("-10.100,21") - == float_to_text("-10100,21") - == float_to_text("-10100.21") - == "mínus tíu þúsund og eitt hundrað komma tuttugu og eitt" - ) - - assert ( - floats_to_text("2,13 millilítrar af vökva.", gender="kk") - == "tveir komma þrettán millilítrar af vökva." - ) - assert floats_to_text("0,04 prósent.") == "núll komma núll fjögur prósent." - assert floats_to_text("-0,04 prósent.") == "mínus núll komma núll fjögur prósent." - assert ( - floats_to_text("101,0021 prósent.") - == "hundrað og eitt komma núll núll tuttugu og eitt prósent." - ) - assert ( - floats_to_text("10.100,21 prósent.") - == "tíu þúsund og eitt hundrað komma tuttugu og eitt prósent." - ) - assert ( - floats_to_text("Um -10.100,21 prósent.") - == "Um mínus tíu þúsund og eitt hundrað komma tuttugu og eitt prósent." - ) - assert ( - floats_to_text("-10.100,21 prósent.") - == "mínus tíu þúsund og eitt hundrað komma tuttugu og eitt prósent." - ) - assert floats_to_text("2.000.000,00.", comma_null=False) == "tvær milljónir." - - -def test_digit_transcription() -> None: - """Test digit string to written text conversion.""" - - from speech.trans.num import digits_to_text - - assert digits_to_text("5885522") == "fimm átta átta fimm fimm tveir tveir" - assert digits_to_text("112") == "einn einn tveir" - assert digits_to_text("123-0679") == "einn tveir þrír-núll sex sjö níu" - assert ( - digits_to_text("Síminn minn er 12342") - == "Síminn minn er einn tveir þrír fjórir tveir" - ) - assert digits_to_text("581 2345") == "fimm átta einn tveir þrír fjórir fimm" - assert ( - digits_to_text("5812345, það er síminn hjá þeim.") - == "fimm átta einn tveir þrír fjórir fimm, það er síminn hjá þeim." - ) - assert ( - digits_to_text("010270-2039") - == "núll einn núll tveir sjö núll-tveir núll þrír níu" - ) - assert ( - digits_to_text("192 0-1-127", regex=r"\d\d\d") - == "einn níu tveir 0-1-einn tveir sjö" - ) - assert ( - digits_to_text("Hringdu í 1-800-BULL", regex=r"\d+-\d+") - == "Hringdu í einn átta núll núll-BULL" - ) - - -def test_time_transcription() -> None: - assert DT.time(f"00:00") == "tólf á miðnætti" - assert DT.time(f"12:00") == "tólf á hádegi" - midnight = datetime.time(0, 0) - six_am = datetime.time(6, 0) - for h, m in product(range(24), range(60)): - t = datetime.time(hour=h, minute=m) - n = DT.time(t.strftime("%H:%M")) - assert n.replace(" ", "").isalpha() - if midnight < t < six_am: - assert "um nótt" in n - t = datetime.time(6, 6, 6) - assert "klukkan sex núll sex núll sex" == DT.time(t.strftime("klukkan %H:%M:%S")) - assert "klukkan sex núll sex núll sex" == DT.time(t.strftime("kl. %H:%M:%S")) - t = datetime.time(3, 3, 3) - assert "þrjú núll þrjú núll þrjú um nótt" == DT.time(t.strftime("%H:%M:%S")) - - -def test_date_transcription() -> None: - from settings import changedlocale - - with changedlocale(category="LC_TIME"): - for d, m, y, case in product( - range(1, 32), - range(1, 13), - (1, 100, 1800, 1850, 1900, 1939, 2022), - ("nf", "þf", "þgf", "ef"), - ): - try: - date = datetime.date(y, m, d) - except: - continue - n1 = DT.date(date.isoformat(), case=case) - assert n1 == DT.date(f"{y}-{m}-{d}", case=case) - n2 = DT.date(f"{d}/{m}/{y}", case=case) - assert n2 == DT.date(date.strftime("%d/%m/%Y"), case=case) - n3 = DT.date(date.strftime("%d. %B %Y"), case=case) - n4 = DT.date(date.strftime("%d. %b %Y"), case=case) - assert n1 == n2 == n3 == n4 - - -def test_spelling_transcription() -> None: - from speech.trans import _ICE_ENG_ALPHA - - _ALPHABET = _ICE_ENG_ALPHA + _ICE_ENG_ALPHA.lower() - - for a in (_ALPHABET, "ÁÍS", "BSÍ", "LSH", "SÍBS"): - n1 = DT.spell(a.upper()) - n2 = DT.spell(a.lower()) - assert n1 == n2 - assert "." not in re.sub(r"", "", n1) - assert len(n1) > len(a) - assert n1.islower() - - -def test_abbreviation_transcription() -> None: - abbrevs = ( - "t.d.", - "MSc", - "m.a.s.", - "o.s.frv.", - "m.a.", - "PhD", - "Ph.D.", - ) - for a in abbrevs: - n = DT.abbrev(a) - assert "." not in re.sub(r"", "", n) - assert n.islower() - - -def test_email_transcription() -> None: - for e in ( - "jon.jonsson@mideind.is", - "gunnar.brjann@youtube.gov.uk", - "tolvupostur@gmail.com", - ): - n = DT.email(e) - assert "@" not in n and " hjá " in n - assert "." not in re.sub(r"", "", n) and " punktur " in n - - -def test_entity_transcription() -> None: - n = DT.entity("Miðeind ehf.") - assert "ehf." not in n - n = DT.entity("BSÍ") - assert "BSÍ" not in n - n = DT.entity("SÍBS") - assert "SÍBS" not in n - n = DT.entity("L&L slf.") - assert "L" not in n - assert "slf" not in n - n = DT.entity("Kjarninn") - assert n == "Kjarninn" - n = DT.entity("RANNÍS") - assert n.upper() == "RANNÍS" - n = DT.entity("Rannís") - assert n == "Rannís" - n = DT.entity("Verkís") - assert n == "Verkís" - n = DT.entity("RARIK") - assert n == "RARIK" - n = DT.entity("NATO") - assert n == "NATO" - n = DT.entity("NASA") - assert n.upper() == "NASA" - n = DT.entity("Víkurskel ehf.") - assert n.startswith("Víkurskel") and "ehf." not in n - n = DT.entity("VF 45 ehf.") - assert "VF" not in n and "ehf." not in n and "45" not in n - n = DT.entity("Alþjóðalyfjaeftirlitsstofnunin") - assert n == "Alþjóðalyfjaeftirlitsstofnunin" - n = DT.entity("ÖSE") - assert n != "ÖSE" - n = DT.entity("Ungmennaráð UMFÍ") - assert n.startswith("Ungmennaráð") and "UMFÍ" not in n - n = DT.entity("NEC Nijmegen") - assert "NEC" not in n and n.endswith("Nijmegen") - n = DT.entity("Fabienne Buccio") - assert n == "Fabienne Buccio" - n = DT.entity("Salgado") - assert n == "Salgado" - n = DT.entity("Sleep Inn") - assert n == "Sleep Inn" - n = DT.entity("GSMbensín") - assert n == "GSMbensín" - n = DT.entity("Kvennalistinn.is") - assert n == "Kvennalistinn.is" - n = DT.entity("USS Comfort") - assert "USS" not in n and n.endswith("Comfort") - n = DT.entity("Bayern München - FC Rostov") - assert "FC" not in n - - -def test_generic_transcription() -> None: - n = DT.generic("þjálfari ÍR") - assert "ÍR" not in n and "þjálfari " in n - n = DT.generic("fulltrúi í samninganefnd félagsins") - assert n == "fulltrúi í samninganefnd félagsins" - n = DT.generic("formaður nefndarinnar") - assert n == "formaður nefndarinnar" - n = DT.generic("fyrrverandi Bandaríkjaforseti") - assert n == "fyrrverandi Bandaríkjaforseti" - n = DT.generic("þjálfari Fram í Olís deild karla") - assert n == "þjálfari Fram í Olís deild karla" - n = DT.generic("NASF") - assert n and "NASF" not in n - n = DT.generic("íþróttakennari") - assert n == "íþróttakennari" - n = DT.generic("formaður Bandalags háskólamanna") - assert n == "formaður Bandalags háskólamanna" - n = DT.generic("formaður Leigjendasamtakanna") - assert n == "formaður Leigjendasamtakanna" - n = DT.generic("framkvæmdastjóri Samtaka atvinnulífsins (SA)") - assert "framkvæmdastjóri Samtaka atvinnulífsins" in n and "SA" not in n - n = DT.generic("innanríkisráðherra í stjórn Sigmundar Davíðs Gunnlaugssonar") - assert n == "innanríkisráðherra í stjórn Sigmundar Davíðs Gunnlaugssonar" - n = DT.generic("fyrsti ráðherra Íslands") - assert n == "fyrsti ráðherra Íslands" - n = DT.generic("málpípur þær") - assert n == "málpípur þær" - n = DT.generic("sundsérfræðingur RÚV") - assert n == "sundsérfræðingur RÚV" - n = DT.generic("framkvæmdastjóri Strætó ehf.") - assert "framkvæmdastjóri Strætó" in n and "ehf." not in n - n = DT.generic("þáverandi sjávarútvegsráðherra") - assert n == "þáverandi sjávarútvegsráðherra" - n = DT.generic("knattspyrnudómari") - assert n == "knattspyrnudómari" - n = DT.generic("framkvæmdastjóri Félags atvinnurekenda") - assert n == "framkvæmdastjóri Félags atvinnurekenda" - n = DT.generic("þjálfari Stjörnunnar") - assert n == "þjálfari Stjörnunnar" - n = DT.generic("lektor við HÍ") - assert "lektor við" in n and "HÍ" not in n - n = DT.generic("formaður VR og LÍV") - assert "formaður" in n and "VR" not in n and "LÍV" not in n - # Test complete_text arg - n = DT.generic("trillukarl í Skerjafirði") - assert n == "trillukarl í Skerjafirði" - n = DT.generic("trillukarl í Skerjafirði", full_text=True) - assert n == "

trillukarl í Skerjafirði

" - - # Replace whitespace with single space in text - # stretching over multiple lines - ws_re = re.compile(r"\n\s+") - ws_to_space: Callable[[str], str] = lambda t: ws_re.sub(" ", t.strip()) - t = ws_to_space( - """ - Breski seðlabankinn hækkaði stýrivexti sína í dag - um hálft prósentustig og eru vextir nú yfir 3,2%. - Það eru hæstu stýrivextir í Bretlandi í 14 ár. - Seðlabankinn vonar að vaxtahækkunin stemmi stigu - við mikilli verðbólgu í landinu. - """ - ) - n = DT.generic(t, full_text=True) - assert "fjórtán" in n and "yfir þrjú komma tvö prósent" in n - t = ws_to_space( - """ - Breski seðlabankinn hækkaði stýrivexti sína í dag - um hálft prósentustig og eru vextir nú yfir 3,2 prósentum. - Það eru hæstu stýrivextir í Bretlandi í 14 ár. - Seðlabankinn vonar að vaxtahækkunin stemmi stigu - við mikilli verðbólgu í landinu. - """ - ) - n = DT.generic(t, full_text=True) - assert "fjórtán" in n and "yfir þremur komma tveimur prósentum" in n - t = ws_to_space( - """ - t.d. var 249% munur á ódýrstu og dýrustu rauðrófunum, - 118% munur milli bökunarkartafla, 291% munur á grænum eplum, - 97% munur á vínberjum og 2-3% af jarðarberjum. - """ - ) - n = DT.generic(t, full_text=True) - assert ( - "%" not in n - and "til dæmis" in n - and "tvö hundruð níutíu og eitt prósent" in n - and "tvö til þrjú prósent" - ) - n = DT.generic( - "sagðist hún vona að á næstu 10-20 árum " - "yrði farið að nýta tæknina 9,2-5,3 prósent meira." - ) - assert ( - "tíu til tuttugu árum" in n and "níu komma tvö til fimm komma þrjú prósent" in n - ) - t = ws_to_space( - """ - Frakkland - Marókkó á HM. - Leikurinn var bráðfjörugur en það voru Frakkar - sem voru sterkari og unnu þeir leikinn 2-0. - """ - ) - n = DT.generic(t, full_text=True) - assert "Frakkland til Marókkó" not in n and "HM" not in n and "tvö núll" in n - t = ws_to_space( - """ - 2 eru slasaðir og um 1.500 fiskar dauðir eftir að um - 16 metra hátt fiskabúr í miðju Radisson hóteli - í Berlín sprakk snemma í morgun. - """ - ) - n = DT.generic(t, full_text=True) - assert n.startswith("

") and n.endswith("

") - assert "tveir" in n and "eitt þúsund og fimm hundruð" in n and "sextán metra" in n - - t = ws_to_space("Fréttin var síðast uppfærð 3/12/2022 kl. 10:42.") - n = DT.generic(t, full_text=True) - assert n.startswith("

") and n.endswith("

") - assert ( - "þriðja desember tvö þúsund tuttugu og tvö" in n - and "klukkan tíu fjörutíu og tvö" in n - ) - t = ws_to_space("Fréttin var síðast uppfærð 16. desember 2022 kl. 10:42.") - n = DT.generic(t, full_text=True) - assert n.startswith("

") and n.endswith("

") - assert ( - "sextánda desember tvö þúsund tuttugu og tvö" in n - and "klukkan tíu fjörutíu og tvö" in n - ) - t = ws_to_space("Fréttin var síðast uppfærð 2. janúar 2022.") - n = DT.generic(t, full_text=True) - assert n.startswith("

") and n.endswith("

") - assert "annan janúar tvö þúsund tuttugu og tvö" in n - t = ws_to_space("Fréttin var síðast uppfærð 01/01/2022.") - n = DT.generic(t, full_text=True) - assert n.startswith("

") and n.endswith("

") - assert "fyrsta janúar tvö þúsund tuttugu og tvö" in n - t = ws_to_space("Fréttin var síðast uppfærð 14. nóvember og 16. desember 1999.") - n = DT.generic(t, full_text=True) - assert n.startswith("

") and n.endswith("

") - assert "fjórtánda nóvember og sextánda desember nítján hundruð níutíu og níu" in n - t = ws_to_space("Fréttin var síðast uppfærð 2. febrúar klukkan 13:30.") - n = DT.generic(t, full_text=True) - assert n.startswith("

") and n.endswith("

") - assert "annan febrúar klukkan þrettán þrjátíu" in n - - t = ws_to_space( - """ - „ICELAND-málið er mikilvægt og fordæmisgefandi bæði á sviði - hugverkaréttar og þjóðaréttar enda getur niðurstaða þess leitt - til breytinga á evrópskum hugverkarétti. - Athygli hefur vakið að áfrýjunarnefnd EUIPO er að þessu sinni - fjölskipuð, þ.e. skipuð níu aðilum í stað þriggja eins og í - hefðbundnum áfrýjunarnefndum. Það er talið til marks um hve - mikilvægt málið er talið vera, en af um það bil 2.500 árlegum - áfrýjunum er einungis 3-5 vísað til fjölskipaðrar áfrýjunarnefndar. - Þegar við bætist að málið er það fyrsta sem flutt er munnlega fyrir - nefndinni verður þýðing þess enn betur ljós,“ segir á vef Stjórnarráðsins. - """ - ) - n = DT.generic(t, full_text=True) - assert n.startswith("

") and n.endswith("

") - assert "hugverkarétti" in n - assert "ICELAND" in n and "EUIPO" not in n - assert "það er" in n and "þ.e." not in n - assert "2.500" not in n and "tvö þúsund og fimm hundruð árlegum áfrýjunum" - # assert "þremur til fimm" in n # TODO - - -def test_person_transcription() -> None: - # Roman numerals - n = DT.person("Elísabet II") - assert n == "Elísabet önnur" - n = DT.person("Elísabet II Bretlandsdrottning") - assert n == "Elísabet önnur Bretlandsdrottning" - n = DT.person("Leópold II Belgakonungur") - assert n == "Leópold annar Belgakonungur" - n = DT.person("Óskar II Svíakonungur") - assert n == "Óskar annar Svíakonungur" - n = DT.person("Loðvík XVI") - assert n == "Loðvík sextándi" - - # Normal - n = DT.person("Einar Björn") - assert n == "Einar Björn" - n = DT.person("Martin Rivers") - assert n == "Martin Rivers" - n = DT.person("Tor Magne Drønen") - assert n == "Tor Magne Drønen" - n = DT.person("Richard Guthrie") - assert n == "Richard Guthrie" - n = DT.person("Jón Ingvi Bragason") - assert n == "Jón Ingvi Bragason" - n = DT.person("Regína Valdimarsdóttir") - assert n == "Regína Valdimarsdóttir" - n = DT.person("Sigurður Ingvi Snorrason") - assert n == "Sigurður Ingvi Snorrason" - n = DT.person("Aðalsteinn Sigurgeirsson") - assert n == "Aðalsteinn Sigurgeirsson" - - # Abbreviations which should be spelled out - # Note that the spelling can be different based on the voice engine - n = DT.person("James H. Grendell") - assert "H." not in n and n.startswith("James") and n.endswith("Grendell") - n = DT.person("Guðni Th. Jóhannesson") - assert "Th" not in n and n.startswith("Guðni") and n.endswith("Jóhannesson") - n = DT.person("guðni th. jóhannesson") - assert "th" not in n and n.startswith("guðni") and n.endswith("jóhannesson") - n = DT.person("Mary J. Blige") - assert "J." not in n and n.startswith("Mary") and n.endswith("Blige") - n = DT.person("Alfred P. Sloan Jr.") - assert "P." not in n and "Jr." not in n and "Alfred" in n and "Sloan" in n - - # Lowercase middle names - assert DT.person("Louis van Gaal") == "Louis van Gaal" - assert DT.person("Frans van Houten") == "Frans van Houten" - assert DT.person("Alex van der Zwaan") == "Alex van der Zwaan" - assert DT.person("Rafael van der Vaart") == "Rafael van der Vaart" - - -def test_voice_breaks() -> None: - assert DT.vbreak() == "" - for t in ("0ms", "50ms", "1s", "1.7s"): - n = DT.vbreak(time=t) - assert n == f'' - for s in DT._VBREAK_STRENGTHS: - n = DT.vbreak(strength=s) - assert n == f'' diff --git a/speech/voices/__init__.py b/tts.py old mode 100755 new mode 100644 similarity index 53% rename from speech/voices/__init__.py rename to tts.py index 83522032..1c5872bc --- a/speech/voices/__init__.py +++ b/tts.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """ Greynir: Natural language processing for Icelandic @@ -18,57 +17,12 @@ along with this program. If not, see http://www.gnu.org/licenses/. -""" - -from base64 import b64encode -from utility import STATIC_DIR - - -# Directory for temporary audio files -AUDIO_SCRATCH_DIR = STATIC_DIR / "audio" / "tmp" - - -# Mime types and suffixes -BINARY_MIMETYPE = "application/octet-stream" -AUDIOFMT_TO_MIMETYPE = { - "mp3": "audio/mpeg", - "wav": "audio/wav", - "ogg_vorbis": "audio/ogg", - "pcm": BINARY_MIMETYPE, - # Uses an Ogg container. See https://www.rfc-editor.org/rfc/rfc7845 - "opus": "audio/ogg", -} - -FALLBACK_SUFFIX = "data" -AUDIOFMT_TO_SUFFIX = { - "mp3": "mp3", - "wav": "wav", - "ogg_vorbis": "ogg", - "pcm": "pcm", - # Recommended filename extension for Ogg Opus files is '.opus'. - "opus": "opus", -} - - -def mimetype_for_audiofmt(fmt: str) -> str: - """Returns mime type for the given audio format.""" - return AUDIOFMT_TO_MIMETYPE.get(fmt, BINARY_MIMETYPE) - - -def suffix_for_audiofmt(fmt: str) -> str: - """Returns file suffix for the given audio format.""" - return AUDIOFMT_TO_SUFFIX.get(fmt, FALLBACK_SUFFIX) - - -def generate_data_uri(data: bytes, mime_type: str = BINARY_MIMETYPE) -> str: - """Generate Data URI (RFC2397) from bytes.""" - b64str = b64encode(data).decode("ascii") - return f"data:{mime_type};base64,{b64str}" + Utility functions used in various places in the codebase. +""" DEFAULT_LOCALE = "is_IS" - # Map locales to a default voice ID LOCALE_TO_VOICE_ID = { "is_IS": "Gudrun", diff --git a/utility.py b/utility.py index f7b2ce7f..9ac7e4f2 100755 --- a/utility.py +++ b/utility.py @@ -37,6 +37,7 @@ RESOURCES_DIR = GREYNIR_ROOT_DIR / "resources" STATIC_DIR = GREYNIR_ROOT_DIR / "static" +TTS_AUDIO_DIR = STATIC_DIR / "audio" / "tmp" QUERIES_DIR = GREYNIR_ROOT_DIR / "queries" QUERIES_GRAMMAR_DIR = QUERIES_DIR / "grammars"