LASR-at-Home · maayan25 · Oct 2, 2024 · Oct 7, 2024 · Oct 20, 2024 · Nov 4, 2024
diff --git a/common/__init__.py b/common/__init__.py
diff --git a/common/speech/__init__.py b/common/speech/__init__.py
diff --git a/common/speech/lasr_speech_recognition_interfaces/CMakeLists.txt b/common/speech/lasr_speech_recognition_interfaces/CMakeLists.txt
@@ -0,0 +1,41 @@
+# Build configuration for lasr_speech_recognition_interfaces: an interface-only
+# package that generates the action, message and service definitions listed in
+# rosidl_generate_interfaces() below.
+cmake_minimum_required(VERSION 3.8)
+project(lasr_speech_recognition_interfaces)
+
+if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  add_compile_options(-Wall -Wextra -Wpedantic)
+endif()
+
+# find dependencies
+find_package(ament_cmake REQUIRED)
+# NOTE(review): rclpy is not declared in package.xml and nothing else in this
+# file refers to it - confirm whether this find_package is needed.
+find_package(rclpy REQUIRED)
+find_package(action_msgs REQUIRED)
+
+# uncomment the following section in order to fill in
+# further dependencies manually.
+# find_package(<dependency> REQUIRED)
+
+# For actions, messages, and services
+find_package(rosidl_default_generators REQUIRED)
+
+# NOTE(review): the three interfaces below only use float32, string and bool
+# fields, and builtin_interfaces is not declared in package.xml - confirm
+# whether the DEPENDENCIES entry is required.
+rosidl_generate_interfaces(${PROJECT_NAME}
+  "action/TranscribeSpeech.action"
+  "msg/Transcription.msg"
+  "srv/TranscribeAudio.srv"
+  DEPENDENCIES builtin_interfaces # Add packages that above messages depend on
+)
+
+ament_export_dependencies(rosidl_default_runtime)
+
+if(BUILD_TESTING)
+  find_package(ament_lint_auto REQUIRED)
+  # the following line skips the linter which checks for copyrights
+  # comment the line when a copyright and license is added to all source files
+  set(ament_cmake_copyright_FOUND TRUE)
+  # the following line skips cpplint (only works in a git repo)
+  # comment the line when this package is in a git repo and when
+  # a copyright and license is added to all source files
+  set(ament_cmake_cpplint_FOUND TRUE)
+  ament_lint_auto_find_test_dependencies()
+endif()
+
+ament_package()
diff --git a/common/speech/lasr_speech_recognition_interfaces/LICENSE b/common/speech/lasr_speech_recognition_interfaces/LICENSE
@@ -0,0 +1,17 @@
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/common/speech/lasr_speech_recognition_interfaces/README.md b/common/speech/lasr_speech_recognition_interfaces/README.md
@@ -0,0 +1,51 @@
+# lasr_speech_recognition_interfaces
+
+Common messages used for speech recognition
+
+This package is maintained by:
+
+- [Maayan Armony](mailto:[email protected])
+- [Paul Makles](mailto:[email protected]) (ROS1)
+
+## Prerequisites
+
+This package depends on the following ROS packages:
+
+- colcon (buildtool)
+- rosidl_default_generators (build)
+- rosidl_default_runtime (exec)
+
+## Usage
+
+Ask the package maintainer to write a `doc/USAGE.md` for their package!
+
+## Example
+
+Ask the package maintainer to write a `doc/EXAMPLE.md` for their package!
+
+## Technical Overview
+
+Ask the package maintainer to write a `doc/TECHNICAL.md` for their package!
+
+## ROS Definitions
+
+### Launch Files
+
+This package has no launch files.
+
+### Messages
+
+#### `Transcription`
+
+|  Field   |  Type  | Description |
+|:--------:|:------:|-------------|
+|  phrase  | string |             |
+| finished |  bool  |             |
+
+### Services
+
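+#### `TranscribeAudio`
+
+Request: no fields.
+
+Response:
+
+|  Field   |  Type  | Description      |
+|:--------:|:------:|------------------|
+|  phrase  | string | Transcribed text |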
+
+### Actions
+
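+#### `TranscribeSpeech`
+
+Goal:
+
+|      Field       |  Type   | Description         |
+|:----------------:|:-------:|---------------------|
+| energy_threshold | float32 | Energy threshold    |
+| max_phrase_limit | float32 | Max phrase duration |
+
+Result:
+
+|  Field   |  Type  | Description |
+|:--------:|:------:|-------------|
+| sequence | string |             |
+
+Feedback:
+
+|  Field   |  Type  | Description |
+|:--------:|:------:|-------------|
+| sequence | string |             |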
diff --git a/common/speech/lasr_speech_recognition_interfaces/action/TranscribeSpeech.action b/common/speech/lasr_speech_recognition_interfaces/action/TranscribeSpeech.action
@@ -0,0 +1,11 @@
+# Goal definition
+# Energy threshold
+# NOTE(review): presumably the audio energy level above which input counts as
+# speech - confirm against the action server.
+float32 energy_threshold
+
+# Max phrase duration
+# NOTE(review): units are not stated here; presumably seconds - confirm.
+float32 max_phrase_limit
+---
+# Result definition
+string sequence
+---
+# Feedback definition
+string sequence
diff --git a/common/speech/lasr_speech_recognition_interfaces/msg/Transcription.msg b/common/speech/lasr_speech_recognition_interfaces/msg/Transcription.msg
@@ -0,0 +1,2 @@
+# NOTE(review): presumably the transcribed text - no publisher is visible here, confirm.
+string phrase
+# NOTE(review): presumably marks the phrase as complete - confirm against the publisher.
+bool finished
diff --git a/common/speech/lasr_speech_recognition_interfaces/package.xml b/common/speech/lasr_speech_recognition_interfaces/package.xml
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
+<package format="3">
+    <name>lasr_speech_recognition_interfaces</name>
+    <version>0.0.0</version>
+    <description>Common messages used for speech recognition</description>
+    <maintainer email="[email protected]">maayan</maintainer>
+    <license>MIT</license>
+
+    <!-- NOTE(review): CMakeLists.txt also calls find_package(rclpy) and lists
+         builtin_interfaces under DEPENDENCIES, but neither is declared below.
+         Confirm whether they should be declared here or dropped from the build. -->
+    <buildtool_depend>ament_cmake</buildtool_depend>
+    <!-- Required for actions, messages, and services -->
+    <buildtool_depend>rosidl_default_generators</buildtool_depend>
+    <depend>action_msgs</depend>
+    <exec_depend>rosidl_default_runtime</exec_depend>
+    <member_of_group>rosidl_interface_packages</member_of_group>
+
+    <test_depend>ament_lint_auto</test_depend>
+    <test_depend>ament_lint_common</test_depend>
+
+    <export>
+        <build_type>ament_cmake</build_type>
+    </export>
+</package>
diff --git a/common/speech/lasr_speech_recognition_interfaces/srv/TranscribeAudio.srv b/common/speech/lasr_speech_recognition_interfaces/srv/TranscribeAudio.srv
@@ -0,0 +1,2 @@
+# Request: no fields.
+---
+# Response: text transcribed from the captured audio.
+string phrase
diff --git a/common/speech/lasr_speech_recognition_whisper/LICENSE b/common/speech/lasr_speech_recognition_whisper/LICENSE
@@ -0,0 +1,17 @@
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/common/speech/lasr_speech_recognition_whisper/README.md b/common/speech/lasr_speech_recognition_whisper/README.md
@@ -0,0 +1,109 @@
+# lasr_speech_recognition_whisper
+
+Speech recognition implemented using OpenAI Whisper
+
+This package is maintained by:
+
+- [Maayan Armony](mailto:[email protected])
+- [Paul Makles](mailto:[email protected]) (ROS1)
+
+## Prerequisites
+
+This package depends on the following ROS packages:
+
+- colcon (buildtool)
+- lasr_speech_recognition_interfaces
+
+This package requires Python 3.10 to be present.
+
+This package has 48 Python dependencies:
+
+- [SpeechRecognition](https://pypi.org/project/SpeechRecognition)==3.10.0
+- [openai-whisper](https://pypi.org/project/openai-whisper)==20230314
+- [PyAudio](https://pypi.org/project/PyAudio)==0.2.13
+- [PyYaml](https://pypi.org/project/PyYaml)==6.0.1
+- .. and sub dependencies (see [requirements file](requirements.txt))
+
+This package requires that [ffmpeg](https://ffmpeg.org/) is available during runtime.
+
+## Usage
+
+> **Warning**: this package is not complete, this is subject to change.
+
+List available microphones:
+
+```bash
+ros2 run lasr_speech_recognition_whisper list_microphones.py
+```
+
+Start the example script:
+
+```bash
+ros2 run lasr_speech_recognition_whisper transcribe_microphone by-index <microphone_index>
+ros2 run lasr_speech_recognition_whisper transcribe_microphone by-name <substring_of_name>
+```
+
+Then start listening to people:
+
+```bash
+ros2 service call /whisper/start_listening "{}"
+```
+
+You can now listen on `/transcription` for a live transcription.
+
+Stop listening whenever:
+
+```bash
+ros2 service call /whisper/stop_listening "{}"
+```
+
+## Example
+
+Ask the package maintainer to write a `doc/EXAMPLE.md` for their package!
+
+## Technical Overview
+
+This package does speech recognition in three parts:
+
+- Adjusting for background noise
+
+  We wait for a set period of time monitoring the audio stream to determine what we should ignore when collecting voice
+  data.
+
+- Collecting appropriate voice data for phrases
+
+  We use the `SpeechRecognition` package to monitor the input audio stream and determine when a person is actually
+  speaking with enough energy that we would consider them to be speaking to the robot.
+
+- Running inference on phrases
+
+  We continuously combine segments of the spoken phrase to form a sample until a certain timeout or threshold after
+  which the phrase ends. This sample is sent to a local OpenAI Whisper model to transcribe.
+
+The package can input from the following sources:
+
+- On-board or external microphone on device
+- Audio data from ROS topic (WORK IN PROGRESS)
+
+The package can output transcriptions to:
+
+- Standard output
+- A ROS topic
+
+## ROS Definitions
+
+### Launch Files
+
+This package has no launch files.
+
+### Messages
+
+This package has no messages.
+
+### Services
+
+This package has no services.
+
+### Actions
+
+This package has no actions.
diff --git a/common/speech/lasr_speech_recognition_whisper/__init__.py b/common/speech/lasr_speech_recognition_whisper/__init__.py
diff --git a/common/speech/lasr_speech_recognition_whisper/lasr_speech_recognition_whisper/__init__.py b/common/speech/lasr_speech_recognition_whisper/lasr_speech_recognition_whisper/__init__.py
diff --git a/...peech_recognition_whisper/lasr_speech_recognition_whisper/simple_transcribe_microphone.py b/...peech_recognition_whisper/lasr_speech_recognition_whisper/simple_transcribe_microphone.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+import os
+import torch
+import rclpy
+from ament_index_python import packages
+
+import sys
+from pathlib import Path
+import speech_recognition as sr
+import numpy as np
+
+import sounddevice  # needed to remove ALSA error messages
+from lasr_speech_recognition_interfaces.srv import TranscribeAudio
+from src import ModelCache  # type: ignore
+
+MODEL = "medium.en"  # Whisper model
+TIMEOUT = 5.0  # Timeout for listening for the start of a phrase
+PHRASE_TIME_LIMIT = None  # Timeout for listening for the end of a phrase
+
+WHISPER_CACHE = os.path.join(str(Path.home()), ".cache", "whisper")
+os.makedirs(WHISPER_CACHE, exist_ok=True)
+os.environ["TIKTOKEN_CACHE_DIR"] = WHISPER_CACHE
+
+if len(sys.argv) < 3:
+    print("Usage:")
+    print(
+        "ros2 run lasr_speech_recognition transcribe_microphone by-index <device_index>"
+    )
+    print("ros2 run lasr_speech_recognition transcribe_microphone by-name <substring>")
+    exit(1)
+else:
+    matcher = sys.argv[1]
+    device_index = None
+    if matcher == "by-index":
+        device_index = int(sys.argv[2])
+    elif matcher == "by-name":
+        import speech_recognition as sr
+
+        microphones = enumerate(sr.Microphone.list_microphone_names())
+
+        target_name = sys.argv[2]
+        for index, name in microphones:
+            if target_name in name:
+                device_index = index
+                break
+
+        if device_index is None:
+            print("Could not find device!")
+            exit(1)
+    else:
+        print("Invalid matcher")
+        exit(1)
+
+rclpy.init(args=sys.argv)
+node = rclpy.create_node("transcribe_mic")
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model_cache = ModelCache()
+model = model_cache.load_model("medium.en", device=device)
+
+# try to run inference on the example file
+package_install = packages.get_package_prefix("lasr_speech_recognition_whisper")
+package_root = os.path.abspath(
+    os.path.join(
+        package_install, os.pardir, os.pardir, "lasr_speech_recognition_whisper"
+    )
+)
+example_fp = os.path.join(package_root, "test.m4a")
+node.get_logger().info(
+    "Running transcription on example file to ensure model is loaded..."
+)
+transcription = model.transcribe(example_fp, fp16=torch.cuda.is_available())
+node.get_logger().info(str(transcription))
+
+microphone = sr.Microphone(device_index=device_index, sample_rate=16000)
+r = sr.Recognizer()
+with microphone as source:
+    r.adjust_for_ambient_noise(source)
+
+
+def handle_transcribe_audio(_):
+    with microphone as source:
+
+        wav_data = r.listen(
+            source, timeout=TIMEOUT, phrase_time_limit=PHRASE_TIME_LIMIT
+        ).get_wav_data()
+        float_data = (
+            np.frombuffer(wav_data, dtype=np.int16).astype(np.float32, order="C")
+            / 32768.0
+        )
+
+        phrase = model.transcribe(float_data, fp16=device == "cuda")["text"]
+        return TranscribeAudio.Response(phrase=phrase)
+
+
+node.create_service(
+    TranscribeAudio, "/whisper/transcribe_audio", handle_transcribe_audio
+)
+
+node.get_logger().info("Whisper service ready")
+rclpy.spin(node)