diff --git a/angel-docker-build.sh b/angel-docker-build.sh
index 4180fb1f4..43018d243 100755
--- a/angel-docker-build.sh
+++ b/angel-docker-build.sh
@@ -19,7 +19,7 @@
 Build the PTG ANGEL system docker container images.
 
 Options:
   -h | --help   Display this message.
-  --force       Force image building regardless of workspace hygiene.f
+  -f | --force  Force image building regardless of workspace hygiene.
 "
 }
@@ -32,7 +32,7 @@ do
       usage
       exit 0
       ;;
-    --force)
+    -f|--force)
       log "Forcing build regardless of workspace hygiene."
       shift
       FORCE_BUILD=1
@@ -113,4 +113,4 @@ get_docker_compose_cmd DC_CMD
   --env-file "$SCRIPT_DIR"/docker/.env \
   -f "$SCRIPT_DIR"/docker/docker-compose.yml \
   --profile build-only \
-  build "$@"
+  build "${dc_forward_params[@]}" "$@"
diff --git a/docker/.env b/docker/.env
index 7fb316548..7c260dd65 100644
--- a/docker/.env
+++ b/docker/.env
@@ -32,3 +32,10 @@ RMW_IMPLEMENTATION=rmw_cyclonedds_cpp
 
 # This must specify the network interface for CycloneDDS to use.
 CYCLONE_DDS_INTERFACE=lo
+
+# Starting with the docker compose plugin (v2), the whole compose file will be
+# validated, even for services not being run. This provides a valid "default"
+# path to cause validation to succeed. This variable should be overridden when
+# attempting to actually run a service that makes use of this variable.
+# Path considered relative to where the docker-compose file is located.
+XAUTH_FILEPATH=../.container_xauth/.placeholder
diff --git a/ros/angel_msgs/CMakeLists.txt b/ros/angel_msgs/CMakeLists.txt
index e5abe06e3..bd9890ce4 100644
--- a/ros/angel_msgs/CMakeLists.txt
+++ b/ros/angel_msgs/CMakeLists.txt
@@ -28,6 +28,7 @@ set( message_files
   msg/AruiObject3d.msg
   msg/AruiUpdate.msg
   msg/AruiUserNotification.msg
+  msg/DialogueUtterance.msg
   msg/EyeGazeData.msg
   msg/HandJointPose.msg
   msg/HandJointPosesUpdate.msg
diff --git a/ros/angel_msgs/msg/DialogueUtterance.msg b/ros/angel_msgs/msg/DialogueUtterance.msg
new file mode 100644
index 000000000..49d3122ee
--- /dev/null
+++ b/ros/angel_msgs/msg/DialogueUtterance.msg
@@ -0,0 +1,24 @@
+#
+# Dialogue Utterance with additional information about the environmental state
+# and user model.
+#
+
+# The header primarily encapsulates when this message was emitted.
+# The time component of this may be utilized as an identifier for this user
+# intent and utterance.
+std_msgs/Header header
+
+# Speech-to-text of the user utterance we have interpreted
+string utterance_text
+
+# Below are optional fields
+
+# Canonical user intent that has been interpreted. "Canonical" in this context
+# is to mean that this string may be used as an identifier of this type of
+# user intent. Should be in the range [0,1] where 1.0 means absolute confidence.
+string intent
+float64 intent_confidence_score
+
+# Emotion classification. Should be in the range [0,1] where 1.0 means absolute confidence.
+string emotion
+float64 emotion_confidence_score
diff --git a/ros/angel_system_nodes/angel_system_nodes/audio/asr.py b/ros/angel_system_nodes/angel_system_nodes/audio/asr.py
index 3cf7c4b0c..a24208dea 100644
--- a/ros/angel_system_nodes/angel_system_nodes/audio/asr.py
+++ b/ros/angel_system_nodes/angel_system_nodes/audio/asr.py
@@ -11,7 +11,7 @@ from rclpy.node import Node
 import simpleaudio as sa
 
-from angel_msgs.msg import HeadsetAudioData, Utterance
+from angel_msgs.msg import HeadsetAudioData, DialogueUtterance
 from angel_utils import make_default_main
 
 
@@ -106,7 +106,9 @@ def __init__(self):
         self.subscription = self.create_subscription(
             HeadsetAudioData, self._audio_topic, self.listener_callback, 1
         )
-        self._publisher = self.create_publisher(Utterance, self._utterances_topic, 1)
+        self._publisher = self.create_publisher(
+            DialogueUtterance, self._utterances_topic, 1
+        )
 
         self.audio_stream = []
         self.t = threading.Thread()
@@ -203,17 +205,22 @@ def asr_server_request_thread(self, audio_data, num_channels, sample_rate):
         if response:
             response_text = json.loads(response.text)["text"]
             self.log.info("Complete ASR text is:\n" + f'"{response_text}"')
-            if self._is_sentence_tokenize_mode:
-                for sentence in sent_tokenize(response_text):
-                    utterance_msg = Utterance()
-                    utterance_msg.value = sentence
-                    self.log.info("Publishing message: " + f'"{sentence}"')
-                    self._publisher.publish(utterance_msg)
-            else:
-                utterance_msg = Utterance()
-                utterance_msg.value = response_text
-                self.log.info("Publishing message: " + f'"{response_text}"')
-                self._publisher.publish(utterance_msg)
+            self._publish_response(response_text, self._is_sentence_tokenize_mode)
+
+    def _publish_response(self, response_text: str, tokenize_sentences: bool):
+        if tokenize_sentences:
+            for sentence in sent_tokenize(response_text):
+                self._publisher.publish(self._construct_dialogue_utterance(sentence))
+        else:
+            self._publisher.publish(self._construct_dialogue_utterance(response_text))
+
+    def _construct_dialogue_utterance(self, msg_text: str) -> DialogueUtterance:
+        msg = DialogueUtterance()
+        msg.header.frame_id = "ASR"
+        msg.header.stamp = self.get_clock().now().to_msg()
+        msg.utterance_text = msg_text
+        self.log.info("Publishing message: " + f'"{msg_text}"')
+        return msg
 
 
 main = make_default_main(ASR)
diff --git a/ros/angel_system_nodes/angel_system_nodes/audio/dialogue_utterance_processing.py b/ros/angel_system_nodes/angel_system_nodes/audio/dialogue_utterance_processing.py
new file mode 100644
index 000000000..2674d7000
--- /dev/null
+++ b/ros/angel_system_nodes/angel_system_nodes/audio/dialogue_utterance_processing.py
@@ -0,0 +1,24 @@
+from angel_msgs.msg import DialogueUtterance
+
+
+def copy_dialogue_utterance(
+    msg: DialogueUtterance, node_name, copy_time
+) -> DialogueUtterance:
+    out_msg = DialogueUtterance()
+    out_msg.header.frame_id = node_name
+    out_msg.utterance_text = msg.utterance_text
+
+    # Assign new time for publication.
+    out_msg.header.stamp = copy_time
+
+    # Copy over intent classification information if present.
+    if msg.intent:
+        out_msg.intent = msg.intent
+        out_msg.intent_confidence_score = msg.intent_confidence_score
+
+    # Copy over emotion classification information if present.
+    if msg.emotion:
+        out_msg.emotion = msg.emotion
+        out_msg.emotion_confidence_score = msg.emotion_confidence_score
+
+    return out_msg
diff --git a/ros/angel_system_nodes/angel_system_nodes/audio/emotion/__init__.py b/ros/angel_system_nodes/angel_system_nodes/audio/emotion/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/ros/angel_system_nodes/angel_system_nodes/audio/emotion/base_emotion_detector.py b/ros/angel_system_nodes/angel_system_nodes/audio/emotion/base_emotion_detector.py
index df99a7490..e05621f93 100644
--- a/ros/angel_system_nodes/angel_system_nodes/audio/emotion/base_emotion_detector.py
+++ b/ros/angel_system_nodes/angel_system_nodes/audio/emotion/base_emotion_detector.py
@@ -4,14 +4,14 @@
 import threading
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 
-from angel_msgs.msg import InterpretedAudioUserEmotion, InterpretedAudioUserIntent
+from angel_msgs.msg import DialogueUtterance
 from angel_utils import declare_and_get_parameters
 from angel_utils import make_default_main
+from angel_system_nodes.audio import dialogue_utterance_processing
 
-IN_EXPECT_USER_INTENT_TOPIC = "expect_user_intent_topic"
-IN_INTERP_USER_INTENT_TOPIC = "interp_user_intent_topic"
-OUT_INTERP_USER_EMOTION_TOPIC = "user_emotion_topic"
+IN_TOPIC = "input_topic"
+OUT_TOPIC = "user_emotion_topic"
 
 # Currently supported emotions. This is tied with the emotions
 # output to VaderSentiment (https://github.com/cjhutto/vaderSentiment) and
@@ -26,8 +26,8 @@
 
 class BaseEmotionDetector(Node):
     """
-    As of Q22023, emotion detection is derived via VaderSentiment
-    (https://github.com/cjhutto/vaderSentiment).
+    This is the base emotion detection node that other emotion detection nodes
+    should inherit from.
     """
 
     def __init__(self):
@@ -38,32 +38,22 @@ def __init__(self):
         param_values = declare_and_get_parameters(
             self,
             [
-                (IN_EXPECT_USER_INTENT_TOPIC,),
-                (IN_INTERP_USER_INTENT_TOPIC,),
-                (OUT_INTERP_USER_EMOTION_TOPIC,),
+                (IN_TOPIC,),
+                (OUT_TOPIC,),
             ],
         )
-        self._in_expect_uintent_topic = param_values[IN_EXPECT_USER_INTENT_TOPIC]
-        self._in_interp_uintent_topic = param_values[IN_INTERP_USER_INTENT_TOPIC]
-        self._out_interp_uemotion_topic = param_values[OUT_INTERP_USER_EMOTION_TOPIC]
+        self._in_topic = param_values[IN_TOPIC]
+        self._out_topic = param_values[OUT_TOPIC]
 
         # Handle subscription/publication topics.
-        self.expect_uintent_subscription = self.create_subscription(
-            InterpretedAudioUserIntent,
-            self._in_expect_uintent_topic,
-            self.intent_detection_callback,
+        self._subscription = self.create_subscription(
+            DialogueUtterance,
+            self._in_topic,
+            self.emotion_detection_callback,
             1,
         )
-        self.interp_uintent_subscription = self.create_subscription(
-            InterpretedAudioUserIntent,
-            self._in_interp_uintent_topic,
-            self.intent_detection_callback,
-            1,
-        )
-        self._interp_emo_publisher = self.create_publisher(
-            InterpretedAudioUserEmotion, self._out_interp_uemotion_topic, 1
-        )
+        self._publication = self.create_publisher(DialogueUtterance, self._out_topic, 1)
 
         self.message_queue = queue.Queue()
         self.handler_thread = threading.Thread(target=self.process_message_queue)
@@ -95,14 +85,14 @@ def _get_vader_sentiment_analysis(self, utterance: str):
         )
         return (classification, confidence)
 
-    def get_inference(self, msg):
+    def get_inference(self, msg: DialogueUtterance):
         """
         Abstract away the different model inference calls depending on the
         node's configure model mode.
""" return self._get_vader_sentiment_analysis(msg.utterance_text) - def intent_detection_callback(self, msg): + def emotion_detection_callback(self, msg: DialogueUtterance): """ This is the main ROS node listener callback loop that will process all messages received via subscribed topics. @@ -119,29 +109,29 @@ def process_message_queue(self): while True: msg = self.message_queue.get() self.log.debug(f'Processing message:\n\n"{msg.utterance_text}"') - classification, confidence_score = self.get_inference(msg) - self.publish_detected_emotion( - msg.utterance_text, classification, confidence_score - ) - - def publish_detected_emotion( - self, utterance: str, classification: str, confidence_score: float - ): + self.process_message(msg) + + def process_message(self, msg: DialogueUtterance): """ Handles message publishing for an utterance with a detected emotion classification. """ - emotion_msg = InterpretedAudioUserEmotion() - emotion_msg.header.frame_id = "Emotion Detection" - emotion_msg.header.stamp = self.get_clock().now().to_msg() - emotion_msg.utterance_text = utterance - emotion_msg.user_emotion = classification - emotion_msg.confidence = confidence_score - self._interp_emo_publisher.publish(emotion_msg) - colored_utterance = colored(utterance, "light_blue") - colored_emotion = colored(classification, "light_green") + classification, confidence_score = self.get_inference(msg) + pub_msg = dialogue_utterance_processing.copy_dialogue_utterance( + msg, + node_name="Emotion Detection", + copy_time=self.get_clock().now().to_msg(), + ) + # Overwrite the user emotion with the latest classification information. + pub_msg.emotion = classification + pub_msg.emotion_confidence_score = confidence_score + self.emotion_publication.publish(pub_msg) + + # Log emotion detection information. + colored_utterance = colored(pub_msg.utterance_text, "light_blue") + colored_emotion = colored(pub_msg.emotion, "light_green") self.log.info( f'Publishing {{"{colored_emotion}": {confidence_score}}} ' - + f'to {self._out_interp_uemotion_topic} for:\n>>> "{colored_utterance}"' + + f'to {self._out_topic} for:\n>>> "{colored_utterance}"' ) def _apply_filter(self, msg): @@ -150,10 +140,6 @@ def _apply_filter(self, msg): none if the message should be filtered out. Else, return the incoming msg if it can be included. """ - # if msg.user_intent.lower() == "user inquiry": - # return msg - # else: - # return None return msg diff --git a/ros/angel_system_nodes/angel_system_nodes/audio/emotion/gpt_emotion_detector.py b/ros/angel_system_nodes/angel_system_nodes/audio/emotion/gpt_emotion_detector.py index 06668bf91..f725478df 100644 --- a/ros/angel_system_nodes/angel_system_nodes/audio/emotion/gpt_emotion_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/audio/emotion/gpt_emotion_detector.py @@ -8,7 +8,7 @@ BaseEmotionDetector, LABEL_MAPPINGS, ) -from angel_utils import make_default_main +from angel_utils import declare_and_get_parameters, make_default_main openai.organization = os.getenv("OPENAI_ORG_ID") openai.api_key = os.getenv("OPENAI_API_KEY") @@ -23,12 +23,22 @@ {"utterance": "We're doing great and I'm learning a lot!", "label": "positive"}, ] +PARAM_TIMEOUT = "timeout" + class GptEmotionDetector(BaseEmotionDetector): def __init__(self): super().__init__() self.log = self.get_logger() + param_values = declare_and_get_parameters( + self, + [ + (PARAM_TIMEOUT, 600), + ], + ) + self.timeout = param_values[PARAM_TIMEOUT] + # This node additionally includes fields for interacting with OpenAI # via LangChain. 
         if not os.getenv("OPENAI_API_KEY"):
diff --git a/ros/angel_system_nodes/angel_system_nodes/audio/intent/__init__.py b/ros/angel_system_nodes/angel_system_nodes/audio/intent/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/ros/angel_system_nodes/angel_system_nodes/audio/intent/base_intent_detector.py b/ros/angel_system_nodes/angel_system_nodes/audio/intent/base_intent_detector.py
index 0651aa512..cb1c7f6fc 100644
--- a/ros/angel_system_nodes/angel_system_nodes/audio/intent/base_intent_detector.py
+++ b/ros/angel_system_nodes/angel_system_nodes/audio/intent/base_intent_detector.py
@@ -3,7 +3,7 @@
 from termcolor import colored
 import threading
 
-from angel_msgs.msg import InterpretedAudioUserIntent, Utterance
+from angel_msgs.msg import DialogueUtterance
 from angel_utils import declare_and_get_parameters
 from angel_utils import make_default_main
 
@@ -18,7 +18,7 @@
 # https://docs.google.com/document/d/1uuvSL5de3LVM9c0tKpRKYazDxckffRHf7IAcabSw9UA .
 INTENT_LABELS = ["next_step", "prev_step", "inquiry", "other"]
 
-UTTERANCES_TOPIC = "utterances_topic"
+IN_TOPIC = "utterances_topic"
 PARAM_EXPECT_USER_INTENT_TOPIC = "expect_user_intent_topic"
 PARAM_INTERP_USER_INTENT_TOPIC = "interp_user_intent_topic"
 
@@ -32,24 +32,24 @@ def __init__(self):
         param_values = declare_and_get_parameters(
             self,
             [
-                (UTTERANCES_TOPIC,),
+                (IN_TOPIC,),
                 (PARAM_EXPECT_USER_INTENT_TOPIC,),
                 (PARAM_INTERP_USER_INTENT_TOPIC,),
             ],
         )
-        self._utterances_topic = param_values[UTTERANCES_TOPIC]
+        self._input_topic = param_values[IN_TOPIC]
         self._expect_uintent_topic = param_values[PARAM_EXPECT_USER_INTENT_TOPIC]
         self._interp_uintent_topic = param_values[PARAM_INTERP_USER_INTENT_TOPIC]
 
         # Handle subscription/publication topics.
         self.subscription = self.create_subscription(
-            Utterance, self._utterances_topic, self.utterance_callback, 1
+            DialogueUtterance, self._input_topic, self.utterance_callback, 1
         )
         self._expected_publisher = self.create_publisher(
-            InterpretedAudioUserIntent, self._expect_uintent_topic, 1
+            DialogueUtterance, self._expect_uintent_topic, 1
         )
         self._interp_publisher = self.create_publisher(
-            InterpretedAudioUserIntent, self._interp_uintent_topic, 1
+            DialogueUtterance, self._interp_uintent_topic, 1
         )
 
         self.utterance_message_queue = queue.Queue()
@@ -63,7 +63,7 @@ def utterance_callback(self, msg):
         This is the main ROS node listener callback loop that will process all
         messages received via subscribed topics.
         """
-        self.log.debug(f'Received message:\n\n"{msg.value}"')
+        self.log.debug(f'Received message:\n\n"{msg.utterance_text}"')
         self.utterance_message_queue.put(msg)
 
     def process_utterance_message_queue(self):
@@ -72,13 +72,10 @@ def process_utterance_message_queue(self):
         """
         while True:
             msg = self.utterance_message_queue.get()
-            self.log.debug(f'Processing message:\n\n"{msg.value}"')
-            intent, score = self.detect_intents(msg)
-            if not intent:
-                continue
-            self.publish_msg(msg.value, intent, score)
+            self.log.debug(f'Processing message:\n\n"{msg.utterance_text}"')
+            self.process_message(msg)
 
-    def detect_intents(self, msg):
+    def process_message(self, msg: DialogueUtterance):
         """
         Keyphrase search for intent detection. This implementation does simple
         string matching to assign a detected label. When multiple intents are
@@ -98,7 +95,7 @@ def _tiebreak_intents(intents, confidences):
             )
             return classification, score
 
-        lower_utterance = msg.value.lower()
+        lower_utterance = msg.utterance_text.lower()
         intents = []
         confidences = []
         if self._contains_phrase(lower_utterance, NEXT_STEP_KEYPHRASES):
@@ -110,36 +107,39 @@
         if self._contains_phrase(lower_utterance, QUESTION_KEYPHRASES):
             intents.append(INTENT_LABELS[2])
             confidences.append(0.5)
+
         if not intents:
-            colored_utterance = colored(msg.value, "light_blue")
+            colored_utterance = colored(msg.utterance_text, "light_blue")
             self.log.info(f'No intents detected for:\n>>> "{colored_utterance}":')
             return None, -1.0
+        else:
+            classification, confidence = _tiebreak_intents(intents, confidences)
+            classification = colored(classification, "light_green")
+            self.publish_message(msg.utterance_text, classification, confidence)
 
-        classification, confidence = _tiebreak_intents(intents, confidences)
-        classification = colored(classification, "light_green")
-        return classification, confidence
-
-    def publish_msg(self, utterance, intent, score):
+    def publish_message(self, msg: DialogueUtterance, intent: str, score: float):
         """
         Handles message publishing for an utterance with a detected intent.
         """
-        intent_msg = InterpretedAudioUserIntent()
-        intent_msg.header.frame_id = "Intent Detection"
-        intent_msg.header.stamp = self.get_clock().now().to_msg()
-        intent_msg.utterance_text = utterance
-        intent_msg.user_intent = intent
-        intent_msg.confidence = score
+        pub_msg = self.copy_dialogue_utterance(
+            msg, node_name="Intent Detection", copy_time=self.get_clock().now().to_msg()
+        )
+        # Overwrite the user intent with the latest classification information.
+        pub_msg.intent = intent
+        pub_msg.intent_confidence_score = score
+
+        # Decide which intent topic to publish the message to.
         published_topic = None
-        if self._contains_phrase(utterance.lower(), OVERRIDE_KEYPHRASES):
-            intent_msg.confidence = 1.0
-            self._expected_publisher.publish(intent_msg)
+        if self._contains_phrase(pub_msg.utterance_text.lower(), OVERRIDE_KEYPHRASES):
             published_topic = PARAM_EXPECT_USER_INTENT_TOPIC
+            pub_msg.intent_confidence_score = 1.0
+            self._expected_publisher.publish(pub_msg)
         else:
-            self._interp_publisher.publish(intent_msg)
             published_topic = PARAM_INTERP_USER_INTENT_TOPIC
+            self._interp_publisher.publish(pub_msg)
 
-        colored_utterance = colored(utterance, "light_blue")
-        colored_intent = colored(intent_msg.user_intent, "light_green")
+        colored_utterance = colored(pub_msg.utterance_text, "light_blue")
+        colored_intent = colored(pub_msg.intent, "light_green")
         self.log.info(
             f'Publishing {{"{colored_intent}": {score}}} to {published_topic} '
             + f'for:\n>>> "{colored_utterance}"'
         )
diff --git a/ros/angel_system_nodes/angel_system_nodes/audio/intent/gpt_intent_detector.py b/ros/angel_system_nodes/angel_system_nodes/audio/intent/gpt_intent_detector.py
index 8efeedbae..47099c2ce 100644
--- a/ros/angel_system_nodes/angel_system_nodes/audio/intent/gpt_intent_detector.py
+++ b/ros/angel_system_nodes/angel_system_nodes/audio/intent/gpt_intent_detector.py
@@ -5,11 +5,12 @@
 import os
 import rclpy
 
+from angel_msgs.msg import DialogueUtterance
 from angel_system_nodes.audio.intent.base_intent_detector import (
     BaseIntentDetector,
     INTENT_LABELS,
 )
-from angel_utils import make_default_main
+from angel_utils import declare_and_get_parameters, make_default_main
 
 
 openai.organization = os.getenv("OPENAI_ORG_ID")
@@ -17,18 +18,29 @@
 
 # The following are few shot examples when prompting GPT.
 FEW_SHOT_EXAMPLES = [
-    {"utterance": "Go back to the previous step!", "label": "prev_step."},
-    {"utterance": "Next step, please.", "label": "next_step"},
-    {"utterance": "How should I wrap this tourniquet?", "label": "inquiry"},
-    {"utterance": "The sky is blue", "label": "other"},
+    {"utterance": "Go back to the previous step!", "label": "prev_step[eos]"},
+    {"utterance": "Next step, please.", "label": "next_step[eos]"},
+    {"utterance": "How should I wrap this tourniquet?", "label": "inquiry[eos]"},
+    {"utterance": "The sky is blue", "label": "other[eos]"},
+    {"utterance": "What is this thing?", "label": "object_clarification[eos]"},
 ]
 
+PARAM_TIMEOUT = "timeout"
+
 
 class GptIntentDetector(BaseIntentDetector):
     def __init__(self):
         super().__init__()
         self.log = self.get_logger()
 
+        param_values = declare_and_get_parameters(
+            self,
+            [
+                (PARAM_TIMEOUT, 600),
+            ],
+        )
+        self.timeout = param_values[PARAM_TIMEOUT]
+
         # This node additionally includes fields for interacting with OpenAI
         # via LangChain.
         if not os.getenv("OPENAI_API_KEY"):
@@ -79,17 +91,16 @@ def _labels_list_str(labels):
             model_name="gpt-3.5-turbo",
             openai_api_key=self.openai_api_key,
             temperature=0.0,
-            # Only 2 tokens needed for classification (tokens are delimited by use of '_', i.e.
-            # 'next_step' counts as 2 tokens).
-            max_tokens=2,
+            request_timeout=self.timeout,
         )
         return LLMChain(llm=openai_llm, prompt=few_shot_prompt)
 
-    def detect_intents(self, msg):
+    def detect_intents(self, msg: DialogueUtterance):
         """
         Detects the user intent via langchain execution of GPT.
""" - return self.chain.run(utterance=msg), 0.5 + intent = self.chain.run(utterance=msg.utterance_text) + return intent.split("[eos]")[0], 0.5 main = make_default_main(GptIntentDetector) diff --git a/ros/angel_system_nodes/angel_system_nodes/audio/intent/intent_detector.py b/ros/angel_system_nodes/angel_system_nodes/audio/intent/intent_detector.py deleted file mode 100644 index 14d35a330..000000000 --- a/ros/angel_system_nodes/angel_system_nodes/audio/intent/intent_detector.py +++ /dev/null @@ -1,127 +0,0 @@ -import rclpy -from rclpy.node import Node - -from angel_msgs.msg import InterpretedAudioUserIntent, Utterance -from angel_utils import make_default_main - - -# Please refer to labels defined in -# https://docs.google.com/document/d/1uuvSL5de3LVM9c0tKpRKYazDxckffRHf7IAcabSw9UA . -NEXT_STEP_KEYPHRASES = ["skip", "next", "next step"] -PREV_STEP_KEYPHRASES = ["previous", "previous step", "last step", "go back"] -OVERRIDE_KEYPHRASES = ["angel", "angel system"] - -# TODO(derekahmed): Please figure out how to keep this sync-ed with -# config/angel_system_cmds/user_intent_to_sys_cmd_v1.yaml. -LABELS = ["Go to next step", "Go to previous step"] - - -UTTERANCES_TOPIC = "utterances_topic" -PARAM_EXPECT_USER_INTENT_TOPIC = "expect_user_intent_topic" -PARAM_INTERP_USER_INTENT_TOPIC = "interp_user_intent_topic" - - -class IntentDetector(Node): - """ - As of Q12023, intent detection is derived heuristically. This will be shifted - to a model-based approach in the near-future. - """ - - def __init__(self): - super().__init__(self.__class__.__name__) - self.log = self.get_logger() - - parameter_names = [ - UTTERANCES_TOPIC, - PARAM_EXPECT_USER_INTENT_TOPIC, - PARAM_INTERP_USER_INTENT_TOPIC, - ] - set_parameters = self.declare_parameters( - namespace="", - parameters=[(p,) for p in parameter_names], - ) - # Check for not-set parameters - some_not_set = False - for p in set_parameters: - if p.type_ is rclpy.parameter.Parameter.Type.NOT_SET: - some_not_set = True - self.log.error(f"Parameter not set: {p.name}") - if some_not_set: - raise ValueError("Some parameters are not set.") - - self._utterances_topic = self.get_parameter(UTTERANCES_TOPIC).value - self._expect_uintent_topic = self.get_parameter( - PARAM_EXPECT_USER_INTENT_TOPIC - ).value - self._interp_uintent_topic = self.get_parameter( - PARAM_INTERP_USER_INTENT_TOPIC - ).value - self.log.info( - f"Utterances topic: " - f"({type(self._utterances_topic).__name__}) " - f"{self._utterances_topic}" - ) - self.log.info( - f"Expected User Intent topic: " - f"({type(self._expect_uintent_topic).__name__}) " - f"{self._expect_uintent_topic}" - ) - self.log.info( - f"Interpreted User Intent topic: " - f"({type(self._interp_uintent_topic).__name__}) " - f"{self._interp_uintent_topic}" - ) - - # TODO(derekahmed): Add internal queueing to reduce subscriber queue - # size to 1. 
-        self.subscription = self.create_subscription(
-            Utterance, self._utterances_topic, self.listener_callback, 10
-        )
-
-        self._expected_publisher = self.create_publisher(
-            InterpretedAudioUserIntent, self._expect_uintent_topic, 1
-        )
-
-        self._interp_publisher = self.create_publisher(
-            InterpretedAudioUserIntent, self._interp_uintent_topic, 1
-        )
-
-    def listener_callback(self, msg):
-        log = self.get_logger()
-        intent_msg = InterpretedAudioUserIntent()
-        intent_msg.utterance_text = msg.value
-
-        lower_utterance = msg.value.lower()
-        if self.contains_phrase(lower_utterance, NEXT_STEP_KEYPHRASES):
-            intent_msg.user_intent = LABELS[0]
-            intent_msg.confidence = 0.5
-        elif self.contains_phrase(lower_utterance, PREV_STEP_KEYPHRASES):
-            intent_msg.user_intent = LABELS[1]
-            intent_msg.confidence = 0.5
-        else:
-            log.info(f'Detected no intents for "{msg.value}":')
-            return
-
-        if self.contains_phrase(lower_utterance, OVERRIDE_KEYPHRASES):
-            intent_msg.confidence = 1.0
-            self._expected_publisher.publish(intent_msg)
-        else:
-            self._interp_publisher.publish(intent_msg)
-
-        log.info(
-            f'Detected intents for "{msg.value}":\n'
-            + f'"{intent_msg.user_intent}": {intent_msg.confidence}'
-        )
-
-    def contains_phrase(self, utterance, phrases):
-        for phrase in phrases:
-            if phrase in utterance:
-                return True
-        return False
-
-
-main = make_default_main(IntentDetector)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/ros/angel_system_nodes/angel_system_nodes/audio/question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/audio/question_answerer.py
index 548d286c6..956044652 100644
--- a/ros/angel_system_nodes/angel_system_nodes/audio/question_answerer.py
+++ b/ros/angel_system_nodes/angel_system_nodes/audio/question_answerer.py
@@ -7,7 +7,7 @@
 from termcolor import colored
 import threading
 
-from angel_msgs.msg import InterpretedAudioUserEmotion, SystemTextResponse
+from angel_msgs.msg import DialogueUtterance, SystemTextResponse
 from angel_utils import declare_and_get_parameters
 from angel_utils import make_default_main
 
@@ -15,9 +15,10 @@
 openai.organization = os.getenv("OPENAI_ORG_ID")
 openai.api_key = os.getenv("OPENAI_API_KEY")
 
-IN_EMOTION_TOPIC = "user_emotion_topic"
+INPUT_TOPIC = "input_topic"
 OUT_QA_TOPIC = "system_text_response_topic"
 FEW_SHOT_PROMPT = "few_shot_prompt_file"
+PARAM_TIMEOUT = "timeout"
 
 
 class QuestionAnswerer(Node):
@@ -28,14 +29,16 @@ def __init__(self):
         param_values = declare_and_get_parameters(
             self,
             [
-                (IN_EMOTION_TOPIC,),
+                (INPUT_TOPIC,),
                 (OUT_QA_TOPIC,),
                 (FEW_SHOT_PROMPT,),
+                (PARAM_TIMEOUT, 600),
             ],
         )
-        self._in_emotion_topic = param_values[IN_EMOTION_TOPIC]
+        self._input_topic = param_values[INPUT_TOPIC]
         self._out_qa_topic = param_values[OUT_QA_TOPIC]
         self.prompt_file = param_values[FEW_SHOT_PROMPT]
+        self.timeout = param_values[PARAM_TIMEOUT]
 
         self.question_queue = queue.Queue()
         self.handler_thread = threading.Thread(target=self.process_question_queue)
@@ -59,8 +62,8 @@ def __init__(self):
 
         # Handle subscription/publication topics.
         self.subscription = self.create_subscription(
-            InterpretedAudioUserEmotion,
-            self._in_emotion_topic,
+            DialogueUtterance,
+            self._input_topic,
             self.question_answer_callback,
             1,
         )
@@ -68,28 +71,25 @@ def __init__(self):
             SystemTextResponse, self._out_qa_topic, 1
         )
 
-    def get_response(self, user_utterance: str, user_emotion: str):
+    def get_response(self, sub_msg: DialogueUtterance):
         """
-        Generate a response to the utterance, enriched with the addition of
-        the user's detected emotion. Inference calls can be added and revised
-        here.
+        Generate a response to the received message.
+        Inference calls can be added and revised here.
         """
-        return_msg = ""
         try:
             if self.is_openai_ready:
                 return_msg = colored(
-                    self.prompt_gpt(user_utterance) + "\n", "light_green"
+                    self.prompt_gpt(sub_msg.utterance_text) + "\n", "light_green"
                 )
         except RuntimeError as err:
             self.log.info(err)
             colored_apology = colored(
                 "I'm sorry. I don't know how to answer your statement.", "light_red"
             )
-            colored_emotion = colored(user_emotion, "light_red")
+            colored_emotion = colored(sub_msg.emotion, "light_red")
             return_msg = (
                 f"{colored_apology} I understand that you feel {colored_emotion}."
             )
-
         return return_msg
 
     def question_answer_callback(self, msg):
@@ -108,22 +108,22 @@ def process_question_queue(self):
         while True:
             msg = self.question_queue.get()
             emotion = msg.user_emotion
-            response = self.get_response(msg.utterance_text, emotion)
-            self.publish_generated_response(msg.utterance_text, response)
-
-    def publish_generated_response(self, utterance: str, response: str):
-        msg = SystemTextResponse()
-        msg.header.frame_id = "GPT Question Answering"
-        msg.header.stamp = self.get_clock().now().to_msg()
-        msg.utterance_text = utterance
-        msg.response = response
-        colored_utterance = colored(utterance, "light_blue")
+            response = self.get_response(msg)
+            self.publish_generated_response(msg, response)
+
+    def publish_generated_response(self, sub_msg: DialogueUtterance, response: str):
+        pub_msg = SystemTextResponse()
+        pub_msg.header.frame_id = "GPT Question Answering"
+        pub_msg.header.stamp = self.get_clock().now().to_msg()
+        pub_msg.utterance_text = sub_msg.utterance_text
+        pub_msg.response = response
+        colored_utterance = colored(sub_msg.utterance_text, "light_blue")
         colored_response = colored(response, "light_green")
         self.log.info(
             f'Responding to utterance:\n>>> "{colored_utterance}"\n>>> with:\n'
             + f'>>> "{colored_response}"'
         )
-        self._qa_publisher.publish(msg)
+        self._qa_publisher.publish(pub_msg)
 
     def prompt_gpt(self, question, model: str = "gpt-3.5-turbo"):
         prompt = self.prompt.format(question)
@@ -138,6 +138,7 @@
             "https://api.openai.com/v1/chat/completions",
             json=payload,
             headers={"Authorization": "Bearer {}".format(self.openai_api_key)},
+            timeout=self.timeout,
         )
         return (
             json.loads(req.text)["choices"][0]["message"]["content"]
diff --git a/ros/angel_system_nodes/setup.py b/ros/angel_system_nodes/setup.py
index 81c576c9a..f0f8baed3 100644
--- a/ros/angel_system_nodes/setup.py
+++ b/ros/angel_system_nodes/setup.py
@@ -20,12 +20,11 @@
     entry_points={
         "console_scripts": [
             "video_listener = angel_system_nodes.video_subscriber:main",
-            "base_intent_detector = angel_system_nodes.base_intent_detector:main",
-            "gpt_intent_detector = angel_system_nodes.gpt_intent_detector:main",
-            "base_emotion_detector = angel_system_nodes.base_emotion_detector:main",
-            "gpt_emotion_detector = angel_system_nodes.gpt_emotion_detector:main",
-            "question_answerer = angel_system_nodes.question_answerer:main",
-            "intent_detector = angel_system_nodes.intent_detector:main",
+            "base_intent_detector = angel_system_nodes.audio.intent.base_intent_detector:main",
+            "gpt_intent_detector = angel_system_nodes.audio.intent.gpt_intent_detector:main",
+            "base_emotion_detector = angel_system_nodes.audio.emotion.base_emotion_detector:main",
+            "gpt_emotion_detector = angel_system_nodes.audio.emotion.gpt_emotion_detector:main",
+            "question_answerer = angel_system_nodes.audio.question_answerer:main",
             "spatial_mapper = angel_system_nodes.spatial_mapper:main",
             "feedback_generator = angel_system_nodes.feedback_generator:main",
             "annotation_event_monitor = angel_system_nodes.annotation_event_monitor:main",
diff --git a/tmux/demos/cooking/eval_vocalized_question_answering.yml b/tmux/demos/cooking/eval_vocalized_question_answering.yml
index 3fd3c8dbb..de0e0a5c8 100644
--- a/tmux/demos/cooking/eval_vocalized_question_answering.yml
+++ b/tmux/demos/cooking/eval_vocalized_question_answering.yml
@@ -25,7 +25,7 @@ root: <%= ENV["ANGEL_WORKSPACE_DIR"] %>
 # on_project_start: command
 on_project_start: |
   export ROS_NAMESPACE=${ROS_NAMESPACE:-/debug}
-  export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs
+  export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/config
   export NODE_RESOURCES_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/resource
 
 # Run on project start, the first time
 # on_project_first_start: command
@@ -60,7 +60,7 @@ tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf
 
 windows:
   # - ros_bag_play: ros2 bag play <>
-  - ros_bag_play: sleep 5; ros2 bag play /angel_workspace/ros_bags/rosbag2_2023_03_01-17_28_00/rosbag2_2023_03_01-17_28_00_0.db3
+  - ros_bag_play: sleep 5; ros2 bag play /angel_workspace/ros_bags/rosbag2_2023_07_12-17_51_14_0.db3
   - vocal:
       layout: even-vertical
      panes:
@@ -94,14 +94,13 @@ panes:
        - gpt_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args
            -r __ns:=${ROS_NAMESPACE}
-            -p expect_user_intent_topic:=expect_user_intent_topic
-            -p interp_user_intent_topic:=interp_user_intent_topic
+            -p input_topic:=interp_user_intent_topic
            -p user_emotion_topic:=gpt_emotion_topic
 
   - question_answering:
      layout: even-vertical
      panes:
        - gpt_question_answering: ros2 run angel_system_nodes question_answerer --ros-args
            -r __ns:=${ROS_NAMESPACE}
-            -p user_emotion_topic:=gpt_emotion_topic
+            -p input_topic:=gpt_emotion_topic
            -p system_text_response_topic:=system_text_response_topic
            -p few_shot_prompt_file:=${CONFIG_DIR}/llm_prompts/tourniquet_steps_prompt
diff --git a/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs b/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs
new file mode 100644
index 000000000..2e1479edd
--- /dev/null
+++ b/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs
@@ -0,0 +1,99 @@
+//Do not edit! This file was generated by Unity-ROS MessageGeneration.
+using System;
+using System.Linq;
+using System.Collections.Generic;
+using System.Text;
+using Unity.Robotics.ROSTCPConnector.MessageGeneration;
+
+namespace RosMessageTypes.Angel
+{
+    [Serializable]
+    public class DialogueUtteranceMsg : Message
+    {
+        public const string k_RosMessageName = "angel_msgs/DialogueUtterance";
+        public override string RosMessageName => k_RosMessageName;
+
+        //
+        //  Dialogue Utterance with additional information about the environmental state
+        //  and user model.
+        //
+        //  The header primarily encapsulates when this message was emitted.
+        //  The time component of this may be utilized as an identifier for this user
+        //  intent and utterance.
+        public Std.HeaderMsg header;
+        //  Speech-to-text of the user utterance we have interpreted
+        public string utterance_text;
+        //  Below are optional fields
+        //  Canonical user intent that has been interpreted. "Canonical" in this context
+        //  is to mean that this string may be used as an identifier of this type of
+        //  user intent. Should be in the range [0,1] where 1.0 means absolute confidence.
+        public string intent;
+        public double intent_confidence_score;
+        //  Emotion classification. Should be in the range [0,1] where 1.0 means absolute confidence.
+        public string emotion;
+        public double emotion_confidence_score;
+
+        public DialogueUtteranceMsg()
+        {
+            this.header = new Std.HeaderMsg();
+            this.utterance_text = "";
+            this.intent = "";
+            this.intent_confidence_score = 0.0;
+            this.emotion = "";
+            this.emotion_confidence_score = 0.0;
+        }
+
+        public DialogueUtteranceMsg(Std.HeaderMsg header, string utterance_text, string intent, double intent_confidence_score, string emotion, double emotion_confidence_score)
+        {
+            this.header = header;
+            this.utterance_text = utterance_text;
+            this.intent = intent;
+            this.intent_confidence_score = intent_confidence_score;
+            this.emotion = emotion;
+            this.emotion_confidence_score = emotion_confidence_score;
+        }
+
+        public static DialogueUtteranceMsg Deserialize(MessageDeserializer deserializer) => new DialogueUtteranceMsg(deserializer);
+
+        private DialogueUtteranceMsg(MessageDeserializer deserializer)
+        {
+            this.header = Std.HeaderMsg.Deserialize(deserializer);
+            deserializer.Read(out this.utterance_text);
+            deserializer.Read(out this.intent);
+            deserializer.Read(out this.intent_confidence_score);
+            deserializer.Read(out this.emotion);
+            deserializer.Read(out this.emotion_confidence_score);
+        }
+
+        public override void SerializeTo(MessageSerializer serializer)
+        {
+            serializer.Write(this.header);
+            serializer.Write(this.utterance_text);
+            serializer.Write(this.intent);
+            serializer.Write(this.intent_confidence_score);
+            serializer.Write(this.emotion);
+            serializer.Write(this.emotion_confidence_score);
+        }
+
+        public override string ToString()
+        {
+            return "DialogueUtteranceMsg: " +
+            "\nheader: " + header.ToString() +
+            "\nutterance_text: " + utterance_text.ToString() +
+            "\nintent: " + intent.ToString() +
+            "\nintent_confidence_score: " + intent_confidence_score.ToString() +
+            "\nemotion: " + emotion.ToString() +
+            "\nemotion_confidence_score: " + emotion_confidence_score.ToString();
+        }
+
+#if UNITY_EDITOR
+        [UnityEditor.InitializeOnLoadMethod]
+#else
+        [UnityEngine.RuntimeInitializeOnLoadMethod]
+#endif
+        public static void Register()
+        {
+            MessageRegistry.Register(k_RosMessageName, Deserialize);
+        }
+    }
+}
\ No newline at end of file
diff --git a/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs.meta b/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs.meta
new file mode 100644
index 000000000..5d154c5ad
--- /dev/null
+++ b/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 244f6af8d6d7e4c18a6e2d52b444d387
+MonoImporter:
+  externalObjects: {}
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  userData:
+  assetBundleName:
+  assetBundleVariant:
\ No newline at end of file