Add speech recognition phrase list to the Web Speech API (#145)

yrw-google · web-flow · commit bc21038e415f · 2025-04-14T13:51:35.000-07:00
* Add speech recognition context to the Web Speech API

Introduce a new speech recognition context feature for contextual biasing

* Add phrases instead of context

Remove SpeechRecognitionContext and add SpeechRecognitionPhraseList to SpeechRecognition directly

Remove updateContext and always update phrases instead

Rename context-not-supported error code to phrases-not-supported

Add removeItem to SpeechRecognitionPhraseList

* Minor updates for comments

* Add descriptions for corner cases
diff --git a/.gitignore b/.gitignore
@@ -1 +1,3 @@
 index.html
+.DS_Store
+.idea/
diff --git a/index.bs b/index.bs
@@ -151,6 +151,18 @@ The term "interim result" indicates a SpeechRecognitionResult in which the final
         A boolean flag representing whether the speech recognition started. The initial value is <code>false</code>.
 </dl>
 
+<dl dfn-type=attribute dfn-for="SpeechRecognition">
+    : <dfn>[[mode]]</dfn>
+    ::
+        A {{SpeechRecognitionMode}} enum to determine where speech recognition takes place. The initial value is <code>ondevice-preferred</code>.
+</dl>
+
+<dl dfn-type=attribute dfn-for="SpeechRecognition">
+    : <dfn>[[phrases]]</dfn>
+    ::
+        A {{SpeechRecognitionPhraseList}} representing a list of phrases for contextual biasing. The initial value is an empty {{SpeechRecognitionPhraseList}}.
+</dl>
+
 <xmp class="idl">
 [Exposed=Window]
 interface SpeechRecognition : EventTarget {
@@ -162,6 +174,7 @@ interface SpeechRecognition : EventTarget {
     attribute boolean interimResults;
     attribute unsigned long maxAlternatives;
     attribute SpeechRecognitionMode mode;
+    attribute SpeechRecognitionPhraseList phrases;
 
     // methods to drive the speech interaction
     undefined start();
@@ -192,7 +205,8 @@ enum SpeechRecognitionErrorCode {
     "network",
     "not-allowed",
     "service-not-allowed",
-    "language-not-supported"
+    "language-not-supported",
+    "phrases-not-supported"
 };
 
 enum SpeechRecognitionMode {
@@ -254,12 +268,29 @@ dictionary SpeechRecognitionEventInit : EventInit {
     unsigned long resultIndex = 0;
     required SpeechRecognitionResultList results;
 };
+
+// A single phrase for contextual biasing: immutable text plus a boost weight (defaults to 1.0 when omitted).
+[Exposed=Window]
+interface SpeechRecognitionPhrase {
+    constructor(DOMString phrase, optional float boost = 1.0);
+    readonly attribute DOMString phrase;
+    readonly attribute float boost;
+};
+
+// An index-addressable list of SpeechRecognitionPhrase objects for contextual biasing; items are appended with addItem and removed by index with removeItem.
+[Exposed=Window]
+interface SpeechRecognitionPhraseList {
+    constructor(sequence<SpeechRecognitionPhrase> phrases);
+    readonly attribute unsigned long length;
+    SpeechRecognitionPhrase item(unsigned long index);
+    undefined addItem(SpeechRecognitionPhrase item);
+    undefined removeItem(unsigned long index);
+};
 </xmp>
 
 <h4 id="speechreco-attributes">SpeechRecognition Attributes</h4>
 
 <dl>
-
   <dt><dfn attribute for=SpeechRecognition>lang</dfn> attribute</dt>
   <dd>This attribute will set the language of the recognition for the request, using a valid BCP 47 language tag. [[!BCP47]]
   If unset it remains unset for getting in script, but will default to use the language of the html document root element and associated hierarchy.
@@ -283,7 +314,35 @@ dictionary SpeechRecognitionEventInit : EventInit {
   The default value is 1.</dd>
 
   <dt><dfn attribute for=SpeechRecognition>mode</dfn> attribute</dt>
-  <dd>An enum to determine where speech recognition takes place. The default value is "ondevice-preferred".</dd>
+  <dd>
+   This attribute represents where speech recognition takes place.
+  </dd>
+  <dd>
+    The getter steps are to return the value of {{SpeechRecognition/[[mode]]}}.
+  </dd>
+  <dd>
+    The setter steps are:
+    1. If the {{SpeechRecognitionPhraseList/length}} of {{SpeechRecognition/phrases}} is greater than 0
+        and the system using the given value for {{SpeechRecognition/[[mode]]}} does not support contextual biasing,
+        throw a {{SpeechRecognitionErrorEvent}} with the {{SpeechRecognitionErrorCode/phrases-not-supported}}
+        error code and abort these steps.
+    1. Set {{SpeechRecognition/[[mode]]}} to the given value.
+  </dd>
+
+  <dt><dfn attribute for=SpeechRecognition>phrases</dfn> attribute</dt>
+  <dd>
+    This attribute represents a list of phrases for contextual biasing.
+  </dd>
+  <dd>
+    The getter steps are to return the value of {{SpeechRecognition/[[phrases]]}}.
+  </dd>
+  <dd>
+    The setter steps are:
+    1. If the {{SpeechRecognitionPhraseList/length}} of the given value is greater than 0 and the system does not support contextual biasing,
+        throw a {{SpeechRecognitionErrorEvent}} with the {{SpeechRecognitionErrorCode/phrases-not-supported}} error code and abort these steps.
+    1. Set {{SpeechRecognition/[[phrases]]}} to the given value.
+    1. Send a copy of {{SpeechRecognition/[[phrases]]}} to the system to initialize or update the phrases used by the contextual biasing implementation.
+  </dd>
 </dl>
 
 <p class=issue>The group has discussed whether WebRTC might be used to specify selection of audio sources and remote recognizers.
@@ -479,6 +538,9 @@ For example, some implementations may fire <a event for=SpeechRecognition>audioe
 
     <dt><dfn enum-value for=SpeechRecognitionErrorCode>"language-not-supported"</dfn></dt>
     <dd>The language was not supported.</dd>
+
+    <dt><dfn enum-value for=SpeechRecognitionErrorCode>"phrases-not-supported"</dfn></dt>
+    <dd>The speech recognition model does not support phrases for contextual biasing.</dd>
   </dl>
   </dd>
 
@@ -557,6 +619,98 @@ For a non-continuous recognition it will hold only a single value.</p>
   Note that when resultIndex equals results.length, no new results are returned, this may occur when the array length decreases to remove one or more interim results.</dd>
 </dl>
 
+<h4 id="speechreco-phrase">SpeechRecognitionPhrase</h4>
+
+<p>The SpeechRecognitionPhrase object represents a phrase for contextual biasing and has the following internal slots:</p>
+
+<dl dfn-type=attribute dfn-for="SpeechRecognitionPhrase">
+    : <dfn>[[phrase]]</dfn>
+    ::
+        A {{DOMString}} representing the text string to be boosted. The initial value is the empty string.
+        An empty string is allowed but should be ignored by the speech recognition model.
+</dl>
+
+<dl dfn-type=attribute dfn-for="SpeechRecognitionPhrase">
+    : <dfn>[[boost]]</dfn>
+    ::
+        A float representing approximately the natural log of how many times more likely the website considers this phrase to be
+        than the speech recognition model would otherwise assume.
+        A valid boost must be a float value inside the range [0.0, 10.0], with a default value of 1.0 if not specified.
+        A boost of 0.0 means the phrase is not boosted at all, and a higher boost means the phrase is more likely to appear.
+        A boost of 10.0 means the phrase is extremely likely to appear and should rarely be set.
+</dl>
+
+<dl>
+  <dt><dfn constructor for=SpeechRecognitionPhrase>SpeechRecognitionPhrase(|phrase|, |boost|)</dfn> constructor</dt>
+  <dd>
+    When this constructor is invoked, run the following steps:
+    1. If |boost| is smaller than 0.0 or greater than 10.0, throw a {{SyntaxError}} and abort these steps.
+    1. Let |phr| be a new object of type {{SpeechRecognitionPhrase}}.
+    1. Set |phr|.{{[[phrase]]}} to be the value of |phrase|.
+    1. Set |phr|.{{[[boost]]}} to be the value of |boost|.
+    1. Return |phr|.
+  </dd>
+
+  <dt><dfn attribute for=SpeechRecognitionPhrase>phrase</dfn> attribute</dt>
+  <dd>This attribute returns the value of {{[[phrase]]}}.</dd>
+
+  <dt><dfn attribute for=SpeechRecognitionPhrase>boost</dfn> attribute</dt>
+  <dd>This attribute returns the value of {{[[boost]]}}.</dd>
+</dl>
+
+<h4 id="speechreco-phraselist">SpeechRecognitionPhraseList</h4>
+
+<p>The SpeechRecognitionPhraseList object holds a list of phrases for contextual biasing and has the following internal slot:</p>
+
+<dl dfn-type=attribute dfn-for="SpeechRecognitionPhraseList">
+    : <dfn>[[phrases]]</dfn>
+    ::
+        A list of {{SpeechRecognitionPhrase}} representing the phrases to be boosted. The initial value is an empty list.
+</dl>
+
+<dl>
+  <dt><dfn constructor for=SpeechRecognitionPhraseList>SpeechRecognitionPhraseList(|phrases|)</dfn> constructor</dt>
+  <dd>
+    When this constructor is invoked, run the following steps:
+    1. Let |list| be a new object of type {{SpeechRecognitionPhraseList}}.
+    1. Set |list|.{{SpeechRecognitionPhraseList/[[phrases]]}} to be the value of |phrases|.
+    1. Return |list|.
+  </dd>
+
+  <dt><dfn attribute for=SpeechRecognitionPhraseList>length</dfn> attribute</dt>
+  <dd>
+    This attribute indicates the number of phrases in the list.
+    When invoked, return the number of items in {{SpeechRecognitionPhraseList/[[phrases]]}}.
+  </dd>
+
+  <dt><dfn method for=SpeechRecognitionPhraseList>item(|index|)</dfn> method</dt>
+  <dd>
+    This method gets the {{SpeechRecognitionPhrase}} object at the |index| of the list.
+    When invoked, run the following steps:
+    1. If |index| is smaller than 0, or greater than or equal to {{SpeechRecognitionPhraseList/length}},
+        throw a {{RangeError}} and abort these steps.
+    1. Return the {{SpeechRecognitionPhrase}} at the |index| of {{SpeechRecognitionPhraseList/[[phrases]]}}.
+  </dd>
+
+  <dt><dfn method for=SpeechRecognitionPhraseList>addItem(|item|)</dfn> method</dt>
+  <dd>
+    This method adds the {{SpeechRecognitionPhrase}} object |item| to the list.
+    When invoked, add |item| to the end of {{SpeechRecognitionPhraseList/[[phrases]]}}.
+    The list is allowed to have multiple {{SpeechRecognitionPhrase}} objects with the same {{SpeechRecognitionPhrase/[[phrase]]}} value,
+    and the speech recognition model should use the last {{SpeechRecognitionPhrase/[[boost]]}} value
+    for this {{SpeechRecognitionPhrase/[[phrase]]}} in the list.
+  </dd>
+
+  <dt><dfn method for=SpeechRecognitionPhraseList>removeItem(|index|)</dfn> method</dt>
+  <dd>
+    This method removes the {{SpeechRecognitionPhrase}} object at the |index| of the list.
+    When invoked, run the following steps:
+    1. If |index| is smaller than 0, or greater than or equal to {{SpeechRecognitionPhraseList/length}},
+        throw a {{RangeError}} and abort these steps.
+    1. Remove the {{SpeechRecognitionPhrase}} object at the |index| of {{SpeechRecognitionPhraseList/[[phrases]]}}.
+  </dd>
+</dl>
+
 <h3 id="tts-section">The SpeechSynthesis Interface</h3>
 
 <p>The SpeechSynthesis interface is the scripted web API for controlling a text-to-speech output.</p>

Original file line number	Diff line number	Diff line change
`@@ -1 +1,3 @@`
`1`	`1`	`index.html`
	`2`	`+.DS_Store`
	`3`	`+.idea/`