forked from speechmatics/speechmatics-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcli.py
More file actions
executable file
·570 lines (488 loc) · 17.3 KB
/
cli.py
File metadata and controls
executable file
·570 lines (488 loc) · 17.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
#!/usr/bin/env python3
"""Example usage of speechmatics by implementing a CLI."""
import argparse
import json
import logging
import ssl
import sys
from dataclasses import dataclass
from typing import List
from speechmatics.client import WebsocketClient
from speechmatics.models import (
TranscriptionConfig,
AudioSettings,
ClientMessageType,
ServerMessageType,
ConnectionSettings,
)
LOGGER = logging.getLogger(__name__)
def print_symbol(symbol):
    """
    Writes one symbol to standard error.

    No newline is appended and the stream is flushed on every call, so
    consecutive symbols appear next to each other as soon as they are
    printed.

    Args:
        symbol (str): The symbol to print.
    """
    print(symbol, file=sys.stderr, end="", flush=True)
def parse_additional_vocab(additional_vocab_filepath):
    """
    Parses an additional vocab list from a JSON file.

    The file is read in binary mode so that the json module detects the
    encoding itself (UTF-8, UTF-16 or UTF-32, with or without a byte order
    mark) rather than depending on the platform's default text encoding.

    Args:
        additional_vocab_filepath (str): Path to the additional vocab file.

    Returns:
        List[Union[dict, str]]: A list of objects or strings which are the
            additional vocab items.

    Raises:
        SystemExit: If the file is not valid json, or if its top-level
            value is not a list.
    """
    with open(additional_vocab_filepath, "rb") as additional_vocab_file:
        try:
            additional_vocab = json.load(additional_vocab_file)
        except (json.JSONDecodeError, UnicodeDecodeError) as error:
            # Bytes that cannot be decoded are reported the same way as
            # malformed json: either way the file is unusable.
            raise SystemExit(
                f"Provided additional vocab at: {additional_vocab_filepath} "
                f"is not valid json."
            ) from error

    if not isinstance(additional_vocab, list):
        raise SystemExit(
            f"Additional vocab file at: {additional_vocab_filepath} "
            "should be a list of objects/strings."
        )

    if not additional_vocab:
        # An empty list is accepted, but is most likely a mistake.
        LOGGER.warning(
            "Provided additional vocab at: %s is an empty list.",
            additional_vocab_filepath,
        )

    return additional_vocab
def additional_vocab_item(to_parse):
    """
    Converts one command line additional vocab entry into the structure
    used for additional vocab. Intended as the `type` callable of the
    additional vocab command line argument.

    The accepted form is `content` or `content:sounds_like[,sounds_like...]`.
    Empty entries in the comma separated sounds-like list are dropped.

    Args:
        to_parse (str): The item to parse.

    Returns:
        Union[dict, str]: The bare content string when no separator is
            present, otherwise a dictionary holding "content" and, if any
            non-empty sounds-like entries were given, "sounds_like".

    Raises:
        argparse.ArgumentTypeError: If the item has more than one separator
            or has no content.
    """
    to_parse = str(to_parse)

    if to_parse.count(":") > 1:
        raise argparse.ArgumentTypeError(
            f"Can't have more than one separator (:) in additional vocab: "
            f"{to_parse}."
        )

    content, separator, sounds_like_part = to_parse.partition(":")
    if not content:
        raise argparse.ArgumentTypeError(
            f"Additional vocab must at least have content in: {to_parse}"
        )

    if not separator:
        return content

    sounds_like = [item for item in sounds_like_part.split(",") if item]
    if sounds_like:
        return {"content": content, "sounds_like": sounds_like}
    return {"content": content}
def get_log_level(verbosity):
    """
    Returns the appropriate log level given a verbosity level.

    Args:
        verbosity (int): Verbosity level, i.e. how many times the -v flag
            was given (0, 1 or 2).

    Returns:
        int: The logging level (e.g. logging.INFO).

    Raises:
        SystemExit: If the given verbosity level is invalid.
    """
    levels = {
        0: logging.WARNING,
        1: logging.INFO,
        2: logging.DEBUG,
    }
    try:
        return levels[verbosity]
    except (KeyError, TypeError):
        # TypeError covers unhashable values. The offending value is taken
        # from the argument itself instead of being parsed back out of the
        # exception text, which only worked for integers.
        if isinstance(verbosity, int) and verbosity > 0:
            requested = f"-{'v' * verbosity}"
        else:
            requested = repr(verbosity)
        raise SystemExit(
            f"Only supports 2 log levels eg. -vv, you are asking for "
            f"{requested}"
        ) from None
@dataclass
class Transcripts:
    """
    Container for the transcription output collected while the CLI runs.
    """

    # Text of the final transcripts, concatenated as they are received.
    text: str
    # The received transcript messages, in order of arrival.
    json: List[dict]
def get_connection_settings(args):
    """
    Builds the ConnectionSettings object described by the command line
    options given to the program.

    The "ssl_mode" option adjusts the SSL context of the settings:
    "insecure" turns off hostname checking and certificate verification,
    "none" removes the SSL context altogether, and any other value leaves
    the context created by ConnectionSettings untouched.

    Args:
        args (dict): Keyword arguments, typically from the command line.

    Returns:
        speechmatics.models.ConnectionSettings: Settings for the WebSocket
            connection.
    """
    connection = ConnectionSettings(
        url=args["url"],
        message_buffer_size=args["buffer_size"],
    )

    ssl_mode = args["ssl_mode"]
    if ssl_mode == "none":
        connection.ssl_context = None
    elif ssl_mode == "insecure":
        connection.ssl_context.check_hostname = False
        connection.ssl_context.verify_mode = ssl.CERT_NONE

    return connection
def get_transcription_config(args):
    """
    Helper function which returns a TranscriptionConfig object based on the
    command line options given to the program.

    Additional vocab given in a file and on the command line is combined,
    with the file's entries first. Punctuation overrides are only set when
    at least one punctuation option was explicitly provided.

    Args:
        args (dict): Keyword arguments probably from the command line.

    Returns:
        speechmatics.models.TranscriptionConfig: Settings for the ASR engine.

    Raises:
        SystemExit: If the additional vocab file cannot be parsed.
    """
    config = TranscriptionConfig(
        args["lang"],
        # Partials are passed as None rather than False when disabled.
        enable_partials=True if args["enable_partials"] else None,
        output_locale=args["output_locale"],
        max_delay=args["max_delay"],
        diarization=args["diarization"],
        speaker_change_sensitivity=args["speaker_change_sensitivity"],
        n_best_limit=args["n_best_limit"],
    )

    vocab_file = args["additional_vocab_file"]
    if vocab_file:
        config.additional_vocab = parse_additional_vocab(vocab_file)
        LOGGER.info("Using additional vocab from file %s", vocab_file)

    if args["additional_vocab"]:
        if not config.additional_vocab:
            config.additional_vocab = args["additional_vocab"]
        else:
            config.additional_vocab.extend(args["additional_vocab"])
        LOGGER.info(
            "Using additional vocab from args %s", args["additional_vocab"])

    permitted_marks = args["punctuation_permitted_marks"]
    sensitivity = args["punctuation_sensitivity"]
    # Both options are compared against None: an empty marks string and a
    # sensitivity of 0.0 are explicit choices and must not be discarded as
    # "not provided".
    if permitted_marks is not None or sensitivity is not None:
        overrides = {}
        if permitted_marks is not None:
            overrides["permitted_marks"] = permitted_marks.split()
        if sensitivity is not None:
            overrides["sensitivity"] = sensitivity
        config.punctuation_overrides = overrides

    return config
def get_audio_settings(args):
    """
    Builds the AudioSettings object described by the command line options
    given to the program.

    Args:
        args (dict): Keyword arguments, typically from the command line.
            The "sample_rate", "chunk_size" and "raw" entries are used;
            "raw" supplies the encoding.

    Returns:
        speechmatics.models.AudioSettings: Settings for the audio stream
            in the connection.
    """
    return AudioSettings(
        sample_rate=args["sample_rate"],
        chunk_size=args["chunk_size"],
        encoding=args["raw"],
    )
# pylint: disable=too-many-arguments
def add_printing_handlers(
    api,
    transcripts,
    enable_partials=False,
    debug_handlers_too=False,
    speaker_change_token=False,
    language="en",
):
    """
    Registers handlers on the websocket client which print transcripts as
    they arrive. Partial transcripts go to standard error, final transcripts
    (and any N-best alternatives) go to standard output.

    Args:
        api (speechmatics.client.WebsocketClient): Client instance.
        transcripts (Transcripts): Collects every final transcript message
            and the concatenated final text.
        enable_partials (bool, optional): Whether or not partials are enabled.
        debug_handlers_too (bool, optional): Whether or not to also register
            'debug' handlers that print an ASCII symbol for each message
            received or sent.
        speaker_change_token (bool, optional): Whether to explicitly include a
            speaker change token '<sc>' in the output to indicate speaker
            changes.
        language (string, optional): The language code of the model being used.
            This is needed to configure language-specific text formatting.
    """

    def symbol_printer(symbol):
        # Handler factory: the returned callable ignores its arguments and
        # prints the given symbol.
        return lambda *args: print_symbol(symbol)

    if debug_handlers_too:
        debug_symbols = (
            (ServerMessageType.AudioAdded, "-"),
            (ServerMessageType.AddPartialTranscript, "."),
            (ServerMessageType.AddTranscript, "|"),
        )
        for message_type, symbol in debug_symbols:
            api.add_event_handler(message_type, symbol_printer(symbol))
        api.add_middleware(ClientMessageType.AddAudio, symbol_printer("+"))

    def on_partial_transcript(message):
        # "\n" does not appear in partial transcripts, so ending with "\r"
        # lets the next partial overwrite this one on the same line.
        partial_text = message["metadata"]["transcript"]
        print(f"{partial_text}", end="\r", file=sys.stderr)

    def on_final_transcript(message):
        transcripts.json.append(message)

        final_text = message["metadata"]["transcript"]
        if final_text:
            if speaker_change_token:
                final_text = final_text.replace("\n", "\n<sc>\n")
            transcripts.text += final_text
            print(final_text)

        n_best_results = message.get("n_best_results", [])
        if not n_best_results:
            return

        # Only the first N-best result is printed.
        for alternative in n_best_results[0]["n_best_list"]:
            contents = (word["content"] for word in alternative["words"])
            joined = join_words(contents, language=language)
            print("* [{:.4f}] {}".format(alternative["confidence"], joined))
        print()

    def on_end_of_transcript(_):
        if enable_partials:
            print("\n", file=sys.stderr)

    handlers = (
        (ServerMessageType.AddPartialTranscript, on_partial_transcript),
        (ServerMessageType.AddTranscript, on_final_transcript),
        (ServerMessageType.EndOfTranscript, on_end_of_transcript),
    )
    for message_type, handler in handlers:
        api.add_event_handler(message_type, handler)
def join_words(words, language="en"):
    """
    Concatenates words using the separator appropriate for the language.
    Words are joined without any separator for "ja" and "cmn", and with a
    single space for every other language code.

    Args:
        words (List[str]): List of words
        language (str): Language code

    Returns:
        str: Words joined with a language-specific separator.
    """
    unspaced_languages = {"ja", "cmn"}
    separator = "" if language in unspaced_languages else " "
    return separator.join(words)
def main(args=None):
    """
    Main entrypoint.

    Args:
        args (Union[List[str], dict, argparse.Namespace], optional):
            Command-line arguments. A list of strings is parsed with
            `parse_args`; an already parsed dictionary or namespace is used
            as is. Defaults to None, in which case the arguments are
            retrieved from `sys.argv` (passing them explicitly is useful
            mainly for unit tests).

    Raises:
        SystemExit: If the command is unknown, the verbosity is invalid or
            the URL scheme and the ssl mode contradict each other.
    """
    if not args:
        args = vars(parse_args())
    elif isinstance(args, argparse.Namespace):
        args = vars(args)
    elif not isinstance(args, dict):
        # The documented form: a raw list of command line strings.
        args = vars(parse_args(list(args)))

    logging.basicConfig(level=get_log_level(args["verbose"]))
    LOGGER.info("Args: %s", args)

    if args["command"] != "transcribe":
        raise SystemExit(f"Unknown command: {args['command']}")

    # Reject contradictory options before any client object is created.
    url = args["url"].lower()
    if url.startswith("ws://") and args["ssl_mode"] != "none":
        raise SystemExit(
            f"ssl_mode '{args['ssl_mode']}' is incompatible with protocol"
            " 'ws'. Use 'wss' instead."
        )
    if url.startswith("wss://") and args["ssl_mode"] == "none":
        raise SystemExit(
            "ssl_mode 'none' is incompatible with protocol 'wss'. "
            "Use 'ws' instead."
        )

    api = WebsocketClient(get_connection_settings(args))

    transcripts = Transcripts(text="", json=[])
    add_printing_handlers(
        api,
        transcripts,
        enable_partials=args["enable_partials"],
        debug_handlers_too=args["debug"],
        speaker_change_token=args["speaker_change_token"],
        language=args["lang"],
    )

    def run(stream):
        # Returns False when the user interrupted the transcription.
        try:
            api.run_synchronously(
                stream, get_transcription_config(args),
                get_audio_settings(args)
            )
        except KeyboardInterrupt:
            # Gracefully handle Ctrl-C, else we get a huge stack-trace.
            LOGGER.warning("Keyboard interrupt received.")
            return False
        return True

    if args["files"][0] == "-":
        # A leading "-" means the audio is read from standard input.
        run(sys.stdin.buffer)
        return

    for filename in args["files"]:
        with open(filename, "rb") as audio_file:
            if not run(audio_file):
                # Ctrl-C stops the whole run, not just the current file.
                break
def parse_args(args=None):
    """
    Parses command-line arguments.

    The parser has a global `-v` verbosity counter and a single
    `transcribe` sub-command which carries all the connection, audio and
    transcription options.

    Arguments:
        args: (List[str], optional): List of arguments to parse. Defaults to
            None, in which case argparse reads `sys.argv`.

    Returns:
        Namespace: The set of arguments provided along with their values.
    """
    parser = argparse.ArgumentParser(
        description="CLI for Speechmatics products.")
    parser.add_argument(
        "-v",
        dest="verbose",
        action="count",
        default=0,
        help=(
            "Set the log level for verbose logs. "
            "The number of flags indicate the level, eg. "
            "-v is INFO and -vv is DEBUG."
        ),
    )

    subparsers = parser.add_subparsers(title='Commands', dest='command')
    transcribe_subparser = subparsers.add_parser(
        "transcribe",
        help="Transcribe one or more audio file(s)"
    )

    # Connection options.
    transcribe_subparser.add_argument(
        "--ssl-mode",
        default="regular",
        choices=["regular", "insecure", "none"],
        help=(
            "Use a preset configuration for the SSL context. With `regular` "
            "mode a valid certificate is expected. With `insecure` mode"
            " a self signed certificate is allowed."
            " With `none` then SSL is not used."
        ),
    )
    transcribe_subparser.add_argument(
        "--buffer-size",
        default=512,
        type=int,
        help=(
            "Maximum number of messages to send before waiting for "
            "acknowledgements from the server."
        ),
    )
    transcribe_subparser.add_argument(
        "--debug",
        default=False,
        action="store_true",
        help=(
            "Prints useful symbols to represent the messages on the wire. "
            "Symbols are printed to STDERR, use only when STDOUT is "
            "redirected to a file."
        ),
    )
    transcribe_subparser.add_argument(
        "--url",
        type=str,
        required=True,
        help="Websockets URL (e.g. wss://192.168.8.12:9000/)",
    )

    # Transcription options.
    transcribe_subparser.add_argument(
        "--lang", type=str, default="en",
        help="Language (ISO code, e.g. en, fr, de)"
    )
    transcribe_subparser.add_argument(
        "--output-locale",
        metavar="LOCALE",
        type=str,
        default=None,
        help="Locale of the output of transcripts. eg. en-US",
    )
    transcribe_subparser.add_argument(
        "--additional-vocab",
        nargs="*",
        type=additional_vocab_item,
        help=(
            "Space separated list of additional vocab. Expected format: "
            "<content (required)>:<sounds like (optional)>,<anymore sounds "
            "like> Simple vocab list example: 'Speechmatics gnocchi'. "
            "Vocab list with sounds like example: 'gnocchi:nokey,nochi'."
        ),
    )
    transcribe_subparser.add_argument(
        "--additional-vocab-file",
        metavar="VOCAB_FILEPATH",
        type=str,
        help="File with additional vocab in JSON format",
    )
    transcribe_subparser.add_argument(
        "--enable-partials",
        default=False,
        action="store_true"
    )
    transcribe_subparser.add_argument(
        "--punctuation-permitted-marks",
        type=str,
        default=None,
        help=(
            "Space separated list of permitted punctuation marks for advanced "
            "punctuation."
        ),
    )
    transcribe_subparser.add_argument(
        "--punctuation-sensitivity",
        type=float,
        help="Sensitivity level for advanced punctuation.",
    )
    transcribe_subparser.add_argument(
        "--diarization",
        choices=["none", "speaker_change"],
        help="Which type of diarization to use.",
    )
    transcribe_subparser.add_argument(
        "--speaker-change-sensitivity",
        type=float,
        help="Sensitivity level for speaker change.",
    )
    transcribe_subparser.add_argument(
        "--speaker-change-token",
        default=False,
        action="store_true",
        help="Shows a <sc> token where a speaker change was detected.",
    )
    transcribe_subparser.add_argument("--max-delay", type=float)

    # Audio options.
    transcribe_subparser.add_argument(
        "--raw",
        metavar="ENCODING",
        type=str,
        help=(
            "Indicate that the input audio is raw, provide the encoding of "
            "this raw audio, eg. pcm_f32le"
        ),
    )
    transcribe_subparser.add_argument(
        "--sample-rate", type=int, default=44_100)
    transcribe_subparser.add_argument(
        "--chunk-size", type=int, default=1024*4)

    transcribe_subparser.add_argument(
        "--n-best-limit",
        type=int,
        default=None,
        # The sentences are separated by a space so that the rendered help
        # does not run "disabled." and "Be aware" together.
        help=(
            "Upper bound on the number of N-best alternatives to return for "
            "each final. If not specified, N-best output is disabled. "
            "Be aware that this option is not supported for all Speechmatics"
            " products."
        ),
    )
    transcribe_subparser.add_argument(
        "files", metavar="FILEPATHS", type=str, nargs="+",
        help="File(s) to process"
    )

    return parser.parse_args(args=args)