From 44bc308cdcb0da8d84fccaada79c82ee853de32f Mon Sep 17 00:00:00 2001
From: Roman Bazalevskiy <rvb@rvb.name>
Date: Fri, 3 Nov 2023 08:56:00 +0300
Subject: [PATCH] =?utf8?q?=D0=9F=D0=B5=D1=80=D0=B5=D0=B4=D0=B5=D0=BB=D0=B0?=
 =?utf8?q?=D0=BD=D0=BE=20=D0=BF=D0=BE=D0=B4=20=D1=81=D0=B2=D0=B5=D0=B6?=
 =?utf8?q?=D0=B8=D0=B9=20Python=20=D0=B8=20Vosk?=
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

---
 voicecontrol | 125 ++++++++++++++++++++++-----------------------------
 1 file changed, 53 insertions(+), 72 deletions(-)

diff --git a/voicecontrol b/voicecontrol
index 7216fef..6b4f17b 100755
--- a/voicecontrol
+++ b/voicecontrol
@@ -17,6 +17,8 @@ from time import time, sleep
 
 import webrtcvad
 
+from pprint import pprint
+
 @contextmanager
 def _pyaudio() -> Generator[PyAudio, None, None]:
     p = PyAudio()
@@ -57,12 +59,6 @@ def SkipSource(source,seconds):
   except:
     pass
 
-def Silence(speaker, seconds):
-  buf = bytes(speaker._frames_per_buffer)
-  bufs = int((seconds)*speaker._rate/speaker._frames_per_buffer)
-  for i in range(bufs):
-    speaker.write(buf)
-
 def PlayBack(pyaud, text, mic = None):
   global config, last_time
   
@@ -84,7 +80,7 @@ def PlayBack(pyaud, text, mic = None):
       decoder = MP3Decoder(req)
 
       speaker = pyaud.open(output=True, format=paInt16, channels=decoder.num_channels, rate=decoder.sample_rate)
-      Silence(speaker, 0.3) 
+      pprint(speaker)
 
       for chunk in decoder:
         speaker.write(chunk)
@@ -97,7 +93,7 @@ def PlayBack(pyaud, text, mic = None):
       last_time = time()
 
       if mic:
-        SkipSource(mic, elapsed + 0.5)
+        SkipSource(mic, elapsed + 0.2)
 
       return elapsed
 
@@ -105,7 +101,7 @@ def PlayBack(pyaud, text, mic = None):
       raise
 
     except:
-      pass
+      raise
 
   else:
     return 0
@@ -198,49 +194,38 @@ async def ListenPhrase(mic, server):
   sz = int(mic._rate*frame)
   sp = int(pause/frame)
 
-  try:
-
-    phrase = ""
-    voice = False
+  phrase = ""
+  voice = False
 
-    while not phrase:
-      data = mic.read(sz)
-      if len(data) == 0:
-        break
-      vd = vad.is_speech(data, mic._rate)
-      if vd and not voice:
-        voice = True
-        if config["debug"]:
-          print("+", end="")
+  while not phrase:
+    data = mic.read(sz)
+    if len(data) == 0:
+      break
+    vd = vad.is_speech(data, mic._rate)
+    if vd and not voice:
+      voice = True
+      if config["debug"]:
+        print("+", end="")
+      cnt = 0
+    if voice and not vd:
+      cnt = cnt + 1
+      if cnt > sp:
         cnt = 0
-      if voice and not vd:
-        cnt = cnt + 1
-        if cnt > sp:
-          cnt = 0
-          voice = False
-          if config["debug"]:
-            print("-")
-      if voice:
-        print("*",end="")
-        await server.send(data)
-        datatxt = await server.recv()
-        data = json.loads(datatxt)
-        try:
-          phrase = data["text"]
-          confidence = min(map(lambda x: x["conf"], data["result"]))
-        except:
-          pass  
+        voice = False
+        if config["debug"]:
+          print("-")
+    if voice:
+      print("*",end="")
+      await server.send(data)
+      datatxt = await server.recv()
+      data = json.loads(datatxt)
+      try:
+        phrase = data["text"]
+      except:
+        pass  
   
-    last_time = time()
-
-    return phrase, confidence
-
-  except KeyboardInterrupt:
-    raise
-  except websockets.exceptions.ConnectionClosedError:
-    raise  
-  except:
-    return '',0
+  last_time = time()
+  return phrase
 
 
 async def main_loop(uri):
@@ -248,13 +233,13 @@ async def main_loop(uri):
   global config, last_time
 
   keyphrase = config["keyphrase"]
-  confidence_treshold = config["confidence_treshold"]
   rec_attempts = config["rec_attempts"]
   commands = config["commands"]
 
   
   with ExitStack() as audio_stack:
     p = audio_stack.enter_context(_pyaudio())
+
     s = audio_stack.enter_context(_pyaudio_open_stream(p,
             format = paInt16, 
             channels = 1,
@@ -274,28 +259,28 @@ async def main_loop(uri):
 
           ws = await web_stack.enter_async_context(_polite_websocket(ws))
           while True:
-            phrase, confidence = await ListenPhrase(s, ws)
+            phrase = await ListenPhrase(s, ws)
             if config["debug"]:
-              print(phrase,confidence)
-            if phrase == keyphrase and confidence>=confidence_treshold :
-              PlayBack(p, "Ð¯ Ð¶Ð´Ñ ÐºÐ¾Ð¼Ð°Ð½Ð´Ñ", mic=s)
+              print(phrase)
+            if phrase == keyphrase :
+              print("COMMAND!")
+              PlayBack(p, "Ð¡Ð»ÑÑÐ°Ñ!", mic=s)
               command = ""
   
               for i in range(rec_attempts):
-                phrase, confidence = await ListenPhrase(s, ws)
+                phrase = await ListenPhrase(s, ws)
                 if config["debug"]:
-                  print(phrase,confidence)
-                if confidence > confidence_treshold:
-                  if (not commands) or (phrase in commands):
-                    if config["debug"]:
-                      print("Command: ", phrase)
-                    command = phrase
-                    RunCommand(command, p, s)
-                    break
-                  else:
-                    PlayBack(p, "ÐÐµ Ð·Ð½Ð°Ñ ÑÐ°ÐºÐ¾Ð¹ ÐºÐ¾Ð¼Ð°Ð½Ð´Ñ: "+phrase, mic=s)
+                  print(phrase)
+                if (not commands) or (phrase in commands):
+                  if config["debug"]:
+                    print("Command: ", phrase)
+                  command = phrase
+                  RunCommand(command, p, s)
+                  break
                 else:
-                  PlayBack(p, "ÐÐµ Ð¿Ð¾Ð½ÑÐ»Ð°, ÑÐ»Ð¸ÑÐºÐ¾Ð¼ Ð½ÐµÑÐ°Ð·Ð±Ð¾ÑÑÐ¸Ð²Ð¾", mic=s)
+                  PlayBack(p, "ÐÐµ Ð·Ð½Ð°Ñ ÑÐ°ÐºÐ¾Ð¹ ÐºÐ¾Ð¼Ð°Ð½Ð´Ñ: "+phrase, mic=s)
+              else:
+                PlayBack(p, "ÐÐµ Ð¿Ð¾Ð½ÑÐ»Ð°, ÑÐ»Ð¸ÑÐºÐ¾Ð¼ Ð½ÐµÑÐ°Ð·Ð±Ð¾ÑÑÐ¸Ð²Ð¾", mic=s)
 
               if not command:
                 PlayBack(p, "Ð¢Ð°Ðº ÐºÐ¾Ð¼Ð°Ð½Ð´Ñ Ð¸ Ð½Ðµ Ð¿Ð¾Ð½ÑÐ»Ð°...", mic=s)
@@ -321,11 +306,6 @@ def get_config(path):
   except:
     rec_attempts = 4
 
-  try:  
-    confidence_treshold = float(config['vosk']['confidence_treshold'])
-  except:
-    confidence_treshold = 0.4
-
   try:
     vosk_server = config['vosk']['server']
   except:
@@ -389,7 +369,6 @@ def get_config(path):
       "asr_server": vosk_server,
       "keyphrase": keyphrase,
       "rec_attempts": rec_attempts,
-      "confidence_treshold": confidence_treshold,
       "tts_url": tts_url,
       "tts_param": tts_param,
       "api_attempts": api_attempts,
@@ -413,6 +392,7 @@ config = get_config(conf_file)
 server = config['asr_server']
 
 vad = webrtcvad.Vad(config['vad_mode'])
+last_time = time()
 
 while True:
 
@@ -423,6 +403,7 @@ while True:
         main_loop(f'ws://' + server))
 
   except (Exception, KeyboardInterrupt) as e:
+    raise
     loop.run_until_complete(
       loop.shutdown_asyncgens())
     if isinstance(e, KeyboardInterrupt):
-- 
2.34.1