Pārlūkot izejas kodu

解决语音的识别和转换兼容性

zwssunny 3 gadi atpakaļ
vecāks
revīzija
24de670c2c

+ 18 - 10
channel/wechat/wechat_channel.py

@@ -5,6 +5,9 @@ wechat channel
 """
 
 import os
+import requests
+import io
+import time
 from lib import itchat
 import json
 from lib.itchat.content import *
@@ -17,9 +20,7 @@ from common.tmp_dir import TmpDir
 from config import conf
 from common.time_check import time_checker
 from plugins import *
-import requests
-import io
-import time
+from voice.audio_convert import mp3_to_wav
 
 
 thread_pool = ThreadPoolExecutor(max_workers=8)
@@ -28,8 +29,7 @@ thread_pool = ThreadPoolExecutor(max_workers=8)
 def thread_pool_callback(worker):
     worker_exception = worker.exception()
     if worker_exception:
-        logger.exception(
-            "Worker return exception: {}".format(worker_exception))
+        logger.exception("Worker return exception: {}".format(worker_exception))
 
 
 @itchat.msg_register(TEXT)
@@ -247,9 +247,16 @@ class WechatChannel(Channel):
                 reply = super().build_reply_content(context.content, context)
             elif context.type == ContextType.VOICE:
                 msg = context['msg']
-                file_name = TmpDir().path() + context.content
-                msg.download(file_name)
-                reply = super().build_voice_to_text(file_name)
+                mp3_path = TmpDir().path() + context.content
+                msg.download(mp3_path)
+                # mp3转wav
+                wav_path = os.path.splitext(mp3_path)[0] + '.wav'
+                mp3_to_wav(mp3_path=mp3_path, wav_path=wav_path)
+                # 语音识别
+                reply = super().build_voice_to_text(wav_path)
+                # 删除临时文件
+                os.remove(wav_path)
+                os.remove(mp3_path)
                 if reply.type != ReplyType.ERROR and reply.type != ReplyType.INFO:
                     context.content = reply.content  # 语音转文字后,将文字内容作为新的context
                     context.type = ContextType.TEXT
@@ -263,12 +270,13 @@ class WechatChannel(Channel):
                             prefixes = conf().get('group_chat_prefix')
                             for prefix in prefixes:
                                 if context.content.startswith(prefix):
-                                    context.content = context.content.replace(prefix, '', 1).strip()
+                                    context.content = context.content.replace(
+                                        prefix, '', 1).strip()
                                     break
                         else:
                             logger.info("[WX]receive voice check prefix: " + 'False')
                             return
-                            
+
                     reply = super().build_reply_content(context.content, context)
                     if reply.type == ReplyType.TEXT:
                         if conf().get('voice_reply_voice'):

+ 16 - 66
channel/wechat/wechaty_channel.py

@@ -4,25 +4,19 @@
 wechaty channel
 Python Wechaty - https://github.com/wechaty/python-wechaty
 """
-import io
 import os
-import json
 import time
 import asyncio
-import requests
-import pysilk
-import wave
-from pydub import AudioSegment
 from typing import Optional, Union
 from bridge.context import Context, ContextType
 from wechaty_puppet import MessageType, FileBox, ScanStatus  # type: ignore
 from wechaty import Wechaty, Contact
-from wechaty.user import Message, Room, MiniProgram, UrlLink
+from wechaty.user import Message, MiniProgram, UrlLink
 from channel.channel import Channel
 from common.log import logger
 from common.tmp_dir import TmpDir
 from config import conf
-
+from voice.audio_convert import sil_to_wav, mp3_to_sil
 
 class WechatyChannel(Channel):
 
@@ -50,8 +44,8 @@ class WechatyChannel(Channel):
 
     async def on_scan(self, status: ScanStatus, qr_code: Optional[str] = None,
                       data: Optional[str] = None):
-        contact = self.Contact.load(self.contact_id)
-        logger.info('[WX] scan user={}, scan status={}, scan qr_code={}'.format(contact, status.name, qr_code))
+        # contact = self.Contact.load(self.contact_id)
+        # logger.info('[WX] scan user={}, scan status={}, scan qr_code={}'.format(contact, status.name, qr_code))
         # print(f'user <{contact}> scan status: {status.name} , 'f'qr_code: {qr_code}')
 
     async def on_message(self, msg: Message):
@@ -67,7 +61,7 @@ class WechatyChannel(Channel):
         content = msg.text()
         mention_content = await msg.mention_text()  # 返回过滤掉@name后的消息
         match_prefix = self.check_prefix(content, conf().get('single_chat_prefix'))
-        conversation: Union[Room, Contact] = from_contact if room is None else room
+        # conversation: Union[Room, Contact] = from_contact if room is None else room
 
         if room is None and msg.type() == MessageType.MESSAGE_TYPE_TEXT:
             if not msg.is_self() and match_prefix is not None:
@@ -102,21 +96,8 @@ class WechatyChannel(Channel):
                 await voice_file.to_file(silk_file)
                 logger.info("[WX]receive voice file: " + silk_file)
                 # 将文件转成wav格式音频
-                wav_file = silk_file.replace(".slk", ".wav")
-                with open(silk_file, 'rb') as f:
-                    silk_data = f.read()
-                pcm_data = pysilk.decode(silk_data)
-
-                with wave.open(wav_file, 'wb') as wav_data:
-                    wav_data.setnchannels(1)
-                    wav_data.setsampwidth(2)
-                    wav_data.setframerate(24000)
-                    wav_data.writeframes(pcm_data)
-                if os.path.exists(wav_file): 
-                    converter_state = "true" # 转换wav成功
-                else:
-                    converter_state = "false" # 转换wav失败
-                logger.info("[WX]receive voice converter: " + converter_state)
+                wav_file = os.path.splitext(silk_file)[0] + '.wav'
+                sil_to_wav(silk_file, wav_file)
                 # 语音识别为文本
                 query = super().build_voice_to_text(wav_file).content
                 # 校验关键字
@@ -183,21 +164,8 @@ class WechatyChannel(Channel):
                 await voice_file.to_file(silk_file)
                 logger.info("[WX]receive voice file: " + silk_file)
                 # 将文件转成wav格式音频
-                wav_file = silk_file.replace(".slk", ".wav")
-                with open(silk_file, 'rb') as f:
-                    silk_data = f.read()
-                pcm_data = pysilk.decode(silk_data)
-
-                with wave.open(wav_file, 'wb') as wav_data:
-                    wav_data.setnchannels(1)
-                    wav_data.setsampwidth(2)
-                    wav_data.setframerate(24000)
-                    wav_data.writeframes(pcm_data)
-                if os.path.exists(wav_file): 
-                    converter_state = "true" # 转换wav成功
-                else:
-                    converter_state = "false" # 转换wav失败
-                logger.info("[WX]receive voice converter: " + converter_state)
+                wav_file = os.path.splitext(silk_file)[0] + '.wav'
+                sil_to_wav(silk_file, wav_file)
                 # 语音识别为文本
                 query = super().build_voice_to_text(wav_file).content
                 # 校验关键字
@@ -260,21 +228,12 @@ class WechatyChannel(Channel):
             if reply_text:
                 # 转换 mp3 文件为 silk 格式
                 mp3_file = super().build_text_to_voice(reply_text).content
-                silk_file = mp3_file.replace(".mp3", ".silk")
-                # Load the MP3 file
-                audio = AudioSegment.from_file(mp3_file, format="mp3")
-                # Convert to WAV format
-                audio = audio.set_frame_rate(24000).set_channels(1)
-                wav_data = audio.raw_data
-                sample_width = audio.sample_width
-                # Encode to SILK format
-                silk_data = pysilk.encode(wav_data, 24000)
-                # Save the silk file
-                with open(silk_file, "wb") as f:
-                    f.write(silk_data)
+                silk_file = os.path.splitext(mp3_file)[0] + '.sil'
+                voiceLength = mp3_to_sil(mp3_file, silk_file)
                 # 发送语音
                 t = int(time.time())
-                file_box = FileBox.from_file(silk_file, name=str(t) + '.silk')
+                file_box = FileBox.from_file(silk_file, name=str(t) + '.sil')
+                file_box.metadata = {'voiceLength': voiceLength}
                 await self.send(file_box, reply_user_id)
                 # 清除缓存文件
                 os.remove(mp3_file)
@@ -337,21 +296,12 @@ class WechatyChannel(Channel):
             reply_text = '@' + group_user_name + ' ' + reply_text.strip()
             # 转换 mp3 文件为 silk 格式
             mp3_file = super().build_text_to_voice(reply_text).content
-            silk_file = mp3_file.replace(".mp3", ".silk")
-            # Load the MP3 file
-            audio = AudioSegment.from_file(mp3_file, format="mp3")
-            # Convert to WAV format
-            audio = audio.set_frame_rate(24000).set_channels(1)
-            wav_data = audio.raw_data
-            sample_width = audio.sample_width
-            # Encode to SILK format
-            silk_data = pysilk.encode(wav_data, 24000)
-            # Save the silk file
-            with open(silk_file, "wb") as f:
-                f.write(silk_data)
+            silk_file = os.path.splitext(mp3_file)[0] + '.sil'
+            voiceLength = mp3_to_sil(mp3_file, silk_file)
             # 发送语音
             t = int(time.time())
             file_box = FileBox.from_file(silk_file, name=str(t) + '.sil')
+            file_box.metadata = {'voiceLength': voiceLength}
             await self.send_group(file_box, group_id)
             # 清除缓存文件
             os.remove(mp3_file)

+ 3 - 7
voice/google/google_voice.py

@@ -3,17 +3,14 @@
 google voice service
 """
 
-import pathlib
-import subprocess
 import time
-from bridge.reply import Reply, ReplyType
 import speech_recognition
 import pyttsx3
 from gtts import gTTS
+from bridge.reply import Reply, ReplyType
 from common.log import logger
 from common.tmp_dir import TmpDir
 from voice.voice import Voice
-from voice.audio_convert import mp3_to_wav
 
 
 class GoogleVoice(Voice):
@@ -30,11 +27,10 @@ class GoogleVoice(Voice):
         self.engine.setProperty('voice', voices[1].id)
 
     def voiceToText(self, voice_file):
-        new_file = voice_file.replace('.mp3', '.wav')
+        # new_file = voice_file.replace('.mp3', '.wav')
         # subprocess.call('ffmpeg -i ' + voice_file +
         #                 ' -acodec pcm_s16le -ac 1 -ar 16000 ' + new_file, shell=True)
-        mp3_to_wav(voice_file, new_file)
-        with speech_recognition.AudioFile(new_file) as source:
+        with speech_recognition.AudioFile(voice_file) as source:
             audio = self.recognizer.record(source)
         try:
             text = self.recognizer.recognize_google(audio, language='zh-CN')