Developing a Dual-Mode WeChat Bot with Voice Recognition and NLP Integration
Project Overview
The system architecture relies on three distinct phases: capturing the incoming message, converting voice data into text where applicable, and generating intelligent responses via an external NLP service.
- FFmpeg: Required for audio format manipulation. Download the binary package and add the `bin` directory to your system PATH environment variable.
- Python Libraries: Install the following packages using pip to handle audio parsing and speech recognition:
pip install pydub
pip install SpeechRecognition
- Text Messages: Handled immediately through standard string processing.
- Voice/Recording Messages: Require a multi-step pipeline. The raw media file must be downloaded from the user's session, converted to a compatible format for speech-to-text engines, and processed before a reply can be generated.
While various speech-recognition APIs exist, some face regional restrictions or authentication complexities. For this implementation, the Microsoft Azure Bing Speech service was selected because its REST API is well supported by Python wrapper libraries. The workflow involves:
- Extracting the temporary audio file using `itchat`'s download mechanism.
- Converting the MP3 file to WAV using `AudioSegment`.
- Passing the WAV stream to the `SpeechRecognition` engine configured with the Bing provider.
- Returning the transcribed text to the conversation loop.
# -*- coding: utf-8 -*-
import os
import json
import requests
import itchat
from itchat.content import TEXT, RECORDING
from speech_recognition import Recognizer, AudioFile, UnknownValueError, RequestError
from pydub import AudioSegment
from datetime import datetime
# Configuration Constants
# API keys are taken from the environment when set, so real credentials do not
# have to be written into the source; the placeholders remain the fallback.
BOT_API_KEY = os.environ.get("TURING_API_KEY", "YOUR_TURING_API_KEY_HERE")
BING_SPEECH_KEY = os.environ.get("BING_SPEECH_KEY", "YOUR_BING_API_KEY_HERE")
# Fallback reply text for the bot
DEFAULT_RESPONSE = "Received your message."
def init_wechat_client():
    """Log in to WeChat through itchat with the hotReload option turned on."""
    itchat.auto_login(hotReload=True)
def fetch_nlp_response(user_input):
    """Send text to the NLP service and return its reply text.

    Args:
        user_input: The message text to forward to the service.

    Returns:
        The ``text`` field of the service's JSON reply, or ``None`` when
        ``user_input`` is empty, the request fails, the reply is not valid
        JSON, or the reply is not a JSON object.
    """
    if not user_input:
        return None

    api_url = "http://www.tuling123.com/openapi/api"
    payload = {
        'key': BOT_API_KEY,
        'info': user_input,
        'userid': 'bot_user_id_01',
    }
    try:
        # A timeout keeps a stalled service from blocking the message loop.
        reply = requests.post(api_url, data=payload, timeout=10).json()
    except (requests.RequestException, ValueError) as exc:
        # Network/HTTP failures and undecodable JSON are reported, not hidden.
        print(f"NLP request failed: {exc}")
        return None

    # Valid JSON is not necessarily an object; only a dict has a 'text' field.
    if not isinstance(reply, dict):
        return None
    return reply.get('text')
def save_temp_audio(message_dict, extension=".wav"):
    """Download a voice attachment and convert it to WAV.

    Args:
        message_dict: The itchat recording message. ``message_dict['FileName']``
            is the name the attachment is saved under, and
            ``message_dict['Text']`` is called with that name to perform the
            download (itchat's documented attachment-download callable).
        extension: Currently unused; kept so existing callers that pass it
            keep working. The output is always written to ``tmp_audio.wav``.

    Returns:
        The path of the converted WAV file.
    """
    filename = message_dict['FileName']

    # Actually fetch the attachment to disk; without this step there is no
    # file to convert.
    message_dict['Text'](filename)

    # Convert mp3 to wav for ASR compatibility. from_mp3 decodes as MP3
    # regardless of the file name, so the downloaded file is read as-is.
    wav_path = "tmp_audio.wav"
    song = AudioSegment.from_mp3(filename)
    song.export(wav_path, format='wav')

    # The downloaded source file is left in place; removing it is the
    # caller's responsibility.
    return wav_path
def recognize_speech(audio_file_path):
    """Convert an audio file to text using the Bing speech API.

    Args:
        audio_file_path: Path of the audio file to transcribe.

    Returns:
        The transcribed text, or ``None`` when the speech could not be
        understood (``UnknownValueError``) or the API request failed
        (``RequestError``).
    """
    recognizer = Recognizer()
    # Read the whole file into memory, then release the file handle before
    # making the network call.
    with AudioFile(audio_file_path) as source:
        audio_data = recognizer.record(source)

    try:
        # Language is set to Chinese (can be changed to en-US). The audio is
        # passed positionally: the parameter is named `audio_data`, so an
        # `audio=` keyword would raise TypeError.
        return recognizer.recognize_bing(
            audio_data,
            key=BING_SPEECH_KEY,
            language="zh-CN",
        )
    except UnknownValueError:
        return None
    except RequestError as e:
        print(f"API Error: {e}")
        return None
def cleanup_files(file_paths):
    """Remove temporary files used for processing.

    Args:
        file_paths: Iterable of file paths to delete. Paths that do not
            exist are skipped silently.
    """
    for path in file_paths:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already gone (never created, or removed in the meantime);
            # attempting the removal directly avoids a check-then-delete race.
            continue
@itchat.msg_register(TEXT)
def handle_text_message(msg):
    """Answer a plain-text message with the NLP service's reply.

    DEFAULT_RESPONSE is returned instead when the service gives back
    nothing usable (``None`` or an empty value).
    """
    nlp_answer = fetch_nlp_response(msg['Text'])
    if nlp_answer:
        return nlp_answer
    return DEFAULT_RESPONSE
@itchat.msg_register(RECORDING)
def handle_voice_message(msg):
    """Process an incoming voice recording and build a reply.

    The recording is converted to WAV, transcribed, and the transcript is
    sent to the NLP service.

    Args:
        msg: The itchat recording message; ``msg['FileName']`` names the
            original audio file.

    Returns:
        The NLP reply for the transcript, ``DEFAULT_RESPONSE`` when the NLP
        service gives no reply, or ``"Could not process voice."`` when no
        text could be recognized.
    """
    # Resolved before the try block so the cleanup in `finally` never refers
    # to an unassigned name and mask the real error.
    original_filename = msg['FileName']
    # Every file registered here is removed in `finally`, whatever happens.
    temp_files = [original_filename]
    try:
        # Perform conversion first to get a clean WAV file
        wav_output = save_temp_audio(msg)
        temp_files.append(wav_output)

        # Recognize text from converted audio
        recognized_text = recognize_speech(wav_output)
        if not recognized_text:
            return "Could not process voice."

        # Same fallback as the text handler when the NLP service is silent.
        return fetch_nlp_response(recognized_text) or DEFAULT_RESPONSE
    finally:
        # Ensure temporary files are deleted to prevent disk clutter
        cleanup_files(temp_files)
if __name__ == "__main__":
    # Script entry point: log in, announce, then hand control to itchat.
    init_wechat_client()
    print("Bot is running. Scanning QR code...")
    itchat.run()