文章目录
*
– 在线语音识别的优势
– 一,语音识别流程图
– 二,录音
– 三,词法分析
在线语音识别的优势
结合语义分析的在线语音识别具有识别准确、灵活性高的特点,但其处理速度不如离线识别。
[En]
Online speech recognition combined with semantic analysis has the characteristics of accurate recognition and high flexibility, but its processing speed is not as fast as offline recognition.
一,语音识别流程图
与离线识别不同的是,在esp32被唤醒后,会进行录音,录音结束后将音频发送到云端进行语音识别,并将返回的文本结果进行词法分析,得到特征值,根据特征值,执行相应的命令。
二,录音
esp32被唤醒后就开始录音,通过VAD音量检测,判断用户是否在讲话,若讲话停止则停止录音(或到达录音最大时间),并将数据通过http客户端发送到百度云语音识别接口。
这里展示主要的代码,录音的数据保存到recoder中:
/*
 * Wake-word / recording loop (fragment: the closing braces of the outer
 * `else` and of `while (1)` are not part of this snippet).
 *
 * While enable_wn is set, each audio chunk is passed to the wake-word
 * detector. After a wake-up, chunks are appended to `recoder` until either
 * the size limit or the silence timeout is hit, then the recording is
 * POSTed to the URL built from BAIDU_ASR_URL and the state is reset.
 */
vad_handle_t vad_inst = vad_create(VAD_MODE_4, VAD_SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);
int16_t *vad_buff = (int16_t *)malloc(VAD_BUFFER_LENGTH * sizeof(short));
if (vad_buff == NULL)
{
/* NOTE(review): the failure is only logged; execution continues and
 * vad_buff is later used as a memcpy destination. */
ESP_LOGE(TAG, "Memory allocation failed!");
}
/* index: number of chunks stored so far; timeout: consecutive non-speech
 * chunks; total_rec: number of bytes stored in recoder. */
int index = 0;
int timeout = 0;
int total_rec = 0;
while (1)
{
/* Read one chunk of audio_wn_chunksize 16-bit samples into buffer. */
raw_stream_read(raw_read, (char *)buffer, audio_wn_chunksize * sizeof(short));
if (enable_wn)
{
/* Waiting for the wake word: a return value of 1 switches to recording. */
if (wakenet->detect(model_wn_data, (int16_t *)buffer) == 1)
{
ESP_LOGI(TAG, "wake up start listening");
LED_ON;
enable_wn = false;
}
}
else
{
if (recoder != NULL)
{
/* Keep recording while there is room left and the silence counter has
 * not reached RECODER_TIMEOUT.
 * NOTE(review): 960 is presumably the byte size of one chunk kept as
 * headroom - confirm against audio_wn_chunksize. */
if (total_rec < (MAX_RECODER - 960) && timeout < RECODER_TIMEOUT)
{
/* NOTE(review): the offset is computed in bytes; this is only correct
 * if recoder is a byte pointer - its declaration is not shown here. */
memcpy(recoder + (index * audio_wn_chunksize * sizeof(short)), buffer, audio_wn_chunksize * sizeof(short));
index++;
total_rec += audio_wn_chunksize * sizeof(short);
}
else
{
/* Recording finished: upload the raw PCM data for recognition. */
LED_OFF;
ESP_LOGI(TAG, "stop listening");
memset(http_buff, 0, MAX_HTTP_LEN);
memset(url, 0, 200);
/* http_buff is handed to the event handler as user_data. */
esp_http_client_config_t config = {
.method = HTTP_METHOD_POST,
.event_handler = http_event_handle,
.user_data = (void *)http_buff,
};
/* NOTE(review): unbounded sprintf into url (cleared as 200 bytes above). */
sprintf(url, BAIDU_ASR_URL, baidu_access_token);
config.url = url;
printf("start connect to url = %s\r\n", config.url);
/* NOTE(review): neither the client handle nor the result of
 * esp_http_client_perform() is checked. */
esp_http_client_handle_t client = esp_http_client_init(&config);
esp_http_client_set_header(client, "Content-Type", "audio/pcm;rate=16000");
esp_http_client_set_post_field(client, (const char *)recoder, total_rec);
ESP_LOGI(TAG, "start trasnlate");
esp_http_client_perform(client);
esp_http_client_close(client);
esp_http_client_cleanup(client);
/* Release the recording and return to wake-word detection. */
free(recoder);
recoder = NULL;
index = 0;
total_rec = 0;
timeout = 0;
enable_wn = true;
}
}
else{
/* First pass after wake-up: only allocate the recording buffer; the
 * current chunk is not stored. If malloc fails, recoder stays NULL and
 * the allocation is attempted again on the next iteration. */
recoder = malloc(MAX_RECODER);
}
/* Run voice-activity detection on the first VAD_BUFFER_LENGTH samples of
 * the chunk: speech resets the silence counter, anything else bumps it. */
memcpy(vad_buff, buffer, VAD_BUFFER_LENGTH * sizeof(short));
vad_state_t vad_state = vad_process(vad_inst, vad_buff);
if (vad_state == VAD_SPEECH)
{
timeout = 0;
}
else
{
timeout++;
}
三,词法分析
接收到语音识别返回的文本后,还需要对文本进行词法分析,解析出文本中包含的指令。这个功能使用的是百度的词法分析定制版。具体逻辑是:首先我们确定一个词汇集,如:{打开,开启,启动},并将该词汇集命名为“OPEN”,再将{空调,格力空调}命名为“AC”。
例如,用户输入“打开空调”,词法分析就会得到以下结果:
{
"log_id": 4870567568319578302,
"items": [
{
"loc_details": [
],
"byte_offset": 0,
"uri": "",
"ne": "OPEN",
"basic_words": [
"打开"
],
"item": "打开",
"pos": "",
"byte_length": 6,
"formal": ""
},
{
"loc_details": [
],
"byte_offset": 6,
"uri": "",
"ne": "AC",
"basic_words": [
"空调"
],
"item": "空调",
"pos": "",
"byte_length": 6,
"formal": ""
}
],
"text": "打开空调"
}
我们读取“ne”键中的内容,就能判断用户的意图。以下代码请求词法分析:
int Etymology_Analysis()
{
cJSON *root = cJSON_Parse(http_buff);
if(root==NULL)
{
ESP_LOGI(TAG,"cjson parse error");
return 0;
}
cJSON *item=cJSON_GetObjectItem(root, "err_no");
if(item->valueint!=0)
{
ESP_LOGI(TAG,"translate error,err_no:%d",item->valueint);
cJSON_Delete(root);
return 0;
}
item = cJSON_GetObjectItem(root, "result");
item = cJSON_GetArrayItem(item,0);
char *result = cJSON_GetStringValue(item);
char *post_data = malloc(POST_DATA_LEN);
snprintf(post_data, POST_DATA_LEN, "{\"text\":\"%s\"}", result);
ESP_LOGI(TAG, "POST DATA:%s", post_data);
memset(http_buff, 0, MAX_HTTP_LEN);
memset(url, 0, 200);
esp_http_client_config_t config={
.method=HTTP_METHOD_POST,
.event_handler=http_event_handle,
.user_data = (void *)http_buff,
};
sprintf(url, BAIDU_ETY_URL, baidu_access_token);
config.url = url;
esp_http_client_handle_t client = esp_http_client_init(&config);
esp_http_client_set_header(client, "Content-Type", "application/json");
esp_http_client_set_post_field(client,(const char*)post_data,strlen(post_data));
printf("start connect to url = %s\r\n",config.url);
esp_http_client_perform(client);
int con_len = esp_http_client_get_content_length(client);
ESP_LOGI(TAG, "Status = %d, content_length = %d", esp_http_client_get_status_code(client), con_len);
esp_http_client_close(client);
esp_http_client_cleanup(client);
cJSON_Delete(root);
free(post_data);
return 1;
}
在获取到以上的JSON数据后,接下来就是提取“ne”中的内容,用下面的数据结构来辅助解析:
/* One analysed word of the recognised sentence. */
typedef struct
{
/* Category assigned by parse_items() from the item's "pos" or "ne" tag. */
enum Lexical lexical;
/* Word text; parse_items() fills it only for "m" items, using the first
 * entry of "basic_words". build_order() converts it with atoi(). */
char text[10];
} Ety_Element;
/* Result table: cleared and filled by parse_items(), read by build_order(). */
static Ety_Element ety_eles[10] = {0};
/* A voice command assembled by build_order() from the ety_eles table. */
typedef struct
{
/* Numeric argument: atoi() of a Num/Mount word, or 0/1/2 for
 * Today/Tomorrow/Aftermotorrow. */
int number;
/* Target of the command (obj_Ac, obj_Bt, obj_Weather; obj_other by default). */
enum Object object;
/* Requested action (AC_OPTION_OPEN/CLOSE/UP/DOWN; AC_OPTION_MAX by default). */
enum AC_Option option;
} Audio_Order;
以下代码会解析每个词,并填充到ety_eles数组,每个单词对应一个ety_eles成员:
/**
 * Parse the lexical-analysis response in http_buff into ety_eles.
 *
 * For every entry of the "items" array the tag is taken from "pos", or
 * from "ne" when "pos" is empty or missing, and mapped to an enum Lexical
 * value by prefix comparison. ety_eles is cleared first; entry i describes
 * item i. For "m" items the first "basic_words" string is copied into the
 * entry's text (truncated to fit, always NUL-terminated).
 *
 * @return number of entries stored in ety_eles (never more than its
 *         capacity), or 0 when the response cannot be parsed or has no
 *         "items" member.
 */
int parse_items()
{
    /* Capacity is derived from the table itself so no index can run past it. */
    const int capacity = (int)(sizeof(ety_eles) / sizeof(ety_eles[0]));

    cJSON *root = cJSON_Parse(http_buff);
    if (root == NULL)
    {
        return 0;
    }
    cJSON *items = cJSON_GetObjectItem(root, "items");
    if (items == NULL)
    {
        /* The tree must be released on this path as well. */
        cJSON_Delete(root);
        return 0;
    }

    int arry_size = cJSON_GetArraySize(items);
    if (arry_size > capacity)
    {
        /* Extra words are dropped rather than written past the table. */
        arry_size = capacity;
    }
    memset(ety_eles, 0, sizeof(ety_eles));

    cJSON *item, *sub_item;
    char *character, *text;
    for (int i = 0; i < arry_size; i++)
    {
        item = cJSON_GetArrayItem(items, i);
        sub_item = cJSON_GetObjectItem(item, "pos");
        character = cJSON_GetStringValue(sub_item);
        if (character == NULL || character[0] == '\0')
        {
            /* No part-of-speech tag: fall back to the custom entity tag. */
            sub_item = cJSON_GetObjectItem(item, "ne");
            character = cJSON_GetStringValue(sub_item);
        }
        if (character == NULL)
        {
            /* Neither tag is a string: nothing to classify. */
            ety_eles[i].lexical = Other;
            continue;
        }
        printf("char = %s \r\n", character);

        /* Order matters: these are prefix matches, and the upper-case
         * entity tags are tested before the single-letter tags. */
        if (strncmp(character, "NUM", 3) == 0)
        {
            /* NOTE(review): intentionally left as in the original - the
             * entry keeps its zeroed lexical value. build_order() has a
             * `case Num`, so this branch probably should set Num and copy
             * a text; confirm which field holds the number. */
        }
        else if (strncmp(character, "AC", 2) == 0)
        {
            ety_eles[i].lexical = Aircon;
        }
        else if (strncmp(character, "BT", 2) == 0)
        {
            ety_eles[i].lexical = Bt;
        }
        else if (strncmp(character, "WEA", 3) == 0)
        {
            ety_eles[i].lexical = Weather;
        }
        else if (strncmp(character, "DOWN", 4) == 0)
        {
            ety_eles[i].lexical = Down;
        }
        else if (strncmp(character, "UP", 2) == 0)
        {
            ety_eles[i].lexical = Up;
        }
        else if (strncmp(character, "CLOSE", 5) == 0)
        {
            ety_eles[i].lexical = Close;
        }
        else if (strncmp(character, "OPEN", 4) == 0)
        {
            ety_eles[i].lexical = Open;
        }
        else if (strncmp(character, "TOMO", 4) == 0)
        {
            ety_eles[i].lexical = Tomorrow;
        }
        else if (strncmp(character, "AFTTO", 5) == 0)
        {
            ety_eles[i].lexical = Aftermotorrow;
        }
        else if (strncmp(character, "TODAY", 5) == 0)
        {
            /* Compare all five characters of the tag. */
            ety_eles[i].lexical = Today;
        }
        else if (strncmp(character, "TIME", 4) == 0)
        {
            ety_eles[i].lexical = TIME;
        }
        else if (strncmp(character, "n", 1) == 0)
        {
            ety_eles[i].lexical = Nouns;
        }
        else if (strncmp(character, "w", 1) == 0)
        {
            ety_eles[i].lexical = Word;
        }
        else if (strncmp(character, "v", 1) == 0)
        {
            ety_eles[i].lexical = Verbs;
        }
        else if (strncmp(character, "m", 1) == 0)
        {
            sub_item = cJSON_GetObjectItem(item, "basic_words");
            sub_item = cJSON_GetArrayItem(sub_item, 0);
            text = cJSON_GetStringValue(sub_item);
            ety_eles[i].lexical = Mount;
            if (text != NULL)
            {
                /* Bounded copy that always NUL-terminates; longer words
                 * are truncated to the size of the text field. */
                snprintf(ety_eles[i].text, sizeof(ety_eles[i].text), "%s", text);
            }
        }
        else if (strncmp(character, "r", 1) == 0)
        {
            ety_eles[i].lexical = Pronouns;
        }
        else
        {
            ety_eles[i].lexical = Other;
        }
    }

    cJSON_Delete(root);
    return arry_size;
}
下面,根据得到的ety_eles数组,组装成一个Audio_Order类型的命令:
/**
 * Assemble an Audio_Order from the first i entries of ety_eles.
 *
 * Entries are visited in order, so when several words set the same field
 * the last one wins. Fields no word touches keep their defaults
 * (number 0, object obj_other, option AC_OPTION_MAX).
 *
 * @param i number of valid entries in ety_eles (the value returned by
 *          parse_items()); values above the table capacity are clamped,
 *          values <= 0 yield the default order.
 * @return the assembled command.
 */
Audio_Order build_order(int i)
{
    const int capacity = (int)(sizeof(ety_eles) / sizeof(ety_eles[0]));

    Audio_Order ord={
        .number=0,
        .object=obj_other,
        .option=AC_OPTION_MAX
    };

    /* Never read past the end of the table, whatever the caller passes. */
    if (i > capacity)
    {
        i = capacity;
    }

    for (int x = 0; x < i; x++)
    {
        switch(ety_eles[x].lexical)
        {
        case Aircon:
            ord.object = obj_Ac;
            break;
        case Bt:
            ord.object = obj_Bt;
            break;
        case Weather:
            ord.object = obj_Weather;
            break;
        case Open:
            ord.option = AC_OPTION_OPEN;
            break;
        case Close:
            ord.option = AC_OPTION_CLOSE;
            break;
        case Up:
            ord.option = AC_OPTION_UP;
            break;
        case Down:
            ord.option = AC_OPTION_DOWN;
            break;
        case Num:
        case Mount:
            /* Both carry a numeric text; non-numeric text converts to 0. */
            ord.number = atoi(ety_eles[x].text);
            break;
        case Today:
            ord.number = 0;
            break;
        case Tomorrow:
            ord.number = 1;
            break;
        case Aftermotorrow:
            ord.number = 2;
            break;
        case TIME:
        default:
            /* Words that do not contribute to the command. */
            break;
        }
    }
    return ord;
}
有了Audio_Order命令,我们就能根据命令的内容作出反应。
Original: https://blog.csdn.net/weixin_44821644/article/details/115325281
Author: killer-p
Title: ESP32在线语音识别 词法解析
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/512908/
转载文章受原作者版权保护。转载请注明原作者出处!