项目：对话系统。TTS和ASR流前端工程问题总结

项目：对话系统。TTS和ASR流前端工程问题总结

背藏玫瑰 2024-08-15 12:01:03

简介项目：对话系统。TTS和ASR流前端工程问题总结

文章目录

前言

一、网页授权媒体

二、ArrayBuffer、TypedArray 截断、拼接

前言

demo模拟线上客户通话，实现tts播报和asr语音转文本，最终将tts和asr播放的文本进行存储或下载。

一、网页授权媒体

这个之前有写过

//  方法就一个
    openMedia() {
      const that = this;
      const gotMediaStream = () => {
        that.getMedia = true;
        //这里是开始websocket的方法
        that.useWebSocket();
      };
      const handleError = (err) => {
        // console.log("navigator catch error", err);
        that.$message.error("未获录音权限");
      };
      if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
        // console.log("getusermedia is not supported!");
        that.$message.error("无录音权限");
      } else {
        const constrants = {
          audio: {
            volume: { min: 0.0, max: 1.0 },
            noiseSuppression: false,
            echoCancellation: false,
          },
        };
        navigator.mediaDevices
          .getUserMedia(constrants)
          .then(gotMediaStream)
          .catch(handleError);
      }
    },
    // 初始化的websocket
    useWebSocket() {
      const that = this;
      asrWs = new WebSocket(process.env.VUE_APP_WS_URL);
      asrWs.binaryType = "arraybuffer"; // 传输的是 ArrayBuffer 类型的数据
      asrWs.onopen = function () {
        if (asrWs.readyState === 1) {
          // 开启体验和心跳 但是未开始录音
          // that.experienceStatus = true;
          that.heartStart();
        }
      };
      asrWs.onmessage = function (msg) {
       //这里就是相关的业务逻辑了
        that.heartStart();
      };
      asrWs.onclose = function (err) {
        console.log(err);
      };

      asrWs.onerror = function (err) {
        that.heartStart();
      };
    },

修改了一些配置，这个配置解决录音音量较小的问题。

        // Revised request: plain `audio: true` (browser-default processing).
        // Per the surrounding article, this replaced the explicit constraints
        // to fix recordings that were too quiet.
        navigator.mediaDevices
          .getUserMedia({audio:true })
          .then(gotMediaStream)
          .catch(handleError);

二、ArrayBuffer、TypedArray 截断、拼接

首先介绍 ArrayBuffer 与类型化数组（TypedArray）。类型化数组是 JavaScript 操作二进制数据的接口，最初是为了满足 JavaScript 与显卡之间大量、实时的数据交换而诞生的：这类数据通信必须采用二进制格式，而不能是传统的文本格式。

ArrayBuffer

const bf = new ArrayBuffer(40); // allocate a 40-byte memory region
// byteLength reports how many bytes were allocated
console.log(bf.byteLength); // 40
/*
    Note: a very large allocation can fail because not enough contiguous
    memory is available. The constructor reports this by throwing (typically
    a RangeError), so wrap large allocations in try/catch instead of assuming
    they succeed.
*/

ArrayBuffer对象有一个slice方法，允许将内存区域的一部分，拷贝生成一个新的ArrayBuffer对象。

// Copy the first 10 bytes of a 40-byte buffer into a brand-new ArrayBuffer.
const bf = new ArrayBuffer(40);
const newBf = bf.slice(0, 10); // copies bytes 0-9; the end index (10) is exclusive

上面代码拷贝buffer对象的前10个字节，生成一个新的ArrayBuffer对象。slice方法其实包含两步，第一步是先分配一段新内存，第二步是将原来那个ArrayBuffer对象拷贝过去。
slice方法接受两个参数，第一个参数表示拷贝开始的字节序号，第二个参数表示拷贝截止的字节序号。如果省略第二个参数，则默认到原ArrayBuffer对象的结尾。
除了slice方法，ArrayBuffer对象不提供任何直接读写内存的方法，只允许在其上方建立视图，然后通过视图读写。

TypedArray

Int8Array：8位有符号整数，长度1个字节。
Uint8Array：8位无符号整数，长度1个字节。
Int16Array：16位有符号整数，长度2个字节。
Uint16Array：16位无符号整数，长度2个字节。
Int32Array：32位有符号整数，长度4个字节。
Uint32Array：32位无符号整数，长度4个字节。
Float32Array：32位浮点数，长度4个字节。
Float64Array：64位浮点数，长度8个字节。

我的需求是将多个ArrayBuffer完成拼接

三、ASR/TTS是什么？

ASR：语音转文本，将语音转成对应的文本，然后做其他处理。

有http方式和websocket方式。

http方式

http方式传送的还是 file（File 属于 Blob），使用 FormData 上传。

/**
 * Upload an audio file for the current chat as multipart form data.
 *
 * @param {Blob|File} data - Audio payload sent as the "file" field.
 * @returns {*} Whatever the shared `request` helper returns for this POST.
 */
export function updataBlob(data) {
  // Form fields: the audio itself plus the chat id taken from the store.
  const payload = new FormData();
  payload.append("file", data);
  payload.append("chatId", store.getters.chatId);

  const requestConfig = {
    url: "model/audio/upload",
    method: "post",
    data: payload,
    headers: {
      "Content-Type": "multipart/form-data",
    },
  };
  return request(requestConfig);
}

这种方式需要考虑音频是 PCM 还是 WAV 格式的问题，以及后端解析时是否需要加文件头。

websocket这种方式

这种方式实时传送音频，对音频进行了切片。之前文章中也有代码。
https://mp.csdn.net/mp_blog/creation/editor/124618257

TTS是什么

tts是将文本转成语音。
涉及语音就涉及播放。

（1）如果是一次性返回pcm格式文件，需要播放就需要加上头，wav格式。播放TTS返回的文件

添加依赖

"wav-headers": "^1.0.1"

var getFileHeaders = require("wav-headers");

添加wav文件头

    generateWav(buffer) {
      var options = {
        channels: 1,
        sampleRate: 16000,
        bitDepth: 16,
        dataLength: buffer.length,
      };
      var headersBuffer = getFileHeaders(options);

      var temp = new Uint8Array(buffer.byteLength + headersBuffer.byteLength);
      temp.set(new Uint8Array(headersBuffer), 0);
      temp.set(new Uint8Array(buffer), headersBuffer.byteLength);
      return temp;
    },

上传文件时使用 new Blob([this.PCMList], { type: "audio/wav" })；注意要加 [ ]，因为 this.PCMList 本身就是一个 TypedArray。

下面的生成wav文件下载。

    upAudioOne() {
      store.commit("audio/SET_AUDIO_CUT_SIZE", -1);
      this.PCMList = this.generateWav(this.PCMList);
      const blob = new Blob([this.PCMList], { type: "audio/wav" });
      updataBlob(blob);

      // let blobUrl = window.URL.createObjectURL(blob);
      // let link = document.createElement('a')
      // link.style.display = 'none'
      // link.href = blobUrl
      // link.download = 'test' + '.wav'
      // document.body.appendChild(link)
      // link.click()

    },

这个是生成wav文件直接下载或播放

      // A plain object is treated as a message payload: show its message as
      // a warning. Anything else is handled as audio data.
      if (Object.prototype.toString.call(data) === "[object Object]") {
        this.$message({
          message: data.message,
          type: "warning",
          duration: 3000,
        });
      } else if (type) {
        // Playback path: expose the data through an object URL and lower the
        // volume of the referenced audio element.
        this.cunrentAudioUrl = URL.createObjectURL(data);
        this.$refs.audio.volume = 0.1;
      } else {
        // Download path: read the data as a data URL and click a temporary link.
        const fileReader = new FileReader();
        fileReader.onload = (loadEvent) => {
          const link = document.createElement("a");
          link.download = item.id + ".wav";
          link.href = loadEvent.target.result;
          document.body.appendChild(link);
          link.click();
          document.body.removeChild(link);
        };
        fileReader.readAsDataURL(data);
      }

(2) 如果返回的是流式文件就需要将文件合成拼接在一起 记录TTS返回音频

    concatenate(resultConstructor, ...arrays) {
      let totalLength = 0;
      let startIndx = 0;
      if (!arrays[0].length) {
        this.PCMListAllIndex = [];
      } else {
        startIndx = arrays[0].length;
      }
      for (let arr of arrays) {
        totalLength += arr.length;
      }
      let result = new resultConstructor(totalLength);
      let offset = 0;
      for (let arr of arrays) {
        result.set(arr, offset);
        offset += arr.length;
      }
      this.PCMListAllIndex.push({ start: startIndx, end: totalLength });
      return result;
    },

使用方法合成

// View the incoming binary frame as bytes.
var dataAudio = new Uint8Array(msg.data);

// While the TTS flag is set, append this frame to the accumulated PCM data.
if (that.ttsSart) {
  that.PCMList = that.concatenate(Uint8Array, that.PCMList, dataAudio);
}

TTS流式播放

返回的是 ArrayBuffer。这里参考了一个第三方的 PCM 播放器实现，并做了修改。

/**
 * Streaming PCM player built on the Web Audio API.
 *
 * Each chunk passed to `feed()` is converted to Float32 samples, wrapped in an
 * AudioBuffer and scheduled directly after the previously scheduled chunk.
 *
 * Supported options (merged over the defaults in `init`):
 *   - inputCodec: "Int8" | "Int16" | "Int32" | "Float32" (default "Int16")
 *   - channels: number of interleaved channels (default 1)
 *   - sampleRate: sample rate in Hz (default 8000)
 *   - flushTime: accepted for compatibility; the periodic flush timer is
 *     disabled, so the value is currently unused
 *   - onended(sourceNode, event): called when a scheduled chunk finishes
 *   - firstPlay(player, timestampMs): called when the first chunk since the
 *     last counter reset is scheduled
 *   - onstatechange(audioCtx, event, state): only bound when the caller
 *     invokes `bindAudioContextEvent()`
 */
class PCMPlayer {
  constructor(option) {
    this.init(option);
  }

  /** Merge options with defaults and create the audio graph. */
  init(option) {
    const defaultOption = {
      inputCodec: "Int16", // sample encoding of the incoming data
      channels: 1, // channel count
      sampleRate: 8000, // Hz
      flushTime: 3000, // ms; unused while the flush timer is disabled
    };

    this.option = Object.assign({}, defaultOption, option);
    this.samples = new Float32Array(); // samples waiting to be scheduled
    this.audioCtx = null;
    // Most recently scheduled source node. Kept per instance so that closing
    // one player can never stop a node that belongs to another player.
    this.bufferSource = null;
    this.convertValue = this.getConvertValue();
    this.typedArray = this.getTypedArray();
    this.initAudioContext();
    this.sourcePlayLen = 0;
  }

  /** Reset the scheduled-chunk counter and drop any pending samples. */
  sourceLenInit() {
    this.sourcePlayLen = 0;
    this.samples = new Float32Array();
  }

  /** Stop the most recently scheduled chunk immediately and destroy the player. */
  ttsClose() {
    this.sourcePlayLen = 0;
    if (this.bufferSource) {
      this.bufferSource.stop(0); // stop right away
    }

    this.destroy();
  }

  /**
   * Divisor that maps the configured integer codec onto the [-1, 1] range.
   * @throws {Error} If `option.inputCodec` is not a supported codec.
   */
  getConvertValue() {
    const inputCodecs = {
      Int8: 128,
      Int16: 32768,
      Int32: 2147483648,
      Float32: 1,
    };
    // Own-property check so inherited keys such as "toString" are rejected.
    if (!Object.prototype.hasOwnProperty.call(inputCodecs, this.option.inputCodec))
      throw new Error(
        "wrong codec.please input one of these codecs:Int8,Int16,Int32,Float32"
      );
    return inputCodecs[this.option.inputCodec];
  }

  /**
   * Typed array constructor used to reinterpret incoming bytes for the
   * configured codec.
   * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/TypedArray
   * @throws {Error} If `option.inputCodec` is not a supported codec.
   */
  getTypedArray() {
    const typedArrays = {
      Int8: Int8Array,
      Int16: Int16Array,
      Int32: Int32Array,
      Float32: Float32Array,
    };
    if (!Object.prototype.hasOwnProperty.call(typedArrays, this.option.inputCodec))
      throw new Error(
        "wrong codec.please input one of these codecs:Int8,Int16,Int32,Float32"
      );
    return typedArrays[this.option.inputCodec];
  }

  /** Create the AudioContext and a gain node (initial gain 0.1) wired to the output. */
  initAudioContext() {
    this.sourcePlayLen = 0;
    this.audioCtx = new (window.AudioContext || window.webkitAudioContext)();
    // GainNode controls the volume.
    // https://developer.mozilla.org/en-US/docs/Web/API/BaseAudioContext/createGain
    this.gainNode = this.audioCtx.createGain();
    this.gainNode.gain.value = 0.1;
    this.gainNode.connect(this.audioCtx.destination);
    this.startTime = this.audioCtx.currentTime;
  }

  /**
   * Whether `data` is an ArrayBuffer or a view (TypedArray/DataView) on one.
   * Empty views are accepted, and null/undefined yield `false` instead of
   * raising a TypeError.
   * @returns {boolean}
   */
  static isTypedArray(data) {
    return ArrayBuffer.isView(data) || data instanceof ArrayBuffer;
  }

  /**
   * Validate the input type.
   * @returns {true}
   * @throws {Error} If `data` is neither an ArrayBuffer nor a buffer view.
   */
  isSupported(data) {
    if (!PCMPlayer.isTypedArray(data))
      throw new Error("请传入ArrayBuffer或者任意TypedArray");
    return true;
  }

  /**
   * Queue a chunk of PCM data and schedule it for playback right away.
   * Does nothing once the player has been destroyed.
   * @param {ArrayBuffer|ArrayBufferView} data - Raw PCM in the configured codec.
   */
  feed(data) {
    if (!this.audioCtx) {
      return;
    }
    this.isSupported(data);

    const incoming = this.getFormatedValue(data);
    const pending = this.samples || new Float32Array();

    // Append the new samples after whatever is still pending.
    const merged = new Float32Array(pending.length + incoming.length);
    merged.set(pending, 0);
    merged.set(incoming, pending.length);

    this.samples = merged;
    this.flush();
  }

  /**
   * Reinterpret raw bytes with the configured codec and scale to Float32 in
   * [-1, 1], the format AudioBuffer channel data expects.
   *
   * Only the bytes the input actually covers are used: for a view that is a
   * window into a larger buffer, its byteOffset/byteLength are respected
   * (copying the window so any byte offset is valid).
   * @returns {Float32Array}
   */
  getFormatedValue(data) {
    let bytes;
    if (data instanceof ArrayBuffer) {
      bytes = data;
    } else if (
      data.byteOffset === 0 &&
      data.byteLength === data.buffer.byteLength
    ) {
      // The view spans the whole buffer: reuse it without copying.
      bytes = data.buffer;
    } else {
      bytes = data.buffer.slice(
        data.byteOffset,
        data.byteOffset + data.byteLength
      );
    }

    const typed = new this.typedArray(bytes);
    const float32 = new Float32Array(typed.length);
    for (let i = 0; i < typed.length; i++) {
      float32[i] = typed[i] / this.convertValue;
    }
    return float32;
  }

  /** Set the playback gain; no-op after the player is destroyed. */
  volume(volume) {
    if (!this.audioCtx) {
      return;
    }
    this.gainNode.gain.value = volume;
  }

  /** Release pending samples and close the AudioContext. Safe to call twice. */
  destroy() {
    this.samples = null;
    this.audioCtx && this.audioCtx.close();
    this.audioCtx = null;
    this.bufferSource = null;
  }

  /**
   * Schedule all pending samples as one AudioBuffer directly after the
   * previously scheduled chunk, then clear the pending samples.
   */
  flush() {
    // `samples` is null after destroy(); treat that like "nothing pending".
    if (!this.samples || !this.samples.length) {
      return;
    }
    this.sourcePlayLen += 1;
    if (!this.audioCtx) {
      return;
    }

    const FADE_SAMPLES = 50;
    const channels = this.option.channels;
    const source = this.audioCtx.createBufferSource();
    this.bufferSource = source;

    if (typeof this.option.onended === "function") {
      source.onended = (event) => {
        this.option.onended(source, event);
      };
    }

    // Frames per channel; any incomplete trailing frame is dropped.
    const length = Math.floor(this.samples.length / channels);
    const audioBuffer = this.audioCtx.createBuffer(
      channels,
      length,
      this.option.sampleRate
    );

    for (let channel = 0; channel < channels; channel++) {
      const audioData = audioBuffer.getChannelData(channel);
      let offset = channel; // de-interleave: this channel's first sample
      let decrement = FADE_SAMPLES;
      for (let i = 0; i < length; i++) {
        audioData[i] = this.samples[offset];
        // Linear fade-in over the first samples of the chunk.
        if (i < FADE_SAMPLES) {
          audioData[i] = (audioData[i] * i) / FADE_SAMPLES;
        }
        // Linear fade-out over the last samples of the chunk.
        if (i >= length - (FADE_SAMPLES + 1)) {
          audioData[i] = (audioData[i] * decrement--) / FADE_SAMPLES;
        }
        offset += channels;
      }
    }

    // Never schedule in the past: if playback has caught up, start now.
    if (this.startTime < this.audioCtx.currentTime) {
      this.startTime = this.audioCtx.currentTime;
    }
    source.buffer = audioBuffer;
    source.connect(this.gainNode);
    source.start(this.startTime);

    if (
      this.sourcePlayLen === 1 &&
      typeof this.option.firstPlay === "function"
    ) {
      this.option.firstPlay(this, Date.now());
    }

    this.startTime += audioBuffer.duration;
    this.samples = new Float32Array();
  }

  /** Suspend the AudioContext; no-op after destroy(). */
  async pause() {
    if (!this.audioCtx) {
      return;
    }
    await this.audioCtx.suspend();
  }

  /** Resume the AudioContext; no-op after destroy(). */
  async continue() {
    if (!this.audioCtx) {
      return;
    }
    await this.audioCtx.resume();
  }

  /**
   * Forward AudioContext state changes to `option.onstatechange`.
   * Not called by the player itself; callers invoke it when they need it.
   */
  bindAudioContextEvent() {
    const self = this;
    if (!self.audioCtx) {
      return;
    }
    if (typeof self.option.onstatechange === "function") {
      self.audioCtx.onstatechange = function (event) {
        self.option.onstatechange(this, event, self.audioCtx.state);
      };
    }
  }

  /**
   * Current AudioContext state.
   * @returns {string} The context's state, or "closed" once the player was destroyed.
   */
  getSate() {
    return this.audioCtx ? this.audioCtx.state : "closed";
  }
}

export default PCMPlayer;

引入js

import PCMPlayer from "./pcm-player.js";

方法中调用

this.player.sourcePlayLen 和 playLen 是所有流式的ArrayBuffer播放完成的判断。

原理：

使用音频上下文及createBufferSource() 实时播放二进制流

this.audioCtx = new (window.AudioContext || window.webkitAudioContext)();

bufferSource = this.audioCtx.createBufferSource(); 见pcm-player.js源码

每个 ArrayBuffer 都会被播放，每次播放结束都会调用 onended 方法。记录 feed 方法实际触发播放的次数（即 sourcePlayLen），并与 onended 被调用的次数（playLen）做比较，两者相等即表示全部播放完成。

firstPlay 这个回调是 sourcePlayLen = 1 时的回调，用来判断音频进入播放中状态。这些和业务相关，可以自行补充回调方法。

          // View the incoming binary frame as bytes.
          var dataAudio = new Uint8Array(msg.data);
          if (that.ttsSart) {
            // Keep the complete audio so it can be stored or downloaded later.
            that.PCMList = that.concatenate(Uint8Array, that.PCMList, dataAudio);

            // Record the index of this frame before bumping the counter below.
            that.txtProgreeArr.push(that.sorcePushConut);

            // Stream the frame to the player if one has been created.
            if (that.player) {
              that.player.feed(dataAudio);
            }
            that.sorcePushConut += 1;
          }

    play() {
      const that = this;
      if (this.player) return;
      this.player = new PCMPlayer({
        encoding: "16bitInt",
        channels: 1,
        sampleRate: 16000,
        flushTime: 1000,
        firstPlay: function (r) {
          that.ttsPlayStatus = true;
        },
        onended: function (r) {
          that.playLen += 1;
          if (that.playLen === that.player.sourcePlayLen) {
            that.playLen = 0;
            that.player.sourceLenInit();
            // 正常结束
            that.ttsIsInterrupt = true;
            that.ttsPlayStatus = false;
            that.ttsOnend();
            if (that.chatSate == 4) {
              that.ttsStop();
            }
          }
        },
      })
    },

主要的几个方法

创建：this.player = new PCMPlayer({})

停止：this.player.ttsClose();（立即停止播放，并在内部调用 destroy 销毁播放器）

摧毁：this.player.destroy();