Move src/voice to src/audio for better naming

Many of these files are used by Audio and Voice messages.

Fixes https://github.com/vector-im/element-web/issues/18131
Travis Ralston 2021-07-22 09:26:26 -06:00
parent 248a758ad6
commit eec63574e6
21 changed files with 14 additions and 14 deletions

37
src/audio/ManagedPlayback.ts Normal file

@@ -0,0 +1,37 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import { DEFAULT_WAVEFORM, Playback } from "./Playback";
import { PlaybackManager } from "./PlaybackManager";
/**
* A managed playback is a Playback instance that is guided by a PlaybackManager.
*/
export class ManagedPlayback extends Playback {
public constructor(private manager: PlaybackManager, buf: ArrayBuffer, seedWaveform = DEFAULT_WAVEFORM) {
super(buf, seedWaveform);
}
public async play(): Promise<void> {
this.manager.playOnly(this);
return super.play();
}
public destroy() {
this.manager.destroyPlaybackInstance(this);
super.destroy();
}
}

311
src/audio/Playback.ts Normal file

@@ -0,0 +1,311 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import EventEmitter from "events";
import { UPDATE_EVENT } from "../stores/AsyncStore";
import { arrayFastResample, arrayRescale, arraySeed, arraySmoothingResample } from "../utils/arrays";
import { SimpleObservable } from "matrix-widget-api";
import { IDestroyable } from "../utils/IDestroyable";
import { PlaybackClock } from "./PlaybackClock";
import { createAudioContext, decodeOgg } from "./compat";
import { clamp } from "../utils/numbers";
export enum PlaybackState {
Decoding = "decoding",
Stopped = "stopped", // no progress on timeline
Paused = "paused", // some progress on timeline
Playing = "playing", // active progress through timeline
}
export const PLAYBACK_WAVEFORM_SAMPLES = 39;
const THUMBNAIL_WAVEFORM_SAMPLES = 100; // arbitrary: [30,120]
export const DEFAULT_WAVEFORM = arraySeed(0, PLAYBACK_WAVEFORM_SAMPLES);
function makePlaybackWaveform(input: number[]): number[] {
// First, convert negative amplitudes to positive so we don't detect zero as "noisy".
const noiseWaveform = input.map(v => Math.abs(v));
// Next, we'll resample the waveform using a smoothing approach so we can keep the same rough shape.
// We also rescale the waveform to be 0-1 for the remaining function logic.
const resampled = arrayRescale(arraySmoothingResample(noiseWaveform, PLAYBACK_WAVEFORM_SAMPLES), 0, 1);
// Then, we'll do a high and low pass filter to isolate actual speaking volumes within the rescaled
// waveform. Most speech happens below the 0.5 mark.
const filtered = resampled.map(v => clamp(v, 0.1, 0.5));
// Finally, we'll rescale the filtered waveform (0.1-0.5 becomes 0-1 again) so the user sees something
// sensible. This is what we return to keep our contract of "values between zero and one".
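// For example, a resampled bar at 0.8 is clamped to 0.5 and one at 0.02 is raised
// to 0.1, so after this final rescale they come out as 1.0 and 0.0 respectively:
// the noise floor renders as a near-empty bar instead of a mid-height one.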
return arrayRescale(filtered, 0, 1);
}
export class Playback extends EventEmitter implements IDestroyable {
/**
* Stable waveform for representing a thumbnail of the media. Values are
* guaranteed to be between zero and one, inclusive.
*/
public readonly thumbnailWaveform: number[];
private readonly context: AudioContext;
private source: AudioBufferSourceNode | MediaElementAudioSourceNode;
private state = PlaybackState.Decoding;
private audioBuf: AudioBuffer;
private element: HTMLAudioElement;
private resampledWaveform: number[];
private waveformObservable = new SimpleObservable<number[]>();
private readonly clock: PlaybackClock;
private readonly fileSize: number;
/**
* Creates a new playback instance from a buffer.
* @param {ArrayBuffer} buf The buffer containing the sound sample.
* @param {number[]} seedWaveform Optional seed waveform to present until the proper waveform
* can be calculated. Contains values between zero and one, inclusive.
*/
constructor(private buf: ArrayBuffer, seedWaveform = DEFAULT_WAVEFORM) {
super();
// Capture the file size early as reading the buffer will result in a 0-length buffer left behind
this.fileSize = this.buf.byteLength;
this.context = createAudioContext();
this.resampledWaveform = arrayFastResample(seedWaveform ?? DEFAULT_WAVEFORM, PLAYBACK_WAVEFORM_SAMPLES);
this.thumbnailWaveform = arrayFastResample(seedWaveform ?? DEFAULT_WAVEFORM, THUMBNAIL_WAVEFORM_SAMPLES);
this.waveformObservable.update(this.resampledWaveform);
this.clock = new PlaybackClock(this.context);
}
/**
* Size of the audio clip in bytes. May be zero if unknown. This is updated
* when the playback goes through phase changes.
*/
public get sizeBytes(): number {
return this.fileSize;
}
/**
* Stable waveform for the playback. Values are guaranteed to be between
* zero and one, inclusive.
*/
public get waveform(): number[] {
return this.resampledWaveform;
}
public get waveformData(): SimpleObservable<number[]> {
return this.waveformObservable;
}
public get clockInfo(): PlaybackClock {
return this.clock;
}
public get currentState(): PlaybackState {
return this.state;
}
public get isPlaying(): boolean {
return this.currentState === PlaybackState.Playing;
}
public emit(event: PlaybackState, ...args: any[]): boolean {
this.state = event;
super.emit(event, ...args);
super.emit(UPDATE_EVENT, event, ...args);
return true; // we don't ever care if the event had listeners, so just return "yes"
}
public destroy() {
// noinspection JSIgnoredPromiseFromCall - not concerned about being called async here
this.stop();
this.removeAllListeners();
this.clock.destroy();
this.waveformObservable.close();
if (this.element) {
URL.revokeObjectURL(this.element.src);
this.element.remove();
}
}
public async prepare() {
// The point where we use an audio element is fairly arbitrary, though we don't want
// it to be too low. As of writing, voice messages want to show a waveform but audio
// messages do not. Using an audio element means we can't show a waveform preview, so
// we try to target the difference between a voice message file and large audio file.
// Overall, the point of this is to avoid memory-related issues due to storing a massive
// audio buffer in memory, as that can balloon to far greater than the input buffer's
// byte length.
if (this.buf.byteLength > 5 * 1024 * 1024) { // 5mb
console.log("Audio file too large: processing through <audio /> element");
this.element = document.createElement("AUDIO") as HTMLAudioElement;
const prom = new Promise((resolve, reject) => {
this.element.onloadeddata = () => resolve(null);
this.element.onerror = (e) => reject(e);
});
this.element.src = URL.createObjectURL(new Blob([this.buf]));
await prom; // make sure the audio element is ready for us
} else {
// Safari compat: promise API not supported on this function
this.audioBuf = await new Promise((resolve, reject) => {
this.context.decodeAudioData(this.buf, b => resolve(b), async e => {
try {
// This error handler is largely for Safari as well, which doesn't support Opus/Ogg
// very well.
console.error("Error decoding recording: ", e);
console.warn("Trying to re-encode to WAV instead...");
const wav = await decodeOgg(this.buf);
// noinspection ES6MissingAwait - not needed when using callbacks
this.context.decodeAudioData(wav, b => resolve(b), e => {
console.error("Still failed to decode recording: ", e);
reject(e);
});
} catch (e) {
console.error("Caught decoding error:", e);
reject(e);
}
});
});
// Update the waveform to the real waveform once we have channel data to use. We don't
// exactly trust the user-provided waveform to be accurate...
const waveform = Array.from(this.audioBuf.getChannelData(0));
this.resampledWaveform = makePlaybackWaveform(waveform);
}
this.waveformObservable.update(this.resampledWaveform);
this.emit(PlaybackState.Stopped); // signal that we're not decoding anymore
this.clock.flagLoadTime(); // must happen first because setting the duration fires a clock update
this.clock.durationSeconds = this.element ? this.element.duration : this.audioBuf.duration;
}
private onPlaybackEnd = async () => {
await this.context.suspend();
this.emit(PlaybackState.Stopped);
};
public async play() {
// We can't restart a buffer source, so we need to create a new one if we hit the end
if (this.state === PlaybackState.Stopped) {
this.disconnectSource();
this.makeNewSourceBuffer();
if (this.element) {
await this.element.play();
} else {
(this.source as AudioBufferSourceNode).start();
}
}
// We use the context suspend/resume functions because it allows us to pause a source
// node, but that still doesn't help us when the source node runs out (see above).
await this.context.resume();
this.clock.flagStart();
this.emit(PlaybackState.Playing);
}
private disconnectSource() {
if (this.element) return; // leave connected, we can (and must) re-use it
this.source?.disconnect();
this.source?.removeEventListener("ended", this.onPlaybackEnd);
}
private makeNewSourceBuffer() {
if (this.element && this.source) return; // leave connected, we can (and must) re-use it
if (this.element) {
this.source = this.context.createMediaElementSource(this.element);
} else {
this.source = this.context.createBufferSource();
this.source.buffer = this.audioBuf;
}
this.source.addEventListener("ended", this.onPlaybackEnd);
this.source.connect(this.context.destination);
}
public async pause() {
await this.context.suspend();
this.emit(PlaybackState.Paused);
}
public async stop() {
await this.onPlaybackEnd();
this.clock.flagStop();
}
public async toggle() {
if (this.isPlaying) await this.pause();
else await this.play();
}
public async skipTo(timeSeconds: number) {
// Dev note: this function talks a lot about clock desyncs. There is a clock running
// independently to the audio context and buffer so that accurate human-perceptible
// time can be exposed. The PlaybackClock class has more information, but the short
// version is that we need to line up the useful time (clip position) with the context
// time, and avoid as many deviations as possible as otherwise the user could see the
// wrong time, and we stop playback at the wrong time, etc.
timeSeconds = clamp(timeSeconds, 0, this.clock.durationSeconds);
// Track playing state so we don't cause seeking to start playing the track.
const isPlaying = this.isPlaying;
if (isPlaying) {
// Pause first so we can get an accurate measurement of time
await this.context.suspend();
}
// We can't simply tell the context/buffer to jump to a time, so we have to
// start a whole new buffer and start it from the new time offset.
const now = this.context.currentTime;
this.disconnectSource();
this.makeNewSourceBuffer();
// We have to resync the clock because it can get confused about where we're
// at in the audio clip.
this.clock.syncTo(now, timeSeconds);
// Always start the source to queue it up. We have to do this now (and pause
// quickly if we're not supposed to be playing) as otherwise the clock can desync
// when it comes time to the user hitting play. After a couple jumps, the user
// will have desynced the clock enough to be about 10-15 seconds off, while this
// keeps it as close to perfect as humans can perceive.
if (this.element) {
this.element.currentTime = timeSeconds;
} else {
(this.source as AudioBufferSourceNode).start(now, timeSeconds);
}
// Dev note: it's critical that the code gap between `this.source.start()` and
// `this.pause()` is as small as possible: we do not want to delay *anything*
// as that could cause a clock desync, or a buggy feeling as a single note plays
// during seeking.
if (isPlaying) {
// If we were playing before, continue the context so the clock doesn't desync.
await this.context.resume();
} else {
// As mentioned above, we'll have to pause the clip if we weren't supposed to
// be playing it just yet. If we didn't have this, the audio clip plays but all
// the states will be wrong: clock won't advance, pause state doesn't match the
// blaring noise leaving the user's speakers, etc.
//
// Also as mentioned, if the code gap is small enough then this should be
// executed immediately after the start time, leaving no feasible time for the
// user's speakers to play any sound.
await this.pause();
}
}
}
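
For orientation, a minimal usage sketch of the class above, assuming the caller already has the clip as an ArrayBuffer (the wrapper function and comments are illustrative, not part of this commit):

    import { Playback, PlaybackState } from "./Playback";

    async function demoPlayback(buf: ArrayBuffer) {
        const playback = new Playback(buf); // starts in PlaybackState.Decoding
        playback.on(PlaybackState.Stopped, () => {
            // fires once decoding completes, on stop(), and when the clip ends
        });

        await playback.prepare();       // decode; >5MB clips go through an <audio /> element
                                        // and keep the seed waveform instead of a real one
        await playback.play();          // PlaybackState.Playing
        await playback.skipTo(1.5);     // seek to 1.5s without changing the play/pause state
        await playback.pause();         // PlaybackState.Paused
        playback.destroy();             // stop, drop listeners, revoke any object URL
    }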

151
src/audio/PlaybackClock.ts Normal file

@@ -0,0 +1,151 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import { SimpleObservable } from "matrix-widget-api";
import { IDestroyable } from "../utils/IDestroyable";
import { MatrixEvent } from "matrix-js-sdk/src/models/event";
/**
* Tracks accurate human-perceptible time for an audio clip, as informed
* by managed playback. This clock is tightly coupled with the operation
* of the Playback class, making assumptions about how the provided
* AudioContext will be used (suspended/resumed to preserve time, etc).
*
* But why do we need a clock? The AudioContext exposes time information,
* and so does the audio buffer, but not in a way that is useful for humans
* to perceive. The audio buffer time is often lagged behind the context
* time due to internal processing delays of the audio API. Additionally,
* the context's time is tracked from when it was first initialized/started,
* not related to positioning within the clip. However, the context time
* is the most accurate time we can use to determine position within the
* clip if we're fast enough to track the pauses and stops.
*
* As a result, we track every play, pause, stop, and seek event from the
* Playback class (kinda: it calls us, which is close enough to the same
* thing). These events are then tracked on the AudioContext time scale,
* with assumptions that code execution will result in negligible desync
* of the clock, or at least no perceptible difference in time. It's
* extremely important that the calling code, and the clock's own code,
* is extremely fast between the event happening and the clock time being
* tracked - anything more than a dozen milliseconds is likely to stack up
* poorly, leading to clock desync.
*
* Clock desync can be dangerous for the stability of the playback controls:
* if the clock thinks the user is somewhere else in the clip, it could
* inform the playback of the wrong place in time, leading to dead air in
* the output or, if severe enough, a clock that won't stop running while
* the audio is paused/stopped. Other examples include the clip stopping at
* 90% time due to playback ending, the clip playing from the wrong spot
* relative to the time, and negative clock time.
*
* Note that the clip duration is fed to the clock: this is to ensure that
* we have the most accurate time possible to present.
*/
export class PlaybackClock implements IDestroyable {
private clipStart = 0;
private stopped = true;
private lastCheck = 0;
private observable = new SimpleObservable<number[]>();
private timerId: number;
private clipDuration = 0;
private placeholderDuration = 0;
public constructor(private context: AudioContext) {
}
public get durationSeconds(): number {
return this.clipDuration || this.placeholderDuration;
}
public set durationSeconds(val: number) {
this.clipDuration = val;
this.observable.update([this.timeSeconds, this.clipDuration]);
}
public get timeSeconds(): number {
// The modulo is to ensure that we're only looking at the most recent clip
// time, as the context is long-running and multiple plays might not be
// informed to us (if the control is looping, for example). By taking the
// remainder of the division operation, we're assuming that playback is
// incomplete or stopped, thus giving an accurate position within the active
// clip segment.
return (this.context.currentTime - this.clipStart) % this.clipDuration;
}
public get liveData(): SimpleObservable<number[]> {
return this.observable;
}
private checkTime = () => {
const now = this.timeSeconds; // calculated dynamically
if (this.lastCheck !== now) {
this.observable.update([now, this.durationSeconds]);
this.lastCheck = now;
}
};
/**
* Populates default information about the audio clip from the event body.
* The placeholders will be overridden once known.
* @param {MatrixEvent} event The event to use for placeholders.
*/
public populatePlaceholdersFrom(event: MatrixEvent) {
const durationMs = Number(event.getContent()['info']?.['duration']);
if (Number.isFinite(durationMs)) this.placeholderDuration = durationMs / 1000;
}
/**
* Mark the time in the audio context where the clip starts/has been loaded.
* This is to ensure the clock isn't skewed into thinking it is ~0.5s into
* a clip when the duration is set.
*/
public flagLoadTime() {
this.clipStart = this.context.currentTime;
}
public flagStart() {
if (this.stopped) {
this.clipStart = this.context.currentTime;
this.stopped = false;
}
if (!this.timerId) {
// cast to number because the types are wrong
// 100ms interval to make sure the time is as accurate as possible without
// being overly insane
this.timerId = <number><any>setInterval(this.checkTime, 100);
}
}
public flagStop() {
this.stopped = true;
// Reset the clock time now so that the update going out will trigger components
// to check their seek/position information (alongside the clock).
this.clipStart = this.context.currentTime;
}
public syncTo(contextTime: number, clipTime: number) {
this.clipStart = contextTime - clipTime;
this.stopped = false; // count as a mid-stream pause (if we were stopped)
this.checkTime();
}
public destroy() {
this.observable.close();
if (this.timerId) clearInterval(this.timerId);
}
}
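
To make the clock arithmetic concrete, a small worked trace of the class above; the context timestamps are assumed values, not real measurements:

    import { PlaybackClock } from "./PlaybackClock";

    // audioCtx is an AudioContext created elsewhere; timestamps below are illustrative.
    declare const audioCtx: AudioContext;

    const clock = new PlaybackClock(audioCtx);
    clock.flagLoadTime();           // suppose audioCtx.currentTime === 4.0 -> clipStart = 4.0
    clock.durationSeconds = 30;     // 30 second clip
    clock.flagStart();              // suppose currentTime === 4.2 -> clipStart = 4.2, 100ms timer starts
    // later, when currentTime === 10.2:
    //   clock.timeSeconds === (10.2 - 4.2) % 30 === 6.0
    clock.syncTo(12.0, 15);         // user seeks to 15s at context time 12.0 -> clipStart = -3.0
    // if the context keeps running to currentTime === 40.0, the modulo keeps the
    // reported position inside the clip: (40.0 - (-3.0)) % 30 === 13.0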

54
src/audio/PlaybackManager.ts Normal file

@@ -0,0 +1,54 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import { DEFAULT_WAVEFORM, Playback } from "./Playback";
import { ManagedPlayback } from "./ManagedPlayback";
/**
* Handles management of playback instances to ensure certain functionality, like
* one playback operating at any one time.
*/
export class PlaybackManager {
private static internalInstance: PlaybackManager;
private instances: ManagedPlayback[] = [];
public static get instance(): PlaybackManager {
if (!PlaybackManager.internalInstance) {
PlaybackManager.internalInstance = new PlaybackManager();
}
return PlaybackManager.internalInstance;
}
/**
* Stops all other playback instances. If no playback is provided, all instances
* are stopped.
* @param playback Optional. The playback to leave untouched.
*/
public playOnly(playback?: Playback) {
this.instances.filter(p => p !== playback).forEach(p => p.stop());
}
public destroyPlaybackInstance(playback: ManagedPlayback) {
this.instances = this.instances.filter(p => p !== playback);
}
public createPlaybackInstance(buf: ArrayBuffer, waveform = DEFAULT_WAVEFORM): Playback {
const instance = new ManagedPlayback(this, buf, waveform);
this.instances.push(instance);
return instance;
}
}
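
A short sketch of the "one playback at a time" contract described above, using two hypothetical clips:

    import { PlaybackManager } from "./PlaybackManager";

    async function playSecondClip(clipA: ArrayBuffer, clipB: ArrayBuffer) {
        const a = PlaybackManager.instance.createPlaybackInstance(clipA);
        const b = PlaybackManager.instance.createPlaybackInstance(clipB);
        await a.prepare();
        await b.prepare();

        await a.play();
        await b.play();     // ManagedPlayback.play() calls playOnly(b), stopping a first

        a.destroy();        // also removes a from the manager's instance list
        b.destroy();
    }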

82
src/audio/RecorderWorklet.ts Normal file

@@ -0,0 +1,82 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import { IAmplitudePayload, ITimingPayload, PayloadEvent, WORKLET_NAME } from "./consts";
import { percentageOf } from "../utils/numbers";
// from AudioWorkletGlobalScope: https://developer.mozilla.org/en-US/docs/Web/API/AudioWorkletGlobalScope
declare const currentTime: number;
// declare const currentFrame: number;
// declare const sampleRate: number;
// We rate limit here to avoid overloading downstream consumers with amplitude information.
// The two major consumers are the voice message waveform thumbnail (resampled down to an
// appropriate length) and the live waveform shown to the user. Effectively, this controls
// the refresh rate of that live waveform and the number of samples the thumbnail has to
// work with.
const TARGET_AMPLITUDE_FREQUENCY = 16; // Hz
function roundTimeToTargetFreq(seconds: number): number {
// Epsilon helps avoid floating point rounding issues (1 + 1 = 1.999999, etc)
return Math.round((seconds + Number.EPSILON) * TARGET_AMPLITUDE_FREQUENCY) / TARGET_AMPLITUDE_FREQUENCY;
}
function nextTimeForTargetFreq(roundedSeconds: number): number {
// The extra round is just to make sure we cut off any floating point issues
return roundTimeToTargetFreq(roundedSeconds + (1 / TARGET_AMPLITUDE_FREQUENCY));
}
class MxVoiceWorklet extends AudioWorkletProcessor {
private nextAmplitudeSecond = 0;
private amplitudeIndex = 0;
process(inputs, outputs, parameters) {
const currentSecond = roundTimeToTargetFreq(currentTime);
if (currentSecond === this.nextAmplitudeSecond) {
// We're expecting exactly one mono input source, so just grab the very first frame of
// samples for the analysis.
const monoChan = inputs[0][0];
// The amplitude of the frame's samples is effectively the loudness of the frame. This
// translates into a bar which can be rendered as part of the whole recording clip's
// waveform.
//
// We translate the amplitude down to 0-1 for sanity's sake.
const minVal = Math.min(...monoChan);
const maxVal = Math.max(...monoChan);
const amplitude = percentageOf(maxVal, -1, 1) - percentageOf(minVal, -1, 1);
this.port.postMessage(<IAmplitudePayload>{
ev: PayloadEvent.AmplitudeMark,
amplitude: amplitude,
forIndex: this.amplitudeIndex++,
});
this.nextAmplitudeSecond = nextTimeForTargetFreq(currentSecond);
}
// We mostly use this worklet to fire regular clock updates through to components
this.port.postMessage(<ITimingPayload>{ ev: PayloadEvent.Timekeep, timeSeconds: currentTime });
// We're supposed to return false when we're "done" with the audio clip, but seeing as
// we are acting as a passive processor we are never truly "done". The browser will clean
// us up when it is done with us.
return true;
}
}
registerProcessor(WORKLET_NAME, MxVoiceWorklet);
export default null; // to appease module loaders (we never use the export)
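
A worked trace of the gating and amplitude maths above. This assumes percentageOf(v, min, max) computes (v - min) / (max - min), which matches how it is used here; the numbers are illustrative:

    // Time gating at TARGET_AMPLITUDE_FREQUENCY = 16Hz:
    //   roundTimeToTargetFreq(0.31)   -> Math.round(0.31 * 16) / 16 = 5 / 16 = 0.3125
    //   nextTimeForTargetFreq(0.3125) -> 0.375, so the next amplitude is emitted ~62.5ms later
    //
    // Amplitude of one frame, assuming percentageOf(v, min, max) = (v - min) / (max - min):
    //   monoChan spans -0.5 ... 0.5
    //   percentageOf(0.5, -1, 1)  = 0.75
    //   percentageOf(-0.5, -1, 1) = 0.25
    //   amplitude = 0.75 - 0.25 = 0.5   // peak-to-peak loudness scaled to 0..1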

349
src/audio/VoiceRecording.ts Normal file

@@ -0,0 +1,349 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import * as Recorder from 'opus-recorder';
import encoderPath from 'opus-recorder/dist/encoderWorker.min.js';
import { MatrixClient } from "matrix-js-sdk/src/client";
import MediaDeviceHandler from "../MediaDeviceHandler";
import { SimpleObservable } from "matrix-widget-api";
import EventEmitter from "events";
import { IDestroyable } from "../utils/IDestroyable";
import { Singleflight } from "../utils/Singleflight";
import { PayloadEvent, WORKLET_NAME } from "./consts";
import { UPDATE_EVENT } from "../stores/AsyncStore";
import { Playback } from "./Playback";
import { createAudioContext } from "./compat";
import { IEncryptedFile } from "matrix-js-sdk/src/@types/event";
import { uploadFile } from "../ContentMessages";
import { FixedRollingArray } from "../utils/FixedRollingArray";
import { clamp } from "../utils/numbers";
const CHANNELS = 1; // stereo isn't important
export const SAMPLE_RATE = 48000; // 48khz is what WebRTC uses. 12khz is where we lose quality.
const BITRATE = 24000; // 24kbps is pretty high quality for our use case in opus.
const TARGET_MAX_LENGTH = 120; // 2 minutes in seconds. Somewhat arbitrary, though longer == larger files.
const TARGET_WARN_TIME_LEFT = 10; // 10 seconds, also somewhat arbitrary.
export const RECORDING_PLAYBACK_SAMPLES = 44;
export interface IRecordingUpdate {
waveform: number[]; // floating points between 0 (low) and 1 (high).
timeSeconds: number; // float
}
export enum RecordingState {
Started = "started",
EndingSoon = "ending_soon", // emits an object with a single numerical value: secondsLeft
Ended = "ended",
Uploading = "uploading",
Uploaded = "uploaded",
}
export interface IUpload {
mxc?: string; // for unencrypted uploads
encrypted?: IEncryptedFile;
}
export class VoiceRecording extends EventEmitter implements IDestroyable {
private recorder: Recorder;
private recorderContext: AudioContext;
private recorderSource: MediaStreamAudioSourceNode;
private recorderStream: MediaStream;
private recorderWorklet: AudioWorkletNode;
private recorderProcessor: ScriptProcessorNode;
private buffer = new Uint8Array(0); // use this.audioBuffer to access
private lastUpload: IUpload;
private recording = false;
private observable: SimpleObservable<IRecordingUpdate>;
private amplitudes: number[] = []; // at each second mark, generated
private playback: Playback;
private liveWaveform = new FixedRollingArray(RECORDING_PLAYBACK_SAMPLES, 0);
public constructor(private client: MatrixClient) {
super();
}
public get contentType(): string {
return "audio/ogg";
}
public get contentLength(): number {
return this.buffer.length;
}
public get durationSeconds(): number {
if (!this.recorder) throw new Error("Duration not available without a recording");
return this.recorderContext.currentTime;
}
public get isRecording(): boolean {
return this.recording;
}
public emit(event: string, ...args: any[]): boolean {
super.emit(event, ...args);
super.emit(UPDATE_EVENT, event, ...args);
return true; // we don't ever care if the event had listeners, so just return "yes"
}
private async makeRecorder() {
try {
this.recorderStream = await navigator.mediaDevices.getUserMedia({
audio: {
channelCount: CHANNELS,
noiseSuppression: true, // browsers ignore constraints they can't honour
deviceId: MediaDeviceHandler.getAudioInput(),
},
});
this.recorderContext = createAudioContext({
// latencyHint: "interactive", // we don't want a latency hint (this causes data smoothing)
});
this.recorderSource = this.recorderContext.createMediaStreamSource(this.recorderStream);
// Set up our worklet. We use this for timing information and waveform analysis: the
// web audio API prefers this be done async to avoid holding the main thread with math.
const mxRecorderWorkletPath = document.body.dataset.vectorRecorderWorkletScript;
if (!mxRecorderWorkletPath) {
// noinspection ExceptionCaughtLocallyJS
throw new Error("Unable to create recorder: no worklet script registered");
}
// Connect our inputs and outputs
if (this.recorderContext.audioWorklet) {
await this.recorderContext.audioWorklet.addModule(mxRecorderWorkletPath);
this.recorderWorklet = new AudioWorkletNode(this.recorderContext, WORKLET_NAME);
this.recorderSource.connect(this.recorderWorklet);
this.recorderWorklet.connect(this.recorderContext.destination);
// Dev note: we can't use `addEventListener` for some reason. It just doesn't work.
this.recorderWorklet.port.onmessage = (ev) => {
switch (ev.data['ev']) {
case PayloadEvent.Timekeep:
this.processAudioUpdate(ev.data['timeSeconds']);
break;
case PayloadEvent.AmplitudeMark:
// Sanity check to make sure we're adding about one sample per second
if (ev.data['forIndex'] === this.amplitudes.length) {
this.amplitudes.push(ev.data['amplitude']);
this.liveWaveform.pushValue(ev.data['amplitude']);
}
break;
}
};
} else {
// Safari fallback: use a processor node instead, buffered to 1024 bytes of data
// like the worklet is.
this.recorderProcessor = this.recorderContext.createScriptProcessor(1024, CHANNELS, CHANNELS);
this.recorderSource.connect(this.recorderProcessor);
this.recorderProcessor.connect(this.recorderContext.destination);
this.recorderProcessor.addEventListener("audioprocess", this.onAudioProcess);
}
this.recorder = new Recorder({
encoderPath, // magic from webpack
encoderSampleRate: SAMPLE_RATE,
encoderApplication: 2048, // voice (default is "audio")
streamPages: true, // this speeds up the encoding process by using CPU over time
encoderFrameSize: 20, // ms, arbitrary frame size we send to the encoder
numberOfChannels: CHANNELS,
sourceNode: this.recorderSource,
encoderBitRate: BITRATE,
// We use low values for the following to ease CPU usage - the resulting waveform
// is indistinguishable for a voice message. Note that the underlying library will
// pick defaults which prefer the highest possible quality, CPU be damned.
encoderComplexity: 3, // 0-10, 10 is slow and high quality.
resampleQuality: 3, // 0-10, 10 is slow and high quality
});
this.recorder.ondataavailable = (a: ArrayBuffer) => {
const buf = new Uint8Array(a);
const newBuf = new Uint8Array(this.buffer.length + buf.length);
newBuf.set(this.buffer, 0);
newBuf.set(buf, this.buffer.length);
this.buffer = newBuf;
};
} catch (e) {
console.error("Error starting recording: ", e);
if (e instanceof DOMException) { // Unhelpful DOMExceptions are common - parse them sanely
console.error(`${e.name} (${e.code}): ${e.message}`);
}
// Clean up as best as possible
if (this.recorderStream) this.recorderStream.getTracks().forEach(t => t.stop());
if (this.recorderSource) this.recorderSource.disconnect();
if (this.recorder) this.recorder.close();
if (this.recorderContext) {
// noinspection ES6MissingAwait - not important that we wait
this.recorderContext.close();
}
throw e; // rethrow so upstream can handle it
}
}
private get audioBuffer(): Uint8Array {
// We need a clone of the buffer to avoid accidentally changing the position
// on the real thing.
return this.buffer.slice(0);
}
public get liveData(): SimpleObservable<IRecordingUpdate> {
if (!this.recording) throw new Error("No observable when not recording");
return this.observable;
}
public get isSupported(): boolean {
return !!Recorder.isRecordingSupported();
}
public get hasRecording(): boolean {
return this.buffer.length > 0;
}
private onAudioProcess = (ev: AudioProcessingEvent) => {
this.processAudioUpdate(ev.playbackTime);
// We skip the functionality of the worklet regarding waveform calculations: we
// should get that information pretty quick during the playback info.
};
private processAudioUpdate = (timeSeconds: number) => {
if (!this.recording) return;
this.observable.update({
waveform: this.liveWaveform.value.map(v => clamp(v, 0, 1)),
timeSeconds: timeSeconds,
});
// Now that we've updated the data/waveform, let's do a time check. We don't want to
// go horribly over the limit. We also emit a warning state if needed.
//
// We use the recorder's perspective of time to make sure we don't cut off the last
// frame of audio, otherwise we end up with a 1:59 clip (119.68 seconds). This extra
// safety can allow us to overshoot the target a bit, but at least when we say 2min
// maximum we actually mean it.
//
// In testing, recorder time and worker time lag by about 400ms, which is roughly the
// time needed to encode a sample/frame.
//
// Ref for recorderSeconds: https://github.com/chris-rudmin/opus-recorder#instance-fields
const recorderSeconds = this.recorder.encodedSamplePosition / 48000;
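// e.g. an encodedSamplePosition of 5,760,000 samples / 48000 = 120s, i.e. TARGET_MAX_LENGTH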
const secondsLeft = TARGET_MAX_LENGTH - recorderSeconds;
if (secondsLeft < 0) { // go over to make sure we definitely capture that last frame
// noinspection JSIgnoredPromiseFromCall - we aren't concerned with it overlapping
this.stop();
} else if (secondsLeft <= TARGET_WARN_TIME_LEFT) {
Singleflight.for(this, "ending_soon").do(() => {
this.emit(RecordingState.EndingSoon, { secondsLeft });
return Singleflight.Void;
});
}
};
public async start(): Promise<void> {
if (this.lastUpload || this.hasRecording) {
throw new Error("Recording already prepared");
}
if (this.recording) {
throw new Error("Recording already in progress");
}
if (this.observable) {
this.observable.close();
}
this.observable = new SimpleObservable<IRecordingUpdate>();
await this.makeRecorder();
await this.recorder.start();
this.recording = true;
this.emit(RecordingState.Started);
}
public async stop(): Promise<Uint8Array> {
return Singleflight.for(this, "stop").do(async () => {
if (!this.recording) {
throw new Error("No recording to stop");
}
// Disconnect the source early to start shutting down resources
await this.recorder.stop(); // stop first to flush the last frame
this.recorderSource.disconnect();
if (this.recorderWorklet) this.recorderWorklet.disconnect();
if (this.recorderProcessor) {
this.recorderProcessor.disconnect();
this.recorderProcessor.removeEventListener("audioprocess", this.onAudioProcess);
}
// close the context after the recorder so the recorder doesn't try to
// connect anything to the context (this would generate a warning)
await this.recorderContext.close();
// Now stop all the media tracks so we can release them back to the user/OS
this.recorderStream.getTracks().forEach(t => t.stop());
// Finally do our post-processing and clean up
this.recording = false;
await this.recorder.close();
this.emit(RecordingState.Ended);
return this.audioBuffer;
});
}
/**
* Gets a playback instance for this voice recording. Note that the playback will not
* have been prepared fully, meaning the `prepare()` function needs to be called on it.
*
* The same playback instance is returned each time.
*
* @returns {Playback} The playback instance.
*/
public getPlayback(): Playback {
this.playback = Singleflight.for(this, "playback").do(() => {
return new Playback(this.audioBuffer.buffer, this.amplitudes); // cast to ArrayBuffer proper;
});
return this.playback;
}
public destroy() {
// noinspection JSIgnoredPromiseFromCall - not concerned about stop() being called async here
this.stop();
this.removeAllListeners();
Singleflight.forgetAllFor(this);
// noinspection JSIgnoredPromiseFromCall - not concerned about being called async here
this.playback?.destroy();
this.observable.close();
}
public async upload(inRoomId: string): Promise<IUpload> {
if (!this.hasRecording) {
throw new Error("No recording available to upload");
}
if (this.lastUpload) return this.lastUpload;
try {
this.emit(RecordingState.Uploading);
const { url: mxc, file: encrypted } = await uploadFile(this.client, inRoomId, new Blob([this.audioBuffer], {
type: this.contentType,
}));
this.lastUpload = { mxc, encrypted };
this.emit(RecordingState.Uploaded);
} catch (e) {
this.emit(RecordingState.Ended);
throw e;
}
return this.lastUpload;
}
}
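
A rough end-to-end sketch of the recording lifecycle, assuming a MatrixClient and room ID are already available (the wrapper function is illustrative, not part of this commit):

    import { MatrixClient } from "matrix-js-sdk/src/client";
    import { VoiceRecording, RecordingState } from "./VoiceRecording";

    async function recordAndSend(client: MatrixClient, roomId: string) {
        const recording = new VoiceRecording(client);
        if (!recording.isSupported) throw new Error("Recording not supported in this browser");

        recording.on(RecordingState.EndingSoon, ({ secondsLeft }) => {
            console.log(`${secondsLeft}s left before the 2 minute cap`);
        });

        await recording.start();                   // requests the microphone and begins encoding
        // ... user speaks; recording.liveData emits waveform/time updates for the UI ...
        await recording.stop();                    // flushes the final Opus frame

        const playback = recording.getPlayback();  // Playback seeded with the recorded amplitudes
        await playback.prepare();

        const { mxc, encrypted } = await recording.upload(roomId);
        recording.destroy();                       // also destroys the playback instance
        return { mxc, encrypted };
    }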

82
src/audio/compat.ts Normal file

@@ -0,0 +1,82 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import { SAMPLE_RATE } from "./VoiceRecording";
// @ts-ignore - we know that this is not a module. We're looking for a path.
import decoderWasmPath from 'opus-recorder/dist/decoderWorker.min.wasm';
import wavEncoderPath from 'opus-recorder/dist/waveWorker.min.js';
import decoderPath from 'opus-recorder/dist/decoderWorker.min.js';
export function createAudioContext(opts?: AudioContextOptions): AudioContext {
if (window.AudioContext) {
return new AudioContext(opts);
} else if (window.webkitAudioContext) {
// While the linter is correct that "a constructor name should not start with
// a lowercase letter", it's also wrong to think that we have control over this.
// eslint-disable-next-line new-cap
return new window.webkitAudioContext(opts);
} else {
throw new Error("Unsupported browser");
}
}
export function decodeOgg(audioBuffer: ArrayBuffer): Promise<ArrayBuffer> {
// Condensed version of decoder example, using a promise:
// https://github.com/chris-rudmin/opus-recorder/blob/master/example/decoder.html
return new Promise((resolve) => { // no reject because the workers don't seem to have a fail path
console.log("Decoder WASM path: " + decoderWasmPath); // so we use the variable (avoid tree shake)
const typedArray = new Uint8Array(audioBuffer);
const decoderWorker = new Worker(decoderPath);
const wavWorker = new Worker(wavEncoderPath);
decoderWorker.postMessage({
command: 'init',
decoderSampleRate: SAMPLE_RATE,
outputBufferSampleRate: SAMPLE_RATE,
});
wavWorker.postMessage({
command: 'init',
wavBitDepth: 24, // standard for 48khz (SAMPLE_RATE)
wavSampleRate: SAMPLE_RATE,
});
decoderWorker.onmessage = (ev) => {
if (ev.data === null) { // null == done
wavWorker.postMessage({ command: 'done' });
return;
}
wavWorker.postMessage({
command: 'encode',
buffers: ev.data,
}, ev.data.map(b => b.buffer));
};
wavWorker.onmessage = (ev) => {
if (ev.data.message === 'page') {
// The encoding comes through as a single page
resolve(new Blob([ev.data.page], { type: "audio/wav" }).arrayBuffer());
}
};
decoderWorker.postMessage({
command: 'decode',
pages: typedArray,
}, [typedArray.buffer]);
});
}
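
A sketch of how the fallback is intended to be chained; this mirrors the error handler in Playback.prepare(), with a buffer copy added so the original stays usable after a failed native decode (Safari cannot decode Ogg/Opus and does not support the promise form of decodeAudioData, hence the callback style):

    import { createAudioContext, decodeOgg } from "./compat";

    function decodeWithFallback(oggBuf: ArrayBuffer): Promise<AudioBuffer> {
        const ctx = createAudioContext();
        return new Promise<AudioBuffer>((resolve, reject) => {
            // Pass a copy so the original buffer is still intact if the native decode fails
            ctx.decodeAudioData(oggBuf.slice(0), resolve, async () => {
                try {
                    const wavBuf = await decodeOgg(oggBuf);
                    ctx.decodeAudioData(wavBuf, resolve, reject);
                } catch (e) {
                    reject(e);
                }
            });
        });
    }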

37
src/audio/consts.ts Normal file

@@ -0,0 +1,37 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
export const WORKLET_NAME = "mx-voice-worklet";
export enum PayloadEvent {
Timekeep = "timekeep",
AmplitudeMark = "amplitude_mark",
}
export interface IPayload {
ev: PayloadEvent;
}
export interface ITimingPayload extends IPayload {
ev: PayloadEvent.Timekeep;
timeSeconds: number;
}
export interface IAmplitudePayload extends IPayload {
ev: PayloadEvent.AmplitudeMark;
forIndex: number;
amplitude: number;
}
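
Because every payload carries its PayloadEvent in ev, the receiving side can narrow the union with a switch. A minimal sketch of a handler for the worklet's port messages (the real handler lives in VoiceRecording.makeRecorder):

    import { IAmplitudePayload, ITimingPayload, PayloadEvent } from "./consts";

    function handleWorkletMessage(ev: MessageEvent) {
        const payload = ev.data as ITimingPayload | IAmplitudePayload;
        switch (payload.ev) {
            case PayloadEvent.Timekeep:
                console.log("clip time", payload.timeSeconds);                  // ITimingPayload
                break;
            case PayloadEvent.AmplitudeMark:
                console.log("bar", payload.forIndex, "amplitude", payload.amplitude); // IAmplitudePayload
                break;
        }
    }

    // usage: recorderWorklet.port.onmessage = handleWorkletMessage;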