Move src/voice to src/audio for better naming

Many of these files are used by Audio and Voice messages.

Fixes https://github.com/vector-im/element-web/issues/18131
Travis Ralston 2021-07-22 09:26:26 -06:00
parent 248a758ad6
commit eec63574e6
21 changed files with 14 additions and 14 deletions

37
src/audio/ManagedPlayback.ts Normal file

@@ -0,0 +1,37 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import { DEFAULT_WAVEFORM, Playback } from "./Playback";
import { PlaybackManager } from "./PlaybackManager";
/**
* A managed playback is a Playback instance that is guided by a PlaybackManager.
*/
export class ManagedPlayback extends Playback {
public constructor(private manager: PlaybackManager, buf: ArrayBuffer, seedWaveform = DEFAULT_WAVEFORM) {
super(buf, seedWaveform);
}
public async play(): Promise<void> {
this.manager.playOnly(this);
return super.play();
}
public destroy() {
this.manager.destroyPlaybackInstance(this);
super.destroy();
}
}

311
src/audio/Playback.ts Normal file

@@ -0,0 +1,311 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import EventEmitter from "events";
import { UPDATE_EVENT } from "../stores/AsyncStore";
import { arrayFastResample, arrayRescale, arraySeed, arraySmoothingResample } from "../utils/arrays";
import { SimpleObservable } from "matrix-widget-api";
import { IDestroyable } from "../utils/IDestroyable";
import { PlaybackClock } from "./PlaybackClock";
import { createAudioContext, decodeOgg } from "./compat";
import { clamp } from "../utils/numbers";
export enum PlaybackState {
Decoding = "decoding",
Stopped = "stopped", // no progress on timeline
Paused = "paused", // some progress on timeline
Playing = "playing", // active progress through timeline
}
export const PLAYBACK_WAVEFORM_SAMPLES = 39;
const THUMBNAIL_WAVEFORM_SAMPLES = 100; // arbitrary: [30,120]
export const DEFAULT_WAVEFORM = arraySeed(0, PLAYBACK_WAVEFORM_SAMPLES);
function makePlaybackWaveform(input: number[]): number[] {
// First, convert negative amplitudes to positive so we don't detect zero as "noisy".
const noiseWaveform = input.map(v => Math.abs(v));
// Next, we'll resample the waveform using a smoothing approach so we can keep the same rough shape.
// We also rescale the waveform to be 0-1 for the remaining function logic.
const resampled = arrayRescale(arraySmoothingResample(noiseWaveform, PLAYBACK_WAVEFORM_SAMPLES), 0, 1);
// Then, we'll do a high and low pass filter to isolate actual speaking volumes within the rescaled
// waveform. Most speech happens below the 0.5 mark.
const filtered = resampled.map(v => clamp(v, 0.1, 0.5));
// Finally, we'll rescale the filtered waveform (0.1-0.5 becomes 0-1 again) so the user sees something
// sensible. This is what we return to keep our contract of "values between zero and one".
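// For example, a resampled bar at 0.8 is clamped to 0.5 and one at 0.02 is raised
// to 0.1, so after this final rescale they come out as 1.0 and 0.0 respectively:
// the noise floor renders as a near-empty bar instead of a mid-height one.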
return arrayRescale(filtered, 0, 1);
}
export class Playback extends EventEmitter implements IDestroyable {
/**
* Stable waveform for representing a thumbnail of the media. Values are
* guaranteed to be between zero and one, inclusive.
*/
public readonly thumbnailWaveform: number[];
private readonly context: AudioContext;
private source: AudioBufferSourceNode | MediaElementAudioSourceNode;
private state = PlaybackState.Decoding;
private audioBuf: AudioBuffer;
private element: HTMLAudioElement;
private resampledWaveform: number[];
private waveformObservable = new SimpleObservable<number[]>();
private readonly clock: PlaybackClock;
private readonly fileSize: number;
/**
* Creates a new playback instance from a buffer.
* @param {ArrayBuffer} buf The buffer containing the sound sample.
* @param {number[]} seedWaveform Optional seed waveform to present until the proper waveform
* can be calculated. Contains values between zero and one, inclusive.
*/
constructor(private buf: ArrayBuffer, seedWaveform = DEFAULT_WAVEFORM) {
super();
// Capture the file size early as reading the buffer will result in a 0-length buffer left behind
this.fileSize = this.buf.byteLength;
this.context = createAudioContext();
this.resampledWaveform = arrayFastResample(seedWaveform ?? DEFAULT_WAVEFORM, PLAYBACK_WAVEFORM_SAMPLES);
this.thumbnailWaveform = arrayFastResample(seedWaveform ?? DEFAULT_WAVEFORM, THUMBNAIL_WAVEFORM_SAMPLES);
this.waveformObservable.update(this.resampledWaveform);
this.clock = new PlaybackClock(this.context);
}
/**
* Size of the audio clip in bytes. May be zero if unknown. This is updated
* when the playback goes through phase changes.
*/
public get sizeBytes(): number {
return this.fileSize;
}
/**
* Stable waveform for the playback. Values are guaranteed to be between
* zero and one, inclusive.
*/
public get waveform(): number[] {
return this.resampledWaveform;
}
public get waveformData(): SimpleObservable<number[]> {
return this.waveformObservable;
}
public get clockInfo(): PlaybackClock {
return this.clock;
}
public get currentState(): PlaybackState {
return this.state;
}
public get isPlaying(): boolean {
return this.currentState === PlaybackState.Playing;
}
public emit(event: PlaybackState, ...args: any[]): boolean {
this.state = event;
super.emit(event, ...args);
super.emit(UPDATE_EVENT, event, ...args);
return true; // we don't ever care if the event had listeners, so just return "yes"
}
public destroy() {
// noinspection JSIgnoredPromiseFromCall - not concerned about being called async here
this.stop();
this.removeAllListeners();
this.clock.destroy();
this.waveformObservable.close();
if (this.element) {
URL.revokeObjectURL(this.element.src);
this.element.remove();
}
}
public async prepare() {
// The point where we use an audio element is fairly arbitrary, though we don't want
// it to be too low. As of writing, voice messages want to show a waveform but audio
// messages do not. Using an audio element means we can't show a waveform preview, so
// we try to target the difference between a voice message file and large audio file.
// Overall, the point of this is to avoid memory-related issues due to storing a massive
// audio buffer in memory, as that can balloon to far greater than the input buffer's
// byte length.
if (this.buf.byteLength > 5 * 1024 * 1024) { // 5mb
console.log("Audio file too large: processing through <audio /> element");
this.element = document.createElement("AUDIO") as HTMLAudioElement;
const prom = new Promise((resolve, reject) => {
this.element.onloadeddata = () => resolve(null);
this.element.onerror = (e) => reject(e);
});
this.element.src = URL.createObjectURL(new Blob([this.buf]));
await prom; // make sure the audio element is ready for us
} else {
// Safari compat: promise API not supported on this function
this.audioBuf = await new Promise((resolve, reject) => {
this.context.decodeAudioData(this.buf, b => resolve(b), async e => {
try {
// This error handler is largely for Safari as well, which doesn't support Opus/Ogg
// very well.
console.error("Error decoding recording: ", e);
console.warn("Trying to re-encode to WAV instead...");
const wav = await decodeOgg(this.buf);
// noinspection ES6MissingAwait - not needed when using callbacks
this.context.decodeAudioData(wav, b => resolve(b), e => {
console.error("Still failed to decode recording: ", e);
reject(e);
});
} catch (e) {
console.error("Caught decoding error:", e);
reject(e);
}
});
});
// Update the waveform to the real waveform once we have channel data to use. We don't
// exactly trust the user-provided waveform to be accurate...
const waveform = Array.from(this.audioBuf.getChannelData(0));
this.resampledWaveform = makePlaybackWaveform(waveform);
}
this.waveformObservable.update(this.resampledWaveform);
this.emit(PlaybackState.Stopped); // signal that we're not decoding anymore
this.clock.flagLoadTime(); // must happen first because setting the duration fires a clock update
this.clock.durationSeconds = this.element ? this.element.duration : this.audioBuf.duration;
}
private onPlaybackEnd = async () => {
await this.context.suspend();
this.emit(PlaybackState.Stopped);
};
public async play() {
// We can't restart a buffer source, so we need to create a new one if we hit the end
if (this.state === PlaybackState.Stopped) {
this.disconnectSource();
this.makeNewSourceBuffer();
if (this.element) {
await this.element.play();
} else {
(this.source as AudioBufferSourceNode).start();
}
}
// We use the context suspend/resume functions because it allows us to pause a source
// node, but that still doesn't help us when the source node runs out (see above).
await this.context.resume();
this.clock.flagStart();
this.emit(PlaybackState.Playing);
}
private disconnectSource() {
if (this.element) return; // leave connected, we can (and must) re-use it
this.source?.disconnect();
this.source?.removeEventListener("ended", this.onPlaybackEnd);
}
private makeNewSourceBuffer() {
if (this.element && this.source) return; // leave connected, we can (and must) re-use it
if (this.element) {
this.source = this.context.createMediaElementSource(this.element);
} else {
this.source = this.context.createBufferSource();
this.source.buffer = this.audioBuf;
}
this.source.addEventListener("ended", this.onPlaybackEnd);
this.source.connect(this.context.destination);
}
public async pause() {
await this.context.suspend();
this.emit(PlaybackState.Paused);
}
public async stop() {
await this.onPlaybackEnd();
this.clock.flagStop();
}
public async toggle() {
if (this.isPlaying) await this.pause();
else await this.play();
}
public async skipTo(timeSeconds: number) {
// Dev note: this function talks a lot about clock desyncs. There is a clock running
// independently to the audio context and buffer so that accurate human-perceptible
// time can be exposed. The PlaybackClock class has more information, but the short
// version is that we need to line up the useful time (clip position) with the context
// time, and avoid as many deviations as possible as otherwise the user could see the
// wrong time, and we stop playback at the wrong time, etc.
timeSeconds = clamp(timeSeconds, 0, this.clock.durationSeconds);
// Track playing state so we don't cause seeking to start playing the track.
const isPlaying = this.isPlaying;
if (isPlaying) {
// Pause first so we can get an accurate measurement of time
await this.context.suspend();
}
// We can't simply tell the context/buffer to jump to a time, so we have to
// start a whole new buffer and start it from the new time offset.
const now = this.context.currentTime;
this.disconnectSource();
this.makeNewSourceBuffer();
// We have to resync the clock because it can get confused about where we're
// at in the audio clip.
this.clock.syncTo(now, timeSeconds);
// Always start the source to queue it up. We have to do this now (and pause
// quickly if we're not supposed to be playing) as otherwise the clock can desync
// when it comes time to the user hitting play. After a couple jumps, the user
// will have desynced the clock enough to be about 10-15 seconds off, while this
// keeps it as close to perfect as humans can perceive.
if (this.element) {
this.element.currentTime = timeSeconds;
} else {
(this.source as AudioBufferSourceNode).start(now, timeSeconds);
}
// Dev note: it's critical that the code gap between `this.source.start()` and
// `this.pause()` is as small as possible: we do not want to delay *anything*
// as that could cause a clock desync, or a buggy feeling as a single note plays
// during seeking.
if (isPlaying) {
// If we were playing before, continue the context so the clock doesn't desync.
await this.context.resume();
} else {
// As mentioned above, we'll have to pause the clip if we weren't supposed to
// be playing it just yet. If we didn't have this, the audio clip plays but all
// the states will be wrong: clock won't advance, pause state doesn't match the
// blaring noise leaving the user's speakers, etc.
//
// Also as mentioned, if the code gap is small enough then this should be
// executed immediately after the start time, leaving no feasible time for the
// user's speakers to play any sound.
await this.pause();
}
}
}
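
For orientation, a minimal usage sketch of the class above, assuming the caller already has the clip as an ArrayBuffer (the wrapper function and comments are illustrative, not part of this commit):

    import { Playback, PlaybackState } from "./Playback";

    async function demoPlayback(buf: ArrayBuffer) {
        const playback = new Playback(buf); // starts in PlaybackState.Decoding
        playback.on(PlaybackState.Stopped, () => {
            // fires once decoding completes, on stop(), and when the clip ends
        });

        await playback.prepare();       // decode; >5MB clips go through an <audio /> element
                                        // and keep the seed waveform instead of a real one
        await playback.play();          // PlaybackState.Playing
        await playback.skipTo(1.5);     // seek to 1.5s without changing the play/pause state
        await playback.pause();         // PlaybackState.Paused
        playback.destroy();             // stop, drop listeners, revoke any object URL
    }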

151
src/audio/PlaybackClock.ts Normal file

@@ -0,0 +1,151 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import { SimpleObservable } from "matrix-widget-api";
import { IDestroyable } from "../utils/IDestroyable";
import { MatrixEvent } from "matrix-js-sdk/src/models/event";
/**
* Tracks accurate human-perceptible time for an audio clip, as informed
* by managed playback. This clock is tightly coupled with the operation
* of the Playback class, making assumptions about how the provided
* AudioContext will be used (suspended/resumed to preserve time, etc).
*
* But why do we need a clock? The AudioContext exposes time information,
* and so does the audio buffer, but not in a way that is useful for humans
* to perceive. The audio buffer time is often lagged behind the context
* time due to internal processing delays of the audio API. Additionally,
* the context's time is tracked from when it was first initialized/started,
* not related to positioning within the clip. However, the context time
* is the most accurate time we can use to determine position within the
* clip if we're fast enough to track the pauses and stops.
*
* As a result, we track every play, pause, stop, and seek event from the
* Playback class (kinda: it calls us, which is close enough to the same
* thing). These events are then tracked on the AudioContext time scale,
* with assumptions that code execution will result in negligible desync
* of the clock, or at least no perceptible difference in time. It's
* extremely important that the calling code, and the clock's own code,
* is extremely fast between the event happening and the clock time being
* tracked - anything more than a dozen milliseconds is likely to stack up
* poorly, leading to clock desync.
*
* Clock desync can be dangerous for the stability of the playback controls:
* if the clock thinks the user is somewhere else in the clip, it could
* inform the playback of the wrong place in time, leading to dead air in
* the output or, if severe enough, a clock that won't stop running while
* the audio is paused/stopped. Other examples include the clip stopping at
* 90% time due to playback ending, the clip playing from the wrong spot
* relative to the time, and negative clock time.
*
* Note that the clip duration is fed to the clock: this is to ensure that
* we have the most accurate time possible to present.
*/
export class PlaybackClock implements IDestroyable {
private clipStart = 0;
private stopped = true;
private lastCheck = 0;
private observable = new SimpleObservable<number[]>();
private timerId: number;
private clipDuration = 0;
private placeholderDuration = 0;
public constructor(private context: AudioContext) {
}
public get durationSeconds(): number {
return this.clipDuration || this.placeholderDuration;
}
public set durationSeconds(val: number) {
this.clipDuration = val;
this.observable.update([this.timeSeconds, this.clipDuration]);
}
public get timeSeconds(): number {
// The modulo is to ensure that we're only looking at the most recent clip
// time, as the context is long-running and multiple plays might not be
// informed to us (if the control is looping, for example). By taking the
// remainder of the division operation, we're assuming that playback is
// incomplete or stopped, thus giving an accurate position within the active
// clip segment.
return (this.context.currentTime - this.clipStart) % this.clipDuration;
}
public get liveData(): SimpleObservable<number[]> {
return this.observable;
}
private checkTime = () => {
const now = this.timeSeconds; // calculated dynamically
if (this.lastCheck !== now) {
this.observable.update([now, this.durationSeconds]);
this.lastCheck = now;
}
};
/**
* Populates default information about the audio clip from the event body.
* The placeholders will be overridden once known.
* @param {MatrixEvent} event The event to use for placeholders.
*/
public populatePlaceholdersFrom(event: MatrixEvent) {
const durationMs = Number(event.getContent()['info']?.['duration']);
if (Number.isFinite(durationMs)) this.placeholderDuration = durationMs / 1000;
}
/**
* Mark the time in the audio context where the clip starts/has been loaded.
* This is to ensure the clock isn't skewed into thinking it is ~0.5s into
* a clip when the duration is set.
*/
public flagLoadTime() {
this.clipStart = this.context.currentTime;
}
public flagStart() {
if (this.stopped) {
this.clipStart = this.context.currentTime;
this.stopped = false;
}
if (!this.timerId) {
// cast to number because the types are wrong
// 100ms interval to make sure the time is as accurate as possible without
// being overly insane
this.timerId = <number><any>setInterval(this.checkTime, 100);
}
}
public flagStop() {
this.stopped = true;
// Reset the clock time now so that the update going out will trigger components
// to check their seek/position information (alongside the clock).
this.clipStart = this.context.currentTime;
}
public syncTo(contextTime: number, clipTime: number) {
this.clipStart = contextTime - clipTime;
this.stopped = false; // count as a mid-stream pause (if we were stopped)
this.checkTime();
}
public destroy() {
this.observable.close();
if (this.timerId) clearInterval(this.timerId);
}
}
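
To make the clock arithmetic concrete, a small worked trace of the class above; the context timestamps are assumed values, not real measurements:

    import { PlaybackClock } from "./PlaybackClock";

    // audioCtx is an AudioContext created elsewhere; timestamps below are illustrative.
    declare const audioCtx: AudioContext;

    const clock = new PlaybackClock(audioCtx);
    clock.flagLoadTime();           // suppose audioCtx.currentTime === 4.0 -> clipStart = 4.0
    clock.durationSeconds = 30;     // 30 second clip
    clock.flagStart();              // suppose currentTime === 4.2 -> clipStart = 4.2, 100ms timer starts
    // later, when currentTime === 10.2:
    //   clock.timeSeconds === (10.2 - 4.2) % 30 === 6.0
    clock.syncTo(12.0, 15);         // user seeks to 15s at context time 12.0 -> clipStart = -3.0
    // if the context keeps running to currentTime === 40.0, the modulo keeps the
    // reported position inside the clip: (40.0 - (-3.0)) % 30 === 13.0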

54
src/audio/PlaybackManager.ts Normal file

@@ -0,0 +1,54 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import { DEFAULT_WAVEFORM, Playback } from "./Playback";
import { ManagedPlayback } from "./ManagedPlayback";
/**
* Handles management of playback instances to ensure certain functionality, like
* one playback operating at any one time.
*/
export class PlaybackManager {
private static internalInstance: PlaybackManager;
private instances: ManagedPlayback[] = [];
public static get instance(): PlaybackManager {
if (!PlaybackManager.internalInstance) {
PlaybackManager.internalInstance = new PlaybackManager();
}
return PlaybackManager.internalInstance;
}
/**
* Stops all other playback instances. If no playback is provided, all instances
* are stopped.
* @param playback Optional. The playback to leave untouched.
*/
public playOnly(playback?: Playback) {
this.instances.filter(p => p !== playback).forEach(p => p.stop());
}
public destroyPlaybackInstance(playback: ManagedPlayback) {
this.instances = this.instances.filter(p => p !== playback);
}
public createPlaybackInstance(buf: ArrayBuffer, waveform = DEFAULT_WAVEFORM): Playback {
const instance = new ManagedPlayback(this, buf, waveform);
this.instances.push(instance);
return instance;
}
}
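
A short sketch of the "one playback at a time" contract described above, using two hypothetical clips:

    import { PlaybackManager } from "./PlaybackManager";

    async function playSecondClip(clipA: ArrayBuffer, clipB: ArrayBuffer) {
        const a = PlaybackManager.instance.createPlaybackInstance(clipA);
        const b = PlaybackManager.instance.createPlaybackInstance(clipB);
        await a.prepare();
        await b.prepare();

        await a.play();
        await b.play();     // ManagedPlayback.play() calls playOnly(b), stopping a first

        a.destroy();        // also removes a from the manager's instance list
        b.destroy();
    }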

82
src/audio/RecorderWorklet.ts Normal file

@@ -0,0 +1,82 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import { IAmplitudePayload, ITimingPayload, PayloadEvent, WORKLET_NAME } from "./consts";
import { percentageOf } from "../utils/numbers";
// from AudioWorkletGlobalScope: https://developer.mozilla.org/en-US/docs/Web/API/AudioWorkletGlobalScope
declare const currentTime: number;
// declare const currentFrame: number;
// declare const sampleRate: number;
// We rate limit here to avoid overloading downstream consumers with amplitude information.
// The two major consumers are the voice message waveform thumbnail (resampled down to an
// appropriate length) and the live waveform shown to the user. Effectively, this controls
// the refresh rate of that live waveform and the number of samples the thumbnail has to
// work with.
const TARGET_AMPLITUDE_FREQUENCY = 16; // Hz
function roundTimeToTargetFreq(seconds: number): number {
// Epsilon helps avoid floating point rounding issues (1 + 1 = 1.999999, etc)
return Math.round((seconds + Number.EPSILON) * TARGET_AMPLITUDE_FREQUENCY) / TARGET_AMPLITUDE_FREQUENCY;
}
function nextTimeForTargetFreq(roundedSeconds: number): number {
// The extra round is just to make sure we cut off any floating point issues
return roundTimeToTargetFreq(roundedSeconds + (1 / TARGET_AMPLITUDE_FREQUENCY));
}
class MxVoiceWorklet extends AudioWorkletProcessor {
private nextAmplitudeSecond = 0;
private amplitudeIndex = 0;
process(inputs, outputs, parameters) {
const currentSecond = roundTimeToTargetFreq(currentTime);
if (currentSecond === this.nextAmplitudeSecond) {
// We're expecting exactly one mono input source, so just grab the very first frame of
// samples for the analysis.
const monoChan = inputs[0][0];
// The amplitude of the frame's samples is effectively the loudness of the frame. This
// translates into a bar which can be rendered as part of the whole recording clip's
// waveform.
//
// We translate the amplitude down to 0-1 for sanity's sake.
const minVal = Math.min(...monoChan);
const maxVal = Math.max(...monoChan);
const amplitude = percentageOf(maxVal, -1, 1) - percentageOf(minVal, -1, 1);
this.port.postMessage(<IAmplitudePayload>{
ev: PayloadEvent.AmplitudeMark,
amplitude: amplitude,
forIndex: this.amplitudeIndex++,
});
this.nextAmplitudeSecond = nextTimeForTargetFreq(currentSecond);
}
// We mostly use this worklet to fire regular clock updates through to components
this.port.postMessage(<ITimingPayload>{ ev: PayloadEvent.Timekeep, timeSeconds: currentTime });
// We're supposed to return false when we're "done" with the audio clip, but seeing as
// we are acting as a passive processor we are never truly "done". The browser will clean
// us up when it is done with us.
return true;
}
}
registerProcessor(WORKLET_NAME, MxVoiceWorklet);
export default null; // to appease module loaders (we never use the export)
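
A worked trace of the gating and amplitude maths above. This assumes percentageOf(v, min, max) computes (v - min) / (max - min), which matches how it is used here; the numbers are illustrative:

    // Time gating at TARGET_AMPLITUDE_FREQUENCY = 16Hz:
    //   roundTimeToTargetFreq(0.31)   -> Math.round(0.31 * 16) / 16 = 5 / 16 = 0.3125
    //   nextTimeForTargetFreq(0.3125) -> 0.375, so the next amplitude is emitted ~62.5ms later
    //
    // Amplitude of one frame, assuming percentageOf(v, min, max) = (v - min) / (max - min):
    //   monoChan spans -0.5 ... 0.5
    //   percentageOf(0.5, -1, 1)  = 0.75
    //   percentageOf(-0.5, -1, 1) = 0.25
    //   amplitude = 0.75 - 0.25 = 0.5   // peak-to-peak loudness scaled to 0..1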

349
src/audio/VoiceRecording.ts Normal file

@@ -0,0 +1,349 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import * as Recorder from 'opus-recorder';
import encoderPath from 'opus-recorder/dist/encoderWorker.min.js';
import { MatrixClient } from "matrix-js-sdk/src/client";
import MediaDeviceHandler from "../MediaDeviceHandler";
import { SimpleObservable } from "matrix-widget-api";
import EventEmitter from "events";
import { IDestroyable } from "../utils/IDestroyable";
import { Singleflight } from "../utils/Singleflight";
import { PayloadEvent, WORKLET_NAME } from "./consts";
import { UPDATE_EVENT } from "../stores/AsyncStore";
import { Playback } from "./Playback";
import { createAudioContext } from "./compat";
import { IEncryptedFile } from "matrix-js-sdk/src/@types/event";
import { uploadFile } from "../ContentMessages";
import { FixedRollingArray } from "../utils/FixedRollingArray";
import { clamp } from "../utils/numbers";
const CHANNELS = 1; // stereo isn't important
export const SAMPLE_RATE = 48000; // 48khz is what WebRTC uses. 12khz is where we lose quality.
const BITRATE = 24000; // 24kbps is pretty high quality for our use case in opus.
const TARGET_MAX_LENGTH = 120; // 2 minutes in seconds. Somewhat arbitrary, though longer == larger files.
const TARGET_WARN_TIME_LEFT = 10; // 10 seconds, also somewhat arbitrary.
export const RECORDING_PLAYBACK_SAMPLES = 44;
export interface IRecordingUpdate {
waveform: number[]; // floating points between 0 (low) and 1 (high).
timeSeconds: number; // float
}
export enum RecordingState {
Started = "started",
EndingSoon = "ending_soon", // emits an object with a single numerical value: secondsLeft
Ended = "ended",
Uploading = "uploading",
Uploaded = "uploaded",
}
export interface IUpload {
mxc?: string; // for unencrypted uploads
encrypted?: IEncryptedFile;
}
export class VoiceRecording extends EventEmitter implements IDestroyable {
private recorder: Recorder;
private recorderContext: AudioContext;
private recorderSource: MediaStreamAudioSourceNode;
private recorderStream: MediaStream;
private recorderWorklet: AudioWorkletNode;
private recorderProcessor: ScriptProcessorNode;
private buffer = new Uint8Array(0); // use this.audioBuffer to access
private lastUpload: IUpload;
private recording = false;
private observable: SimpleObservable<IRecordingUpdate>;
private amplitudes: number[] = []; // at each second mark, generated
private playback: Playback;
private liveWaveform = new FixedRollingArray(RECORDING_PLAYBACK_SAMPLES, 0);
public constructor(private client: MatrixClient) {
super();
}
public get contentType(): string {
return "audio/ogg";
}
public get contentLength(): number {
return this.buffer.length;
}
public get durationSeconds(): number {
if (!this.recorder) throw new Error("Duration not available without a recording");
return this.recorderContext.currentTime;
}
public get isRecording(): boolean {
return this.recording;
}
public emit(event: string, ...args: any[]): boolean {
super.emit(event, ...args);
super.emit(UPDATE_EVENT, event, ...args);
return true; // we don't ever care if the event had listeners, so just return "yes"
}
private async makeRecorder() {
try {
this.recorderStream = await navigator.mediaDevices.getUserMedia({
audio: {
channelCount: CHANNELS,
noiseSuppression: true, // browsers ignore constraints they can't honour
deviceId: MediaDeviceHandler.getAudioInput(),
},
});
this.recorderContext = createAudioContext({
// latencyHint: "interactive", // we don't want a latency hint (this causes data smoothing)
});
this.recorderSource = this.recorderContext.createMediaStreamSource(this.recorderStream);
// Set up our worklet. We use this for timing information and waveform analysis: the
// web audio API prefers this be done async to avoid holding the main thread with math.
const mxRecorderWorkletPath = document.body.dataset.vectorRecorderWorkletScript;
if (!mxRecorderWorkletPath) {
// noinspection ExceptionCaughtLocallyJS
throw new Error("Unable to create recorder: no worklet script registered");
}
// Connect our inputs and outputs
if (this.recorderContext.audioWorklet) {
await this.recorderContext.audioWorklet.addModule(mxRecorderWorkletPath);
this.recorderWorklet = new AudioWorkletNode(this.recorderContext, WORKLET_NAME);
this.recorderSource.connect(this.recorderWorklet);
this.recorderWorklet.connect(this.recorderContext.destination);
// Dev note: we can't use `addEventListener` for some reason. It just doesn't work.
this.recorderWorklet.port.onmessage = (ev) => {
switch (ev.data['ev']) {
case PayloadEvent.Timekeep:
this.processAudioUpdate(ev.data['timeSeconds']);
break;
case PayloadEvent.AmplitudeMark:
// Sanity check to make sure we're adding about one sample per second
if (ev.data['forIndex'] === this.amplitudes.length) {
this.amplitudes.push(ev.data['amplitude']);
this.liveWaveform.pushValue(ev.data['amplitude']);
}
break;
}
};
} else {
// Safari fallback: use a processor node instead, buffered to 1024 bytes of data
// like the worklet is.
this.recorderProcessor = this.recorderContext.createScriptProcessor(1024, CHANNELS, CHANNELS);
this.recorderSource.connect(this.recorderProcessor);
this.recorderProcessor.connect(this.recorderContext.destination);
this.recorderProcessor.addEventListener("audioprocess", this.onAudioProcess);
}
this.recorder = new Recorder({
encoderPath, // magic from webpack
encoderSampleRate: SAMPLE_RATE,
encoderApplication: 2048, // voice (default is "audio")
streamPages: true, // this speeds up the encoding process by using CPU over time
encoderFrameSize: 20, // ms, arbitrary frame size we send to the encoder
numberOfChannels: CHANNELS,
sourceNode: this.recorderSource,
encoderBitRate: BITRATE,
// We use low values for the following to ease CPU usage - the resulting waveform
// is indistinguishable for a voice message. Note that the underlying library will
// pick defaults which prefer the highest possible quality, CPU be damned.
encoderComplexity: 3, // 0-10, 10 is slow and high quality.
resampleQuality: 3, // 0-10, 10 is slow and high quality
});
this.recorder.ondataavailable = (a: ArrayBuffer) => {
const buf = new Uint8Array(a);
const newBuf = new Uint8Array(this.buffer.length + buf.length);
newBuf.set(this.buffer, 0);
newBuf.set(buf, this.buffer.length);
this.buffer = newBuf;
};
} catch (e) {
console.error("Error starting recording: ", e);
if (e instanceof DOMException) { // Unhelpful DOMExceptions are common - parse them sanely
console.error(`${e.name} (${e.code}): ${e.message}`);
}
// Clean up as best as possible
if (this.recorderStream) this.recorderStream.getTracks().forEach(t => t.stop());
if (this.recorderSource) this.recorderSource.disconnect();
if (this.recorder) this.recorder.close();
if (this.recorderContext) {
// noinspection ES6MissingAwait - not important that we wait
this.recorderContext.close();
}
throw e; // rethrow so upstream can handle it
}
}
private get audioBuffer(): Uint8Array {
// We need a clone of the buffer to avoid accidentally changing the position
// on the real thing.
return this.buffer.slice(0);
}
public get liveData(): SimpleObservable<IRecordingUpdate> {
if (!this.recording) throw new Error("No observable when not recording");
return this.observable;
}
public get isSupported(): boolean {
return !!Recorder.isRecordingSupported();
}
public get hasRecording(): boolean {
return this.buffer.length > 0;
}
private onAudioProcess = (ev: AudioProcessingEvent) => {
this.processAudioUpdate(ev.playbackTime);
// We skip the functionality of the worklet regarding waveform calculations: we
// should get that information pretty quick during the playback info.
};
private processAudioUpdate = (timeSeconds: number) => {
if (!this.recording) return;
this.observable.update({
waveform: this.liveWaveform.value.map(v => clamp(v, 0, 1)),
timeSeconds: timeSeconds,
});
// Now that we've updated the data/waveform, let's do a time check. We don't want to
// go horribly over the limit. We also emit a warning state if needed.
//
// We use the recorder's perspective of time to make sure we don't cut off the last
// frame of audio, otherwise we end up with a 1:59 clip (119.68 seconds). This extra
// safety can allow us to overshoot the target a bit, but at least when we say 2min
// maximum we actually mean it.
//
// In testing, recorder time and worker time lag by about 400ms, which is roughly the
// time needed to encode a sample/frame.
//
// Ref for recorderSeconds: https://github.com/chris-rudmin/opus-recorder#instance-fields
const recorderSeconds = this.recorder.encodedSamplePosition / 48000;
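// e.g. an encodedSamplePosition of 5,760,000 samples / 48000 = 120s, i.e. TARGET_MAX_LENGTH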
const secondsLeft = TARGET_MAX_LENGTH - recorderSeconds;
if (secondsLeft < 0) { // go over to make sure we definitely capture that last frame
// noinspection JSIgnoredPromiseFromCall - we aren't concerned with it overlapping
this.stop();
} else if (secondsLeft <= TARGET_WARN_TIME_LEFT) {
Singleflight.for(this, "ending_soon").do(() => {
this.emit(RecordingState.EndingSoon, { secondsLeft });
return Singleflight.Void;
});
}
};
public async start(): Promise<void> {
if (this.lastUpload || this.hasRecording) {
throw new Error("Recording already prepared");
}
if (this.recording) {
throw new Error("Recording already in progress");
}
if (this.observable) {
this.observable.close();
}
this.observable = new SimpleObservable<IRecordingUpdate>();
await this.makeRecorder();
await this.recorder.start();
this.recording = true;
this.emit(RecordingState.Started);
}
public async stop(): Promise<Uint8Array> {
return Singleflight.for(this, "stop").do(async () => {
if (!this.recording) {
throw new Error("No recording to stop");
}
// Disconnect the source early to start shutting down resources
await this.recorder.stop(); // stop first to flush the last frame
this.recorderSource.disconnect();
if (this.recorderWorklet) this.recorderWorklet.disconnect();
if (this.recorderProcessor) {
this.recorderProcessor.disconnect();
this.recorderProcessor.removeEventListener("audioprocess", this.onAudioProcess);
}
// close the context after the recorder so the recorder doesn't try to
// connect anything to the context (this would generate a warning)
await this.recorderContext.close();
// Now stop all the media tracks so we can release them back to the user/OS
this.recorderStream.getTracks().forEach(t => t.stop());
// Finally do our post-processing and clean up
this.recording = false;
await this.recorder.close();
this.emit(RecordingState.Ended);
return this.audioBuffer;
});
}
/**
* Gets a playback instance for this voice recording. Note that the playback will not
* have been prepared fully, meaning the `prepare()` function needs to be called on it.
*
* The same playback instance is returned each time.
*
* @returns {Playback} The playback instance.
*/
public getPlayback(): Playback {
this.playback = Singleflight.for(this, "playback").do(() => {
return new Playback(this.audioBuffer.buffer, this.amplitudes); // cast to ArrayBuffer proper;
});
return this.playback;
}
public destroy() {
// noinspection JSIgnoredPromiseFromCall - not concerned about stop() being called async here
this.stop();
this.removeAllListeners();
Singleflight.forgetAllFor(this);
// noinspection JSIgnoredPromiseFromCall - not concerned about being called async here
this.playback?.destroy();
this.observable.close();
}
public async upload(inRoomId: string): Promise<IUpload> {
if (!this.hasRecording) {
throw new Error("No recording available to upload");
}
if (this.lastUpload) return this.lastUpload;
try {
this.emit(RecordingState.Uploading);
const { url: mxc, file: encrypted } = await uploadFile(this.client, inRoomId, new Blob([this.audioBuffer], {
type: this.contentType,
}));
this.lastUpload = { mxc, encrypted };
this.emit(RecordingState.Uploaded);
} catch (e) {
this.emit(RecordingState.Ended);
throw e;
}
return this.lastUpload;
}
}
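
A rough end-to-end sketch of the recording lifecycle, assuming a MatrixClient and room ID are already available (the wrapper function is illustrative, not part of this commit):

    import { MatrixClient } from "matrix-js-sdk/src/client";
    import { VoiceRecording, RecordingState } from "./VoiceRecording";

    async function recordAndSend(client: MatrixClient, roomId: string) {
        const recording = new VoiceRecording(client);
        if (!recording.isSupported) throw new Error("Recording not supported in this browser");

        recording.on(RecordingState.EndingSoon, ({ secondsLeft }) => {
            console.log(`${secondsLeft}s left before the 2 minute cap`);
        });

        await recording.start();                   // requests the microphone and begins encoding
        // ... user speaks; recording.liveData emits waveform/time updates for the UI ...
        await recording.stop();                    // flushes the final Opus frame

        const playback = recording.getPlayback();  // Playback seeded with the recorded amplitudes
        await playback.prepare();

        const { mxc, encrypted } = await recording.upload(roomId);
        recording.destroy();                       // also destroys the playback instance
        return { mxc, encrypted };
    }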

82
src/audio/compat.ts Normal file

@@ -0,0 +1,82 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import { SAMPLE_RATE } from "./VoiceRecording";
// @ts-ignore - we know that this is not a module. We're looking for a path.
import decoderWasmPath from 'opus-recorder/dist/decoderWorker.min.wasm';
import wavEncoderPath from 'opus-recorder/dist/waveWorker.min.js';
import decoderPath from 'opus-recorder/dist/decoderWorker.min.js';
export function createAudioContext(opts?: AudioContextOptions): AudioContext {
if (window.AudioContext) {
return new AudioContext(opts);
} else if (window.webkitAudioContext) {
// While the linter is correct that "a constructor name should not start with
// a lowercase letter", it's also wrong to think that we have control over this.
// eslint-disable-next-line new-cap
return new window.webkitAudioContext(opts);
} else {
throw new Error("Unsupported browser");
}
}
export function decodeOgg(audioBuffer: ArrayBuffer): Promise<ArrayBuffer> {
// Condensed version of decoder example, using a promise:
// https://github.com/chris-rudmin/opus-recorder/blob/master/example/decoder.html
return new Promise((resolve) => { // no reject because the workers don't seem to have a fail path
console.log("Decoder WASM path: " + decoderWasmPath); // so we use the variable (avoid tree shake)
const typedArray = new Uint8Array(audioBuffer);
const decoderWorker = new Worker(decoderPath);
const wavWorker = new Worker(wavEncoderPath);
decoderWorker.postMessage({
command: 'init',
decoderSampleRate: SAMPLE_RATE,
outputBufferSampleRate: SAMPLE_RATE,
});
wavWorker.postMessage({
command: 'init',
wavBitDepth: 24, // standard for 48khz (SAMPLE_RATE)
wavSampleRate: SAMPLE_RATE,
});
decoderWorker.onmessage = (ev) => {
if (ev.data === null) { // null == done
wavWorker.postMessage({ command: 'done' });
return;
}
wavWorker.postMessage({
command: 'encode',
buffers: ev.data,
}, ev.data.map(b => b.buffer));
};
wavWorker.onmessage = (ev) => {
if (ev.data.message === 'page') {
// The encoding comes through as a single page
resolve(new Blob([ev.data.page], { type: "audio/wav" }).arrayBuffer());
}
};
decoderWorker.postMessage({
command: 'decode',
pages: typedArray,
}, [typedArray.buffer]);
});
}
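
A sketch of how the fallback is intended to be chained; this mirrors the error handler in Playback.prepare(), with a buffer copy added so the original stays usable after a failed native decode (Safari cannot decode Ogg/Opus and does not support the promise form of decodeAudioData, hence the callback style):

    import { createAudioContext, decodeOgg } from "./compat";

    function decodeWithFallback(oggBuf: ArrayBuffer): Promise<AudioBuffer> {
        const ctx = createAudioContext();
        return new Promise<AudioBuffer>((resolve, reject) => {
            // Pass a copy so the original buffer is still intact if the native decode fails
            ctx.decodeAudioData(oggBuf.slice(0), resolve, async () => {
                try {
                    const wavBuf = await decodeOgg(oggBuf);
                    ctx.decodeAudioData(wavBuf, resolve, reject);
                } catch (e) {
                    reject(e);
                }
            });
        });
    }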

37
src/audio/consts.ts Normal file

@@ -0,0 +1,37 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
export const WORKLET_NAME = "mx-voice-worklet";
export enum PayloadEvent {
Timekeep = "timekeep",
AmplitudeMark = "amplitude_mark",
}
export interface IPayload {
ev: PayloadEvent;
}
export interface ITimingPayload extends IPayload {
ev: PayloadEvent.Timekeep;
timeSeconds: number;
}
export interface IAmplitudePayload extends IPayload {
ev: PayloadEvent.AmplitudeMark;
forIndex: number;
amplitude: number;
}
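
Because every payload carries its PayloadEvent in ev, the receiving side can narrow the union with a switch. A minimal sketch of a handler for the worklet's port messages (the real handler lives in VoiceRecording.makeRecorder):

    import { IAmplitudePayload, ITimingPayload, PayloadEvent } from "./consts";

    function handleWorkletMessage(ev: MessageEvent) {
        const payload = ev.data as ITimingPayload | IAmplitudePayload;
        switch (payload.ev) {
            case PayloadEvent.Timekeep:
                console.log("clip time", payload.timeSeconds);                  // ITimingPayload
                break;
            case PayloadEvent.AmplitudeMark:
                console.log("bar", payload.forIndex, "amplitude", payload.amplitude); // IAmplitudePayload
                break;
        }
    }

    // usage: recorderWorklet.port.onmessage = handleWorkletMessage;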