1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
//! AV1 support.

use std::sync::atomic::{AtomicBool, Ordering};

use crate::Time;
use dav1d::{PixelLayout, PlanarImageComponent};

use super::{
    async_decoder_wrapper::SyncDecoder, Chunk, Error, Frame, FrameContent, FrameInfo,
    OutputCallback, PixelFormat, Result, YuvMatrixCoefficients, YuvPixelLayout, YuvRange,
};

/// Synchronous AV1 decoder backed by the `dav1d` library.
pub struct SyncDav1dDecoder {
    decoder: dav1d::Decoder,
    debug_name: String, // identifies the video stream in warnings/log output
}

impl SyncDecoder for SyncDav1dDecoder {
    /// Feed one chunk to the decoder, then immediately drain all pictures it produces.
    fn submit_chunk(&mut self, should_stop: &AtomicBool, chunk: Chunk, on_output: &OutputCallback) {
        re_tracing::profile_function!();
        self.submit_chunk(chunk, on_output); // inherent method of the same name does the actual send
        self.output_frames(should_stop, on_output);
    }

    /// Clear and reset everything
    fn reset(&mut self) {
        re_tracing::profile_function!();

        self.decoder.flush();

        // `Err(Again)` is dav1d's way of saying "no picture available" — which is
        // exactly what we expect right after a flush.
        debug_assert!(matches!(self.decoder.get_picture(), Err(dav1d::Error::Again)),
            "There should be no pending pictures, since we output them directly after submitting a chunk.");
    }
}

impl SyncDav1dDecoder {
    /// Create a new dav1d-based AV1 decoder.
    ///
    /// `debug_name` identifies the video stream in warnings/log output.
    ///
    /// Returns [`Error::Dav1dWithoutNasm`] when compiled without the `nasm`
    /// feature on platforms where the resulting decoder would be too slow.
    pub fn new(debug_name: String) -> Result<Self> {
        re_tracing::profile_function!();

        if !cfg!(feature = "nasm") {
            // The `nasm` feature makes AV1 decoding much faster.
            // On Linux the difference is huge (~25x).
            // On Windows, the difference was also pretty big (unsure how big).
            // On an M3 Mac the difference is smaller (2-3x),
            // and even without `nasm` emilk can play an 8k video at 2x speed.

            if cfg!(target_os = "macos") && cfg!(target_arch = "aarch64") {
                re_log::warn_once!(
                    "The native AV1 video decoder is unnecessarily slow. \
                    Speed it up by compiling Rerun with the `nasm` feature enabled."
                );
            } else {
                // Better to return an error than to be perceived as being slow
                return Err(Error::Dav1dWithoutNasm);
            }
        }

        // See https://videolan.videolan.me/dav1d/structDav1dSettings.html for settings docs
        let mut settings = dav1d::Settings::new();

        // Prioritize delivering video frames, not error messages.
        settings.set_strict_std_compliance(false);

        // Set to 1 for low-latency decoding.
        settings.set_max_frame_delay(1);

        let decoder = dav1d::Decoder::with_settings(&settings)?;

        Ok(Self {
            decoder,
            debug_name,
        })
    }

    /// Send one chunk of compressed data to the decoder.
    ///
    /// Errors are reported through `on_output` rather than returned.
    fn submit_chunk(&mut self, chunk: Chunk, on_output: &OutputCallback) {
        re_tracing::profile_function!();
        econtext::econtext_function_data!(format!(
            "chunk timestamp: {:?}",
            chunk.presentation_timestamp
        ));

        re_tracing::profile_scope!("send_data");
        match self.decoder.send_data(
            chunk.data,
            None,
            Some(chunk.presentation_timestamp.0),
            Some(chunk.duration.0),
        ) {
            Ok(()) => {}
            Err(err) => {
                // `Error::Again` would mean the decoder's input queue is full.
                debug_assert!(err != dav1d::Error::Again, "Bug in AV1 decoder: send_data returned `Error::Again`. This shouldn't happen, since we process all images in a chunk right away");
                on_output(Err(Error::Dav1d(err)));
            }
        };
    }

    /// Returns the number of new frames.
    ///
    /// Drains all pictures currently available from the decoder, converting each
    /// into a [`Frame`] and passing it to `on_output`. Stops early if
    /// `should_stop` is set.
    fn output_frames(&mut self, should_stop: &AtomicBool, on_output: &OutputCallback) -> usize {
        re_tracing::profile_function!();
        let mut count = 0;
        while !should_stop.load(Ordering::SeqCst) {
            let picture = {
                econtext::econtext!("get_picture");
                self.decoder.get_picture()
            };
            match picture {
                Ok(picture) => {
                    let frame = create_frame(&self.debug_name, &picture);
                    on_output(frame);
                    count += 1;
                }
                Err(dav1d::Error::Again) => {
                    // We need to submit more chunks to get more pictures
                    break;
                }
                Err(err) => {
                    // NOTE(review): a non-`Again` error does not break the loop;
                    // this assumes dav1d eventually reports `Again` afterwards,
                    // otherwise this could spin — TODO confirm.
                    on_output(Err(Error::Dav1d(err)));
                }
            }
        }
        count
    }
}

/// Convert a decoded `dav1d::Picture` into our [`Frame`] type.
///
/// Copies the pixel data into a tightly packed (stride-free) planar buffer,
/// truncating >8-bit content down to 8 bits per component.
///
/// Returns [`Error::BadBitsPerComponent`] for unsupported bit depths.
fn create_frame(debug_name: &str, picture: &dav1d::Picture) -> Result<Frame> {
    re_tracing::profile_function!();

    // `bits_per_component()` may be `None`; fall back to `bit_depth()` in that case.
    let bits_per_component = picture
        .bits_per_component()
        .map_or(picture.bit_depth(), |bpc| bpc.0);

    let bytes_per_component = if bits_per_component == 8 {
        1
    } else if 8 < bits_per_component && bits_per_component <= 16 {
        // TODO(#7594): Support HDR video.
        // We currently handle HDR videos by throwing away the lowest bits,
        // and doing so rather slowly on CPU. It works, but the colors won't be perfectly correct.
        re_log::warn_once!(
            "{debug_name:?} is a High-Dynamic-Range (HDR) video with {bits_per_component} bits per component. Rerun does not support this fully. Color accuracy and performance may suffer.",
        );
        // Note that `bit_depth` is either 8 or 16, which is semi-independent of `bits_per_component` (which is None/8/10/12).
        2
    } else {
        return Err(Error::BadBitsPerComponent(bits_per_component));
    };

    let mut data = match picture.pixel_layout() {
        // Monochrome
        PixelLayout::I400 => picture.plane(PlanarImageComponent::Y).to_vec(),

        PixelLayout::I420 | PixelLayout::I422 | PixelLayout::I444 => {
            let height_y = picture.height() as usize;
            // Chroma plane height depends on vertical subsampling (I420 halves it).
            let height_uv = match picture.pixel_layout() {
                PixelLayout::I400 => 0, // unreachable in this match arm; kept for exhaustiveness
                PixelLayout::I420 => height_y / 2,
                PixelLayout::I422 | PixelLayout::I444 => height_y,
            };

            // "packed" = stride of a tightly packed row; "actual" = what dav1d reports (may include padding).
            let packed_stride_y = bytes_per_component * picture.width() as usize;
            let actual_stride_y = picture.stride(PlanarImageComponent::Y) as usize;

            // Chroma row width depends on horizontal subsampling (I420/I422 halve it).
            let packed_stride_uv = match picture.pixel_layout() {
                PixelLayout::I400 => 0, // unreachable in this match arm; kept for exhaustiveness
                PixelLayout::I420 | PixelLayout::I422 => packed_stride_y / 2,
                PixelLayout::I444 => packed_stride_y,
            };
            let actual_stride_uv = picture.stride(PlanarImageComponent::U) as usize; // U / V stride is always the same.

            let num_packed_bytes_y = packed_stride_y * height_y;
            let num_packed_bytes_uv = packed_stride_uv * height_uv;

            if actual_stride_y == packed_stride_y && actual_stride_uv == packed_stride_uv {
                // Best case scenario: There's no additional strides at all, so we can just copy the data directly.
                // TODO(andreas): This still has *significant* overhead for 8k video. Can we take ownership of the data instead without a copy?
                re_tracing::profile_scope!("fast path");
                let plane_y = &picture.plane(PlanarImageComponent::Y)[0..num_packed_bytes_y];
                let plane_u = &picture.plane(PlanarImageComponent::U)[0..num_packed_bytes_uv];
                let plane_v = &picture.plane(PlanarImageComponent::V)[0..num_packed_bytes_uv];
                [plane_y, plane_u, plane_v].concat()
            } else {
                // At least either y or u/v have strides.
                //
                // We could make our image ingestion pipeline even more sophisticated and pass that stride information through.
                // But given that this is a matter of replacing a single large memcpy with a few hundred _still_ quite large ones,
                // this should not make a lot of difference (citation needed!).

                let mut data = Vec::with_capacity(num_packed_bytes_y + num_packed_bytes_uv * 2);
                {
                    // Copy the Y plane row by row, dropping the per-row padding.
                    let plane = picture.plane(PlanarImageComponent::Y);
                    if packed_stride_y == actual_stride_y {
                        data.extend_from_slice(&plane[0..num_packed_bytes_y]);
                    } else {
                        re_tracing::profile_scope!("slow path, y-plane");

                        for y in 0..height_y {
                            let offset = y * actual_stride_y;
                            data.extend_from_slice(&plane[offset..(offset + packed_stride_y)]);
                        }
                    }
                }
                // Same for the two chroma planes.
                for comp in [PlanarImageComponent::U, PlanarImageComponent::V] {
                    let plane = picture.plane(comp);
                    if actual_stride_uv == packed_stride_uv {
                        data.extend_from_slice(&plane[0..num_packed_bytes_uv]);
                    } else {
                        re_tracing::profile_scope!("slow path, u/v-plane");

                        for y in 0..height_uv {
                            let offset = y * actual_stride_uv;
                            data.extend_from_slice(&plane[offset..(offset + packed_stride_uv)]);
                        }
                    }
                }

                data
            }
        }
    };

    if bytes_per_component == 2 {
        re_tracing::profile_scope!("Truncate HDR"); // costs around 1.5ms per megapixel on MacBook Pro M3 Max
        let rshift = bits_per_component - 8; // we throw away the low bits
        // NOTE(review): assumes `data.len()` is even (2 bytes per component);
        // `c[1]` would panic on a trailing odd chunk — TODO confirm.
        data = data
            .chunks(2)
            .map(|c| {
                // dav1d stores 16-bit components little-endian: low byte first.
                let lo = c[0] as u16;
                let hi = c[1] as u16;
                let full = (hi << 8) | lo;
                (full >> rshift) as u8
            })
            .collect();
    }

    let format = PixelFormat::Yuv {
        layout: match picture.pixel_layout() {
            PixelLayout::I400 => YuvPixelLayout::Y400,
            PixelLayout::I420 => YuvPixelLayout::Y_U_V420,
            PixelLayout::I422 => YuvPixelLayout::Y_U_V422,
            PixelLayout::I444 => YuvPixelLayout::Y_U_V444,
        },
        range: match picture.color_range() {
            dav1d::pixel::YUVRange::Limited => YuvRange::Limited,
            dav1d::pixel::YUVRange::Full => YuvRange::Full,
        },
        coefficients: yuv_matrix_coefficients(debug_name, picture),
    };

    Ok(Frame {
        content: FrameContent {
            data,
            width: picture.width(),
            height: picture.height(),
            format,
        },
        info: FrameInfo {
            is_sync: None,    // TODO(emilk)
            sample_idx: None, // TODO(emilk),
            frame_nr: None,   // TODO(emilk),
            presentation_timestamp: Time(picture.timestamp().unwrap_or(0)),
            duration: Time(picture.duration()),
            latest_decode_timestamp: None,
        },
    })
}

fn yuv_matrix_coefficients(debug_name: &str, picture: &dav1d::Picture) -> YuvMatrixCoefficients {
    // Quotes are from https://wiki.x266.mov/docs/colorimetry/matrix (if not noted otherwise)
    #[allow(clippy::match_same_arms)]
    match picture.matrix_coefficients() {
        dav1d::pixel::MatrixCoefficients::Identity => YuvMatrixCoefficients::Identity,

        dav1d::pixel::MatrixCoefficients::BT709 => YuvMatrixCoefficients::Bt709,

        dav1d::pixel::MatrixCoefficients::Unspecified
        | dav1d::pixel::MatrixCoefficients::Reserved => {
            // This happens quite often. Don't issue a warning, that would be noise!

            if picture.transfer_characteristic() == dav1d::pixel::TransferCharacteristic::SRGB {
                // If the transfer characteristic is sRGB, assume BT.709 primaries, would be quite odd otherwise.
                // TODO(andreas): Other transfer characteristics may also hint at primaries.
                YuvMatrixCoefficients::Bt709
            } else {
                // Best guess: If the picture is 720p+ assume Bt709 because Rec709
                // is the "HDR" standard.
                // TODO(#7594): 4k/UHD material should probably assume Bt2020?
                // else if picture.height() >= 720 {
                //     YuvMatrixCoefficients::Bt709
                // } else {
                //     YuvMatrixCoefficients::Bt601
                // }
                //
                // This is also what the mpv player does (and probably others):
                // https://wiki.x266.mov/docs/colorimetry/matrix#2-unspecified
                // (and similar for primaries! https://wiki.x266.mov/docs/colorimetry/primaries#2-unspecified)
                //
                // …then again, eyeballing VLC it looks like it just always assumes BT.709.
                // The handwavy test case employed here was the same video in low & high resolution
                // without specified primaries. Both looked the same.
                YuvMatrixCoefficients::Bt709
            }
        }

        dav1d::pixel::MatrixCoefficients::BT470M => {
            // "BT.470M is a standard that was used in analog television systems in the United States."
            // I guess Bt601 will do!
            YuvMatrixCoefficients::Bt601
        }
        dav1d::pixel::MatrixCoefficients::BT470BG | dav1d::pixel::MatrixCoefficients::ST170M => {
            // This is PAL & NTSC standards, both are part of Bt.601.
            YuvMatrixCoefficients::Bt601
        }
        dav1d::pixel::MatrixCoefficients::ST240M => {
            // "SMPTE 240M was an interim standard used during the early days of HDTV (1988-1998)."
            // Not worth the effort: HD -> Bt709 🤷
            YuvMatrixCoefficients::Bt709
        }

        dav1d::pixel::MatrixCoefficients::BT2020NonConstantLuminance
        | dav1d::pixel::MatrixCoefficients::BT2020ConstantLuminance
        | dav1d::pixel::MatrixCoefficients::ICtCp
        | dav1d::pixel::MatrixCoefficients::ST2085 => {
            // TODO(#7594): HDR support (we'll probably only care about `BT2020NonConstantLuminance`?)
            re_log::warn_once!("Video {debug_name:?} specified HDR color primaries. Rerun doesn't handle HDR colors correctly yet. Color artifacts may be visible.");
            YuvMatrixCoefficients::Bt709
        }

        dav1d::pixel::MatrixCoefficients::ChromaticityDerivedNonConstantLuminance
        | dav1d::pixel::MatrixCoefficients::ChromaticityDerivedConstantLuminance
        | dav1d::pixel::MatrixCoefficients::YCgCo => {
            re_log::warn_once!(
                 "Video {debug_name:?} specified unsupported matrix coefficients {:?}. Color artifacts may be visible.",
                 picture.matrix_coefficients()
             );
            YuvMatrixCoefficients::Bt709
        }
    }
}