use std::{ops::Range, sync::mpsc};

use crate::texture_info::Texture2DBufferInfo;
use crate::wgpu_resources::{BufferDesc, GpuBuffer, GpuBufferPool};

/// Identifier used to look up a pending readback upon retrieval of the data.
///
/// Does not need to be unique!
pub type GpuReadbackIdentifier = u64;

/// Type used for storing user data on the GPU readback belt.
pub type GpuReadbackUserDataStorage = Box<dyn std::any::Any + 'static + Send>;

struct PendingReadbackRange {
    identifier: GpuReadbackIdentifier,
    buffer_range: Range<wgpu::BufferAddress>,
    user_data: GpuReadbackUserDataStorage,
}

#[derive(thiserror::Error, Debug)]
pub enum GpuReadbackError {
    #[error("Texture format {0:?} is not supported for readback.")]
    UnsupportedTextureFormatForReadback(wgpu::TextureFormat),

    #[error("Texture or buffer does not have the required copy-source usage flag.")]
    MissingSrcCopyUsage,
}

/// A reserved slice for GPU readback.
///
/// Readback needs to happen from a buffer/texture with copy-source usage,
/// as we need to copy the data from the GPU to this CPU-accessible buffer.
pub struct GpuReadbackBuffer {
    chunk_buffer: GpuBuffer,
    range_in_chunk: Range<wgpu::BufferAddress>,
}

impl GpuReadbackBuffer {
    /// Populates the buffer with data from a single layer of a 2D texture.
    ///
    /// Implementation note:
    /// This is 2D-only entirely for convenience, as it greatly simplifies the input parameters.
    /// Additionally, we assume data that is packed as tightly as possible, since this is by far the most common use case.
    pub fn read_texture2d(
        &mut self,
        encoder: &mut wgpu::CommandEncoder,
        source: wgpu::ImageCopyTexture<'_>,
        copy_extents: wgpu::Extent3d,
    ) -> Result<(), GpuReadbackError> {
        self.read_multiple_texture2d(encoder, &[(source, copy_extents)])
    }

    /// Reads multiple textures into the same buffer.
    ///
    /// This is primarily useful if you need to make sure that data from all textures is available at the same time.
    ///
    /// Special care has to be taken to ensure that the buffer is large enough.
    ///
    /// ATTENTION: Keep in mind that internal offsets need to be a multiple of the texture block size!
    /// While the start of a readback buffer is guaranteed to be aligned correctly, extra padding may be needed between texture copies.
    /// This method will add the required padding between the texture copies if necessary.
    /// Panics (in debug builds) if the buffer is too small.
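    ///
    /// A sketch of reading two textures into one allocation (it assumes both textures have
    /// `COPY_SRC` usage and that the buffer was allocated with enough room, padding included):
    /// ```ignore
    /// readback_buffer.read_multiple_texture2d(
    ///     &mut encoder,
    ///     &[
    ///         (texture_a.as_image_copy(), extent_a),
    ///         (texture_b.as_image_copy(), extent_b),
    ///     ],
    /// )?;
    /// ```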
    pub fn read_multiple_texture2d(
        &mut self,
        encoder: &mut wgpu::CommandEncoder,
        sources_and_extents: &[(wgpu::ImageCopyTexture<'_>, wgpu::Extent3d)],
    ) -> Result<(), GpuReadbackError> {
        for (source, copy_extents) in sources_and_extents {
            let src_texture = source.texture;
            if !src_texture.usage().contains(wgpu::TextureUsages::COPY_SRC) {
                return Err(GpuReadbackError::MissingSrcCopyUsage);
            }

            let start_offset = wgpu::util::align_to(
                self.range_in_chunk.start,
                src_texture
                    .format()
                    .block_copy_size(Some(source.aspect))
                    .ok_or(GpuReadbackError::UnsupportedTextureFormatForReadback(
                        source.texture.format(),
                    ))? as u64,
            );

            let buffer_info = Texture2DBufferInfo::new(src_texture.format(), *copy_extents);

            // Validate that we stay within the slice (wgpu can't fully know our intention here, so we have to check).
            debug_assert!(
                buffer_info.buffer_size_padded <= self.range_in_chunk.end - start_offset,
                "Texture data is too large to fit into the readback buffer!"
            );

            encoder.copy_texture_to_buffer(
                *source,
                wgpu::ImageCopyBuffer {
                    buffer: &self.chunk_buffer,
                    layout: wgpu::ImageDataLayout {
                        offset: start_offset,
                        bytes_per_row: Some(buffer_info.bytes_per_row_padded),
                        rows_per_image: None,
                    },
                },
                *copy_extents,
            );

            self.range_in_chunk =
                (start_offset + buffer_info.buffer_size_padded)..self.range_in_chunk.end;
        }
        Ok(())
    }

    // TODO(andreas): Unused & untested so far!
    //
    // Populates the buffer with data from a buffer.
    //
    // Panics if the readback buffer is too small to fit the data.
    // pub fn read_buffer(
    //     self,
    //     encoder: &mut wgpu::CommandEncoder,
    //     source: &GpuBuffer,
    //     source_offset: wgpu::BufferAddress,
    // ) {
    //     let copy_size = self.range_in_chunk.end - self.range_in_chunk.start;

    //     // Wgpu does validation as well, but in debug mode we want to panic if the copy doesn't fit.
    //     debug_assert!(source_offset + copy_size <= source.size(),
    //         "Source buffer has a size of {}, can't read {copy_size} bytes with an offset of {source_offset}!",
    //         source.size());

    //     encoder.copy_buffer_to_buffer(
    //         source,
    //         source_offset,
    //         &self.chunk_buffer,
    //         self.range_in_chunk.start,
    //         copy_size,
    //     );
    // }
}

/// Internal chunk of the staging belt.
struct Chunk {
    buffer: GpuBuffer,

    /// Offset from which on the buffer is unused.
    unused_offset: wgpu::BufferAddress,

    /// All ranges that are currently in use, i.e. there is a GPU write to it scheduled.
    ranges_in_use: Vec<PendingReadbackRange>,

    /// Last frame this chunk was received, i.e. the last time a `map_async` operation finished with it.
    last_received_frame_index: u64,
}

impl Chunk {
    fn remaining_capacity(&self) -> wgpu::BufferAddress {
        self.buffer.size() - self.unused_offset
    }

    /// Caller needs to make sure that there is enough space.
    fn allocate(
        &mut self,
        size_in_bytes: wgpu::BufferAddress,
        identifier: GpuReadbackIdentifier,
        user_data: GpuReadbackUserDataStorage,
    ) -> GpuReadbackBuffer {
        debug_assert!(size_in_bytes <= self.remaining_capacity());

        let buffer_range = self.unused_offset..self.unused_offset + size_in_bytes;

        self.ranges_in_use.push(PendingReadbackRange {
            identifier,
            buffer_range: buffer_range.clone(),
            user_data,
        });

        let buffer = GpuReadbackBuffer {
            chunk_buffer: self.buffer.clone(),
            range_in_chunk: buffer_range,
        };

        self.unused_offset += size_in_bytes;

        buffer
    }
}

/// Efficiently performs many buffer reads by sharing and reusing temporary buffers.
///
/// Internally it uses a ring-buffer of staging buffers that are sub-allocated.
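///
/// Typical usage, sketched under the assumption that a `device`, `queue`, `buffer_pool`,
/// command `encoder` and a texture with `COPY_SRC` usage exist in the surrounding code
/// (the identifier constant and user data type are made up for illustration):
/// ```ignore
/// let mut belt = GpuReadbackBelt::new(wgpu::BufferSize::new(1024 * 1024).unwrap());
///
/// // Schedule a readback: reserve a slice and record a copy into it.
/// let buffer_info = Texture2DBufferInfo::new(texture.format(), extent);
/// let mut readback_buffer = belt.allocate(
///     &device,
///     &buffer_pool,
///     buffer_info.buffer_size_padded,
///     READBACK_IDENTIFIER,
///     Box::new(MyUserData { /* … */ }),
/// );
/// readback_buffer.read_texture2d(&mut encoder, texture.as_image_copy(), extent)?;
///
/// // Once the copies are submitted, kick off mapping of the written chunks.
/// queue.submit(Some(encoder.finish()));
/// belt.after_queue_submit();
///
/// // Every frame: bump the frame index and poll for finished readbacks.
/// belt.begin_frame(frame_index);
/// belt.readback_data::<MyUserData>(READBACK_IDENTIFIER, |data, user_data| {
///     // `data` is the raw (potentially row-padded) byte slice of the texture.
/// });
/// ```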
pub struct GpuReadbackBelt {
    /// Minimum size for new buffers.
    chunk_size: u64,

    /// Chunks for which the GPU writes are scheduled, but we haven't mapped them yet.
    active_chunks: Vec<Chunk>,

    /// Chunks that have been unmapped and are ready for writing by the GPU.
    free_chunks: Vec<Chunk>,

    /// Chunks that are currently mapped and ready for reading by the CPU.
    received_chunks: Vec<Chunk>,

    /// When a chunk has been mapped successfully, it is sent through this channel so the CPU can read it.
    sender: mpsc::Sender<Chunk>,

    /// Chunks received here are ready to be read by the CPU.
    receiver: mpsc::Receiver<Chunk>,

    /// Current frame index, used for keeping track of how old chunks are.
    frame_index: u64,
}

impl GpuReadbackBelt {
    /// All allocations of this allocator will be aligned to at least this size.
    ///
    /// Buffer mappings, however, are currently NOT guaranteed to be aligned to this size!
    /// See this issue on [Alignment guarantees for mapped buffers](https://github.com/gfx-rs/wgpu/issues/3508).
    const MIN_ALIGNMENT: u64 = wgpu::MAP_ALIGNMENT; //wgpu::COPY_BUFFER_ALIGNMENT.max(wgpu::MAP_ALIGNMENT);

    /// Create a ring buffer for efficient & easy GPU memory readback.
    ///
    /// The `chunk_size` is the unit of internal buffer allocation. Reads will be
    /// sub-allocated within each chunk. Therefore, for optimal use of memory, the
    /// chunk size should be:
    ///
    /// * larger than the largest single [`GpuReadbackBelt::allocate`] operation;
    /// * 1-4 times smaller than the total amount of data read back per submission;
    /// * bigger is better, within these bounds.
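    ///
    /// A plausible instantiation (the 1 MiB chunk size is an arbitrary example value):
    /// ```ignore
    /// let belt = GpuReadbackBelt::new(wgpu::BufferSize::new(1024 * 1024).unwrap());
    /// ```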
    ///
    /// TODO(andreas): Adaptive chunk sizes
    /// TODO(andreas): Shrinking after usage spikes (e.g. screenshots of different sizes!)
    pub fn new(chunk_size: wgpu::BufferSize) -> Self {
        let (sender, receiver) = mpsc::channel();
        Self {
            chunk_size: wgpu::util::align_to(chunk_size.get(), Self::MIN_ALIGNMENT),
            active_chunks: Vec::new(),
            free_chunks: Vec::new(),
            received_chunks: Vec::new(),
            sender,
            receiver,
            frame_index: 0,
        }
    }

    /// Allocates a GPU-writable & CPU-readable buffer of the given size.
    ///
    /// The requested size is rounded up to a multiple of [`Self::MIN_ALIGNMENT`].
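    ///
    /// A sketch of a call site (the identifier constant and user data type are made up):
    /// ```ignore
    /// let readback_buffer = belt.allocate(
    ///     &device,
    ///     &buffer_pool,
    ///     size_in_bytes,
    ///     SCREENSHOT_READBACK_ID,
    ///     Box::new(ScreenshotContext { /* … */ }),
    /// );
    /// ```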
    pub fn allocate(
        &mut self,
        device: &wgpu::Device,
        buffer_pool: &GpuBufferPool,
        size_in_bytes: wgpu::BufferAddress,
        identifier: GpuReadbackIdentifier,
        user_data: GpuReadbackUserDataStorage,
    ) -> GpuReadbackBuffer {
        re_tracing::profile_function!();

        debug_assert!(size_in_bytes > 0, "Cannot allocate zero-sized buffer");

        let size_in_bytes = wgpu::util::align_to(size_in_bytes, Self::MIN_ALIGNMENT);

        // Try to find space in any of the active chunks first.
        let mut chunk = if let Some(index) = self
            .active_chunks
            .iter_mut()
            .position(|chunk| chunk.remaining_capacity() >= size_in_bytes)
        {
            self.active_chunks.swap_remove(index)
        } else {
            // Use a free chunk if possible, fall back to creating a new one if necessary.
            if let Some(index) = self
                .free_chunks
                .iter()
                .position(|chunk| chunk.remaining_capacity() >= size_in_bytes)
            {
                self.free_chunks.swap_remove(index)
            } else {
                // Allocation might be bigger than a chunk!
                let buffer_size = self.chunk_size.max(size_in_bytes);
                re_log::trace!(
                    "Allocating new GpuReadbackBelt chunk of size {:.1} MiB",
                    buffer_size as f32 / (1024.0 * 1024.0)
                );
                let buffer = buffer_pool.alloc(
                    device,
                    &BufferDesc {
                        label: "GpuReadbackBelt chunk buffer".into(),
                        size: buffer_size,
                        usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
                        mapped_at_creation: false,
                    },
                );

                Chunk {
                    buffer,
                    unused_offset: 0,
                    ranges_in_use: Vec::new(),
                    last_received_frame_index: u64::MAX,
                }
            }
        };

        let buffer_slice = chunk.allocate(size_in_bytes, identifier, user_data);
        self.active_chunks.push(chunk);

        buffer_slice
    }

    /// Prepare used buffers for CPU read.
    ///
    /// This should be called after the command encoder(s) used for [`GpuReadbackBuffer`] copy operations have been submitted.
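    ///
    /// Expected call order, as a sketch:
    /// ```ignore
    /// queue.submit(Some(encoder.finish()));
    /// belt.after_queue_submit(); // The copies are scheduled now; start mapping the chunks.
    /// ```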
    pub fn after_queue_submit(&mut self) {
        re_tracing::profile_function!();
        for chunk in self.active_chunks.drain(..) {
            let sender = self.sender.clone();
            chunk.buffer.clone().slice(..chunk.unused_offset).map_async(
                wgpu::MapMode::Read,
                move |result| {
                    if result.is_err() {
                        // This should never happen. Drop the chunk and report.
                        re_log::error_once!("Failed to map staging buffer for reading");
                    } else {
                        let _ = sender.send(chunk);
                    }
                },
            );
        }
    }

    /// Should be called at the beginning of a new frame.
    ///
    /// Discards stale data that hasn't been retrieved via [`GpuReadbackBelt::readback_data`] for more than a frame.
    pub fn begin_frame(&mut self, frame_index: u64) {
        // Make sure each frame has at least one `receive_chunks` call before it ends (from the pov of the readback belt).
        // It's important to do this before bumping the frame index, because we want to mark all these chunks as "old"
        // chunks that were available for the previous frame.
        // (A user could have triggered this just before beginning the frame via `readback_data`, or not at all.)
        self.receive_chunks();

        self.frame_index = frame_index;

        // Kill off all stale chunks.
        // Note that this is unfortunate but not _really_ a user bug, as it can happen very easily:
        // For example, if a picking operation is scheduled on a view that is immediately closed afterwards!

        // TODO(andreas): just use `Vec::drain_filter` once it goes stable.
        let (discarded, retained) = self.received_chunks.drain(..).partition(|chunk| {
            // If the chunk was received last frame it is too early to discard it; we need to wait one more frame.
            // Imagine it was received just at the end of that frame: code that runs at the beginning of a frame
            // would then have had no chance to pick it up yet.
            chunk.last_received_frame_index + 1 < self.frame_index
        });
        self.received_chunks = retained;

        for chunk in discarded {
            re_log::trace!(
                "Unread data from a GpuReadbackBelt was discarded. {} ranges remained unread.",
                chunk.ranges_in_use.len()
            );
            self.reuse_chunk(chunk);
        }
    }

    /// Try to receive a pending data readback with the given identifier and the given user data type.
    ///
    /// Passes the oldest received data with the given identifier & type to `callback`.
    /// This is *almost certainly* also the oldest scheduled readback, but there is no *strict* guarantee for this:
    /// it could in theory happen that a readback scheduled after a previous one finishes before it!
    ///
    /// ATTENTION: Do NOT assume any alignment on the slice passed to `callback`.
    /// See this issue on [Alignment guarantees for mapped buffers](https://github.com/gfx-rs/wgpu/issues/3508).
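    ///
    /// A sketch, assuming a readback was scheduled earlier under `identifier` with a
    /// `Box<ScreenshotContext>` (a made-up user data type):
    /// ```ignore
    /// belt.readback_data::<ScreenshotContext>(identifier, |data, user_data| {
    ///     // The mapped slice is only valid inside this callback; copy out what's needed.
    ///     let bytes = data.to_vec();
    ///     process_screenshot(&bytes, *user_data); // `process_screenshot` is hypothetical.
    /// });
    /// ```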
    pub fn readback_data<UserDataType: 'static>(
        &mut self,
        identifier: GpuReadbackIdentifier,
        callback: impl FnOnce(&[u8], Box<UserDataType>),
    ) {
        re_tracing::profile_function!();

        self.receive_chunks();

        // Search for the user data in the readback chunks.
        // A linear search is best suited here since we expect both the number of pending chunks (typically just one or two!)
        // and the number of readbacks per chunk to be small!
        // Also note that identifiers may not be unique!
        for (chunk_index, chunk) in self.received_chunks.iter_mut().enumerate() {
            for (range_index, range) in chunk.ranges_in_use.iter().enumerate() {
                if range.identifier != identifier || !range.user_data.is::<UserDataType>() {
                    continue;
                }

                {
                    let range = chunk.ranges_in_use.swap_remove(range_index);
                    let slice = chunk.buffer.slice(range.buffer_range.clone());
                    let data = slice.get_mapped_range();
                    callback(&data, range.user_data.downcast::<UserDataType>().unwrap());
                }

                // If this was the last range from this chunk, the chunk is ready for re-use!
                if chunk.ranges_in_use.is_empty() {
                    let chunk = self.received_chunks.swap_remove(chunk_index);
                    self.reuse_chunk(chunk);
                }
                return;
            }
        }
    }

    /// Move all chunks that the GPU has finished mapping into the list of received chunks.
    fn receive_chunks(&mut self) {
        while let Ok(mut chunk) = self.receiver.try_recv() {
            chunk.last_received_frame_index = self.frame_index;
            self.received_chunks.push(chunk);
        }
    }

    fn reuse_chunk(&mut self, mut chunk: Chunk) {
        chunk.buffer.unmap();
        chunk.ranges_in_use.clear();
        chunk.unused_offset = 0;
        self.free_chunks.push(chunk);
    }
}

impl std::fmt::Debug for GpuReadbackBelt {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("GpuReadbackBelt")
            .field("chunk_size", &self.chunk_size)
            .field("active_chunks", &self.active_chunks.len())
            .field("free_chunks", &self.free_chunks.len())
            .finish_non_exhaustive()
    }
}