1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
use std::sync::mpsc;

use crate::{
    texture_info::Texture2DBufferInfo,
    wgpu_resources::{BufferDesc, GpuBuffer, GpuBufferPool, GpuTexture},
};

#[derive(thiserror::Error, Debug, Clone, PartialEq, Eq)]
pub enum CpuWriteGpuReadError {
    #[error("Attempting to allocate an empty buffer.")]
    ZeroSizeBufferAllocation,

    #[error(
        "Buffer is full, can't append more data! Buffer has a capacity for {buffer_capacity_elements} elements.
 Tried to add {num_elements_attempted_to_add} elements, but only added {num_elements_actually_added}."
    )]
    BufferFull {
        buffer_capacity_elements: usize,
        num_elements_attempted_to_add: usize,
        num_elements_actually_added: usize,
    },

    #[error("Target buffer has a size of {target_buffer_size}, can't write {copy_size} bytes with an offset of {destination_offset}!")]
    TargetBufferTooSmall {
        target_buffer_size: u64,
        copy_size: u64,
        destination_offset: u64,
    },

    #[error("Target texture doesn't fit the size of the written data to this buffer! Texture target buffer should be at most {max_copy_size} bytes, but the to be written data was {written_data_size} bytes.")]
    TargetTextureBufferSizeMismatch {
        max_copy_size: u64,
        written_data_size: usize,
    },
}

/// A sub-allocated staging buffer that can be written to.
///
/// Behaves a bit like a fixed sized `Vec` in that far it keeps track of how many elements were written to it.
///
/// We do *not* allow reading from this buffer as it is typically write-combined memory.
/// Reading would work, but it can be *very* slow.
/// For details on why, see
/// [Write combining is not your friend, by Fabian Giesen](https://fgiesen.wordpress.com/2013/01/29/write-combining-is-not-your-friend/)
/// Note that the "vec like behavior" further encourages
/// * not leaving holes
/// * keeping writes sequential
pub struct CpuWriteGpuReadBuffer<T: bytemuck::Pod + Send + Sync> {
    /// Write view into the relevant buffer portion.
    ///
    /// UNSAFE: The lifetime is transmuted to be `'static`.
    /// In actuality it is tied to the lifetime of [`chunk_buffer`](#structfield.chunk_buffer)!
    write_view: wgpu::BufferViewMut<'static>,

    /// Range in T elements in `write_view` that haven't been written yet.
    unwritten_element_range: std::ops::Range<usize>,

    chunk_buffer: GpuBuffer,
    byte_offset_in_chunk_buffer: wgpu::BufferAddress,

    /// Marker for the type whose alignment and size requirements are honored by `write_view`.
    _type: std::marker::PhantomData<T>,
}

impl<T> CpuWriteGpuReadBuffer<T>
where
    T: bytemuck::Pod + Send + Sync,
{
    /// Memory as slice.
    ///
    /// Note that we can't rely on any alignment guarantees here!
    /// We could offset the mapped CPU-sided memory, but then the GPU offset won't be aligned anymore.
    /// There's no way we can meet conflicting alignment requirements, so we need to work with unaligned bytes instead.
    /// See [this comment on this wgpu issue](https://github.com/gfx-rs/wgpu/issues/3508#issuecomment-1485044324) about what we tried before.
    ///
    /// Once wgpu has some alignment guarantees, we might be able to use this here to allow faster copies!
    /// (copies of larger blocks are likely less affected as `memcpy` typically does dynamic check/dispatching for SIMD based copies)
    ///
    /// Do *not* make this public as we need to guarantee that the memory is *never* read from!
    #[inline(always)]
    fn as_mut_byte_slice(&mut self) -> &mut [u8] {
        // TODO(andreas): Is this access slow given that it internally goes through a trait interface? Should we keep the pointer around?
        &mut self.write_view[self.unwritten_element_range.start * std::mem::size_of::<T>()
            ..self.unwritten_element_range.end * std::mem::size_of::<T>()]
    }

    /// Pushes a slice of elements into the buffer.
    ///
    /// If the buffer is not big enough, only the first `self.remaining_capacity()` elements are pushed before returning an error.
    #[inline]
    pub fn extend_from_slice(&mut self, elements: &[T]) -> Result<(), CpuWriteGpuReadError> {
        if elements.is_empty() {
            return Ok(());
        }

        re_tracing::profile_function_if!(10_000 < elements.len());

        let remaining_capacity = self.remaining_capacity();
        let (result, elements) = if elements.len() > remaining_capacity {
            (
                Err(CpuWriteGpuReadError::BufferFull {
                    buffer_capacity_elements: self.capacity(),
                    num_elements_attempted_to_add: elements.len(),
                    num_elements_actually_added: remaining_capacity,
                }),
                &elements[..remaining_capacity],
            )
        } else {
            (Ok(()), elements)
        };

        let bytes = bytemuck::cast_slice(elements);
        self.as_mut_byte_slice()[..bytes.len()].copy_from_slice(bytes);
        self.unwritten_element_range.start += elements.len();

        result
    }

    /// Pushes several elements into the buffer.
    ///
    /// If the buffer is not big enough, only the first [`CpuWriteGpuReadBuffer::remaining_capacity`] elements are pushed before returning an error.
    /// Otherwise, returns the number of elements pushed for convenience.
    #[inline]
    pub fn extend(
        &mut self,
        mut elements: impl ExactSizeIterator<Item = T>,
    ) -> Result<usize, CpuWriteGpuReadError> {
        re_tracing::profile_function!();

        // TODO(emilk): optimize the extend function.
        // Right now it is 3-4x faster to collect to a vec first, which is crazy.
        //
        // Mimalloc can't align types larger than 64 bytes now and will silently ignore it.
        // https://github.com/purpleprotocol/mimalloc_rust/issues/128
        // Therefore, large alignments won't work with collect.
        let pretend_mimalloc_aligns_correctly = false; // TODO(#5875): update mimalloc
        if std::mem::align_of::<T>() <= 32 || pretend_mimalloc_aligns_correctly {
            let vec: Vec<T> = elements.collect();

            #[allow(clippy::dbg_macro)]
            if pretend_mimalloc_aligns_correctly {
                dbg!(std::any::type_name::<T>());
                dbg!(std::mem::size_of::<T>());
                dbg!(std::mem::align_of::<T>());
                dbg!(vec.len());
                dbg!(vec.as_ptr());
                dbg!(vec.as_ptr() as usize % std::mem::align_of::<T>());
            }
            debug_assert_eq!(
                vec.as_ptr() as usize % std::mem::align_of::<T>(),
                0,
                "Vec::collect collects into unaligned memory!"
            );

            self.extend_from_slice(vec.as_slice())?;
            Ok(vec.len())
        } else {
            let num_written_before = self.num_written();

            while let Some(element) = elements.next() {
                if self.unwritten_element_range.start >= self.unwritten_element_range.end {
                    let num_elements_actually_added = self.num_written() - num_written_before;

                    return Err(CpuWriteGpuReadError::BufferFull {
                        buffer_capacity_elements: self.capacity(),
                        num_elements_attempted_to_add: num_elements_actually_added
                            + elements.count()
                            + 1,
                        num_elements_actually_added,
                    });
                }

                self.as_mut_byte_slice()[..std::mem::size_of::<T>()]
                    .copy_from_slice(bytemuck::bytes_of(&element));
                self.unwritten_element_range.start += 1;
            }

            Ok(self.num_written() - num_written_before)
        }
    }

    /// Fills the buffer with n instances of an element.
    ///
    /// If the buffer is not big enough, only the first `self.remaining_capacity()` elements are pushed before returning an error.
    pub fn add_n(&mut self, element: T, num_elements: usize) -> Result<(), CpuWriteGpuReadError> {
        if num_elements == 0 {
            return Ok(());
        }

        re_tracing::profile_function_if!(10_000 < num_elements);

        let remaining_capacity = self.remaining_capacity();
        let (result, num_elements) = if num_elements > remaining_capacity {
            (
                Err(CpuWriteGpuReadError::BufferFull {
                    buffer_capacity_elements: self.capacity(),
                    num_elements_attempted_to_add: num_elements,
                    num_elements_actually_added: remaining_capacity,
                }),
                remaining_capacity,
            )
        } else {
            (Ok(()), num_elements)
        };

        let mut offset = 0;
        let buffer_bytes = self.as_mut_byte_slice();
        let element_bytes = bytemuck::bytes_of(&element);

        for _ in 0..num_elements {
            let end = offset + std::mem::size_of::<T>();
            buffer_bytes[offset..end].copy_from_slice(element_bytes);
            offset = end;
        }
        self.unwritten_element_range.start += num_elements;

        result
    }

    /// Pushes a single element into the buffer and advances the write pointer.
    ///
    /// Returns an error if the data no longer fits into the buffer.
    #[inline]
    pub fn push(&mut self, element: T) -> Result<(), CpuWriteGpuReadError> {
        if self.remaining_capacity() == 0 {
            return Err(CpuWriteGpuReadError::BufferFull {
                buffer_capacity_elements: self.capacity(),
                num_elements_attempted_to_add: 1,
                num_elements_actually_added: 0,
            });
        }

        self.as_mut_byte_slice()[..std::mem::size_of::<T>()]
            .copy_from_slice(bytemuck::bytes_of(&element));
        self.unwritten_element_range.start += 1;

        Ok(())
    }

    /// True if no elements have been pushed into the buffer so far.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.unwritten_element_range.start == 0
    }

    /// The number of elements pushed into the buffer so far.
    #[inline]
    pub fn num_written(&self) -> usize {
        self.unwritten_element_range.start
    }

    /// The number of elements that can still be pushed into the buffer.
    #[inline]
    pub fn remaining_capacity(&self) -> usize {
        self.unwritten_element_range.end - self.unwritten_element_range.start
    }

    /// Total number of elements that the buffer can hold.
    pub fn capacity(&self) -> usize {
        self.unwritten_element_range.end
    }

    /// Copies all so far written data to the first layer of a 2D texture.
    ///
    /// Assumes that the buffer consists of as-tightly-packed-as-possible rows of data.
    /// (taking into account required padding as specified by [`wgpu::COPY_BYTES_PER_ROW_ALIGNMENT`])
    ///
    /// Fails if the buffer size is not sufficient to fill the entire texture.
    #[allow(unused)]
    pub fn copy_to_texture2d_entire_first_layer(
        self,
        encoder: &mut wgpu::CommandEncoder,
        destination: &GpuTexture,
    ) -> Result<(), CpuWriteGpuReadError> {
        self.copy_to_texture2d(
            encoder,
            wgpu::ImageCopyTexture {
                texture: &destination.texture,
                mip_level: 0,
                origin: wgpu::Origin3d::ZERO,
                aspect: wgpu::TextureAspect::All,
            },
            destination.texture.size(),
        )
    }

    /// Copies all so far written data to a rectangle on a single 2D texture layer.
    ///
    /// Assumes that the buffer consists of as-tightly-packed-as-possible rows of data.
    /// (taking into account required padding as specified by [`wgpu::COPY_BYTES_PER_ROW_ALIGNMENT`])
    ///
    /// Implementation note:
    /// Does 2D-only entirely for convenience as it greatly simplifies the input parameters.
    pub fn copy_to_texture2d(
        self,
        encoder: &mut wgpu::CommandEncoder,
        destination: wgpu::ImageCopyTexture<'_>,
        copy_size: wgpu::Extent3d,
    ) -> Result<(), CpuWriteGpuReadError> {
        let buffer_info = Texture2DBufferInfo::new(destination.texture.format(), copy_size);

        // Validate that we stay within the written part of the slice (wgpu can't fully know our intention here, so we have to check).
        // This is a bit of a leaky check since we haven't looked at copy_size which may limit the amount of memory we need.
        if (buffer_info.buffer_size_padded as usize) < self.num_written() * std::mem::size_of::<T>()
        {
            return Err(CpuWriteGpuReadError::TargetTextureBufferSizeMismatch {
                max_copy_size: buffer_info.buffer_size_padded,
                written_data_size: self.num_written() * std::mem::size_of::<T>(),
            });
        }

        encoder.copy_buffer_to_texture(
            wgpu::ImageCopyBuffer {
                buffer: &self.chunk_buffer,
                layout: wgpu::ImageDataLayout {
                    offset: self.byte_offset_in_chunk_buffer,
                    bytes_per_row: Some(buffer_info.bytes_per_row_padded),
                    rows_per_image: None,
                },
            },
            destination,
            copy_size,
        );

        Ok(())
    }

    /// Copies the entire buffer to another buffer and drops it.
    pub fn copy_to_buffer(
        self,
        encoder: &mut wgpu::CommandEncoder,
        destination: &GpuBuffer,
        destination_offset: wgpu::BufferAddress,
    ) -> Result<(), CpuWriteGpuReadError> {
        let copy_size = (std::mem::size_of::<T>() * self.unwritten_element_range.start) as u64;

        // Wgpu does validation as well, but we want to be able to track this error right away.
        if copy_size > destination_offset + destination.size() {
            return Err(CpuWriteGpuReadError::TargetBufferTooSmall {
                target_buffer_size: destination.size(),
                copy_size,
                destination_offset,
            });
        }

        encoder.copy_buffer_to_buffer(
            &self.chunk_buffer,
            self.byte_offset_in_chunk_buffer,
            destination,
            destination_offset,
            copy_size,
        );

        Ok(())
    }
}

/// Internal chunk of the staging belt.
struct Chunk {
    buffer: GpuBuffer,

    /// Starting at this offset the buffer is unused.
    unused_offset: wgpu::BufferAddress,
}

impl Chunk {
    fn remaining_capacity(&self) -> u64 {
        self.buffer.size() - self.unused_offset
    }

    /// Caller needs to make sure that there is enough space.
    fn allocate<T: bytemuck::Pod + Send + Sync>(
        &mut self,
        num_elements: usize,
        size_in_bytes: u64,
    ) -> CpuWriteGpuReadBuffer<T> {
        debug_assert!(num_elements * std::mem::size_of::<T>() <= size_in_bytes as usize);

        let byte_offset_in_chunk_buffer = self.unused_offset;
        let end_offset = byte_offset_in_chunk_buffer + size_in_bytes;

        debug_assert!(byte_offset_in_chunk_buffer % CpuWriteGpuReadBelt::MIN_OFFSET_ALIGNMENT == 0);
        debug_assert!(end_offset <= self.buffer.size());

        let buffer_slice = self.buffer.slice(byte_offset_in_chunk_buffer..end_offset);
        let write_view = buffer_slice.get_mapped_range_mut();
        self.unused_offset = end_offset;

        #[allow(unsafe_code)]
        // SAFETY:
        // write_view has a lifetime dependency on the chunk's buffer - internally it holds a pointer to it!
        //
        // To ensure that the buffer is still around, we put the ref counted buffer handle into the struct with it.
        // Additionally, the buffer pool needs to ensure:
        // * it can't drop buffers if there's still users
        //      -> We assert on that
        // * buffers are never moved in memory
        //      -> buffers are always owned by the pool and are always Arc.
        //          This means it not allowed to move the buffer out.
        //          (We could make them Pin<Arc<>> but this complicates things inside the BufferPool)
        let write_view = unsafe {
            std::mem::transmute::<wgpu::BufferViewMut<'_>, wgpu::BufferViewMut<'static>>(write_view)
        };

        CpuWriteGpuReadBuffer {
            chunk_buffer: self.buffer.clone(),
            byte_offset_in_chunk_buffer,
            write_view,
            unwritten_element_range: 0..num_elements,
            _type: std::marker::PhantomData,
        }
    }
}

/// Efficiently performs many buffer writes by sharing and reusing temporary buffers.
///
/// Internally it uses a ring-buffer of staging buffers that are sub-allocated.
///
/// Based on to [`wgpu::util::StagingBelt`](https://github.com/gfx-rs/wgpu/blob/a420e453c3d9c93dfb1a8526bf11c000d895c916/wgpu/src/util/belt.rs)
/// However, there are some important differences:
/// * can create buffers without yet knowing the target copy location
/// * lifetime of returned buffers is independent of the [`CpuWriteGpuReadBelt`] (allows working with several in parallel!)
/// * use of `re_renderer`'s resource pool
/// * handles alignment in a defined manner
///   (see this as of writing open wgpu issue on [Alignment guarantees for mapped buffers](https://github.com/gfx-rs/wgpu/issues/3508))
pub struct CpuWriteGpuReadBelt {
    /// Minimum size for new buffers.
    chunk_size: u64,

    /// Chunks which are CPU write at the moment.
    active_chunks: Vec<Chunk>,

    /// Chunks which are GPU read at the moment.
    ///
    /// I.e. they have scheduled transfers already; they are unmapped and one or more
    /// command encoder has one or more `copy_buffer_to_buffer` commands with them
    /// as source.
    closed_chunks: Vec<Chunk>,

    /// Chunks that are back from the GPU and ready to be mapped for write and put
    /// into `active_chunks`.
    free_chunks: Vec<Chunk>,

    /// When closed chunks are mapped again, the map callback sends them here.
    ///
    /// Note that we shouldn't use `SyncSender` since this can block the `Sender` if a buffer is full,
    /// which means that in a single threaded situation (Web!) we might deadlock.
    sender: mpsc::Sender<Chunk>,

    /// Free chunks are received here to be put on `self.free_chunks`.
    receiver: mpsc::Receiver<Chunk>,
}

impl CpuWriteGpuReadBelt {
    /// All allocations of this allocator will be aligned to at least this size.
    ///
    /// Requiring a minimum alignment means we need to pad less often.
    /// Also, it has the potential of making memcpy operations faster.
    ///
    /// Needs to be larger or equal than [`wgpu::MAP_ALIGNMENT`], [`wgpu::COPY_BUFFER_ALIGNMENT`]
    /// and the largest possible texel block footprint (since offsets for texture copies require this)
    ///
    /// For alignment requirements in `WebGPU` in general, refer to
    /// [the specification on alignment-class limitations](https://www.w3.org/TR/webgpu/#limit-class-alignment)
    ///
    /// Note that this does NOT mean that the CPU memory has *any* alignment.
    /// See this issue about [lack of CPU memory alignment](https://github.com/gfx-rs/wgpu/issues/3508) in wgpu/WebGPU.
    const MIN_OFFSET_ALIGNMENT: u64 = 16;

    /// Create a cpu-write & gpu-read staging belt.
    ///
    /// The `chunk_size` is the unit of internal buffer allocation; writes will be
    /// sub-allocated within each chunk. Therefore, for optimal use of memory, the
    /// chunk size should be:
    ///
    /// * larger than the largest single [`CpuWriteGpuReadBelt::allocate`] operation;
    /// * 1-4 times less than the total amount of data uploaded per submission
    ///   (per [`CpuWriteGpuReadBelt::before_queue_submit()`]); and
    /// * bigger is better, within these bounds.
    ///
    /// TODO(andreas): Adaptive chunk sizes
    /// TODO(andreas): Shrinking after usage spikes?
    pub fn new(chunk_size: wgpu::BufferSize) -> Self {
        static_assertions::const_assert!(
            wgpu::MAP_ALIGNMENT <= CpuWriteGpuReadBelt::MIN_OFFSET_ALIGNMENT
        );
        static_assertions::const_assert!(
            wgpu::COPY_BUFFER_ALIGNMENT <= CpuWriteGpuReadBelt::MIN_OFFSET_ALIGNMENT
        );
        // Largest uncompressed texture format (btw. many compressed texture format have the same block size!)
        debug_assert!(
            wgpu::TextureFormat::Rgba32Uint
                .block_copy_size(None)
                .unwrap() as u64
                <= Self::MIN_OFFSET_ALIGNMENT
        );

        let (sender, receiver) = mpsc::channel();
        Self {
            chunk_size: wgpu::util::align_to(chunk_size.get(), Self::MIN_OFFSET_ALIGNMENT),
            active_chunks: Vec::new(),
            closed_chunks: Vec::new(),
            free_chunks: Vec::new(),
            sender,
            receiver,
        }
    }

    /// Allocates a cpu writable buffer for `num_elements` instances of type `T`.
    ///
    /// The buffer will be aligned to T's alignment, but no less than [`Self::MIN_OFFSET_ALIGNMENT`].
    pub fn allocate<T: bytemuck::Pod + Send + Sync>(
        &mut self,
        device: &wgpu::Device,
        buffer_pool: &GpuBufferPool,
        num_elements: usize,
    ) -> Result<CpuWriteGpuReadBuffer<T>, CpuWriteGpuReadError> {
        if num_elements == 0 {
            return Err(CpuWriteGpuReadError::ZeroSizeBufferAllocation);
        }

        re_tracing::profile_function!();

        debug_assert!(num_elements > 0, "Cannot allocate zero-sized buffer");

        // Potentially overestimate size with Self::MIN_ALIGNMENT, see Self::MIN_ALIGNMENT doc string.
        let size = wgpu::util::align_to(
            (std::mem::size_of::<T>() * num_elements) as wgpu::BufferAddress,
            Self::MIN_OFFSET_ALIGNMENT,
        );

        // Try to find space in any of the active chunks first.
        let mut chunk = if let Some(index) = self
            .active_chunks
            .iter_mut()
            .position(|chunk| chunk.remaining_capacity() >= size)
        {
            self.active_chunks.swap_remove(index)
        } else {
            self.receive_chunks(); // ensure self.free_chunks is up to date

            // Use a free chunk if possible, fall back to creating a new one if necessary.
            if let Some(index) = self
                .free_chunks
                .iter()
                .position(|chunk| chunk.remaining_capacity() >= size)
            {
                self.free_chunks.swap_remove(index)
            } else {
                // Allocation might be bigger than a chunk!
                let buffer_size =
                    wgpu::util::align_to(self.chunk_size.max(size), Self::MIN_OFFSET_ALIGNMENT);
                re_log::trace!(
                    "Allocating new CpuWriteGpuReadBelt chunk of size {:.1} MiB",
                    buffer_size as f32 / (1024.0 * 1024.0)
                );
                let buffer = buffer_pool.alloc(
                    device,
                    &BufferDesc {
                        label: "CpuWriteGpuReadBelt chunk buffer".into(),
                        size: buffer_size,
                        usage: wgpu::BufferUsages::MAP_WRITE | wgpu::BufferUsages::COPY_SRC,
                        mapped_at_creation: true,
                    },
                );

                Chunk {
                    buffer,
                    unused_offset: 0,
                }
            }
        };

        let cpu_buffer_view = chunk.allocate(num_elements, size);
        self.active_chunks.push(chunk);
        Ok(cpu_buffer_view)
    }

    /// Prepare currently mapped buffers for use in a submission.
    ///
    /// This must be called before the command encoder(s) used in [`CpuWriteGpuReadBuffer`] copy operations are submitted.
    ///
    /// At this point, all the partially used staging buffers are closed (cannot be used for
    /// further writes) until after [`CpuWriteGpuReadBelt::after_queue_submit`] is called *and* the GPU is done
    /// copying the data from them.
    pub fn before_queue_submit(&mut self) {
        re_tracing::profile_function!();

        // This would be a great usecase for persistent memory mapping, i.e. mapping without the need to unmap
        // https://github.com/gfx-rs/wgpu/issues/1468
        // However, WebGPU does not support this!

        for chunk in self.active_chunks.drain(..) {
            chunk.buffer.unmap();
            self.closed_chunks.push(chunk);
        }
    }

    /// Recall all of the closed buffers back to be reused.
    ///
    /// This must only be called after the command encoder(s) used in [`CpuWriteGpuReadBuffer`]
    /// copy operations are submitted. Additional calls are harmless.
    /// Not calling this as soon as possible may result in increased buffer memory usage.
    pub fn after_queue_submit(&mut self) {
        re_tracing::profile_function!();
        self.receive_chunks();

        let sender = &self.sender;
        for chunk in self.closed_chunks.drain(..) {
            let sender = sender.clone();
            chunk
                .buffer
                .clone()
                .slice(..)
                .map_async(wgpu::MapMode::Write, move |_| {
                    let _ = sender.send(chunk);
                });
        }
    }

    /// Move all chunks that the GPU is done with (and are now mapped again)
    /// from `self.receiver` to `self.free_chunks`.
    fn receive_chunks(&mut self) {
        while let Ok(mut chunk) = self.receiver.try_recv() {
            chunk.unused_offset = 0;
            self.free_chunks.push(chunk);
        }
    }
}

impl std::fmt::Debug for CpuWriteGpuReadBelt {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("CpuWriteGpuReadBelt")
            .field("chunk_size", &self.chunk_size)
            .field("active_chunks", &self.active_chunks.len())
            .field("closed_chunks", &self.closed_chunks.len())
            .field("free_chunks", &self.free_chunks.len())
            .finish_non_exhaustive()
    }
}