use std::{ops::Range, sync::mpsc};
use crate::texture_info::Texture2DBufferInfo;
use crate::wgpu_resources::{BufferDesc, GpuBuffer, GpuBufferPool};
/// Identifier used to identify a buffer upon retrieval of the data.
///
/// Does not need to be unique!
pub type GpuReadbackIdentifier = u64;
/// Type used for storing user data on the GPU readback belt.
pub type GpuReadbackUserDataStorage = Box<dyn std::any::Any + 'static + Send>;
struct PendingReadbackRange {
identifier: GpuReadbackIdentifier,
buffer_range: Range<wgpu::BufferAddress>,
user_data: GpuReadbackUserDataStorage,
}
#[derive(thiserror::Error, Debug)]
pub enum GpuReadbackError {
#[error("Texture format {0:?} is not supported for readback.")]
UnsupportedTextureFormatForReadback(wgpu::TextureFormat),
#[error("Texture or buffer does not have the required copy-source usage flag.")]
MissingSrcCopyUsage,
}
/// A reserved slice for GPU readback.
///
/// Readback needs to happen from a buffer/texture with copy-source usage,
/// as we need to copy the data from the GPU to this CPU accessible buffer.
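///
/// For example, a texture meant for readback might be created along these lines
/// (a sketch; `device` and `extents` are assumed to exist on the caller's side):
///
/// ```ignore
/// let source_texture = device.create_texture(&wgpu::TextureDescriptor {
///     label: Some("readback source"),
///     size: extents,
///     mip_level_count: 1,
///     sample_count: 1,
///     dimension: wgpu::TextureDimension::D2,
///     format: wgpu::TextureFormat::Rgba8Unorm,
///     // Without `COPY_SRC`, readback fails with `GpuReadbackError::MissingSrcCopyUsage`.
///     usage: wgpu::TextureUsages::RENDER_ATTACHMENT | wgpu::TextureUsages::COPY_SRC,
///     view_formats: &[],
/// });
/// ```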
pub struct GpuReadbackBuffer {
chunk_buffer: GpuBuffer,
range_in_chunk: Range<wgpu::BufferAddress>,
}
impl GpuReadbackBuffer {
/// Populates the buffer with data from a single layer of a 2D texture.
///
/// Implementation note:
/// Does 2D-only entirely for convenience as it greatly simplifies the input parameters.
/// Additionally, we assume the data is packed as tightly as possible, as this is by far the most common case.
pub fn read_texture2d(
&mut self,
encoder: &mut wgpu::CommandEncoder,
source: wgpu::ImageCopyTexture<'_>,
copy_extents: wgpu::Extent3d,
) -> Result<(), GpuReadbackError> {
self.read_multiple_texture2d(encoder, &[(source, copy_extents)])
}
/// Reads multiple textures into the same buffer.
///
/// This is primarily useful if you need to make sure that data from all textures is available at the same time.
///
/// Special care has to be taken to ensure that the buffer is large enough.
///
/// ATTENTION: Keep in mind that internal offsets need to be a multiple of the texture block size!
/// While readback buffer starts are guaranteed to be aligned correctly, extra padding may be needed between texture copies.
/// This method will add the required padding between the texture copies if necessary.
/// Panics if the buffer is too small.
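///
/// For example, when sizing a single readback buffer for two texture copies, the padded size of
/// each copy plus the block-size alignment between them has to be accounted for. A sketch, assuming
/// both textures use `Rgba8Unorm` (4 byte block size) and `extents_a`/`extents_b` come from the caller:
///
/// ```ignore
/// let info_a = Texture2DBufferInfo::new(wgpu::TextureFormat::Rgba8Unorm, extents_a);
/// let info_b = Texture2DBufferInfo::new(wgpu::TextureFormat::Rgba8Unorm, extents_b);
/// // The second copy starts at the next block-size aligned offset after the first one.
/// let offset_b = wgpu::util::align_to(info_a.buffer_size_padded, 4);
/// let required_buffer_size = offset_b + info_b.buffer_size_padded;
/// ```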
pub fn read_multiple_texture2d(
&mut self,
encoder: &mut wgpu::CommandEncoder,
sources_and_extents: &[(wgpu::ImageCopyTexture<'_>, wgpu::Extent3d)],
) -> Result<(), GpuReadbackError> {
for (source, copy_extents) in sources_and_extents {
let src_texture = source.texture;
if !src_texture.usage().contains(wgpu::TextureUsages::COPY_SRC) {
return Err(GpuReadbackError::MissingSrcCopyUsage);
}
let start_offset = wgpu::util::align_to(
self.range_in_chunk.start,
src_texture
.format()
.block_copy_size(Some(source.aspect))
.ok_or(GpuReadbackError::UnsupportedTextureFormatForReadback(
source.texture.format(),
))? as u64,
);
let buffer_info = Texture2DBufferInfo::new(src_texture.format(), *copy_extents);
// Validate that we stay within the slice (wgpu can't fully know our intention here, so we have to check).
debug_assert!(
buffer_info.buffer_size_padded <= self.range_in_chunk.end - start_offset,
"Texture data is too large to fit into the readback buffer!"
);
encoder.copy_texture_to_buffer(
*source,
wgpu::ImageCopyBuffer {
buffer: &self.chunk_buffer,
layout: wgpu::ImageDataLayout {
offset: start_offset,
bytes_per_row: Some(buffer_info.bytes_per_row_padded),
rows_per_image: None,
},
},
*copy_extents,
);
self.range_in_chunk =
(start_offset + buffer_info.buffer_size_padded)..self.range_in_chunk.end;
}
Ok(())
}
// TODO(andreas): Unused & untested so far!
//
// Populates the buffer with data from a buffer.
//
// Panics if the readback buffer is too small to fit the data.
// pub fn read_buffer(
// self,
// encoder: &mut wgpu::CommandEncoder,
// source: &GpuBuffer,
// source_offset: wgpu::BufferAddress,
// ) {
// let copy_size = self.range_in_chunk.end - self.range_in_chunk.start;
// // Wgpu does validation as well, but in debug mode we want to panic if the buffer doesn't fit.
// debug_assert!(source_offset + copy_size <= source.size(),
// "Source buffer has a size of {}, can't write {copy_size} bytes with an offset of {source_offset}!",
// source.size());
// encoder.copy_buffer_to_buffer(
// source,
// source_offset,
// &self.chunk_buffer,
// self.range_in_chunk.start,
// copy_size,
// );
// }
}
/// Internal chunk of the staging belt.
struct Chunk {
buffer: GpuBuffer,
/// Offset from which onward the buffer is unused.
unused_offset: wgpu::BufferAddress,
/// All ranges that are currently in use, i.e. there is a GPU write to it scheduled.
ranges_in_use: Vec<PendingReadbackRange>,
/// Last frame this chunk was received, i.e. the last time a `map_async` operation finished with it.
last_received_frame_index: u64,
}
impl Chunk {
fn remaining_capacity(&self) -> wgpu::BufferAddress {
self.buffer.size() - self.unused_offset
}
/// Caller needs to make sure that there is enough space.
fn allocate(
&mut self,
size_in_bytes: wgpu::BufferAddress,
identifier: GpuReadbackIdentifier,
user_data: GpuReadbackUserDataStorage,
) -> GpuReadbackBuffer {
debug_assert!(size_in_bytes <= self.remaining_capacity());
let buffer_range = self.unused_offset..self.unused_offset + size_in_bytes;
self.ranges_in_use.push(PendingReadbackRange {
identifier,
buffer_range: buffer_range.clone(),
user_data,
});
let buffer = GpuReadbackBuffer {
chunk_buffer: self.buffer.clone(),
range_in_chunk: buffer_range,
};
self.unused_offset += size_in_bytes;
buffer
}
}
/// Efficiently performs many buffer reads by sharing and reusing temporary buffers.
///
/// Internally it uses a ring-buffer of staging buffers that are sub-allocated.
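///
/// The typical lifecycle looks roughly like the sketch below. `device`, `buffer_pool`, `encoder`,
/// `queue`, `source_texture`, `extents`, `frame_index` and the `Screenshot` user data type are
/// assumptions made up for illustration:
///
/// ```ignore
/// const READBACK_ID: GpuReadbackIdentifier = 42; // Any u64 chosen by the caller.
///
/// let mut belt = GpuReadbackBelt::new(wgpu::BufferSize::new(1024 * 1024).unwrap());
///
/// // Schedule a GPU -> CPU copy.
/// let buffer_info = Texture2DBufferInfo::new(wgpu::TextureFormat::Rgba8Unorm, extents);
/// let mut readback_buffer = belt.allocate(
///     &device,
///     &buffer_pool,
///     buffer_info.buffer_size_padded,
///     READBACK_ID,
///     Box::new(Screenshot { extents }),
/// );
/// readback_buffer.read_texture2d(&mut encoder, source_texture.as_image_copy(), extents)?;
///
/// // Submit the copy and let the belt schedule the buffer mappings.
/// queue.submit(Some(encoder.finish()));
/// belt.after_queue_submit();
///
/// // Typically a frame (or more) later:
/// belt.begin_frame(frame_index);
/// belt.readback_data::<Screenshot>(READBACK_ID, |data, screenshot| {
///     // `data` contains the (row-padded) texture data.
/// });
/// ```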
pub struct GpuReadbackBelt {
/// Minimum size for new buffers.
chunk_size: u64,
/// Chunks for which the GPU writes are scheduled, but we haven't mapped them yet.
active_chunks: Vec<Chunk>,
/// Chunks that have been unmapped and are ready for writing by the GPU.
free_chunks: Vec<Chunk>,
/// Chunks that are currently mapped and ready for reading by the CPU.
received_chunks: Vec<Chunk>,
/// When a chunk is mapped successfully, it is sent through this sender so it can be read by the CPU.
sender: mpsc::Sender<Chunk>,
/// Chunks received here are ready to be read by the CPU.
receiver: mpsc::Receiver<Chunk>,
/// Current frame index, used for keeping track of how old chunks are.
frame_index: u64,
}
impl GpuReadbackBelt {
/// All allocations of this allocator will be aligned to at least this size.
///
/// Buffer mappings however are currently NOT guaranteed to be aligned to this size!
/// See this issue on [Alignment guarantees for mapped buffers](https://github.com/gfx-rs/wgpu/issues/3508).
const MIN_ALIGNMENT: u64 = wgpu::MAP_ALIGNMENT; //wgpu::COPY_BUFFER_ALIGNMENT.max(wgpu::MAP_ALIGNMENT);
/// Create a ring buffer for efficient & easy GPU memory readback.
///
/// The `chunk_size` is the unit of internal buffer allocation. Reads will be
/// sub-allocated within each chunk. Therefore, for optimal use of memory, the
/// chunk size should be:
///
/// * larger than the largest single [`GpuReadbackBelt::allocate`] operation;
/// * 1-4 times less than the total amount of data read back per submission;
/// * bigger is better, within these bounds.
///
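/// For example, if the largest expected readback is a full-resolution screenshot, the chunk size
/// might be derived from it (a sketch; `screen_extents` is an assumed caller-side value):
///
/// ```ignore
/// let screenshot_size =
///     Texture2DBufferInfo::new(wgpu::TextureFormat::Rgba8Unorm, screen_extents).buffer_size_padded;
/// let belt = GpuReadbackBelt::new(wgpu::BufferSize::new(screenshot_size).unwrap());
/// ```
///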
/// TODO(andreas): Adaptive chunk sizes
/// TODO(andreas): Shrinking after usage spikes (e.g. screenshots of different sizes!)
pub fn new(chunk_size: wgpu::BufferSize) -> Self {
let (sender, receiver) = mpsc::channel();
Self {
chunk_size: wgpu::util::align_to(chunk_size.get(), Self::MIN_ALIGNMENT),
active_chunks: Vec::new(),
free_chunks: Vec::new(),
received_chunks: Vec::new(),
sender,
receiver,
frame_index: 0,
}
}
/// Allocates a GPU-writable & CPU-readable buffer of the given size.
pub fn allocate(
&mut self,
device: &wgpu::Device,
buffer_pool: &GpuBufferPool,
size_in_bytes: wgpu::BufferAddress,
identifier: GpuReadbackIdentifier,
user_data: GpuReadbackUserDataStorage,
) -> GpuReadbackBuffer {
re_tracing::profile_function!();
debug_assert!(size_in_bytes > 0, "Cannot allocate zero-sized buffer");
let size_in_bytes = wgpu::util::align_to(size_in_bytes, Self::MIN_ALIGNMENT);
// Try to find space in any of the active chunks first.
let mut chunk = if let Some(index) = self
.active_chunks
.iter_mut()
.position(|chunk| chunk.remaining_capacity() >= size_in_bytes)
{
self.active_chunks.swap_remove(index)
} else {
// Use a free chunk if possible, fall back to creating a new one if necessary.
if let Some(index) = self
.free_chunks
.iter()
.position(|chunk| chunk.remaining_capacity() >= size_in_bytes)
{
self.free_chunks.swap_remove(index)
} else {
// Allocation might be bigger than a chunk!
let buffer_size = self.chunk_size.max(size_in_bytes);
re_log::trace!(
"Allocating new GpuReadbackBelt chunk of size {:.1} MiB",
buffer_size as f32 / (1024.0 * 1024.0)
);
let buffer = buffer_pool.alloc(
device,
&BufferDesc {
label: "GpuReadbackBelt chunk buffer".into(),
size: buffer_size,
usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
},
);
Chunk {
buffer,
unused_offset: 0,
ranges_in_use: Vec::new(),
last_received_frame_index: u64::MAX,
}
}
};
let buffer_slice = chunk.allocate(size_in_bytes, identifier, user_data);
self.active_chunks.push(chunk);
buffer_slice
}
/// Prepare used buffers for CPU read.
///
/// This should be called after the command encoder(s) used in [`GpuReadbackBuffer`] copy operations are submitted.
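///
/// A sketch of the expected ordering (`queue`, `encoder` and `belt` are assumed to exist):
///
/// ```ignore
/// queue.submit(Some(encoder.finish()));
/// belt.after_queue_submit(); // Now the chunks used this submission can be mapped for reading.
/// ```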
pub fn after_queue_submit(&mut self) {
re_tracing::profile_function!();
for chunk in self.active_chunks.drain(..) {
let sender = self.sender.clone();
chunk.buffer.clone().slice(..chunk.unused_offset).map_async(
wgpu::MapMode::Read,
move |result| {
if result.is_err() {
// This should never happen. Drop the chunk and report.
re_log::error_once!("Failed to map staging buffer for reading");
} else {
let _ = sender.send(chunk);
}
},
);
}
}
/// Should be called at the beginning of a new frame.
///
/// Discards stale data that hasn't been received by [`GpuReadbackBelt::readback_data`] for more than a frame.
pub fn begin_frame(&mut self, frame_index: u64) {
// Make sure each frame has at least one `receive_chunks` call before it ends (from the pov of the readback belt).
// It's important to do this before bumping the frame index, because we want to mark all these chunks as "old"
// chunks that were available for the previous frame.
// (A user could have triggered this just before beginning the frame via `readback_data`, or not have triggered it at all.)
self.receive_chunks();
self.frame_index = frame_index;
// Kill off all stale chunks.
// Note that this is unfortunate but not _really_ a user bug, as it can happen very easily:
// For example, if a picking operation is scheduled on a view that is closed immediately afterwards!
// TODO(andreas): just use `Vec::drain_filter` once it goes stable.
let (discarded, retained) = self.received_chunks.drain(..).partition(|chunk| {
// If the chunk was received last frame, it is too early to discard it; we need to wait one more frame.
// Imagine it was received just at the end of that frame - the user would have had no chance yet
// to retrieve it in the code they run at the beginning of a frame.
chunk.last_received_frame_index + 1 < self.frame_index
});
self.received_chunks = retained;
for chunk in discarded {
re_log::trace!(
"Unread data from a GpuReadbackBelt was discarded. {} ranges remained unread.",
chunk.ranges_in_use.len()
);
self.reuse_chunk(chunk);
}
}
/// Try to receive a pending data readback with the given identifier and the given user data type.
///
/// Returns the oldest received data with the given identifier & type.
/// This is *almost certainly* also the oldest scheduled readback. But there is no *strict* guarantee for this.
/// It could in theory happen that a readback is scheduled after a previous one, but finishes before it!
///
/// ATTENTION: Do NOT assume any alignment on the slice passed to `callback`.
/// See this issue on [Alignment guarantees for mapped buffers](https://github.com/gfx-rs/wgpu/issues/3508).
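///
/// A sketch of retrieving previously scheduled data, assuming the hypothetical `Screenshot`
/// user data type and `READBACK_ID` identifier from the example on [`GpuReadbackBelt`]:
///
/// ```ignore
/// belt.readback_data::<Screenshot>(READBACK_ID, |data, screenshot| {
///     // The slice is only valid inside the callback; copy it out if it needs to outlive it.
///     let pixels = data.to_vec();
/// });
/// ```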
pub fn readback_data<UserDataType: 'static>(
&mut self,
identifier: GpuReadbackIdentifier,
callback: impl FnOnce(&[u8], Box<UserDataType>),
) {
re_tracing::profile_function!();
self.receive_chunks();
// Search for the user data in the readback chunks.
// A linear search is best suited since we expect both the number of pending chunks (typically just one or two!)
// and the number of readbacks per chunk to be small!
// Also note that identifiers may not be unique!
for (chunk_index, chunk) in self.received_chunks.iter_mut().enumerate() {
for (range_index, range) in chunk.ranges_in_use.iter().enumerate() {
if range.identifier != identifier || !range.user_data.is::<UserDataType>() {
continue;
}
{
let range = chunk.ranges_in_use.swap_remove(range_index);
let slice = chunk.buffer.slice(range.buffer_range.clone());
let data = slice.get_mapped_range();
callback(&data, range.user_data.downcast::<UserDataType>().unwrap());
}
// If this was the last range from this chunk, the chunk is ready for re-use!
if chunk.ranges_in_use.is_empty() {
let chunk = self.received_chunks.swap_remove(chunk_index);
self.reuse_chunk(chunk);
}
return;
}
}
}
/// Check if any new chunks are ready to be read.
fn receive_chunks(&mut self) {
while let Ok(mut chunk) = self.receiver.try_recv() {
chunk.last_received_frame_index = self.frame_index;
self.received_chunks.push(chunk);
}
}
fn reuse_chunk(&mut self, mut chunk: Chunk) {
chunk.buffer.unmap();
chunk.ranges_in_use.clear();
chunk.unused_offset = 0;
self.free_chunks.push(chunk);
}
}
impl std::fmt::Debug for GpuReadbackBelt {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("GpuReadbackBelt")
.field("chunk_size", &self.chunk_size)
.field("active_chunks", &self.active_chunks.len())
.field("free_chunks", &self.free_chunks.len())
.finish_non_exhaustive()
}
}