re_sdk::external::arrow::row

Struct RowConverter

pub struct RowConverter {
    pub(crate) fields: Arc<[SortField]>,
    pub(crate) codecs: Vec<Codec>,
}

Expand description

Converts ArrayRef columns into a row-oriented format.

Note: The encoding of the row format may change from release to release.

§Overview

The row format is a variable length byte sequence created by concatenating the encoded form of each column. The encoding for each column depends on its datatype (and sort options).

The encoding is carefully designed in such a way that escaping is unnecessary: it is never ambiguous as to whether a byte is part of a sentinel (e.g. null) or a value.

§Unsigned Integer Encoding

A null integer is encoded as a 0_u8, followed by as many zeroed bytes as the integer’s width.

A valid integer is encoded as 1_u8, followed by the big-endian representation of the integer.

              ┌──┬──┬──┬──┐      ┌──┬──┬──┬──┬──┐
   3          │03│00│00│00│      │01│00│00│00│03│
              └──┴──┴──┴──┘      └──┴──┴──┴──┴──┘
              ┌──┬──┬──┬──┐      ┌──┬──┬──┬──┬──┐
  258         │02│01│00│00│      │01│00│00│01│02│
              └──┴──┴──┴──┘      └──┴──┴──┴──┴──┘
              ┌──┬──┬──┬──┐      ┌──┬──┬──┬──┬──┐
 23423        │7F│5B│00│00│      │01│00│00│5B│7F│
              └──┴──┴──┴──┘      └──┴──┴──┴──┴──┘
              ┌──┬──┬──┬──┐      ┌──┬──┬──┬──┬──┐
 NULL         │??│??│??│??│      │00│00│00│00│00│
              └──┴──┴──┴──┘      └──┴──┴──┴──┴──┘

             32-bit (4 bytes)        Row Format
 Value        Little Endian

§Signed Integer Encoding

Signed integers have their most significant sign bit flipped, and are then encoded in the same manner as an unsigned integer.

       ┌──┬──┬──┬──┐       ┌──┬──┬──┬──┐       ┌──┬──┬──┬──┬──┐
    5  │05│00│00│00│       │05│00│00│80│       │01│80│00│00│05│
       └──┴──┴──┴──┘       └──┴──┴──┴──┘       └──┴──┴──┴──┴──┘
       ┌──┬──┬──┬──┐       ┌──┬──┬──┬──┐       ┌──┬──┬──┬──┬──┐
   -5  │FB│FF│FF│FF│       │FB│FF│FF│7F│       │01│7F│FF│FF│FB│
       └──┴──┴──┴──┘       └──┴──┴──┴──┘       └──┴──┴──┴──┴──┘

 Value  32-bit (4 bytes)    High bit flipped      Row Format
         Little Endian

§Float Encoding

Floats are converted from their IEEE 754 representation to a signed integer representation by flipping all bits except the sign bit if they are negative.

They are then encoded in the same manner as a signed integer.

§Fixed Length Bytes Encoding

Fixed length bytes are encoded in the same fashion as primitive types above.

For a fixed length array of length n:

A null is encoded as 0_u8 null sentinel followed by n 0_u8 bytes

A valid value is encoded as 1_u8 followed by the value bytes

§Variable Length Bytes (including Strings) Encoding

A null is encoded as a 0_u8.

An empty byte array is encoded as 1_u8.

A non-null, non-empty byte array is encoded as 2_u8 followed by the byte array encoded using a block based scheme described below.

The byte array is broken up into fixed-width blocks; each block is written in turn to the output, followed by 0xFF_u8. The final block is padded to 32 bytes with 0_u8 and written to the output, followed by the un-padded length in bytes of this final block as a u8. The first 4 blocks have a length of 8, with subsequent blocks using a length of 32; this is to reduce space amplification for small strings.

Note the following example encodings use a block size of 4 bytes for brevity:

                      ┌───┬───┬───┬───┬───┬───┐
 "MEEP"               │02 │'M'│'E'│'E'│'P'│04 │
                      └───┴───┴───┴───┴───┴───┘

                      ┌───┐
 ""                   │01 │
                      └───┘

 NULL                 ┌───┐
                      │00 │
                      └───┘

"Defenestration"      ┌───┬───┬───┬───┬───┬───┐
                      │02 │'D'│'e'│'f'│'e'│FF │
                      └───┼───┼───┼───┼───┼───┤
                          │'n'│'e'│'s'│'t'│FF │
                          ├───┼───┼───┼───┼───┤
                          │'r'│'a'│'t'│'i'│FF │
                          ├───┼───┼───┼───┼───┤
                          │'o'│'n'│00 │00 │02 │
                          └───┴───┴───┴───┴───┘

This approach is loosely inspired by COBS encoding, and chosen over more traditional byte stuffing as it is more amenable to vectorisation, in particular AVX-256.

§Dictionary Encoding

Dictionaries are hydrated to their underlying values.

§Struct Encoding

A null is encoded as a 0_u8.

A valid value is encoded as 1_u8 followed by the row encoding of each child.

This encoding effectively flattens the schema in a depth-first fashion.

For example

┌───────┬────────────────────────┬───────┐
│ Int32 │ Struct[Int32, Float32] │ Int32 │
└───────┴────────────────────────┴───────┘

Is encoded as

┌───────┬───────────────┬───────┬─────────┬───────┐
│ Int32 │ Null Sentinel │ Int32 │ Float32 │ Int32 │
└───────┴───────────────┴───────┴─────────┴───────┘

§List Encoding

Lists are encoded by first encoding all child elements to the row format.

A list value is then encoded as the concatenation of each of the child elements, separately encoded using the variable length encoding described above, followed by the variable length encoding of an empty byte array.

For example given:

[1_u8, 2_u8, 3_u8]
[1_u8, null]
[]
null

The elements would be converted to:

    ┌──┬──┐     ┌──┬──┐     ┌──┬──┐     ┌──┬──┐        ┌──┬──┐
 1  │01│01│  2  │01│02│  3  │01│03│  1  │01│01│  null  │00│00│
    └──┴──┘     └──┴──┘     └──┴──┘     └──┴──┘        └──┴──┘

Which would be encoded as

                        ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┐
 [1_u8, 2_u8, 3_u8]     │02│01│01│00│00│02│02│01│02│00│00│02│02│01│03│00│00│02│01│
                        └──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┘
                         └──── 1_u8 ────┘   └──── 2_u8 ────┘  └──── 3_u8 ────┘

                        ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┐
 [1_u8, null]           │02│01│01│00│00│02│02│00│00│00│00│02│01│
                        └──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┘
                         └──── 1_u8 ────┘   └──── null ────┘

With [] represented by an empty byte array, and null a null byte array.

§Ordering

§Float Ordering

Floats are totally ordered in accordance with the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard.

The ordering established by this does not always agree with the PartialOrd and PartialEq implementations of f32. For example, they consider negative and positive zero equal, while this ordering does not.

§Null Ordering

The encoding described above will order nulls first; this can be inverted by representing nulls as 0xFF_u8 instead of 0_u8.

§Reverse Column Ordering

The order of a given column can be reversed by negating the encoded bytes of non-null values.

Fields§

§fields: Arc<[SortField]>

§codecs: Vec<Codec>

Implementations§

§

impl RowConverter

pub fn new(fields: Vec<SortField>) -> Result<RowConverter, ArrowError>

Create a new RowConverter with the provided schema

pub fn supports_fields(fields: &[SortField]) -> bool

Check if the given fields are supported by the row format.

pub fn convert_columns( &self, columns: &[Arc<dyn Array>], ) -> Result<Rows, ArrowError>

Convert ArrayRef columns into Rows

See Row for information on when Row can be compared

§Panics

Panics if the schema of columns does not match that provided to RowConverter::new

pub fn append( &self, rows: &mut Rows, columns: &[Arc<dyn Array>], ) -> Result<(), ArrowError>

Convert ArrayRef columns appending to an existing Rows

See Row for information on when Row can be compared

§Panics

Panics if

The schema of columns does not match that provided to RowConverter::new
The provided Rows were not created by this RowConverter

let converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap();
let a1 = StringArray::from(vec!["hello", "world"]);
let a2 = StringArray::from(vec!["a", "a", "hello"]);

let mut rows = converter.empty_rows(5, 128);
converter.append(&mut rows, &[Arc::new(a1)]).unwrap();
converter.append(&mut rows, &[Arc::new(a2)]).unwrap();

let back = converter.convert_rows(&rows).unwrap();
let values: Vec<_> = back[0].as_string::<i32>().iter().map(Option::unwrap).collect();
assert_eq!(&values, &["hello", "world", "a", "a", "hello"]);

pub fn convert_rows<'a, I>( &self, rows: I, ) -> Result<Vec<Arc<dyn Array>>, ArrowError>
where I: IntoIterator<Item = Row<'a>>,

Convert Rows back into ArrayRef columns

§Panics

Panics if the rows were not produced by this RowConverter

pub fn empty_rows(&self, row_capacity: usize, data_capacity: usize) -> Rows

Returns an empty Rows with capacity for row_capacity rows with a total length of data_capacity

This can be used to buffer a selection of Row

let converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap();
let array = StringArray::from(vec!["hello", "world", "a", "a", "hello"]);

// Convert to row format and deduplicate
let converted = converter.convert_columns(&[Arc::new(array)]).unwrap();
let mut distinct_rows = converter.empty_rows(3, 100);
let mut dedup: HashSet<Row> = HashSet::with_capacity(3);
converted.iter().filter(|row| dedup.insert(*row)).for_each(|row| distinct_rows.push(row));

// Note: we could skip buffering and feed the filtered iterator directly
// into convert_rows, this is done for demonstration purposes only
let distinct = converter.convert_rows(&distinct_rows).unwrap();
let values: Vec<_> = distinct[0].as_string::<i32>().iter().map(Option::unwrap).collect();
assert_eq!(&values, &["hello", "world", "a"]);

pub fn from_binary( &self, array: GenericByteArray<GenericBinaryType<i32>>, ) -> Rows

Create a new Rows instance from the given binary data.

let converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap();
let array = StringArray::from(vec!["hello", "world", "a", "a", "hello"]);
let rows = converter.convert_columns(&[Arc::new(array)]).unwrap();

// We can convert rows into binary format and back in batch.
let values: Vec<OwnedRow> = rows.iter().map(|r| r.owned()).collect();
let binary = rows.try_into_binary().expect("known-small array");
let converted = converter.from_binary(binary.clone());
assert!(converted.iter().eq(values.iter().map(|r| r.row())));

§Panics

This function expects the passed BinaryArray to contain valid row data as produced by this RowConverter. It will panic if any rows are null. Operations on the returned Rows may panic if the data is malformed.

pub fn parser(&self) -> RowParser

Returns a RowParser that can be used to parse Row from bytes

pub fn size(&self) -> usize

Returns the size of this instance in bytes

Includes the size of Self.

Trait Implementations§

§

impl Debug for RowConverter

§

fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error>

Formats the value using the given formatter. Read more

Auto Trait Implementations§

§

Struct RowConverter

Copy item path

§Overview

§Unsigned Integer Encoding

§Signed Integer Encoding

§Float Encoding

§Fixed Length Bytes Encoding

§Variable Length Bytes (including Strings) Encoding

§Dictionary Encoding

§Struct Encoding

§List Encoding

§Ordering

§Float Ordering

§Null Ordering

§Reverse Column Ordering

Fields§

Implementations§

impl RowConverter

pub fn new(fields: Vec<SortField>) -> Result<RowConverter, ArrowError>

pub fn supports_fields(fields: &[SortField]) -> bool

pub fn convert_columns( &self, columns: &[Arc<dyn Array>], ) -> Result<Rows, ArrowError>

§Panics

pub fn append( &self, rows: &mut Rows, columns: &[Arc<dyn Array>], ) -> Result<(), ArrowError>

§Panics

pub fn convert_rows<'a, I>( &self, rows: I, ) -> Result<Vec<Arc<dyn Array>>, ArrowError> where I: IntoIterator<Item = Row<'a>>,

§Panics

pub fn empty_rows(&self, row_capacity: usize, data_capacity: usize) -> Rows

pub fn from_binary( &self, array: GenericByteArray<GenericBinaryType<i32>>, ) -> Rows

§Panics

pub fn parser(&self) -> RowParser

pub fn size(&self) -> usize

Trait Implementations§

impl Debug for RowConverter

fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error>

Auto Trait Implementations§

impl Freeze for RowConverter

impl RefUnwindSafe for RowConverter

impl Send for RowConverter

impl Sync for RowConverter

impl Unpin for RowConverter

impl UnwindSafe for RowConverter

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Az for T

fn az<Dst>(self) -> Dst where T: Cast<Dst>,

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<Src, Dst> CastFrom<Src> for Dst where Src: Cast<Dst>,

fn cast_from(src: Src) -> Dst

impl<T> CheckedAs for T

fn checked_as<Dst>(self) -> Option<Dst> where T: CheckedCast<Dst>,

impl<Src, Dst> CheckedCastFrom<Src> for Dst where Src: CheckedCast<Dst>,

fn checked_cast_from(src: Src) -> Option<Dst>

impl<T> Downcast for T where T: Any,

fn into_any(self: Box<T>) -> Box<dyn Any>

fn into_any_rc(self: Rc<T>) -> Rc<dyn Any>

fn as_any(&self) -> &(dyn Any + 'static)

fn as_any_mut(&mut self) -> &mut (dyn Any + 'static)

impl<T> DowncastSync for T where T: Any + Send + Sync,

fn into_any_arc(self: Arc<T>) -> Arc<dyn Any + Sync + Send>

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> IntoRequest<T> for T

fn into_request(self) -> Request<T>

impl<Src, Dst> LosslessTryInto<Dst> for Src where Dst: LosslessTryFrom<Src>,

fn lossless_try_into(self) -> Option<Dst>

impl<Src, Dst> LossyInto<Dst> for Src where Dst: LossyFrom<Src>,

fn lossy_into(self) -> Dst

impl<T> OverflowingAs for T

fn overflowing_as<Dst>(self) -> (Dst, bool) where T: OverflowingCast<Dst>,

Struct RowConverter

pub fn convert_rows<'a, I>( &self, rows: I, ) -> Result<Vec<Arc<dyn Array>>, ArrowError>
where I: IntoIterator<Item = Row<'a>>,

impl<T> Any for T
where T: 'static + ?Sized,

fn az<Dst>(self) -> Dst
where T: Cast<Dst>,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<Src, Dst> CastFrom<Src> for Dst
where Src: Cast<Dst>,

fn checked_as<Dst>(self) -> Option<Dst>
where T: CheckedCast<Dst>,

impl<Src, Dst> CheckedCastFrom<Src> for Dst
where Src: CheckedCast<Dst>,

impl<T> Downcast for T
where T: Any,

impl<T> DowncastSync for T
where T: Any + Send + Sync,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<Src, Dst> LosslessTryInto<Dst> for Src
where Dst: LosslessTryFrom<Src>,

impl<Src, Dst> LossyInto<Dst> for Src
where Dst: LossyFrom<Src>,

fn overflowing_as<Dst>(self) -> (Dst, bool)
where T: OverflowingCast<Dst>,

impl<Src, Dst> OverflowingCastFrom<Src> for Dst
where Src: OverflowingCast<Dst>,

fn saturating_as<Dst>(self) -> Dst
where T: SaturatingCast<Dst>,

impl<Src, Dst> SaturatingCastFrom<Src> for Dst
where Src: SaturatingCast<Dst>,

impl<T> To for T
where T: ?Sized,

fn to<T>(self) -> T
where Self: Into<T>,

fn try_to<T>(self) -> Result<T, Self::Error>
where Self: TryInto<T>,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

fn unwrapped_as<Dst>(self) -> Dst
where T: UnwrappedCast<Dst>,

impl<Src, Dst> UnwrappedCastFrom<Src> for Dst
where Src: UnwrappedCast<Dst>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

fn wrapping_as<Dst>(self) -> Dst
where T: WrappingCast<Dst>,

impl<Src, Dst> WrappingCastFrom<Src> for Dst
where Src: WrappingCast<Dst>,

impl<T> Allocation for T
where T: RefUnwindSafe + Send + Sync,

impl<T> ErasedDestructor for T
where T: 'static,

impl<T> Ungil for T
where T: Send,