From 950b792cf1640dc312972d12bae1eab80aad2133 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 1 Oct 2024 15:23:19 +0200 Subject: [PATCH 1/2] =?UTF-8?q?Store=20the=20list=20of=20updated=20IDs=20d?= =?UTF-8?q?irectly=20in=20LMDB=E2=80=AFinstead=20of=20a=20roaring=20bitmap?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/error.rs | 1 + src/key.rs | 11 +++++-- src/node_id.rs | 24 ++++++++++---- src/reader.rs | 10 ++++-- src/tests/mod.rs | 1 + src/writer.rs | 84 +++++++++++++++++++++++------------------------- 6 files changed, 76 insertions(+), 55 deletions(-) diff --git a/src/error.rs b/src/error.rs index fe0935c2..7c4824b6 100644 --- a/src/error.rs +++ b/src/error.rs @@ -75,6 +75,7 @@ impl Error { NodeMode::Item => "Item", NodeMode::Tree => "Tree", NodeMode::Metadata => "Metadata", + NodeMode::Updated => todo!(), }, item: key.node.item, } diff --git a/src/key.rs b/src/key.rs index bad953df..a5eb5658 100644 --- a/src/key.rs +++ b/src/key.rs @@ -7,12 +7,13 @@ use heed::BoxedError; use crate::{NodeId, NodeMode}; /// This whole structure must fit in an u64 so we can tell LMDB to optimize its storage. -/// The `prefix` is specified by the user and is used to differentiate between multiple arroy indexes. +/// The `index` is specified by the user and is used to differentiate between multiple arroy indexes. /// The `mode` indicates what we're looking at. /// The `item` point to a specific node. /// If the mode is: /// - `Item`: we're looking at a `Leaf` node. /// - `Tree`: we're looking at one of the internal generated node from arroy. Could be a descendants or a split plane. +/// - `Updated`: The list of items that has been updated since the last build of the database. /// - `Metadata`: There is only one item at `0` that contains the header required to read the index. #[derive(Debug, Copy, Clone)] pub struct Key { @@ -32,8 +33,8 @@ impl Key { Self::new(index, NodeId::metadata()) } - pub const fn updated(index: u16) -> Self { - Self::new(index, NodeId::updated()) + pub const fn updated(index: u16, item: u32) -> Self { + Self::new(index, NodeId::updated(item)) } pub const fn item(index: u16, item: u32) -> Self { @@ -98,6 +99,10 @@ impl Prefix { pub const fn tree(index: u16) -> Self { Self { index, mode: Some(NodeMode::Tree) } } + + pub const fn updated(index: u16) -> Self { + Self { index, mode: Some(NodeMode::Updated) } + } } pub enum PrefixCodec {} diff --git a/src/node_id.rs b/src/node_id.rs index fcd88528..9e81c9cf 100644 --- a/src/node_id.rs +++ b/src/node_id.rs @@ -9,9 +9,15 @@ use crate::ItemId; #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] #[repr(u8)] pub enum NodeMode { + /// Stores the metadata under the `ItemId` 0 Metadata = 0, - Tree = 1, - Item = 2, + /// Stores the list of all the `ItemId` that have been updated. + /// We only stores `Unit` values under the keys. + Updated = 1, + /// The tree nodes are stored under this id. + Tree = 2, + /// The original vectors are stored under this id in `Leaf` structures. + Item = 3, } impl TryFrom for NodeMode { @@ -21,6 +27,7 @@ impl TryFrom for NodeMode { match v { v if v == NodeMode::Item as u8 => Ok(NodeMode::Item), v if v == NodeMode::Tree as u8 => Ok(NodeMode::Tree), + v if v == NodeMode::Updated as u8 => Ok(NodeMode::Updated), v if v == NodeMode::Metadata as u8 => Ok(NodeMode::Metadata), v => Err(format!("Could not convert {v} as a `NodeMode`.")), } @@ -47,8 +54,8 @@ impl NodeId { Self { mode: NodeMode::Metadata, item: 0 } } - pub const fn updated() -> Self { - Self { mode: NodeMode::Metadata, item: 1 } + pub const fn updated(item: u32) -> Self { + Self { mode: NodeMode::Updated, item } } pub const fn tree(item: u32) -> Self { @@ -107,11 +114,16 @@ mod test { assert!(NodeId::tree(1) > NodeId::tree(0)); assert!(NodeId::tree(0) < NodeId::tree(1)); + assert!(NodeId::updated(0) == NodeId::updated(0)); + assert!(NodeId::updated(1) > NodeId::updated(0)); + assert!(NodeId::updated(0) < NodeId::updated(1)); + // tree < item whatever is the value assert!(NodeId::tree(u32::MAX) < NodeId::item(0)); assert!(NodeId::metadata() == NodeId::metadata()); - assert!(NodeId::metadata() < NodeId::tree(u32::MAX)); - assert!(NodeId::metadata() < NodeId::item(u32::MAX)); + assert!(NodeId::metadata() < NodeId::tree(u32::MIN)); + assert!(NodeId::metadata() < NodeId::updated(u32::MIN)); + assert!(NodeId::metadata() < NodeId::item(u32::MIN)); } } diff --git a/src/reader.rs b/src/reader.rs index b4586d64..4f5536d6 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -4,7 +4,7 @@ use std::iter::repeat; use std::marker; use std::num::NonZeroUsize; -use heed::types::{Bytes, DecodeIgnore}; +use heed::types::DecodeIgnore; use heed::RoTxn; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; @@ -146,7 +146,13 @@ impl<'t, D: Distance> Reader<'t, D> { received: D::name(), }); } - if database.remap_data_type::().get(rtxn, &Key::updated(index))?.is_some() { + if database + .remap_types::() + .prefix_iter(rtxn, &Prefix::updated(index))? + .remap_key_type::() + .next() + .is_some() + { return Err(Error::NeedBuild(index)); } diff --git a/src/tests/mod.rs b/src/tests/mod.rs index 0f041885..42296f85 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -80,6 +80,7 @@ impl fmt::Display for DatabaseHandle { NodeMode::Metadata => { panic!() } + NodeMode::Updated => todo!(), } } diff --git a/src/writer.rs b/src/writer.rs index 23ec9a6e..1813b0ac 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -3,7 +3,7 @@ use std::borrow::Cow; use std::mem; use std::path::PathBuf; -use heed::types::{Bytes, DecodeIgnore}; +use heed::types::{Bytes, DecodeIgnore, Unit}; use heed::{MdbError, PutFlags, RoTxn, RwTxn}; use rand::{Rng, SeedableRng}; use rayon::iter::repeatn; @@ -20,7 +20,6 @@ use crate::parallel::{ TmpNodesReader, }; use crate::reader::item_leaf; -use crate::roaring::RoaringBitmapCodec; use crate::unaligned_vector::UnalignedVector; use crate::{ Database, Error, ItemId, Key, Metadata, MetadataCodec, Node, NodeCodec, NodeId, Prefix, @@ -224,8 +223,10 @@ impl Writer { pub fn need_build(&self, rtxn: &RoTxn) -> Result { Ok(self .database - .remap_data_type::() - .get(rtxn, &Key::updated(self.index))? + .remap_types::() + .prefix_iter(rtxn, &Prefix::updated(self.index))? + .remap_key_type::() + .next() .is_some() || self .database @@ -266,17 +267,7 @@ impl Writer { let vector = UnalignedVector::from_slice(vector); let leaf = Leaf { header: D::new_header(&vector), vector }; self.database.put(wtxn, &Key::item(self.index, item), &Node::Leaf(leaf))?; - let mut updated = self - .database - .remap_data_type::() - .get(wtxn, &Key::updated(self.index))? - .unwrap_or_default(); - updated.insert(item); - self.database.remap_data_type::().put( - wtxn, - &Key::updated(self.index), - &updated, - )?; + self.database.remap_data_type::().put(wtxn, &Key::updated(self.index, item), &())?; Ok(()) } @@ -302,18 +293,8 @@ impl Writer { Err(heed::Error::Mdb(MdbError::KeyExist)) => return Err(Error::InvalidItemAppend), Err(e) => return Err(e.into()), } - let mut updated = self - .database - .remap_data_type::() - .get(wtxn, &Key::updated(self.index))? - .unwrap_or_default(); - // We cannot append here because we may have removed an item with a larger id before - updated.insert(item); - self.database.remap_data_type::().put( - wtxn, - &Key::updated(self.index), - &updated, - )?; + // We cannot append here because the items appear after the updated keys + self.database.remap_data_type::().put(wtxn, &Key::updated(self.index, item), &())?; Ok(()) } @@ -321,16 +302,10 @@ impl Writer { /// Deletes an item stored in this database and returns `true` if it existed. pub fn del_item(&self, wtxn: &mut RwTxn, item: ItemId) -> Result { if self.database.delete(wtxn, &Key::item(self.index, item))? { - let mut updated = self - .database - .remap_data_type::() - .get(wtxn, &Key::updated(self.index))? - .unwrap_or_default(); - updated.insert(item); - self.database.remap_data_type::().put( + self.database.remap_data_type::().put( wtxn, - &Key::updated(self.index), - &updated, + &Key::updated(self.index, item), + &(), )?; Ok(true) @@ -430,7 +405,18 @@ impl Writer { } log::debug!("reset the updated items..."); - self.database.delete(wtxn, &Key::updated(self.index))?; + let mut updated_iter = self + .database + .remap_types::() + .prefix_iter_mut(wtxn, &Prefix::updated(self.index))? + .remap_key_type::(); + while updated_iter.next().transpose()?.is_some() { + // Safe because we don't hold any reference to the database currently + unsafe { + updated_iter.del_current()?; + } + } + drop(updated_iter); log::debug!("write the metadata..."); let metadata = Metadata { @@ -448,11 +434,23 @@ impl Writer { return Ok(()); } - let updated_items = self + log::debug!("reset and retrieve the updated items..."); + let mut updated_items = RoaringBitmap::new(); + let mut updated_iter = self .database - .remap_data_type::() - .get(wtxn, &Key::updated(self.index))? - .unwrap_or_default(); + .remap_types::() + .prefix_iter_mut(wtxn, &Prefix::updated(self.index))? + .remap_key_type::(); + while let Some((key, _)) = updated_iter.next().transpose()? { + let inserted = updated_items.push(key.node.item); + debug_assert!(inserted, "The keys should be sorted by LMDB"); + // Safe because we don't hold any reference to the database currently + unsafe { + updated_iter.del_current()?; + } + } + drop(updated_iter); + // while iterating on the nodes we want to delete all the modified element even if they are being inserted right after. let to_delete = &updated_items; let to_insert = &item_indices & &updated_items; @@ -548,9 +546,6 @@ impl Writer { roots.append(&mut thread_roots); } - log::debug!("reset the updated items..."); - self.database.delete(wtxn, &Key::updated(self.index))?; - log::debug!("write the metadata..."); let metadata = Metadata { dimensions: self.dimensions.try_into().unwrap(), @@ -774,6 +769,7 @@ impl Writer { } } NodeMode::Metadata => unreachable!(), + NodeMode::Updated => todo!(), } } From d9a5694ee073f8eb18cebe8615f87d2ef901cfcd Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 1 Oct 2024 15:50:37 +0200 Subject: [PATCH 2/2] change the todo! in unreachable! --- src/error.rs | 2 +- src/tests/mod.rs | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/error.rs b/src/error.rs index 7c4824b6..975b947c 100644 --- a/src/error.rs +++ b/src/error.rs @@ -75,7 +75,7 @@ impl Error { NodeMode::Item => "Item", NodeMode::Tree => "Tree", NodeMode::Metadata => "Metadata", - NodeMode::Updated => todo!(), + NodeMode::Updated => "Updated", }, item: key.node.item, } diff --git a/src/tests/mod.rs b/src/tests/mod.rs index 42296f85..bdcc3b6d 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -77,10 +77,7 @@ impl fmt::Display for DatabaseHandle { .unwrap(); writeln!(f, "updated_item_ids: {updated_item_ids:?}")?; } - NodeMode::Metadata => { - panic!() - } - NodeMode::Updated => todo!(), + NodeMode::Updated | NodeMode::Metadata => panic!(), } }