view rust/hg-pyo3/src/revlog/mod.rs @ 52795:adf91dfe6c04
rust-pyo3-index: _index_headrevs
This one demonstrates why the `with_index_read` and similar helpers
are useful, and was actually the main motivation for introducing
them: if we kept the borrow used to grab the index alive while
updating the caches, there would be a panic when calling
`borrow_mut`. This was confirmed by the Python test against an
earlier version of this change.

There are perhaps some internal API clarifications to be made (the
method updating the cache does a seemingly useless return), but we
are keeping it as it was in `hg-cpython`.
author    Georges Racinet <georges.racinet@cloudcrane.io>
date      Wed, 25 Dec 2024 19:06:59 +0100
parents   5ad4ed71fbe0
children  670ebb2f975a
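The borrow/`borrow_mut` issue mentioned in the description above can be illustrated outside of this module. Below is a minimal, hypothetical sketch: the `Demo` pyclass, its `with_data_read` helper and its `heads_cached` method are not part of Mercurial, and the sketch only assumes the PyO3 `Bound`/`PyRef` API already used by this file. A scoped read helper releases its `PyRef` before the caller takes the mutable borrow needed to update a cache, which is the shape of the `with_index_read` / `cache_new_heads_py_list` pair further down.

// Minimal sketch, not part of Mercurial: a hypothetical `Demo` pyclass
// showing why a scoped read helper avoids the `borrow_mut` panic.
use pyo3::prelude::*;

#[pyclass]
struct Demo {
    data: Vec<u32>,
    cache: Option<u32>,
}

impl Demo {
    /// In the spirit of `with_index_read`: take the shared borrow, run the
    /// closure, and release the `PyRef` when returning.
    fn with_data_read<T>(
        slf: &Bound<'_, Self>,
        f: impl FnOnce(&[u32]) -> PyResult<T>,
    ) -> PyResult<T> {
        let self_ref = slf.borrow();
        f(self_ref.data.as_slice())
        // `self_ref` is dropped here, so the caller may call `borrow_mut()`.
    }
}

#[pymethods]
impl Demo {
    #[new]
    fn new(data: Vec<u32>) -> Self {
        Self { data, cache: None }
    }

    fn heads_cached(slf: &Bound<'_, Self>) -> PyResult<u32> {
        // Compute under a scoped read borrow...
        let computed = Self::with_data_read(slf, |data| {
            Ok(data.iter().copied().max().unwrap_or(0))
        })?;
        // ...then take the mutable borrow to update the cache. Had the
        // `PyRef` from the computation above still been alive here, this
        // `borrow_mut()` would panic ("already borrowed").
        slf.borrow_mut().cache = Some(computed);
        Ok(computed)
    }
}

If `heads_cached` instead kept the `PyRef` returned by `slf.borrow()` across the cache update, the `borrow_mut()` call would panic, which is the failure the Python test caught against the earlier version.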
line source
// revlog.rs
//
// Copyright 2019-2020 Georges Racinet <georges.racinet@octobus.net>
//           2020-2024 Raphaël Gomès <raphael.gomes@octobus.net>
//           2024 Georges Racinet <georges.racinet@cloudcrane.io>
//
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2 or any later version.
#![allow(non_snake_case)]
use pyo3::buffer::PyBuffer;
use pyo3::conversion::IntoPyObject;
use pyo3::exceptions::{PyIndexError, PyTypeError};
use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyBytesMethods, PyList, PyTuple};
use pyo3_sharedref::{PyShareable, SharedByPyObject};

use std::collections::HashSet;
use std::sync::{
    atomic::{AtomicUsize, Ordering},
    RwLock, RwLockReadGuard, RwLockWriteGuard,
};

use hg::{
    revlog::{
        index::{Index, RevisionDataParams},
        inner_revlog::InnerRevlog as CoreInnerRevlog,
        nodemap::{NodeMap, NodeMapError, NodeTree as CoreNodeTree},
        options::RevlogOpenOptions,
        RevlogIndex, RevlogType,
    },
    utils::files::get_path_from_bytes,
    vfs::FnCacheVfs,
    BaseRevision, Revision, UncheckedRevision, NULL_REVISION,
};

use crate::{
    exceptions::{
        graph_error, map_lock_error, map_try_lock_error, nodemap_error,
        rev_not_in_index, revlog_error_bare, revlog_error_from_msg,
    },
    node::{node_from_py_bytes, node_prefix_from_py_bytes, py_node_for_rev},
    revision::{check_revision, rev_pyiter_collect, revs_py_list, PyRevision},
    store::PyFnCache,
    util::{new_submodule, take_buffer_with_slice},
};

mod config;
use config::*;
mod index;
use index::{
    py_tuple_to_revision_data_params, revision_data_params_to_py_tuple,
    PySharedIndex,
};

#[pyclass]
struct ReadingContextManager {
    inner_revlog: Py<InnerRevlog>,
}

#[pymethods]
impl ReadingContextManager {
    fn __enter__(slf: PyRef<'_, Self>) -> PyResult<()> {
        let inner_bound = slf.inner_revlog.bind(slf.py());
        let shareable = &inner_bound.borrow().irl;
        // Safety: the owner is correct and we won't use `share()` anyway
        let core_irl =
            unsafe { shareable.borrow_with_owner(inner_bound) }.read();
        core_irl
            .enter_reading_context()
            .map_err(revlog_error_from_msg)
            .inspect_err(|_e| {
                // `__exit__` is not called from Python if `__enter__` fails
                core_irl.exit_reading_context();
            })
    }

    #[pyo3(signature = (*_args))]
    fn __exit__(slf: PyRef<'_, Self>, _args: &Bound<'_, PyTuple>) {
        let inner_bound = slf.inner_revlog.bind(slf.py());
        let shareable = &inner_bound.borrow().irl;
        // Safety: the owner is correct and we won't use `share()` anyway
        let core_irl_ref =
            unsafe { shareable.borrow_with_owner(inner_bound) };
        core_irl_ref.read().exit_reading_context();
    }
}

#[pyclass]
#[allow(dead_code)]
struct InnerRevlog {
    irl: PyShareable<CoreInnerRevlog>,
    nt: RwLock<Option<CoreNodeTree>>,
    docket: Option<PyObject>,
    // Holds a reference to the mmap'ed persistent nodemap data
    nodemap_mmap: Option<PyBuffer<u8>>,
    // Holds a reference to the mmap'ed persistent index data
    index_mmap: Option<PyBuffer<u8>>,
    revision_cache: Option<PyObject>,
    head_revs_py_list: Option<Py<PyList>>,
    head_node_ids_py_list: Option<Py<PyList>>,
    use_persistent_nodemap: bool,
    nodemap_queries: AtomicUsize,
}

#[pymethods]
impl InnerRevlog {
    #[new]
    // The Python side has authority on this signature.
    #[allow(clippy::too_many_arguments)]
    fn new(
        vfs_base: &Bound<'_, PyBytes>,
        fncache: &Bound<'_, PyAny>,
        vfs_is_readonly: bool,
        index_data: &Bound<'_, PyAny>,
        index_file: &Bound<'_, PyBytes>,
        data_file: &Bound<'_, PyBytes>,
        sidedata_file: &Bound<'_, PyAny>,
        inline: bool,
        data_config: &Bound<'_, PyAny>,
        delta_config: &Bound<'_, PyAny>,
        feature_config: &Bound<'_, PyAny>,
        chunk_cache: &Bound<'_, PyAny>,
        default_compression_header: &Bound<'_, PyAny>,
        revlog_type: usize,
        use_persistent_nodemap: bool,
    ) -> PyResult<Self> {
        // Let clippy accept the unused arguments. This is a bit better than
        // a blank `allow` directive
        let _ = sidedata_file;
        let _ = chunk_cache;
        let _ = default_compression_header;

        let index_file =
            get_path_from_bytes(index_file.as_bytes()).to_owned();
        let data_file = get_path_from_bytes(data_file.as_bytes()).to_owned();
        let revlog_type = RevlogType::try_from(revlog_type)
            .map_err(revlog_error_from_msg)?;
        let data_config = extract_data_config(data_config, revlog_type)?;
        let delta_config = extract_delta_config(delta_config, revlog_type)?;
        let feature_config =
            extract_feature_config(feature_config, revlog_type)?;
        let options = RevlogOpenOptions::new(
            inline,
            data_config,
            delta_config,
            feature_config,
        );

        // Safety: we keep the buffer around inside the returned instance as
        // `index_mmap`
        let (buf, bytes) = unsafe { take_buffer_with_slice(index_data)? };
        let index = Index::new(bytes, options.index_header())
            .map_err(revlog_error_from_msg)?;

        let base = get_path_from_bytes(vfs_base.as_bytes()).to_owned();
        let core = CoreInnerRevlog::new(
            Box::new(FnCacheVfs::new(
                base,
                vfs_is_readonly,
                Box::new(PyFnCache::new(fncache.clone().unbind())),
            )),
            index,
            index_file,
            data_file,
            data_config,
            delta_config,
            feature_config,
        );
        Ok(Self {
            irl: core.into(),
            nt: None.into(),
            docket: None,
            nodemap_mmap: None,
            index_mmap: buf.into(),
            head_revs_py_list: None,
            head_node_ids_py_list: None,
            revision_cache: None,
            use_persistent_nodemap,
            nodemap_queries: AtomicUsize::new(0),
        })
    }

    fn reading(slf: &Bound<'_, Self>) -> PyResult<ReadingContextManager> {
        Ok(ReadingContextManager {
            inner_revlog: slf.clone().unbind(),
        })
    }

    //
    // -- forwarded index methods --
    //

    fn _index_get_rev(
        slf: &Bound<'_, Self>,
        node: &Bound<'_, PyBytes>,
    ) -> PyResult<Option<PyRevision>> {
        let node = node_from_py_bytes(node)?;

        // Do not rewrite this with `Self::with_index_nt_read`: it
        // unconditionally creates a volatile nodetree, and that is not the
        // intent here: the code below specifically avoids that.
        Self::with_core_read(slf, |self_ref, irl| {
            let idx = &irl.index;
            let prev_queries =
                self_ref.nodemap_queries.fetch_add(1, Ordering::Relaxed);
            // Filelogs have no persistent nodemaps and are often small,
            // use a brute force lookup from the end backwards. If there is
            // a very large filelog (automation file that changes every
            // commit etc.), it also seems to work quite well for all
            // measured purposes so far.
            if !self_ref.use_persistent_nodemap && prev_queries <= 3 {
                return Ok(idx
                    .rev_from_node_no_persistent_nodemap(node.into())
                    .ok()
                    .map(Into::into));
            }

            let opt =
                self_ref.get_nodetree(idx)?.read().map_err(map_lock_error)?;
            let nt = opt.as_ref().expect("nodetree should be set");

            let rust_rev =
                nt.find_bin(idx, node.into()).map_err(nodemap_error)?;
            Ok(rust_rev.map(Into::into))
        })
    }

    /// same as `_index_get_rev()` but raises a bare `error.RevlogError` if
    /// node is not found.
    ///
    /// No need to repeat `node` in the exception, `mercurial/revlog.py`
    /// will catch and rewrap with it
    fn _index_rev(
        slf: &Bound<'_, Self>,
        node: &Bound<'_, PyBytes>,
    ) -> PyResult<PyRevision> {
        Self::_index_get_rev(slf, node)?.ok_or_else(revlog_error_bare)
    }

    /// return True if the node exists in the index
    fn _index_has_node(
        slf: &Bound<'_, Self>,
        node: &Bound<'_, PyBytes>,
    ) -> PyResult<bool> {
        Self::_index_get_rev(slf, node).map(|opt| opt.is_some())
    }

    /// find length of shortest hex nodeid of a binary ID
    fn _index_shortest(
        slf: &Bound<'_, Self>,
        node: &Bound<'_, PyBytes>,
    ) -> PyResult<usize> {
        Self::with_index_nt_read(slf, |idx, nt| {
            match nt.unique_prefix_len_node(idx, &node_from_py_bytes(node)?) {
                Ok(Some(l)) => Ok(l),
                Ok(None) => Err(revlog_error_bare()),
                Err(e) => Err(nodemap_error(e)),
            }
        })
    }

    fn _index_partialmatch<'py>(
        slf: &Bound<'py, Self>,
        node: &Bound<'py, PyBytes>,
    ) -> PyResult<Option<Bound<'py, PyBytes>>> {
        Self::with_index_nt_read(slf, |idx, nt| {
            Ok(nt
                .find_bin(idx, node_prefix_from_py_bytes(node)?)
                .map_err(nodemap_error)?
                .map(|rev| py_node_for_rev(slf.py(), idx, rev)))
        })
    }

    /// append an index entry
    fn _index_append(
        slf: &Bound<'_, Self>,
        tup: &Bound<'_, PyTuple>,
    ) -> PyResult<()> {
        // no need to check length: in PyO3 tup.get_item() does return
        // proper errors
        let node_bytes = tup.get_item(7)?.extract()?;
        let node = node_from_py_bytes(&node_bytes)?;

        Self::with_index_nt_write(slf, |idx, nt| {
            let rev = idx.len() as BaseRevision;
            // This is ok since we will immediately add the revision to the
            // index
            let rev = Revision(rev);
            idx.append(py_tuple_to_revision_data_params(tup)?)
                .map_err(revlog_error_from_msg)?;

            nt.insert(idx, &node, rev).map_err(nodemap_error)?;
            Ok(())
        })
    }

    /// Removes one or several entries from the index.
    ///
    /// Historically, on the Mercurial revlog index, `__delitem__` has always
    /// been both for `del idx[r1]` and `del idx[r1:r2]`. In both cases,
    /// all entries starting from `r1` are removed anyway.
    fn _index___delitem__(
        slf: &Bound<'_, Self>,
        arg: &Bound<'_, PyAny>,
    ) -> PyResult<()> {
        let start = if let Ok(rev) = arg.extract() {
            UncheckedRevision(rev)
        } else {
            // here we could downcast to `PySlice` and use `indices()`, *but*
            // the rust-cpython based version could not do that, and
            // `indices()` does some resolving that makes it not equivalent,
            // e.g., `idx[-1::]` has `start=0`. As we are currently in
            // transition, we keep it the old way (hoping it was consistent
            // with the C index).
            let start = arg.getattr("start")?;
            UncheckedRevision(start.extract()?)
        };

        Self::with_index_nt_write(slf, |idx, nt| {
            // In the case of a slice, the check is possibly already done by
            // `slice.indices`, which is itself an FFI wrapper for CPython's
            // `PySlice_GetIndicesEx`
            // (Python integration tests will tell us)
            let start = idx.check_revision(start).ok_or_else(|| {
                nodemap_error(NodeMapError::RevisionNotInIndex(start))
            })?;
            idx.remove(start).map_err(revlog_error_from_msg)?;
            nt.invalidate_all();
            Self::fill_nodemap(idx, nt)?;
            Ok(())
        })
    }

    #[pyo3(signature = (*args))]
    fn _index_headrevs(
        slf: &Bound<'_, Self>,
        py: Python<'_>,
        args: &Bound<'_, PyTuple>,
    ) -> PyResult<Py<PyList>> {
        let (filtered_revs, stop_rev) = match args.len() {
            0 => Ok((None, None)),
            1 => Ok((Some(args.get_item(0)?), None)),
            2 => Ok((Some(args.get_item(0)?), Some(args.get_item(1)?))),
            _ => Err(PyTypeError::new_err("too many arguments")),
        }?;
        let stop_rev = stop_rev
            .map(|o| o.extract::<Option<i32>>())
            .transpose()?
            .flatten();
        let filtered_revs = filtered_revs.filter(|o| !o.is_none());

        let (from_core, stop_rev) = Self::with_index_read(slf, |idx| {
            let stop_rev = stop_rev
                // should this not just be the normal checking?
                .filter(|rev| 0 <= *rev && *rev < idx.len() as BaseRevision)
                .map(Revision);

            let from_core = if let Some(filtered_revs) = filtered_revs {
                let filtered_revs = rev_pyiter_collect(&filtered_revs, idx)?;
                idx.head_revs_advanced(
                    &filtered_revs,
                    stop_rev,
                    stop_rev.is_none(),
                )
            } else if stop_rev.is_some() {
                idx.head_revs_advanced(&HashSet::new(), stop_rev, false)
            } else {
                idx.head_revs_shortcut()
            }
            .map_err(graph_error)?;
            Ok((from_core, stop_rev))
        })?;

        if stop_rev.is_some() {
            // we don't cache result for now
            let new_heads =
                from_core.expect("this case should not be cached yet");
            revs_py_list(py, new_heads)
        } else {
            if let Some(new_heads) = from_core {
                Self::cache_new_heads_py_list(slf, new_heads)?;
            }
            Ok(slf
                .borrow()
                .head_revs_py_list
                .as_ref()
                .expect("head revs should be cached")
                .clone_ref(py))
        }
    }

    fn _index___len__(slf: &Bound<'_, Self>) -> PyResult<usize> {
        Self::with_index_read(slf, |idx| Ok(idx.len()))
    }

    fn _index___getitem__(
        slf: &Bound<'_, Self>,
        py: Python<'_>,
        key: &Bound<'_, PyAny>,
    ) -> PyResult<PyObject> {
        Self::with_index_read(slf, |idx| {
            match key.extract::<BaseRevision>() {
                Ok(key_as_int) => {
                    let entry_params = if key_as_int == NULL_REVISION.0 {
                        RevisionDataParams::default()
                    } else {
                        let rev = UncheckedRevision(key_as_int);
                        match idx.entry_as_params(rev) {
                            Some(e) => e,
                            None => {
                                return Err(PyIndexError::new_err(
                                    "revlog index out of range",
                                ));
                            }
                        }
                    };
                    Ok(revision_data_params_to_py_tuple(py, entry_params)?
                        .into_any()
                        .unbind())
                }
                // Case when key is a binary Node ID (lame: we're re-unlocking)
                _ => Self::_index_get_rev(slf, key.downcast::<PyBytes>()?)?
                    .map_or_else(
                        || Ok(py.None()),
                        |py_rev| Ok(py_rev.into_pyobject(py)?.unbind().into()),
                    ),
            }
        })
    }
}

impl InnerRevlog {
    /// Take the lock on `slf.irl` for reading and call a closure.
    ///
    /// This serves the purpose of keeping the needed intermediate [`PyRef`]
    /// that must be obtained to access the data from the [`Bound`] reference
    /// and on which the locked [`CoreInnerRevlog`] depends.
    /// It also ensures that the [`PyRef`] is released as soon as the closure
    /// is done, which is crucial if the caller needs to obtain a [`PyRefMut`]
    /// later on.
    ///
    /// In the closure, we hand back the intermediate [`PyRef`] that
    /// has been generated so that the closure can access more attributes.
    fn with_core_read<'py, T>(
        slf: &Bound<'py, Self>,
        f: impl FnOnce(
            &PyRef<'py, Self>,
            RwLockReadGuard<CoreInnerRevlog>,
        ) -> PyResult<T>,
    ) -> PyResult<T> {
        let self_ref = slf.borrow();
        // Safety: the owner is the right one. We will anyway
        // not actually `share` it. Perhaps pyo3-sharedref should provide
        // something less scary for this kind of usage.
        let shareable_ref = unsafe { self_ref.irl.borrow_with_owner(slf) };
        let guard = shareable_ref.try_read().map_err(map_try_lock_error)?;
        f(&self_ref, guard)
    }

    /// Take the lock on `slf.irl` for writing and call a closure.
    ///
    /// See [`Self::with_core_read`] for more explanations.
    fn with_core_write<'py, T>(
        slf: &Bound<'py, Self>,
        f: impl FnOnce(
            &PyRef<'py, Self>,
            RwLockWriteGuard<CoreInnerRevlog>,
        ) -> PyResult<T>,
    ) -> PyResult<T> {
        let self_ref = slf.borrow();
        // Safety: the owner is the right one. We will anyway
        // not actually `share` it. Perhaps pyo3-sharedref should provide
        // something less scary for this kind of usage.
        let shareable_ref = unsafe { self_ref.irl.borrow_with_owner(slf) };
        let guard = shareable_ref.try_write().map_err(map_try_lock_error)?;
        f(&self_ref, guard)
    }

    fn with_index_read<T>(
        slf: &Bound<'_, Self>,
        f: impl FnOnce(&Index) -> PyResult<T>,
    ) -> PyResult<T> {
        Self::with_core_read(slf, |_, guard| f(&guard.index))
    }

    #[allow(dead_code)]
    fn with_index_write<T>(
        slf: &Bound<'_, Self>,
        f: impl FnOnce(&mut Index) -> PyResult<T>,
    ) -> PyResult<T> {
        Self::with_core_write(slf, |_, mut guard| f(&mut guard.index))
    }

    /// Lock `slf` for reading and execute a closure on its [`Index`] and
    /// [`NodeTree`]
    ///
    /// The [`NodeTree`] is initialized and filled beforehand if needed.
    fn with_index_nt_read<T>(
        slf: &Bound<'_, Self>,
        f: impl FnOnce(&Index, &CoreNodeTree) -> PyResult<T>,
    ) -> PyResult<T> {
        Self::with_core_read(slf, |self_ref, guard| {
            let idx = &guard.index;
            let nt =
                self_ref.get_nodetree(idx)?.read().map_err(map_lock_error)?;
            let nt = nt.as_ref().expect("nodetree should be set");
            f(idx, nt)
        })
    }

    fn with_index_nt_write<T>(
        slf: &Bound<'_, Self>,
        f: impl FnOnce(&mut Index, &mut CoreNodeTree) -> PyResult<T>,
    ) -> PyResult<T> {
        Self::with_core_write(slf, |self_ref, mut guard| {
            let idx = &mut guard.index;
            let mut nt = self_ref
                .get_nodetree(idx)?
                .write()
                .map_err(map_lock_error)?;
            let nt = nt.as_mut().expect("nodetree should be set");
            f(idx, nt)
        })
    }

    /// Fill a [`CoreNodeTree`] by doing a full iteration on the given
    /// [`Index`]
    ///
    /// # Python exceptions
    /// Raises `ValueError` if `nt` has existing data that is inconsistent
    /// with `idx`.
    fn fill_nodemap(idx: &Index, nt: &mut CoreNodeTree) -> PyResult<()> {
        for r in 0..idx.len() {
            let rev = Revision(r as BaseRevision);
            // in this case node() won't ever return None
            nt.insert(idx, idx.node(rev).expect("node should exist"), rev)
                .map_err(nodemap_error)?
        }
        Ok(())
    }

    /// Return a working NodeTree of this InnerRevlog
    ///
    /// In case the NodeTree has not been initialized yet (in particular
    /// not from persistent data at instantiation), it is created and
    /// filled right away from the index.
    ///
    /// Technically, the returned NodeTree is still behind the lock of
    /// the `nt` field, hence still wrapped in an [`Option`]. Callers
    /// will need to take the lock and unwrap with `expect()`.
    ///
    /// # Python exceptions
    /// The case mentioned in [`Self::fill_nodemap()`] cannot happen, as the
    /// NodeTree is empty when it is called.
    fn get_nodetree(
        &self,
        idx: &Index,
    ) -> PyResult<&RwLock<Option<CoreNodeTree>>> {
        if self.nt.read().map_err(map_lock_error)?.is_none() {
            let readonly = Box::<Vec<_>>::default();
            let mut nt = CoreNodeTree::load_bytes(readonly, 0);
            Self::fill_nodemap(idx, &mut nt)?;
            self.nt.write().map_err(map_lock_error)?.replace(nt);
        }
        Ok(&self.nt)
    }

    fn cache_new_heads_py_list(
        slf: &Bound<'_, Self>,
        new_heads: Vec<Revision>,
    ) -> PyResult<Py<PyList>> {
        let py = slf.py();
        let new_heads_py_list = revs_py_list(py, new_heads)?;
        slf.borrow_mut().head_revs_py_list =
            Some(new_heads_py_list.clone_ref(py));
        // TODO is returning really useful?
        Ok(new_heads_py_list)
    }
}

#[pyclass]
struct NodeTree {
    nt: RwLock<CoreNodeTree>,
    index: SharedByPyObject<PySharedIndex>,
}

#[pymethods]
impl NodeTree {
    #[new]
    // The share/mapping should be set apart to become the PyO3 homolog of
    // `py_rust_index_to_graph`
    fn new(index_proxy: &Bound<'_, PyAny>) -> PyResult<Self> {
        let py_irl = index_proxy.getattr("inner")?;
        let py_irl_ref = py_irl.downcast::<InnerRevlog>()?.borrow();
        let shareable_irl = &py_irl_ref.irl;

        // Safety: the owner is the actual one and we do not leak any
        // internal reference.
        let index = unsafe {
            shareable_irl.share_map(&py_irl, |irl| (&irl.index).into())
        };
        let nt = CoreNodeTree::default(); // in-RAM, fully mutable

        Ok(Self {
            nt: nt.into(),
            index,
        })
    }

    /// Tell whether the NodeTree is still valid
    ///
    /// In case of mutation of the index, the given results are not
    /// guaranteed to be correct, and in fact, the methods borrowing
    /// the inner index would fail because of `PySharedRef` poisoning
    /// (generation-based guard), same as iterating on a `dict` that has
    /// been meanwhile mutated.
    fn is_invalidated(&self, py: Python<'_>) -> PyResult<bool> {
        // Safety: we don't leak any reference derived from self.index, as
        // we only check errors
        let result = unsafe { self.index.try_borrow(py) };
        // two cases for result to be an error:
        //   - the index has previously been mutably borrowed
        //   - there is currently a mutable borrow
        // in both cases, results previously derived from the index can no
        // longer be assumed to be valid.
        Ok(result.is_err())
    }

    fn insert(&self, py: Python<'_>, rev: PyRevision) -> PyResult<()> {
        // Safety: we don't leak any reference derived from self.index,
        // as `nt.insert` does not store direct references
        let idx = &*unsafe { self.index.try_borrow(py)? };
        let rev = check_revision(idx, rev)?;
        if rev == NULL_REVISION {
            return Err(rev_not_in_index(rev.into()));
        }

        let entry = idx.inner().get_entry(rev).expect("entry should exist");
        let mut nt = self.nt.write().map_err(map_lock_error)?;
        nt.insert(idx, entry.hash(), rev).map_err(nodemap_error)
    }

    fn shortest(
        &self,
        py: Python<'_>,
        node: &Bound<'_, PyBytes>,
    ) -> PyResult<usize> {
        let nt = self.nt.read().map_err(map_lock_error)?;
        // Safety: we don't leak any reference derived from self.index
        // as returned type is Copy
        let idx = &*unsafe { self.index.try_borrow(py)? };
        nt.unique_prefix_len_node(idx, &node_from_py_bytes(node)?)
            .map_err(nodemap_error)?
            .ok_or_else(revlog_error_bare)
    }

    /// Lookup by node hex prefix in the NodeTree, returning revision number.
    ///
    /// This is not part of the classical NodeTree API, but is good enough
    /// for unit testing, as in `test-rust-revlog.py`.
    fn prefix_rev_lookup(
        &self,
        py: Python<'_>,
        node_prefix: &Bound<'_, PyBytes>,
    ) -> PyResult<Option<PyRevision>> {
        let prefix = node_prefix_from_py_bytes(node_prefix)?;
        let nt = self.nt.read().map_err(map_lock_error)?;
        // Safety: we don't leak any reference derived from self.index
        // as returned type is Copy
        let idx = &*unsafe { self.index.try_borrow(py)? };
        Ok(nt
            .find_bin(idx, prefix)
            .map_err(nodemap_error)?
            .map(|r| r.into()))
    }
}

pub fn init_module<'py>(
    py: Python<'py>,
    package: &str,
) -> PyResult<Bound<'py, PyModule>> {
    let m = new_submodule(py, package, "revlog")?;
    m.add_class::<InnerRevlog>()?;
    m.add_class::<NodeTree>()?;
    m.add_class::<ReadingContextManager>()?;
    Ok(m)
}