rust-pyo3-revlog: standalone NodeTree class
This is actually the first usage of `PyShareable`, although it may not be
strictly necessary in this case: we could simply keep a reference to the
`InnerRevlog` Python object, since we do not need to keep any additional
state.
// revlog.rs
//
// Copyright 2019-2020 Georges Racinet <georges.racinet@octobus.net>
// 2020-2024 Raphaël Gomès <raphael.gomes@octobus.net>
// 2024 Georges Racinet <georges.racinet@cloudcrane.io>
//
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2 or any later version.
#![allow(non_snake_case)]
use pyo3::buffer::PyBuffer;
use pyo3::conversion::IntoPyObject;
use pyo3::exceptions::PyIndexError;
use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyBytesMethods, PyList, PyTuple};
use pyo3_sharedref::{PyShareable, SharedByPyObject};
use std::sync::{
atomic::{AtomicUsize, Ordering},
RwLock, RwLockReadGuard, RwLockWriteGuard,
};
use hg::{
revlog::{
index::{Index, RevisionDataParams},
inner_revlog::InnerRevlog as CoreInnerRevlog,
nodemap::{NodeMap, NodeMapError, NodeTree as CoreNodeTree},
options::RevlogOpenOptions,
RevlogIndex, RevlogType,
},
utils::files::get_path_from_bytes,
vfs::FnCacheVfs,
BaseRevision, Revision, UncheckedRevision, NULL_REVISION,
};
use crate::{
exceptions::{
map_lock_error, map_try_lock_error,
nodemap_error, rev_not_in_index, revlog_error_bare,
revlog_error_from_msg,
},
node::{node_from_py_bytes, node_prefix_from_py_bytes, py_node_for_rev},
revision::{check_revision, PyRevision},
store::PyFnCache,
util::{new_submodule, take_buffer_with_slice},
};
mod config;
use config::*;
mod index;
use index::{
py_tuple_to_revision_data_params, revision_data_params_to_py_tuple,
PySharedIndex,
};
/// Python class wrapping the core `InnerRevlog` of `hg-core`, together with
/// the node lookup state that goes along with its index.
#[pyclass]
#[allow(dead_code)]
struct InnerRevlog {
    // The core revlog. It is wrapped in a `PyShareable` so that other
    // Python objects (see `NodeTree`) can borrow its index.
    irl: PyShareable<CoreInnerRevlog>,
    // Node lookup tree. Starts as `None` and is built from the index on
    // first need by `get_nodetree()`.
    nt: RwLock<Option<CoreNodeTree>>,
    // Initialized to `None` by the constructor; not otherwise used in the
    // code visible here.
    docket: Option<PyObject>,
    // Holds a reference to the mmap'ed persistent nodemap data
    nodemap_mmap: Option<PyBuffer<u8>>,
    // Holds a reference to the mmap'ed persistent index data
    // (the `Index` built by the constructor reads from this buffer, which is
    // why it has to be kept alive here)
    index_mmap: Option<PyBuffer<u8>>,
    // The three fields below are initialized to `None` by the constructor.
    // NOTE(review): presumably caches of Python objects — confirm against
    // the methods that use them.
    revision_cache: Option<PyObject>,
    head_revs_py_list: Option<Py<PyList>>,
    head_node_ids_py_list: Option<Py<PyList>>,
    // Given by the Python side at construction. When `false`, the first
    // node lookups are done by brute force instead of building a nodetree
    // (see `_index_get_rev()`).
    use_persistent_nodemap: bool,
    // Number of node lookups performed so far through `_index_get_rev()`
    nodemap_queries: AtomicUsize,
}
#[pymethods]
impl InnerRevlog {
#[new]
// The Python side has authority on this signature.
#[allow(clippy::too_many_arguments)]
fn new(
    vfs_base: &Bound<'_, PyBytes>,
    fncache: &Bound<'_, PyAny>,
    vfs_is_readonly: bool,
    index_data: &Bound<'_, PyAny>,
    index_file: &Bound<'_, PyBytes>,
    data_file: &Bound<'_, PyBytes>,
    sidedata_file: &Bound<'_, PyAny>,
    inline: bool,
    data_config: &Bound<'_, PyAny>,
    delta_config: &Bound<'_, PyAny>,
    feature_config: &Bound<'_, PyAny>,
    chunk_cache: &Bound<'_, PyAny>,
    default_compression_header: &Bound<'_, PyAny>,
    revlog_type: usize,
    use_persistent_nodemap: bool,
) -> PyResult<Self> {
    // These arguments belong to the signature imposed by Python but are
    // not needed here. Mentioning them keeps clippy quiet without a blanket
    // `allow` directive.
    let _ = (sidedata_file, chunk_cache, default_compression_header);

    let index_path = get_path_from_bytes(index_file.as_bytes()).to_owned();
    let data_path = get_path_from_bytes(data_file.as_bytes()).to_owned();

    // Convert the Python-side configuration into its core counterparts.
    let kind = RevlogType::try_from(revlog_type)
        .map_err(revlog_error_from_msg)?;
    let data_cfg = extract_data_config(data_config, kind)?;
    let delta_cfg = extract_delta_config(delta_config, kind)?;
    let feature_cfg = extract_feature_config(feature_config, kind)?;
    let open_options =
        RevlogOpenOptions::new(inline, data_cfg, delta_cfg, feature_cfg);

    // SAFETY: the buffer is stored in the returned instance (field
    // `index_mmap`), so the bytes stay valid for as long as the index
    // built from them.
    let (index_buffer, index_bytes) =
        unsafe { take_buffer_with_slice(index_data)? };
    let index = Index::new(index_bytes, open_options.index_header())
        .map_err(revlog_error_from_msg)?;

    let base_path = get_path_from_bytes(vfs_base.as_bytes()).to_owned();
    let py_fncache = Box::new(PyFnCache::new(fncache.clone().unbind()));
    let vfs =
        Box::new(FnCacheVfs::new(base_path, vfs_is_readonly, py_fncache));
    let core = CoreInnerRevlog::new(
        vfs,
        index,
        index_path,
        data_path,
        data_cfg,
        delta_cfg,
        feature_cfg,
    );

    Ok(Self {
        irl: core.into(),
        // The nodetree is only built when a lookup actually needs it.
        nt: RwLock::new(None),
        docket: None,
        nodemap_mmap: None,
        index_mmap: Some(index_buffer),
        head_revs_py_list: None,
        head_node_ids_py_list: None,
        revision_cache: None,
        use_persistent_nodemap,
        nodemap_queries: AtomicUsize::new(0),
    })
}
//
// -- forwarded index methods --
//
/// Return the revision number of the given binary node id, or `None` if
/// the index does not know about it.
fn _index_get_rev(
    slf: &Bound<'_, Self>,
    node: &Bound<'_, PyBytes>,
) -> PyResult<Option<PyRevision>> {
    let node = node_from_py_bytes(node)?;

    // Do not rewrite this with `Self::with_index_nt_read`: that helper
    // unconditionally creates a volatile nodetree, which is precisely
    // what the code below tries to avoid.
    Self::with_core_read(slf, |this, core| {
        let index = &core.index;
        let earlier_lookups =
            this.nodemap_queries.fetch_add(1, Ordering::Relaxed);

        // Filelogs have no persistent nodemaps and are often small, so
        // the first few lookups scan the index backwards from the end.
        // Even for a very large filelog (automation file that changes
        // every commit etc.), this also seems to work quite well for all
        // measured purposes so far.
        let brute_force =
            !this.use_persistent_nodemap && earlier_lookups <= 3;
        if brute_force {
            let found = index
                .rev_from_node_no_persistent_nodemap(node.into())
                .ok();
            return Ok(found.map(Into::into));
        }

        let tree_guard =
            this.get_nodetree(index)?.read().map_err(map_lock_error)?;
        let tree = tree_guard.as_ref().expect("nodetree should be set");
        let found =
            tree.find_bin(index, node.into()).map_err(nodemap_error)?;
        Ok(found.map(Into::into))
    })
}
/// same as `_index_get_rev()` but raises a bare `error.RevlogError` if
/// node is not found.
///
/// No need to repeat `node` in the exception, `mercurial/revlog.py`
/// will catch and rewrap with it
fn _index_rev(
slf: &Bound<'_, Self>,
node: &Bound<'_, PyBytes>,
) -> PyResult<PyRevision> {
Self::_index_get_rev(slf, node)?.ok_or_else(revlog_error_bare)
}
/// Tell whether the given binary node id is present in the index.
fn _index_has_node(
    slf: &Bound<'_, Self>,
    node: &Bound<'_, PyBytes>,
) -> PyResult<bool> {
    let found = Self::_index_get_rev(slf, node)?;
    Ok(found.is_some())
}
/// find length of shortest hex nodeid of a binary ID
fn _index_shortest(
slf: &Bound<'_, Self>,
node: &Bound<'_, PyBytes>,
) -> PyResult<usize> {
Self::with_index_nt_read(slf, |idx, nt| {
match nt.unique_prefix_len_node(idx, &node_from_py_bytes(node)?) {
Ok(Some(l)) => Ok(l),
Ok(None) => Err(revlog_error_bare()),
Err(e) => Err(nodemap_error(e)),
}
})
}
/// Resolve a node id prefix to the full binary node id, or `None` if no
/// revision matches.
fn _index_partialmatch<'py>(
    slf: &Bound<'py, Self>,
    node: &Bound<'py, PyBytes>,
) -> PyResult<Option<Bound<'py, PyBytes>>> {
    Self::with_index_nt_read(slf, |idx, nt| {
        let prefix = node_prefix_from_py_bytes(node)?;
        let found = nt.find_bin(idx, prefix).map_err(nodemap_error)?;
        Ok(found.map(|rev| py_node_for_rev(slf.py(), idx, rev)))
    })
}
/// Append one entry, given as a Python tuple, to the index and record its
/// node in the nodetree.
fn _index_append(
    slf: &Bound<'_, Self>,
    tup: &Bound<'_, PyTuple>,
) -> PyResult<()> {
    // The node id is the item at position 7. There is no need to check
    // the tuple length first: in PyO3, `get_item()` returns a proper
    // error by itself.
    let node_bytes = tup.get_item(7)?.extract()?;
    let node = node_from_py_bytes(&node_bytes)?;

    Self::with_index_nt_write(slf, |idx, nt| {
        // The new entry takes the next free revision number. Building a
        // `Revision` that does not exist yet is fine, because the entry
        // is appended to the index right below.
        let new_rev = Revision(idx.len() as BaseRevision);
        let params = py_tuple_to_revision_data_params(tup)?;
        idx.append(params).map_err(revlog_error_from_msg)?;
        nt.insert(idx, &node, new_rev).map_err(nodemap_error)?;
        Ok(())
    })
}
/// Remove all index entries starting from a given revision.
///
/// Historically, `__delitem__` on the Mercurial revlog index has served
/// both `del idx[r1]` and `del idx[r1:r2]`, and in both cases every entry
/// from `r1` onwards gets removed.
fn _index___delitem__(
    slf: &Bound<'_, Self>,
    arg: &Bound<'_, PyAny>,
) -> PyResult<()> {
    let start = match arg.extract() {
        Ok(rev) => UncheckedRevision(rev),
        Err(_) => {
            // Not an integer: treat it as a slice and read its `start`
            // attribute directly. Downcasting to `PySlice` and calling
            // `indices()` would be possible, *but* the rust-cpython
            // based version could not do that, and `indices()` does some
            // resolving that makes it not equivalent, e.g., `idx[-1::]`
            // has `start=0`. As we are currently in transition, we keep
            // it the old way (hoping it was consistent with the C index).
            let slice_start = arg.getattr("start")?;
            UncheckedRevision(slice_start.extract()?)
        }
    };

    Self::with_index_nt_write(slf, |idx, nt| {
        // In the case of a slice, the check is possibly already done by
        // `slice.indices`, which is itself an FFI wrapper for CPython's
        // `PySlice_GetIndicesEx`
        // (Python integration tests will tell us)
        let first_removed = idx.check_revision(start).ok_or_else(|| {
            nodemap_error(NodeMapError::RevisionNotInIndex(start))
        })?;
        idx.remove(first_removed).map_err(revlog_error_from_msg)?;

        // Start over from an empty nodetree and fill it again from what
        // is left in the index.
        nt.invalidate_all();
        Self::fill_nodemap(idx, nt)?;
        Ok(())
    })
}
/// Return the number of entries in the index.
fn _index___len__(slf: &Bound<'_, Self>) -> PyResult<usize> {
    Self::with_index_read(slf, |index| {
        let count = index.len();
        Ok(count)
    })
}
/// Index lookup, by revision number or by binary node id.
///
/// With an integer key, return the index entry as a tuple, raising
/// `IndexError` if the revision is out of range. With a bytes key, return
/// the revision number of that node, or `None` if it is unknown.
fn _index___getitem__(
    slf: &Bound<'_, Self>,
    py: Python<'_>,
    key: &Bound<'_, PyAny>,
) -> PyResult<PyObject> {
    Self::with_index_read(slf, |idx| {
        if let Ok(rev_num) = key.extract::<BaseRevision>() {
            // The null revision has no stored entry: it is answered with
            // default parameters.
            let params = if rev_num == NULL_REVISION.0 {
                RevisionDataParams::default()
            } else {
                idx.entry_as_params(UncheckedRevision(rev_num))
                    .ok_or_else(|| {
                        PyIndexError::new_err("revlog index out of range")
                    })?
            };
            let entry = revision_data_params_to_py_tuple(py, params)?;
            return Ok(entry.into_any().unbind());
        }

        // Case when key is a binary Node ID (lame: we're re-locking)
        let node = key.downcast::<PyBytes>()?;
        match Self::_index_get_rev(slf, node)? {
            Some(py_rev) => Ok(py_rev.into_pyobject(py)?.unbind().into()),
            None => Ok(py.None()),
        }
    })
}
}
impl InnerRevlog {
/// Run a closure with `slf.irl` locked for reading.
///
/// Reaching the data behind the [`Bound`] reference requires an
/// intermediate [`PyRef`], on which the locked [`CoreInnerRevlog`]
/// depends. This helper keeps that [`PyRef`] alive for the duration of
/// the closure and releases it as soon as the closure is done, which is
/// crucial if the caller needs to obtain a [`PyRefMut`] later on.
///
/// The [`PyRef`] is also handed to the closure, so that it can access the
/// other attributes of `slf`.
///
/// The lock is not waited for: if it cannot be taken right away, the
/// error made by `map_try_lock_error` is returned.
fn with_core_read<'py, T>(
    slf: &Bound<'py, Self>,
    f: impl FnOnce(
        &PyRef<'py, Self>,
        RwLockReadGuard<CoreInnerRevlog>,
    ) -> PyResult<T>,
) -> PyResult<T> {
    let py_ref = slf.borrow();
    // SAFETY: `slf` is the actual owner of `irl`, and nothing is really
    // `share`d here anyway. Perhaps pyo3-sharedref should provide
    // something less scary for this kind of usage.
    let shared = unsafe { py_ref.irl.borrow_with_owner(slf) };
    let read_guard = shared.try_read().map_err(map_try_lock_error)?;
    f(&py_ref, read_guard)
}
/// Run a closure with `slf.irl` locked for writing.
///
/// This is the writing counterpart of [`Self::with_core_read`], which has
/// the detailed explanations.
fn with_core_write<'py, T>(
    slf: &Bound<'py, Self>,
    f: impl FnOnce(
        &PyRef<'py, Self>,
        RwLockWriteGuard<CoreInnerRevlog>,
    ) -> PyResult<T>,
) -> PyResult<T> {
    let py_ref = slf.borrow();
    // SAFETY: `slf` is the actual owner of `irl`, and nothing is really
    // `share`d here anyway. Perhaps pyo3-sharedref should provide
    // something less scary for this kind of usage.
    let shared = unsafe { py_ref.irl.borrow_with_owner(slf) };
    let write_guard = shared.try_write().map_err(map_try_lock_error)?;
    f(&py_ref, write_guard)
}
/// Lock `slf` for reading and run a closure on its [`Index`].
fn with_index_read<T>(
    slf: &Bound<'_, Self>,
    f: impl FnOnce(&Index) -> PyResult<T>,
) -> PyResult<T> {
    Self::with_core_read(slf, |_, core| {
        let index = &core.index;
        f(index)
    })
}
/// Lock `slf` for writing and run a closure on its [`Index`].
#[allow(dead_code)]
fn with_index_write<T>(
    slf: &Bound<'_, Self>,
    f: impl FnOnce(&mut Index) -> PyResult<T>,
) -> PyResult<T> {
    Self::with_core_write(slf, |_, mut core| {
        let index = &mut core.index;
        f(index)
    })
}
/// Lock `slf` for reading and run a closure on its [`Index`] and
/// [`NodeTree`].
///
/// If the [`NodeTree`] does not exist yet, it is created and filled from
/// the index before the closure runs.
fn with_index_nt_read<T>(
    slf: &Bound<'_, Self>,
    f: impl FnOnce(&Index, &CoreNodeTree) -> PyResult<T>,
) -> PyResult<T> {
    Self::with_core_read(slf, |this, core| {
        let index = &core.index;
        let tree_guard =
            this.get_nodetree(index)?.read().map_err(map_lock_error)?;
        let tree = tree_guard.as_ref().expect("nodetree should be set");
        f(index, tree)
    })
}
/// Lock `slf` for writing and run a closure on its [`Index`] and
/// nodetree, both mutable.
///
/// If the nodetree does not exist yet, it is created and filled from the
/// index before the closure runs.
fn with_index_nt_write<T>(
    slf: &Bound<'_, Self>,
    f: impl FnOnce(&mut Index, &mut CoreNodeTree) -> PyResult<T>,
) -> PyResult<T> {
    Self::with_core_write(slf, |this, mut core| {
        let index = &mut core.index;
        let mut tree_guard =
            this.get_nodetree(index)?.write().map_err(map_lock_error)?;
        let tree = tree_guard.as_mut().expect("nodetree should be set");
        f(index, tree)
    })
}
/// Insert every revision of the given [`Index`] into a [`CoreNodeTree`],
/// in increasing revision order.
///
/// # Python exceptions
/// Raises `ValueError` if `nt` has existing data that is inconsistent
/// with `idx`.
fn fill_nodemap(idx: &Index, nt: &mut CoreNodeTree) -> PyResult<()> {
    (0..idx.len()).try_for_each(|pos| {
        let rev = Revision(pos as BaseRevision);
        // `pos` is below the index length, so `node()` won't ever return
        // `None` here
        let node = idx.node(rev).expect("node should exist");
        nt.insert(idx, node, rev).map_err(nodemap_error)
    })
}
/// Give access to a usable NodeTree for this InnerRevlog.
///
/// If the NodeTree has not been initialized yet (in particular not from
/// persistent data at instantiation), it is created empty and filled
/// right away from the index.
///
/// What is returned is the lock of the `nt` field itself, so the NodeTree
/// is still wrapped in an [`Option`]: callers have to take the lock, then
/// unwrap with `expect()`.
///
/// # Python exceptions
/// The case mentioned in [`Self::fill_nodemap()`] cannot happen, as the
/// NodeTree is empty when it is called.
fn get_nodetree(
    &self,
    idx: &Index,
) -> PyResult<&RwLock<Option<CoreNodeTree>>> {
    // The read guard is released at the end of this statement, before
    // the write lock is requested below.
    let missing = self.nt.read().map_err(map_lock_error)?.is_none();
    if missing {
        let empty_bytes = Box::<Vec<_>>::default();
        let mut tree = CoreNodeTree::load_bytes(empty_bytes, 0);
        Self::fill_nodemap(idx, &mut tree)?;
        *self.nt.write().map_err(map_lock_error)? = Some(tree);
    }
    Ok(&self.nt)
}
}
/// Standalone nodetree Python class, working on the index of an existing
/// `InnerRevlog`.
#[pyclass]
struct NodeTree {
    // The nodetree itself: created empty by the constructor, then filled
    // by calls to `insert()`.
    nt: RwLock<CoreNodeTree>,
    // Shared view of the index owned by an `InnerRevlog` Python object.
    // Each method borrows it again with `try_borrow()`, which can fail.
    index: SharedByPyObject<PySharedIndex>,
}
#[pymethods]
impl NodeTree {
#[new]
// The share/mapping should be set apart to become the PyO3 homolog of
// `py_rust_index_to_graph`
fn new(index_proxy: &Bound<'_, PyAny>) -> PyResult<Self> {
let py_irl = index_proxy.getattr("inner")?;
let py_irl_ref = py_irl.downcast::<InnerRevlog>()?.borrow();
let shareable_irl = &py_irl_ref.irl;
// Safety: the owner is the actual one and we do not leak any
// internal reference.
let index = unsafe {
shareable_irl.share_map(&py_irl, |irl| (&irl.index).into())
};
let nt = CoreNodeTree::default(); // in-RAM, fully mutable
Ok(Self {
nt: nt.into(),
index,
})
}
/// Tell whether the NodeTree has been invalidated.
///
/// Once the index has been mutated, the results given by the NodeTree
/// are not guaranteed to be correct. In fact, the methods borrowing the
/// inner index would fail because of `PySharedRef` poisoning
/// (generation-based guard), same as iterating on a `dict` that has been
/// meanwhile mutated.
fn is_invalidated(&self, py: Python<'_>) -> PyResult<bool> {
    // The borrow fails in two cases:
    // - the index has previously been mutably borrowed
    // - there is currently a mutable borrow
    // Either way, previous results related to the index can no longer be
    // relied upon.
    //
    // SAFETY: no reference derived from `self.index` is leaked, since
    // only the success or failure of the borrow is looked at.
    let borrow_failed = unsafe { self.index.try_borrow(py) }.is_err();
    Ok(borrow_failed)
}
/// Insert the node of the given revision into the NodeTree.
///
/// The revision must exist in the index, and it cannot be the null
/// revision.
fn insert(&self, py: Python<'_>, rev: PyRevision) -> PyResult<()> {
    // SAFETY: no reference derived from `self.index` is leaked, as
    // `nt.insert` does not store direct references
    let shared_index = unsafe { self.index.try_borrow(py)? };
    let idx = &*shared_index;

    let checked = check_revision(idx, rev)?;
    if checked == NULL_REVISION {
        return Err(rev_not_in_index(checked.into()));
    }
    let entry = idx
        .inner()
        .get_entry(checked)
        .expect("entry should exist");

    let mut tree = self.nt.write().map_err(map_lock_error)?;
    tree.insert(idx, entry.hash(), checked)
        .map_err(nodemap_error)
}
fn shortest(
&self,
py: Python<'_>,
node: &Bound<'_, PyBytes>,
) -> PyResult<usize> {
let nt = self.nt.read().map_err(map_lock_error)?;
// Safety: we don't leak any reference derived from self.index
// as returned type is Copy
let idx = &*unsafe { self.index.try_borrow(py)? };
nt.unique_prefix_len_node(idx, &node_from_py_bytes(node)?)
.map_err(nodemap_error)?
.ok_or_else(revlog_error_bare)
}
/// Find the revision number matching a node hex prefix in the NodeTree.
///
/// This does not belong to the classical NodeTree API, but it is good
/// enough for unit testing, as in `test-rust-revlog.py`.
fn prefix_rev_lookup(
    &self,
    py: Python<'_>,
    node_prefix: &Bound<'_, PyBytes>,
) -> PyResult<Option<PyRevision>> {
    let prefix = node_prefix_from_py_bytes(node_prefix)?;
    let tree = self.nt.read().map_err(map_lock_error)?;
    // SAFETY: no reference derived from `self.index` is leaked, as the
    // returned type is `Copy`
    let shared_index = unsafe { self.index.try_borrow(py)? };
    let idx = &*shared_index;

    let found = tree.find_bin(idx, prefix).map_err(nodemap_error)?;
    Ok(found.map(Into::into))
}
}
/// Create the `revlog` submodule of `package` and register the Python
/// classes defined in this file in it.
pub fn init_module<'py>(
    py: Python<'py>,
    package: &str,
) -> PyResult<Bound<'py, PyModule>> {
    let revlog_module = new_submodule(py, package, "revlog")?;
    revlog_module.add_class::<InnerRevlog>()?;
    revlog_module.add_class::<NodeTree>()?;
    Ok(revlog_module)
}