rust/hg-pyo3/src/revlog/mod.rs @ 52788:e29e75e8328c

rust-pyo3-revlog: index and nodetree writing accessors

They work in the same way as their reading counterparts, since we leverage the inner mutability of the `PyShareable` for the index and of the main object for the nodetree.
author Georges Racinet <georges.racinet@cloudcrane.io>
date Wed, 25 Dec 2024 17:17:40 +0100
parents e5f89bd1a5ee
children 34f44aa5e844

// revlog.rs
//
// Copyright 2019-2020 Georges Racinet <georges.racinet@octobus.net>
//           2020-2024 Raphaël Gomès <raphael.gomes@octobus.net>
//           2024 Georges Racinet <georges.racinet@cloudcrane.io>
//
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2 or any later version.
#![allow(non_snake_case)]
use pyo3::buffer::PyBuffer;
use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyBytesMethods, PyList};
use pyo3_sharedref::PyShareable;

use std::sync::{
    atomic::{AtomicUsize, Ordering},
    RwLock, RwLockReadGuard, RwLockWriteGuard,
};

use hg::{
    revlog::{
        index::Index,
        inner_revlog::InnerRevlog as CoreInnerRevlog,
        nodemap::{NodeMap, NodeTree as CoreNodeTree},
        options::RevlogOpenOptions,
        RevlogIndex, RevlogType,
    },
    utils::files::get_path_from_bytes,
    vfs::FnCacheVfs,
    BaseRevision, Revision,
};

use crate::{
    exceptions::{
        map_lock_error, map_try_lock_error, nodemap_error, revlog_error_bare,
        revlog_error_from_msg,
    },
    node::{node_from_py_bytes, node_prefix_from_py_bytes, py_node_for_rev},
    revision::PyRevision,
    store::PyFnCache,
    util::{new_submodule, take_buffer_with_slice},
};

mod config;
use config::*;

#[pyclass]
#[allow(dead_code)]
struct InnerRevlog {
    irl: PyShareable<CoreInnerRevlog>,
    nt: RwLock<Option<CoreNodeTree>>,
    docket: Option<PyObject>,
    // Holds a reference to the mmap'ed persistent nodemap data
    nodemap_mmap: Option<PyBuffer<u8>>,
    // Holds a reference to the mmap'ed persistent index data
    index_mmap: Option<PyBuffer<u8>>,
    revision_cache: Option<PyObject>,
    head_revs_py_list: Option<Py<PyList>>,
    head_node_ids_py_list: Option<Py<PyList>>,
    use_persistent_nodemap: bool,
    nodemap_queries: AtomicUsize,
}

#[pymethods]
impl InnerRevlog {
    #[new]
    // The Python side has authority on this signature.
    #[allow(clippy::too_many_arguments)]
    fn new(
        vfs_base: &Bound<'_, PyBytes>,
        fncache: &Bound<'_, PyAny>,
        vfs_is_readonly: bool,
        index_data: &Bound<'_, PyAny>,
        index_file: &Bound<'_, PyBytes>,
        data_file: &Bound<'_, PyBytes>,
        sidedata_file: &Bound<'_, PyAny>,
        inline: bool,
        data_config: &Bound<'_, PyAny>,
        delta_config: &Bound<'_, PyAny>,
        feature_config: &Bound<'_, PyAny>,
        chunk_cache: &Bound<'_, PyAny>,
        default_compression_header: &Bound<'_, PyAny>,
        revlog_type: usize,
        use_persistent_nodemap: bool,
    ) -> PyResult<Self> {
        // Let clippy accept the unused arguments. This is a bit better
        // than a blanket `allow` directive.
        let _ = sidedata_file;
        let _ = chunk_cache;
        let _ = default_compression_header;

        let index_file = get_path_from_bytes(index_file.as_bytes()).to_owned();
        let data_file = get_path_from_bytes(data_file.as_bytes()).to_owned();
        let revlog_type = RevlogType::try_from(revlog_type)
            .map_err(revlog_error_from_msg)?;
        let data_config = extract_data_config(data_config, revlog_type)?;
        let delta_config = extract_delta_config(delta_config, revlog_type)?;
        let feature_config =
            extract_feature_config(feature_config, revlog_type)?;
        let options = RevlogOpenOptions::new(
            inline,
            data_config,
            delta_config,
            feature_config,
        );

        // Safety: we keep the buffer around inside the returned instance as
        // `index_mmap`
        let (buf, bytes) = unsafe { take_buffer_with_slice(index_data)? };
        let index = Index::new(bytes, options.index_header())
            .map_err(revlog_error_from_msg)?;

        let base = get_path_from_bytes(vfs_base.as_bytes()).to_owned();
        let core = CoreInnerRevlog::new(
            Box::new(FnCacheVfs::new(
                base,
                vfs_is_readonly,
                Box::new(PyFnCache::new(fncache.clone().unbind())),
            )),
            index,
            index_file,
            data_file,
            data_config,
            delta_config,
            feature_config,
        );
        Ok(Self {
            irl: core.into(),
            nt: None.into(),
            docket: None,
            nodemap_mmap: None,
            index_mmap: buf.into(),
            head_revs_py_list: None,
            head_node_ids_py_list: None,
            revision_cache: None,
            use_persistent_nodemap,
            nodemap_queries: AtomicUsize::new(0),
        })
    }

    //
    // -- forwarded index methods --
    //

    fn _index_get_rev(
        slf: &Bound<'_, Self>,
        node: &Bound<'_, PyBytes>,
    ) -> PyResult<Option<PyRevision>> {
        let node = node_from_py_bytes(node)?;

        // Do not rewrite this with `Self::with_index_nt_read`: it would
        // unconditionally instantiate a volatile nodetree, and that is not
        // the intent here: the code below specifically avoids that.
        Self::with_core_read(slf, |self_ref, irl| {
            let idx = &irl.index;

            let prev_queries =
                self_ref.nodemap_queries.fetch_add(1, Ordering::Relaxed);
            // Filelogs have no persistent nodemap and are often small:
            // use a brute-force lookup from the end, backwards. Even for
            // a very large filelog (e.g. an automation file that changes
            // on every commit), this also seems to work quite well for
            // all measured purposes so far.
            if !self_ref.use_persistent_nodemap && prev_queries <= 3 {
                return Ok(idx
                    .rev_from_node_no_persistent_nodemap(node.into())
                    .ok()
                    .map(Into::into));
            }

            let opt =
                self_ref.get_nodetree(idx)?.read().map_err(map_lock_error)?;
            let nt = opt.as_ref().expect("nodetree should be set");

            let rust_rev =
                nt.find_bin(idx, node.into()).map_err(nodemap_error)?;
            Ok(rust_rev.map(Into::into))
        })
    }

    /// Same as `_index_get_rev()` but raises a bare `error.RevlogError`
    /// if the node is not found.
    ///
    /// There is no need to repeat `node` in the exception message:
    /// `mercurial/revlog.py` will catch the error and rewrap it with the
    /// node.
    fn _index_rev(
        slf: &Bound<'_, Self>,
        node: &Bound<'_, PyBytes>,
    ) -> PyResult<PyRevision> {
        Self::_index_get_rev(slf, node)?.ok_or_else(revlog_error_bare)
    }

    /// Return True if the node exists in the index
    fn _index_has_node(
        slf: &Bound<'_, Self>,
        node: &Bound<'_, PyBytes>,
    ) -> PyResult<bool> {
        Self::_index_get_rev(slf, node).map(|opt| opt.is_some())
    }

    /// Find the length of the shortest hex nodeid prefix that uniquely
    /// identifies the given binary node ID
    fn _index_shortest(
        slf: &Bound<'_, Self>,
        node: &Bound<'_, PyBytes>,
    ) -> PyResult<usize> {
        Self::with_index_nt_read(slf, |idx, nt| {
            match nt.unique_prefix_len_node(idx, &node_from_py_bytes(node)?) {
                Ok(Some(l)) => Ok(l),
                Ok(None) => Err(revlog_error_bare()),
                Err(e) => Err(nodemap_error(e)),
            }
        })
    }

    fn _index_partialmatch<'py>(
        slf: &Bound<'py, Self>,
        node: &Bound<'py, PyBytes>,
    ) -> PyResult<Option<Bound<'py, PyBytes>>> {
        Self::with_index_nt_read(slf, |idx, nt| {
            Ok(nt
                .find_bin(idx, node_prefix_from_py_bytes(node)?)
                .map_err(nodemap_error)?
                .map(|rev| py_node_for_rev(slf.py(), idx, rev)))
        })
    }

    fn _index___len__(slf: &Bound<'_, Self>) -> PyResult<usize> {
        Self::with_index_read(slf, |idx| Ok(idx.len()))
    }
}

impl InnerRevlog {
    /// Take the lock on `slf.irl` for reading and call a closure.
    ///
    /// This serves to keep alive the intermediate [`PyRef`] that must be
    /// obtained from the [`Bound`] reference to access the data, and on
    /// which the locked [`CoreInnerRevlog`] depends.
    /// It also releases the [`PyRef`] as soon as the closure is done,
    /// which is crucial if the caller needs to obtain a [`PyRefMut`]
    /// later on.
    ///
    /// The intermediate [`PyRef`] is handed back to the closure so that
    /// it can access further attributes.
    fn with_core_read<'py, T>(
        slf: &Bound<'py, Self>,
        f: impl FnOnce(
            &PyRef<'py, Self>,
            RwLockReadGuard<CoreInnerRevlog>,
        ) -> PyResult<T>,
    ) -> PyResult<T> {
        let self_ref = slf.borrow();
        // Safety: the owner is the right one. We will anyway
        // not actually `share` it. Perhaps pyo3-sharedref should provide
        // something less scary for this kind of usage.
        let shareable_ref = unsafe { self_ref.irl.borrow_with_owner(slf) };
        let guard = shareable_ref.try_read().map_err(map_try_lock_error)?;
        f(&self_ref, guard)
    }
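
    /// Illustration only: a minimal sketch (hypothetical helper, not part
    /// of this changeset) of the intended call pattern: the closure gets
    /// both the intermediate [`PyRef`], for further attribute access, and
    /// the read guard for the locked [`CoreInnerRevlog`].
    #[allow(dead_code)]
    fn example_core_read(slf: &Bound<'_, Self>) -> PyResult<(bool, usize)> {
        Self::with_core_read(slf, |self_ref, irl| {
            Ok((self_ref.use_persistent_nodemap, irl.index.len()))
        })
    }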

    #[allow(dead_code)]
    /// Take the lock on `slf.irl` for writing and call a closure.
    ///
    /// See [`Self::with_core_read`] for more explanations.
    fn with_core_write<'py, T>(
        slf: &Bound<'py, Self>,
        f: impl FnOnce(
            &PyRef<'py, Self>,
            RwLockWriteGuard<CoreInnerRevlog>,
        ) -> PyResult<T>,
    ) -> PyResult<T> {
        let self_ref = slf.borrow();
        // Safety: the owner is the right one. We will anyway
        // not actually `share` it. Perhaps pyo3-sharedref should provide
        // something less scary for this kind of usage.
        let shareable_ref = unsafe { self_ref.irl.borrow_with_owner(slf) };
        let guard = shareable_ref.try_write().map_err(map_try_lock_error)?;
        f(&self_ref, guard)
    }

    fn with_index_read<T>(
        slf: &Bound<'_, Self>,
        f: impl FnOnce(&Index) -> PyResult<T>,
    ) -> PyResult<T> {
        Self::with_core_read(slf, |_, guard| f(&guard.index))
    }

    #[allow(dead_code)]
    fn with_index_write<T>(
        slf: &Bound<'_, Self>,
        f: impl FnOnce(&mut Index) -> PyResult<T>,
    ) -> PyResult<T> {
        Self::with_core_write(slf, |_, mut guard| f(&mut guard.index))
    }

    /// Lock `slf` for reading and execute a closure on its [`Index`] and
    /// [`NodeTree`].
    ///
    /// The [`NodeTree`] is initialized and filled beforehand if needed.
    fn with_index_nt_read<T>(
        slf: &Bound<'_, Self>,
        f: impl FnOnce(&Index, &CoreNodeTree) -> PyResult<T>,
    ) -> PyResult<T> {
        Self::with_core_read(slf, |self_ref, guard| {
            let idx = &guard.index;
            let nt =
                self_ref.get_nodetree(idx)?.read().map_err(map_lock_error)?;
            let nt = nt.as_ref().expect("nodetree should be set");
            f(idx, nt)
        })
    }

    #[allow(dead_code)]
    fn with_index_nt_write<T>(
        slf: &Bound<'_, Self>,
        f: impl FnOnce(&mut Index, &mut CoreNodeTree) -> PyResult<T>,
    ) -> PyResult<T> {
        Self::with_core_write(slf, |self_ref, mut guard| {
            let idx = &mut guard.index;
            let mut nt = self_ref
                .get_nodetree(idx)?
                .write()
                .map_err(map_lock_error)?;
            let nt = nt.as_mut().expect("nodetree should be set");
            f(idx, nt)
        })
    }
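
    /// Illustration only: a minimal sketch (hypothetical method, not part
    /// of this changeset) of a writing accessor in the spirit of the
    /// commit message: refill the nodetree from the index, with both
    /// locks taken for writing by [`Self::with_index_nt_write`].
    #[allow(dead_code)]
    fn example_refill_nodetree(slf: &Bound<'_, Self>) -> PyResult<()> {
        Self::with_index_nt_write(slf, |idx, nt| Self::fill_nodemap(idx, nt))
    }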

    /// Fill a [`CoreNodeTree`] by doing a full iteration on the given
    /// [`Index`].
    ///
    /// # Python exceptions
    /// Raises `ValueError` if `nt` has existing data that is inconsistent
    /// with `idx`.
    fn fill_nodemap(idx: &Index, nt: &mut CoreNodeTree) -> PyResult<()> {
        for r in 0..idx.len() {
            let rev = Revision(r as BaseRevision);
            // in this case node() won't ever return None
            nt.insert(idx, idx.node(rev).expect("node should exist"), rev)
                .map_err(nodemap_error)?
        }
        Ok(())
    }

    /// Return a working NodeTree of this InnerRevlog.
    ///
    /// If the NodeTree has not been initialized yet (in particular
    /// not from persistent data at instantiation), it is created and
    /// filled right away from the index.
    ///
    /// Technically, the returned NodeTree is still behind the lock of
    /// the `nt` field, hence still wrapped in an [`Option`]. Callers
    /// will need to take the lock and unwrap with `expect()`.
    ///
    /// # Python exceptions
    /// The case mentioned in [`Self::fill_nodemap()`] cannot happen, as the
    /// NodeTree is empty when it is called.
    fn get_nodetree(
        &self,
        idx: &Index,
    ) -> PyResult<&RwLock<Option<CoreNodeTree>>> {
        if self.nt.read().map_err(map_lock_error)?.is_none() {
            let readonly = Box::<Vec<_>>::default();
            let mut nt = CoreNodeTree::load_bytes(readonly, 0);
            Self::fill_nodemap(idx, &mut nt)?;
            self.nt.write().map_err(map_lock_error)?.replace(nt);
        }
        Ok(&self.nt)
    }
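
    /// Illustration only: a minimal sketch (hypothetical helper, not part
    /// of this changeset) of the caller pattern described above: take the
    /// lock on the returned field, then unwrap the [`Option`] with
    /// `expect()`.
    #[allow(dead_code)]
    fn example_nodetree_lookup(
        &self,
        idx: &Index,
        node: &Bound<'_, PyBytes>,
    ) -> PyResult<Option<Revision>> {
        let node = node_from_py_bytes(node)?;
        let guard = self.get_nodetree(idx)?.read().map_err(map_lock_error)?;
        let nt = guard.as_ref().expect("nodetree should be set");
        nt.find_bin(idx, node.into()).map_err(nodemap_error)
    }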
}

pub fn init_module<'py>(
    py: Python<'py>,
    package: &str,
) -> PyResult<Bound<'py, PyModule>> {
    let m = new_submodule(py, package, "revlog")?;
    m.add_class::<InnerRevlog>()?;
    Ok(m)
}