Mercurial > public > mercurial-scm > hg
diff rust/hg-core/src/ancestors.rs @ 52772:e9ced84aeef4
rust-ancestors: use BitSet for seen revisions
This makes AncestorsIterator store its set of seen revisions in a BitSet-based
DescendingRevisionSet instead of a HashSet. This provides faster lookups and
insertions, and uses only A - B bits of memory when iteration goes from revision
A down to revision B. In the worst case iterating from tip to -1 in the
mercurial-devel changelog, for example, it would use about 7 KiB.
Running rhg annotate on 200 random files in mercurial-devel gave on average a
10% improvement. Here is the distribution:
new/old freq histogram
-------- |----- | ---------
0.81 | 17 | **
0.84 | 32 | ****
0.87 | 23 | ***
0.90 | 24 | ***
0.92 | 20 | ***
0.95 | 34 | *****
0.98 | 18 | **
1.01 | 24 | ***
1.04 | 3 |
1.07 | 5 |
-------- |----- | ---------
Avg=0.90 |N=200 |
author | Mitchell Kember <mkember@janestreet.com> |
---|---|
date | Fri, 07 Feb 2025 16:07:39 -0500 |
parents | f90796d33aa0 |
children |
line wrap: on
line diff
--- a/rust/hg-core/src/ancestors.rs Fri Feb 07 16:07:35 2025 -0500 +++ b/rust/hg-core/src/ancestors.rs Fri Feb 07 16:07:39 2025 -0500 @@ -9,9 +9,58 @@ use super::{Graph, GraphError, Revision, NULL_REVISION}; use crate::dagops; +use bit_set::BitSet; use std::cmp::max; use std::collections::{BinaryHeap, HashSet}; +/// A set of revisions backed by a bitset, optimized for descending insertion. +struct DescendingRevisionSet { + /// The underlying bitset storage. + set: BitSet, + /// For a revision `R` we store `ceiling - R` instead of `R` so that + /// memory usage is proportional to how far we've descended. + ceiling: i32, + /// Track length separately because [`BitSet::len`] recounts every time. + len: usize, +} + +impl DescendingRevisionSet { + /// Creates a new empty set that can store revisions up to `ceiling`. + fn new(ceiling: Revision) -> Self { + Self { + set: BitSet::new(), + ceiling: ceiling.0, + len: 0, + } + } + + /// Returns the number of revisions in the set. + fn len(&self) -> usize { + self.len + } + + /// Returns true if the set contains `value`. + fn contains(&self, value: Revision) -> bool { + match self.encode(value) { + Ok(n) => self.set.contains(n), + Err(_) => false, + } + } + + /// Adds `value` to the set. Returns true if it was not already in the set. + /// Returns `Err` if it cannot store it because it is above the ceiling. + fn insert(&mut self, value: Revision) -> Result<bool, GraphError> { + let inserted = self.set.insert(self.encode(value)?); + self.len += inserted as usize; + Ok(inserted) + } + + fn encode(&self, value: Revision) -> Result<usize, GraphError> { + usize::try_from(self.ceiling - value.0) + .map_err(|_| GraphError::ParentOutOfOrder(value)) + } +} + /// Iterator over the ancestors of a given list of revisions /// This is a generic type, defined and implemented for any Graph, so that /// it's easy to @@ -22,7 +71,7 @@ pub struct AncestorsIterator<G: Graph> { graph: G, visit: BinaryHeap<Revision>, - seen: HashSet<Revision>, + seen: DescendingRevisionSet, stoprev: Revision, } @@ -43,12 +92,18 @@ stoprev: Revision, inclusive: bool, ) -> Result<Self, GraphError> { - let filtered_initrevs = initrevs.into_iter().filter(|&r| r >= stoprev); + let filtered_initrevs = initrevs + .into_iter() + .filter(|&r| r >= stoprev) + .collect::<BinaryHeap<_>>(); + let max = *filtered_initrevs.peek().unwrap_or(&NULL_REVISION); + let mut seen = DescendingRevisionSet::new(max); if inclusive { - let visit: BinaryHeap<Revision> = filtered_initrevs.collect(); - let seen = visit.iter().cloned().collect(); + for &rev in &filtered_initrevs { + seen.insert(rev).expect("revs cannot be above their max"); + } return Ok(AncestorsIterator { - visit, + visit: filtered_initrevs, seen, stoprev, graph, @@ -56,24 +111,30 @@ } let mut this = AncestorsIterator { visit: BinaryHeap::new(), - seen: HashSet::new(), + seen, stoprev, graph, }; - this.seen.insert(NULL_REVISION); + this.seen + .insert(NULL_REVISION) + .expect("null is the smallest revision"); for rev in filtered_initrevs { for parent in this.graph.parents(rev)?.iter().cloned() { - this.conditionally_push_rev(parent); + this.conditionally_push_rev(parent)?; } } Ok(this) } #[inline] - fn conditionally_push_rev(&mut self, rev: Revision) { - if self.stoprev <= rev && self.seen.insert(rev) { + fn conditionally_push_rev( + &mut self, + rev: Revision, + ) -> Result<(), GraphError> { + if self.stoprev <= rev && self.seen.insert(rev)? { self.visit.push(rev); } + Ok(()) } /// Consumes partially the iterator to tell if the given target @@ -82,7 +143,7 @@ /// This is meant for iterators actually dedicated to that kind of /// purpose pub fn contains(&mut self, target: Revision) -> Result<bool, GraphError> { - if self.seen.contains(&target) && target != NULL_REVISION { + if self.seen.contains(target) && target != NULL_REVISION { return Ok(true); } for item in self { @@ -110,13 +171,14 @@ if self.visit.len() > 0 { return false; } - if self.seen.len() > 1 { + let seen_len = self.seen.len(); + if seen_len > 1 { return false; } // at this point, the seen set is at most a singleton. // If not `self.inclusive`, it's still possible that it has only // the null revision - self.seen.is_empty() || self.seen.contains(&NULL_REVISION) + seen_len == 0 || self.seen.contains(NULL_REVISION) } } @@ -145,13 +207,23 @@ Ok(ps) => ps, Err(e) => return Some(Err(e)), }; - if p1 < self.stoprev || !self.seen.insert(p1) { + let pop = if p1 < self.stoprev { + true + } else { + match self.seen.insert(p1) { + Ok(inserted) => !inserted, + Err(e) => return Some(Err(e)), + } + }; + if pop { self.visit.pop(); } else { *(self.visit.peek_mut().unwrap()) = p1; }; - self.conditionally_push_rev(p2); + if let Err(e) = self.conditionally_push_rev(p2) { + return Some(Err(e)); + } Some(Ok(current)) } } @@ -406,6 +478,28 @@ } #[test] + fn test_descending_revision_set() { + let mut set = DescendingRevisionSet::new(Revision(1_000_000)); + + assert_eq!(set.len(), 0); + assert!(!set.contains(Revision(999_950))); + + assert_eq!(set.insert(Revision(999_950)), Ok(true)); + assert_eq!(set.insert(Revision(999_950)), Ok(false)); + assert_eq!(set.insert(Revision(1_000_000)), Ok(true)); + assert_eq!( + set.insert(Revision(1_000_001)), + Err(GraphError::ParentOutOfOrder(Revision(1_000_001))) + ); + + assert_eq!(set.len(), 2); + assert!(set.contains(Revision(999_950))); + assert!(!set.contains(Revision(999_951))); + assert!(set.contains(Revision(1_000_000))); + assert!(!set.contains(Revision(1_000_001))); + } + + #[test] /// Same tests as test-ancestor.py, without membership /// (see also test-ancestor.py.out) fn test_list_ancestor() {