rust/hg-core/src/bdiff.rs
changeset 52755 1b7a57a5b47a
child 52767 6183949219b2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/bdiff.rs	Wed Dec 18 10:35:01 2024 -0500
@@ -0,0 +1,328 @@
+//! Safe bindings to bdiff.c.
+
+// TODO: Remove when used.
+#![allow(dead_code)]
+
+use crate::errors::HgError;
+use std::marker::PhantomData;
+
+/// A file split into lines, ready for diffing.
+pub struct Lines<'a> {
+    /// The array of lines, allocated by bdiff.c.
+    /// Must never be mutated apart from freeing it in `Drop`.
+    array: *mut ffi::bdiff_line,
+    /// Length of the array.
+    len: u32,
+    /// Lifetime of the source buffer, since array items store pointers.
+    _lifetime: PhantomData<&'a [u8]>,
+}
+
+/// Splits `source` into lines that can be diffed.
+pub fn split_lines(source: &[u8]) -> Result<Lines, HgError> {
+    let mut array = std::ptr::null_mut();
+    // Safety: The pointer and length are valid since they both come from
+    // `source`, and the out pointer is non-null.
+    let result = unsafe {
+        ffi::bdiff_splitlines(
+            source.as_ptr() as *const std::ffi::c_char,
+            source.len() as isize,
+            &mut array,
+        )
+    };
+    match u32::try_from(result) {
+        Ok(len) => {
+            assert!(!array.is_null());
+            Ok(Lines {
+                array,
+                len,
+                _lifetime: PhantomData,
+            })
+        }
+        Err(_) => {
+            Err(HgError::abort_simple("bdiff_splitlines failed to allocate"))
+        }
+    }
+}
+
+impl<'a> Lines<'a> {
+    /// Returns the number of lines.
+    pub fn len(&self) -> usize {
+        self.len as usize
+    }
+
+    /// Returns an iterator over the lines.
+    pub fn iter(&self) -> LinesIter<'_, 'a> {
+        LinesIter {
+            lines: self,
+            index: 0,
+        }
+    }
+}
+
+impl Drop for Lines<'_> {
+    fn drop(&mut self) {
+        // Safety: This is the only place that frees the array (no
+        // double-free), and it's in a `Drop` impl (no use-after-free).
+        unsafe {
+            libc::free(self.array as *mut std::ffi::c_void);
+        }
+    }
+}
+
+// Safety: It is safe for multiple threads to share `&Lines` because
+// `self.array` is never mutated except in `Drop`.
+unsafe impl Sync for Lines<'_> {}
+
+// Safety: It is safe to send `Lines` to a different thread because
+// `self.array` is never copied so only one thread will free it.
+unsafe impl Send for Lines<'_> {}
+
+#[derive(Clone)]
+pub struct LinesIter<'a, 'b> {
+    lines: &'a Lines<'b>,
+    index: usize,
+}
+
+impl<'b> Iterator for LinesIter<'_, 'b> {
+    type Item = &'b [u8];
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.index == self.lines.len() {
+            return None;
+        }
+        // Safety: We just checked that the index has not reached the length.
+        let line = unsafe { *self.lines.array.add(self.index) };
+        self.index += 1;
+        // Safety: We assume bdiff.c sets `l` and `len` correctly.
+        Some(unsafe {
+            std::slice::from_raw_parts(line.l as *const u8, line.len as usize)
+        })
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let len = self.lines.len() - self.index;
+        (len, Some(len))
+    }
+}
+
+impl ExactSizeIterator for LinesIter<'_, '_> {}
+
+/// A diff hunk comparing lines [a1,a2) in file A with lines [b1,b2) in file B.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub struct Hunk {
+    /// Start line index in file A (inclusive).
+    pub a1: u32,
+    /// End line index in file A (exclusive).
+    pub a2: u32,
+    /// Start line index in file B (inclusive).
+    pub b1: u32,
+    /// End line index in file B (exclusive).
+    pub b2: u32,
+}
+
+/// A list of matching hunks.
+pub struct HunkList {
+    /// The head of the linked list, allocated by bdiff.c.
+    head: *mut ffi::bdiff_hunk,
+    /// Length of the list.
+    len: u32,
+}
+
+/// Returns a list of hunks that match in `a` and `b`.
+pub fn diff(a: &Lines, b: &Lines) -> Result<HunkList, HgError> {
+    let mut out = ffi::bdiff_hunk {
+        a1: 0,
+        a2: 0,
+        b1: 0,
+        b2: 0,
+        next: std::ptr::null_mut(),
+    };
+    // Safety: We assume bdiff.c sets `array` and `len` correctly; and the
+    // out pointer is non-null.
+    let result = unsafe {
+        ffi::bdiff_diff(a.array, a.len as i32, b.array, b.len as i32, &mut out)
+    };
+    match u32::try_from(result) {
+        Ok(len) => Ok(HunkList {
+            // Start with out.next because the first hunk is not meaningful and
+            // is not included in len. This matches mercurial/cffi/bdiff.py.
+            head: out.next,
+            len,
+        }),
+        Err(_) => Err(HgError::abort_simple("bdiff_diff failed to allocate")),
+    }
+}
+
+impl HunkList {
+    /// Returns the number of hunks.
+    pub fn len(&self) -> usize {
+        self.len as usize
+    }
+
+    /// Returns an iterator over the hunks.
+    pub fn iter(&self) -> HunkListIter {
+        HunkListIter {
+            // Safety: If `self.head` is null, this is safe. If non-null, then:
+            // - We assume bdiff.c made it properly aligned.
+            // - It's dereferenceable (any bit pattern is ok for `bdiff_hunk`).
+            // - It won't be mutated because `HunkListIter` is tied to `&self`.
+            next: unsafe { self.head.as_ref() },
+            remaining: self.len(),
+        }
+    }
+}
+
+impl Drop for HunkList {
+    fn drop(&mut self) {
+        // Safety: This is the only place that frees `self.head` (no
+        // double-free), and it's in a `Drop` impl (no use-after-free).
+        unsafe {
+            ffi::bdiff_freehunks(self.head);
+        }
+    }
+}
+
+pub struct HunkListIter<'a> {
+    next: Option<&'a ffi::bdiff_hunk>,
+    remaining: usize,
+}
+
+impl Iterator for HunkListIter<'_> {
+    type Item = Hunk;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self.next {
+            Some(hunk) => {
+                // Safety: Same reasoning as in `HunkList::iter`.
+                self.next = unsafe { hunk.next.as_ref() };
+                self.remaining -= 1;
+                debug_assert_eq!(hunk.a2 - hunk.a1, hunk.b2 - hunk.b1);
+                Some(Hunk {
+                    a1: hunk.a1 as u32,
+                    a2: hunk.a2 as u32,
+                    b1: hunk.b1 as u32,
+                    b2: hunk.b2 as u32,
+                })
+            }
+            None => {
+                assert_eq!(self.remaining, 0);
+                None
+            }
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (self.remaining, Some(self.remaining))
+    }
+}
+
+impl ExactSizeIterator for HunkListIter<'_> {}
+
+mod ffi {
+    #![allow(non_camel_case_types)]
+
+    use std::ffi::{c_char, c_int};
+
+    #[repr(C)]
+    #[derive(Debug, Copy, Clone)]
+    pub struct bdiff_line {
+        pub hash: c_int,
+        pub n: c_int,
+        pub e: c_int,
+        pub len: isize,
+        pub l: *const c_char,
+    }
+
+    #[repr(C)]
+    #[derive(Debug, Copy, Clone)]
+    pub struct bdiff_hunk {
+        pub a1: c_int,
+        pub a2: c_int,
+        pub b1: c_int,
+        pub b2: c_int,
+        pub next: *mut bdiff_hunk,
+    }
+
+    #[link(name = "bdiff", kind = "static")]
+    extern "C" {
+        /// Splits `a` into lines. On success, stores a pointer to an array of
+        /// lines in `*lr` and returns its length. On failure, returns
+        /// -1. The caller is responsible for freeing the array.
+        ///
+        /// # Safety
+        ///
+        /// - `a` must point to an array of `len` chars.
+        /// - `lr` must be non-null (but `*lr` can be null).
+        pub fn bdiff_splitlines(
+            a: *const c_char,
+            len: isize,
+            lr: *mut *mut bdiff_line,
+        ) -> c_int;
+
+        /// Diffs `a` and `b`. On success, stores the head of a linked list of
+        /// hunks in `base->next` and returns its length. On failure, returns
+        /// -1. The caller is responsible for `bdiff_freehunks(base->next)`.
+        ///
+        /// # Safety
+        ///
+        /// - `a` must point to an array of `an` lines.
+        /// - `b` must point to an array of `bn` lines.
+        /// - `base` must be non-null.
+        pub fn bdiff_diff(
+            a: *mut bdiff_line,
+            an: c_int,
+            b: *mut bdiff_line,
+            bn: c_int,
+            base: *mut bdiff_hunk,
+        ) -> c_int;
+
+        /// Frees the linked list of hunks `l`.
+        ///
+        /// # Safety
+        ///
+        /// - `l` must be non-null, not already freed, and not used after this.
+        pub fn bdiff_freehunks(l: *mut bdiff_hunk);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    fn split(a: &[u8]) -> Vec<&[u8]> {
+        super::split_lines(a).unwrap().iter().collect()
+    }
+
+    fn diff(a: &[u8], b: &[u8]) -> Vec<(u32, u32, u32, u32)> {
+        let la = super::split_lines(a).unwrap();
+        let lb = super::split_lines(b).unwrap();
+        let hunks = super::diff(&la, &lb).unwrap();
+        hunks.iter().map(|h| (h.a1, h.a2, h.b1, h.b2)).collect()
+    }
+
+    #[test]
+    fn test_split_lines() {
+        assert_eq!(split(b""), [] as [&[u8]; 0]);
+        assert_eq!(split(b"\n"), [b"\n"]);
+        assert_eq!(split(b"\r\n"), [b"\r\n"]);
+        assert_eq!(split(b"X\nY"), [b"X\n" as &[u8], b"Y"]);
+        assert_eq!(split(b"X\nY\n"), [b"X\n" as &[u8], b"Y\n"]);
+        assert_eq!(split(b"X\r\nY\r\n"), [b"X\r\n" as &[u8], b"Y\r\n"]);
+    }
+
+    #[test]
+    fn test_diff_single_line() {
+        assert_eq!(diff(b"", b""), &[(0, 0, 0, 0)]);
+        assert_eq!(diff(b"x", b"x"), &[(0, 1, 0, 1), (1, 1, 1, 1)]);
+        assert_eq!(diff(b"x", b"y"), &[(1, 1, 1, 1)]);
+    }
+
+    #[test]
+    fn test_diff_multiple_lines() {
+        assert_eq!(
+            diff(
+                b" line1 \n line2 \n line3 \n line4 \n REMOVED \n",
+                b" ADDED \n line1 \n lined2_CHANGED \n line3 \n line4 \n"
+            ),
+            &[(0, 1, 1, 2), (2, 4, 3, 5), (5, 5, 5, 5)]
+        );
+    }
+}