view rust/hg-core/src/bdiff.rs @ 53017:3ac28aa1430e

rust-annotate: do not impl Sync for Lines I failed to notice before that bdiff_diff mutates its inputs by storing bookkeeping information in the `n` and `e` fields. This means &Lines can't be shared across threads, so Lines cannot be Sync. Fortunately nothing was using this. The parallel file reading and line splitting only relies on Send. The reason I didn't notice this to begin with was that Lines contains a *mut ffi::bdiffline, which is accessible from &Lines, so I never wrote or thought about &mut Lines. This pattern (using a mutable pointer stored in an immutable struct) would be expressed with RefCell if this was pure safe Rust.
author Mitchell Kember <mkember@janestreet.com>
date Mon, 24 Feb 2025 16:56:09 -0500
parents 6183949219b2
children
line wrap: on
line source

//! Safe bindings to bdiff.c.

use crate::errors::HgError;
use std::marker::PhantomData;

/// A file split into lines, ready for diffing.
pub struct Lines<'a> {
    /// The array of lines, allocated by bdiff.c.
    /// Must never be mutated by Rust code apart from freeing it in `Drop`.
    array: *mut ffi::bdiff_line,
    /// Length of the array.
    len: u32,
    /// Lifetime of the source buffer, since array items store pointers.
    _lifetime: PhantomData<&'a [u8]>,
}

/// Splits `source` into lines that can be diffed.
pub fn split_lines(source: &[u8]) -> Result<Lines, HgError> {
    let mut array = std::ptr::null_mut();
    // Safety: The pointer and length are valid since they both come from
    // `source`, and the out pointer is non-null.
    let result = unsafe {
        ffi::bdiff_splitlines(
            source.as_ptr() as *const std::ffi::c_char,
            source.len() as isize,
            &mut array,
        )
    };
    match u32::try_from(result) {
        Ok(len) => {
            assert!(!array.is_null());
            Ok(Lines {
                array,
                len,
                _lifetime: PhantomData,
            })
        }
        Err(_) => {
            Err(HgError::abort_simple("bdiff_splitlines failed to allocate"))
        }
    }
}

impl<'a> Lines<'a> {
    /// Returns the number of lines.
    pub fn len(&self) -> usize {
        self.len as usize
    }

    /// Returns an iterator over the lines.
    pub fn iter(&self) -> LinesIter<'_, 'a> {
        LinesIter {
            lines: self,
            index: 0,
        }
    }
}

impl Drop for Lines<'_> {
    fn drop(&mut self) {
        // Safety: This is the only place that frees the array (no
        // double-free), and it's in a `Drop` impl (no use-after-free).
        unsafe {
            libc::free(self.array as *mut std::ffi::c_void);
        }
    }
}

// Safety: It is safe to send `Lines` to a different thread because
// `self.array` is never copied so only one thread will free it.
unsafe impl Send for Lines<'_> {}

// It is *not* safe to share `&Lines` between threads because `ffi::bdiff_diff`
// mutates lines by storing bookkeeping information in `n` and `e`.
static_assertions_next::assert_impl!(Lines<'_>: !Sync);

#[derive(Clone)]
pub struct LinesIter<'a, 'b> {
    lines: &'a Lines<'b>,
    index: usize,
}

impl<'b> Iterator for LinesIter<'_, 'b> {
    type Item = &'b [u8];

    fn next(&mut self) -> Option<Self::Item> {
        if self.index == self.lines.len() {
            return None;
        }
        // Safety: We just checked that the index has not reached the length.
        let line = unsafe { *self.lines.array.add(self.index) };
        self.index += 1;
        // Safety: We assume bdiff.c sets `l` and `len` correctly.
        Some(unsafe {
            std::slice::from_raw_parts(line.l as *const u8, line.len as usize)
        })
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let len = self.lines.len() - self.index;
        (len, Some(len))
    }
}

impl ExactSizeIterator for LinesIter<'_, '_> {}

/// A diff hunk comparing lines [a1,a2) in file A with lines [b1,b2) in file B.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct Hunk {
    /// Start line index in file A (inclusive).
    pub a1: u32,
    /// End line index in file A (exclusive).
    pub a2: u32,
    /// Start line index in file B (inclusive).
    pub b1: u32,
    /// End line index in file B (exclusive).
    pub b2: u32,
}

/// A list of matching hunks.
pub struct HunkList {
    /// The head of the linked list, allocated by bdiff.c.
    head: *mut ffi::bdiff_hunk,
    /// Length of the list.
    len: u32,
}

/// Returns a list of hunks that match in `a` and `b`.
pub fn diff(a: &Lines, b: &Lines) -> Result<HunkList, HgError> {
    let mut out = ffi::bdiff_hunk {
        a1: 0,
        a2: 0,
        b1: 0,
        b2: 0,
        next: std::ptr::null_mut(),
    };
    // Safety: We assume bdiff.c sets `array` and `len` correctly; and the
    // out pointer is non-null.
    let result = unsafe {
        ffi::bdiff_diff(a.array, a.len as i32, b.array, b.len as i32, &mut out)
    };
    match u32::try_from(result) {
        Ok(len) => Ok(HunkList {
            // Start with out.next because the first hunk is not meaningful and
            // is not included in len. This matches mercurial/cffi/bdiff.py.
            head: out.next,
            len,
        }),
        Err(_) => Err(HgError::abort_simple("bdiff_diff failed to allocate")),
    }
}

impl HunkList {
    /// Returns the number of hunks.
    pub fn len(&self) -> usize {
        self.len as usize
    }

    /// Returns an iterator over the hunks.
    pub fn iter(&self) -> HunkListIter {
        HunkListIter {
            // Safety: If `self.head` is null, this is safe. If non-null, then:
            // - We assume bdiff.c made it properly aligned.
            // - It's dereferenceable (any bit pattern is ok for `bdiff_hunk`).
            // - It won't be mutated because `HunkListIter` is tied to `&self`.
            next: unsafe { self.head.as_ref() },
            remaining: self.len(),
        }
    }
}

impl Drop for HunkList {
    fn drop(&mut self) {
        // Safety: This is the only place that frees `self.head` (no
        // double-free), and it's in a `Drop` impl (no use-after-free).
        unsafe {
            ffi::bdiff_freehunks(self.head);
        }
    }
}

pub struct HunkListIter<'a> {
    next: Option<&'a ffi::bdiff_hunk>,
    remaining: usize,
}

impl Iterator for HunkListIter<'_> {
    type Item = Hunk;

    fn next(&mut self) -> Option<Self::Item> {
        match self.next {
            Some(hunk) => {
                // Safety: Same reasoning as in `HunkList::iter`.
                self.next = unsafe { hunk.next.as_ref() };
                self.remaining -= 1;
                debug_assert_eq!(hunk.a2 - hunk.a1, hunk.b2 - hunk.b1);
                Some(Hunk {
                    a1: hunk.a1 as u32,
                    a2: hunk.a2 as u32,
                    b1: hunk.b1 as u32,
                    b2: hunk.b2 as u32,
                })
            }
            None => {
                assert_eq!(self.remaining, 0);
                None
            }
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        (self.remaining, Some(self.remaining))
    }
}

impl ExactSizeIterator for HunkListIter<'_> {}

mod ffi {
    #![allow(non_camel_case_types)]

    use std::ffi::{c_char, c_int};

    #[repr(C)]
    #[derive(Debug, Copy, Clone)]
    pub struct bdiff_line {
        pub hash: c_int,
        pub n: c_int,
        pub e: c_int,
        pub len: isize,
        pub l: *const c_char,
    }

    #[repr(C)]
    #[derive(Debug, Copy, Clone)]
    pub struct bdiff_hunk {
        pub a1: c_int,
        pub a2: c_int,
        pub b1: c_int,
        pub b2: c_int,
        pub next: *mut bdiff_hunk,
    }

    #[link(name = "bdiff", kind = "static")]
    extern "C" {
        /// Splits `a` into lines. On success, stores a pointer to an array of
        /// lines in `*lr` and returns its length. On failure, returns
        /// -1. The caller is responsible for freeing the array.
        ///
        /// # Safety
        ///
        /// - `a` must point to an array of `len` chars.
        /// - `lr` must be non-null (but `*lr` can be null).
        pub fn bdiff_splitlines(
            a: *const c_char,
            len: isize,
            lr: *mut *mut bdiff_line,
        ) -> c_int;

        /// Diffs `a` and `b`. On success, stores the head of a linked list of
        /// hunks in `base->next` and returns its length. On failure, returns
        /// -1. The caller is responsible for `bdiff_freehunks(base->next)`.
        ///
        /// # Safety
        ///
        /// - `a` must point to an array of `an` lines.
        /// - `b` must point to an array of `bn` lines.
        /// - `base` must be non-null.
        pub fn bdiff_diff(
            a: *mut bdiff_line,
            an: c_int,
            b: *mut bdiff_line,
            bn: c_int,
            base: *mut bdiff_hunk,
        ) -> c_int;

        /// Frees the linked list of hunks `l`.
        ///
        /// # Safety
        ///
        /// - `l` must be non-null, not already freed, and not used after this.
        pub fn bdiff_freehunks(l: *mut bdiff_hunk);
    }
}

#[cfg(test)]
mod tests {
    fn split(a: &[u8]) -> Vec<&[u8]> {
        super::split_lines(a).unwrap().iter().collect()
    }

    fn diff(a: &[u8], b: &[u8]) -> Vec<(u32, u32, u32, u32)> {
        let la = super::split_lines(a).unwrap();
        let lb = super::split_lines(b).unwrap();
        let hunks = super::diff(&la, &lb).unwrap();
        hunks.iter().map(|h| (h.a1, h.a2, h.b1, h.b2)).collect()
    }

    #[test]
    fn test_split_lines() {
        assert_eq!(split(b""), [] as [&[u8]; 0]);
        assert_eq!(split(b"\n"), [b"\n"]);
        assert_eq!(split(b"\r\n"), [b"\r\n"]);
        assert_eq!(split(b"X\nY"), [b"X\n" as &[u8], b"Y"]);
        assert_eq!(split(b"X\nY\n"), [b"X\n" as &[u8], b"Y\n"]);
        assert_eq!(split(b"X\r\nY\r\n"), [b"X\r\n" as &[u8], b"Y\r\n"]);
    }

    #[test]
    fn test_diff_single_line() {
        assert_eq!(diff(b"", b""), &[(0, 0, 0, 0)]);
        assert_eq!(diff(b"x", b"x"), &[(0, 1, 0, 1), (1, 1, 1, 1)]);
        assert_eq!(diff(b"x", b"y"), &[(1, 1, 1, 1)]);
    }

    #[test]
    fn test_diff_multiple_lines() {
        assert_eq!(
            diff(
                b" line1 \n line2 \n line3 \n line4 \n REMOVED \n",
                b" ADDED \n line1 \n lined2_CHANGED \n line3 \n line4 \n"
            ),
            &[(0, 1, 1, 2), (2, 4, 3, 5), (5, 5, 5, 5)]
        );
    }
}