diff -r a7dcb7c1ff5a -r 1b7a57a5b47a rust/hg-core/src/bdiff.rs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/bdiff.rs Wed Dec 18 10:35:01 2024 -0500 @@ -0,0 +1,328 @@ +//! Safe bindings to bdiff.c. + +// TODO: Remove when used. +#![allow(dead_code)] + +use crate::errors::HgError; +use std::marker::PhantomData; + +/// A file split into lines, ready for diffing. +pub struct Lines<'a> { + /// The array of lines, allocated by bdiff.c. + /// Must never be mutated apart from freeing it in `Drop`. + array: *mut ffi::bdiff_line, + /// Length of the array. + len: u32, + /// Lifetime of the source buffer, since array items store pointers. + _lifetime: PhantomData<&'a [u8]>, +} + +/// Splits `source` into lines that can be diffed. +pub fn split_lines(source: &[u8]) -> Result { + let mut array = std::ptr::null_mut(); + // Safety: The pointer and length are valid since they both come from + // `source`, and the out pointer is non-null. + let result = unsafe { + ffi::bdiff_splitlines( + source.as_ptr() as *const std::ffi::c_char, + source.len() as isize, + &mut array, + ) + }; + match u32::try_from(result) { + Ok(len) => { + assert!(!array.is_null()); + Ok(Lines { + array, + len, + _lifetime: PhantomData, + }) + } + Err(_) => { + Err(HgError::abort_simple("bdiff_splitlines failed to allocate")) + } + } +} + +impl<'a> Lines<'a> { + /// Returns the number of lines. + pub fn len(&self) -> usize { + self.len as usize + } + + /// Returns an iterator over the lines. + pub fn iter(&self) -> LinesIter<'_, 'a> { + LinesIter { + lines: self, + index: 0, + } + } +} + +impl Drop for Lines<'_> { + fn drop(&mut self) { + // Safety: This is the only place that frees the array (no + // double-free), and it's in a `Drop` impl (no use-after-free). + unsafe { + libc::free(self.array as *mut std::ffi::c_void); + } + } +} + +// Safety: It is safe for multiple threads to share `&Lines` because +// `self.array` is never mutated except in `Drop`. +unsafe impl Sync for Lines<'_> {} + +// Safety: It is safe to send `Lines` to a different thread because +// `self.array` is never copied so only one thread will free it. +unsafe impl Send for Lines<'_> {} + +#[derive(Clone)] +pub struct LinesIter<'a, 'b> { + lines: &'a Lines<'b>, + index: usize, +} + +impl<'b> Iterator for LinesIter<'_, 'b> { + type Item = &'b [u8]; + + fn next(&mut self) -> Option { + if self.index == self.lines.len() { + return None; + } + // Safety: We just checked that the index has not reached the length. + let line = unsafe { *self.lines.array.add(self.index) }; + self.index += 1; + // Safety: We assume bdiff.c sets `l` and `len` correctly. + Some(unsafe { + std::slice::from_raw_parts(line.l as *const u8, line.len as usize) + }) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.lines.len() - self.index; + (len, Some(len)) + } +} + +impl ExactSizeIterator for LinesIter<'_, '_> {} + +/// A diff hunk comparing lines [a1,a2) in file A with lines [b1,b2) in file B. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct Hunk { + /// Start line index in file A (inclusive). + pub a1: u32, + /// End line index in file A (exclusive). + pub a2: u32, + /// Start line index in file B (inclusive). + pub b1: u32, + /// End line index in file B (exclusive). + pub b2: u32, +} + +/// A list of matching hunks. +pub struct HunkList { + /// The head of the linked list, allocated by bdiff.c. + head: *mut ffi::bdiff_hunk, + /// Length of the list. + len: u32, +} + +/// Returns a list of hunks that match in `a` and `b`. +pub fn diff(a: &Lines, b: &Lines) -> Result { + let mut out = ffi::bdiff_hunk { + a1: 0, + a2: 0, + b1: 0, + b2: 0, + next: std::ptr::null_mut(), + }; + // Safety: We assume bdiff.c sets `array` and `len` correctly; and the + // out pointer is non-null. + let result = unsafe { + ffi::bdiff_diff(a.array, a.len as i32, b.array, b.len as i32, &mut out) + }; + match u32::try_from(result) { + Ok(len) => Ok(HunkList { + // Start with out.next because the first hunk is not meaningful and + // is not included in len. This matches mercurial/cffi/bdiff.py. + head: out.next, + len, + }), + Err(_) => Err(HgError::abort_simple("bdiff_diff failed to allocate")), + } +} + +impl HunkList { + /// Returns the number of hunks. + pub fn len(&self) -> usize { + self.len as usize + } + + /// Returns an iterator over the hunks. + pub fn iter(&self) -> HunkListIter { + HunkListIter { + // Safety: If `self.head` is null, this is safe. If non-null, then: + // - We assume bdiff.c made it properly aligned. + // - It's dereferenceable (any bit pattern is ok for `bdiff_hunk`). + // - It won't be mutated because `HunkListIter` is tied to `&self`. + next: unsafe { self.head.as_ref() }, + remaining: self.len(), + } + } +} + +impl Drop for HunkList { + fn drop(&mut self) { + // Safety: This is the only place that frees `self.head` (no + // double-free), and it's in a `Drop` impl (no use-after-free). + unsafe { + ffi::bdiff_freehunks(self.head); + } + } +} + +pub struct HunkListIter<'a> { + next: Option<&'a ffi::bdiff_hunk>, + remaining: usize, +} + +impl Iterator for HunkListIter<'_> { + type Item = Hunk; + + fn next(&mut self) -> Option { + match self.next { + Some(hunk) => { + // Safety: Same reasoning as in `HunkList::iter`. + self.next = unsafe { hunk.next.as_ref() }; + self.remaining -= 1; + debug_assert_eq!(hunk.a2 - hunk.a1, hunk.b2 - hunk.b1); + Some(Hunk { + a1: hunk.a1 as u32, + a2: hunk.a2 as u32, + b1: hunk.b1 as u32, + b2: hunk.b2 as u32, + }) + } + None => { + assert_eq!(self.remaining, 0); + None + } + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.remaining, Some(self.remaining)) + } +} + +impl ExactSizeIterator for HunkListIter<'_> {} + +mod ffi { + #![allow(non_camel_case_types)] + + use std::ffi::{c_char, c_int}; + + #[repr(C)] + #[derive(Debug, Copy, Clone)] + pub struct bdiff_line { + pub hash: c_int, + pub n: c_int, + pub e: c_int, + pub len: isize, + pub l: *const c_char, + } + + #[repr(C)] + #[derive(Debug, Copy, Clone)] + pub struct bdiff_hunk { + pub a1: c_int, + pub a2: c_int, + pub b1: c_int, + pub b2: c_int, + pub next: *mut bdiff_hunk, + } + + #[link(name = "bdiff", kind = "static")] + extern "C" { + /// Splits `a` into lines. On success, stores a pointer to an array of + /// lines in `*lr` and returns its length. On failure, returns + /// -1. The caller is responsible for freeing the array. + /// + /// # Safety + /// + /// - `a` must point to an array of `len` chars. + /// - `lr` must be non-null (but `*lr` can be null). + pub fn bdiff_splitlines( + a: *const c_char, + len: isize, + lr: *mut *mut bdiff_line, + ) -> c_int; + + /// Diffs `a` and `b`. On success, stores the head of a linked list of + /// hunks in `base->next` and returns its length. On failure, returns + /// -1. The caller is responsible for `bdiff_freehunks(base->next)`. + /// + /// # Safety + /// + /// - `a` must point to an array of `an` lines. + /// - `b` must point to an array of `bn` lines. + /// - `base` must be non-null. + pub fn bdiff_diff( + a: *mut bdiff_line, + an: c_int, + b: *mut bdiff_line, + bn: c_int, + base: *mut bdiff_hunk, + ) -> c_int; + + /// Frees the linked list of hunks `l`. + /// + /// # Safety + /// + /// - `l` must be non-null, not already freed, and not used after this. + pub fn bdiff_freehunks(l: *mut bdiff_hunk); + } +} + +#[cfg(test)] +mod tests { + fn split(a: &[u8]) -> Vec<&[u8]> { + super::split_lines(a).unwrap().iter().collect() + } + + fn diff(a: &[u8], b: &[u8]) -> Vec<(u32, u32, u32, u32)> { + let la = super::split_lines(a).unwrap(); + let lb = super::split_lines(b).unwrap(); + let hunks = super::diff(&la, &lb).unwrap(); + hunks.iter().map(|h| (h.a1, h.a2, h.b1, h.b2)).collect() + } + + #[test] + fn test_split_lines() { + assert_eq!(split(b""), [] as [&[u8]; 0]); + assert_eq!(split(b"\n"), [b"\n"]); + assert_eq!(split(b"\r\n"), [b"\r\n"]); + assert_eq!(split(b"X\nY"), [b"X\n" as &[u8], b"Y"]); + assert_eq!(split(b"X\nY\n"), [b"X\n" as &[u8], b"Y\n"]); + assert_eq!(split(b"X\r\nY\r\n"), [b"X\r\n" as &[u8], b"Y\r\n"]); + } + + #[test] + fn test_diff_single_line() { + assert_eq!(diff(b"", b""), &[(0, 0, 0, 0)]); + assert_eq!(diff(b"x", b"x"), &[(0, 1, 0, 1), (1, 1, 1, 1)]); + assert_eq!(diff(b"x", b"y"), &[(1, 1, 1, 1)]); + } + + #[test] + fn test_diff_multiple_lines() { + assert_eq!( + diff( + b" line1 \n line2 \n line3 \n line4 \n REMOVED \n", + b" ADDED \n line1 \n lined2_CHANGED \n line3 \n line4 \n" + ), + &[(0, 1, 1, 2), (2, 4, 3, 5), (5, 5, 5, 5)] + ); + } +}