Mercurial > public > mercurial-scm > hg-stable
changeset 52769:1b7a57a5b47a
rust: add safe bindings to bdiff.c
I wrote C FFI bindings manually rather than using a bindgen build step because
there are only 2 structs and 3 functions and they're not going to change.
Note that the relative path in build.rs means that cargo publish will no longer
work. If in the future we want to publish to crates.io, we would probably need
to add a Makefile step that copies bdiff sources into the hg-core crate.
author | Mitchell Kember <mkember@janestreet.com> |
---|---|
date | Wed, 18 Dec 2024 10:35:01 -0500 |
parents | a7dcb7c1ff5a |
children | bbf1c52252ae |
files | rust/Cargo.lock rust/hg-core/Cargo.toml rust/hg-core/build.rs rust/hg-core/src/bdiff.rs rust/hg-core/src/lib.rs |
diffstat | 5 files changed, 344 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/rust/Cargo.lock Fri Feb 07 13:48:50 2025 +0100 +++ b/rust/Cargo.lock Wed Dec 18 10:35:01 2024 -0500 @@ -652,6 +652,7 @@ "bitvec", "byteorder", "bytes-cast", + "cc", "chrono", "clap", "crossbeam-channel",
--- a/rust/hg-core/Cargo.toml Fri Feb 07 13:48:50 2025 +0100 +++ b/rust/hg-core/Cargo.toml Wed Dec 18 10:35:01 2024 -0500 @@ -58,6 +58,9 @@ features = ["zlib"] default-features = false +[build-dependencies] +cc = "1.0" + [dev-dependencies] clap = { version = "4", features = ["derive"] } pretty_assertions = "1.1.0"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/build.rs Wed Dec 18 10:35:01 2024 -0500 @@ -0,0 +1,11 @@ +fn main() { + // The relative paths work locally but won't if published to crates.io. + println!("cargo::rerun-if-changed=../../mercurial/bdiff.c"); + println!("cargo::rerun-if-changed=../../mercurial/bdiff.h"); + println!("cargo::rerun-if-changed=../../mercurial/compat.h"); + println!("cargo::rerun-if-changed=../../mercurial/bitmanipulation.h"); + cc::Build::new() + .warnings(true) + .file("../../mercurial/bdiff.c") + .compile("bdiff"); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/bdiff.rs Wed Dec 18 10:35:01 2024 -0500 @@ -0,0 +1,328 @@ +//! Safe bindings to bdiff.c. + +// TODO: Remove when used. +#![allow(dead_code)] + +use crate::errors::HgError; +use std::marker::PhantomData; + +/// A file split into lines, ready for diffing. +pub struct Lines<'a> { + /// The array of lines, allocated by bdiff.c. + /// Must never be mutated apart from freeing it in `Drop`. + array: *mut ffi::bdiff_line, + /// Length of the array. + len: u32, + /// Lifetime of the source buffer, since array items store pointers. + _lifetime: PhantomData<&'a [u8]>, +} + +/// Splits `source` into lines that can be diffed. +pub fn split_lines(source: &[u8]) -> Result<Lines, HgError> { + let mut array = std::ptr::null_mut(); + // Safety: The pointer and length are valid since they both come from + // `source`, and the out pointer is non-null. + let result = unsafe { + ffi::bdiff_splitlines( + source.as_ptr() as *const std::ffi::c_char, + source.len() as isize, + &mut array, + ) + }; + match u32::try_from(result) { + Ok(len) => { + assert!(!array.is_null()); + Ok(Lines { + array, + len, + _lifetime: PhantomData, + }) + } + Err(_) => { + Err(HgError::abort_simple("bdiff_splitlines failed to allocate")) + } + } +} + +impl<'a> Lines<'a> { + /// Returns the number of lines. + pub fn len(&self) -> usize { + self.len as usize + } + + /// Returns an iterator over the lines. + pub fn iter(&self) -> LinesIter<'_, 'a> { + LinesIter { + lines: self, + index: 0, + } + } +} + +impl Drop for Lines<'_> { + fn drop(&mut self) { + // Safety: This is the only place that frees the array (no + // double-free), and it's in a `Drop` impl (no use-after-free). + unsafe { + libc::free(self.array as *mut std::ffi::c_void); + } + } +} + +// Safety: It is safe for multiple threads to share `&Lines` because +// `self.array` is never mutated except in `Drop`. +unsafe impl Sync for Lines<'_> {} + +// Safety: It is safe to send `Lines` to a different thread because +// `self.array` is never copied so only one thread will free it. +unsafe impl Send for Lines<'_> {} + +#[derive(Clone)] +pub struct LinesIter<'a, 'b> { + lines: &'a Lines<'b>, + index: usize, +} + +impl<'b> Iterator for LinesIter<'_, 'b> { + type Item = &'b [u8]; + + fn next(&mut self) -> Option<Self::Item> { + if self.index == self.lines.len() { + return None; + } + // Safety: We just checked that the index has not reached the length. + let line = unsafe { *self.lines.array.add(self.index) }; + self.index += 1; + // Safety: We assume bdiff.c sets `l` and `len` correctly. + Some(unsafe { + std::slice::from_raw_parts(line.l as *const u8, line.len as usize) + }) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + let len = self.lines.len() - self.index; + (len, Some(len)) + } +} + +impl ExactSizeIterator for LinesIter<'_, '_> {} + +/// A diff hunk comparing lines [a1,a2) in file A with lines [b1,b2) in file B. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct Hunk { + /// Start line index in file A (inclusive). + pub a1: u32, + /// End line index in file A (exclusive). + pub a2: u32, + /// Start line index in file B (inclusive). + pub b1: u32, + /// End line index in file B (exclusive). + pub b2: u32, +} + +/// A list of matching hunks. +pub struct HunkList { + /// The head of the linked list, allocated by bdiff.c. + head: *mut ffi::bdiff_hunk, + /// Length of the list. + len: u32, +} + +/// Returns a list of hunks that match in `a` and `b`. +pub fn diff(a: &Lines, b: &Lines) -> Result<HunkList, HgError> { + let mut out = ffi::bdiff_hunk { + a1: 0, + a2: 0, + b1: 0, + b2: 0, + next: std::ptr::null_mut(), + }; + // Safety: We assume bdiff.c sets `array` and `len` correctly; and the + // out pointer is non-null. + let result = unsafe { + ffi::bdiff_diff(a.array, a.len as i32, b.array, b.len as i32, &mut out) + }; + match u32::try_from(result) { + Ok(len) => Ok(HunkList { + // Start with out.next because the first hunk is not meaningful and + // is not included in len. This matches mercurial/cffi/bdiff.py. + head: out.next, + len, + }), + Err(_) => Err(HgError::abort_simple("bdiff_diff failed to allocate")), + } +} + +impl HunkList { + /// Returns the number of hunks. + pub fn len(&self) -> usize { + self.len as usize + } + + /// Returns an iterator over the hunks. + pub fn iter(&self) -> HunkListIter { + HunkListIter { + // Safety: If `self.head` is null, this is safe. If non-null, then: + // - We assume bdiff.c made it properly aligned. + // - It's dereferenceable (any bit pattern is ok for `bdiff_hunk`). + // - It won't be mutated because `HunkListIter` is tied to `&self`. + next: unsafe { self.head.as_ref() }, + remaining: self.len(), + } + } +} + +impl Drop for HunkList { + fn drop(&mut self) { + // Safety: This is the only place that frees `self.head` (no + // double-free), and it's in a `Drop` impl (no use-after-free). + unsafe { + ffi::bdiff_freehunks(self.head); + } + } +} + +pub struct HunkListIter<'a> { + next: Option<&'a ffi::bdiff_hunk>, + remaining: usize, +} + +impl Iterator for HunkListIter<'_> { + type Item = Hunk; + + fn next(&mut self) -> Option<Self::Item> { + match self.next { + Some(hunk) => { + // Safety: Same reasoning as in `HunkList::iter`. + self.next = unsafe { hunk.next.as_ref() }; + self.remaining -= 1; + debug_assert_eq!(hunk.a2 - hunk.a1, hunk.b2 - hunk.b1); + Some(Hunk { + a1: hunk.a1 as u32, + a2: hunk.a2 as u32, + b1: hunk.b1 as u32, + b2: hunk.b2 as u32, + }) + } + None => { + assert_eq!(self.remaining, 0); + None + } + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + (self.remaining, Some(self.remaining)) + } +} + +impl ExactSizeIterator for HunkListIter<'_> {} + +mod ffi { + #![allow(non_camel_case_types)] + + use std::ffi::{c_char, c_int}; + + #[repr(C)] + #[derive(Debug, Copy, Clone)] + pub struct bdiff_line { + pub hash: c_int, + pub n: c_int, + pub e: c_int, + pub len: isize, + pub l: *const c_char, + } + + #[repr(C)] + #[derive(Debug, Copy, Clone)] + pub struct bdiff_hunk { + pub a1: c_int, + pub a2: c_int, + pub b1: c_int, + pub b2: c_int, + pub next: *mut bdiff_hunk, + } + + #[link(name = "bdiff", kind = "static")] + extern "C" { + /// Splits `a` into lines. On success, stores a pointer to an array of + /// lines in `*lr` and returns its length. On failure, returns + /// -1. The caller is responsible for freeing the array. + /// + /// # Safety + /// + /// - `a` must point to an array of `len` chars. + /// - `lr` must be non-null (but `*lr` can be null). + pub fn bdiff_splitlines( + a: *const c_char, + len: isize, + lr: *mut *mut bdiff_line, + ) -> c_int; + + /// Diffs `a` and `b`. On success, stores the head of a linked list of + /// hunks in `base->next` and returns its length. On failure, returns + /// -1. The caller is responsible for `bdiff_freehunks(base->next)`. + /// + /// # Safety + /// + /// - `a` must point to an array of `an` lines. + /// - `b` must point to an array of `bn` lines. + /// - `base` must be non-null. + pub fn bdiff_diff( + a: *mut bdiff_line, + an: c_int, + b: *mut bdiff_line, + bn: c_int, + base: *mut bdiff_hunk, + ) -> c_int; + + /// Frees the linked list of hunks `l`. + /// + /// # Safety + /// + /// - `l` must be non-null, not already freed, and not used after this. + pub fn bdiff_freehunks(l: *mut bdiff_hunk); + } +} + +#[cfg(test)] +mod tests { + fn split(a: &[u8]) -> Vec<&[u8]> { + super::split_lines(a).unwrap().iter().collect() + } + + fn diff(a: &[u8], b: &[u8]) -> Vec<(u32, u32, u32, u32)> { + let la = super::split_lines(a).unwrap(); + let lb = super::split_lines(b).unwrap(); + let hunks = super::diff(&la, &lb).unwrap(); + hunks.iter().map(|h| (h.a1, h.a2, h.b1, h.b2)).collect() + } + + #[test] + fn test_split_lines() { + assert_eq!(split(b""), [] as [&[u8]; 0]); + assert_eq!(split(b"\n"), [b"\n"]); + assert_eq!(split(b"\r\n"), [b"\r\n"]); + assert_eq!(split(b"X\nY"), [b"X\n" as &[u8], b"Y"]); + assert_eq!(split(b"X\nY\n"), [b"X\n" as &[u8], b"Y\n"]); + assert_eq!(split(b"X\r\nY\r\n"), [b"X\r\n" as &[u8], b"Y\r\n"]); + } + + #[test] + fn test_diff_single_line() { + assert_eq!(diff(b"", b""), &[(0, 0, 0, 0)]); + assert_eq!(diff(b"x", b"x"), &[(0, 1, 0, 1), (1, 1, 1, 1)]); + assert_eq!(diff(b"x", b"y"), &[(1, 1, 1, 1)]); + } + + #[test] + fn test_diff_multiple_lines() { + assert_eq!( + diff( + b" line1 \n line2 \n line3 \n line4 \n REMOVED \n", + b" ADDED \n line1 \n lined2_CHANGED \n line3 \n line4 \n" + ), + &[(0, 1, 1, 2), (2, 4, 3, 5), (5, 5, 5, 5)] + ); + } +}