view rust/hg-core/src/pre_regex.rs @ 52769:1b7a57a5b47a

rust: add safe bindings to bdiff.c I wrote C FFI bindings manually rather than using a bindgen build step because there are only 2 structs and 3 functions and they're not going to change. Note that the relative path in build.rs means that cargo publish will no longer work. If in the future we want to publish to crates.io, we would probably need to add a Makefile step that copies bdiff sources into the hg-core crate.
author Mitchell Kember <mkember@janestreet.com>
date Wed, 18 Dec 2024 10:35:01 -0500
parents b89c934e6269
children
line wrap: on
line source

use core::str;

use lazy_static::lazy_static;

use crate::filepatterns::PatternError;

lazy_static! {
    /// Escape lookup table indexed by byte value (256 entries).
    ///
    /// Each entry holds the byte itself, preceded by a backslash when the
    /// byte belongs to the set of characters that get escaped.
    static ref RE_ESCAPE: Vec<Vec<u8>> = {
        let special: &[u8] = b"()[]{}?*+-|^$\\.&~#\t\n\r\x0b\x0c";
        (0..=255u8)
            .map(|byte| {
                if special.contains(&byte) {
                    vec![b'\\', byte]
                } else {
                    vec![byte]
                }
            })
            .collect()
    };
}

/// Returns the escaped form of the byte `c`, looked up in `RE_ESCAPE`.
///
/// The result is either the byte alone or the byte preceded by a
/// backslash, depending on how the table was built.
pub fn escape_char_for_re(c: u8) -> &'static [u8] {
    let index = usize::from(c);
    RE_ESCAPE[index].as_slice()
}

/// An intermediate regular expression representation, that can be used
/// both to compile down to a `Regex` for matching, or converted to
/// a string directly for diagnostics.
///
/// Every variant has two translations: an HIR lowering (see `to_hir`) and
/// a textual rendering (see `to_bytes`).
#[derive(Debug, Clone)]
pub enum PreRegex {
    /// The empty expression; renders as nothing.
    Empty,
    /// Any single byte; renders as `.`.
    Dot,
    /// Zero or more of any byte; renders as `.*`.
    DotStar,
    /// End of the input; renders as `$`.
    Eof,
    /// Zero or more bytes other than `/`; renders as `[^/]*`.
    NonslashStar,
    /// One specific byte; escaped with `escape_char_for_re` when rendered.
    Byte(u8),
    /// A literal byte string; each byte is escaped with
    /// `escape_char_for_re` when rendered.
    Bytes(Vec<u8>),
    /// Either a `/` byte or the end of the input; renders as `(?:/|$)`.
    SlashOrEof,
    /// An already-parsed regular expression (as built by `parse`): its HIR
    /// paired with the original source bytes, which are rendered verbatim.
    Re((regex_syntax::hir::Hir, Vec<u8>)),
    /// Zero or one occurrence of the inner expression; renders as
    /// `(?:…)?`.
    Maybe(Box<Self>),
    /// Any one of the given expressions; renders as `(?:a|b|…)`. An empty
    /// list renders as ` ^`, which is meant to never match.
    Alternation(Vec<Self>),
    /// The given expressions one after another; renders as their
    /// concatenation.
    Sequence(Vec<Self>),
}

mod to_hir {
    use itertools::Itertools;
    use regex_syntax::hir::{
        Class, ClassBytes, ClassBytesRange, Dot, Hir, Look, Repetition,
    };

    use super::PreRegex;

    /// Builds a byte class containing exactly the byte `b`.
    fn single_byte_class(b: u8) -> Class {
        Class::Bytes(ClassBytes::new([ClassBytesRange::new(b, b)]))
    }

    /// Wraps `inner` in a non-greedy, unbounded "zero or more" repetition.
    fn zero_or_more(inner: Hir) -> Hir {
        let repetition = Repetition {
            min: 0,
            max: None,
            greedy: false,
            sub: Box::new(inner),
        };
        Hir::repetition(repetition)
    }

    /// HIR for the `Look::End` assertion.
    fn end_of_input() -> Hir {
        Hir::look(Look::End)
    }

    /// HIR for any single byte.
    fn any_byte() -> Hir {
        Hir::dot(Dot::AnyByte)
    }

    /// HIR for any single byte except `/`.
    fn not_slash() -> Hir {
        let mut class = single_byte_class(b'/');
        class.negate();
        Hir::class(class)
    }

    /// HIR for exactly the byte `b`, expressed as a one-element class.
    fn exact_byte(b: u8) -> Hir {
        Hir::class(single_byte_class(b))
    }

    /// HIR for the literal byte string `text`.
    fn literal(text: &[u8]) -> Hir {
        Hir::literal(Box::<[u8]>::from(text))
    }

    /// Lowers every element of `parts`, preserving order.
    fn lower_all(parts: &[PreRegex]) -> Vec<Hir> {
        parts.iter().map(PreRegex::to_hir).collect_vec()
    }

    /// Lowers a `PreRegex` tree into a `regex_syntax` HIR.
    ///
    /// A `PreRegex::Re` node contributes a clone of the HIR it already
    /// carries; all other nodes are built from scratch.
    pub(crate) fn to_hir(re: &PreRegex) -> regex_syntax::hir::Hir {
        match re {
            PreRegex::Empty => Hir::empty(),
            PreRegex::Dot => any_byte(),
            PreRegex::DotStar => zero_or_more(any_byte()),
            PreRegex::Eof => end_of_input(),
            PreRegex::NonslashStar => zero_or_more(not_slash()),
            PreRegex::Byte(b) => exact_byte(*b),
            PreRegex::Bytes(bs) => literal(bs),
            PreRegex::SlashOrEof => {
                let branches = vec![exact_byte(b'/'), end_of_input()];
                Hir::alternation(branches)
            }
            PreRegex::Re((hir, _)) => hir.clone(),
            PreRegex::Maybe(inner) => {
                // "optional" is expressed as: empty, or the inner pattern
                let branches = vec![Hir::empty(), inner.to_hir()];
                Hir::alternation(branches)
            }
            PreRegex::Alternation(alt) => Hir::alternation(lower_all(alt)),
            PreRegex::Sequence(seq) => Hir::concat(lower_all(seq)),
        }
    }
}

impl PreRegex {
    /// Lowers this expression into a `regex_syntax` HIR.
    pub fn to_hir(&self) -> regex_syntax::hir::Hir {
        to_hir::to_hir(self)
    }

    /// Appends the textual form of this expression to `out`, recursing
    /// into nested expressions.
    fn to_bytes_rec(&self, out: &mut Vec<u8>) {
        match self {
            Self::Empty => {}
            Self::Dot => out.push(b'.'),
            Self::DotStar => out.extend_from_slice(b".*"),
            Self::Eof => out.push(b'$'),
            Self::NonslashStar => out.extend_from_slice(b"[^/]*"),
            Self::Byte(byte) => {
                out.extend_from_slice(escape_char_for_re(*byte));
            }
            Self::Bytes(literal) => {
                for &byte in literal {
                    out.extend_from_slice(escape_char_for_re(byte));
                }
            }
            Self::SlashOrEof => out.extend_from_slice(b"(?:/|$)"),
            // The original source text is emitted verbatim.
            Self::Re((_, source)) => out.extend_from_slice(source),
            Self::Alternation(branches) if branches.is_empty() => {
                // something that can never match
                out.extend_from_slice(b" ^");
            }
            Self::Alternation(branches) => {
                out.extend_from_slice(b"(?:");
                for (position, branch) in branches.iter().enumerate() {
                    // `|` goes between branches, not before the first one
                    if position > 0 {
                        out.push(b'|');
                    }
                    branch.to_bytes_rec(out);
                }
                out.push(b')');
            }
            Self::Sequence(parts) => {
                parts.iter().for_each(|part| part.to_bytes_rec(out));
            }
            Self::Maybe(inner) => {
                out.extend_from_slice(b"(?:");
                inner.to_bytes_rec(out);
                out.extend_from_slice(b")?");
            }
        }
    }

    /// Parses `re` as a regular expression and wraps it in `Self::Re`,
    /// keeping a copy of the source bytes next to the parsed HIR.
    ///
    /// # Errors
    ///
    /// Returns `PatternError::UnsupportedSyntax` (carrying the underlying
    /// error message) if `re` is not valid UTF-8 or if `regex_syntax`
    /// fails to parse it.
    pub fn parse(re: &[u8]) -> Result<Self, PatternError> {
        let source = match str::from_utf8(re) {
            Ok(text) => text,
            Err(err) => {
                return Err(PatternError::UnsupportedSyntax(err.to_string()))
            }
        };
        let hir = match regex_syntax::parse(source) {
            Ok(hir) => hir,
            Err(err) => {
                return Err(PatternError::UnsupportedSyntax(err.to_string()))
            }
        };
        Ok(Self::Re((hir, re.to_vec())))
    }

    /// Renders this expression as regular expression source bytes.
    pub fn to_bytes(&self) -> Vec<u8> {
        let mut rendered = Vec::new();
        self.to_bytes_rec(&mut rendered);
        rendered
    }

    /// Builds an expression for the literal bytes of `prefix`.
    pub fn literal(prefix: &[u8]) -> PreRegex {
        Self::Bytes(Vec::from(prefix))
    }

    /// Builds an optional "anything followed by `/`" expression, i.e. the
    /// equivalent of `(?:.*/)?`.
    pub fn preceding_dir_components() -> Self {
        let components = vec![Self::DotStar, Self::Byte(b'/')];
        let leading_dirs = Self::Sequence(components);
        Self::Maybe(Box::new(leading_dirs))
    }
}