view rust/hg-core/src/utils/files.rs @ 52781:6183949219b2

rhg: implement rhg annotate This initial implementation produces the same output as Python for all the files I've tried, and is usually 1.5-9x faster. The algorithm is mostly the same, but one key difference is that the Rust implementation only converts filelog revisions to changelog revisions if they will actually appear in the output. This does not support all the command line flags yet. In particular, --template, --include, --exclude, --skip, and whitespace-related flags will cause fallback to Python. Also, --rev 'wdir()' (often used by editor plugins) is not supported. There is also no pager.
author Mitchell Kember <mkember@janestreet.com>
date Fri, 24 Jan 2025 12:01:12 -0500
parents 94e2547e6f3d
children
line wrap: on
line source

// files.rs
//
// Copyright 2019
// Raphaël Gomès <rgomes@octobus.net>,
// Yuya Nishihara <yuya@tcha.org>
//
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2 or any later version.

//! Functions for fiddling with files.

use crate::utils::{
    hg_path::{path_to_hg_path_buf, HgPath, HgPathBuf, HgPathError},
    path_auditor::PathAuditor,
    strings::replace_slice,
};
use lazy_static::lazy_static;
use same_file::is_same_file;
use std::ffi::{OsStr, OsString};
use std::iter::FusedIterator;
use std::ops::Deref;
use std::os::unix::fs::PermissionsExt;
use std::path::{Path, PathBuf};
use std::{
    borrow::{Cow, ToOwned},
    io,
    time::SystemTime,
};

/// Reinterprets a byte slice as an `OsStr` without copying.
///
/// Only Unix is handled for now: there, an `OsStr` is an arbitrary byte
/// sequence, so the conversion is a plain borrow of `bytes`.
pub fn get_os_str_from_bytes(bytes: &[u8]) -> &OsStr {
    #[cfg(unix)]
    let converted = {
        use std::os::unix::ffi::OsStrExt;
        OsStr::from_bytes(bytes)
    };
    // TODO Handle other platforms
    // TODO: convert from WTF8 to Windows MBCS (ANSI encoding).
    // Perhaps, the return type would have to be Result<PathBuf>.
    converted
}

/// Borrows a byte slice as a filesystem `Path`, going through
/// `get_os_str_from_bytes` for the platform-specific conversion.
pub fn get_path_from_bytes(bytes: &[u8]) -> &Path {
    let os_str = get_os_str_from_bytes(bytes);
    Path::new(os_str)
}

// TODO: need to convert from WTF8 to MBCS bytes on Windows.
// that's why Vec<u8> is returned.
/// Returns an owned copy of the raw bytes making up `path`.
#[cfg(unix)]
pub fn get_bytes_from_path(path: impl AsRef<Path>) -> Vec<u8> {
    let as_path: &Path = path.as_ref();
    get_bytes_from_os_str(as_path)
}

/// Returns an owned copy of the raw bytes of an OS string.
#[cfg(unix)]
pub fn get_bytes_from_os_str(str: impl AsRef<OsStr>) -> Vec<u8> {
    use std::os::unix::ffi::OsStrExt;
    let raw: &[u8] = str.as_ref().as_bytes();
    raw.to_owned()
}

/// Consumes an `OsString` and returns its underlying bytes.
#[cfg(unix)]
pub fn get_bytes_from_os_string(str: OsString) -> Vec<u8> {
    std::os::unix::ffi::OsStringExt::into_vec(str)
}

/// Iterator that walks a repository path upwards.
///
/// It yields the starting path first, then the path with its last
/// `/`-separated component removed, and so on. The last item is the empty
/// path (the repository root), after which the iterator is exhausted.
#[derive(Copy, Clone, Debug)]
pub struct Ancestors<'a> {
    /// The item the next call to `next()` will yield, if any.
    next: Option<&'a HgPath>,
}

impl<'a> Iterator for Ancestors<'a> {
    type Item = &'a HgPath;

    fn next(&mut self) -> Option<Self::Item> {
        // Once exhausted, `self.next` stays `None` forever.
        let current = self.next?;
        self.next = if current.is_empty() {
            // The root has no parent: stop after yielding it.
            None
        } else {
            // Cut at the last separator; a path without any separator has
            // the root (empty path) as its parent.
            let cut = current.bytes().rposition(|c| *c == b'/').unwrap_or(0);
            Some(HgPath::new(&current.as_bytes()[..cut]))
        };
        Some(current)
    }
}

impl FusedIterator for Ancestors<'_> {}

/// Iterator that walks a repository path upwards, yielding
/// `(directory, base name)` pairs.
///
/// Each item after the first is obtained by calling `split_filename()` on the
/// directory part of the previous item. Iteration stops after an item whose
/// directory part is empty (the repository root) has been yielded.
#[derive(Copy, Clone, Debug)]
pub(crate) struct AncestorsWithBase<'a> {
    /// The pair the next call to `next()` will yield, if any.
    next: Option<(&'a HgPath, &'a HgPath)>,
}

impl<'a> Iterator for AncestorsWithBase<'a> {
    type Item = (&'a HgPath, &'a HgPath);

    fn next(&mut self) -> Option<Self::Item> {
        // Once exhausted, `self.next` stays `None` forever.
        let current = self.next?;
        let (dir, _) = current;
        self.next = if dir.is_empty() {
            // The root cannot be split any further.
            None
        } else {
            Some(dir.split_filename())
        };
        Some(current)
    }
}

impl FusedIterator for AncestorsWithBase<'_> {}

/// Returns an iterator over the ancestor directories of a repository path,
/// from the closest one up to the root (`b""`).
///
/// The path is separated by '/', and must not start with '/'.
///
/// `path` itself is not part of the output, with one exception: the root
/// directory `b""` is yielded as its own (only) ancestor.
pub fn find_dirs(path: &HgPath) -> Ancestors {
    let mut ancestors = Ancestors { next: Some(path) };
    if path.is_empty() {
        // The root is reported as its own ancestor.
        return ancestors;
    }
    // Discard the first item, which is `path` itself.
    let _ = ancestors.next();
    ancestors
}

/// Returns an iterator yielding `path` itself followed by each of its
/// ancestor directories, ending with the root (`b""`).
///
/// Unlike `find_dirs`, the starting path is always included.
pub fn dir_ancestors(path: &HgPath) -> Ancestors {
    let start = Some(path);
    Ancestors { next: start }
}

/// Returns an iterator over the ancestor directories of a repository path,
/// each paired with the base name that was split off to reach it.
///
/// For `b"foo/bar/baz"` this yields `(b"foo/bar", b"baz")`,
/// `(b"foo", b"bar")` and finally `(b"", b"foo")`.
///
/// The path is separated by '/', and must not start with '/'.
///
/// `path` itself is not part of the output, with one exception: for the root
/// directory `b""` the single pair `(b"", b"")` is yielded.
pub(crate) fn find_dirs_with_base(path: &HgPath) -> AncestorsWithBase {
    let seed = (path, HgPath::new(b""));
    let mut ancestors = AncestorsWithBase { next: Some(seed) };
    if path.is_empty() {
        // The root is reported as its own ancestor, with an empty base name.
        return ancestors;
    }
    // Discard the seed pair, which stands for `path` itself.
    let _ = ancestors.next();
    ancestors
}

/// Returns a copy of `path` with its case normalized.
///
/// The direction depends on the target platform: the path is ASCII
/// upper-cased on Windows and ASCII lower-cased on Unix. Only one of the two
/// statements below is compiled in for a given target.
///
/// TODO more than ASCII?
pub fn normalize_case(path: &HgPath) -> HgPathBuf {
    #[cfg(windows)] // NTFS compares via upper()
    return path.to_ascii_uppercase();
    #[cfg(unix)]
    path.to_ascii_lowercase()
}

lazy_static! {
    /// UTF-8 encodings of the code points that `hfs_ignore_clean` looks for.
    ///
    /// Every code point listed lies between U+0800 and U+FFFF, so each one
    /// encodes to exactly three bytes.
    static ref IGNORED_CHARS: Vec<Vec<u8>> = {
        const CODE_POINTS: [u32; 16] = [
            0x200c, 0x200d, 0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d,
            0x202e, 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
        ];
        CODE_POINTS
            .iter()
            .map(|&code_point| {
                let ch = std::char::from_u32(code_point).unwrap();
                let mut utf8 = [0u8; 3];
                ch.encode_utf8(&mut utf8).as_bytes().to_vec()
            })
            .collect()
    };
}

/// Returns a copy of `bytes` after handing every sequence of `IGNORED_CHARS`
/// to `replace_slice` with an empty replacement.
///
/// All ignored sequences start with byte 0xE2 or 0xEF, so input containing
/// neither byte is copied as is without any further scanning.
fn hfs_ignore_clean(bytes: &[u8]) -> Vec<u8> {
    let mut cleaned = bytes.to_vec();
    let may_contain_ignored =
        bytes.iter().any(|&b| matches!(b, b'\xe2' | b'\xef'));
    if may_contain_ignored {
        for forbidden in IGNORED_CHARS.iter() {
            replace_slice(&mut cleaned, forbidden, &[]);
        }
    }
    cleaned
}

/// ASCII-lowercases `bytes`, then runs the result through
/// `hfs_ignore_clean`.
pub fn lower_clean(bytes: &[u8]) -> Vec<u8> {
    let lowered = bytes.to_ascii_lowercase();
    hfs_ignore_clean(&lowered)
}

/// Returns the canonical path of `name`, given `cwd` and `root`
///
/// A relative `name` is first resolved against `root.join(cwd)`; an absolute
/// one is used as is. The result is the path relative to `root`:
///
/// - an empty path if `name` designates `root` itself, either literally or
///   because it refers to the same file (e.g. through a symlink);
/// - the remainder after `root`, or after whichever ancestor of `name` is the
///   same file as `root`, once that remainder has passed the path auditor.
///
/// # Errors
///
/// Returns `HgPathError::NotUnderRoot` if neither `name` nor any of its
/// ancestors is `root`. Conversion and audit errors are propagated.
pub fn canonical_path(
    root: impl AsRef<Path>,
    cwd: impl AsRef<Path>,
    name: impl AsRef<Path>,
) -> Result<PathBuf, HgPathError> {
    // TODO add missing normalization for other platforms
    let root = root.as_ref();
    let cwd = cwd.as_ref();
    let name = name.as_ref();

    let full_name = if name.is_absolute() {
        name.to_owned()
    } else {
        root.join(cwd).join(name)
    };
    let auditor = PathAuditor::new(root);

    // Lexically identical to the root: nothing to audit.
    if full_name == root {
        return Ok(PathBuf::from(""));
    }

    // Lexically below the root: the remainder is the answer.
    if let Ok(relative) = full_name.strip_prefix(root) {
        auditor.audit_path(path_to_hg_path_buf(relative)?)?;
        return Ok(relative.to_owned());
    }

    // Determine whether `name' is in the hierarchy at or beneath `root',
    // by checking `name` and then each of its ancestors in turn until there
    // is no parent left (can't check name == '/', because that doesn't work
    // on windows).
    for candidate in full_name.ancestors() {
        // A failed comparison (e.g. a missing file) counts as "different".
        if !is_same_file(candidate, root).unwrap_or(false) {
            continue;
        }
        if candidate == full_name {
            // `name` was actually the same as root (maybe a symlink)
            return Ok(PathBuf::from(""));
        }
        // `candidate` is a symlink to root, so `full_name` is under root
        let rel_path = full_name.strip_prefix(candidate).unwrap();
        auditor.audit_path(path_to_hg_path_buf(rel_path)?)?;
        return Ok(rel_path.to_owned());
    }

    // TODO hint to the user about using --cwd
    // Bubble up the responsibility to Python for now
    Err(HgPathError::NotUnderRoot {
        path: full_name,
        root: root.to_owned(),
    })
}

/// Renders `path` relative to the current working directory, for display
/// purposes.
///
/// `cwd` is a `HgPath`, so it is considered relative to the root directory
/// of the repository. When `cwd` is empty the bytes of `path` are borrowed
/// unchanged. Otherwise the leading components shared by both paths are
/// dropped, one `..` is emitted per remaining `cwd` component, and the
/// remaining `path` components follow, all joined with `/`.
///
/// # Examples
///
/// ```
/// use hg::utils::hg_path::HgPath;
/// use hg::utils::files::relativize_path;
/// use std::borrow::Cow;
///
/// let file = HgPath::new(b"nested/file");
/// let cwd = HgPath::new(b"");
/// assert_eq!(relativize_path(file, cwd), Cow::Borrowed(b"nested/file"));
///
/// let cwd = HgPath::new(b"nested");
/// assert_eq!(relativize_path(file, cwd), Cow::Borrowed(b"file"));
///
/// let cwd = HgPath::new(b"other");
/// assert_eq!(relativize_path(file, cwd), Cow::Borrowed(b"../nested/file"));
/// ```
pub fn relativize_path(path: &HgPath, cwd: impl AsRef<HgPath>) -> Cow<[u8]> {
    const PARENT_DIR: &[u8] = b"..";

    let cwd = cwd.as_ref();
    if cwd.is_empty() {
        return Cow::Borrowed(path.as_bytes());
    }

    let mut path_parts = path.as_bytes().split(|b| *b == b'/').peekable();
    let mut cwd_parts = cwd.as_bytes().split(|b| *b == b'/').peekable();

    // Skip the leading components that both paths have in common.
    while matches!(
        (path_parts.peek(), cwd_parts.peek()),
        (Some(a), Some(b)) if a == b
    ) {
        path_parts.next();
        cwd_parts.next();
    }

    // This is not all accurate as to how large `res` will actually be, but
    // profiling `rhg files` on a large-ish repo shows it’s better than
    // starting from a zero-capacity `Vec` and letting it reallocate
    // repeatedly.
    let mut res: Vec<u8> = Vec::with_capacity(path.as_bytes().len());

    // Appends one component, preceded by a separator for all but the first.
    let mut is_first = true;
    let mut push_component = |component: &[u8]| {
        if is_first {
            is_first = false;
        } else {
            res.push(b'/');
        }
        res.extend_from_slice(component);
    };

    // Climb out of whatever is left of `cwd`...
    for _ in cwd_parts {
        push_component(PARENT_DIR);
    }
    // ...then descend into whatever is left of `path`.
    for component in path_parts {
        push_component(component);
    }

    Cow::Owned(res)
}

/// Return the `mtime` of a temporary file newly-created in the `.hg` directory
/// of the given repository.
///
/// This is similar to `SystemTime::now()`, with the result truncated to the
/// same time resolution as other files’ modification times. Using `.hg`
/// instead of the system’s default temporary directory (such as `/tmp`) makes
/// it more likely the temporary file is in the same disk partition as contents
/// of the working directory, which can matter since different filesystems may
/// store timestamps with different resolutions.
///
/// This may fail, typically if we lack write permissions. In that case we
/// should continue the `status()` algorithm anyway and consider the current
/// date/time to be unknown.
pub fn filesystem_now(repo_root: &Path) -> Result<SystemTime, io::Error> {
    let mode = std::fs::Permissions::from_mode(0o666);
    let dot_hg = repo_root.join(".hg");
    let probe = tempfile::Builder::new()
        .permissions(mode)
        .tempfile_in(dot_hg)?;
    let metadata = probe.into_file().metadata()?;
    metadata.modified()
}

/// Returns true if file content is considered to be binary (not text).
///
/// Content counts as binary when it is non-empty and contains at least one
/// NUL byte.
pub fn is_binary(content: &[u8]) -> bool {
    // Matches binary() in utils/stringutil.py.
    if content.is_empty() {
        return false;
    }
    memchr::memchr(b'\0', content).is_some()
}

/// Unit tests for the ancestor iterators and for `canonical_path`.
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// `find_dirs` skips the path itself and ends with the root (`b""`).
    #[test]
    fn find_dirs_some() {
        let mut dirs = super::find_dirs(HgPath::new(b"foo/bar/baz"));
        assert_eq!(dirs.next(), Some(HgPath::new(b"foo/bar")));
        assert_eq!(dirs.next(), Some(HgPath::new(b"foo")));
        assert_eq!(dirs.next(), Some(HgPath::new(b"")));
        // The iterator is fused: it keeps returning `None` once exhausted.
        assert_eq!(dirs.next(), None);
        assert_eq!(dirs.next(), None);
    }

    /// The root directory is yielded as its own single ancestor.
    #[test]
    fn find_dirs_empty() {
        // looks weird, but mercurial.pathutil.finddirs(b"") yields b""
        let mut dirs = super::find_dirs(HgPath::new(b""));
        assert_eq!(dirs.next(), Some(HgPath::new(b"")));
        assert_eq!(dirs.next(), None);
        assert_eq!(dirs.next(), None);
    }

    /// `find_dirs_with_base` yields `(directory, base name)` pairs, from the
    /// closest ancestor up to the root.
    #[test]
    fn test_find_dirs_with_base_some() {
        let mut dirs = super::find_dirs_with_base(HgPath::new(b"foo/bar/baz"));
        assert_eq!(
            dirs.next(),
            Some((HgPath::new(b"foo/bar"), HgPath::new(b"baz")))
        );
        assert_eq!(
            dirs.next(),
            Some((HgPath::new(b"foo"), HgPath::new(b"bar")))
        );
        assert_eq!(dirs.next(), Some((HgPath::new(b""), HgPath::new(b"foo"))));
        // The iterator is fused: it keeps returning `None` once exhausted.
        assert_eq!(dirs.next(), None);
        assert_eq!(dirs.next(), None);
    }

    /// For the root directory, the single pair `(b"", b"")` is yielded.
    #[test]
    fn test_find_dirs_with_base_empty() {
        let mut dirs = super::find_dirs_with_base(HgPath::new(b""));
        assert_eq!(dirs.next(), Some((HgPath::new(b""), HgPath::new(b""))));
        assert_eq!(dirs.next(), None);
        assert_eq!(dirs.next(), None);
    }

    /// Purely lexical cases of `canonical_path`: these paths are not expected
    /// to exist on disk.
    #[test]
    fn test_canonical_path() {
        // An absolute `cwd` replaces `root` when joined, so the name ends up
        // outside of the repository.
        let root = Path::new("/repo");
        let cwd = Path::new("/dir");
        let name = Path::new("filename");
        assert_eq!(
            canonical_path(root, cwd, name),
            Err(HgPathError::NotUnderRoot {
                path: PathBuf::from("/dir/filename"),
                root: root.to_path_buf()
            })
        );

        // Same with the filesystem root as `cwd`.
        let root = Path::new("/repo");
        let cwd = Path::new("/");
        let name = Path::new("filename");
        assert_eq!(
            canonical_path(root, cwd, name),
            Err(HgPathError::NotUnderRoot {
                path: PathBuf::from("/filename"),
                root: root.to_path_buf()
            })
        );

        // A relative name that leads back into the repository from `/`.
        let root = Path::new("/repo");
        let cwd = Path::new("/");
        let name = Path::new("repo/filename");
        assert_eq!(
            canonical_path(root, cwd, name),
            Ok(PathBuf::from("filename"))
        );

        // `cwd` is the repository root itself.
        let root = Path::new("/repo");
        let cwd = Path::new("/repo");
        let name = Path::new("filename");
        assert_eq!(
            canonical_path(root, cwd, name),
            Ok(PathBuf::from("filename"))
        );

        // `cwd` is a subdirectory of the repository.
        let root = Path::new("/repo");
        let cwd = Path::new("/repo/subdir");
        let name = Path::new("filename");
        assert_eq!(
            canonical_path(root, cwd, name),
            Ok(PathBuf::from("subdir/filename"))
        );
    }

    /// Cases of `canonical_path` that need real files: the repository root is
    /// reached through a symlink located outside of it.
    #[test]
    fn test_canonical_path_not_rooted() {
        use std::fs::create_dir;
        use tempfile::tempdir;

        // Layout: <tmp>/a/b is the repository root and <tmp>/c is a symlink
        // pointing to it.
        let base_dir = tempdir().unwrap();
        let base_dir_path = base_dir.path();
        let beneath_repo = base_dir_path.join("a");
        let root = base_dir_path.join("a/b");
        let out_of_repo = base_dir_path.join("c");
        let under_repo_symlink = out_of_repo.join("d");

        create_dir(&beneath_repo).unwrap();
        create_dir(&root).unwrap();

        // TODO make portable
        std::os::unix::fs::symlink(&root, &out_of_repo).unwrap();

        // The symlink itself resolves to the root.
        assert_eq!(
            canonical_path(&root, Path::new(""), out_of_repo),
            Ok(PathBuf::from(""))
        );
        // The parent directory of the root is not under the root.
        assert_eq!(
            canonical_path(&root, Path::new(""), &beneath_repo),
            Err(HgPathError::NotUnderRoot {
                path: beneath_repo,
                root: root.to_owned()
            })
        );
        // A (non-existent) entry below the symlink is relative to the root.
        assert_eq!(
            canonical_path(&root, Path::new(""), under_repo_symlink),
            Ok(PathBuf::from("d"))
        );
    }
}