Mercurial > public > mercurial-scm > hg-stable
diff rust/hg-core/src/revlog/revlog.rs @ 45532:26c53ee51c68
hg-core: Add a limited read only `revlog` implementation
Only covers the needs of the upcoming `rhg debugdata` command.
Differential Revision: https://phab.mercurial-scm.org/D8958
author | Antoine Cezar <antoine.cezar@octobus.net> |
---|---|
date | Fri, 04 Sep 2020 11:55:07 +0200 |
parents | |
children | b0d6309ff50c |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/revlog/revlog.rs Fri Sep 04 11:55:07 2020 +0200 @@ -0,0 +1,353 @@ +use std::borrow::Cow; +use std::fs::File; +use std::io::Read; +use std::ops::Deref; +use std::path::Path; + +use byteorder::{BigEndian, ByteOrder}; +use flate2::read::ZlibDecoder; +use memmap::{Mmap, MmapOptions}; +use micro_timer::timed; +use zstd; + +use super::index::Index; +use super::patch; +use crate::revlog::Revision; + +pub enum RevlogError { + IoError(std::io::Error), + UnsuportedVersion(u16), + InvalidRevision, + Corrupted, + UnknowDataFormat(u8), +} + +fn mmap_open(path: &Path) -> Result<Mmap, std::io::Error> { + let file = File::open(path)?; + let mmap = unsafe { MmapOptions::new().map(&file) }?; + Ok(mmap) +} + +/// Read only implementation of revlog. +pub struct Revlog { + /// When index and data are not interleaved: bytes of the revlog index. + /// When index and data are interleaved: bytes of the revlog index and + /// data. + index_bytes: Box<dyn Deref<Target = [u8]> + Send>, + /// When index and data are not interleaved: bytes of the revlog data + data_bytes: Option<Box<dyn Deref<Target = [u8]> + Send>>, +} + +impl Revlog { + /// Open a revlog index file. + /// + /// It will also open the associated data file if index and data are not + /// interleaved. + #[timed] + pub fn open(index_path: &Path) -> Result<Self, RevlogError> { + let index_mmap = + mmap_open(&index_path).map_err(RevlogError::IoError)?; + + let version = get_version(&index_mmap); + if version != 1 { + return Err(RevlogError::UnsuportedVersion(version)); + } + + let is_inline = is_inline(&index_mmap); + + let index_bytes = Box::new(index_mmap); + + // TODO load data only when needed // + // type annotation required + // won't recognize Mmap as Deref<Target = [u8]> + let data_bytes: Option<Box<dyn Deref<Target = [u8]> + Send>> = + if is_inline { + None + } else { + let data_path = index_path.with_extension("d"); + let data_mmap = + mmap_open(&data_path).map_err(RevlogError::IoError)?; + Some(Box::new(data_mmap)) + }; + + Ok(Revlog { + index_bytes, + data_bytes, + }) + } + + /// Return the full data associated to a revision. + /// + /// All entries required to build the final data out of deltas will be + /// retrieved as needed, and the deltas will be applied to the inital + /// snapshot to rebuild the final data. + #[timed] + pub fn get_rev_data(&self, rev: Revision) -> Result<Vec<u8>, RevlogError> { + // Todo return -> Cow + let mut entry = self.get_entry(rev)?; + let mut delta_chain = vec![]; + while let Some(base_rev) = entry.base_rev { + delta_chain.push(entry); + entry = self + .get_entry(base_rev) + .map_err(|_| RevlogError::Corrupted)?; + } + + if delta_chain.is_empty() { + Ok(entry.data()?.into()) + } else { + Revlog::build_data_from_deltas(entry, &delta_chain) + } + } + + /// Build the full data of a revision out its snapshot + /// and its deltas. + #[timed] + fn build_data_from_deltas( + snapshot: RevlogEntry, + deltas: &[RevlogEntry], + ) -> Result<Vec<u8>, RevlogError> { + let snapshot = snapshot.data()?; + let deltas = deltas + .iter() + .rev() + .map(RevlogEntry::data) + .collect::<Result<Vec<Cow<'_, [u8]>>, RevlogError>>()?; + let patches: Vec<_> = + deltas.iter().map(|d| patch::PatchList::new(d)).collect(); + let patch = patch::fold_patch_lists(&patches); + Ok(patch.apply(&snapshot)) + } + + /// Return the revlog index. + fn index(&self) -> Index { + let is_inline = self.data_bytes.is_none(); + Index::new(&self.index_bytes, is_inline) + } + + /// Return the revlog data. + fn data(&self) -> &[u8] { + match self.data_bytes { + Some(ref data_bytes) => &data_bytes, + None => &self.index_bytes, + } + } + + /// Get an entry of the revlog. + fn get_entry(&self, rev: Revision) -> Result<RevlogEntry, RevlogError> { + let index = self.index(); + let index_entry = + index.get_entry(rev).ok_or(RevlogError::InvalidRevision)?; + let start = index_entry.offset(); + let end = start + index_entry.compressed_len(); + let entry = RevlogEntry { + rev, + bytes: &self.data()[start..end], + compressed_len: index_entry.compressed_len(), + uncompressed_len: index_entry.uncompressed_len(), + base_rev: if index_entry.base_revision() == rev { + None + } else { + Some(index_entry.base_revision()) + }, + }; + Ok(entry) + } +} + +/// The revlog entry's bytes and the necessary informations to extract +/// the entry's data. +#[derive(Debug)] +pub struct RevlogEntry<'a> { + rev: Revision, + bytes: &'a [u8], + compressed_len: usize, + uncompressed_len: usize, + base_rev: Option<Revision>, +} + +impl<'a> RevlogEntry<'a> { + /// Extract the data contained in the entry. + pub fn data(&self) -> Result<Cow<'_, [u8]>, RevlogError> { + if self.bytes.is_empty() { + return Ok(Cow::Borrowed(&[])); + } + match self.bytes[0] { + // Revision data is the entirety of the entry, including this + // header. + b'\0' => Ok(Cow::Borrowed(self.bytes)), + // Raw revision data follows. + b'u' => Ok(Cow::Borrowed(&self.bytes[1..])), + // zlib (RFC 1950) data. + b'x' => Ok(Cow::Owned(self.uncompressed_zlib_data())), + // zstd data. + b'\x28' => Ok(Cow::Owned(self.uncompressed_zstd_data())), + format_type => Err(RevlogError::UnknowDataFormat(format_type)), + } + } + + fn uncompressed_zlib_data(&self) -> Vec<u8> { + let mut decoder = ZlibDecoder::new(self.bytes); + if self.is_delta() { + let mut buf = Vec::with_capacity(self.compressed_len); + decoder.read_to_end(&mut buf).expect("corrupted zlib data"); + buf + } else { + let mut buf = vec![0; self.uncompressed_len]; + decoder.read_exact(&mut buf).expect("corrupted zlib data"); + buf + } + } + + fn uncompressed_zstd_data(&self) -> Vec<u8> { + if self.is_delta() { + let mut buf = Vec::with_capacity(self.compressed_len); + zstd::stream::copy_decode(self.bytes, &mut buf) + .expect("corrupted zstd data"); + buf + } else { + let mut buf = vec![0; self.uncompressed_len]; + let len = zstd::block::decompress_to_buffer(self.bytes, &mut buf) + .expect("corrupted zstd data"); + assert_eq!(len, self.uncompressed_len, "corrupted zstd data"); + buf + } + } + + /// Tell if the entry is a snapshot or a delta + /// (influences on decompression). + fn is_delta(&self) -> bool { + self.base_rev.is_some() + } +} + +/// Value of the inline flag. +pub fn is_inline(index_bytes: &[u8]) -> bool { + match &index_bytes[0..=1] { + [0, 0] | [0, 2] => false, + _ => true, + } +} + +/// Format version of the revlog. +pub fn get_version(index_bytes: &[u8]) -> u16 { + BigEndian::read_u16(&index_bytes[2..=3]) +} + +#[cfg(test)] +mod tests { + use super::*; + + use super::super::index::IndexEntryBuilder; + + #[cfg(test)] + pub struct RevlogBuilder { + version: u16, + is_general_delta: bool, + is_inline: bool, + offset: usize, + index: Vec<Vec<u8>>, + data: Vec<Vec<u8>>, + } + + #[cfg(test)] + impl RevlogBuilder { + pub fn new() -> Self { + Self { + version: 2, + is_inline: false, + is_general_delta: true, + offset: 0, + index: vec![], + data: vec![], + } + } + + pub fn with_inline(&mut self, value: bool) -> &mut Self { + self.is_inline = value; + self + } + + pub fn with_general_delta(&mut self, value: bool) -> &mut Self { + self.is_general_delta = value; + self + } + + pub fn with_version(&mut self, value: u16) -> &mut Self { + self.version = value; + self + } + + pub fn push( + &mut self, + mut index: IndexEntryBuilder, + data: Vec<u8>, + ) -> &mut Self { + if self.index.is_empty() { + index.is_first(true); + index.with_general_delta(self.is_general_delta); + index.with_inline(self.is_inline); + index.with_version(self.version); + } else { + index.with_offset(self.offset); + } + self.index.push(index.build()); + self.offset += data.len(); + self.data.push(data); + self + } + + pub fn build_inline(&self) -> Vec<u8> { + let mut bytes = + Vec::with_capacity(self.index.len() + self.data.len()); + for (index, data) in self.index.iter().zip(self.data.iter()) { + bytes.extend(index); + bytes.extend(data); + } + bytes + } + } + + #[test] + fn is_not_inline_when_no_inline_flag_test() { + let bytes = RevlogBuilder::new() + .with_general_delta(false) + .with_inline(false) + .push(IndexEntryBuilder::new(), vec![]) + .build_inline(); + + assert_eq!(is_inline(&bytes), false) + } + + #[test] + fn is_inline_when_inline_flag_test() { + let bytes = RevlogBuilder::new() + .with_general_delta(false) + .with_inline(true) + .push(IndexEntryBuilder::new(), vec![]) + .build_inline(); + + assert_eq!(is_inline(&bytes), true) + } + + #[test] + fn is_inline_when_inline_and_generaldelta_flags_test() { + let bytes = RevlogBuilder::new() + .with_general_delta(true) + .with_inline(true) + .push(IndexEntryBuilder::new(), vec![]) + .build_inline(); + + assert_eq!(is_inline(&bytes), true) + } + + #[test] + fn version_test() { + let bytes = RevlogBuilder::new() + .with_version(1) + .push(IndexEntryBuilder::new(), vec![]) + .build_inline(); + + assert_eq!(get_version(&bytes), 1) + } +}