This repository has been archived by the owner on Sep 16, 2024. It is now read-only.

#65: Replace CRC32 with BLAKE3 Hash Function #73

Closed · wants to merge 6 commits
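For context on the substance of the change: `ResourceId` switches its checksum field from a CRC32 `u32` to a 32-byte BLAKE3 digest. Below is a minimal standalone sketch contrasting the two hashers, using the published `crc32fast` and `blake3` crate APIs; the sample input is arbitrary, chosen only for illustration.

use blake3::Hasher as Blake3Hasher;
use crc32fast::Hasher as Crc32Hasher;

fn main() {
    // Arbitrary sample input, for illustration only.
    let data = b"Hello, world!";

    // Before this PR: CRC32, a 4-byte non-cryptographic checksum.
    let mut crc = Crc32Hasher::new();
    crc.update(data);
    let crc32: u32 = crc.finalize();
    println!("crc32:  {:#010x}", crc32);

    // After this PR: BLAKE3, a 32-byte cryptographic digest.
    let mut hasher = Blake3Hasher::new();
    hasher.update(data);
    let blake3: [u8; 32] = hasher.finalize().into();
    println!("blake3: {:?}", blake3);
}

BLAKE3 remains fast (it is SIMD-parallel by design) while making collisions practically impossible, which is presumably why the `Collision` error variant can be dropped from `src/errors.rs` below.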
10 changes: 9 additions & 1 deletion Cargo.toml
@@ -9,7 +9,7 @@ crate-type = ["rlib"]

[dependencies]
log = { version = "0.4.17", features = ["release_max_level_off"] }
crc32fast = "1.3.2"
blake3 = "1.5"
walkdir = "2.3.2"
anyhow = "1.0.58"
env_logger = "0.9.0"
@@ -38,6 +38,9 @@ uuid = { version = "1.6.1", features = ["v4"] }
[dev-dependencies]
tempdir = "0.3.7"
rstest = '0.18.2'
# benchmarking
criterion = { version = "0.5", features = ["html_reports"] }
rand = "0.8"

[build-dependencies]
flate2 = "1.0.24"
@@ -47,3 +50,8 @@ tar = "0.4.38"
target-lexicon = "0.12.4"
ureq = "2.4.0"
ring = "=0.17.5"

[[bench]]
name = "hash_benchmark"
harness = false
path = "benches/hash_benchmark.rs"
34 changes: 34 additions & 0 deletions benches/hash_benchmark.rs
@@ -0,0 +1,34 @@
use arklib::id::ResourceId;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use rand::prelude::*;

fn generate_random_data(size: usize) -> Vec<u8> {
let mut rng = rand::thread_rng();
(0..size).map(|_| rng.gen()).collect()
}

fn compute_bytes_benchmark(c: &mut Criterion) {
let inputs = [
("compute_bytes_small", 64),
("compute_bytes_medium", 512),
("compute_bytes_large", 4096),
];

for (name, size) in inputs.iter() {
let input_data = generate_random_data(*size);
c.bench_function(name, move |b| {
b.iter(|| {
if let Ok(result) =
ResourceId::compute_bytes(black_box(&input_data))
{
black_box(result);
} else {
panic!("compute_bytes returned an error");
}
});
});
}
}

criterion_group!(benches, compute_bytes_benchmark);
criterion_main!(benches);
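Design note on the benchmark: `black_box` shields both the input slice and the result from the optimizer, so the hash computation cannot be constant-folded or eliminated as dead code; `criterion_group!` and `criterion_main!` then generate the entry point that the `harness = false` setting in Cargo.toml hands control to.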
3 changes: 2 additions & 1 deletion src/atomic/file.rs
@@ -52,7 +52,8 @@ pub struct ReadOnlyFile {
pub path: PathBuf,
}

/// This struct is the only way to read the file. Both path and version are private
/// This struct is the only way to read the file. Both path and version are
/// private
impl ReadOnlyFile {
/// Open the underlying file, which can be read from but not written to.
/// May return `Ok(None)`, which means that no version
6 changes: 4 additions & 2 deletions src/atomic/mod.rs
@@ -79,7 +79,8 @@ mod tests {
let current = file.load().unwrap();
let content = format!("Content from thread {i}!");
(&temp).write_all(content.as_bytes()).unwrap();
// In case slow computer ensure each thread are running in the same time
// In case of a slow computer, ensure that all threads run at the
// same time
std::thread::sleep(std::time::Duration::from_millis(300));
file.compare_and_swap(&current, temp)
});
@@ -108,7 +109,8 @@
let shared_file = std::sync::Arc::new(AtomicFile::new(root).unwrap());
let thread_number = 10;
assert!(thread_number > 3);
// Need to have less than 255 thread to store thread number as byte directly
// Need fewer than 256 threads to store the thread number as a
// byte directly
assert!(thread_number < 256);
let mut handles = Vec::with_capacity(thread_number);
for i in 0..thread_number {
2 changes: 0 additions & 2 deletions src/errors.rs
@@ -10,8 +10,6 @@ pub enum ArklibError {
Io(#[from] std::io::Error),
#[error("Path error: {0}")]
Path(String),
#[error("There is some collision: {0}")]
Collision(String),
#[error("Parsing error")]
Parse,
#[error("Networking error")]
83 changes: 68 additions & 15 deletions src/id.rs
@@ -1,5 +1,5 @@
use anyhow::anyhow;
use crc32fast::Hasher;
use blake3::Hasher as Blake3Hasher;
use log;
use serde::{Deserialize, Serialize};
use std::fmt::{self, Display, Formatter};
@@ -25,12 +25,12 @@ use crate::{ArklibError, Result};
)]
pub struct ResourceId {
pub data_size: u64,
pub crc32: u32,
pub blake3: [u8; 32],
}

impl Display for ResourceId {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
write!(f, "{}-{}", self.data_size, self.crc32)
write!(f, "{}-{:?}", self.data_size, self.blake3)
}
}
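A note on this format: `{:?}` on a `[u8; 32]` prints a bracketed, comma-separated decimal list, so an id renders along the lines of `13-[23, 43, ...]` (the elided bytes are illustrative). The reworked `FromStr` below parses exactly that shape back, so the pair round-trips; a hex encoding would be shorter, but this keeps the string form a direct mirror of the array.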

@@ -40,9 +40,19 @@ impl FromStr for ResourceId {
fn from_str(s: &str) -> Result<Self> {
let (l, r) = s.split_once('-').ok_or(ArklibError::Parse)?;
let data_size: u64 = l.parse().map_err(|_| ArklibError::Parse)?;
let crc32: u32 = r.parse().map_err(|_| ArklibError::Parse)?;

Ok(ResourceId { data_size, crc32 })
let s = r.trim_start_matches('[').trim_end_matches(']');
let bytes: Vec<&str> = s.split(", ").collect();
if bytes.len() != 32 {
return Err(ArklibError::Parse);
}
let mut blake3 = [0u8; 32];
for (i, byte_str) in bytes.iter().enumerate() {
blake3[i] = byte_str
.trim()
.parse()
.map_err(|_e| ArklibError::Parse)?;
}
Ok(ResourceId { data_size, blake3 })
}
}

@@ -68,7 +78,7 @@ impl ResourceId {
pub fn compute_bytes(bytes: &[u8]) -> Result<Self> {
let data_size = bytes.len().try_into().map_err(|_| {
ArklibError::Other(anyhow!("Can't convert usize to u64"))
})?; //.unwrap();
})?;
let mut reader = BufReader::with_capacity(BUFFER_CAPACITY, bytes);
ResourceId::compute_reader(data_size, &mut reader)
}
@@ -84,7 +94,7 @@ impl ResourceId {
data_size / MEGABYTE
);

let mut hasher = Hasher::new();
let mut hasher = Blake3Hasher::new();
let mut bytes_read: u32 = 0;
loop {
let bytes_read_iteration: usize = reader.fill_buf()?.len();
@@ -99,12 +109,15 @@
})?;
}

let crc32: u32 = hasher.finalize();
let blake3 = hasher.finalize();
log::trace!("[compute] {} bytes has been read", bytes_read);
log::trace!("[compute] checksum: {:#02x}", crc32);
log::trace!("[compute] blake3 hash: {}", blake3);
assert_eq!(std::convert::Into::<u64>::into(bytes_read), data_size);

Ok(ResourceId { data_size, crc32 })
Ok(ResourceId {
data_size,
blake3: blake3.into(),
})
}
}
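An aside on the streaming loop: blake3 1.5 also ships helpers such as `Hasher::update_reader` for hashing from a `Read` source, which could replace the manual `fill_buf`/`consume` loop; keeping the loop by hand is defensible here, since it also tracks `bytes_read` for the size assertion that follows.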

@@ -118,6 +131,24 @@ mod tests {

use super::*;

#[test]
fn resource_id_to_and_from_string() {
let plain_text = "Hello, world!";
let mut hasher = Blake3Hasher::new();
hasher.update(plain_text.as_bytes());
let blake3 = hasher.finalize();

let id = ResourceId {
data_size: plain_text.len() as u64,
blake3: blake3.into(),
};

let id_str = id.to_string();
let id2 = id_str.parse::<ResourceId>().unwrap();

assert_eq!(id, id2);
}

#[test]
fn compute_id_test() {
initialize();
@@ -133,24 +164,46 @@
.len();

let id1 = ResourceId::compute(data_size, file_path).unwrap();
assert_eq!(id1.crc32, 0x342a3d4a);
assert_eq!(
id1.blake3,
[
23, 43, 75, 241, 72, 232, 88, 177, 61, 222, 15, 198, 97, 52,
19, 188, 183, 85, 46, 92, 78, 92, 69, 25, 90, 198, 200, 15, 32,
235, 95, 245
]
);
assert_eq!(id1.data_size, 128760);

let raw_bytes = fs::read(file_path).unwrap();
let id2 = ResourceId::compute_bytes(raw_bytes.as_slice()).unwrap();
assert_eq!(id2.crc32, 0x342a3d4a);
assert_eq!(
id2.blake3,
[
23, 43, 75, 241, 72, 232, 88, 177, 61, 222, 15, 198, 97, 52,
19, 188, 183, 85, 46, 92, 78, 92, 69, 25, 90, 198, 200, 15, 32,
235, 95, 245
]
);
assert_eq!(id2.data_size, 128760);
}

#[test]
fn resource_id_order() {
let id1 = ResourceId {
data_size: 1,
crc32: 2,
blake3: [
23, 43, 75, 241, 72, 232, 88, 177, 61, 222, 15, 198, 97, 52,
19, 188, 183, 85, 46, 92, 78, 92, 69, 25, 90, 198, 200, 15, 32,
235, 95, 245,
],
};
let id2 = ResourceId {
data_size: 2,
crc32: 1,
blake3: [
23, 43, 75, 241, 72, 232, 88, 177, 61, 222, 15, 198, 97, 52,
19, 188, 183, 85, 46, 92, 78, 92, 69, 25, 90, 198, 200, 15, 32,
235, 95, 245,
],
};

assert!(id1 < id2);
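This assertion holds because `ResourceId` derives its ordering, and a derived `Ord` compares fields in declaration order: `data_size` first, then `blake3`. Since 1 < 2 on `data_size`, the identical digests are never consulted.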