From f988c80854e924db81847f667e4d9c27e682c4e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rn-Michael=20Miehe?=
 <40151420+ldericher@users.noreply.github.com>
Date: Sat, 5 Jul 2025 01:24:53 +0000
Subject: [PATCH] [wip] unit tests for `file` module

- move test data to `test_util::data`
---
 src/file/checked.rs   |  12 ++---
 src/file/chunk.rs     |  11 ++++
 src/file/mod.rs       |  55 ++++----------------
 src/test_util/data.rs | 118 ++++++++++++++++++++++++++++++++++++++++++
 src/test_util/mod.rs  |   2 +
 5 files changed, 147 insertions(+), 51 deletions(-)
 create mode 100644 src/test_util/data.rs

diff --git a/src/file/checked.rs b/src/file/checked.rs
index 8bbe215..2adf26e 100644
--- a/src/file/checked.rs
+++ b/src/file/checked.rs
@@ -119,16 +119,16 @@ mod tests {
 
     use tempfile::TempDir;
 
-    use crate::{
-        file::tests::{CASES, HASHES},
-        test_util::create_file,
+    use crate::test_util::{
+        create_file,
+        data::{HASHES_STD_GOOD, cases, data},
     };
 
     use super::*;
 
     #[test]
     fn new_on_existing_file_works() {
-        for (content, size) in CASES {
+        for (content, size) in cases() {
             let file = create_file(content);
             let chk = Checked::new(file.path()).expect("creating `Checked` should succeed");
 
@@ -180,7 +180,7 @@ mod tests {
 
     #[test]
     fn hashing_works() {
-        for (&(content, _), hash) in CASES.iter().zip(HASHES) {
+        for (content, hash) in data().zip(HASHES_STD_GOOD) {
             let file = create_file(content);
             let mut chk = Checked::new(file.path()).expect("creating `Checked` should succeed");
 
@@ -194,7 +194,7 @@ mod tests {
 
     #[test]
     fn hashing_again_errors() {
-        for (content, _) in CASES {
+        for content in data() {
             let file = create_file(content);
             let mut chk = Checked::new(file.path()).expect("creating `Checked` should succeed");
 
diff --git a/src/file/chunk.rs b/src/file/chunk.rs
index c408cae..d3025b8 100644
--- a/src/file/chunk.rs
+++ b/src/file/chunk.rs
@@ -52,3 +52,14 @@ impl<'t> Chunk<'t> {
         self.offset + self.get_length()
     }
 }
+
+#[cfg(test)]
+mod tests {
+    // use super::*;
+
+    // #[test]
+    // fn basic_tests() {
+    //     let mut foo = [0u8; 10];
+    //     let fid = sharry::FileID("fid".to_string());
+    // }
+}
diff --git a/src/file/mod.rs b/src/file/mod.rs
index 3d42b8a..246df07 100644
--- a/src/file/mod.rs
+++ b/src/file/mod.rs
@@ -124,35 +124,16 @@ pub trait FileTrait {
 
 #[cfg(test)]
 mod tests {
-    use crate::test_util::create_file;
+    use crate::test_util::{
+        create_file,
+        data::{DATA_LENGTHS_BAD, HASHES_STD_BAD, HASHES_STD_GOOD, cases, cases_with},
+    };
 
     use super::*;
 
-    pub static CASES: [(&[u8], u64); 8] = [
-        (b"The quick brown fox jumps over the lazy dog", 43), // common pangram
-        (b"hello world", 11),                                 // simple greeting
-        (b"", 0),                                             // empty slice
-        (b"x", 1),                                            // single-byte
-        (b"0123456789", 10),                                  // numeric ASCII
-        (b"!@#$%^&*()_+-=[]{};':,.<>/?", 27),                 // punctuation
-        (b"RustLang1337", 12),                                // mixed alphanumeric
-        (b"foo\0bar\0baz", 11),                               // embedded nulls
-    ];
-
-    pub static HASHES: [&str; 8] = [
-        "qK3Uvd39k+SHfSdG5igXsRY2Sh+nvBSNlQkLxzM7NnP4JAHPeqLkyx7NkCluPxTLVBP47Xe+cwRbE5FM3NapGA", // common pangram
-        "Ahzth5kpbOylV4MquUGlC0oR+DR4zxQfUfkz9lOrn7zAWgN83b7QbjCb8zSULE5YzfGkbiN5EczX/Pl4fLx/0A", // simple greeting
-        "eGoC90IBWQPGxv2FJVLScpEvR0DhWEdhiobiF/cfVBnSXhAxr+5YUxOJZESTTrBLkDpoWxRIt1XVb3Aa/pvizg", // empty slice
-        "CQk3etNREMr7KQnhhWcrfyco0fUJT4rWjW+sYnS/H0mUhagOo2TATtAG0pRZ6jy3xgAoDi+D4DJSmQb4iuMNCg", // single-byte
-        "UqCSwAW2Ib1X5QGgrtlQp2/vuwDQeqQ9rdb1NALMJUE3SfDTxi6MoKfbrjRIQa3qUdU/i2HZaaFdSmMYtXa4rA", // numeric ASCII
-        "Sr91qmX4R/Ly4HsJh5eiG3S1tuO81kwV0KPfRpn1j4jjrQoGL2I+SeKfcGvpXu3l/rfhGdJHF8ei775ZzdgK3Q", // punctuation
-        "Ox+zobaUmB8Ps410/TGOtjjLIJKaMUCwG/iFLNXjwRShuJAmtvQcK9Ahc9+SfD4Ci67HyPPorl7NGjN6LRrmlQ", // mixed alphanumeric
-        "a3rsGWE2kfvN6e2sVhioWP9NOmwLK9trzjc/GKXTPvvsiagiRSHMjlg5jy+bMepip68Pv69dY8TvTSFZES5Jzw", // embedded nulls
-    ];
-
     #[test]
     fn compute_hash_as_expected() {
-        for (&(content, size), expected_hash) in CASES.iter().zip(HASHES) {
+        for (content, size, expected_hash) in cases_with(HASHES_STD_GOOD) {
             let file = create_file(content);
 
             // to capture progress updates from `compute_hash`
@@ -168,18 +149,7 @@ mod tests {
 
     #[test]
     fn hash_size_mismatch() {
-        let bad_sizes = [
-            36, // common pangram
-            12, // simple greeting
-            1,  // empty slice
-            0,  // single-byte
-            9,  // numeric ASCII
-            24, // punctuation
-            13, // mixed alphanumeric
-            10, // embedded nulls
-        ];
-
-        for (&(content, good_size), bad_size) in CASES.iter().zip(bad_sizes) {
+        for (content, good_size, bad_size) in cases_with(DATA_LENGTHS_BAD) {
             let file = create_file(content);
             let callback = drop;
 
@@ -203,7 +173,7 @@ mod tests {
 
     #[test]
     fn hash_value_none() {
-        for (content, size) in CASES {
+        for (content, size) in cases() {
             let file = create_file(content);
             let callback = drop;
 
@@ -217,14 +187,9 @@ mod tests {
 
     #[test]
     fn hash_value_mismatch() {
-        let bad_hashes = [
-            "invalid9k+SHfSdG5igXsRY2Sh+nvBSNlQkLxzM7NnP4JAHPeqLkyx7NkCluPxTLVBP47Xe+cwRbE5FM3NapGA", // common pangram
-            "", // simple greeting
-            "eGoC90IBWQPGxv2FJVLScpEvR0DhWEdhiobiG/cfVBnSXhAxr+5YUxOJZESTTrBLkDpoWxRIt1XVb3Aa/pvizg", // empty slice
-            "Hash", // single-byte
-        ];
-
-        for ((&(content, size), good_hash), bad_hash) in CASES.iter().zip(HASHES).zip(bad_hashes) {
+        for ((content, size, good_hash), bad_hash) in
+            cases_with(HASHES_STD_GOOD).zip(HASHES_STD_BAD)
+        {
             let file = create_file(content);
             let callback = drop;
 
diff --git a/src/test_util/data.rs b/src/test_util/data.rs
new file mode 100644
index 0000000..a5593ba
--- /dev/null
+++ b/src/test_util/data.rs
@@ -0,0 +1,118 @@
+/// test dataset
+const DATA: [&[u8]; 8] = [
+    // empty slice
+    b"",
+    // single-byte
+    b"x",
+    // common ascii pangram
+    b"The quick brown fox jumps over the lazy dog",
+    // ascii with punctuation and digits
+    b"Rust v1.65.0 - Memory Safety, Speed, Concurrency!",
+    // simple unicode (utf-8) greeting
+    "こんにちは世界".as_bytes(),
+    // pseudo-random bytes with embedded nuls
+    &[
+        0x3C, 0xA7, 0x5D, 0xE1, 0x4F, 0x99, 0x00, 0x20, 0x7F, 0xB3, 0xCD, 0x8A, 0x10, 0x55, 0xAA,
+        0xFF, 0x5E, 0xA3, 0x1F, 0xC8, 0x72, 0x4D, 0x99, 0x00, 0xB7, 0x3C, 0x8E, 0xAD, 0x26, 0xF1,
+    ],
+    // long run of identical bytes (1 kib of ascii 'A')
+    &[b'A'; 1024],
+    // very large slice (10 mib of zeroes)
+    &[0u8; 10 * 1024 * 1024],
+];
+
+/// lengths of the test dataset
+const DATA_LENGTHS: [u64; 8] = [
+    DATA[0].len() as u64,
+    DATA[1].len() as u64,
+    DATA[2].len() as u64,
+    DATA[3].len() as u64,
+    DATA[4].len() as u64,
+    DATA[5].len() as u64,
+    DATA[6].len() as u64,
+    DATA[7].len() as u64,
+];
+
+/// deliberately wrong lengths; each entry differs from the matching `DATA_LENGTHS` entry
+pub const DATA_LENGTHS_BAD: [u64; 8] = [36, 12, 1, 0, 9, 24, 13, 10];
+
+/// known good hashes of the test dataset
+///
+/// using BLAKE2b, 512 bit, with unpadded Base64 (standard variant)
+pub const HASHES_STD_GOOD: [&str; 8] = [
+    // empty slice
+    "eGoC90IBWQPGxv2FJVLScpEvR0DhWEdhiobiF/cfVBnSXhAxr+5YUxOJZESTTrBLkDpoWxRIt1XVb3Aa/pvizg",
+    // single-byte
+    "CQk3etNREMr7KQnhhWcrfyco0fUJT4rWjW+sYnS/H0mUhagOo2TATtAG0pRZ6jy3xgAoDi+D4DJSmQb4iuMNCg",
+    // common ascii pangram
+    "qK3Uvd39k+SHfSdG5igXsRY2Sh+nvBSNlQkLxzM7NnP4JAHPeqLkyx7NkCluPxTLVBP47Xe+cwRbE5FM3NapGA",
+    // ascii with punctuation and digits
+    "NOtceHp9LrSYpXvSP3ayPbgMUyX4hynBYt4KtHuwJDsv1ELco5QeUj9aJTYTqbw4KzRKY+RjsbR26N3smUeCmA",
+    // simple unicode (utf-8) greeting
+    "h3xQg25wr/XqaXgqXWJivbVgN89XQoZUN/JcSZB0jxOtkbVStY7hnO+pm3PnLv6yZ4ZDLrxzYpoBk05BR7Wo1A",
+    // pseudo-random bytes with embedded nuls
+    "kiUYjOegDM9n1ryWtZhukpTuZ8oZbhi2onpXNl6pg16R+JZj5ty4uJZs44YbCu0A9m35Xs3bi/mxfbSulbo5Rg",
+    // long run of identical bytes (1 kib of ascii 'A')
+    "xwGOG01h2kco4CgjJlD9T2v5bM8XVuCrYzKTM4D0s7rCnOH+HR1H2S2Tmg43M+ym1A+AEPTE4J7iGljgntTdZA",
+    // very large slice (10 mib of zeroes)
+    "xsHH9h63e1+254TSCQoWCl6L5eGOo0Zg+ubtQC8Inwj7dwW7oxg0kYCrnkuTRj+7bVYNjlRSDOa8OIdInp73wA",
+];
+
+/// known bad counterparts of `HASHES_STD_GOOD`, one per dataset entry in the same order
+pub const HASHES_STD_BAD: [&str; 8] = [
+    // off by one character (last “z” -> “y”)
+    "eGoC90IBWQPGxv2FJVLScpEvR0DhWEdhiobiF/cfVBnSXhAxr+5YUxOJZESTTrBLkDpoWxRIt1XVb3Aa/pviyg",
+    // truncated by dropping the final 4 chars
+    "CQk3etNREMr7KQnhhWcrfyco0fUJT4rWjW+sYnS/H0mUhagOo2TATtAG0pRZ6jy3xgAoDi+D4DJSmQb4iu",
+    // contains a non‐Base64 character (“#”)
+    "qK3Uvd39k+SHfSdG5igXsRY2Sh+nvBSNlQkLxzM7NnP4JAHPeqLkyx7NkCluPxTLVBP47Xe+cwRbE5FM3NapG#",
+    // too long, extra “AA” at end
+    "NOtceHp9LrSYpXvSP3ayPbgMUyX4hynBYt4KtHuwJDsv1ELco5QeUj9aJTYTqbw4KzRKY+RjsbR26N3smUeCmAAA",
+    // one byte altered at the front (“h” -> “H”)
+    "H3xQg25wr/XqaXgqXWJivbVgN89XQoZUN/JcSZB0jxOtkbVStY7hnO+pm3PnLv6yZ4ZDLrxzYpoBk05BR7Wo1A",
+    // garbled mid‐section
+    "kiUYjOegDM9n1ryWtZhukpTuZ8oZbhi2onpXYZ6pg16R+JZj5ty4uJZs44YbCu0A9m35Xs3bi/mxfbSulbo5Rg",
+    // entirely different length (too short)
+    "xwGOG01h2kco4CgjJlD9T2v5bM8XVuCrYzKTM4D0s7rCnO",
+    // correct length, but all “A”s (obviously wrong)
+    "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
+];
+
+// /// known good hashes of the test dataset
+// ///
+// /// using BLAKE2b, 128 bit, with unpadded Base64 (url safe variant)
+// const HASHES_URL_GOOD: [&str; 8] = [
+//     // empty slice
+//     "########################################################",
+//     // single-byte
+//     "########################################################",
+//     // common ascii pangram
+//     "",
+//     // ascii with punctuation and digits
+//     "",
+//     // simple unicode (utf-8) greeting
+//     "",
+//     // pseudo-random bytes with embedded nuls
+//     "",
+//     // long run of identical bytes (1 kib of ascii 'A')
+//     "",
+//     // very large slice (10 mib of zeroes)
+//     "",
+// ];
+
+pub fn data() -> impl Iterator<Item = &'static [u8]> {
+    DATA.iter().copied() // entries are `&'static [u8]`, so copying only duplicates the reference
+}
+
+pub fn cases() -> impl Iterator<Item = (&'static [u8], u64)> {
+    data().zip(DATA_LENGTHS)
+}
+
+pub fn cases_with<T>(addons: T) -> impl Iterator<Item = (&'static [u8], u64, T::Item)>
+where
+    T: IntoIterator,
+{
+    cases()
+        .zip(addons)
+        .map(|((data, len), addon)| (data, len, addon))
+}
diff --git a/src/test_util/mod.rs b/src/test_util/mod.rs
index 3687825..fbca94c 100644
--- a/src/test_util/mod.rs
+++ b/src/test_util/mod.rs
@@ -1,5 +1,7 @@
 #![cfg(test)]
 
+pub mod data;
+
 use std::{fmt, io::Write};
 
 use tempfile::NamedTempFile;