]> code.octet-stream.net Git - hashgood/blob - src/calculate.rs
3ba5a7d0538be39b45a5616386e8082aebbb566f
[hashgood] / src / calculate.rs
1 use super::Algorithm;
2 use md5::{Digest, Md5};
3 use sha1::Sha1;
4 use sha2::Sha256;
5 use sha2::Sha512;
6 use std::error::Error;
7 use std::fs::File;
8 use std::io::prelude::*;
9 use std::path::Path;
10 use std::sync::mpsc::{channel, Receiver};
11 use std::sync::Arc;
12 use std::thread;
13 use std::thread::JoinHandle;
14
/// Outcome of a digest calculation: on success, one `(algorithm, digest bytes)` pair per
/// calculated digest; on failure, a boxed error describing what went wrong.
pub type CalculateResult = Result<Vec<(Algorithm, Vec<u8>)>, Box<dyn Error>>;
16
/// Open the input named by `input` for reading.
///
/// The special path "-" selects standard input. Any other path must exist and refer to a
/// regular file; otherwise a human-readable error message is returned. A failure to open
/// the file is reported the same way.
pub fn get_input_reader(input: &Path) -> Result<Box<dyn Read>, String> {
    // "-" is the special spelling for standard input
    if let Some("-") = input.to_str() {
        return Ok(Box::new(std::io::stdin()));
    }

    // Lossy conversion keeps the messages printable even for non-UTF-8 paths
    let shown = input.to_string_lossy();
    if !input.exists() {
        return Err(format!("The path '{}' does not exist.", shown));
    }
    if !input.is_file() {
        return Err(format!("The path '{}' is not a regular file.", shown));
    }

    File::open(input)
        .map(|file| Box::new(file) as Box<dyn Read>)
        .map_err(|e| format!("File open: {}", e))
}
40
41 /// For the given input stream, calculate all requested digest types
42 pub fn create_digests(algorithms: &[Algorithm], mut input: Box<dyn Read>) -> CalculateResult {
43 let mut senders = vec![];
44 let mut handles = vec![];
45
46 if algorithms.contains(&Algorithm::Md5) {
47 let (s, r) = channel();
48 senders.push(s);
49 handles.push(md5_digest(r));
50 }
51 if algorithms.contains(&Algorithm::Sha1) {
52 let (s, r) = channel();
53 senders.push(s);
54 handles.push(sha1_digest(r));
55 }
56 if algorithms.contains(&Algorithm::Sha256) {
57 let (s, r) = channel();
58 senders.push(s);
59 handles.push(sha256_digest(r));
60 }
61 if algorithms.contains(&Algorithm::Sha512) {
62 let (s, r) = channel();
63 senders.push(s);
64 handles.push(sha512_digest(r));
65 }
66
67 // 64 KB chunks will be read from the input at 64 KB and supplied to all hashing threads at once
68 // Right now that could be up to three threads. If CPU-bound, the other threads will mostly block while the slowest one finishes
69 const BUF_SIZE: usize = 1024 * 64;
70 let mut buf = [0; BUF_SIZE];
71 while let Ok(size) = input.read(&mut buf) {
72 if size == 0 {
73 break;
74 } else {
75 // Create a shared read-only copy for the hashers to take as input
76 // buf is freed up for more reading
77 let chunk = Arc::new(buf[0..size].to_vec());
78 for s in &senders {
79 s.send(chunk.clone())?;
80 }
81 }
82 }
83 drop(senders);
84 // Once all data has been sent we just have to wait for the digests to fall out
85 Ok(handles.into_iter().map(|h| h.join().unwrap()).collect())
86 }
87
88 /// Calculate the md5 digest of some data on the given channel
89 fn md5_digest(rx: Receiver<Arc<Vec<u8>>>) -> JoinHandle<(Algorithm, Vec<u8>)> {
90 thread::spawn(move || {
91 let mut md5 = Md5::new();
92 while let Ok(chunk) = rx.recv() {
93 md5.update(&*chunk);
94 }
95 let result = md5.finalize();
96 (Algorithm::Md5, result.to_vec())
97 })
98 }
99
100 /// Calculate the sha1 digest of some data on the given channel
101 fn sha1_digest(rx: Receiver<Arc<Vec<u8>>>) -> JoinHandle<(Algorithm, Vec<u8>)> {
102 thread::spawn(move || {
103 let mut sha1 = Sha1::new();
104 while let Ok(chunk) = rx.recv() {
105 sha1.update(&*chunk);
106 }
107 let result = sha1.finalize();
108 (Algorithm::Sha1, result.to_vec())
109 })
110 }
111
112 /// Calculate the sha256 digest of some data on the given channel
113 fn sha256_digest(rx: Receiver<Arc<Vec<u8>>>) -> JoinHandle<(Algorithm, Vec<u8>)> {
114 thread::spawn(move || {
115 let mut sha256 = Sha256::new();
116 while let Ok(chunk) = rx.recv() {
117 sha256.update(&*chunk);
118 }
119 let result = sha256.finalize();
120 (Algorithm::Sha256, result.to_vec())
121 })
122 }
123
124 /// Calculate the sha512 digest of some data on the given channel
125 fn sha512_digest(rx: Receiver<Arc<Vec<u8>>>) -> JoinHandle<(Algorithm, Vec<u8>)> {
126 thread::spawn(move || {
127 let mut sha512 = Sha512::new();
128 while let Ok(chunk) = rx.recv() {
129 sha512.update(&*chunk);
130 }
131 let result = sha512.finalize();
132 (Algorithm::Sha512, result.to_vec())
133 })
134 }
135
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    // Ten bytes of 'A': small enough to arrive in a single read.
    static SMALL_DATA: [u8; 10] = [b'A'; 10];
    // python3 -c 'print ("A"*10, end="", flush=True)' | md5sum
    static SMALL_DATA_MD5: &str = "16c52c6e8326c071da771e66dc6e9e57";
    // python3 -c 'print ("A"*10, end="", flush=True)' | sha1sum
    static SMALL_DATA_SHA1: &str = "c71613a7386fd67995708464bf0223c0d78225c4";
    // python3 -c 'print ("A"*10, end="", flush=True)' | sha256sum
    static SMALL_DATA_SHA256: &str =
        "1d65bf29403e4fb1767522a107c827b8884d16640cf0e3b18c4c1dd107e0d49d";

    // One million bytes of 'B': spans many 64 KB read chunks.
    static LARGE_DATA: [u8; 1_000_000] = [b'B'; 1_000_000];
    // python3 -c 'print ("B"*1000000, end="", flush=True)' | md5sum
    static LARGE_DATA_MD5: &str = "9171f6d67a87ca649a702434a03458a1";
    // python3 -c 'print ("B"*1000000, end="", flush=True)' | sha1sum
    static LARGE_DATA_SHA1: &str = "cfae4cebfd01884111bdede7cf983626bb249c94";
    // python3 -c 'print ("B"*1000000, end="", flush=True)' | sha256sum
    static LARGE_DATA_SHA256: &str =
        "b9193853f7798e92e2f6b82eda336fa7d6fc0fa90fdefe665f372b0bad8cdf8c";

    /// Hash `data` with the single algorithm `alg` and check that exactly one digest comes
    /// back and that it equals the hex-encoded `hash`.
    fn verify_digest(alg: Algorithm, data: &'static [u8], hash: &str) {
        let expected = hex::decode(hash).unwrap();
        let results = create_digests(&[alg], Box::new(Cursor::new(data))).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0], (alg, expected));
    }

    /// MD5, SHA-1 and SHA-256 digests must be correct for a small piece of test data
    /// (single block).
    #[test]
    fn small_digests() {
        let expectations = [
            (Algorithm::Md5, SMALL_DATA_MD5),
            (Algorithm::Sha1, SMALL_DATA_SHA1),
            (Algorithm::Sha256, SMALL_DATA_SHA256),
        ];
        for &(alg, hash) in expectations.iter() {
            verify_digest(alg, &SMALL_DATA, hash);
        }
    }

    /// MD5, SHA-1 and SHA-256 digests must be correct for a large piece of test data.
    /// Here "large" means the data spans several of the 64 KB blocks used to break up the
    /// input processing. One million bytes (rather than 1 MiB) makes the final block
    /// slightly smaller than the others.
    #[test]
    fn large_digests() {
        let expectations = [
            (Algorithm::Md5, LARGE_DATA_MD5),
            (Algorithm::Sha1, LARGE_DATA_SHA1),
            (Algorithm::Sha256, LARGE_DATA_SHA256),
        ];
        for &(alg, hash) in expectations.iter() {
            verify_digest(alg, &LARGE_DATA, hash);
        }
    }
}