use std::{env, ops::Range, vec}; use chumsky::prelude::*; use quote::ToTokens; use text::whitespace; const FILES_TO_PARSE: &[&str] = &[ "jpeg", "images", "sgml", "riff", "animation", "audio", "matroska", "vorbis", "audio", "msdos", "webassembly", "elf", "mach", ]; const BLACKLISTED: &[&str] = &["fuji-", "canon-", "corel-", "dicom", "garmin"]; // remove extension entries not in this list from safe entries const SAFE_EXTENSIONS: &[&str] = &[ ".png", ".gif", ".jpeg", ".webp", ".avif", ".apng", ".bmp", ".tiff", ".x-icon", ".opus", ".ogg", ".mp4", ".m4v", ".3gpp", ".mpeg", ".webm", ".aac", ".flac", ".wav", ]; const SAFE_WHITELISTED: &[&str] = &[ "png", "gif", "jpeg", "webp", "avif", "apng", "bmp", "tiff", "x-icon", "opus", "ogg", "mp4", "m4v", "3gpp", "mpeg", "webm", "aac", "flac", "wav", "svg", "rss", ]; // we want to have signatures for these to be able to detect them const UNSAFE_WHITELISTED: &[&str] = &[ ".exe", ".wasm", "elf", "mach", "javascript", "bios", "firmware", "driver", "mpegurl", ]; fn static_signatures() -> Vec { vec![ MIMEAssociation { mime: "application/x-elf-executable".to_string().into(), ext: vec![".pie".to_string(), ".elf".to_string(), ".so".to_string()], safe: false, signatures: vec![FlattenedFileSignature { test: vec![0x7f, b'E', b'L', b'F'], mask: vec![0xff, 0xff, 0xff, 0xff], }], }, MIMEAssociation { mime: "application/x-mach-binary".to_string().into(), ext: vec![".dylib".to_string(), ".bundle".to_string()], safe: false, signatures: vec![FlattenedFileSignature { test: vec![0xfe, 0xed, 0xfa, 0xce], mask: vec![0xff, 0xff, 0xff, 0xff], }], }, MIMEAssociation { mime: "application/vnd.microsoft.portable-executable" .to_string() .into(), ext: vec![".exe".to_string(), ".dll".to_string(), ".sys".to_string()], safe: false, signatures: vec![FlattenedFileSignature { test: b"PE\0\0".to_vec(), mask: vec![0xff, 0xff, 0xff, 0xff], }], }, ] } #[derive(Debug, Clone)] pub enum MagicFileLine { Nop, Unknown, Magic { indent: u8, offset: u64, ty: MagicType, }, AssignAttr { attr: String, value: String, }, } #[derive(Debug, Clone)] pub enum MagicType { Unknown(String), Belong { test: Vec, mask: Option>, }, String { test: Vec, }, } pub fn parse_string_repr() -> impl Parser, Error = Simple> { just('\\') .ignore_then(choice(( just('\\').to(b'\\'), just('n').to(b'\n'), just('r').to(b'\r'), just('t').to(b'\t'), just('x').ignore_then( one_of("0123456789abcdefABCDEF") .repeated() .exactly(2) .map(|s| u8::from_str_radix(&s.iter().collect::(), 16).unwrap()), ), ))) .or(none_of("\\").map(|c| c as u8)) .repeated() .at_least(1) .map(|s| s.to_vec()) .then_ignore(end()) } pub fn parse_hex_repr() -> impl Parser, Error = Simple> { just("0x") .ignore_then( one_of("0123456789abcdef") .repeated() .exactly(2) .map(|s| u8::from_str_radix(&s.iter().collect::(), 16).unwrap()) .repeated() .at_least(1), ) .map(|s| s.to_vec()) .then_ignore(end()) } pub fn parse_magic_line() -> impl Parser> { choice(( just('#') .then_ignore(any().repeated()) .to(MagicFileLine::Nop), just('>') .repeated() .map(|i| i.len() as u8) .then( one_of("0123456789") .repeated() .at_least(1) .try_map(|s, span| { s.iter() .collect::() .parse::() .map_err(|_| Simple::custom(span, "Failed to parse number")) }) .or(just("0x").ignore_then( one_of("0123456789abcdefABCDEF") .repeated() .at_least(1) .try_map(|s, span| { u64::from_str_radix(&s.iter().collect::(), 16) .map_err(|_| Simple::custom(span, "Failed to parse number")) }), )), ) .then_ignore(whitespace().at_least(1)) .then( none_of(" \t\n") .repeated() .at_least(1) .map(String::from_iter), ) .then_ignore(whitespace().at_least(1)) .then( none_of(" \t\n") .repeated() .at_least(1) .map(String::from_iter), ) .try_map(|(((indent, offset), ty), rep), span: Range| { Ok(MagicFileLine::Magic { indent, offset, ty: match ty.as_str() { "string" => MagicType::String { test: parse_string_repr().parse(rep).map_err(|_| { Simple::custom(span, "Failed to parse string pattern") })?, }, "belong" => MagicType::Belong { test: parse_hex_repr() .parse(rep) .map_err(|_| Simple::custom(span, "Failed to parse hex pattern"))?, mask: None, }, s if s.starts_with("belong&") => { let mask = &s["belong&".len()..]; let span_clone = span.clone(); MagicType::Belong { test: parse_hex_repr().parse(rep).map_err(|_| { Simple::custom(span, "Failed to parse hex pattern") })?, mask: Some(parse_hex_repr().parse(mask).map_err(|_| { Simple::custom(span_clone, "Failed to parse hex pattern") })?), } } _ => MagicType::Unknown(ty), }, }) }) .then_ignore(any().repeated()), just("!:") .ignore_then( one_of("abcdefghijklmnopqrstuvwxyz") .repeated() .at_least(1) .map(|s| s.iter().collect()), ) .then_ignore(whitespace().at_least(1)) .then(any().repeated().map(String::from_iter)) .map(|(attr, value)| MagicFileLine::AssignAttr { attr, value }), )) .then_ignore(whitespace()) .then_ignore(end()) } #[derive(Debug, Clone, PartialEq, serde::Serialize)] pub struct FileSignature { pub offset: u64, pub test: Vec, pub mask: Option>, } #[derive(Debug, Clone, PartialEq, serde::Serialize)] pub struct FlattenedFileSignature { pub test: Vec, pub mask: Vec, } impl FlattenedFileSignature { fn codegen(&self) -> impl ToTokens { let data = self .test .iter() .copied() .zip(self.mask.iter().copied()) .map(|(t, m)| { quote::quote! { (#t, #m) } }); quote::quote! { FlattenedFileSignature(&[#(#data),*]) } } } impl From for FlattenedFileSignature { fn from(sig: FileSignature) -> Self { let len = sig.test.len(); FlattenedFileSignature { test: std::iter::repeat(0) .take(sig.offset as usize) .chain(sig.test) .collect(), mask: sig.mask.unwrap_or_else(|| { std::iter::repeat(0) .take(sig.offset as usize) .chain(std::iter::repeat(!0).take(len)) .collect() }), } } } impl std::ops::BitAnd for FlattenedFileSignature { type Output = FlattenedFileSignature; fn bitand(mut self, mut rhs: FlattenedFileSignature) -> Self::Output { if self.test.len() < rhs.test.len() { std::mem::swap(&mut self, &mut rhs); } let test = self .test .iter() .zip( rhs.test .iter() .chain(std::iter::repeat(&0).take(self.test.len() - rhs.test.len())), ) .map(|(a, b)| a | b) .collect(); let mask = self .mask .iter() .zip( rhs.mask .iter() .chain(std::iter::repeat(&0).take(self.test.len() - rhs.test.len())), ) .map(|(a, b)| a | b) .collect(); FlattenedFileSignature { test, mask } } } #[derive(Debug, Clone, PartialEq, serde::Serialize)] pub struct MIMEAssociation { pub mime: Option, pub ext: Vec, pub safe: bool, pub signatures: Vec, } impl MIMEAssociation { fn codegen(&self) -> impl ToTokens { let mime = self.mime.as_deref().unwrap_or(""); let ext = self.ext.first().map(|s| s.as_str()).unwrap_or(""); let safe = self.safe; let signatures = self.signatures.iter().map(|s| s.codegen()); quote::quote! { MIMEAssociation { mime: #mime, ext: #ext, safe: #safe, signatures: &[#(#signatures),*], } } } fn build_vec(lines: Vec) -> Vec { let mut stack = Vec::new(); let mut out: Vec = Vec::new(); for line in lines { match line { MagicFileLine::Magic { ty, offset, indent } => match ty { MagicType::Belong { test, mask } => { stack.truncate(indent as usize); stack.push(FileSignature { offset, test, mask }); } MagicType::String { test } => { stack.truncate(indent as usize); stack.push(FileSignature { offset, test, mask: None, }); } _ => {} }, MagicFileLine::AssignAttr { attr, value } => match attr.as_str() { "mime" if !stack.is_empty() => { let mime = value; let flattened = stack.iter().map(|sig| sig.clone().into()).fold( FlattenedFileSignature { test: Vec::new(), mask: Vec::new(), }, |a, b| a & b, ); if flattened.test.len() > 64 { eprintln!("Signature too long: {:?}", flattened.test.len()); continue; } if let Some(existing) = out .iter_mut() .find(|m| m.mime.as_deref().map(|m| m == mime).unwrap_or(false)) { existing.signatures.push(flattened); } else { out.push(MIMEAssociation { mime: Some(mime), safe: false, ext: vec![], signatures: vec![flattened], }); } } "ext" if !stack.is_empty() => { let ext = value; let flattened = stack.iter().map(|sig| sig.clone().into()).fold( FlattenedFileSignature { test: Vec::new(), mask: Vec::new(), }, |a, b| a & b, ); if flattened.test.len() > 64 { eprintln!("Signature too long: {:?}", flattened.test.len()); continue; } if let Some(existing) = out.iter_mut().find(|m| m.signatures.contains(&flattened)) { existing .ext .extend(ext.split('/').map(|s| format!(".{}", s))) } else { out.push(MIMEAssociation { mime: None, safe: false, ext: ext.split('/').map(|s| format!(".{}", s)).collect(), signatures: vec![flattened], }); } } _ => {} }, _ => {} } } out.iter_mut().for_each(|m| { m.ext.sort(); m.ext.dedup(); m.signatures.sort_by(|a, b| a.test.cmp(&b.test)); m.signatures.dedup(); }); out.dedup(); out } } const BASE_DIR: &str = "submodules/file/magic/Magdir/"; fn main() { let signatures = static_signatures() .into_iter() .chain(FILES_TO_PARSE.iter().flat_map(|file| { println!("cargo:rerun-if-changed={}", file); eprintln!("Using file: {}", file); let path = format!("{}{}", BASE_DIR, file); let content = std::fs::read(&path) .map(|v| String::from_utf8_lossy(&v).to_string()) .unwrap(); let lines = content .lines() .filter(|line| !line.is_empty()) .map(|line| { parse_magic_line().parse(line).unwrap_or_else(|e| { eprintln!("Failed to parse line: {:?}", line); eprintln!("Error: {:?}", e); MagicFileLine::Unknown }) }) .collect::>(); MIMEAssociation::build_vec(lines) .into_iter() .map(|mut m| { if m.mime .as_ref() .map(|m| UNSAFE_WHITELISTED.iter().any(|u| m.contains(u))) .unwrap_or(false) { m.safe = false; return m; } if m.ext .iter() .any(|ext| UNSAFE_WHITELISTED.iter().any(|u| ext.contains(u))) { m.safe = false; return m; } if m.mime .as_ref() .map(|m| SAFE_WHITELISTED.iter().any(|w| m.contains(w))) .unwrap_or(false) { m.safe = true; } if m.ext .iter() .any(|ext| SAFE_WHITELISTED.iter().any(|w| ext.contains(w))) { m.safe = true; } if m.safe { m.ext .retain(|ext| SAFE_EXTENSIONS.iter().any(|s| ext.contains(s))); } m }) .filter(|m| { if let Some(incoming) = &m.mime { let mime = incoming.to_lowercase(); if BLACKLISTED.iter().any(|b| mime.contains(b)) { return false; } if SAFE_WHITELISTED.iter().any(|w| mime.contains(w)) || UNSAFE_WHITELISTED.iter().any(|u| mime.contains(u)) { return true; } } if m.ext .iter() .any(|ext| BLACKLISTED.iter().any(|b| ext.contains(b))) { return false; } if let Some(incoming) = &m.mime { let mime = incoming.to_lowercase(); if SAFE_WHITELISTED.iter().all(|w| mime.contains(w)) || UNSAFE_WHITELISTED.iter().any(|u| mime.contains(u)) { return true; } } if m.ext.iter().any(|ext| { SAFE_WHITELISTED.iter().any(|w| ext.contains(w)) || UNSAFE_WHITELISTED.iter().any(|u| ext.contains(u)) }) { return true; } false }) })) .collect::>(); let max_size = signatures .iter() .map(|s| s.signatures.iter().map(|s| s.test.len()).max().unwrap()) .max() .unwrap(); if max_size > 128 { panic!("Max signature size is too large: {}", max_size); } std::fs::write(env::var("OUT_DIR").unwrap() + "/magic.rs", { let signatures = signatures.iter().map(|s| s.codegen()); quote::quote! { /// Maximum size of a signature pub const SNIFF_SIZE: usize = #max_size; #[allow(clippy::all)] const MAGICS: &[MIMEAssociation] = &[#(#signatures),*]; } .into_token_stream() .to_string() }) .unwrap(); }