// yumechi-no-kuni-proxy-worker/build.rs
//
// 566 lines
// 19 KiB
// Rust
// Raw Normal View History

use std::{env, ops::Range, vec};
use chumsky::prelude::*;
use quote::ToTokens;
use text::whitespace;
/// Names of the libmagic database files (relative to `BASE_DIR`) that are
/// parsed into signatures.
///
/// Every name must appear only once: each listed file is parsed on its own, so
/// a repeated name would emit all of that file's associations a second time.
const FILES_TO_PARSE: &[&str] = &[
    "jpeg",
    "images",
    "sgml",
    "riff",
    "animation",
    "audio",
    "matroska",
    "vorbis",
    "msdos",
    "webassembly",
    "elf",
    "mach",
];
// Substrings that exclude a parsed association from the generated table.
// `main` drops an association when its lowercased MIME type contains one of
// these; the extensions are checked against the same list, but only if the
// MIME type did not already match one of the whitelists.
const BLACKLISTED: &[&str] = &["fuji-", "canon-", "corel-", "dicom", "garmin"];
// Extensions that may stay attached to an association classified as safe.
// For safe associations `main` keeps only the extensions that contain one of
// these strings (substring match) and removes all others.
const SAFE_EXTENSIONS: &[&str] = &[
".png", ".gif", ".jpeg", ".webp", ".avif", ".apng", ".bmp", ".tiff", ".x-icon", ".opus",
".ogg", ".mp4", ".m4v", ".3gpp", ".mpeg", ".webm", ".aac", ".flac", ".wav",
];
// Substrings that mark an association as safe and keep it in the output.
// An association whose MIME type or any extension contains one of these is
// flagged `safe = true`, unless it already matched `UNSAFE_WHITELISTED`.
const SAFE_WHITELISTED: &[&str] = &[
"png", "gif", "jpeg", "webp", "avif", "apng", "bmp", "tiff", "x-icon", "opus", "ogg", "mp4",
"m4v", "3gpp", "mpeg", "webm", "aac", "flac", "wav", "svg", "rss",
];
// Substrings identifying content we want signatures for so that it can be
// detected, even though it must never be treated as safe.
// A match on the MIME type or on any extension forces `safe = false` (this is
// checked before `SAFE_WHITELISTED`) and keeps the association in the output.
const UNSAFE_WHITELISTED: &[&str] = &[
".exe",
".wasm",
"elf",
"mach",
"javascript",
"bios",
"firmware",
"driver",
"mpegurl",
];
fn static_signatures() -> Vec<MIMEAssociation> {
vec![
MIMEAssociation {
mime: "application/x-elf-executable".to_string().into(),
ext: vec![".pie".to_string(), ".elf".to_string(), ".so".to_string()],
safe: false,
signatures: vec![FlattenedFileSignature {
test: vec![0x7f, b'E', b'L', b'F'],
mask: vec![0xff, 0xff, 0xff, 0xff],
}],
},
MIMEAssociation {
mime: "application/x-mach-binary".to_string().into(),
ext: vec![".dylib".to_string(), ".bundle".to_string()],
safe: false,
signatures: vec![FlattenedFileSignature {
test: vec![0xfe, 0xed, 0xfa, 0xce],
mask: vec![0xff, 0xff, 0xff, 0xff],
}],
},
MIMEAssociation {
mime: "application/vnd.microsoft.portable-executable"
.to_string()
.into(),
ext: vec![".exe".to_string(), ".dll".to_string(), ".sys".to_string()],
safe: false,
signatures: vec![FlattenedFileSignature {
test: b"PE\0\0".to_vec(),
mask: vec![0xff, 0xff, 0xff, 0xff],
}],
},
MIMEAssociation {
mime: "image/vnd.microsoft.icon".to_string().into(),
ext: vec![".ico".to_string()],
safe: true,
signatures: vec![FlattenedFileSignature {
test: vec![0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00],
mask: vec![0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff],
}],
},
]
}
/// One classified line of a libmagic database file.
#[derive(Debug, Clone)]
pub enum MagicFileLine {
/// A comment line (one starting with `#`).
Nop,
/// A line that could not be parsed; `main` substitutes this variant when
/// `parse_magic_line` returns an error.
Unknown,
/// A test line: nesting level, byte offset and the test to perform.
Magic {
/// Number of leading `>` characters.
indent: u8,
/// Offset at which the test applies.
offset: u64,
/// The parsed test, or `MagicType::Unknown` for unsupported type names.
ty: MagicType,
},
/// An attribute line of the form `!:<attr> <value>`.
AssignAttr {
/// Attribute name (lowercase ASCII letters only).
attr: String,
/// Everything following the whitespace after the attribute name.
value: String,
},
}
/// The test carried by a `MagicFileLine::Magic` line.
#[derive(Debug, Clone)]
pub enum MagicType {
/// Any type name other than `string`, `belong` or `belong&<mask>`; the
/// payload is the type name as written.
Unknown(String),
/// A `belong` test: the bytes written as `0x…` hex, plus the optional mask
/// taken from a `belong&0x…` type name.
Belong {
test: Vec<u8>,
mask: Option<Vec<u8>>,
},
/// A `string` test with its escapes already decoded into raw bytes.
String {
test: Vec<u8>,
},
}
/// Parser for the value of a `string` test.
///
/// The whole input must be consumed and must decode to at least one byte.
/// Recognised escapes are `\\`, `\n`, `\r`, `\t` and `\xNN` (exactly two hex
/// digits of either case); any other backslash sequence is a parse error.
/// Every other character is converted with `as u8`, which keeps only the low
/// eight bits of its code point.
pub fn parse_string_repr() -> impl Parser<char, Vec<u8>, Error = Simple<char>> {
    // `xNN` following a backslash: one byte written as two hex digits.
    let hex_byte = just('x').ignore_then(
        one_of("0123456789abcdefABCDEF")
            .repeated()
            .exactly(2)
            .map(|digits| {
                let text: String = digits.into_iter().collect();
                u8::from_str_radix(&text, 16).unwrap()
            }),
    );

    // A backslash followed by one of the supported escape bodies.
    let escaped = just('\\').ignore_then(choice((
        just('\\').to(b'\\'),
        just('n').to(b'\n'),
        just('r').to(b'\r'),
        just('t').to(b'\t'),
        hex_byte,
    )));

    // Any character that does not start an escape sequence.
    let literal = none_of("\\").map(|c| c as u8);

    escaped
        .or(literal)
        .repeated()
        .at_least(1)
        .then_ignore(end())
}
/// Parser for a `0x…` hexadecimal byte string, used for `belong` values and
/// masks.
///
/// After the `0x` prefix the input must consist of one or more complete byte
/// pairs and nothing else; the bytes are returned in the order written. An odd
/// number of digits is a parse error.
///
/// Digits of either case are accepted, matching the other hex parsers in this
/// file; previously upper-case digits were rejected.
pub fn parse_hex_repr() -> impl Parser<char, Vec<u8>, Error = Simple<char>> {
    just("0x")
        .ignore_then(
            one_of("0123456789abcdefABCDEF")
                .repeated()
                .exactly(2)
                // Cannot fail: the input is exactly two hex digits.
                .map(|s| u8::from_str_radix(&s.iter().collect::<String>(), 16).unwrap())
                .repeated()
                .at_least(1),
        )
        .then_ignore(end())
}
/// Parser that classifies a single line of a libmagic database file.
///
/// Three line shapes are recognised; anything else is a parse error:
///
/// * `#…` - a comment, yielding [`MagicFileLine::Nop`].
/// * `[>…]<offset> <type> <test> …` - a test line, yielding
///   [`MagicFileLine::Magic`]. The number of leading `>` is the indent, the
///   offset is decimal or `0x` hexadecimal, and text after the test field is
///   ignored. Types `string`, `belong` and `belong&<mask>` are decoded; every
///   other type name becomes [`MagicType::Unknown`]. A `string`/`belong` test
///   whose value cannot be decoded makes the whole line fail.
/// * `!:<attr> <value>` - an attribute, yielding
///   [`MagicFileLine::AssignAttr`]; `<attr>` is lowercase ASCII letters and
///   `<value>` is the rest of the line.
pub fn parse_magic_line() -> impl Parser<char, MagicFileLine, Error = Simple<char>> {
    choice((
        just('#')
            .then_ignore(any().repeated())
            .to(MagicFileLine::Nop),
        just('>')
            .repeated()
            .map(|i| i.len() as u8)
            .then(
                // The hexadecimal form has to be tried first: the decimal
                // parser would otherwise accept the leading "0" of "0x…" and
                // the line would then fail on the 'x', leaving the hex
                // alternative unreachable.
                just("0x")
                    .ignore_then(
                        one_of("0123456789abcdefABCDEF")
                            .repeated()
                            .at_least(1)
                            .try_map(|s, span| {
                                u64::from_str_radix(&s.iter().collect::<String>(), 16)
                                    .map_err(|_| Simple::custom(span, "Failed to parse number"))
                            }),
                    )
                    .or(one_of("0123456789")
                        .repeated()
                        .at_least(1)
                        .try_map(|s, span| {
                            s.iter()
                                .collect::<String>()
                                .parse::<u64>()
                                .map_err(|_| Simple::custom(span, "Failed to parse number"))
                        })),
            )
            .then_ignore(whitespace().at_least(1))
            // Type field: everything up to the next blank.
            .then(
                none_of(" \t\n")
                    .repeated()
                    .at_least(1)
                    .map(String::from_iter),
            )
            .then_ignore(whitespace().at_least(1))
            // Test field: everything up to the next blank.
            .then(
                none_of(" \t\n")
                    .repeated()
                    .at_least(1)
                    .map(String::from_iter),
            )
            .try_map(|(((indent, offset), ty), rep), span: Range<usize>| {
                Ok(MagicFileLine::Magic {
                    indent,
                    offset,
                    ty: match ty.as_str() {
                        "string" => MagicType::String {
                            test: parse_string_repr().parse(rep).map_err(|_| {
                                Simple::custom(span, "Failed to parse string pattern")
                            })?,
                        },
                        "belong" => MagicType::Belong {
                            test: parse_hex_repr()
                                .parse(rep)
                                .map_err(|_| Simple::custom(span, "Failed to parse hex pattern"))?,
                            mask: None,
                        },
                        // `belong&0x…`: the mask is written as part of the type name.
                        s if s.starts_with("belong&") => {
                            let mask = &s["belong&".len()..];
                            let span_clone = span.clone();
                            MagicType::Belong {
                                test: parse_hex_repr().parse(rep).map_err(|_| {
                                    Simple::custom(span, "Failed to parse hex pattern")
                                })?,
                                mask: Some(parse_hex_repr().parse(mask).map_err(|_| {
                                    Simple::custom(span_clone, "Failed to parse hex pattern")
                                })?),
                            }
                        }
                        _ => MagicType::Unknown(ty),
                    },
                })
            })
            // The human-readable description after the test is not needed.
            .then_ignore(any().repeated()),
        just("!:")
            .ignore_then(
                one_of("abcdefghijklmnopqrstuvwxyz")
                    .repeated()
                    .at_least(1)
                    .map(|s| s.iter().collect()),
            )
            .then_ignore(whitespace().at_least(1))
            .then(any().repeated().map(String::from_iter))
            .map(|(attr, value)| MagicFileLine::AssignAttr { attr, value }),
    ))
    .then_ignore(whitespace())
    .then_ignore(end())
}
/// A single test taken from one magic line: the bytes expected at `offset`.
#[derive(Debug, Clone, PartialEq, serde::Serialize)]
pub struct FileSignature {
/// Offset of the first tested byte, as written on the magic line.
pub offset: u64,
/// Expected bytes.
pub test: Vec<u8>,
/// Optional bit mask; `None` is turned into an all-ones mask when the
/// signature is converted to a `FlattenedFileSignature`.
pub mask: Option<Vec<u8>>,
}
/// A signature without an offset field: the conversion from `FileSignature`
/// prepends `offset` zero bytes instead, so index `i` stands for byte `i`.
#[derive(Debug, Clone, PartialEq, serde::Serialize)]
pub struct FlattenedFileSignature {
/// Expected byte values.
pub test: Vec<u8>,
/// Mask bytes paired with `test` position by position during code
/// generation; a zero mask byte is used for padding positions.
pub mask: Vec<u8>,
}
impl FlattenedFileSignature {
    /// Emits this signature as the tokens `FlattenedFileSignature(&[(test, mask), ...])`.
    ///
    /// Test and mask bytes are paired position by position; if the two vectors
    /// differ in length the surplus of the longer one is not emitted.
    fn codegen(&self) -> impl ToTokens {
        let pairs = self
            .test
            .iter()
            .zip(self.mask.iter())
            .map(|(&t, &m)| quote::quote! { (#t, #m) });
        quote::quote! {
            FlattenedFileSignature(&[#(#pairs),*])
        }
    }
}
impl From<FileSignature> for FlattenedFileSignature {
fn from(sig: FileSignature) -> Self {
let len = sig.test.len();
FlattenedFileSignature {
test: std::iter::repeat(0)
.take(sig.offset as usize)
.chain(sig.test)
.collect(),
mask: sig.mask.unwrap_or_else(|| {
std::iter::repeat(0)
.take(sig.offset as usize)
.chain(std::iter::repeat(!0).take(len))
.collect()
}),
}
}
}
impl std::ops::BitAnd<FlattenedFileSignature> for FlattenedFileSignature {
    type Output = FlattenedFileSignature;

    /// Combines two signatures into one that carries the constraints of both.
    ///
    /// The operand with fewer test bytes is extended with zero bytes up to the
    /// length of the other one, then test bytes and mask bytes are OR-ed
    /// position by position.
    fn bitand(self, rhs: FlattenedFileSignature) -> Self::Output {
        // `wide` is the operand with at least as many test bytes.
        let (wide, narrow) = if self.test.len() < rhs.test.len() {
            (rhs, self)
        } else {
            (self, rhs)
        };
        // The padding amount is derived from the test lengths for both vectors.
        let pad = wide.test.len() - narrow.test.len();

        let merge = |long: &[u8], short: &[u8]| -> Vec<u8> {
            let padded = short
                .iter()
                .copied()
                .chain(std::iter::repeat(0u8).take(pad));
            long.iter().zip(padded).map(|(a, b)| a | b).collect()
        };

        FlattenedFileSignature {
            test: merge(&wide.test, &narrow.test),
            mask: merge(&wide.mask, &narrow.mask),
        }
    }
}
/// A MIME type and/or set of extensions together with the signatures that
/// identify it.
#[derive(Debug, Clone, PartialEq, serde::Serialize)]
pub struct MIMEAssociation {
/// MIME type; `None` for entries created from an `!:ext` attribute alone.
pub mime: Option<String>,
/// File extensions including the leading dot. Code generation emits only
/// the first one.
pub ext: Vec<String>,
/// Whether the content is classified as safe; parsed entries start out as
/// `false` and are classified in `main`.
pub safe: bool,
/// Signatures that identify this content.
pub signatures: Vec<FlattenedFileSignature>,
}
impl MIMEAssociation {
fn codegen(&self) -> impl ToTokens {
let mime = self.mime.as_deref().unwrap_or("");
let ext = self.ext.first().map(|s| s.as_str()).unwrap_or("");
let safe = self.safe;
let signatures = self.signatures.iter().map(|s| s.codegen());
quote::quote! {
MIMEAssociation {
mime: #mime,
ext: #ext,
safe: #safe,
signatures: &[#(#signatures),*],
}
}
}
fn build_vec(lines: Vec<MagicFileLine>) -> Vec<MIMEAssociation> {
let mut stack = Vec::new();
let mut out: Vec<MIMEAssociation> = Vec::new();
for line in lines {
match line {
MagicFileLine::Magic { ty, offset, indent } => match ty {
MagicType::Belong { test, mask } => {
stack.truncate(indent as usize);
stack.push(FileSignature { offset, test, mask });
}
MagicType::String { test } => {
stack.truncate(indent as usize);
stack.push(FileSignature {
offset,
test,
mask: None,
});
}
_ => {}
},
MagicFileLine::AssignAttr { attr, value } => match attr.as_str() {
"mime" if !stack.is_empty() => {
let mime = value;
let flattened = stack.iter().map(|sig| sig.clone().into()).fold(
FlattenedFileSignature {
test: Vec::new(),
mask: Vec::new(),
},
|a, b| a & b,
);
if flattened.test.len() > 64 {
eprintln!("Signature too long: {:?}", flattened.test.len());
continue;
}
if let Some(existing) = out
.iter_mut()
.find(|m| m.mime.as_deref().map(|m| m == mime).unwrap_or(false))
{
existing.signatures.push(flattened);
} else {
out.push(MIMEAssociation {
mime: Some(mime),
safe: false,
ext: vec![],
signatures: vec![flattened],
});
}
}
"ext" if !stack.is_empty() => {
let ext = value;
let flattened = stack.iter().map(|sig| sig.clone().into()).fold(
FlattenedFileSignature {
test: Vec::new(),
mask: Vec::new(),
},
|a, b| a & b,
);
if flattened.test.len() > 64 {
eprintln!("Signature too long: {:?}", flattened.test.len());
continue;
}
if let Some(existing) =
out.iter_mut().find(|m| m.signatures.contains(&flattened))
{
existing
.ext
.extend(ext.split('/').map(|s| format!(".{}", s)))
} else {
out.push(MIMEAssociation {
mime: None,
safe: false,
ext: ext.split('/').map(|s| format!(".{}", s)).collect(),
signatures: vec![flattened],
});
}
}
_ => {}
},
_ => {}
}
}
out.iter_mut().for_each(|m| {
m.ext.sort();
m.ext.dedup();
m.signatures.sort_by(|a, b| a.test.cmp(&b.test));
m.signatures.dedup();
});
out.dedup();
out
}
}
// Directory holding the libmagic database files; each name in
// `FILES_TO_PARSE` is appended to it. It is a relative path, so it is
// resolved against the build script's working directory.
const BASE_DIR: &str = "submodules/file/magic/Magdir/";
fn main() {
let signatures = static_signatures()
.into_iter()
.chain(FILES_TO_PARSE.iter().flat_map(|file| {
println!("cargo:rerun-if-changed={}", file);
eprintln!("Using file: {}", file);
let path = format!("{}{}", BASE_DIR, file);
let content = std::fs::read(&path)
.map(|v| String::from_utf8_lossy(&v).to_string())
.unwrap();
let lines = content
.lines()
.filter(|line| !line.is_empty())
.map(|line| {
parse_magic_line().parse(line).unwrap_or_else(|e| {
eprintln!("Failed to parse line: {:?}", line);
eprintln!("Error: {:?}", e);
MagicFileLine::Unknown
})
})
.collect::<Vec<_>>();
MIMEAssociation::build_vec(lines)
.into_iter()
.map(|mut m| {
if m.mime
.as_ref()
.map(|m| UNSAFE_WHITELISTED.iter().any(|u| m.contains(u)))
.unwrap_or(false)
{
m.safe = false;
return m;
}
if m.ext
.iter()
.any(|ext| UNSAFE_WHITELISTED.iter().any(|u| ext.contains(u)))
{
m.safe = false;
return m;
}
if m.mime
.as_ref()
.map(|m| SAFE_WHITELISTED.iter().any(|w| m.contains(w)))
.unwrap_or(false)
{
m.safe = true;
}
if m.ext
.iter()
.any(|ext| SAFE_WHITELISTED.iter().any(|w| ext.contains(w)))
{
m.safe = true;
}
if m.safe {
m.ext
.retain(|ext| SAFE_EXTENSIONS.iter().any(|s| ext.contains(s)));
}
m
})
.filter(|m| {
if let Some(incoming) = &m.mime {
let mime = incoming.to_lowercase();
if BLACKLISTED.iter().any(|b| mime.contains(b)) {
return false;
}
if SAFE_WHITELISTED.iter().any(|w| mime.contains(w))
|| UNSAFE_WHITELISTED.iter().any(|u| mime.contains(u))
{
return true;
}
}
if m.ext
.iter()
.any(|ext| BLACKLISTED.iter().any(|b| ext.contains(b)))
{
return false;
}
if let Some(incoming) = &m.mime {
let mime = incoming.to_lowercase();
if SAFE_WHITELISTED.iter().all(|w| mime.contains(w))
|| UNSAFE_WHITELISTED.iter().any(|u| mime.contains(u))
{
return true;
}
}
if m.ext.iter().any(|ext| {
SAFE_WHITELISTED.iter().any(|w| ext.contains(w))
|| UNSAFE_WHITELISTED.iter().any(|u| ext.contains(u))
}) {
return true;
}
false
})
}))
.collect::<Vec<_>>();
let max_size = signatures
.iter()
.map(|s| s.signatures.iter().map(|s| s.test.len()).max().unwrap())
.max()
.unwrap();
if max_size > 128 {
panic!("Max signature size is too large: {}", max_size);
}
std::fs::write(env::var("OUT_DIR").unwrap() + "/magic.rs", {
let signatures = signatures.iter().map(|s| s.codegen());
quote::quote! {
/// Maximum size of a signature
pub const SNIFF_SIZE: usize = #max_size;
#[allow(clippy::all)]
const MAGICS: &[MIMEAssociation] = &[#(#signatures),*];
}
.into_token_stream()
.to_string()
})
.unwrap();
}