Compare commits

..

No commits in common. "01fb0a0cc87a2387f17ab318643aad471051aa0c" and "2cdeeb4f36d9b8bc5eef7ec3e8cbf21741700586" have entirely different histories.

38 changed files with 6419 additions and 2 deletions

2
.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
data/
target/

1038
Cargo.lock generated Normal file

File diff suppressed because it is too large Load diff

44
Cargo.toml Normal file
View file

@ -0,0 +1,44 @@
[package]
name = "gfidx"
version = "0.1.0"
edition = "2021"
[lib]
crate-type = ["cdylib", "rlib"]
[dependencies]
ciborium = "0.2.2"
clap = { version = "4.5.13", features = ["derive"] }
flate2 = "1.0.30"
hashbrown = { version = "0.14.5", features = ["inline-more", "serde"] }
indicatif = "0.17.8"
itertools = "0.13.0"
libc = "0.2.155"
num-traits = "0.2.19"
ouroboros = "0.18.4"
rayon = { version = "1.10.0", optional = true }
serde = { version = "1.0.204", features = ["derive"] }
tabled = "0.15.0"
thiserror = "1.0.63"
varint-rs = "2.2.0"
[dev-dependencies]
criterion = { version = "0.5.1", features = ["html_reports"] }
[features]
# `parallel` is enabled unless the consumer passes `--no-default-features`.
default = ["parallel"]
# NOTE(review): enables no dependency from this manifest; presumably it only gates code inside the crate - confirm.
async = []
# NOTE(review): only switches on `async`; no `tokio` dependency is declared in this manifest - confirm this is intended.
tokio = ["async"]
# Pulls in the optional `rayon` dependency declared under [dependencies].
parallel = ["rayon"]
[[bench]]
name = "scan_gff3"
# Criterion generates its own `main`, so the default libtest bench harness is disabled.
harness = false
# benches/scan_gff3.rs calls `rayon::scope`, and `rayon` is only available when the
# `parallel` feature is on. Declaring the requirement makes Cargo skip this target
# (instead of failing to compile it) when built with `--no-default-features`.
required-features = ["parallel"]
[profile.release]
debug = true
lto = true
[[bin]]
name = "gfidx"
path = "src/bin/gfidx.rs"

View file

@ -1,3 +1,34 @@
# GFidx
GFidx is a GFF3 file indexer. It reads a GFF3 file and creates an index file that can be used to quickly retrieve features by ID, attribute, or range.
```shell
$ gfidx index -f example.gff3
$ gfidx query -f example.gff3
# CDCA8 - Cell Division Cycle Associated 8
relation ENSG00000134690.11
<...>
chr1 HAVANA CDS 37692905 37693033 . + 2 ID=CDS%3AENST00000327331.2;Parent=ENST00000327331.2;gene_id=ENSG00000134690.11;transcript_id=ENST00000327331.2;gene_type=protein_coding;gene_name=CDCA8;transcript_type=protein_coding;transcript_name=CDCA8-201;exon_number=3;exon_id=ENSE00000916824.1;level=2;protein_id=ENSP00000316121.2;transcript_support_level=1;hgnc_id=HGNC%3A14629;tag=alternative_5_UTR%2Cbasic%2CGENCODE_Primary%2Cappris_principal_1%2CCCDS;ccdsid=CCDS424.1;havana_gene=OTTHUMG00000004320.2;havana_transcript=OTTHUMT00000012474.1
Query took: 2.961754ms
34 lines found
Query cost 272.00 KB bytes
# Nucleotide Sugar Transporter Family
trie gene_name SLC35
<...>
chr13 ENSEMBL gene 20612161 20612338 . + . ID=ENSG00000222726.1;gene_id=ENSG00000222726.1;gene_type=snRNA;gene_name=RNU2-7P;level=3;hgnc_id=HGNC%3A42505
Query took: 138.7453ms
2926 lines found
Query cost 22.00 MB bytes
range chr3 650000 1500000
<...>
chr3 HAVANA gene 1595777 1596245 . - . ID=ENSG00000184423.5;gene_id=ENSG00000184423.5;gene_type=processed_pseudogene;gene_name=RPL23AP38;level=1;hgnc_id=HGNC%3A36351;tag=pseudo_consens;havana_gene=OTTHUMG00000154860.1
Query took: 2.999234ms
243 lines found
Query cost 120.00 KB bytes
```

129
benches/scan_gff3.rs Normal file
View file

@ -0,0 +1,129 @@
use std::{
io::{Cursor, Read},
sync::atomic::AtomicU64,
};
use criterion::{criterion_group, criterion_main, Criterion};
use flate2::read::GzDecoder;
use gfidx::{
attr_trie_def,
gff3::{Gff3Line, Gff3Read},
index::range::DEFAULT_RANGE_INTERVAL,
io::{stream::Gff3SeekableStreamReader, tee::Gff3BroadcastRead},
};
/// Path of the gzip-compressed GFF3 annotation file that every benchmark in this file
/// opens and decompresses.
///
/// The path is relative, so it is presumably resolved against the directory the
/// benchmarks are launched from; the file has to be present locally (TODO confirm
/// where it is expected to come from).
const TEST_FILE: &str = "data/gencode.v46.chr_patch_hapl_scaff.annotation.gff3.gz";
/// Benchmarks a sequential scan of the GFF3 test file.
///
/// Every line is read through a [`Gff3SeekableStreamReader`] and tallied: one count per
/// comment, one per directive, and the number of attributes per feature line.
///
/// The archive is opened and decompressed once, before the timed closure, so each sample
/// measures only the scan itself and not file I/O plus gzip inflation. This is the same
/// setup/measurement split that `benchmark_index_gff3` uses.
///
/// # Panics
///
/// Panics if [`TEST_FILE`] cannot be opened or decompressed, or if the reader reports an
/// error while scanning.
fn benchmark_scan_gff3(c: &mut Criterion) {
    use std::sync::atomic::Ordering::Relaxed;

    let mut group = c.benchmark_group("scan_gff3");
    group.significance_level(0.1).sample_size(10);
    group.bench_function("scan_gff3_cow", |b| {
        // Setup (not timed): load the whole decompressed file into memory once.
        let mut data = Vec::new();
        let file = std::fs::File::open(TEST_FILE)
            .unwrap_or_else(|e| panic!("cannot open benchmark input {TEST_FILE}: {e}"));
        GzDecoder::new(file).read_to_end(&mut data).unwrap();

        b.iter(|| {
            // A fresh reader per sample so every iteration scans from the first byte.
            let mut reader =
                Gff3SeekableStreamReader::new_with_size(Cursor::new(&data), data.len());
            // Atomic counters are kept (rather than plain integers), presumably so the
            // per-line work matches `benchmark_scan_gff3_tee` - confirm before changing.
            let (dir_cnt, cmt_cnt, attr_cnt) =
                (AtomicU64::new(0), AtomicU64::new(0), AtomicU64::new(0));
            while let Some(line) = reader.read_line().unwrap() {
                match line {
                    Gff3Line::Comment(_) => cmt_cnt.fetch_add(1, Relaxed),
                    Gff3Line::Directive(_) => dir_cnt.fetch_add(1, Relaxed),
                    Gff3Line::Feature { attributes, .. } => {
                        attr_cnt.fetch_add(attributes.len() as u64, Relaxed)
                    }
                };
            }
            // Returning the counters hands them to Criterion so the work is not optimized out.
            (dir_cnt, cmt_cnt, attr_cnt)
        })
    });
    // Criterion recommends finishing a group explicitly rather than relying on drop.
    group.finish();
}
/// Benchmarks scanning the GFF3 test file through a [`Gff3BroadcastRead`] with two
/// registered channels, driven from inside a `rayon::scope`.
///
/// Both channels run the same tally as `benchmark_scan_gff3` (comments, directives,
/// feature attribute counts) into one shared set of atomic counters.
///
/// The archive is opened and decompressed once, before the timed closure, so each sample
/// measures only the broadcast scan and not file I/O plus gzip inflation. This is the same
/// setup/measurement split that `benchmark_index_gff3` uses.
///
/// # Panics
///
/// Panics if [`TEST_FILE`] cannot be opened or decompressed, or if `tee.run()` returns an
/// error.
fn benchmark_scan_gff3_tee(c: &mut Criterion) {
    use std::sync::atomic::Ordering::Relaxed;

    let mut group = c.benchmark_group("scan_gff3_tee");
    group.significance_level(0.1).sample_size(10);
    group.bench_function("scan_gff3_tee", |b| {
        // Setup (not timed): load the whole decompressed file into memory once.
        let mut data = Vec::new();
        let file = std::fs::File::open(TEST_FILE)
            .unwrap_or_else(|e| panic!("cannot open benchmark input {TEST_FILE}: {e}"));
        GzDecoder::new(file).read_to_end(&mut data).unwrap();

        b.iter(|| {
            // A fresh reader per sample so every iteration scans from the first byte.
            let reader = Gff3SeekableStreamReader::new_with_size(Cursor::new(&data), data.len());
            // Shared by both channels, hence atomics.
            let (dir_cnt, cmt_cnt, attr_cnt) =
                (AtomicU64::new(0), AtomicU64::new(0), AtomicU64::new(0));
            rayon::scope(|s| {
                // NOTE(review): 1000 is presumably the chunk size in lines - confirm
                // against `Gff3BroadcastRead::new`.
                let mut tee = Gff3BroadcastRead::new(reader, 1000);
                for _ in 0..2 {
                    tee.add_channel(Box::new(|_, chunk| {
                        for (_, line) in chunk {
                            match line {
                                Gff3Line::Comment(_) => cmt_cnt.fetch_add(1, Relaxed),
                                Gff3Line::Directive(_) => dir_cnt.fetch_add(1, Relaxed),
                                Gff3Line::Feature { attributes, .. } => {
                                    attr_cnt.fetch_add(attributes.len() as u64, Relaxed)
                                }
                            };
                        }
                    }));
                }
                s.spawn(|_| {
                    tee.run().unwrap();
                });
            });
            // `rayon::scope` only returns once the spawned task is done, so the counters
            // are final here.
            (dir_cnt, cmt_cnt, attr_cnt)
        })
    });
    // Criterion recommends finishing a group explicitly rather than relying on drop.
    group.finish();
}
/// Benchmarks building a full `Gff3Index` over the GFF3 test file.
///
/// The archive is decompressed into memory once, before timing starts. Each timed
/// iteration then calls `Gff3Index::build` with a factory that creates a new in-memory
/// reader, the default range interval, and attribute tries for seven attribute names.
fn benchmark_index_gff3(c: &mut Criterion) {
    let mut group = c.benchmark_group("index_gff3");
    group.significance_level(0.1).sample_size(10);
    group.bench_function("index_gff3", |bencher| {
        // Untimed setup: inflate the gzip archive into a byte buffer.
        let gz_file = std::fs::File::open(TEST_FILE).unwrap();
        let mut gff3_bytes = Vec::new();
        GzDecoder::new(gz_file)
            .read_to_end(&mut gff3_bytes)
            .unwrap();

        // The timed closure takes ownership of the buffer and re-reads it on every run.
        bencher.iter(move || {
            gfidx::index::Gff3Index::build(
                // Reader factory: hands out a new cursor over the same bytes on each call.
                &|| {
                    Gff3SeekableStreamReader::new_with_size(
                        Cursor::new(&gff3_bytes),
                        gff3_bytes.len(),
                    )
                },
                DEFAULT_RANGE_INTERVAL,
                // Attributes that get a trie index.
                &attr_trie_def![
                    "ID",
                    "gene_id",
                    "gene_name",
                    "gene_type",
                    "transcript_id",
                    "transcript_name",
                    "exon_id"
                ],
            )
            .unwrap()
        })
    });
}
// Bundles the three benchmark functions defined in this file into a Criterion group
// named `benches`.
criterion_group!(
benches,
benchmark_scan_gff3,
benchmark_scan_gff3_tee,
benchmark_index_gff3
);
// Generates the `main` function that runs the `benches` group; this is why the bench
// target is declared with `harness = false` in Cargo.toml.
criterion_main!(benches);

1
presentation/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
index.html

View file

@ -0,0 +1,302 @@
<?xml version="1.0" encoding="utf-8"?>
<style xmlns="http://purl.org/net/xbiblio/csl" class="in-text" version="1.0" demote-non-dropping-particle="sort-only" page-range-format="expanded" default-locale="en-US">
<info>
<title>American Chemical Society</title>
<title-short>ACS</title-short>
<id>http://www.zotero.org/styles/american-chemical-society</id>
<link href="http://www.zotero.org/styles/american-chemical-society" rel="self"/>
<link href="https://pubs.acs.org/doi/full/10.1021/acsguide.40303" rel="documentation"/>
<link href="https://pubs.acs.org/doi/book/10.1021/acsguide" rel="documentation"/>
<author>
<name>Julian Onions</name>
<email>julian.onions@gmail.com</email>
</author>
<contributor>
<name>Ivan Bushmarinov</name>
<email>ib@ineos.ac.ru</email>
</contributor>
<contributor>
<name>Sebastian Karcher</name>
</contributor>
<contributor>
<name>Patrick O'Brien</name>
</contributor>
<category citation-format="numeric"/>
<category field="chemistry"/>
<summary>The American Chemical Society style</summary>
<updated>2022-09-19T18:32:56+00:00</updated>
<rights license="http://creativecommons.org/licenses/by-sa/3.0/">This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 License</rights>
</info>
<locale xml:lang="en">
<terms>
<term name="editortranslator" form="short">
<single>ed. and translator</single>
<multiple>eds. and translators</multiple>
</term>
<term name="translator" form="short">
<single>translator</single>
<multiple>translators</multiple>
</term>
<term name="collection-editor" form="short">
<single>series ed.</single>
<multiple>series eds.</multiple>
</term>
</terms>
</locale>
<macro name="editor">
<group delimiter="; ">
<names variable="editor translator" delimiter="; ">
<name sort-separator=", " initialize-with=". " name-as-sort-order="all" delimiter=", " delimiter-precedes-last="always"/>
<label form="short" prefix=", " text-case="title"/>
</names>
<names variable="collection-editor">
<name sort-separator=", " initialize-with=". " name-as-sort-order="all" delimiter=", " delimiter-precedes-last="always"/>
<label form="short" prefix=", " text-case="title"/>
</names>
</group>
</macro>
<macro name="author">
<names variable="author" suffix=".">
<name sort-separator=", " initialize-with=". " name-as-sort-order="all" delimiter="; " delimiter-precedes-last="always"/>
<label form="short" prefix=", " text-case="capitalize-first"/>
</names>
</macro>
<macro name="publisher">
<choose>
<if type="thesis" match="any">
<group delimiter=", ">
<text variable="publisher"/>
<text variable="publisher-place"/>
</group>
</if>
<else>
<group delimiter=": ">
<text variable="publisher"/>
<text variable="publisher-place"/>
</group>
</else>
</choose>
</macro>
<macro name="title">
<choose>
<if type="bill book graphic legal_case legislation motion_picture report song" match="any">
<text variable="title" text-case="title" font-style="italic"/>
</if>
<else>
<text variable="title" text-case="title"/>
</else>
</choose>
</macro>
<macro name="volume">
<group delimiter=" ">
<text term="volume" form="short" text-case="capitalize-first"/>
<text variable="volume"/>
</group>
</macro>
<macro name="series">
<text variable="collection-title"/>
</macro>
<macro name="pages">
<label variable="page" form="short" suffix=" " strip-periods="true"/>
<text variable="page"/>
</macro>
<macro name="book-container">
<group delimiter=". ">
<text macro="title"/>
<choose>
<if type="entry-dictionary entry-encyclopedia" match="none">
<group delimiter=" ">
<text term="in" text-case="capitalize-first"/>
<text variable="container-title" font-style="italic"/>
</group>
</if>
<else>
<text variable="container-title" font-style="italic"/>
</else>
</choose>
</group>
</macro>
<macro name="issued">
<date variable="issued" delimiter=" ">
<date-part name="year"/>
</date>
</macro>
<macro name="full-issued">
<date variable="issued" delimiter=" ">
<date-part name="month" form="long" suffix=" "/>
<date-part name="day" suffix=", "/>
<date-part name="year"/>
</date>
</macro>
<macro name="edition">
<choose>
<if is-numeric="edition">
<group delimiter=" ">
<number variable="edition" form="ordinal"/>
<text term="edition" form="short"/>
</group>
</if>
<else>
<text variable="edition" suffix="."/>
</else>
</choose>
</macro>
<macro name="access">
<choose>
<if variable="DOI" match="any">
<text variable="DOI" prefix="https://doi.org/"/>
</if>
<else-if type="article-journal book chapter entry-encyclopedia entry-dictionary paper-conference" match="none">
<choose>
<if variable="URL">
<group delimiter=" ">
<text variable="URL"/>
<group delimiter=" " prefix="(" suffix=")">
<text term="accessed"/>
<date variable="accessed">
<date-part name="year"/>
<date-part name="month" prefix="-" form="numeric-leading-zeros"/>
<date-part name="day" prefix="-" form="numeric-leading-zeros"/>
</date>
</group>
</group>
</if>
</choose>
</else-if>
</choose>
</macro>
<citation collapse="citation-number">
<sort>
<key variable="citation-number"/>
</sort>
<layout delimiter="," vertical-align="sup">
<text variable="citation-number"/>
</layout>
</citation>
<bibliography second-field-align="flush" entry-spacing="0">
<layout suffix=".">
<text variable="citation-number" prefix="(" suffix=")"/>
<text macro="author" suffix=" "/>
<choose>
<if type="article-journal review" match="any">
<group delimiter=" ">
<text macro="title" suffix="."/>
<text variable="container-title" font-style="italic" form="short"/>
<group delimiter=", ">
<text macro="issued" font-weight="bold"/>
<choose>
<if variable="volume">
<group delimiter=" ">
<text variable="volume" font-style="italic"/>
<text variable="issue" prefix="(" suffix=")"/>
</group>
</if>
<else>
<group delimiter=" ">
<text term="issue" form="short" text-case="capitalize-first"/>
<text variable="issue"/>
</group>
</else>
</choose>
<text variable="page"/>
</group>
</group>
</if>
<else-if type="article-magazine article-newspaper article" match="any">
<group delimiter=" ">
<text macro="title" suffix="."/>
<text variable="container-title" font-style="italic" suffix="."/>
<text macro="edition"/>
<text macro="publisher"/>
<group delimiter=", ">
<text macro="full-issued"/>
<text macro="pages"/>
</group>
</group>
</else-if>
<else-if type="thesis">
<group delimiter=", ">
<group delimiter=". ">
<text macro="title"/>
<text variable="genre"/>
</group>
<text macro="publisher"/>
<text macro="issued"/>
<text macro="volume"/>
<text macro="pages"/>
</group>
</else-if>
<else-if type="bill book graphic legal_case legislation motion_picture report song" match="any">
<group delimiter="; ">
<group delimiter=", ">
<text macro="title"/>
<text macro="edition"/>
</group>
<text macro="editor" prefix=" "/>
<text macro="series"/>
<choose>
<if type="report">
<group delimiter=" ">
<text variable="genre"/>
<text variable="number"/>
</group>
</if>
</choose>
<group delimiter=", ">
<text macro="publisher"/>
<text macro="issued"/>
</group>
<group delimiter=", ">
<text macro="volume"/>
<text macro="pages"/>
</group>
</group>
</else-if>
<else-if type="patent">
<group delimiter=", ">
<group delimiter=". ">
<text macro="title"/>
<text variable="number"/>
</group>
<date variable="issued" form="text"/>
</group>
</else-if>
<else-if type="chapter paper-conference entry-dictionary entry-encyclopedia" match="any">
<group delimiter="; ">
<text macro="book-container"/>
<text macro="editor"/>
<text macro="series"/>
<group delimiter=", ">
<text macro="publisher"/>
<text macro="issued"/>
</group>
<group delimiter=", ">
<text macro="volume"/>
<text macro="pages"/>
</group>
</group>
</else-if>
<else-if type="webpage post post-weblog" match="any">
<group delimiter=". ">
<text variable="title" font-style="italic"/>
<text variable="container-title"/>
</group>
</else-if>
<else>
<group delimiter=", ">
<group delimiter=". ">
<text macro="title"/>
<text variable="container-title" font-style="italic"/>
</group>
<group delimiter=", ">
<text macro="issued"/>
<text variable="volume" font-style="italic"/>
<text variable="page"/>
</group>
</group>
</else>
</choose>
<text macro="access" prefix=". "/>
</layout>
</bibliography>
</style>

View file

@ -0,0 +1,31 @@
// Overview diagram: a GFF3 file and the parts of the GFidx index file drawn from it.
// Black edges run from the GFF3 file to every index; blue edges run from the range
// index to each of the other indices.
// NOTE(review): the meaning of the blue edges is not stated here - presumably
// "range index refers into the other indices"; confirm before relying on it.
digraph GFidx {
compound=true;
GFF3 [label="GFF3 file", shape=box];
// Outer cluster: everything stored in the index file; its label is placed at the bottom.
subgraph cluster_gfidx {
label="General Feature index (GFidx) file";
labelloc=b;
"Range index"; "Relation tree";
"Relation tree" [label="Relation tree"];
"Range index" -> "Relation tree" [color=blue];
// Inner cluster: one light-blue node per attribute index; "..." stands for further ones.
subgraph cluster_attr_tries {
node [fillcolor="lightblue", style=filled];
label="Attribute indices";
"ID index"; "Gene name index"; "...";
}
"Range index" -> "ID index" [color=blue];
"Range index" -> "Gene name index" [color=blue];
"Range index" -> "..." [color=blue];
}
// Edges from the source file to every index.
GFF3 -> "ID index";
GFF3 -> "Gene name index";
GFF3 -> "...";
GFF3 -> "Range index";
GFF3 -> "Relation tree";
}

View file

@ -0,0 +1,113 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Generated by graphviz version 12.0.0 (0)
-->
<!-- Title: GFidx Pages: 1 -->
<svg width="513pt" height="261pt"
viewBox="0.00 0.00 513.00 261.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 257)">
<title>GFidx</title>
<polygon fill="white" stroke="none" points="-4,4 -4,-257 509,-257 509,4 -4,4"/>
<g id="clust1" class="cluster">
<title>cluster_gfidx</title>
<polygon fill="none" stroke="black" points="8,-8 8,-189 497,-189 497,-8 8,-8"/>
<text text-anchor="middle" x="252.5" y="-15.2" font-family="Times,serif" font-size="14.00">General Feature index (GFidx) file</text>
</g>
<g id="clust2" class="cluster">
<title>cluster_attr_tries</title>
<polygon fill="none" stroke="black" points="143,-40.5 143,-117 489,-117 489,-40.5 143,-40.5"/>
<text text-anchor="middle" x="316" y="-47.7" font-family="Times,serif" font-size="14.00">Attribute indices</text>
</g>
<!-- GFF3 -->
<g id="node1" class="node">
<title>GFF3</title>
<polygon fill="none" stroke="black" points="340.62,-253 271.38,-253 271.38,-217 340.62,-217 340.62,-253"/>
<text text-anchor="middle" x="306" y="-229.95" font-family="Times,serif" font-size="14.00">GFF3 file</text>
</g>
<!-- Range index -->
<g id="node2" class="node">
<title>Range index</title>
<ellipse fill="none" stroke="black" cx="183" cy="-163" rx="57.49" ry="18"/>
<text text-anchor="middle" x="183" y="-157.95" font-family="Times,serif" font-size="14.00">Range index</text>
</g>
<!-- GFF3&#45;&gt;Range index -->
<g id="edge8" class="edge">
<title>GFF3&#45;&gt;Range index</title>
<path fill="none" stroke="black" d="M275.28,-216.52C258.51,-206.97 237.62,-195.08 219.96,-185.03"/>
<polygon fill="black" stroke="black" points="221.72,-182.01 211.3,-180.11 218.26,-188.1 221.72,-182.01"/>
</g>
<!-- Relation tree -->
<g id="node3" class="node">
<title>Relation tree</title>
<ellipse fill="none" stroke="black" cx="75" cy="-91" rx="58.52" ry="18"/>
<text text-anchor="middle" x="75" y="-85.95" font-family="Times,serif" font-size="14.00">Relation tree</text>
</g>
<!-- GFF3&#45;&gt;Relation tree -->
<g id="edge9" class="edge">
<title>GFF3&#45;&gt;Relation tree</title>
<path fill="none" stroke="black" d="M299.59,-216.8C291.13,-196.51 274.29,-163.26 249,-145 208.09,-115.47 187.03,-132.43 139,-117 132.44,-114.89 125.58,-112.46 118.9,-109.97"/>
<polygon fill="black" stroke="black" points="120.6,-106.87 110.01,-106.56 118.1,-113.41 120.6,-106.87"/>
</g>
<!-- ID index -->
<g id="node4" class="node">
<title>ID index</title>
<ellipse fill="lightblue" stroke="black" cx="195" cy="-91" rx="43.67" ry="18"/>
<text text-anchor="middle" x="195" y="-85.95" font-family="Times,serif" font-size="14.00">ID index</text>
</g>
<!-- GFF3&#45;&gt;ID index -->
<g id="edge5" class="edge">
<title>GFF3&#45;&gt;ID index</title>
<path fill="none" stroke="black" d="M307.57,-216.61C308.4,-197.35 306.95,-166.23 292,-145 288.04,-139.38 258.14,-123.45 232.57,-110.51"/>
<polygon fill="black" stroke="black" points="234.18,-107.41 223.67,-106.05 231.04,-113.66 234.18,-107.41"/>
</g>
<!-- Gene name index -->
<g id="node5" class="node">
<title>Gene name index</title>
<ellipse fill="lightblue" stroke="black" cx="333" cy="-91" rx="76.43" ry="18"/>
<text text-anchor="middle" x="333" y="-85.95" font-family="Times,serif" font-size="14.00">Gene name index</text>
</g>
<!-- GFF3&#45;&gt;Gene name index -->
<g id="edge6" class="edge">
<title>GFF3&#45;&gt;Gene name index</title>
<path fill="none" stroke="black" d="M312.14,-216.66C314.89,-208.38 317.95,-198.27 320,-189 325.01,-166.3 328.38,-140.16 330.43,-120.74"/>
<polygon fill="black" stroke="black" points="333.91,-121.13 331.42,-110.83 326.94,-120.44 333.91,-121.13"/>
</g>
<!-- ... -->
<g id="node6" class="node">
<title>...</title>
<ellipse fill="lightblue" stroke="black" cx="454" cy="-91" rx="27" ry="18"/>
<text text-anchor="middle" x="454" y="-85.95" font-family="Times,serif" font-size="14.00">...</text>
</g>
<!-- GFF3&#45;&gt;... -->
<g id="edge7" class="edge">
<title>GFF3&#45;&gt;...</title>
<path fill="none" stroke="black" d="M324.15,-216.59C351.12,-190.7 401.78,-142.1 431.21,-113.86"/>
<polygon fill="black" stroke="black" points="433.38,-116.63 438.18,-107.18 428.54,-111.58 433.38,-116.63"/>
</g>
<!-- Range index&#45;&gt;Relation tree -->
<g id="edge1" class="edge">
<title>Range index&#45;&gt;Relation tree</title>
<path fill="none" stroke="blue" d="M158.76,-146.29C144.03,-136.74 125.01,-124.41 108.81,-113.91"/>
<polygon fill="blue" stroke="blue" points="110.72,-110.98 100.43,-108.48 106.91,-116.86 110.72,-110.98"/>
</g>
<!-- Range index&#45;&gt;ID index -->
<g id="edge2" class="edge">
<title>Range index&#45;&gt;ID index</title>
<path fill="none" stroke="blue" d="M185.97,-144.7C187.23,-137.32 188.74,-128.52 190.16,-120.25"/>
<polygon fill="blue" stroke="blue" points="193.57,-121.04 191.81,-110.6 186.67,-119.86 193.57,-121.04"/>
</g>
<!-- Range index&#45;&gt;Gene name index -->
<g id="edge3" class="edge">
<title>Range index&#45;&gt;Gene name index</title>
<path fill="none" stroke="blue" d="M214.08,-147.5C235.83,-137.34 265.2,-123.64 289.3,-112.39"/>
<polygon fill="blue" stroke="blue" points="290.66,-115.62 298.24,-108.22 287.7,-109.28 290.66,-115.62"/>
</g>
<!-- Range index&#45;&gt;... -->
<g id="edge4" class="edge">
<title>Range index&#45;&gt;...</title>
<path fill="none" stroke="blue" d="M238.54,-157.92C287.39,-152.83 359.61,-141.56 418,-117 421.44,-115.55 424.88,-113.75 428.19,-111.77"/>
<polygon fill="blue" stroke="blue" points="429.98,-114.79 436.38,-106.35 426.11,-108.95 429.98,-114.79"/>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 5.6 KiB

View file

@ -0,0 +1,64 @@
// Illustration of two attribute tries, laid out left to right. Each cluster is one trie:
// a boxed "Start" node, one node per character, and a light-blue node for the feature
// reached at the end of a key.
graph attr_trie {
rankdir=LR;
// ID trie: two keys share the prefix E-N-S and then branch on T / G.
subgraph cluster_id_trie {
label="ID index";
id_root [label="Start", shape=box];
id_0_e [label="E"];
id_1_n [label="N"];
id_2_s [label="S"];
id_3_t [label="T"];
id_3_g [label="G"];
// "..." nodes stand in for a run of characters that is not drawn.
id_4_collapse [label="..."];
id_4_collapse2 [label="..."];
id_root -- id_0_e -- id_1_n -- id_2_s -- id_3_t;
id_2_s -- id_3_g;
id_3_t -- id_4_collapse;
id_3_g -- id_4_collapse2;
id_5_2 [label="2"];
id_6_8 [label="8"];
id_5_2_2 [label="2"];
id_6_5 [label="5"];
id_4_collapse -- id_5_2 -- id_6_8;
id_4_collapse2 -- id_5_2_2 -- id_6_5;
"Feature #1" [label="Feature #1", color=lightblue, style=filled];
"Feature #3" [label="Feature #3", color=lightblue, style=filled];
// The T branch (ending in 2-8) leads to feature #3, the G branch (ending in 2-5) to feature #1.
id_6_8 -- "Feature #3";
id_6_5 -- "Feature #1";
}
// Gene-name trie: two keys with no common prefix, D-D-X-1-1-L-2 and O-R-4-F-5.
subgraph cluster_gene_trie {
label="Gene name index";
gene_root [label="Start", shape=box];
gene_0_D [label="D"];
gene_1_D [label="D"];
gene_2_X [label="X"];
gene_3_1 [label="1"];
gene_4_1 [label="1"];
gene_5_L [label="L"];
gene_6_2 [label="2"];
gene_0_O [label="O"];
gene_1_R [label="R"];
gene_2_4 [label="4"];
gene_3_F [label="F"];
gene_4_5 [label="5"];
// Node names carry a "_gene" suffix so they stay distinct from the ID trie's feature
// nodes; the displayed labels are the plain feature numbers.
"Feature #91_gene" [label="Feature #91", color=lightblue, style=filled];
"Feature #1_gene" [label="Feature #1", color=lightblue, style=filled];
gene_root -- gene_0_D -- gene_1_D -- gene_2_X -- gene_3_1 -- gene_4_1 -- gene_5_L -- gene_6_2 -- "Feature #1_gene";
gene_root -- gene_0_O -- gene_1_R -- gene_2_4 -- gene_3_F -- gene_4_5 -- "Feature #91_gene";
}
}

View file

@ -0,0 +1,332 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Generated by graphviz version 12.0.0 (0)
-->
<!-- Title: attr_trie Pages: 1 -->
<svg width="917pt" height="294pt"
viewBox="0.00 0.00 916.52 294.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 290)">
<title>attr_trie</title>
<polygon fill="white" stroke="none" points="-4,4 -4,-290 912.52,-290 912.52,4 -4,4"/>
<g id="clust1" class="cluster">
<title>cluster_id_trie</title>
<polygon fill="none" stroke="black" points="8,-8 8,-139 900.52,-139 900.52,-8 8,-8"/>
<text text-anchor="middle" x="454.26" y="-121.7" font-family="Times,serif" font-size="14.00">ID index</text>
</g>
<g id="clust2" class="cluster">
<title>cluster_gene_trie</title>
<polygon fill="none" stroke="black" points="8,-147 8,-278 900.52,-278 900.52,-147 8,-147"/>
<text text-anchor="middle" x="454.26" y="-260.7" font-family="Times,serif" font-size="14.00">Gene name index</text>
</g>
<!-- id_root -->
<g id="node1" class="node">
<title>id_root</title>
<polygon fill="none" stroke="black" points="70,-79 16,-79 16,-43 70,-43 70,-79"/>
<text text-anchor="middle" x="43" y="-55.95" font-family="Times,serif" font-size="14.00">Start</text>
</g>
<!-- id_0_e -->
<g id="node2" class="node">
<title>id_0_e</title>
<ellipse fill="none" stroke="black" cx="133" cy="-61" rx="27" ry="18"/>
<text text-anchor="middle" x="133" y="-55.95" font-family="Times,serif" font-size="14.00">E</text>
</g>
<!-- id_root&#45;&#45;id_0_e -->
<g id="edge1" class="edge">
<title>id_root&#45;&#45;id_0_e</title>
<path fill="none" stroke="black" d="M70.4,-61C81.54,-61 94.48,-61 105.62,-61"/>
</g>
<!-- id_1_n -->
<g id="node3" class="node">
<title>id_1_n</title>
<ellipse fill="none" stroke="black" cx="223" cy="-61" rx="27" ry="18"/>
<text text-anchor="middle" x="223" y="-55.95" font-family="Times,serif" font-size="14.00">N</text>
</g>
<!-- id_0_e&#45;&#45;id_1_n -->
<g id="edge2" class="edge">
<title>id_0_e&#45;&#45;id_1_n</title>
<path fill="none" stroke="black" d="M160.4,-61C171.54,-61 184.48,-61 195.62,-61"/>
</g>
<!-- id_2_s -->
<g id="node4" class="node">
<title>id_2_s</title>
<ellipse fill="none" stroke="black" cx="313" cy="-61" rx="27" ry="18"/>
<text text-anchor="middle" x="313" y="-55.95" font-family="Times,serif" font-size="14.00">S</text>
</g>
<!-- id_1_n&#45;&#45;id_2_s -->
<g id="edge3" class="edge">
<title>id_1_n&#45;&#45;id_2_s</title>
<path fill="none" stroke="black" d="M250.4,-61C261.54,-61 274.48,-61 285.62,-61"/>
</g>
<!-- id_3_t -->
<g id="node5" class="node">
<title>id_3_t</title>
<ellipse fill="none" stroke="black" cx="403" cy="-34" rx="27" ry="18"/>
<text text-anchor="middle" x="403" y="-28.95" font-family="Times,serif" font-size="14.00">T</text>
</g>
<!-- id_2_s&#45;&#45;id_3_t -->
<g id="edge4" class="edge">
<title>id_2_s&#45;&#45;id_3_t</title>
<path fill="none" stroke="black" d="M338.05,-53.62C350.46,-49.81 365.57,-45.18 377.98,-41.37"/>
</g>
<!-- id_3_g -->
<g id="node6" class="node">
<title>id_3_g</title>
<ellipse fill="none" stroke="black" cx="403" cy="-88" rx="27" ry="18"/>
<text text-anchor="middle" x="403" y="-82.95" font-family="Times,serif" font-size="14.00">G</text>
</g>
<!-- id_2_s&#45;&#45;id_3_g -->
<g id="edge5" class="edge">
<title>id_2_s&#45;&#45;id_3_g</title>
<path fill="none" stroke="black" d="M338.05,-68.38C350.46,-72.19 365.57,-76.82 377.98,-80.63"/>
</g>
<!-- id_4_collapse -->
<g id="node7" class="node">
<title>id_4_collapse</title>
<ellipse fill="none" stroke="black" cx="493" cy="-34" rx="27" ry="18"/>
<text text-anchor="middle" x="493" y="-28.95" font-family="Times,serif" font-size="14.00">...</text>
</g>
<!-- id_3_t&#45;&#45;id_4_collapse -->
<g id="edge6" class="edge">
<title>id_3_t&#45;&#45;id_4_collapse</title>
<path fill="none" stroke="black" d="M430.4,-34C441.54,-34 454.48,-34 465.62,-34"/>
</g>
<!-- id_4_collapse2 -->
<g id="node8" class="node">
<title>id_4_collapse2</title>
<ellipse fill="none" stroke="black" cx="493" cy="-88" rx="27" ry="18"/>
<text text-anchor="middle" x="493" y="-82.95" font-family="Times,serif" font-size="14.00">...</text>
</g>
<!-- id_3_g&#45;&#45;id_4_collapse2 -->
<g id="edge7" class="edge">
<title>id_3_g&#45;&#45;id_4_collapse2</title>
<path fill="none" stroke="black" d="M430.4,-88C441.54,-88 454.48,-88 465.62,-88"/>
</g>
<!-- id_5_2 -->
<g id="node9" class="node">
<title>id_5_2</title>
<ellipse fill="none" stroke="black" cx="610.93" cy="-34" rx="27" ry="18"/>
<text text-anchor="middle" x="610.93" y="-28.95" font-family="Times,serif" font-size="14.00">2</text>
</g>
<!-- id_4_collapse&#45;&#45;id_5_2 -->
<g id="edge8" class="edge">
<title>id_4_collapse&#45;&#45;id_5_2</title>
<path fill="none" stroke="black" d="M520.31,-34C539.28,-34 564.78,-34 583.73,-34"/>
</g>
<!-- id_5_2_2 -->
<g id="node11" class="node">
<title>id_5_2_2</title>
<ellipse fill="none" stroke="black" cx="610.93" cy="-88" rx="27" ry="18"/>
<text text-anchor="middle" x="610.93" y="-82.95" font-family="Times,serif" font-size="14.00">2</text>
</g>
<!-- id_4_collapse2&#45;&#45;id_5_2_2 -->
<g id="edge10" class="edge">
<title>id_4_collapse2&#45;&#45;id_5_2_2</title>
<path fill="none" stroke="black" d="M520.31,-88C539.28,-88 564.78,-88 583.73,-88"/>
</g>
<!-- id_6_8 -->
<g id="node10" class="node">
<title>id_6_8</title>
<ellipse fill="none" stroke="black" cx="728.87" cy="-34" rx="27" ry="18"/>
<text text-anchor="middle" x="728.87" y="-28.95" font-family="Times,serif" font-size="14.00">8</text>
</g>
<!-- id_5_2&#45;&#45;id_6_8 -->
<g id="edge9" class="edge">
<title>id_5_2&#45;&#45;id_6_8</title>
<path fill="none" stroke="black" d="M638.24,-34C657.21,-34 682.72,-34 701.67,-34"/>
</g>
<!-- Feature #3 -->
<g id="node14" class="node">
<title>Feature #3</title>
<ellipse fill="lightblue" stroke="lightblue" cx="842.2" cy="-34" rx="50.33" ry="18"/>
<text text-anchor="middle" x="842.2" y="-28.95" font-family="Times,serif" font-size="14.00">Feature #3</text>
</g>
<!-- id_6_8&#45;&#45;Feature #3 -->
<g id="edge12" class="edge">
<title>id_6_8&#45;&#45;Feature #3</title>
<path fill="none" stroke="black" d="M756.25,-34C766.8,-34 779.3,-34 791.43,-34"/>
</g>
<!-- id_6_5 -->
<g id="node12" class="node">
<title>id_6_5</title>
<ellipse fill="none" stroke="black" cx="728.87" cy="-88" rx="27" ry="18"/>
<text text-anchor="middle" x="728.87" y="-82.95" font-family="Times,serif" font-size="14.00">5</text>
</g>
<!-- id_5_2_2&#45;&#45;id_6_5 -->
<g id="edge11" class="edge">
<title>id_5_2_2&#45;&#45;id_6_5</title>
<path fill="none" stroke="black" d="M638.24,-88C657.21,-88 682.72,-88 701.67,-88"/>
</g>
<!-- Feature #1 -->
<g id="node13" class="node">
<title>Feature #1</title>
<ellipse fill="lightblue" stroke="lightblue" cx="842.2" cy="-88" rx="50.33" ry="18"/>
<text text-anchor="middle" x="842.2" y="-82.95" font-family="Times,serif" font-size="14.00">Feature #1</text>
</g>
<!-- id_6_5&#45;&#45;Feature #1 -->
<g id="edge13" class="edge">
<title>id_6_5&#45;&#45;Feature #1</title>
<path fill="none" stroke="black" d="M756.25,-88C766.8,-88 779.3,-88 791.43,-88"/>
</g>
<!-- gene_root -->
<g id="node15" class="node">
<title>gene_root</title>
<polygon fill="none" stroke="black" points="70,-218 16,-218 16,-182 70,-182 70,-218"/>
<text text-anchor="middle" x="43" y="-194.95" font-family="Times,serif" font-size="14.00">Start</text>
</g>
<!-- gene_0_D -->
<g id="node16" class="node">
<title>gene_0_D</title>
<ellipse fill="none" stroke="black" cx="133" cy="-173" rx="27" ry="18"/>
<text text-anchor="middle" x="133" y="-167.95" font-family="Times,serif" font-size="14.00">D</text>
</g>
<!-- gene_root&#45;&#45;gene_0_D -->
<g id="edge14" class="edge">
<title>gene_root&#45;&#45;gene_0_D</title>
<path fill="none" stroke="black" d="M70.4,-191.9C82.35,-188.23 96.37,-183.93 108.02,-180.36"/>
</g>
<!-- gene_0_O -->
<g id="node23" class="node">
<title>gene_0_O</title>
<ellipse fill="none" stroke="black" cx="133" cy="-227" rx="27" ry="18"/>
<text text-anchor="middle" x="133" y="-221.95" font-family="Times,serif" font-size="14.00">O</text>
</g>
<!-- gene_root&#45;&#45;gene_0_O -->
<g id="edge22" class="edge">
<title>gene_root&#45;&#45;gene_0_O</title>
<path fill="none" stroke="black" d="M70.4,-208.1C82.35,-211.77 96.37,-216.07 108.02,-219.64"/>
</g>
<!-- gene_1_D -->
<g id="node17" class="node">
<title>gene_1_D</title>
<ellipse fill="none" stroke="black" cx="223" cy="-173" rx="27" ry="18"/>
<text text-anchor="middle" x="223" y="-167.95" font-family="Times,serif" font-size="14.00">D</text>
</g>
<!-- gene_0_D&#45;&#45;gene_1_D -->
<g id="edge15" class="edge">
<title>gene_0_D&#45;&#45;gene_1_D</title>
<path fill="none" stroke="black" d="M160.4,-173C171.54,-173 184.48,-173 195.62,-173"/>
</g>
<!-- gene_2_X -->
<g id="node18" class="node">
<title>gene_2_X</title>
<ellipse fill="none" stroke="black" cx="313" cy="-173" rx="27" ry="18"/>
<text text-anchor="middle" x="313" y="-167.95" font-family="Times,serif" font-size="14.00">X</text>
</g>
<!-- gene_1_D&#45;&#45;gene_2_X -->
<g id="edge16" class="edge">
<title>gene_1_D&#45;&#45;gene_2_X</title>
<path fill="none" stroke="black" d="M250.4,-173C261.54,-173 274.48,-173 285.62,-173"/>
</g>
<!-- gene_3_1 -->
<g id="node19" class="node">
<title>gene_3_1</title>
<ellipse fill="none" stroke="black" cx="403" cy="-173" rx="27" ry="18"/>
<text text-anchor="middle" x="403" y="-167.95" font-family="Times,serif" font-size="14.00">1</text>
</g>
<!-- gene_2_X&#45;&#45;gene_3_1 -->
<g id="edge17" class="edge">
<title>gene_2_X&#45;&#45;gene_3_1</title>
<path fill="none" stroke="black" d="M340.4,-173C351.54,-173 364.48,-173 375.62,-173"/>
</g>
<!-- gene_4_1 -->
<g id="node20" class="node">
<title>gene_4_1</title>
<ellipse fill="none" stroke="black" cx="493" cy="-173" rx="27" ry="18"/>
<text text-anchor="middle" x="493" y="-167.95" font-family="Times,serif" font-size="14.00">1</text>
</g>
<!-- gene_3_1&#45;&#45;gene_4_1 -->
<g id="edge18" class="edge">
<title>gene_3_1&#45;&#45;gene_4_1</title>
<path fill="none" stroke="black" d="M430.4,-173C441.54,-173 454.48,-173 465.62,-173"/>
</g>
<!-- gene_5_L -->
<g id="node21" class="node">
<title>gene_5_L</title>
<ellipse fill="none" stroke="black" cx="610.93" cy="-173" rx="27" ry="18"/>
<text text-anchor="middle" x="610.93" y="-167.95" font-family="Times,serif" font-size="14.00">L</text>
</g>
<!-- gene_4_1&#45;&#45;gene_5_L -->
<g id="edge19" class="edge">
<title>gene_4_1&#45;&#45;gene_5_L</title>
<path fill="none" stroke="black" d="M520.31,-173C539.28,-173 564.78,-173 583.73,-173"/>
</g>
<!-- gene_6_2 -->
<g id="node22" class="node">
<title>gene_6_2</title>
<ellipse fill="none" stroke="black" cx="728.87" cy="-173" rx="27" ry="18"/>
<text text-anchor="middle" x="728.87" y="-167.95" font-family="Times,serif" font-size="14.00">2</text>
</g>
<!-- gene_5_L&#45;&#45;gene_6_2 -->
<g id="edge20" class="edge">
<title>gene_5_L&#45;&#45;gene_6_2</title>
<path fill="none" stroke="black" d="M638.24,-173C657.21,-173 682.72,-173 701.67,-173"/>
</g>
<!-- Feature #1_gene -->
<g id="node29" class="node">
<title>Feature #1_gene</title>
<ellipse fill="lightblue" stroke="lightblue" cx="842.2" cy="-173" rx="50.33" ry="18"/>
<text text-anchor="middle" x="842.2" y="-167.95" font-family="Times,serif" font-size="14.00">Feature #1</text>
</g>
<!-- gene_6_2&#45;&#45;Feature #1_gene -->
<g id="edge21" class="edge">
<title>gene_6_2&#45;&#45;Feature #1_gene</title>
<path fill="none" stroke="black" d="M756.25,-173C766.8,-173 779.3,-173 791.43,-173"/>
</g>
<!-- gene_1_R -->
<g id="node24" class="node">
<title>gene_1_R</title>
<ellipse fill="none" stroke="black" cx="223" cy="-227" rx="27" ry="18"/>
<text text-anchor="middle" x="223" y="-221.95" font-family="Times,serif" font-size="14.00">R</text>
</g>
<!-- gene_0_O&#45;&#45;gene_1_R -->
<g id="edge23" class="edge">
<title>gene_0_O&#45;&#45;gene_1_R</title>
<path fill="none" stroke="black" d="M160.4,-227C171.54,-227 184.48,-227 195.62,-227"/>
</g>
<!-- gene_2_4 -->
<g id="node25" class="node">
<title>gene_2_4</title>
<ellipse fill="none" stroke="black" cx="313" cy="-227" rx="27" ry="18"/>
<text text-anchor="middle" x="313" y="-221.95" font-family="Times,serif" font-size="14.00">4</text>
</g>
<!-- gene_1_R&#45;&#45;gene_2_4 -->
<g id="edge24" class="edge">
<title>gene_1_R&#45;&#45;gene_2_4</title>
<path fill="none" stroke="black" d="M250.4,-227C261.54,-227 274.48,-227 285.62,-227"/>
</g>
<!-- gene_3_F -->
<g id="node26" class="node">
<title>gene_3_F</title>
<ellipse fill="none" stroke="black" cx="403" cy="-227" rx="27" ry="18"/>
<text text-anchor="middle" x="403" y="-221.95" font-family="Times,serif" font-size="14.00">F</text>
</g>
<!-- gene_2_4&#45;&#45;gene_3_F -->
<g id="edge25" class="edge">
<title>gene_2_4&#45;&#45;gene_3_F</title>
<path fill="none" stroke="black" d="M340.4,-227C351.54,-227 364.48,-227 375.62,-227"/>
</g>
<!-- gene_4_5 -->
<g id="node27" class="node">
<title>gene_4_5</title>
<ellipse fill="none" stroke="black" cx="493" cy="-227" rx="27" ry="18"/>
<text text-anchor="middle" x="493" y="-221.95" font-family="Times,serif" font-size="14.00">5</text>
</g>
<!-- gene_3_F&#45;&#45;gene_4_5 -->
<g id="edge26" class="edge">
<title>gene_3_F&#45;&#45;gene_4_5</title>
<path fill="none" stroke="black" d="M430.4,-227C441.54,-227 454.48,-227 465.62,-227"/>
</g>
<!-- Feature #91_gene -->
<g id="node28" class="node">
<title>Feature #91_gene</title>
<ellipse fill="lightblue" stroke="lightblue" cx="610.93" cy="-227" rx="54.93" ry="18"/>
<text text-anchor="middle" x="610.93" y="-221.95" font-family="Times,serif" font-size="14.00">Feature #91</text>
</g>
<!-- gene_4_5&#45;&#45;Feature #91_gene -->
<g id="edge27" class="edge">
<title>gene_4_5&#45;&#45;Feature #91_gene</title>
<path fill="none" stroke="black" d="M520.31,-227C530.86,-227 543.44,-227 555.78,-227"/>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 14 KiB

View file

@ -0,0 +1,46 @@
// Illustrative range-index tree used in the presentation, drawn left to right:
//   Root -> sequence (chr1, chr2) -> 10,000-wide intervals -> 5,000-wide sub-intervals.
// Interval labels are "<sequence number>:<start>-<end>"; each interval node also
// carries an xlabel with an offset value.
// NOTE(review): the offsets look like hand-picked example values rather than
// measured file positions (they are not monotonic) — confirm before relying on them.
graph range_index {
rankdir=LR;
root [label="Root", shape=box];
// Level 0: one node per sequence.
node_0_chr1 [label="chr1"];
node_0_chr2 [label="chr2"];
// Level 1: 10,000-wide intervals per sequence.
node_1_chr1_1_10000 [label="1:1-10000", xlabel="Offset: 0"];
node_1_chr1_10001_20000 [label="1:10001-20000", xlabel="Offset: 0xA000"];
node_1_chr2_1_10000 [label="2:1-10000", xlabel="Offset: 0x14000"];
node_1_chr2_10001_20000 [label="2:10001-20000", xlabel="Offset: 0x1E000"];
// Level 2: each level-1 interval split into two 5,000-wide halves.
// The first offset previously read "B000"; it now carries the same 0x prefix
// as every other non-zero offset in the diagram.
node_2_chr1_1_5000 [label="1:1-5000", xlabel="Offset: 0xB000"];
node_2_chr1_5001_10000 [label="1:5001-10000", xlabel="Offset: 0x10000"];
node_2_chr1_10001_15000 [label="1:10001-15000", xlabel="Offset: 0x1F400"];
node_2_chr1_15001_20000 [label="1:15001-20000", xlabel="Offset: 0x2E800"];
node_2_chr2_1_5000 [label="2:1-5000", xlabel="Offset: 0x2D000"];
node_2_chr2_5001_10000 [label="2:5001-10000", xlabel="Offset: 0x3C000"];
node_2_chr2_10001_15000 [label="2:10001-15000", xlabel="Offset: 0x4B400"];
node_2_chr2_15001_20000 [label="2:15001-20000", xlabel="Offset: 0x5A800"];
// Edges: parent -- child, one level at a time.
root -- node_0_chr1;
root -- node_0_chr2;
node_0_chr1 -- node_1_chr1_1_10000;
node_0_chr1 -- node_1_chr1_10001_20000;
node_0_chr2 -- node_1_chr2_1_10000;
node_0_chr2 -- node_1_chr2_10001_20000;
node_1_chr1_1_10000 -- node_2_chr1_1_5000;
node_1_chr1_1_10000 -- node_2_chr1_5001_10000;
node_1_chr1_10001_20000 -- node_2_chr1_10001_15000;
node_1_chr1_10001_20000 -- node_2_chr1_15001_20000;
node_1_chr2_1_10000 -- node_2_chr2_1_5000;
node_1_chr2_1_10000 -- node_2_chr2_5001_10000;
node_1_chr2_10001_20000 -- node_2_chr2_10001_15000;
node_1_chr2_10001_20000 -- node_2_chr2_15001_20000;
}

View file

@ -0,0 +1,185 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Generated by graphviz version 12.0.0 (0)
-->
<!-- Title: range_index Pages: 1 -->
<svg width="493pt" height="439pt"
viewBox="0.00 0.00 493.46 438.50" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 434.5)">
<title>range_index</title>
<polygon fill="white" stroke="none" points="-4,4 -4,-434.5 489.46,-434.5 489.46,4 -4,4"/>
<!-- root -->
<g id="node1" class="node">
<title>root</title>
<polygon fill="none" stroke="black" points="54,-268.5 0,-268.5 0,-232.5 54,-232.5 54,-268.5"/>
<text text-anchor="middle" x="27" y="-245.45" font-family="Times,serif" font-size="14.00">Root</text>
</g>
<!-- node_0_chr1 -->
<g id="node2" class="node">
<title>node_0_chr1</title>
<ellipse fill="none" stroke="black" cx="117.3" cy="-277.5" rx="27.3" ry="18"/>
<text text-anchor="middle" x="117.3" y="-272.45" font-family="Times,serif" font-size="14.00">chr1</text>
</g>
<!-- root&#45;&#45;node_0_chr1 -->
<g id="edge1" class="edge">
<title>root&#45;&#45;node_0_chr1</title>
<path fill="none" stroke="black" d="M54.49,-258.6C66.38,-262.24 80.31,-266.5 91.93,-270.05"/>
</g>
<!-- node_0_chr2 -->
<g id="node3" class="node">
<title>node_0_chr2</title>
<ellipse fill="none" stroke="black" cx="117.3" cy="-169.5" rx="27.3" ry="18"/>
<text text-anchor="middle" x="117.3" y="-164.45" font-family="Times,serif" font-size="14.00">chr2</text>
</g>
<!-- root&#45;&#45;node_0_chr2 -->
<g id="edge2" class="edge">
<title>root&#45;&#45;node_0_chr2</title>
<path fill="none" stroke="black" d="M48.04,-232.12C63.91,-217.56 85.79,-197.49 100.45,-184.03"/>
</g>
<!-- node_1_chr1_1_10000 -->
<g id="node4" class="node">
<title>node_1_chr1_1_10000</title>
<ellipse fill="none" stroke="black" cx="247.81" cy="-358.5" rx="48.79" ry="18"/>
<text text-anchor="middle" x="247.81" y="-353.45" font-family="Times,serif" font-size="14.00">1:1&#45;10000</text>
<text text-anchor="middle" x="175.02" y="-327.2" font-family="Times,serif" font-size="14.00">Offset: 0</text>
</g>
<!-- node_0_chr1&#45;&#45;node_1_chr1_1_10000 -->
<g id="edge3" class="edge">
<title>node_0_chr1&#45;&#45;node_1_chr1_1_10000</title>
<path fill="none" stroke="black" d="M137.81,-289.8C160.42,-304.05 197.71,-327.55 222.26,-343.03"/>
</g>
<!-- node_1_chr1_10001_20000 -->
<g id="node5" class="node">
<title>node_1_chr1_10001_20000</title>
<ellipse fill="none" stroke="black" cx="247.81" cy="-277.5" rx="67.22" ry="18"/>
<text text-anchor="middle" x="247.81" y="-272.45" font-family="Times,serif" font-size="14.00">1:10001&#45;20000</text>
<text text-anchor="middle" x="138.22" y="-246.2" font-family="Times,serif" font-size="14.00">Offset: 0xA000</text>
</g>
<!-- node_0_chr1&#45;&#45;node_1_chr1_10001_20000 -->
<g id="edge4" class="edge">
<title>node_0_chr1&#45;&#45;node_1_chr1_10001_20000</title>
<path fill="none" stroke="black" d="M144.92,-277.5C155.29,-277.5 167.67,-277.5 180.18,-277.5"/>
</g>
<!-- node_1_chr2_1_10000 -->
<g id="node6" class="node">
<title>node_1_chr2_1_10000</title>
<ellipse fill="none" stroke="black" cx="247.81" cy="-169.5" rx="48.79" ry="18"/>
<text text-anchor="middle" x="247.81" y="-164.45" font-family="Times,serif" font-size="14.00">2:1&#45;10000</text>
<text text-anchor="middle" x="154.77" y="-138.2" font-family="Times,serif" font-size="14.00">Offset: 0x14000</text>
</g>
<!-- node_0_chr2&#45;&#45;node_1_chr2_1_10000 -->
<g id="edge5" class="edge">
<title>node_0_chr2&#45;&#45;node_1_chr2_1_10000</title>
<path fill="none" stroke="black" d="M144.92,-169.5C160.44,-169.5 180.47,-169.5 198.66,-169.5"/>
</g>
<!-- node_1_chr2_10001_20000 -->
<g id="node7" class="node">
<title>node_1_chr2_10001_20000</title>
<ellipse fill="none" stroke="black" cx="247.81" cy="-88.5" rx="67.22" ry="18"/>
<text text-anchor="middle" x="247.81" y="-83.45" font-family="Times,serif" font-size="14.00">2:10001&#45;20000</text>
<text text-anchor="middle" x="135.59" y="-57.2" font-family="Times,serif" font-size="14.00">Offset: 0x1E000</text>
</g>
<!-- node_0_chr2&#45;&#45;node_1_chr2_10001_20000 -->
<g id="edge6" class="edge">
<title>node_0_chr2&#45;&#45;node_1_chr2_10001_20000</title>
<path fill="none" stroke="black" d="M137.81,-157.2C159.86,-143.3 195.86,-120.61 220.41,-105.14"/>
</g>
<!-- node_2_chr1_1_5000 -->
<g id="node8" class="node">
<title>node_2_chr1_1_5000</title>
<ellipse fill="none" stroke="black" cx="418.25" cy="-412.5" rx="44.19" ry="18"/>
<text text-anchor="middle" x="418.25" y="-407.45" font-family="Times,serif" font-size="14.00">1:1&#45;5000</text>
<text text-anchor="middle" x="338.81" y="-381.2" font-family="Times,serif" font-size="14.00">Offset: 0xB000</text>
</g>
<!-- node_1_chr1_1_10000&#45;&#45;node_2_chr1_1_5000 -->
<g id="edge7" class="edge">
<title>node_1_chr1_1_10000&#45;&#45;node_2_chr1_1_5000</title>
<path fill="none" stroke="black" d="M285.36,-370.22C314.29,-379.49 354.41,-392.35 382.78,-401.45"/>
</g>
<!-- node_2_chr1_5001_10000 -->
<g id="node9" class="node">
<title>node_2_chr1_5001_10000</title>
<ellipse fill="none" stroke="black" cx="418.25" cy="-358.5" rx="62.61" ry="18"/>
<text text-anchor="middle" x="418.25" y="-353.45" font-family="Times,serif" font-size="14.00">1:5001&#45;10000</text>
<text text-anchor="middle" x="311.38" y="-327.2" font-family="Times,serif" font-size="14.00">Offset: 0x10000</text>
</g>
<!-- node_1_chr1_1_10000&#45;&#45;node_2_chr1_5001_10000 -->
<g id="edge8" class="edge">
<title>node_1_chr1_1_10000&#45;&#45;node_2_chr1_5001_10000</title>
<path fill="none" stroke="black" d="M297.08,-358.5C315.26,-358.5 336.19,-358.5 355.43,-358.5"/>
</g>
<!-- node_2_chr1_10001_15000 -->
<g id="node10" class="node">
<title>node_2_chr1_10001_15000</title>
<ellipse fill="none" stroke="black" cx="418.25" cy="-304.5" rx="67.22" ry="18"/>
<text text-anchor="middle" x="418.25" y="-299.45" font-family="Times,serif" font-size="14.00">1:10001&#45;15000</text>
<text text-anchor="middle" x="440.84" y="-273.2" font-family="Times,serif" font-size="14.00">Offset: 0x1F400</text>
</g>
<!-- node_1_chr1_10001_20000&#45;&#45;node_2_chr1_10001_15000 -->
<g id="edge9" class="edge">
<title>node_1_chr1_10001_20000&#45;&#45;node_2_chr1_10001_15000</title>
<path fill="none" stroke="black" d="M306.3,-286.72C323.55,-289.48 342.47,-292.51 359.72,-295.28"/>
</g>
<!-- node_2_chr1_15001_20000 -->
<g id="node11" class="node">
<title>node_2_chr1_15001_20000</title>
<ellipse fill="none" stroke="black" cx="418.25" cy="-250.5" rx="67.22" ry="18"/>
<text text-anchor="middle" x="418.25" y="-245.45" font-family="Times,serif" font-size="14.00">1:15001&#45;20000</text>
<text text-anchor="middle" x="306.03" y="-219.2" font-family="Times,serif" font-size="14.00">Offset: 0x2E800</text>
</g>
<!-- node_1_chr1_10001_20000&#45;&#45;node_2_chr1_15001_20000 -->
<g id="edge10" class="edge">
<title>node_1_chr1_10001_20000&#45;&#45;node_2_chr1_15001_20000</title>
<path fill="none" stroke="black" d="M306.3,-268.28C323.55,-265.52 342.47,-262.49 359.72,-259.72"/>
</g>
<!-- node_2_chr2_1_5000 -->
<g id="node12" class="node">
<title>node_2_chr2_1_5000</title>
<ellipse fill="none" stroke="black" cx="418.25" cy="-196.5" rx="44.19" ry="18"/>
<text text-anchor="middle" x="418.25" y="-191.45" font-family="Times,serif" font-size="14.00">2:1&#45;5000</text>
<text text-anchor="middle" x="416.68" y="-165.2" font-family="Times,serif" font-size="14.00">Offset: 0x2D000</text>
</g>
<!-- node_1_chr2_1_10000&#45;&#45;node_2_chr2_1_5000 -->
<g id="edge11" class="edge">
<title>node_1_chr2_1_10000&#45;&#45;node_2_chr2_1_5000</title>
<path fill="none" stroke="black" d="M293.06,-176.59C318.89,-180.73 351.25,-185.92 376.46,-189.96"/>
</g>
<!-- node_2_chr2_5001_10000 -->
<g id="node13" class="node">
<title>node_2_chr2_5001_10000</title>
<ellipse fill="none" stroke="black" cx="418.25" cy="-142.5" rx="62.61" ry="18"/>
<text text-anchor="middle" x="418.25" y="-137.45" font-family="Times,serif" font-size="14.00">2:5001&#45;10000</text>
<text text-anchor="middle" x="310.26" y="-111.2" font-family="Times,serif" font-size="14.00">Offset: 0x3C000</text>
</g>
<!-- node_1_chr2_1_10000&#45;&#45;node_2_chr2_5001_10000 -->
<g id="edge12" class="edge">
<title>node_1_chr2_1_10000&#45;&#45;node_2_chr2_5001_10000</title>
<path fill="none" stroke="black" d="M293.06,-162.41C314.41,-158.98 340.22,-154.85 362.76,-151.23"/>
</g>
<!-- node_2_chr2_10001_15000 -->
<g id="node14" class="node">
<title>node_2_chr2_10001_15000</title>
<ellipse fill="none" stroke="black" cx="418.25" cy="-88.5" rx="67.22" ry="18"/>
<text text-anchor="middle" x="418.25" y="-83.45" font-family="Times,serif" font-size="14.00">2:10001&#45;15000</text>
<text text-anchor="middle" x="305.65" y="-57.2" font-family="Times,serif" font-size="14.00">Offset: 0x4B400</text>
</g>
<!-- node_1_chr2_10001_20000&#45;&#45;node_2_chr2_10001_15000 -->
<g id="edge13" class="edge">
<title>node_1_chr2_10001_20000&#45;&#45;node_2_chr2_10001_15000</title>
<path fill="none" stroke="black" d="M315.38,-88.5C326.97,-88.5 339.02,-88.5 350.61,-88.5"/>
</g>
<!-- node_2_chr2_15001_20000 -->
<g id="node15" class="node">
<title>node_2_chr2_15001_20000</title>
<ellipse fill="none" stroke="black" cx="418.25" cy="-34.5" rx="67.22" ry="18"/>
<text text-anchor="middle" x="418.25" y="-29.45" font-family="Times,serif" font-size="14.00">2:15001&#45;20000</text>
<text text-anchor="middle" x="305.28" y="-3.2" font-family="Times,serif" font-size="14.00">Offset: 0x5A800</text>
</g>
<!-- node_1_chr2_10001_20000&#45;&#45;node_2_chr2_15001_20000 -->
<g id="edge14" class="edge">
<title>node_1_chr2_10001_20000&#45;&#45;node_2_chr2_15001_20000</title>
<path fill="none" stroke="black" d="M292.19,-74.59C317.33,-66.53 348.89,-56.41 374.02,-48.36"/>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 9.6 KiB

View file

@ -0,0 +1,27 @@
// Example relation tree for the presentation: a Root box, two genes, their
// transcripts, and one exon, joined parent -- child by undirected edges.
// Every non-root node has an xlabel ("#101" .. "#108") drawn beside it.
// NOTE(review): the xlabel numbers presumably stand for feature references
// into the GFF file — confirm what they are meant to denote.
graph relation_tree {
root [label="Root", shape=box];
// Genes (children of Root).
gene_1 [label="Gene #1", xlabel="#101"];
gene_2 [label="Gene #2", xlabel="#102"];
// Transcripts: two under Gene #1, three under Gene #2.
gene_1_transcript_1 [label="Transcript #1", xlabel="#103"];
gene_1_transcript_2 [label="Transcript #2", xlabel="#104"];
gene_2_transcript_1 [label="Transcript #1", xlabel="#105"];
gene_2_transcript_2 [label="Transcript #2", xlabel="#106"];
gene_2_transcript_3 [label="Transcript #3", xlabel="#107"];
// A single exon, attached to Gene #1 / Transcript #1.
gene_1_transcript_1_exon_1 [label="Exon #1", xlabel="#108"];
// Edges: parent -- child.
root -- gene_1;
root -- gene_2;
gene_1 -- gene_1_transcript_1;
gene_1 -- gene_1_transcript_2;
gene_2 -- gene_2_transcript_1;
gene_2 -- gene_2_transcript_2;
gene_2 -- gene_2_transcript_3;
gene_1_transcript_1 -- gene_1_transcript_1_exon_1;
}

View file

@ -0,0 +1,115 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Generated by graphviz version 12.0.0 (0)
-->
<!-- Title: relation_tree Pages: 1 -->
<svg width="712pt" height="260pt"
viewBox="0.00 0.00 712.13 260.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 256)">
<title>relation_tree</title>
<polygon fill="white" stroke="none" points="-4,4 -4,-256 708.13,-256 708.13,4 -4,4"/>
<!-- root -->
<g id="node1" class="node">
<title>root</title>
<polygon fill="none" stroke="black" points="322.56,-252 268.56,-252 268.56,-216 322.56,-216 322.56,-252"/>
<text text-anchor="middle" x="295.56" y="-228.95" font-family="Times,serif" font-size="14.00">Root</text>
</g>
<!-- gene_1 -->
<g id="node2" class="node">
<title>gene_1</title>
<ellipse fill="none" stroke="black" cx="226.56" cy="-162" rx="42.14" ry="18"/>
<text text-anchor="middle" x="226.56" y="-156.95" font-family="Times,serif" font-size="14.00">Gene #1</text>
<text text-anchor="middle" x="170.93" y="-183.2" font-family="Times,serif" font-size="14.00">#101</text>
</g>
<!-- root&#45;&#45;gene_1 -->
<g id="edge1" class="edge">
<title>root&#45;&#45;gene_1</title>
<path fill="none" stroke="black" d="M278.51,-215.7C267.49,-204.52 253.26,-190.08 242.48,-179.14"/>
</g>
<!-- gene_2 -->
<g id="node3" class="node">
<title>gene_2</title>
<ellipse fill="none" stroke="black" cx="434.56" cy="-162" rx="42.14" ry="18"/>
<text text-anchor="middle" x="434.56" y="-156.95" font-family="Times,serif" font-size="14.00">Gene #2</text>
<text text-anchor="middle" x="378.93" y="-183.2" font-family="Times,serif" font-size="14.00">#102</text>
</g>
<!-- root&#45;&#45;gene_2 -->
<g id="edge2" class="edge">
<title>root&#45;&#45;gene_2</title>
<path fill="none" stroke="black" d="M323.03,-219.17C347.71,-206.74 383.73,-188.6 408.1,-176.33"/>
</g>
<!-- gene_1_transcript_1 -->
<g id="node4" class="node">
<title>gene_1_transcript_1</title>
<ellipse fill="none" stroke="black" cx="87.56" cy="-90" rx="60.56" ry="18"/>
<text text-anchor="middle" x="87.56" y="-84.95" font-family="Times,serif" font-size="14.00">Transcript #1</text>
<text text-anchor="middle" x="13.5" y="-111.2" font-family="Times,serif" font-size="14.00">#103</text>
</g>
<!-- gene_1&#45;&#45;gene_1_transcript_1 -->
<g id="edge3" class="edge">
<title>gene_1&#45;&#45;gene_1_transcript_1</title>
<path fill="none" stroke="black" d="M200.09,-147.67C176.46,-135.77 141.89,-118.36 117.31,-105.98"/>
</g>
<!-- gene_1_transcript_2 -->
<g id="node5" class="node">
<title>gene_1_transcript_2</title>
<ellipse fill="none" stroke="black" cx="226.56" cy="-90" rx="60.56" ry="18"/>
<text text-anchor="middle" x="226.56" y="-84.95" font-family="Times,serif" font-size="14.00">Transcript #2</text>
<text text-anchor="middle" x="152.5" y="-111.2" font-family="Times,serif" font-size="14.00">#104</text>
</g>
<!-- gene_1&#45;&#45;gene_1_transcript_2 -->
<g id="edge4" class="edge">
<title>gene_1&#45;&#45;gene_1_transcript_2</title>
<path fill="none" stroke="black" d="M226.56,-143.7C226.56,-132.85 226.56,-118.92 226.56,-108.1"/>
</g>
<!-- gene_2_transcript_1 -->
<g id="node6" class="node">
<title>gene_2_transcript_1</title>
<ellipse fill="none" stroke="black" cx="365.56" cy="-90" rx="60.56" ry="18"/>
<text text-anchor="middle" x="365.56" y="-84.95" font-family="Times,serif" font-size="14.00">Transcript #1</text>
<text text-anchor="middle" x="291.5" y="-111.2" font-family="Times,serif" font-size="14.00">#105</text>
</g>
<!-- gene_2&#45;&#45;gene_2_transcript_1 -->
<g id="edge5" class="edge">
<title>gene_2&#45;&#45;gene_2_transcript_1</title>
<path fill="none" stroke="black" d="M418.91,-145.12C407.93,-133.98 393.29,-119.13 382.12,-107.8"/>
</g>
<!-- gene_2_transcript_2 -->
<g id="node7" class="node">
<title>gene_2_transcript_2</title>
<ellipse fill="none" stroke="black" cx="504.56" cy="-90" rx="60.56" ry="18"/>
<text text-anchor="middle" x="504.56" y="-84.95" font-family="Times,serif" font-size="14.00">Transcript #2</text>
<text text-anchor="middle" x="430.5" y="-111.2" font-family="Times,serif" font-size="14.00">#106</text>
</g>
<!-- gene_2&#45;&#45;gene_2_transcript_2 -->
<g id="edge6" class="edge">
<title>gene_2&#45;&#45;gene_2_transcript_2</title>
<path fill="none" stroke="black" d="M450.45,-145.12C461.74,-133.82 476.87,-118.7 488.26,-107.31"/>
</g>
<!-- gene_2_transcript_3 -->
<g id="node8" class="node">
<title>gene_2_transcript_3</title>
<ellipse fill="none" stroke="black" cx="643.56" cy="-90" rx="60.56" ry="18"/>
<text text-anchor="middle" x="643.56" y="-84.95" font-family="Times,serif" font-size="14.00">Transcript #3</text>
<text text-anchor="middle" x="569.5" y="-111.2" font-family="Times,serif" font-size="14.00">#107</text>
</g>
<!-- gene_2&#45;&#45;gene_2_transcript_3 -->
<g id="edge7" class="edge">
<title>gene_2&#45;&#45;gene_2_transcript_3</title>
<path fill="none" stroke="black" d="M466.76,-150.22C503.86,-137.79 564.81,-117.38 604.56,-104.06"/>
</g>
<!-- gene_1_transcript_1_exon_1 -->
<g id="node9" class="node">
<title>gene_1_transcript_1_exon_1</title>
<ellipse fill="none" stroke="black" cx="87.56" cy="-18" rx="42.14" ry="18"/>
<text text-anchor="middle" x="87.56" y="-12.95" font-family="Times,serif" font-size="14.00">Exon #1</text>
<text text-anchor="middle" x="31.93" y="-39.2" font-family="Times,serif" font-size="14.00">#108</text>
</g>
<!-- gene_1_transcript_1&#45;&#45;gene_1_transcript_1_exon_1 -->
<g id="edge8" class="edge">
<title>gene_1_transcript_1&#45;&#45;gene_1_transcript_1_exon_1</title>
<path fill="none" stroke="black" d="M87.56,-71.7C87.56,-60.85 87.56,-46.92 87.56,-36.1"/>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 5.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 230 KiB

143
presentation/index.qmd Normal file
View file

@ -0,0 +1,143 @@
---
title: "General Feature index (GFidx)"
subtitle: "Space-efficient and fast index for querying large General Feature Format (GFF) files"
author: ["<redacted>"]
date: "2024-08-01"
format:
revealjs:
transition: fade
theme: sakura.scss
slideNumber: true
embed-resources: true
csl: american-chemical-society.csl
bibliography: presentation.bib
---
## Introduction {.smaller}
- General Feature Format (GFF) files are used to store genomic features and annotations. The GFF3 format is a widely used standard for representing genomic feature hierarchies.
:::: {.columns}
::: {.column width="50%"}
![Sequence Ontology Relationship](http://sequenceontology.org/img/a_slice_of_so.jpg)
:::
::: {.column width="50%"}
Each feature in a GFF file is represented as a line of tab-separated fields. The fields include:
- The range and location of the feature
- The Sequence Ontology (SO) [@noauthor_httpwwwsequenceontologyorg_nodate] term of the feature
- The Sequence Ontology (SO) parent of the feature
- Other attributes such as gene name, gene type, aliases, etc.
:::
::::
## Motivation {.smaller}
- GFF files can be very large, making it difficult to query them efficiently:
- In the Human GENCODE GFF3 file, there are 3.7 million features. A lot of genes have more than 400 features associated with them.
:::: {.columns}
::: {.column width="50%"}
- Often only a subset of the features is needed for analysis.
- Inefficient to load the entire file into memory.
- In a cluster environment, the whole file may be split across multiple nodes
- Looking up a feature by ID or attribute requires a linear scan of the file, which is time- and resource-consuming.
:::
::: {.column width="50%"}
- Example pipelines and operations include:
- Gene-centric analysis: large number of ID -> feature queries
- RNA alternative splicing analysis: large number of exon -> gene queries
- Genome browser: range to feature queries
- General queries: Ancestors, descendants and overlaps of a feature
:::
::::
## Existing Solutions
- Heuristic-based solutions:
- Assume that features are ordered in a way that related features are close to each other.
- No guarantee that features are ordered in this way
- Only solves the problem for looking up by relation
- Relational databases:
- Load the GFF file into a database
- Requires predefined schema for the attributes
- Querying by range or name/ID prefix is still slow
## GFidx - General Feature index {.smaller}
![GFidx structure](diagrams/all_indices.svg)
- A range index is first built to save the approximate location of each feature in the GFF file. This allows further indexing to only save a reference to the feature in the GFF file instead of the whole feature.
- Two additional types of indices are built:
- Attribute indices: for looking up features by ID or attribute prefix
- Relation tree: for looking up features by parent-child relationships
## Methodology - Lookup by range {.smaller}
:::: {.columns}
::: {.column width="40%"}
- For each sequence, split the range into non-overlapping intervals and record the location of the first and last feature that starts in each interval in the GFF file.
- Split the intervals recursively until the number of features in each interval is small enough, forming a tree structure.
- Some extra data may be returned, but by controlling the granularity of the tree, a balance between the amount of unnecessary data and the size of the index can be achieved.
:::
::: {.column width="60%"}
![Range index structure](diagrams/range_index.svg){style="width: 100%"}
:::
::::
## Methodology - Lookup by attribute {.smaller}
- Instead of recording the IDs and attributes by feature, we sort them into a Trie structure, and only record the location of the corresponding feature in the GFF file.
- Fast lookup by ID or attribute prefix.
- Space-efficient: the long ENSEMBL gene IDs are stored only once.
- Fast to build: Only one pass through the GFF file is needed.
![](diagrams/attr_trie.svg){style="width: 100%"}
## Methodology - Lookup by relation {.smaller}
- Parent-child relationships are stored in a tree structure.
- Each node contains the range of the feature and a list of children.
- The tree is built by iterating through the GFF file and adding each feature to the corresponding parent node in the tree.
- Since GFF3 requires all relations to be in the Sequence Ontology structure, the tree structure can be pre-built and reused for different queries.
![](diagrams/relation_tree.svg){style="width: 100%"}
## Results {.smaller}
- A full index of the Human GENCODE [@frankish_gencode_2019] GFF3 file (1.62 GB decompressed) can be built using 3 processors in less than 20 seconds.
- All queries are done in less than 0.2 seconds. Demo queries include:
- All features of CDCA8 (Cell Division Cycle Associated 8) gene - 34 features, 600 $\mu$s, 272 KB downloaded.
- All genes in the SLC35 family (Nucleotide Sugar Transporter Family) - 2,926 features, 128 ms, 22 MB downloaded.
- All genes in chr3 from 650,000 to 1,500,000 bp - 243 features, 1.70 ms, 120 KB downloaded.
- More effort is needed to optimize the space efficiency of the index:
- Simply using an off-the-shelf binary format, the uncompressed index containing all relevant attributes is 268 MB.
- Most of the space is used by the relation index, nearly 100 MB.
## Conclusion
- GFidx has the potential to be a useful tool for both in-memory and over-the-network querying of large GFF files in certain pipelines and applications.
- Web-based services could benefit from GFidx by only downloading the relevant parts of the GFF file.
- Further space optimization and parallelization could be done especially on the trie building and relation tree building steps.
## Future Work {.smaller}
- More research on common queries and operations on GFF files.
- More flexibility on the granularity of indices to save time and space.
- More efficient encoding of feature locations.
- Compression of the index using BGZF (used in BAM [@noauthor_bam_nodate] files).
- Parallelization of the trie and tree building.
- A WebAssembly build for use in web applications such as IGV Genome Browser: only download the file range that is currently being viewed.
- Support for features with multiple parents (allowed in GFF3 but rarely used).
- Test suites for real-world GFF3 files.
- Integration with existing bioinformatics tools and pipelines.
## References
::: {#refs}
:::

View file

@ -0,0 +1,35 @@
@misc{noauthor_httpwwwsequenceontologyorg_nodate,
title = {http://www.sequenceontology.org/},
shorttitle = {http},
url = {http://www.sequenceontology.org/},
language = {en-US},
urldate = {2024-08-01},
file = {Snapshot:/home/yume/Zotero/storage/L3YQIYUL/www.sequenceontology.org.html:text/html},
}
@article{frankish_gencode_2019,
title = {{GENCODE} reference annotation for the human and mouse genomes},
volume = {47},
issn = {1362-4962},
doi = {10.1093/nar/gky955},
abstract = {The accurate identification and description of the genes in the human and mouse genomes is a fundamental requirement for high quality analysis of data informing both genome biology and clinical genomics. Over the last 15 years, the GENCODE consortium has been producing reference quality gene annotations to provide this foundational resource. The GENCODE consortium includes both experimental and computational biology groups who work together to improve and extend the GENCODE gene annotation. Specifically, we generate primary data, create bioinformatics tools and provide analysis to support the work of expert manual gene annotators and automated gene annotation pipelines. In addition, manual and computational annotation workflows use any and all publicly available data and analysis, along with the research literature to identify and characterise gene loci to the highest standard. GENCODE gene annotations are accessible via the Ensembl and UCSC Genome Browsers, the Ensembl FTP site, Ensembl Biomart, Ensembl Perl and REST APIs as well as https://www.gencodegenes.org.},
language = {eng},
number = {D1},
journal = {Nucleic Acids Research},
author = {Frankish, Adam and Diekhans, Mark and Ferreira, Anne-Maud and Johnson, Rory and Jungreis, Irwin and Loveland, Jane and Mudge, Jonathan M. and Sisu, Cristina and Wright, James and Armstrong, Joel and Barnes, If and Berry, Andrew and Bignell, Alexandra and Carbonell Sala, Silvia and Chrast, Jacqueline and Cunningham, Fiona and Di Domenico, Tomás and Donaldson, Sarah and Fiddes, Ian T. and García Girón, Carlos and Gonzalez, Jose Manuel and Grego, Tiago and Hardy, Matthew and Hourlier, Thibaut and Hunt, Toby and Izuogu, Osagie G. and Lagarde, Julien and Martin, Fergal J. and Martínez, Laura and Mohanan, Shamika and Muir, Paul and Navarro, Fabio C. P. and Parker, Anne and Pei, Baikang and Pozo, Fernando and Ruffier, Magali and Schmitt, Bianca M. and Stapleton, Eloise and Suner, Marie-Marthe and Sycheva, Irina and Uszczynska-Ratajczak, Barbara and Xu, Jinuri and Yates, Andrew and Zerbino, Daniel and Zhang, Yan and Aken, Bronwen and Choudhary, Jyoti S. and Gerstein, Mark and Guigó, Roderic and Hubbard, Tim J. P. and Kellis, Manolis and Paten, Benedict and Reymond, Alexandre and Tress, Michael L. and Flicek, Paul},
month = jan,
year = {2019},
pmid = {30357393},
pmcid = {PMC6323946},
keywords = {Animals, Computational Biology, Databases, Genetic, Genome, Human, Genomics, Humans, Internet, Mice, Molecular Sequence Annotation, Pseudogenes, Software},
pages = {D766--D773},
file = {Full Text:/home/yume/Zotero/storage/6JNTGDLZ/Frankish et al. - 2019 - GENCODE reference annotation for the human and mou.pdf:application/pdf},
}
@misc{noauthor_bam_nodate,
title = {{BAM} {File} {Format}},
url = {https://support.illumina.com/help/BS_App_RNASeq_Alignment_OLH_1000000006112/Content/Source/Informatics/BAM-Format.htm},
urldate = {2024-08-01},
file = {BAM File Format:/home/yume/Zotero/storage/4DTFGIN2/BAM-Format.html:text/html},
}

53
presentation/sakura.scss Normal file
View file

@ -0,0 +1,53 @@
/*-- scss:defaults --*/
// Web fonts: League Gothic comes from a stylesheet relative to this file, Lato from Google Fonts.
@import url(./fonts/league-gothic/league-gothic.css);
@import url(https://fonts.googleapis.com/css?family=Lato:400,700,400italic,700italic);
// fonts
// Every variable below is declared `!default`, so a value set before this file is kept.
$font-family-sans-serif: Lato, sans-serif !default;
// colors
// Background is #FEDFE1 darkened by 6%; hovered links are the link color darkened by 5%.
$body-bg: darken(#FEDFE1, 6%) !default;
$body-color: #333 !default;
$link-color: #F596AA !default;
$link-color-hover: darken($link-color, 5%) !default;
$selection-bg: rgba(88, 178, 220, 0.99) !default;
// headings
$presentation-heading-font: "League Gothic", sans-serif !default;
$presentation-heading-text-transform: uppercase !default;
$presentation-h1-font-size: 3.77em !default;
// h1 shadow: five stacked 1px grey offsets followed by progressively softer dark blurs.
$presentation-h1-text-shadow: 0 1px 0 #ccc, 0 2px 0 #c9c9c9, 0 3px 0 #bbb,
0 4px 0 #b9b9b9, 0 5px 0 #aaa, 0 6px 1px rgba(0, 0, 0, 0.1),
0 0 5px rgba(0, 0, 0, 0.1), 0 1px 3px rgba(0, 0, 0, 0.3),
0 3px 5px rgba(0, 0, 0, 0.2), 0 5px 10px rgba(0, 0, 0, 0.25),
0 20px 20px rgba(0, 0, 0, 0.15);
// code blocks
$code-block-bg: transparent !default;
/*-- scss:mixins --*/
// Background mixin: a radial gradient between rgba(254, 223, 225, 1) (#FEDFE1) and opaque white.
// NOTE(review): `radial-gradient` is a mixin that is not defined in this file; it is presumably
// supplied by the presentation framework's theme helpers - confirm the argument order there.
@mixin bodyBackground() {
@include radial-gradient(rgba(254, 223, 225, 1), rgba(255, 255, 255, 1));
}
/*-- scss:presentation --*/
// Round the corners of every image placed inside a `section`.
section img {
border-radius: 10px;
}
li {
list-style-image: url('data:image/svg+xml,<%3Fxml version="1.0" encoding="UTF-8"%3F><svg transform="translate(0, 0)" transform-origin="center" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 400 400"><path fill="%23E87A90" d="M 137.00,92.00 C 137.23,72.09 144.33,46.27 152.15,28.00 152.15,28.00 161.25,8.00 161.25,8.00 162.35,5.79 164.05,1.59 166.39,0.59 172.77,-2.13 184.79,10.68 188.39,15.00 189.68,16.55 192.11,20.05 194.17,20.38 196.53,20.76 200.16,17.29 202.00,15.90 206.71,12.35 212.83,8.05 218.00,5.31 219.98,4.26 222.66,2.72 224.96,3.41 227.64,4.22 229.62,7.76 231.05,10.00 235.40,16.81 237.41,21.58 240.40,29.00 252.21,58.35 249.52,94.04 238.00,123.00 247.43,115.05 259.28,110.66 271.00,107.29 299.53,99.10 332.84,101.15 362.00,105.00 361.99,119.51 356.75,128.14 349.00,140.00 356.63,142.41 365.85,149.46 372.00,154.61 374.54,156.75 377.75,159.32 377.58,163.00 377.44,166.03 373.92,170.59 372.10,173.00 365.62,181.58 361.48,185.75 353.00,192.34 333.81,207.22 307.09,214.28 283.00,214.00 283.00,214.00 272.00,213.00 272.00,213.00 272.00,213.00 280.58,221.09 280.58,221.09 299.53,241.54 309.39,267.31 315.87,294.00 315.87,294.00 320.13,319.00 320.13,319.00 320.65,321.47 321.66,326.15 320.13,328.28 318.41,330.33 313.45,330.92 311.00,330.92 303.89,330.93 291.54,327.87 285.00,325.00 282.86,335.64 281.14,339.18 277.20,349.00 276.03,351.92 274.62,356.32 271.79,357.98 267.22,360.66 256.47,355.46 252.00,353.36 233.93,344.88 217.91,329.47 208.31,312.00 205.21,306.36 199.72,294.10 198.24,288.00 198.02,287.10 196.00,274.56 196.81,267.56 193.81,276.81 189.19,284.81 182.87,293.00 168.76,311.55 144.92,327.62 124.00,337.74 124.00,337.74 103.00,346.95 103.00,346.95 100.75,347.76 96.87,349.38 94.64,347.97 92.38,346.54 90.84,341.51 90.29,339.00 88.49,330.78 89.00,322.35 89.00,314.00 89.00,314.00 69.00,312.71 69.00,312.71 69.00,312.71 53.00,310.00 53.00,310.00 53.00,310.00 53.00,301.00 53.00,301.00 53.04,278.53 65.65,253.94 81.28,238.28 90.51,229.04 98.19,224.59 108.00,217.00 108.00,217.00 
103.00,217.00 103.00,217.00 76.30,216.96 43.89,199.52 23.00,183.87 23.00,183.87 7.00,171.25 7.00,171.25 4.82,169.41 1.45,166.86 0.71,164.00 -0.20,160.52 2.70,157.44 5.04,155.30 10.58,150.25 20.61,144.39 28.00,143.00 28.00,143.00 18.28,118.00 18.28,118.00 17.48,115.34 15.63,110.88 16.64,108.21 17.77,105.20 23.17,103.15 26.00,102.06 35.52,98.42 45.79,96.02 56.00,96.00 65.37,95.99 71.59,95.65 81.00,97.46 98.03,100.73 128.01,115.78 142.00,126.00 139.46,113.35 136.85,105.53 137.00,92.00 Z" /><path fill="%23F596AA" d="M 176.00,6.21 C 173.91,4.75 170.03,1.89 167.39,3.15 165.24,4.18 162.86,9.79 161.75,12.00 161.75,12.00 151.80,35.00 151.80,35.00 138.41,69.07 131.49,116.20 156.00,147.00 156.00,147.00 141.90,128.52 141.90,128.52 136.55,123.87 120.85,115.63 114.00,112.36 107.06,109.06 100.36,105.37 93.00,103.03 65.76,94.36 43.89,96.86 18.00,108.00 21.51,126.65 28.02,133.77 31.00,144.00 24.31,145.25 13.31,151.21 8.02,155.52 5.67,157.42 2.11,160.55 3.25,163.96 4.14,166.65 10.61,171.28 13.00,173.25 23.73,182.10 31.25,187.30 43.00,194.58 60.53,205.43 85.17,214.97 106.00,215.00 115.33,215.01 119.78,214.72 129.00,212.57 129.00,212.57 141.00,209.00 141.00,209.00 135.04,211.53 130.39,213.69 124.00,215.25 118.72,216.55 115.21,215.99 110.00,218.71 106.33,220.63 99.58,225.93 96.00,228.63 69.30,248.69 55.00,274.58 55.00,308.00 55.00,308.00 71.00,310.71 71.00,310.71 71.00,310.71 92.00,312.00 92.00,312.00 92.00,312.00 91.09,323.00 91.09,323.00 90.71,329.79 92.87,340.48 95.00,347.00 103.18,345.72 124.27,335.39 132.00,331.00 156.57,317.04 177.83,301.76 191.22,276.00 193.24,272.11 195.44,265.19 196.50,261.06 198.13,255.38 199.46,241.32 200.06,239.50 200.06,239.50 199.37,249.65 199.06,258.75 198.81,266.25 199.00,273.00 199.00,273.00 199.03,293.46 210.92,316.58 225.01,331.00 234.67,340.89 244.30,347.69 257.00,353.13 259.70,354.28 267.07,357.25 269.78,356.31 272.40,355.40 273.69,351.35 274.77,349.00 277.69,342.62 282.39,329.81 283.00,323.00 292.47,324.46 295.58,327.85 308.00,328.00 310.45,328.02 
316.40,328.53 317.76,326.40 319.19,324.67 317.91,317.30 317.76,315.00 317.76,315.00 311.35,285.00 311.35,285.00 311.35,285.00 306.05,269.00 306.05,269.00 298.23,247.61 287.56,227.93 269.00,213.90 254.43,202.88 250.00,204.42 242.00,200.00 256.15,202.85 260.55,207.70 269.00,209.76 288.00,214.39 315.46,210.21 333.00,201.74 346.98,194.99 356.94,188.10 366.92,176.00 369.13,173.32 374.86,166.39 375.04,163.00 375.22,159.50 371.43,156.96 369.00,155.00 362.60,149.87 353.89,143.26 346.00,141.00 352.61,129.23 359.68,121.34 360.00,107.00 360.00,107.00 330.00,104.00 330.00,104.00 330.00,104.00 306.00,104.00 306.00,104.00 285.22,104.03 257.05,111.83 240.00,123.77 229.26,131.28 226.80,135.24 219.00,145.00 219.00,145.00 224.38,137.00 224.38,137.00 228.09,132.36 232.56,129.43 235.29,124.00 239.62,115.40 242.80,101.60 244.13,92.00 248.54,60.22 243.64,31.58 225.00,5.00 212.41,8.60 203.32,18.66 193.00,26.00 189.46,18.78 182.59,10.82 176.00,6.21z" /></svg>');
/* Use translate(X, Y) to position the SVG up, down, left, and right */
}
// Size the list marker at 1.5em while keeping its line box at 0.1em.
li::marker {
font-size: 1.5em;
line-height: 0.1em;
}
// Items of `ol.aside-footnotes` opt out of the custom bullet image applied to `li` in general.
ol.aside-footnotes li {
list-style-image: none;
}

14
src/bar.rs Normal file
View file

@ -0,0 +1,14 @@
use indicatif::ProgressBar;
/// Applies the crate's progress-bar look to `bar`.
///
/// When `has_size` is true the template shows processed/total bytes and an ETA; otherwise it
/// only shows the number of bytes processed so far.
///
/// # Panics
///
/// Panics if `indicatif` rejects the template string.
pub(crate) fn style_bar(bar: &ProgressBar, has_size: bool) {
    const SIZED_TEMPLATE: &str = "{spinner:.green} {prefix} [{elapsed_precise}] [{bar:40.cyan/blue}] {decimal_bytes}/{decimal_total_bytes} ETA: {eta}";
    const UNSIZED_TEMPLATE: &str = "{spinner:.green} {prefix} [{elapsed_precise}] [{bar:40.cyan/blue}] {decimal_bytes} Processed";

    let template = if has_size {
        SIZED_TEMPLATE
    } else {
        UNSIZED_TEMPLATE
    };
    let style = indicatif::ProgressStyle::default_bar()
        .template(template)
        .unwrap()
        .progress_chars("=> ");
    bar.set_style(style);
}

328
src/bin/gfidx.rs Normal file
View file

@ -0,0 +1,328 @@
use std::{
fs::File,
io::{self, BufReader, BufWriter, Read, Write},
};
use clap::Parser;
use gfidx::{
attr_trie_def,
gff3::Gff3Read,
index::{range::DEFAULT_RANGE_INTERVAL, Gff3Index},
io::{
humanize_size,
stream::{Gff3SeekableStreamReader, Gff3StreamReader},
CountingReader, CountingWriter,
},
Error,
};
// Top-level command line of the `gfidx` binary; it only carries the chosen subcommand.
// Plain `//` comments are used on purpose: clap's derive turns `///` doc comments into
// help text, which would change the program's output.
#[derive(Parser, Debug)]
#[clap(name = "gfidx")]
struct Args {
#[clap(subcommand)]
subcmd: SubCommand,
}
// The two modes of the tool: `index` builds an index file, `query` opens an interactive
// prompt over an existing GFF3 file and its index (see `main`).
#[derive(Parser, Debug)]
enum SubCommand {
#[clap(name = "index")]
Index(Index),
#[clap(name = "query")]
Query(Query),
}
// Arguments of `gfidx index`.
#[derive(Parser, Debug)]
struct Index {
// Path of the GFF3 file to index; a name ending in `.gz` is read through a gzip decoder.
#[clap(short, long)]
file: String,
// Where to write the index; `main` falls back to `<file>.gfidx` when omitted.
#[clap(short, long)]
output: Option<String>,
}
// Arguments of `gfidx query`.
#[derive(Parser, Debug)]
struct Query {
// Path of the GFF3 file that queries read lines from.
#[clap(short, long)]
file: String,
// Path of the index file; `main` falls back to `<file>.gfidx` when omitted.
#[clap(short, long)]
index: Option<String>,
}
/// Runs `f`, prints how long it took to stderr as `"<name> took: <duration>"`, and returns
/// whatever `f` returned.
fn timed<T, F: FnOnce() -> T>(name: &str, f: F) -> T {
    let started_at = std::time::Instant::now();
    let output = f();
    eprintln!("{} took: {:?}", name, started_at.elapsed());
    output
}
/// Returns the value currently reported by `reader.count()` and then calls
/// `reader.reset_count()`, so the next call reports only what was counted in between.
fn report_and_reset_reader<R>(reader: &CountingReader<R>) -> usize {
    let counted_so_far = reader.count();
    reader.reset_count();
    counted_so_far
}
fn main() {
use tabled::settings::Style;
let args: Args = Args::parse();
match args.subcmd {
SubCommand::Index(index) => {
println!("Indexing {}", index.file);
timed("Index GFF3", || {
let idx = if index.file.ends_with(".gz") {
let gz =
flate2::read::GzDecoder::new(std::fs::File::open(&index.file).unwrap());
let size = gz.bytes().count();
Gff3Index::build(
&|| {
let gz = flate2::read::GzDecoder::new(
std::fs::File::open(&index.file).unwrap(),
);
Gff3StreamReader::new_with_size(gz, size)
},
DEFAULT_RANGE_INTERVAL,
&attr_trie_def![
"ID",
"gene_id",
"gene_name",
"gene_type",
"transcript_id",
"transcript_name",
"exon_id"
],
)
.expect("Failed to build index")
} else {
Gff3Index::build(
&|| {
Gff3SeekableStreamReader::open(
std::fs::File::open(&index.file).unwrap(),
)
.unwrap()
},
DEFAULT_RANGE_INTERVAL,
&attr_trie_def![
"ID",
"gene_id",
"gene_name",
"gene_type",
"transcript_id",
"transcript_name",
"exon_id"
],
)
.expect("Failed to build index")
};
let out_file = index
.output
.unwrap_or_else(|| format!("{}.gfidx", index.file));
println!("Index built, writing to {}", out_file);
let mut table = tabled::builder::Builder::default();
let mut sum = 0;
let mut range_index_counter = CountingWriter::new(io::sink());
let mut relation_index_counter = CountingWriter::new(io::sink());
idx.range_index
.as_ref()
.map(|idx| ciborium::into_writer(idx, &mut range_index_counter))
.unwrap_or(Ok(()))
.expect("Failed to write range index");
table.push_record(["Range Index", &humanize_size(range_index_counter.count())]);
idx.relation_index
.as_ref()
.map(|idx| ciborium::into_writer(idx, &mut relation_index_counter))
.unwrap_or(Ok(()))
.expect("Failed to write relation index");
table.push_record([
"Relation Index",
&humanize_size(relation_index_counter.count()),
]);
sum += relation_index_counter.count();
for (name, trie) in idx.tries.iter() {
let mut counter = CountingWriter::new(io::sink());
ciborium::into_writer(trie, &mut counter).expect("Failed to write trie");
table
.push_record([&format!("Trie: {}", name), &humanize_size(counter.count())]);
sum += counter.count();
}
table.push_record(["Total", &humanize_size(sum)]);
ciborium::into_writer(
&idx,
BufWriter::new(File::create(out_file).expect("Failed to create output file")),
)
.expect("Failed to write index");
println!("{}", table.build().with(Style::ascii_rounded()));
});
}
SubCommand::Query(query) => {
let idx_file = query
.index
.unwrap_or_else(|| format!("{}.gfidx", query.file));
let mut gff = Gff3SeekableStreamReader::open_prebuffered(io::BufReader::new(
CountingReader::new(File::open(&query.file).expect("Failed to open GFF3 file")),
))
.expect("Failed to open GFF3 file");
let mut idx: Gff3Index = timed("Read back index file", || {
ciborium::from_reader(BufReader::new(
File::open(idx_file).expect("Failed to open index file"),
))
.expect("Failed to read index")
});
timed("Reconstruct index", || {
if let Some(i) = idx.relation_index.as_mut() {
i.reconstruct_path();
}
});
report_and_reset_reader(gff.borrow_reader());
let stdin = io::stdin();
let mut stdout = io::stdout().lock();
loop {
write!(stdout, "> ").unwrap();
stdout.flush().unwrap();
let mut line = String::new();
stdin.read_line(&mut line).unwrap();
let line = line.trim();
if line.is_empty() || line.starts_with('#') {
continue;
}
let parts: Vec<_> = line.split_whitespace().collect();
if parts.is_empty() {
eprintln!("Invalid query");
continue;
}
let query = parts[0];
match query {
"quit" | "exit" | "q" => {
break;
}
"range" => {
if parts.len() < 4 {
eprintln!("Invalid query");
continue;
}
let seqid = parts[1];
let start = parts[2].parse().expect("Invalid start");
let end = parts[3].parse().expect("Invalid end");
match timed("Query", || {
let mut count = 0;
idx.range_index
.as_ref()
.expect("No range index")
.query_lines(&mut gff, seqid, start, end, &mut |line| {
count += 1;
println!("{}", line);
})?;
Ok::<_, Error<io::Error>>(count)
}) {
Ok(count) => {
eprintln!("{} lines found", count);
eprintln!(
"Query cost {} bytes",
humanize_size(report_and_reset_reader(gff.borrow_reader()))
);
}
Err(e) => {
eprintln!("Error: {:?}", e);
}
}
}
"relation" => {
if parts.len() < 2 {
eprintln!("Invalid query");
continue;
}
let id = parts[1];
match timed("Query", || {
let mut count = 0;
let idx = idx.relation_index.as_ref().expect("No relation index");
for seqid in idx.list_seqids().collect::<Vec<_>>() {
idx.traverse_children(&mut gff, seqid, id, |line| {
println!("{}", line);
count += 1;
})?;
}
Ok::<_, Error<io::Error>>(count)
}) {
Ok(count) => {
eprintln!("{} lines found", count);
eprintln!(
"Query cost {} bytes",
humanize_size(report_and_reset_reader(gff.borrow_reader()))
);
}
Err(e) => {
eprintln!("Error: {:?}", e);
}
}
}
"trie" => {
if parts.len() < 3 {
eprintln!("Invalid query");
continue;
}
let name = parts[1];
let prefix = parts[2];
let trie = match idx.tries.get(name) {
Some(trie) => trie,
None => {
eprintln!("Trie not found");
continue;
}
};
let count = timed("Query", || {
let mut count = 0;
trie.for_each_seqid(|_, trie| {
trie.traverse(
prefix,
&mut |offset| {
gff.seek_to(*offset).unwrap().unwrap();
println!("{}", gff.read_line().unwrap().unwrap());
count += 1;
},
true,
);
});
count
});
eprintln!("{} lines found", count);
eprintln!(
"Query cost {} bytes",
humanize_size(report_and_reset_reader(gff.borrow_reader()))
);
}
_ => {
eprintln!("Invalid query");
}
}
}
}
}
}

159
src/ds/linkedlist.rs Normal file
View file

@ -0,0 +1,159 @@
use std::{
borrow::Borrow,
ops::{Deref, DerefMut, Index, IndexMut},
};
/// A node of a singly linked list: a payload plus an owned pointer to the following node.
///
/// The node dereferences to its payload, so `*node` reads or writes the stored value.
#[derive(Debug, Clone)]
pub struct Node<T> {
    pub payload: T,
    pub next: Option<Box<Node<T>>>,
}

/// Wraps a bare value in a node that has no successor.
impl<T> From<T> for Node<T> {
    fn from(value: T) -> Self {
        Node {
            payload: value,
            next: None,
        }
    }
}

impl<T> Deref for Node<T> {
    type Target = T;

    fn deref(&self) -> &T {
        let Node { payload, .. } = self;
        payload
    }
}

impl<T> DerefMut for Node<T> {
    fn deref_mut(&mut self) -> &mut T {
        let Node { payload, .. } = self;
        payload
    }
}

impl<T> Node<T> {
    /// Replaces this node's successor with `next` and returns the newly attached node.
    ///
    /// Any previous successor chain is dropped.
    fn attach<P: Into<Node<T>>>(&mut self, next: P) -> &mut Self {
        let boxed = Box::new(next.into());
        &mut **self.next.insert(boxed)
    }
}

/// Collects references to the payloads of `node` and every node after it, in list order.
impl<'a, T> From<&'a Node<T>> for Vec<&'a T> {
    fn from(node: &'a Node<T>) -> Self {
        std::iter::successors(Some(node), |current| {
            current
                .next
                .as_ref()
                .map(|boxed| Borrow::<Node<T>>::borrow(boxed))
        })
        .map(|current| &current.payload)
        .collect()
    }
}
#[derive(Debug, Clone, Default)]
pub struct List<T>(Option<Box<Node<T>>>);
impl<T> List<T> {
pub fn push<P: Into<Node<T>>>(&mut self, node: P) -> &mut Self {
if self.0.is_none() {
let next = node.into();
self.0 = Some(Box::new(next));
self
} else {
let next = node.into();
let mut tail = self.0.as_mut();
while tail.as_ref().map(|t| t.next.is_some()).unwrap_or_default() {
tail = tail.unwrap().next.as_mut();
}
tail.unwrap().attach(next);
self
}
}
}
impl<'a, T> From<&'a List<T>> for Vec<&'a T> {
fn from(list: &'a List<T>) -> Self {
let mut vec = Vec::new();
let mut node = list.0.as_ref();
while let Some(n) = node {
vec.push(&n.payload);
node = n.next.as_ref();
}
vec
}
}
impl<T> Index<usize> for List<T> {
type Output = Node<T>;
fn index(&self, index: usize) -> &Self::Output {
let mut node = self.0.as_ref();
for _ in 0..index {
node = node.as_ref().unwrap().next.as_ref();
}
node.as_ref().unwrap()
}
}
impl<T> IndexMut<usize> for List<T> {
fn index_mut(&mut self, index: usize) -> &mut Self::Output {
let mut node = self.0.as_mut();
for _ in 0..index {
node = node.unwrap().next.as_mut();
}
node.unwrap()
}
}
#[cfg(test)]
mod tests {
use super::*;
// Exercises `List::push` (including chaining), the `Vec<&T>` conversion and indexed mutation.
#[test]
fn test_list() {
// Converts `$from` with `.into()` and asserts the result equals `$to`.
// The throwaway `_tmp` assignments force `&target` and `&$to` to share one type, which is
// what tells the compiler the target type of the otherwise ambiguous `.into()` call.
macro_rules! assert_into {
($from:expr, $to:expr) => {
let target = $from.into();
{
let mut _tmp = &target;
_tmp = &$to;
}
assert_eq!(target, $to);
};
}
let mut list = List::default();
assert!(Into::<Vec<_>>::into(&list).is_empty());
list.push(1);
assert_into!(&list, vec![&1]);
list.push(2).push(3);
assert_into!(&list, vec![&1, &2, &3]);
list.push(4);
assert_into!(&list, vec![&1, &2, &3, &4]);
// Same checks with an owned, non-`Copy` payload type.
let mut str_list = List::default();
let str_1 = String::from("1");
let str_2 = String::from("2");
let str_3 = String::from("3");
assert!(Into::<Vec<_>>::into(&str_list).is_empty());
str_list.push(str_1).push(str_2).push(str_3);
assert_into!(
&str_list,
["1".to_string(), "2".to_string(), "3".to_string()]
.iter()
.collect::<Vec<_>>()
);
// `str_list[1]` is a `Node<String>`; dereferencing it reaches the payload for in-place edits.
(*str_list[1]).push('c');
assert_into!(
&str_list,
["1".to_string(), "2c".to_string(), "3".to_string()]
.iter()
.collect::<Vec<_>>()
);
}
}

3
src/ds/mod.rs Normal file
View file

@ -0,0 +1,3 @@
pub mod linkedlist;
pub mod tree;
pub mod trie;

285
src/ds/tree.rs Normal file
View file

@ -0,0 +1,285 @@
use std::ops::{Index, IndexMut};
use serde::{Deserialize, Serialize};
/// Read-only (plus in-place mutable) lookup of the children of a tree [`Node`].
///
/// `T` is the payload type of the nodes and `I` the key type that addresses a child.
/// Implementors must also be `Default`.
pub trait FindChild<T, I>
where
Self: Default,
{
/// Number of direct children.
fn n_children(&self) -> usize;
/// Calls `f` with the key of each child. The `bool` returned by `f` is handed back to the
/// implementation; the `TrieChildrenHolder` implementation stops at the first `false`.
fn iter_keys<F: FnMut(&I) -> bool>(&self, f: &mut F);
/// Returns the child stored under `index`, if any.
fn find_child(&self, index: &I) -> Option<&Node<T, I, Self>>;
/// Mutable variant of [`FindChild::find_child`].
fn find_child_mut(&mut self, index: &I) -> Option<&mut Node<T, I, Self>>;
}
/// Extension of [`FindChild`] for containers whose set of children can be changed.
pub trait ModifyChild<T, I>
where
Self: FindChild<T, I>,
{
/// Stores `node` as the child addressed by `index`.
// NOTE(review): what happens to an existing child under the same key is up to the implementor.
fn add_child(&mut self, index: I, node: Node<T, I, Self>);
/// Removes the child addressed by `index`.
fn remove_child(&mut self, index: &I);
}
/// A tree node: a payload of type `T` plus a child container `C` keyed by `I`.
#[derive(Debug, Clone)]
pub struct Node<T, I, C>
where
C: FindChild<T, I>,
{
/// Value stored at this node.
pub payload: T,
/// Container holding this node's children.
pub children: C,
// `I` only appears in the `FindChild<T, I>` bound, so a marker keeps the parameter "used".
_marker: std::marker::PhantomData<I>,
}
impl<T, I, C> Serialize for Node<T, I, C>
where
T: Serialize,
C: Serialize + FindChild<T, I>,
{
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::ser::Serializer,
{
(&self.payload, &self.children).serialize(serializer)
}
}
/// Rebuilds a node from the `(payload, children)` tuple produced by its `Serialize` impl.
impl<'de, T, I, C> Deserialize<'de> for Node<T, I, C>
where
    T: Deserialize<'de>,
    C: Deserialize<'de> + FindChild<T, I>,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::de::Deserializer<'de>,
    {
        let parts: (T, C) = Deserialize::deserialize(deserializer)?;
        let (payload, children) = parts;
        Ok(Self {
            payload,
            children,
            _marker: std::marker::PhantomData,
        })
    }
}
/// The default node holds a default payload and an empty (default) child container.
impl<T, I, C> Default for Node<T, I, C>
where
    T: Default,
    C: Default + FindChild<T, I>,
{
    fn default() -> Self {
        let payload = T::default();
        let children = C::default();
        Self {
            payload,
            children,
            _marker: std::marker::PhantomData,
        }
    }
}
impl<T, I, C> Node<T, I, C>
where
C: FindChild<T, I>,
{
#[inline(always)]
pub fn new(payload: T, children: C) -> Self {
Self {
payload,
children,
_marker: std::marker::PhantomData,
}
}
#[inline(always)]
pub fn find_child(&self, index: &I) -> Option<&Node<T, I, C>> {
self.children.find_child(index)
}
#[inline(always)]
pub fn find_child_mut(&mut self, index: &I) -> Option<&mut Node<T, I, C>> {
self.children.find_child_mut(index)
}
#[inline(always)]
pub fn for_each_descendant<F, E>(&self, mut f: F) -> Result<(), E>
where
F: FnMut(&Node<T, I, C>) -> Result<(), E>,
{
let mut stack = vec![self];
while let Some(node) = stack.pop() {
f(node)?;
node.children.iter_keys(&mut |key| {
if let Some(child) = node.find_child(key) {
stack.push(child);
}
true
});
}
Ok(())
}
}
impl<T, I, C> Node<T, I, C>
where
C: FindChild<T, I> + ModifyChild<T, I>,
{
#[inline(always)]
pub fn push(&mut self, index: I, node: Node<T, I, C>) {
self.children.add_child(index, node);
}
}
impl<T, I, C> Node<T, I, C>
where
    I: Clone,
    C: FindChild<T, I> + ModifyChild<T, I>,
{
    /// Returns the child stored under `index`, first creating it with `f` when it is missing.
    ///
    /// `f` is only called when no child exists for `index`.
    #[inline(always)]
    pub fn find_child_or_insert<F>(&mut self, index: I, f: F) -> &mut Node<T, I, C>
    where
        F: FnOnce() -> Node<T, I, C>,
    {
        let missing = self.children.find_child(&index).is_none();
        if missing {
            self.children.add_child(index.clone(), f());
        }
        // The child exists at this point: it was either found or has just been added.
        self.children.find_child_mut(&index).unwrap()
    }
}
/// `node[key]` returns the direct child stored under `key`.
///
/// # Panics
///
/// Panics when there is no child for the key.
impl<T, I, C> Index<I> for Node<T, I, C>
where
    C: FindChild<T, I>,
{
    type Output = Node<T, I, C>;

    #[inline(always)]
    fn index(&self, index: I) -> &Self::Output {
        let child = self.children.find_child(&index);
        child.unwrap()
    }
}
/// Mutable `node[key]` access to a direct child; panics when there is no child for the key.
impl<T, I, C> IndexMut<I> for Node<T, I, C>
where
    C: FindChild<T, I>,
{
    #[inline(always)]
    fn index_mut(&mut self, index: I) -> &mut Self::Output {
        let child = self.children.find_child_mut(&index);
        child.unwrap()
    }
}
/// A tree that may be empty: it is either a root [`Node`] or nothing at all.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Tree<T, I, C>
where
C: FindChild<T, I>,
{
/// Root node; `None` for an empty tree (which is what `Tree::default()` creates).
pub root: Option<Node<T, I, C>>,
}
impl<T, I, C> Tree<T, I, C>
where
C: FindChild<T, I>,
{
pub fn with_root(root: Node<T, I, C>) -> Self {
Self { root: Some(root) }
}
}
impl<T, I, C> Default for Tree<T, I, C>
where
C: FindChild<T, I>,
{
fn default() -> Self {
Self { root: None }
}
}
impl<T, I, C> Tree<T, I, C>
where
C: FindChild<T, I>,
{
#[inline(always)]
pub fn new(root: Option<Node<T, I, C>>) -> Self {
Self { root }
}
#[inline(always)]
pub fn traverse<'a, 'b>(
&'a self,
keys: impl Iterator<Item = &'b I>,
) -> Option<&'a Node<T, I, C>>
where
I: 'b,
{
let mut node = self.root.as_ref()?;
for key in keys {
node = node.find_child(key)?;
}
Some(node)
}
#[inline(always)]
pub fn traverse_mut<'a, 'b>(
&'a mut self,
keys: impl Iterator<Item = &'b I>,
) -> Option<&'a mut Node<T, I, C>>
where
I: 'b,
{
let mut node = self.root.as_mut()?;
for key in keys {
node = node.find_child_mut(key)?;
}
Some(node)
}
}
impl<T, I, C> Tree<T, I, C>
where
    I: Clone,
    C: FindChild<T, I> + ModifyChild<T, I>,
{
    /// Follows `keys` from the root and returns the node reached, creating every missing node
    /// on the way with `f`.
    ///
    /// An empty tree (such as `Tree::default()`) gets a root created by `f` first, so this
    /// never panics because of a missing root. An empty key sequence yields the root.
    #[inline(always)]
    pub fn traverse_or_insert<'a, 'b, F>(
        &'a mut self,
        keys: impl Iterator<Item = &'b I>,
        f: &F,
    ) -> &'a mut Node<T, I, C>
    where
        F: Fn() -> Node<T, I, C>,
        I: 'b,
    {
        // `&F` is itself callable, so it can be passed wherever a `FnOnce` is expected.
        let mut node = self.root.get_or_insert_with(f);
        for key in keys {
            node = node.find_child_or_insert(key.clone(), f);
        }
        node
    }
}
impl<T, I, C> Tree<T, I, C>
where
    I: Clone + Default,
    C: FindChild<T, I>,
{
    /// Visits every node depth first, passing `f` the key path of the node and the node itself.
    ///
    /// The path always starts with `I::default()` standing in for the root, followed by the
    /// keys leading to the node. The first error returned by `f` stops the traversal and is
    /// returned. An empty tree visits nothing and returns `Ok(())`.
    #[inline(always)]
    pub fn for_each<F, E>(&self, mut f: F) -> Result<(), E>
    where
        F: FnMut(&[I], &Node<T, I, C>) -> Result<(), E>,
    {
        let Some(root) = self.root.as_ref() else {
            return Ok(());
        };
        let mut stack = vec![(vec![I::default()], root)];
        while let Some((keys, node)) = stack.pop() {
            f(&keys, node)?;
            node.children.iter_keys(&mut |key| {
                // Skip keys without a matching child instead of panicking, in line with
                // `Node::for_each_descendant`.
                if let Some(child) = node.find_child(key) {
                    let mut child_path = Vec::with_capacity(keys.len() + 1);
                    child_path.extend_from_slice(&keys);
                    child_path.push(key.clone());
                    stack.push((child_path, child));
                }
                true
            });
        }
        Ok(())
    }
}

436
src/ds/trie.rs Normal file
View file

@ -0,0 +1,436 @@
use std::{hash::Hash, ops::Deref};
use hashbrown::HashMap;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use super::tree::{FindChild, ModifyChild, Node, Tree};
/// Something that can be split into a sequence of trie key elements of type `C`.
pub trait ToTrieKey<C>
where
    C: Eq,
{
    /// Yields the key elements in order, one per trie level.
    fn to_trie_key(&self) -> impl Iterator<Item = C>;
}

/// An owned string is keyed by its `char`s.
impl ToTrieKey<char> for String {
    fn to_trie_key(&self) -> impl Iterator<Item = char> {
        self.as_str().chars()
    }
}

/// A string slice is keyed by its `char`s.
impl ToTrieKey<char> for &str {
    fn to_trie_key(&self) -> impl Iterator<Item = char> {
        (*self).chars()
    }
}

/// A reference to anything string-like is keyed by the `char`s of its string view.
impl<S> ToTrieKey<char> for &S
where
    S: AsRef<str>,
{
    fn to_trie_key(&self) -> impl Iterator<Item = char> {
        AsRef::<str>::as_ref(*self).chars()
    }
}

/// A slice of `Copy` elements is keyed by its elements.
impl<T> ToTrieKey<T> for &[T]
where
    T: Copy + Ord,
{
    fn to_trie_key(&self) -> impl Iterator<Item = T> {
        self.iter().copied()
    }
}
#[derive(Debug, Clone)]
pub struct TrieChildrenHolder<T, C>(HashMap<C, Node<T, C, Self>>)
where
C: Eq + Hash + Serialize + DeserializeOwned,
T: Serialize + DeserializeOwned;
impl<T, C> Serialize for TrieChildrenHolder<T, C>
where
C: Eq + Hash + Serialize + DeserializeOwned,
T: Serialize + DeserializeOwned,
{
fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
self.0.serialize(serializer)
}
}
impl<'de, T, C> Deserialize<'de> for TrieChildrenHolder<T, C>
where
C: Eq + Hash + Serialize + DeserializeOwned,
T: Serialize + DeserializeOwned,
{
fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
Ok(Self(Deserialize::deserialize(deserializer)?))
}
}
impl<T, C> Default for TrieChildrenHolder<T, C>
where
C: Eq + Hash + Serialize + DeserializeOwned,
T: Serialize + DeserializeOwned,
{
fn default() -> Self {
Self(HashMap::new())
}
}
impl<T, C> FindChild<T, C> for TrieChildrenHolder<T, C>
where
C: Eq + Hash + Serialize + DeserializeOwned,
T: Serialize + DeserializeOwned,
{
#[inline(always)]
fn n_children(&self) -> usize {
self.0.len()
}
#[inline(always)]
fn iter_keys<F: FnMut(&C) -> bool>(&self, f: &mut F) {
let keys = self.0.keys();
for key in keys {
if !f(key) {
break;
}
}
}
#[inline(always)]
fn find_child(&self, index: &C) -> Option<&Node<T, C, Self>> {
self.0.get(index)
}
#[inline(always)]
fn find_child_mut(&mut self, index: &C) -> Option<&mut Node<T, C, Self>> {
self.0.get_mut(index)
}
}
impl<T, C> ModifyChild<T, C> for TrieChildrenHolder<T, C>
where
C: Eq + Hash + Serialize + DeserializeOwned,
T: Serialize + DeserializeOwned,
{
#[inline(always)]
fn add_child(&mut self, index: C, node: Node<T, C, Self>) {
self.0.insert(index, node);
}
#[inline(always)]
fn remove_child(&mut self, index: &C) {
self.0.remove(index);
}
}
#[derive(Debug, Clone)]
pub struct Trie<T, C>
where
C: Eq + Hash + Serialize + DeserializeOwned,
T: Serialize + DeserializeOwned,
{
tree: Tree<Vec<T>, C, TrieChildrenHolder<Vec<T>, C>>,
}
impl<T, C> Serialize for Trie<T, C>
where
C: Eq + Hash + Serialize + DeserializeOwned,
T: Serialize + DeserializeOwned,
{
fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
self.tree.serialize(serializer)
}
}
impl<'de, T, C> Deserialize<'de> for Trie<T, C>
where
C: Eq + Hash + Serialize + DeserializeOwned,
T: Serialize + DeserializeOwned,
{
fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
Ok(Self {
tree: Deserialize::deserialize(deserializer)?,
})
}
}
impl<T, C> Default for Trie<T, C>
where
C: Eq + Hash + Serialize + DeserializeOwned,
T: Serialize + DeserializeOwned,
{
fn default() -> Self {
Self {
tree: Tree::default(),
}
}
}
impl<T, C> Deref for Trie<T, C>
where
C: Eq + Hash + Serialize + DeserializeOwned,
T: Serialize + DeserializeOwned,
{
type Target = Tree<Vec<T>, C, TrieChildrenHolder<Vec<T>, C>>;
fn deref(&self) -> &Self::Target {
&self.tree
}
}
impl<T, C> Trie<T, C>
where
C: Eq + Hash + Clone + Serialize + DeserializeOwned,
T: Serialize + DeserializeOwned,
{
#[inline]
pub fn traverse<S: ToTrieKey<C>, F>(&self, key: S, f: &mut F, include_children: bool)
where
F: FnMut(&T),
{
let mut current = self.tree.root.as_ref();
for k in key.to_trie_key() {
if let Some(node) = current.and_then(|n| n.find_child(&k)) {
current = Some(node);
} else {
return;
}
}
if include_children {
let mut stack = vec![current];
while let Some(node) = stack.pop() {
if let Some(node) = node {
for payload in &node.payload {
f(payload);
}
for child_key in node.children.0.keys() {
stack.push(node.find_child(child_key));
}
}
}
} else if let Some(node) = current {
for payload in &node.payload {
f(payload);
}
}
}
#[inline]
pub fn test<S: ToTrieKey<C>>(&self, key: S, include_children: bool) -> usize {
let mut current = self.tree.root.as_ref();
for k in key.to_trie_key() {
if let Some(node) = current.and_then(|n| n.find_child(&k)) {
current = Some(node);
} else {
return 0;
}
}
if include_children {
let mut count = 0;
let mut stack = vec![current];
while let Some(node) = stack.pop() {
count += node.map_or(0, |n| n.payload.len());
if let Some(node) = node {
for child_key in node.children.0.keys() {
stack.push(node.find_child(child_key));
}
}
}
count
} else {
current.map_or(0, |n| n.payload.len())
}
}
#[inline]
pub fn insert<S: ToTrieKey<C>>(&mut self, key: S, value: T) {
let mut current = self
.tree
.root
.get_or_insert_with(|| Node::new(Vec::new(), TrieChildrenHolder::default()));
for k in key.to_trie_key() {
if current.find_child(&k).is_none() {
let node = Node::new(Vec::new(), TrieChildrenHolder::default());
current.push(k.clone(), node);
}
current = current.find_child_mut(&k).unwrap();
}
current.payload.push(value);
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Two unrelated keys.
    const FLAT: [(&str, i32); 2] = [("hello", 1), ("world", 2)];
    /// Same as `FLAT` plus a key that extends "hello".
    const NESTED: [(&str, i32); 3] = [("hello", 1), ("world", 2), ("hello world", 3)];

    /// Builds a trie by inserting the `(key, value)` pairs in order.
    fn trie_of(entries: &[(&str, i32)]) -> Trie<i32, char> {
        let mut trie = Trie::default();
        for &(key, value) in entries {
            trie.insert(key, value);
        }
        trie
    }

    /// Runs `traverse` and returns the visited values in visiting order.
    fn collect(trie: &Trie<i32, char>, key: &str, include_children: bool) -> Vec<i32> {
        let mut seen = Vec::new();
        trie.traverse(key, &mut |value: &i32| seen.push(*value), include_children);
        seen
    }

    #[test]
    fn test_traverse_existing_key() {
        let trie = trie_of(&FLAT);
        assert_eq!(collect(&trie, "hello", false), vec![1]);
    }

    #[test]
    fn test_traverse_non_existing_key() {
        let trie = trie_of(&FLAT);
        assert_eq!(collect(&trie, "foo", false), Vec::<i32>::new());
    }

    #[test]
    fn test_test_existing_key() {
        let trie = trie_of(&FLAT);
        assert_eq!(trie.test("hello", false), 1);
    }

    #[test]
    fn test_test_non_existing_key() {
        let trie = trie_of(&FLAT);
        assert_eq!(trie.test("foo", false), 0);
    }

    #[test]
    fn test_insert_existing_key() {
        let mut trie = trie_of(&FLAT);
        trie.insert("hello", 3);
        assert_eq!(trie.test("hello", false), 2);
    }

    #[test]
    fn test_insert_new_key() {
        let mut trie = trie_of(&FLAT);
        trie.insert("foo", 3);
        assert_eq!(trie.test("foo", false), 1);
    }

    #[test]
    fn test_traverse_existing_key_with_children() {
        let trie = trie_of(&NESTED);
        assert_eq!(collect(&trie, "hello", true), vec![1, 3]);
    }

    #[test]
    fn test_traverse_non_existing_key_with_children() {
        let trie = trie_of(&NESTED);
        assert_eq!(collect(&trie, "foo", true), Vec::<i32>::new());
    }

    #[test]
    fn test_test_existing_key_with_children() {
        let trie = trie_of(&NESTED);
        assert_eq!(trie.test("hello", true), 2);
    }

    #[test]
    fn test_test_non_existing_key_with_children() {
        let trie = trie_of(&NESTED);
        assert_eq!(trie.test("foo", true), 0);
    }

    #[test]
    fn test_insert_existing_key_with_children() {
        let mut trie = trie_of(&NESTED);
        trie.insert("hello", 4);
        assert_eq!(trie.test("hello", true), 3);
    }

    #[test]
    fn test_insert_new_key_with_children() {
        let mut trie = trie_of(&NESTED);
        trie.insert("foo", 4);
        assert_eq!(trie.test("foo", true), 1);
    }
}

395
src/gff3.rs Normal file
View file

@ -0,0 +1,395 @@
use itertools::Itertools;
use std::borrow::Cow;
use std::fmt::Display;
use std::fmt::Write;
use crate::{Error, ParseErr};
/// One line of a GFF3 file.
///
/// Text fields are `Cow`s so a line can either borrow from the buffer it was parsed from or
/// own its data (see [`Gff3Line::to_static`]).
#[derive(Debug, Clone, PartialEq)]
pub enum Gff3Line<'a> {
    /// A comment line.
    Comment(std::borrow::Cow<'a, str>),
    /// A directive line.
    Directive(std::borrow::Cow<'a, str>),
    /// A feature line.
    Feature {
        seqid: std::borrow::Cow<'a, str>,
        source: std::borrow::Cow<'a, str>,
        type_: std::borrow::Cow<'a, str>,
        start: u64,
        end: u64,
        /// `None` when no score is given.
        score: Option<f64>,
        /// `None` when no strand is given.
        strand: Option<char>,
        /// `None` when no phase is given.
        phase: Option<u64>,
        /// `(key, value)` pairs in the order they appear on the line.
        attributes: Vec<(std::borrow::Cow<'a, str>, std::borrow::Cow<'a, str>)>,
    },
}

impl<'a> Gff3Line<'a> {
    /// Returns the value of attribute `key` of a feature line.
    ///
    /// Returns `None` for comments and directives and when the feature has no such attribute.
    /// If a key were to occur more than once, the first occurrence wins; debug builds assert
    /// that this does not happen.
    pub fn get_attr(&'a self, key: &str) -> Option<&'a str> {
        let Gff3Line::Feature { attributes, .. } = self else {
            return None;
        };
        // Only the looked-up key has to be unique for the result to be unambiguous. Checking
        // "number of distinct booleans == number of attributes" would fire on every feature
        // with three or more attributes.
        debug_assert!(
            attributes.iter().filter(|(k, _)| k == key).count() <= 1,
            "attribute key {:?} occurs more than once",
            key
        );
        attributes
            .iter()
            .find(|(k, _)| k == key)
            .map(|(_, v)| &**v)
    }

    /// Returns a copy of the line that owns all of its text and therefore borrows nothing.
    pub fn to_static(&self) -> Gff3Line<'static> {
        fn own(text: &str) -> Cow<'static, str> {
            Cow::Owned(text.to_owned())
        }

        match self {
            Gff3Line::Comment(s) => Gff3Line::Comment(own(s)),
            Gff3Line::Directive(s) => Gff3Line::Directive(own(s)),
            Gff3Line::Feature {
                seqid,
                source,
                type_,
                start,
                end,
                score,
                strand,
                phase,
                attributes,
            } => Gff3Line::Feature {
                seqid: own(seqid),
                source: own(source),
                type_: own(type_),
                start: *start,
                end: *end,
                score: *score,
                strand: *strand,
                phase: *phase,
                attributes: attributes.iter().map(|(k, v)| (own(k), own(v))).collect(),
            },
        }
    }
}
/// Percent-encodes `s`.
///
/// ASCII letters, digits and `-`, `_`, `.`, `~` are kept; every other character is replaced by
/// `%XX` for each byte of its UTF-8 encoding (upper-case hex). When nothing needs escaping the
/// input is returned borrowed and no allocation takes place.
#[inline]
fn url_encode(s: &str) -> Cow<'_, str> {
    #[inline]
    fn is_unreserved(c: char) -> bool {
        matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_' | '.' | '~')
    }

    // Fast path: look for the first character that needs escaping before allocating anything.
    let Some(first_escaped) = s.find(|c: char| !is_unreserved(c)) else {
        return Cow::Borrowed(s);
    };

    let mut encoded = String::with_capacity(s.len() * 2);
    // Everything before the first escaped character is copied verbatim.
    encoded.push_str(&s[..first_escaped]);
    for c in s[first_escaped..].chars() {
        if is_unreserved(c) {
            encoded.push(c);
        } else {
            let mut utf8 = [0; 4];
            for byte in c.encode_utf8(&mut utf8).bytes() {
                // Writing into a `String` cannot fail.
                write!(encoded, "%{:02X}", byte).unwrap();
            }
        }
    }
    encoded.shrink_to_fit();
    Cow::Owned(encoded)
}
/// Decodes `%XX` percent-escapes in `s`.
///
/// Input without any `%` is returned borrowed. Decoding is lenient, because
/// the input comes straight from user-supplied files:
///
/// * a `%` that is not followed by two hexadecimal digits is kept literally
///   instead of panicking;
/// * if the decoded bytes are not valid UTF-8, invalid sequences are replaced
///   with U+FFFD rather than constructing an invalid `String`.
#[inline]
fn url_decode(s: &str) -> Cow<'_, str> {
    /// Value of a single ASCII hexadecimal digit, if `byte` is one.
    #[inline]
    fn hex_value(byte: u8) -> Option<u8> {
        char::from(byte).to_digit(16).map(|digit| digit as u8)
    }

    if !s.contains('%') {
        return Cow::Borrowed(s);
    }

    let bytes = s.as_bytes();
    let mut buf = Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        // `Some(byte)` only for a complete, well-formed `%XX` escape.
        let escaped = if bytes[i] == b'%' {
            let hi = bytes.get(i + 1).copied().and_then(hex_value);
            let lo = bytes.get(i + 2).copied().and_then(hex_value);
            hi.zip(lo).map(|(hi, lo)| (hi << 4) | lo)
        } else {
            None
        };
        match escaped {
            Some(byte) => {
                buf.push(byte);
                i += 3;
            }
            None => {
                buf.push(bytes[i]);
                i += 1;
            }
        }
    }

    // Escapes can encode arbitrary bytes, so the result must be validated.
    let decoded = match String::from_utf8(buf) {
        Ok(text) => text,
        Err(err) => String::from_utf8_lossy(err.as_bytes()).into_owned(),
    };
    Cow::Owned(decoded)
}
impl<'a> Gff3Line<'a> {
    /// Parses a single line of GFF3 text (without its line terminator).
    ///
    /// * `##...` becomes a [`Gff3Line::Directive`], `#...` a
    ///   [`Gff3Line::Comment`]; both borrow the remainder of the line.
    /// * Anything else is split on TAB into the feature columns. `seqid`,
    ///   `source` and the attribute keys/values are percent-decoded; a `.` in
    ///   the score, strand or phase column (or a missing column) yields `None`.
    /// * Empty attribute segments (for example from a trailing `;`) are
    ///   skipped instead of being reported as an error.
    ///
    /// # Errors
    /// Returns `ParseErr::MissingField` when a mandatory column or an
    /// attribute's `=value` part is absent, and `ParseErr::ParseInt` /
    /// `ParseErr::ParseFloat` when a numeric column does not parse.
    #[inline]
    pub fn parse_str(s: &'a str) -> Result<Self, ParseErr> {
        // `##` must be tested first because it also starts with `#`.
        if let Some(directive) = s.strip_prefix("##") {
            return Ok(Gff3Line::Directive(directive.into()));
        }
        if let Some(comment) = s.strip_prefix('#') {
            return Ok(Gff3Line::Comment(comment.into()));
        }

        let mut fields = s.split('\t');
        let seqid = fields.next().ok_or(ParseErr::MissingField("seqid"))?;
        let source = fields.next().ok_or(ParseErr::MissingField("source"))?;
        let type_ = fields.next().ok_or(ParseErr::MissingField("type"))?.into();
        let start = fields
            .next()
            .ok_or(ParseErr::MissingField("start"))?
            .parse()
            .map_err(ParseErr::ParseInt)?;
        let end = fields
            .next()
            .ok_or(ParseErr::MissingField("end"))?
            .parse()
            .map_err(ParseErr::ParseInt)?;
        let score = match fields.next() {
            Some(".") | None => None,
            Some(text) => Some(text.parse().map_err(ParseErr::ParseFloat)?),
        };
        let strand = match fields.next() {
            Some(".") | None => None,
            Some(text) => Some(
                text.chars()
                    .next()
                    .ok_or(ParseErr::MissingField("strand"))?,
            ),
        };
        let phase = match fields.next() {
            Some(".") | None => None,
            Some(text) => Some(text.parse().map_err(ParseErr::ParseInt)?),
        };

        let attr_column = fields
            .next()
            .ok_or(ParseErr::MissingField("attributes"))?;
        let mut attributes = Vec::with_capacity(32);
        // Real-world files frequently end the column with `;`, which would
        // otherwise produce an empty segment with no `=`.
        for attr_str in attr_column.split(';').filter(|seg| !seg.is_empty()) {
            let (key, value) = attr_str
                .split_once('=')
                .ok_or(ParseErr::MissingField("attribute value"))?;
            attributes.push((url_decode(key), url_decode(value)));
        }

        Ok(Gff3Line::Feature {
            seqid: url_decode(seqid),
            source: url_decode(source),
            type_,
            start,
            end,
            score,
            strand,
            phase,
            attributes,
        })
    }
}
macro_rules! gff3_display {
($name:ident) => {
#[inline]
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
$name::Comment(s) => write!(f, "#{}", s),
$name::Directive(s) => write!(f, "##{}", s),
$name::Feature {
seqid,
source,
type_,
start,
end,
score,
strand,
phase,
attributes,
} => {
write!(
f,
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
url_encode(seqid),
url_encode(source),
type_,
start,
end,
score
.map(|s| s.to_string())
.unwrap_or_else(|| ".".to_string()),
strand
.map(|s| s.to_string())
.unwrap_or_else(|| ".".to_string()),
phase
.map(|s| s.to_string())
.unwrap_or_else(|| ".".to_string()),
attributes
.iter()
.map(|(k, v)| format!("{}={}", url_encode(k), url_encode(v)))
.collect::<Vec<_>>()
.join(";")
)
}
}
}
};
}
impl<'a> Display for Gff3Line<'a> {
gff3_display!(Gff3Line);
}
/// A source of GFF3 lines.
///
/// Only `read_line` is required. The other methods have defaults that report
/// "not supported" (`None`), which suits plain non-seekable streams.
pub trait Gff3Read<E: std::error::Error> {
/// Total size of the input, if known. Defaults to `None`.
/// The index builders use it as the length of their progress bar.
fn size(&self) -> Option<usize> {
None
}
/// Current position in the input, if the reader tracks one. Defaults to `None`.
fn offset(&mut self) -> Option<u64> {
None
}
/// Repositions the reader to `_pos`. The default does nothing and returns
/// `Ok(None)`.
/// NOTE(review): presumably `Ok(Some(()))` signals a successful seek — confirm against implementors.
fn seek_to(&mut self, _pos: u64) -> Result<Option<()>, Error<E>> {
Ok(None)
}
/// Reads the next line. Callers in this crate treat `Ok(None)` as end of input.
fn read_line(&mut self) -> Result<Option<Gff3Line>, Error<E>>;
}
/// Asynchronous counterpart of `Gff3Read`, compiled only with the `async` feature.
#[cfg(feature = "async")]
pub trait Gff3ReadAsync<E: std::error::Error> {
/// Current position in the input, if the reader tracks one. Defaults to `None`.
fn offset(&mut self) -> Option<u64> {
None
}
/// Reads the next line; the returned future must be `Send`.
fn read_line_async(
&mut self,
) -> impl std::future::Future<Output = Result<Option<Gff3Line>, Error<E>>> + Send;
}
/// A sink that accepts GFF3 lines.
pub trait Gff3Write<E: std::error::Error> {
/// Writes one line to the sink.
fn write_line(&mut self, line: &Gff3Line) -> Result<(), Error<E>>;
}
/// Asynchronous counterpart of `Gff3Write`, compiled only with the `async` feature.
#[cfg(feature = "async")]
pub trait Gff3WriteAsync<E: std::error::Error> {
/// Writes one line to the sink; the returned future must be `Send`.
fn write_line_async(
&mut self,
line: &Gff3Line,
) -> impl std::future::Future<Output = Result<(), Error<E>>> + Send;
}
#[cfg(test)]
mod tests {
use super::*;
/// Test helper: reports whether `cow` is the `Borrowed` variant.
fn cow_is_borrowed<T: ToOwned + ?Sized>(cow: &std::borrow::Cow<'_, T>) -> bool {
    matches!(cow, std::borrow::Cow::Borrowed(_))
}
// Parses a directive, a comment and two feature lines and compares the result
// with the expected `Gff3Line` values, field by field.
#[test]
fn test_parse_gff3_line() {
// NOTE(review): `parse_str` splits feature columns on TAB, so the two feature
// lines inside this raw string must be TAB-separated — confirm the literal
// contains tabs and not spaces.
let lines: Vec<Gff3Line> = r#"
##gff-version 3
#format: gff3
chr1 HAVANA gene 11869 14409 . + . ID=ENSG00000290825.1;gene_id=ENSG00000290825.1;gene_type=lncRNA;gene_name=DDX11L2;level=2;tag=overlaps_pseudogene
chr1 HAVANA transcript 11869 14409 . + . ID=ENST00000456328.2;Parent=ENSG00000290825.1;gene_id=ENSG00000290825.1;transcript_id=ENST00000456328.2;gene_type=lncRNA;gene_name=DDX11L2;transcript_type=lncRNA;transcript_name=DDX11L2-202;level=2;transcript_support_level=1;tag=basic,Ensembl_canonical;havana_transcript=OTTHUMT00000362751.1
"#.trim().lines().map(Gff3Line::parse_str).collect::<Result<Vec<_>, _>>().unwrap();
assert_eq!(
lines,
vec![
Gff3Line::Directive("gff-version 3".into()),
Gff3Line::Comment("format: gff3".into()),
Gff3Line::Feature {
seqid: "chr1".into(),
source: "HAVANA".into(),
type_: "gene".into(),
start: 11869,
end: 14409,
score: None,
strand: Some('+'),
phase: None,
attributes: vec![
("ID".into(), "ENSG00000290825.1".into()),
("gene_id".into(), "ENSG00000290825.1".into()),
("gene_type".into(), "lncRNA".into()),
("gene_name".into(), "DDX11L2".into()),
("level".into(), "2".into()),
("tag".into(), "overlaps_pseudogene".into())
]
},
Gff3Line::Feature {
seqid: "chr1".into(),
source: "HAVANA".into(),
type_: "transcript".into(),
start: 11869,
end: 14409,
score: None,
strand: Some('+'),
phase: None,
attributes: vec![
("ID".into(), "ENST00000456328.2".into()),
("Parent".into(), "ENSG00000290825.1".into()),
("gene_id".into(), "ENSG00000290825.1".into()),
("transcript_id".into(), "ENST00000456328.2".into()),
("gene_type".into(), "lncRNA".into()),
("gene_name".into(), "DDX11L2".into()),
("transcript_type".into(), "lncRNA".into()),
("transcript_name".into(), "DDX11L2-202".into()),
("level".into(), "2".into()),
("transcript_support_level".into(), "1".into()),
("tag".into(), "basic,Ensembl_canonical".into()),
("havana_transcript".into(), "OTTHUMT00000362751.1".into())
]
}
]
);
}
// Checks percent-encoding of ASCII punctuation, a full URL and a multi-byte
// character, and that the result only allocates when something was escaped.
#[test]
fn test_url_encode() {
assert_eq!(url_encode("Hello, world!"), "Hello%2C%20world%21");
assert_eq!(
url_encode("https://example.com/?q=hello world"),
"https%3A%2F%2Fexample.com%2F%3Fq%3Dhello%20world"
);
assert_eq!(url_encode("abc123"), "abc123");
// '€' is three UTF-8 bytes, each of which is escaped separately.
assert_eq!(url_encode("!€,"), "%21%E2%82%AC%2C");
// Nothing to escape: the input must be returned borrowed.
assert!(cow_is_borrowed(&url_encode("xxx")));
assert!(!cow_is_borrowed(&url_encode("x x")));
}
// Checks percent-decoding, including a multi-byte UTF-8 sequence, and that
// input without escapes is returned borrowed.
#[test]
fn test_url_decode() {
assert_eq!(url_decode("Hello,%20world%21"), "Hello, world!");
assert_eq!(
url_decode("https%3A%2F%2Fexample.com%2F%3Fq%3Dhello%20world"),
"https://example.com/?q=hello world"
);
assert_eq!(url_decode("abc123"), "abc123");
// Three consecutive escapes that together form the UTF-8 encoding of '€'.
assert_eq!(url_decode("%21%E2%82%AC%2C"), "!€,");
// No '%' present: the input must be returned borrowed.
assert!(cow_is_borrowed(&url_decode("xxx")));
assert!(!cow_is_borrowed(&url_decode("x%20x")));
}
}

128
src/index/mod.rs Normal file
View file

@ -0,0 +1,128 @@
use std::{cell::UnsafeCell, sync::Mutex};
use hashbrown::HashMap;
use indicatif::{MultiProgress, ProgressBar};
use range::RangeIndex;
use relation::RelationIndex;
use serde::{Deserialize, Serialize};
use trie::TrieIndex;
use crate::{
bar::style_bar,
gff3::{Gff3Line, Gff3Read},
io::tee::Gff3BroadcastRead,
Error,
};
pub mod range;
pub mod relation;
pub mod trie;
/// The complete set of indexes built for one GFF3 file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Gff3Index {
/// Optional digest string; `build` leaves it `None`.
/// NOTE(review): presumably a checksum of the indexed file — confirm with the code that sets it.
pub digest: Option<String>,
/// Coordinate-range index, if one was built.
pub range_index: Option<RangeIndex>,
/// Parent/child relation index, if one was built.
pub relation_index: Option<RelationIndex>,
/// Trie indexes keyed by the name given in `build`'s `trie_def`.
pub tries: HashMap<String, TrieIndex<char>>,
}
/// An `UnsafeCell` wrapper that is unconditionally `Sync`.
///
/// NOTE(review): the `Sync` impl has no `T: Sync` bound and nothing here
/// enforces exclusive access; soundness depends entirely on how callers use
/// the cell. Confirm every use site guarantees non-overlapping access.
pub struct SyncUnsafeCell<T>(UnsafeCell<T>);
unsafe impl<T> Sync for SyncUnsafeCell<T> {}
/// Function that extracts, from a line, the string a trie index is keyed on.
pub type Gff3AttrExtractFunc = for<'a> fn(&'a Gff3Line) -> Option<&'a str>;
impl Gff3Index {
/// Builds the range index, the relation index and one trie index per
/// `trie_def` entry inside a single `rayon::scope`.
///
/// * `reader_factory` is called several times: once for the trie broadcast
///   reader, once per trie to query the input size, and once each for the
///   range and relation builders.
/// * `by` is forwarded to `RangeIndex::build`.
/// * `trie_def` pairs an index name with the function that extracts the key.
///
/// Errors from the range and relation builders are collected: a single error
/// is returned as is, several are wrapped in `Error::Multiple`.
#[cfg(feature = "rayon")]
pub fn build<R, F, E>(
reader_factory: &F,
by: u64,
trie_def: &[(&str, Gff3AttrExtractFunc)],
) -> Result<Self, Error<E>>
where
E: std::error::Error + Send + Sync,
R: Gff3Read<E> + Send,
F: Fn() -> R + Send + Sync,
{
// Chunk size handed to the broadcast reader.
let chunk_size = 1024;
let mut trie_broadcast = Gff3BroadcastRead::new(reader_factory(), chunk_size);
let mp = MultiProgress::new();
// One (name, extractor, trie under construction, progress bar) tuple per trie.
let mut trie_builders = trie_def
.iter()
.map(|(name, get_attr)| {
(
*name,
Some(*get_attr),
SyncUnsafeCell(UnsafeCell::new(TrieIndex::<char>::new(name.to_string()))),
// NOTE(review): a fresh reader is constructed here for every trie
// only to ask for its size.
if let Some(size) = reader_factory().size() {
let pg = ProgressBar::new(size as u64);
pg.set_prefix(format!("Trie Index: {}", name));
style_bar(&pg, true);
mp.add(pg)
} else {
let pg = ProgressBar::new_spinner();
pg.set_prefix(format!("Trie Index: {}", name));
style_bar(&pg, false);
mp.add(pg)
},
)
})
.collect::<Vec<_>>();
// Errors reported by the spawned builders.
let errors = Mutex::new(Vec::new());
let mut ret = Gff3Index {
digest: None,
range_index: None,
relation_index: None,
tries: HashMap::new(),
};
rayon::scope(|s| {
// Range index, built from its own reader.
s.spawn(|_| {
match RangeIndex::build(reader_factory(), by, Some(&mp)) {
Ok(idx) => ret.range_index = Some(idx),
Err(e) => errors.lock().unwrap().push(e),
};
});
// Relation index, built from its own reader.
s.spawn(|_| {
match RelationIndex::build(reader_factory(), Some(&mp)) {
Ok(idx) => ret.relation_index = Some(idx),
Err(e) => errors.lock().unwrap().push(e),
};
});
// Register one broadcast channel per trie; each channel feeds every
// (offset, line) pair of a chunk into that trie.
for (_, get_attr, trie, pg) in trie_builders.iter_mut() {
// NOTE(review): obtains `&mut TrieIndex` through the UnsafeCell; each trie
// is handed to exactly one channel closure in this loop — confirm the
// broadcast reader never runs one closure concurrently with itself.
let idx = unsafe { trie.0.get().as_mut().unwrap() };
let get_attr = get_attr.take().unwrap();
trie_broadcast.add_channel(Box::new(move |offset, chunk| {
for (o, line) in chunk {
idx.process_line(*o, line, get_attr);
}
pg.set_position(offset);
}));
}
// NOTE(review): a failure of the broadcast reader panics here instead of
// being pushed to `errors` like the other builders' failures.
s.spawn(move |_| {
trie_broadcast.run().unwrap();
});
});
let mut errors = errors.into_inner().unwrap();
if errors.len() == 1 {
Err(errors.pop().unwrap())
} else if !errors.is_empty() {
Err(Error::Multiple(errors))
} else {
// Success: finish the progress bars and move the finished tries into the result.
for (name, _, trie, pg) in trie_builders {
pg.finish();
ret.tries.insert(name.to_string(), trie.0.into_inner());
}
Ok(ret)
}
}
}

179
src/index/range.rs Normal file
View file

@ -0,0 +1,179 @@
use std::borrow::Borrow;
use indicatif::{MultiProgress, ProgressBar};
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use crate::{
bar::style_bar,
gff3::{Gff3Line, Gff3Read},
Error,
};
/// Coarse coordinate index: for each sequence, a list of split points that map
/// a feature start coordinate to a reader offset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RangeIndex {
// (sequence id, per-sequence index), in the order sequences were first seen.
by_seq: Vec<(String, SeqRangeIndex)>,
}
/// Split points of a single sequence.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SeqRangeIndex {
// (start coordinate, reader offset) pairs; the first entry has coordinate 0.
by_range: Vec<(u64, u64)>,
}
/// Default spacing between split points.
/// NOTE(review): presumably the default for `build`'s `by` argument — confirm at the call site.
pub const DEFAULT_RANGE_INTERVAL: u64 = 2048;
impl RangeIndex {
/// Finds the span of reader offsets that may contain features of `seqid`
/// overlapping `begin..end`.
///
/// Each pair of consecutive split points forms a coordinate window. All
/// windows that overlap the query are merged and the smallest start offset
/// and largest end offset are returned. Returns `None` when the sequence is
/// unknown, when no window overlaps, or when the merged span is empty.
///
/// NOTE(review): coordinates beyond the last split point of a sequence belong
/// to no window and therefore never match.
pub fn query(&self, seqid: &str, begin: u64, end: u64) -> Option<(u64, u64)> {
    let (_, seq_index) = self.by_seq.iter().find(|(id, _)| id == seqid)?;
    let query = begin..end;
    seq_index
        .by_range
        .iter()
        .tuple_windows()
        .filter_map(|((prev, prev_offset), (split_point, offset))| {
            let window = *prev..*split_point;
            let overlaps = window.contains(&begin)
                || window.contains(&end)
                || query.contains(prev)
                || query.contains(split_point);
            overlaps.then_some((*prev_offset, *offset))
        })
        .fold(None, |merged: Option<(u64, u64)>, (lo, hi)| {
            Some(match merged {
                Some((min_lo, max_hi)) => (min_lo.min(lo), max_hi.max(hi)),
                None => (lo, hi),
            })
        })
        // Compare instead of subtracting: `lo - hi` on `u64` underflows
        // whenever the span is non-empty.
        .filter(|(lo, hi)| hi > lo)
}
/// Invokes `cb` for every feature of `seqid_in` that overlaps `begin..end`.
///
/// The offset span is obtained from [`RangeIndex::query`]; the reader is
/// positioned at its start and lines are read until the reader's offset
/// reaches the end of the span or the input is exhausted. Nothing is read
/// when the index has no matching span.
///
/// # Errors
/// Returns `Error::Unseekable` if the reader cannot report its offset, and
/// propagates any error from seeking or reading.
pub fn query_lines<R, E, F>(
    &self,
    reader: &mut R,
    seqid_in: &str,
    begin: u64,
    end: u64,
    cb: &mut F,
) -> Result<(), Error<E>>
where
    R: Gff3Read<E>,
    E: std::error::Error,
    F: FnMut(&Gff3Line),
{
    let (start_off, end_off) = match self.query(seqid_in, begin, end) {
        Some(offsets) => offsets,
        None => return Ok(()),
    };
    reader.seek_to(start_off)?;
    let query = begin..end;
    while reader.offset().ok_or(Error::Unseekable)? < end_off {
        let line = match reader.read_line()? {
            Some(line) => line,
            None => break,
        };
        // The feature's coordinates get their own names so they cannot shadow
        // the `begin`/`end` query parameters.
        if let Gff3Line::Feature {
            ref seqid,
            start: feat_start,
            end: feat_end,
            ..
        } = line
        {
            let feature = feat_start..feat_end;
            let overlaps = feature.contains(&begin)
                || feature.contains(&end)
                || query.contains(&feat_start)
                || query.contains(&feat_end);
            if seqid == seqid_in && overlaps {
                cb(&line);
            }
        }
    }
    Ok(())
}
/// Builds the range index by scanning every line of `reader`.
///
/// A new per-sequence index is started whenever the `seqid` changes, seeded
/// with a split point at coordinate 0. Within a sequence a further split point
/// is recorded each time a feature starts more than `by` past the previous
/// split point. The offset stored is the reader's offset at the moment the
/// feature has been read.
///
/// When `mp` is given, a progress bar is registered with it and advanced to
/// the reader offset after every line.
///
/// # Errors
/// Returns `Error::Unseekable` if the reader cannot report its offset, and
/// propagates read errors.
pub fn build<R, E>(mut reader: R, by: u64, mp: Option<&MultiProgress>) -> Result<Self, Error<E>>
where
    R: Gff3Read<E>,
    E: std::error::Error,
{
    let mut idx = RangeIndex { by_seq: Vec::new() };
    let mut last_seqid = None;
    let mut last_start = 0;
    let prog = if let Some(size) = reader.size() {
        let pg = ProgressBar::new(size as u64);
        style_bar(&pg, true);
        pg
    } else {
        let pg = ProgressBar::new_spinner();
        style_bar(&pg, false);
        pg
    };
    prog.set_prefix("Range Index");
    let prog = mp.map(move |mp| mp.add(prog));
    loop {
        let line = match reader.read_line()? {
            Some(line) => line,
            None => break,
        };
        if let Gff3Line::Feature {
            ref seqid, start, ..
        } = line
        {
            if last_seqid.as_deref() != Some(seqid.borrow()) {
                last_seqid = Some(seqid.to_string());
                last_start = 0;
                idx.by_seq.push((
                    seqid.to_string(),
                    SeqRangeIndex {
                        by_range: vec![(0, reader.offset().ok_or(Error::Unseekable)?)],
                    },
                ));
            }
            // Child features commonly start before the last split point (for
            // example the second transcript of a gene), so a plain subtraction
            // would underflow; saturate to treat them as "no progress".
            if start.saturating_sub(last_start) > by {
                idx.by_seq
                    .last_mut()
                    .unwrap()
                    .1
                    .by_range
                    .push((start, reader.offset().ok_or(Error::Unseekable)?));
                last_start = start;
            }
        }
        let offset = reader.offset().ok_or(Error::Unseekable)?;
        if let Some(prog) = prog.as_ref() {
            prog.set_position(offset)
        }
    }
    Ok(idx)
}
}

272
src/index/relation.rs Normal file
View file

@ -0,0 +1,272 @@
use std::borrow::Borrow;
use hashbrown::HashMap;
use indicatif::{MultiProgress, ProgressBar};
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use crate::{
bar::style_bar,
ds::tree::{FindChild, ModifyChild, Node, Tree},
gff3::{Gff3Line, Gff3Read},
Error,
};
/// Child container for relation-tree nodes: children are stored in a hash map
/// keyed by their string id.
#[derive(Debug, Clone)]
pub struct RelationChildrenHolder<T>
where
T: Default + Serialize + DeserializeOwned,
{
// Child nodes keyed by id.
by_id: HashMap<String, Node<T, String, Self>>,
}
// Serialized transparently as the inner map.
impl<T> Serialize for RelationChildrenHolder<T>
where
T: Default + Serialize + DeserializeOwned,
{
fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
self.by_id.serialize(serializer)
}
}
// Deserialized from the inner map written by `serialize`.
impl<'de, T> Deserialize<'de> for RelationChildrenHolder<T>
where
T: Default + Serialize + DeserializeOwned,
{
fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
Ok(Self {
by_id: Deserialize::deserialize(deserializer)?,
})
}
}
// An empty holder has no children.
impl<T> Default for RelationChildrenHolder<T>
where
T: Default + Serialize + DeserializeOwned,
{
fn default() -> Self {
Self {
by_id: HashMap::new(),
}
}
}
// Child lookup backed directly by the `by_id` map.
impl<T> FindChild<T, String> for RelationChildrenHolder<T>
where
T: Default + Serialize + DeserializeOwned,
{
/// Returns the child stored under `index`, if any.
#[inline(always)]
fn find_child(&self, index: &String) -> Option<&Node<T, String, Self>> {
self.by_id.get(index)
}
/// Mutable variant of `find_child`.
#[inline(always)]
fn find_child_mut(&mut self, index: &String) -> Option<&mut Node<T, String, Self>> {
self.by_id.get_mut(index)
}
/// Calls `f` for each child key (in the map's iteration order) and stops at
/// the first key for which `f` returns `false`.
#[inline(always)]
fn iter_keys<F: FnMut(&String) -> bool>(&self, f: &mut F) {
self.by_id.keys().all(f);
}
/// Number of direct children.
#[inline(always)]
fn n_children(&self) -> usize {
self.by_id.len()
}
}
// Child mutation backed directly by the `by_id` map.
impl<T> ModifyChild<T, String> for RelationChildrenHolder<T>
where
T: Default + Serialize + DeserializeOwned,
{
/// Stores `node` under `index`, replacing any child already stored there.
#[inline(always)]
fn add_child(&mut self, index: String, node: Node<T, String, Self>) {
self.by_id.insert(index, node);
}
/// Removes the child stored under `index`; does nothing if there is none.
#[inline(always)]
fn remove_child(&mut self, index: &String) {
self.by_id.remove(index);
}
}
/// Tree of features for one sequence; each node's payload is a `u64` reader offset.
pub type RelationTree = Tree<u64, String, RelationChildrenHolder<u64>>;
/// Parent/child index over the features of a GFF3 file.
#[derive(Debug, Clone)]
pub struct RelationIndex {
/// For each feature id, the list of ids leading to it in its tree.
/// Not serialized; rebuild it with `reconstruct_path` after deserializing.
pub path_by_id: HashMap<String, Vec<String>>,
/// One relation tree per sequence id, in the order sequences were first seen.
pub tree_by_seq: Vec<(String, RelationTree)>,
}
// Only `tree_by_seq` is written; `path_by_id` can be derived from it.
impl Serialize for RelationIndex {
fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
self.tree_by_seq.serialize(serializer)
}
}
// Restores `tree_by_seq` and leaves `path_by_id` empty.
impl<'de> Deserialize<'de> for RelationIndex {
fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
Ok(Self {
path_by_id: HashMap::new(),
tree_by_seq: Deserialize::deserialize(deserializer)?,
})
}
}
impl RelationIndex {
/// Iterates over the sequence ids that have a relation tree, in stored order.
#[inline(always)]
pub fn list_seqids(&self) -> impl Iterator<Item = &String> {
self.tree_by_seq.iter().map(|(seqid, _)| seqid)
}
/// Reads the line of every descendant of feature `id` in sequence `seqid`
/// and passes each to `f`.
///
/// For each descendant node the reader is moved to the node's stored offset
/// and one line is read from there.
///
/// # Errors
/// `Error::MissingFeature` if `seqid` has no tree or `id` has no recorded
/// path; `Error::UnexpectedEof` if no line can be read at a stored offset;
/// plus any seek/read error from `reader`.
pub fn traverse_children<F, R, E>(
&self,
reader: &mut R,
seqid: &str,
id: &str,
mut f: F,
) -> Result<(), Error<E>>
where
R: Gff3Read<E>,
E: std::error::Error,
F: FnMut(&Gff3Line),
{
let tree = self
.tree_by_seq
.iter()
.find(|(id, _)| id == seqid)
.ok_or(Error::MissingFeature(seqid.to_string()))?;
// NOTE(review): the first path element is skipped here, whereas `build`
// passes the full path to `traverse_mut` — confirm both agree with the
// semantics of `Tree::traverse`.
match tree.1.traverse(
self.path_by_id
.get(id)
.ok_or(Error::MissingFeature(id.to_string()))?
.iter()
.skip(1),
) {
Some(node) => node,
// The path does not resolve to a node in this tree: nothing to visit.
None => return Ok(()),
}
.for_each_descendant(|node| {
// The node payload is the reader offset recorded for that feature.
let offset = node.payload;
reader.seek_to(offset)?;
let line: Gff3Line = reader.read_line()?.ok_or(Error::<E>::UnexpectedEof)?;
f(&line);
Result::<_, Error<E>>::Ok(())
})?;
Ok(())
}
/// Rebuilds `path_by_id` from the trees, e.g. after deserialization (which
/// leaves the map empty).
///
/// For every node visited by `Tree::for_each`, the last element of the path is
/// used as the feature id and mapped to the full path.
pub fn reconstruct_path(&mut self) {
let (out, tree) = (&mut self.path_by_id, &self.tree_by_seq);
for (_, tree) in tree {
tree.for_each(|path, _| {
// NOTE(review): panics if `for_each` ever reports an empty path — confirm it
// never does (e.g. for the root node).
let id = path.last().unwrap();
out.insert(id.to_string(), path.to_vec());
Ok::<_, ()>(())
})
.ok();
}
}
/// Builds the relation index by scanning every line of `reader`.
///
/// A new tree is started whenever the `seqid` changes. Features without a
/// `Parent` attribute become direct children of the tree root; features with
/// one are attached below the node reached via the parent's recorded path.
/// Each node stores the reader offset observed right after its line was read.
///
/// # Errors
/// `Error::Unseekable` if the reader cannot report its offset,
/// `Error::MissingAttribute` if a feature has no `ID`,
/// `Error::MissingFeature` if a `Parent` id has not been seen before,
/// plus any read error.
pub fn build<R, E>(mut reader: R, mp: Option<&MultiProgress>) -> Result<Self, Error<E>>
where
R: Gff3Read<E>,
E: std::error::Error,
{
let mut idx = Self {
path_by_id: HashMap::new(),
tree_by_seq: Vec::new(),
};
let mut last_seqid = None;
let prog = if let Some(size) = reader.size() {
let pg = ProgressBar::new(size as u64);
style_bar(&pg, true);
pg
} else {
let pg = ProgressBar::new_spinner();
style_bar(&pg, false);
pg
};
prog.set_prefix("Relation Index");
// NOTE(review): the bar returned by `mp.add` is discarded, so its position is
// never updated in the loop below.
mp.map(move |mp| mp.add(prog));
loop {
// The line is converted to an owned copy so it does not borrow the reader.
let line = match reader.read_line()? {
Some(line) => line.to_static(),
None => break,
};
#[allow(unused_variables)]
if let crate::gff3::Gff3Line::Feature {
ref seqid,
ref source,
ref type_,
start,
end,
score,
strand,
phase,
ref attributes,
} = line
{
let offset = reader.offset().ok_or(Error::Unseekable)?;
let id = line
.get_attr("ID")
.ok_or_else(|| Error::MissingAttribute("ID".to_string()))?;
// First feature of a new sequence: start a fresh tree whose root carries
// this feature's offset.
if last_seqid.as_deref() != Some(seqid.borrow()) {
last_seqid = Some(seqid.to_string());
idx.path_by_id.insert(id.to_string(), vec![]);
idx.tree_by_seq.push((
seqid.to_string(),
Tree::with_root(Node::new(offset, Default::default())),
));
}
// NOTE(review): the `Parent` value is used as a single id; a comma-separated
// list of parents is not split.
let parent_id = line.get_attr("Parent");
if parent_id.is_none() {
// Top-level feature: its path is just its own id and it hangs off the root.
idx.path_by_id.entry_ref(id).insert(vec![id.to_string()]);
idx.tree_by_seq
.last_mut()
.unwrap()
.1
.root
.as_mut()
.unwrap()
.children
.by_id
.insert(id.to_string(), Node::new(offset, Default::default()));
continue;
}
let parent_id = parent_id.unwrap();
let mut path = idx
.path_by_id
.get_mut(parent_id)
.ok_or_else(|| Error::MissingFeature(parent_id.to_string()))?
.clone();
// NOTE(review): `path_by_id` spans all sequences but only the current tree is
// searched; a parent recorded under another sequence makes this `unwrap` panic.
idx.tree_by_seq
.last_mut()
.unwrap()
.1
.traverse_mut(path.iter())
.unwrap()
.push(id.to_string(), Node::new(offset, Default::default()));
// The child's path is the parent's path extended by the child's own id.
path.push(id.to_string());
idx.path_by_id.insert(id.to_string(), path);
}
}
Ok(idx)
}
}

80
src/index/trie.rs Normal file
View file

@ -0,0 +1,80 @@
use std::hash::Hash;
use hashbrown::HashMap;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use crate::{
ds::trie::{ToTrieKey, Trie},
gff3::Gff3Line,
};
/// A trie index over one string attribute, kept separately per sequence id.
#[derive(Debug, Clone)]
pub struct TrieIndex<C>
where
C: Eq + Hash + Serialize + DeserializeOwned,
{
/// Name of the index (the name passed to `new`).
pub by: String,
// HashMap<Sequence ID, Trie>
pub trie: HashMap<String, Trie<u64, C>>,
}
// Serialized as the tuple `(by, trie)`.
impl<C> Serialize for TrieIndex<C>
where
C: Eq + Hash + Serialize + DeserializeOwned,
{
fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
(&self.by, &self.trie).serialize(serializer)
}
}
// Deserialized from the `(by, trie)` tuple written by `serialize`.
impl<'de, C> Deserialize<'de> for TrieIndex<C>
where
C: Eq + Hash + Serialize + DeserializeOwned,
{
fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
let (by, trie) = Deserialize::deserialize(deserializer)?;
Ok(Self { by, trie })
}
}
impl<C> TrieIndex<C>
where
    C: Eq + Hash + Clone + Serialize + DeserializeOwned,
{
    /// Creates an empty index named `by`.
    pub fn new(by: String) -> Self {
        Self {
            by,
            trie: HashMap::new(),
        }
    }

    /// Calls `f` with every `(sequence id, trie)` pair, in the map's
    /// iteration order.
    #[inline(always)]
    pub fn for_each_seqid<F>(&self, mut f: F)
    where
        F: FnMut(&str, &Trie<u64, C>),
    {
        for (seqid, trie) in &self.trie {
            f(seqid, trie);
        }
    }

    /// Indexes one line: if `line` is a feature and `get_attr` yields a key
    /// for it, inserts `key -> offset` into the trie of the feature's
    /// sequence, creating that trie on first use. Other lines are ignored.
    #[inline(always)]
    pub fn process_line<I>(
        &mut self,
        offset: u64,
        line: &Gff3Line,
        get_attr: impl for<'b> Fn(&'b Gff3Line) -> Option<&'b I>,
    ) where
        for<'c> &'c I: ToTrieKey<C>,
        I: ?Sized,
    {
        if let Gff3Line::Feature { seqid, .. } = line {
            if let Some(attr) = get_attr(line) {
                // `entry_ref` only allocates an owned key when the sequence
                // is seen for the first time, not once per line.
                let seqid: &str = seqid;
                self.trie
                    .entry_ref(seqid)
                    .or_insert_with(Trie::default)
                    .insert(attr, offset);
            }
        }
    }
}

554
src/io/file.rs Normal file
View file

@ -0,0 +1,554 @@
#[cfg(unix)]
use libc::c_int;
#[cfg(unix)]
use std::os::fd::AsRawFd;
use std::{
error::Error,
io::{self, Read, Seek, Write},
ops::Deref,
};
use crate::{
impl_deserialize_for_copy,
serdes::{Deserialize, DeserializeOwned, Serialize},
};
/// Handle to a `T` that lives inside a memory-mapped region.
///
/// It is a raw pointer tagged with lifetime `'a`; dereferencing it reads the
/// `T` at that address.
#[derive(Debug, Clone, Copy)]
pub struct Handle<'a, T> {
ptr: *const T,
_marker: std::marker::PhantomData<&'a T>,
}
impl<'a, T> Deref for Handle<'a, T> {
type Target = T;
fn deref(&self) -> &Self::Target {
// NOTE(review): relies on `ptr` being valid, aligned and pointing at an
// initialized `T` for as long as the handle is used — nothing here checks it.
unsafe { &*self.ptr }
}
}
/// A read-only memory region, optionally owning the mapping behind it.
pub struct MemoryMap<'a> {
// Base address of the region.
ptr: *const u8,
// Length of the region in bytes.
size: usize,
// Run once on drop when present (set by `from_file` to unmap the region).
cleanup: Option<Box<dyn FnOnce() + 'a>>,
}
impl<'a> MemoryMap<'a> {
/// # Safety
/// `ptr` must be a valid pointer to a memory-mapped region of size at least `size`.
pub unsafe fn new(ptr: *const u8, size: usize) -> Self {
Self {
ptr,
size,
cleanup: None,
}
}
/// # Safety
/// `fd` must be a valid file descriptor.
/// `offset` must be a valid offset into the file.
/// `size` must be a valid size for the memory-mapped region.
#[cfg(unix)]
pub unsafe fn from_file<F>(fd: &'a F, offset: usize, size: usize) -> Result<Self, c_int>
where
F: AsRawFd,
{
let ptr = unsafe {
libc::mmap(
std::ptr::null_mut(),
size,
libc::PROT_READ,
libc::MAP_PRIVATE,
fd.as_raw_fd(),
offset as libc::off_t,
)
};
if ptr == libc::MAP_FAILED {
return Err(*libc::__errno_location());
}
Ok(Self {
ptr: ptr as _,
size,
cleanup: Some(Box::new(move || unsafe {
libc::munmap(ptr, size);
})),
})
}
/// # Safety
/// `rec` must be a valid `HeapObjectRec` for this memory map.
/// `rec.offset` must be a valid offset into the memory-mapped region.
pub unsafe fn get_handle<T>(&self, rec: &HeapObjectRec) -> Handle<'a, T> {
debug_assert!(rec.offset as usize <= self.size);
Handle {
ptr: unsafe { self.ptr.add(rec.offset as usize) as *const T },
_marker: std::marker::PhantomData,
}
}
}
// Runs the registered cleanup (if any) exactly once when the map is dropped.
impl<'a> Drop for MemoryMap<'a> {
fn drop(&mut self) {
// `take` leaves `None` behind, so the closure cannot run twice.
if let Some(cleanup) = self.cleanup.take() {
cleanup();
}
}
}
/// Locates an object in the heap by its byte offset.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct HeapObjectRec {
/// Offset of the object from the start of the heap.
pub offset: u64,
}
// Read back as a single `u64`.
impl<E> DeserializeOwned<E> for HeapObjectRec
where
E: Error + From<io::Error>,
{
type Output = Self;
fn deserialize<R>(reader: &mut R) -> Result<Self, E>
where
R: Read,
{
let offset = <u64 as DeserializeOwned<E>>::deserialize(reader)?;
Ok(Self { offset })
}
}
// Written as a single `u64`.
impl<E> Serialize<E> for HeapObjectRec
where
E: Error + From<io::Error>,
{
fn serialize<W>(&self, writer: &mut W) -> Result<(), E>
where
W: Write,
{
self.offset.serialize(writer)?;
Ok(())
}
}
// NOTE(review): presumably derives the in-place `Deserialize` impl from the
// owned one for `Copy` types — confirm against the macro definition.
impl_deserialize_for_copy!(HeapObjectRec);
/// `StreamRead` implementation over a buffered `std::io::Read`.
pub struct BufIOStreamRead<R>
where
R: Read,
{
pub(crate) reader: io::BufReader<R>,
}
impl<R> StreamRead<io::Error> for BufIOStreamRead<R>
where
R: Read,
{
/// Gives access to the underlying buffered reader.
fn borrow_read(&mut self) -> &mut impl Read {
&mut self.reader
}
/// Deserializes a new `T` from the current stream position.
fn read_object<T: DeserializeOwned<io::Error>>(
&mut self,
) -> Result<<T as DeserializeOwned<io::Error>>::Output, io::Error> {
T::deserialize(&mut self.reader)
}
/// Deserializes into an existing `object` from the current stream position.
fn read_object_ref<T: Deserialize<io::Error>>(
&mut self,
object: &mut T,
) -> Result<(), io::Error> {
object.deserialize(&mut self.reader)
}
}
/// Sequential source of serialized objects.
pub trait StreamRead<E>
where
E: Error + From<io::Error>,
{
/// Gives access to the underlying reader.
fn borrow_read(&mut self) -> &mut impl Read;
/// Reads and returns the next object of type `T`.
fn read_object<T: DeserializeOwned<E>>(
&mut self,
) -> Result<<T as DeserializeOwned<E>>::Output, E>;
/// Reads the next object into `object`.
fn read_object_ref<T: Deserialize<E>>(&mut self, object: &mut T) -> Result<(), E>;
}
/// Source that can hand out `Handle`s to objects.
/// NOTE(review): no implementor is visible in this file — confirm intended usage.
pub trait HandleGet<E>
where
E: Error + From<io::Error>,
{
fn get_handle<'a, T>(&self) -> Result<Handle<'a, T>, E>
where
T: Deserialize<E>;
}
/// A `StreamRead` that additionally supports random access by offset.
pub trait HeapRead<E>
where
E: Error + From<io::Error>,
Self: StreamRead<E>,
{
/// Current read position.
fn offset(&mut self) -> Result<u64, E>;
/// Moves the read position to `pos`.
fn seek_to(&mut self, pos: u64) -> Result<(), E>;
}
/// `HeapRead` implementation over a buffered, seekable `std::io` reader.
pub struct IOHeapRead<R>
where
R: Read + Seek,
{
pub(crate) reader: io::BufReader<R>,
}
impl<R> StreamRead<io::Error> for IOHeapRead<R>
where
R: Read + Seek,
{
/// Gives access to the underlying buffered reader.
fn borrow_read(&mut self) -> &mut impl Read {
&mut self.reader
}
/// Deserializes a new `T` from the current position.
fn read_object<T: DeserializeOwned<io::Error>>(
&mut self,
) -> Result<<T as DeserializeOwned<io::Error>>::Output, io::Error> {
T::deserialize(&mut self.reader)
}
/// Deserializes into an existing `object` from the current position.
fn read_object_ref<T: Deserialize<io::Error>>(
&mut self,
object: &mut T,
) -> Result<(), io::Error> {
object.deserialize(&mut self.reader)
}
}
/// Positioning support for [`IOHeapRead`], delegating to the buffered reader.
impl<R> HeapRead<io::Error> for IOHeapRead<R>
where
    R: Read + Seek,
{
    /// Reports the current position of the underlying reader.
    fn offset(&mut self) -> Result<u64, io::Error> {
        self.reader.stream_position()
    }

    /// Moves the underlying reader to the absolute position `pos`.
    fn seek_to(&mut self, pos: u64) -> Result<(), io::Error> {
        let target = io::SeekFrom::Start(pos);
        self.reader.seek(target)?;
        Ok(())
    }
}
/// Reader that combines a sequential stream with a random-access heap.
///
/// Objects are either read inline from the stream, or the stream holds a
/// `HeapObjectRec` whose offset says where the object lives in the heap.
pub struct StreamWithHeapRead<RS, RH, E>
where
E: Error + From<io::Error>,
RS: StreamRead<E>,
RH: HeapRead<E>,
{
pub(crate) stream: RS,
pub(crate) heap: RH,
// Ties the error type `E` to the struct without storing a value of it.
_marker: std::marker::PhantomData<E>,
}
impl<RS, RH, E> StreamWithHeapRead<RS, RH, E>
where
E: Error + From<io::Error>,
RS: StreamRead<E>,
RH: HeapRead<E>,
{
/// Combines a stream reader and a heap reader.
pub fn new(stream: RS, heap: RH) -> Self {
Self {
stream,
heap,
_marker: std::marker::PhantomData,
}
}
/// Reads the next inline object from the stream.
pub fn read_stream<T: DeserializeOwned<E>>(
&mut self,
) -> Result<<T as DeserializeOwned<E>>::Output, E> {
self.stream.read_object::<T>()
}
/// Reads the next inline object from the stream into `object`.
pub fn read_stream_ref<T: Deserialize<E>>(&mut self, object: &'_ mut T) -> Result<(), E> {
self.stream.read_object_ref(object)
}
/// Reads a `HeapObjectRec` from the stream, then the `T` it points to from the heap.
pub fn read_heap<T: DeserializeOwned<E>>(
&mut self,
) -> Result<<T as DeserializeOwned<E>>::Output, E> {
let mut rec = HeapObjectRec { offset: 0 };
self.read_stream_ref(&mut rec)?;
self.read_heap_at::<T>(&rec)
}
/// Seeks the heap to `rec.offset` and reads a `T` from there.
pub fn read_heap_at<T: DeserializeOwned<E>>(
&mut self,
rec: &HeapObjectRec,
) -> Result<<T as DeserializeOwned<E>>::Output, E> {
self.heap.seek_to(rec.offset)?;
// Debug builds verify that the seek really landed on the requested offset.
#[cfg(debug_assertions)]
{
let pos = self.heap.offset().expect("Failed to get offset");
assert_eq!(pos, rec.offset);
}
T::deserialize(&mut self.heap.borrow_read())
}
/// Like `read_heap`, but deserializes into an existing `object`.
pub fn read_heap_ref<T: Deserialize<E>>(&mut self, object: &'_ mut T) -> Result<(), E> {
let mut rec = HeapObjectRec { offset: 0 };
self.read_stream_ref(&mut rec)?;
self.read_heap_ref_at(object, &rec)
}
/// Like `read_heap_at`, but deserializes into an existing `object`.
pub fn read_heap_ref_at<T: Deserialize<E>>(
&mut self,
object: &'_ mut T,
rec: &HeapObjectRec,
) -> Result<(), E> {
self.heap.seek_to(rec.offset)?;
// Debug builds verify that the seek really landed on the requested offset.
#[cfg(debug_assertions)]
{
let pos = self.heap.offset().expect("Failed to get offset");
assert_eq!(pos, rec.offset);
}
object.deserialize(&mut self.heap.borrow_read())
}
}
impl StreamWithHeapRead<BufIOStreamRead<std::fs::File>, IOHeapRead<std::fs::File>, io::Error> {
/// Opens the file `basename` as the stream and `<basename>.heap` as the heap.
///
/// NOTE(review): the type parameter `EXT` is never used in the body, so
/// callers have to name it explicitly — confirm whether it can be dropped.
pub fn new_by_basename<EXT: AsRef<str>>(basename: &str) -> io::Result<Self> {
let stream = BufIOStreamRead {
reader: io::BufReader::new(std::fs::File::open(basename)?),
};
let heap = IOHeapRead {
reader: io::BufReader::new(std::fs::File::open(format!("{}.heap", basename))?),
};
Ok(Self::new(stream, heap))
}
}
/// Sequential sink of serialized objects.
pub trait StreamWrite<E>
where
E: Error + From<io::Error>,
{
/// Gives access to the underlying writer.
fn borrow_write(&mut self) -> &mut impl Write;
/// Serializes `object` at the current position.
fn write_object<T: Serialize<E>>(&mut self, object: &T) -> Result<(), E>;
}
/// `StreamWrite` implementation over a buffered `std::io::Write`.
#[allow(dead_code)]
pub struct BufIOStreamWrite<W>
where
W: Write,
{
writer: io::BufWriter<W>,
}
impl<W> StreamWrite<io::Error> for BufIOStreamWrite<W>
where
W: Write,
{
fn borrow_write(&mut self) -> &mut impl Write {
&mut self.writer
}
fn write_object<T: Serialize<io::Error>>(&mut self, object: &T) -> Result<(), io::Error> {
object.serialize(&mut self.writer)
}
}
/// A `StreamWrite` that additionally reports and changes its write position.
pub trait HeapWrite<E>
where
E: Error + From<io::Error>,
Self: StreamWrite<E>,
{
/// Current write position.
fn offset(&mut self) -> Result<u64, E>;
/// Moves the write position to `pos`.
fn seek_to(&mut self, pos: u64) -> Result<(), E>;
}
/// Heap writer over a buffered `std::io::Write`; it implements `HeapWrite`
/// when the inner writer is also `Seek`.
#[allow(dead_code)]
pub struct IOHeapWrite<W>
where
W: Write,
{
writer: io::BufWriter<W>,
}
impl<W> StreamWrite<io::Error> for IOHeapWrite<W>
where
W: Write,
{
fn borrow_write(&mut self) -> &mut impl Write {
&mut self.writer
}
fn write_object<T: Serialize<io::Error>>(&mut self, object: &T) -> Result<(), io::Error> {
object.serialize(&mut self.writer)
}
}
/// Positioning support for [`IOHeapWrite`], delegating to the buffered writer.
impl<W> HeapWrite<io::Error> for IOHeapWrite<W>
where
    W: Write + Seek,
{
    /// Reports the current position of the underlying writer.
    fn offset(&mut self) -> Result<u64, io::Error> {
        self.writer.stream_position()
    }

    /// Moves the underlying writer to the absolute position `pos`.
    fn seek_to(&mut self, pos: u64) -> Result<(), io::Error> {
        let target = io::SeekFrom::Start(pos);
        self.writer.seek(target)?;
        Ok(())
    }
}
/// Writer that combines a sequential stream with a heap; the counterpart of
/// `StreamWithHeapRead`.
pub struct StreamWithHeapWrite<WS, WH, E>
where
E: Error + From<io::Error>,
WS: StreamWrite<E>,
WH: HeapWrite<E>,
{
stream: WS,
heap: WH,
// Ties the error type `E` to the struct without storing a value of it.
_marker: std::marker::PhantomData<E>,
}
impl<WS, WH, E> StreamWithHeapWrite<WS, WH, E>
where
E: Error + From<io::Error>,
WS: StreamWrite<E>,
WH: HeapWrite<E>,
{
/// Combines a stream writer and a heap writer.
pub fn new(stream: WS, heap: WH) -> Self {
Self {
stream,
heap,
_marker: std::marker::PhantomData,
}
}
/// Writes `object` inline to the stream.
pub fn write_stream<T: Serialize<E>>(&mut self, object: &T) -> Result<(), E> {
self.stream.write_object(object)
}
/// Writes `object` to the heap and a `HeapObjectRec` with its heap offset
/// to the stream.
pub fn write_heap<T: Serialize<E>>(&mut self, object: &T) -> Result<(), E> {
// The record is written first, using the offset where the object is about to land.
let offset = self.heap.offset()?;
self.write_stream(&HeapObjectRec { offset })?;
self.heap.write_object(object)?;
Ok(())
}
/// Writes `object` to the heap only and returns its record, leaving it to
/// the caller to store the record somewhere.
pub fn write_heap_dangling<T: Serialize<E>>(&mut self, object: &T) -> Result<HeapObjectRec, E> {
let offset = self.heap.offset()?;
self.heap.write_object(object)?;
Ok(HeapObjectRec { offset })
}
}
/// Types that can write themselves to a combined stream + heap writer.
pub trait HeapedSerialize<RS, RH, E>
where
RS: StreamWrite<E>,
RH: HeapWrite<E>,
E: Error + From<io::Error>,
{
fn serialize(&self, writer: &mut StreamWithHeapWrite<RS, RH, E>) -> Result<(), E>;
}
/// Types that can be constructed from a combined stream + heap reader.
pub trait HeapedDeserializeOwned<WS, WH, E>
where
WS: StreamRead<E>,
WH: HeapRead<E>,
E: Error + From<io::Error>,
{
fn deserialize(reader: &mut StreamWithHeapRead<WS, WH, E>) -> Result<Self, E>
where
Self: Sized;
}
/// Types that can be filled in place from a combined stream + heap reader.
pub trait HeapedDeserialize<WS, WH, E>
where
WS: StreamRead<E>,
WH: HeapRead<E>,
E: Error + From<io::Error>,
{
fn deserialize(&mut self, reader: &mut StreamWithHeapRead<WS, WH, E>) -> Result<(), E>;
}
#[cfg(test)]
mod tests {
use super::*;
use crate::check_serdes_consistency;
// Round-trips one inline value and three heap values through in-memory
// stream/heap buffers and checks they read back unchanged.
#[test]
fn test_serdes() {
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct Test {
on_stream: u8,
on_heap: u8,
none: Option<u8>,
option: Option<u8>,
}
let mut buf = Vec::new();
let mut heap = Vec::new();
let writer = BufIOStreamWrite {
writer: io::BufWriter::new(std::io::Cursor::new(&mut buf)),
};
let heap_writer = IOHeapWrite {
writer: io::BufWriter::new(std::io::Cursor::new(&mut heap)),
};
let mut combined = StreamWithHeapWrite::new(writer, heap_writer);
let test = Test {
on_stream: 1,
on_heap: 2,
none: None,
option: Some(3),
};
combined.write_stream(&test.on_stream).unwrap();
combined.write_heap(&test.on_heap).unwrap();
combined.write_heap(&test.none).unwrap();
combined.write_heap(&test.option).unwrap();
// Dropping the writer releases the mutable borrows of `buf` and `heap`
// (and lets the buffered writers flush).
drop(combined);
let reader = BufIOStreamRead {
reader: io::BufReader::new(std::io::Cursor::new(&buf)),
};
let heap_reader = IOHeapRead {
reader: io::BufReader::new(std::io::Cursor::new(&heap)),
};
let mut combined = StreamWithHeapRead::new(reader, heap_reader);
let mut loopback = Test {
on_stream: 0,
on_heap: 0,
none: None,
option: None,
};
combined.read_stream_ref(&mut loopback.on_stream).unwrap();
combined.read_heap_ref(&mut loopback.on_heap).unwrap();
// The heap position is checked after each read: 1 after the `u8`, 2 after the `None`.
assert_eq!(combined.heap.offset().unwrap(), 1);
combined.read_heap_ref(&mut loopback.none).unwrap();
assert_eq!(combined.heap.offset().unwrap(), 2);
combined.read_heap_ref(&mut loopback.option).unwrap();
assert_eq!(test, loopback);
}
// Maps a small file and reads its contents back through a `Handle`.
//
// `MemoryMap::from_file` only exists under `cfg(unix)`, so the test is gated
// the same way. The scratch file lives in the system temp directory under a
// per-process name instead of a fixed name in the working directory.
#[cfg(unix)]
#[test]
fn test_mmap() {
    use std::fs::File;

    let path = std::env::temp_dir().join(format!("gfidx_test_mmap_{}.txt", std::process::id()));
    let mut f = File::create(&path).unwrap();
    f.write_all(b"Hello, world!").unwrap();
    drop(f);

    let f = File::open(&path).unwrap();
    let mmap = unsafe { MemoryMap::from_file(&f, 0, 13).unwrap() };
    // A handle at offset 1 points at the second byte of the file.
    let handle: Handle<u8> = unsafe { mmap.get_handle(&HeapObjectRec { offset: 1 }) };
    assert_eq!(
        unsafe { std::slice::from_raw_parts(handle.ptr, 12) },
        b"ello, world!"
    );
    // Unmap before the file is removed.
    drop(mmap);
    std::fs::remove_file(&path).unwrap();
}
#[test]
fn test_heap_rec_serdes() {
check_serdes_consistency!(HeapObjectRec { offset: 1 });
}
}

114
src/io/mod.rs Normal file
View file

@ -0,0 +1,114 @@
use std::cell::RefCell;
use std::io::{self, Read};
use std::io::{Seek, Write};
pub mod file;
pub mod stream;
pub mod tee;
/// Formats a byte count as a human-readable string with two decimals.
///
/// The value is divided by 1024 until it drops below 1024 or the largest
/// unit (`TB`) is reached, so sizes of 1024 TB and above are still reported
/// in `TB`.
///
/// Scaling is done in floating point so the fractional part survives:
/// `humanize_size(1536)` yields `"1.50 KB"`. Exact multiples of a unit render
/// as before (`"1.00 KB"`, `"2.00 TB"`, ...).
pub fn humanize_size(bytes: usize) -> String {
    const UNITS: [&str; 5] = ["B", "KB", "MB", "GB", "TB"];

    let mut value = bytes as f64;
    let mut unit = 0;
    // Dividing by a power of two is exact in `f64`, so no rounding error
    // accumulates across steps.
    while value >= 1024.0 && unit < UNITS.len() - 1 {
        value /= 1024.0;
        unit += 1;
    }
    format!("{:.2} {}", value, UNITS[unit])
}
/// Reader adaptor that tallies the bytes successfully read through it.
///
/// The tally lives in a `RefCell` so it can be inspected and reset through a
/// shared reference.
pub struct CountingReader<R> {
    reader: R,
    count: RefCell<usize>,
}

impl<R> CountingReader<R> {
    /// Wraps `reader`; the tally starts at zero.
    pub fn new(reader: R) -> Self {
        let count = RefCell::new(0);
        Self { reader, count }
    }

    /// Bytes read since construction or the last [`reset_count`](Self::reset_count).
    pub fn count(&self) -> usize {
        let tally = self.count.borrow();
        *tally
    }

    /// Sets the tally back to zero.
    pub fn reset_count(&self) {
        let mut tally = self.count.borrow_mut();
        *tally = 0;
    }
}

impl<R: Read> Read for CountingReader<R> {
    /// Reads from the wrapped reader and adds the number of bytes obtained to
    /// the tally; failed reads leave the tally untouched.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let obtained = self.reader.read(buf)?;
        // `&mut self` guarantees no outstanding `RefCell` borrow exists.
        *self.count.get_mut() += obtained;
        Ok(obtained)
    }
}

impl<R: Seek> Seek for CountingReader<R> {
    /// Forwards the seek to the wrapped reader; the tally is not adjusted.
    fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
        let inner = &mut self.reader;
        inner.seek(pos)
    }
}
/// Writer adaptor that tallies the bytes accepted by the wrapped writer.
pub struct CountingWriter<W> {
    writer: W,
    count: usize,
}

impl<W> CountingWriter<W> {
    /// Wraps `writer`; the tally starts at zero.
    pub fn new(writer: W) -> Self {
        let count = 0;
        Self { writer, count }
    }

    /// Total number of bytes the wrapped writer has reported as written.
    pub fn count(&self) -> usize {
        self.count
    }
}

impl<W: Write> Write for CountingWriter<W> {
    /// Writes to the wrapped writer and adds the accepted byte count to the
    /// tally; failed writes leave the tally untouched.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        self.writer.write(buf).map(|accepted| {
            self.count += accepted;
            accepted
        })
    }

    /// Flushes the wrapped writer.
    fn flush(&mut self) -> io::Result<()> {
        let inner = &mut self.writer;
        inner.flush()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the rendering of zero, one byte, and one exact multiple per unit.
    #[test]
    fn test_humanize_size() {
        let cases: [(usize, &str); 6] = [
            (0, "0.00 B"),
            (1, "1.00 B"),
            (1024, "1.00 KB"),
            (1024 * 1024, "1.00 MB"),
            (1024 * 1024 * 1024, "1.00 GB"),
            (1024 * 1024 * 1024 * 1024 * 2, "2.00 TB"),
        ];
        for (bytes, expected) in cases {
            assert_eq!(humanize_size(bytes), expected);
        }
    }
}

207
src/io/stream.rs Normal file
View file

@ -0,0 +1,207 @@
use std::io::{self, BufRead, Read, Seek, Write};
use crate::{
gff3::{Gff3Line, Gff3Read, Gff3Write},
Error,
};
/// Line-oriented GFF3 reader over any `Read` source.
///
/// Keeps a reusable line buffer and an optional, caller-supplied total size.
pub struct Gff3StreamReader<R>
where
    R: Read,
{
    reader: io::BufReader<R>,
    size: Option<usize>,
    line_buffer: String,
}

impl<R> Gff3StreamReader<R>
where
    R: Read,
{
    /// Creates a reader whose total size is unknown.
    pub fn new(reader: R) -> Self {
        Self::with_optional_size(reader, None)
    }

    /// Creates a reader that reports `size` as its total size.
    pub fn new_with_size(reader: R, size: usize) -> Self {
        Self::with_optional_size(reader, Some(size))
    }

    /// Same construction as [`new_with_size`](Self::new_with_size).
    pub fn open(reader: R, size: usize) -> Self {
        Self::with_optional_size(reader, Some(size))
    }

    /// Shared constructor: buffers `reader` and allocates a 1 KiB line buffer.
    fn with_optional_size(reader: R, size: Option<usize>) -> Self {
        Self {
            reader: io::BufReader::new(reader),
            size,
            line_buffer: String::with_capacity(1024),
        }
    }

    /// Empties the line buffer and, if an earlier long line grew it past
    /// 4096 bytes of capacity, shrinks it back towards 1024.
    pub(crate) fn reset_buffer(&mut self) {
        let buffer = &mut self.line_buffer;
        buffer.clear();
        if buffer.capacity() > 4096 {
            buffer.shrink_to(1024);
        }
    }
}
impl<R> Gff3Read<io::Error> for Gff3StreamReader<R>
where
R: Read,
{
fn size(&self) -> Option<usize> {
self.size
}
fn read_line(&mut self) -> Result<Option<Gff3Line>, Error<io::Error>> {
self.reset_buffer();
if self
.reader
.read_line(&mut self.line_buffer)
.map_err(Error::Io)?
> 0
{
let line = &mut self.line_buffer;
if line.ends_with('\n') {
line.pop();
if line.ends_with('\r') {
line.pop();
}
}
let parsed = Gff3Line::parse_str(line).map_err(Error::Parse)?;
Ok(Some(parsed))
} else {
Ok(None)
}
}
}
/// Line-oriented GFF3 reader over a seekable source.
///
/// Keeps a reusable line buffer and an optional total size, which
/// [`open`](Self::open) / [`open_prebuffered`](Self::open_prebuffered)
/// determine by seeking to the end of the source.
pub struct Gff3SeekableStreamReader<R>
where
    R: Read + Seek,
{
    reader: io::BufReader<R>,
    size: Option<usize>,
    line_buffer: String,
}

impl<R> Gff3SeekableStreamReader<R>
where
    R: Read + Seek,
{
    /// Creates a reader whose total size is unknown; no seek is performed.
    pub fn new(reader: R) -> Self {
        Self::assemble(io::BufReader::new(reader), None)
    }

    /// Borrows the wrapped source (beneath the internal `BufReader`).
    pub fn borrow_reader(&self) -> &R {
        let buffered = &self.reader;
        buffered.get_ref()
    }

    /// Creates a reader that reports `size` as its total size; no seek is
    /// performed.
    pub fn new_with_size(reader: R, size: usize) -> Self {
        Self::assemble(io::BufReader::new(reader), Some(size))
    }

    /// Buffers `reader` and then behaves like
    /// [`open_prebuffered`](Self::open_prebuffered).
    pub fn open(reader: R) -> Result<Self, io::Error> {
        Self::open_prebuffered(io::BufReader::new(reader))
    }

    /// Measures the source by seeking to its end, then rewinds to offset 0.
    ///
    /// If the seek to the end fails, the size is left unknown and no rewind
    /// is attempted.
    ///
    /// # Errors
    /// Returns the I/O error if rewinding to the start fails.
    pub fn open_prebuffered(mut reader: io::BufReader<R>) -> Result<Self, io::Error> {
        let size = match reader.seek(io::SeekFrom::End(0)) {
            Ok(end) => {
                reader.seek(io::SeekFrom::Start(0))?;
                Some(end as usize)
            }
            Err(_) => None,
        };
        Ok(Self::assemble(reader, size))
    }

    /// Shared constructor: pairs an already-buffered source with a 1 KiB line
    /// buffer.
    fn assemble(reader: io::BufReader<R>, size: Option<usize>) -> Self {
        Self {
            reader,
            size,
            line_buffer: String::with_capacity(1024),
        }
    }

    /// Empties the line buffer and, if an earlier long line grew it past
    /// 4096 bytes of capacity, shrinks it back towards 1024.
    pub(crate) fn reset_buffer(&mut self) {
        let buffer = &mut self.line_buffer;
        buffer.clear();
        if buffer.capacity() > 4096 {
            buffer.shrink_to(1024);
        }
    }
}
impl<R> Gff3Read<io::Error> for Gff3SeekableStreamReader<R>
where
R: Read + Seek,
{
fn size(&self) -> Option<usize> {
self.size
}
fn offset(&mut self) -> Option<u64> {
self.reader.stream_position().ok()
}
fn seek_to(&mut self, pos: u64) -> Result<Option<()>, Error<io::Error>> {
self.reader
.seek(io::SeekFrom::Start(pos))
.map(|_| Some(()))
.map_err(Error::Io)
}
fn read_line(&mut self) -> Result<Option<Gff3Line>, Error<io::Error>> {
self.reset_buffer();
if self
.reader
.read_line(&mut self.line_buffer)
.map_err(Error::Io)?
> 0
{
let line = &mut self.line_buffer;
if line.ends_with('\n') {
line.pop();
if line.ends_with('\r') {
line.pop();
}
}
let parsed = Gff3Line::parse_str(line).map_err(Error::Parse)?;
Ok(Some(parsed))
} else {
Ok(None)
}
}
}
pub struct Gff3Writer<W>
where
W: Write,
{
writer: io::BufWriter<W>,
}
impl<W> Gff3Writer<W>
where
W: Write,
{
pub fn new(writer: W) -> Self {
Self {
writer: io::BufWriter::new(writer),
}
}
}
impl<W> Gff3Write<io::Error> for Gff3Writer<W>
where
W: Write,
{
fn write_line(&mut self, line: &Gff3Line) -> Result<(), Error<std::io::Error>> {
writeln!(&mut self.writer, "{}", line).map_err(Error::Io)
}
}

61
src/io/tee.rs Normal file
View file

@ -0,0 +1,61 @@
use crate::{
gff3::{Gff3Line, Gff3Read},
Error,
};
/// Reads GFF3 lines from a single reader and hands them, in chunks, to every
/// registered channel callback.
pub struct Gff3BroadcastRead<'a, R, E>
where
E: std::error::Error,
R: Gff3Read<E>,
{
// Source of lines.
reader: R,
// Number of buffered lines that triggers a delivery to the channels.
chunk_size: usize,
// Callbacks invoked with each delivered chunk.
channels: Vec<ChannelFunc<'a, Gff3Line<'static>>>,
// Ties the otherwise unused error type `E` to the struct.
_marker: std::marker::PhantomData<E>,
}
/// Boxed, sendable callback that receives a chunk of items.
///
/// Arguments: a `u64` offset and a slice of `(u64, O)` pairs. As invoked by
/// `Gff3BroadcastRead::run`, the first argument is the reader offset at
/// delivery time and each pair holds the reader offset taken after a line was
/// read, together with that line.
pub type ChannelFunc<'a, O> = Box<dyn FnMut(u64, &[(u64, O)]) + Send + 'a>;
impl<'a, R, E> Gff3BroadcastRead<'a, R, E>
where
    E: std::error::Error,
    R: Gff3Read<E>,
{
    /// Creates a broadcaster over `reader` that delivers lines in chunks of
    /// `chunk_size`; no channels are registered initially.
    pub fn new(reader: R, chunk_size: usize) -> Self {
        let channels = Vec::new();
        Self {
            reader,
            chunk_size,
            channels,
            _marker: std::marker::PhantomData,
        }
    }

    /// Registers another callback; callbacks run in registration order.
    pub fn add_channel(&mut self, channel: ChannelFunc<'a, Gff3Line<'static>>) {
        self.channels.push(channel);
    }

    /// Drains the reader, delivering lines to every channel.
    ///
    /// Each line is paired with the reader offset queried right after it was
    /// read. Once `chunk_size` lines are pending they are passed to each
    /// channel and the pending list is cleared. At end of input the remaining
    /// lines (possibly none) are delivered once more to each channel.
    ///
    /// # Errors
    /// Propagates reader errors and returns `Error::Unseekable` when the
    /// reader cannot report its offset.
    pub fn run(mut self) -> Result<(), Error<E>> {
        let mut pending = Vec::with_capacity(self.chunk_size);
        loop {
            let next = self.reader.read_line()?.map(|line| line.to_static());
            let line = match next {
                Some(line) => line,
                None => break,
            };
            let offset = self.reader.offset().ok_or(Error::Unseekable)?;
            pending.push((offset, line));
            if pending.len() < self.chunk_size {
                continue;
            }
            for channel in self.channels.iter_mut() {
                channel(offset, &pending);
            }
            pending.clear();
        }

        // Final delivery; the offset is queried once per channel.
        for channel in self.channels.iter_mut() {
            let offset = self.reader.offset().ok_or(Error::Unseekable)?;
            channel(offset, &pending);
        }
        Ok(())
    }
}

41
src/lib.rs Normal file
View file

@ -0,0 +1,41 @@
mod bar;
pub mod ds;
pub mod gff3;
pub mod index;
pub mod io;
pub mod macros;
pub mod serdes;
#[derive(thiserror::Error, Debug)]
pub enum Error<E> {
#[error("io error: {0}")]
Io(E),
#[error("utf8 error: {0}")]
Utf8(std::str::Utf8Error),
#[error("parse error: {0}")]
Parse(#[from] ParseErr),
#[error("multiple errors: {0}")]
Multiple(Vec<Error<E>>),
#[error("missing feature: {0}")]
MissingFeature(String),
#[error("missing attribute: {0}")]
MissingAttribute(String),
#[error("internal error: {0}")]
Internal(String),
#[error("unexpected eof")]
UnexpectedEof,
#[error("unseekable stream")]
Unseekable,
}
/// Errors produced while parsing textual fields.
#[derive(thiserror::Error, Debug)]
pub enum ParseErr {
/// A required field, identified by the static name, was absent.
#[error("missing field: {0}")]
MissingField(&'static str),
/// An integer failed to parse; converts from `ParseIntError` via `?`.
#[error("parse int error: {0}")]
ParseInt(#[from] std::num::ParseIntError),
/// A float failed to parse; converts from `ParseFloatError` via `?`.
#[error("parse float error: {0}")]
ParseFloat(#[from] std::num::ParseFloatError),
/// A boolean failed to parse; converts from `ParseBoolError` via `?`.
#[error("parse bool error: {0}")]
ParseBool(#[from] std::str::ParseBoolError),
}

37
src/macros.rs Normal file
View file

@ -0,0 +1,37 @@
/// Builds a `hashbrown::HashMap` from `key => value` pairs.
///
/// Pairs are inserted in the order written, so a repeated key keeps the last
/// value. A trailing comma is accepted; an empty invocation yields an empty map.
#[macro_export]
macro_rules! hashbrown_map {
($($key:expr => $value:expr),* $(,)?) => {
{
let mut map = ::hashbrown::HashMap::new();
$(
map.insert($key, $value);
)*
map
}
};
}
/// Builds attribute-extractor definitions for `Gff3Line`.
///
/// * `attr_trie_def!(key)` expands to the tuple
///   `(key, |line| line.get_attr(key))`.
/// * `attr_trie_def![k1, k2, ...]` expands to an array of such tuples.
///
/// `$key` is expanded twice per entry, so it should be a cheap, side-effect
/// free expression (typically a string literal).
///
/// Note: macro matchers ignore the invocation's delimiters, so a single key
/// without a trailing comma (`attr_trie_def!["id"]`) matches the first arm and
/// yields a bare tuple; write `attr_trie_def!["id",]` for a one-element array.
///
/// All paths go through `$crate`, so the macro works both inside this crate
/// and from dependents, without the caller importing the macro by name.
#[macro_export]
macro_rules! attr_trie_def {
    ($key:expr) => {
        ($key, |line: &$crate::gff3::Gff3Line| line.get_attr($key))
    };
    [$($key:expr),* $(,)?] => {
        [
            $(
                $crate::attr_trie_def!($key),
            )*
        ]
    };
}
/// Produces a reference to the place `$x` by round-tripping its address
/// through a raw pointer, so the result is not tied to the original borrow.
///
/// * `unsafe_borrow!(x)` yields a shared reference.
/// * `unsafe_borrow!(mut x)` yields a mutable reference.
///
/// The pointee type is left to inference (`as *const _` / `as *mut _`).
///
/// NOTE(review): the expansion contains its own `unsafe` block, so call sites
/// need none. The caller is nevertheless responsible for the usual reference
/// rules (validity, lifetime, aliasing) -- confirm each use.
/// NOTE(review): the `mut` arm takes the address with `addr_of!` (a `*const`)
/// and casts it to `*mut`; `addr_of_mut!` looks like the intended primitive
/// -- TODO confirm.
#[macro_export]
macro_rules! unsafe_borrow {
(mut $x:expr) => {{
let ret = unsafe { &mut *(::std::ptr::addr_of!($x) as *mut _) };
ret
}};
($x:expr) => {
unsafe { &*(::std::ptr::addr_of!($x) as *const _) }
};
}

432
src/serdes/mod.rs Normal file
View file

@ -0,0 +1,432 @@
use std::{error::Error, io::Write};
use varint_rs::{VarintReader, VarintWriter};
use num_traits::PrimInt;
pub mod tree;
/// Writes a value to a byte sink.
///
/// `E` is the caller-chosen error type; it must be constructible from
/// `std::io::Error` so I/O failures can be propagated with `?`.
pub trait Serialize<E: Error + From<std::io::Error>> {
    /// Serializes `self` into `writer`.
    ///
    /// # Errors
    /// Returns `E` when writing or encoding fails.
    fn serialize<W: Write>(&self, writer: &mut W) -> Result<(), E>;
}
/// Constructs a fresh value from a byte source.
///
/// The produced type is the associated `Output`, which need not be `Self`.
/// `E` must be constructible from `std::io::Error`.
pub trait DeserializeOwned<E: Error + From<std::io::Error>>: Sized {
    /// Type produced by [`deserialize`](Self::deserialize).
    type Output;

    /// Reads one value from `reader`.
    ///
    /// # Errors
    /// Returns `E` when reading or decoding fails.
    fn deserialize<R: std::io::Read>(reader: &mut R) -> Result<Self::Output, E>;
}
/// Reads from a byte source into an existing value.
///
/// `E` must be constructible from `std::io::Error`.
pub trait Deserialize<E: Error + From<std::io::Error>> {
    /// Deserializes from `reader` into the value behind `&mut self`.
    ///
    /// # Errors
    /// Returns `E` when reading or decoding fails.
    fn deserialize<R: std::io::Read>(&mut self, reader: &mut R) -> Result<(), E>;
}
/// `bool` is encoded as a single byte: `1` for `true`, `0` for `false`.
impl<E> Serialize<E> for bool
where
    E: Error + From<std::io::Error>,
{
    fn serialize<W>(&self, writer: &mut W) -> Result<(), E>
    where
        W: Write,
    {
        let encoded = [u8::from(*self)];
        writer.write_all(&encoded)?;
        Ok(())
    }
}
/// Decodes a `bool` from one byte; any non-zero byte is read as `true`.
impl<E> DeserializeOwned<E> for bool
where
    E: Error + From<std::io::Error>,
{
    type Output = bool;

    fn deserialize<R>(reader: &mut R) -> Result<Self, E>
    where
        R: std::io::Read,
    {
        let mut byte = 0u8;
        reader.read_exact(std::slice::from_mut(&mut byte))?;
        Ok(byte != 0)
    }
}
/// Implements the in-place `Deserialize` trait for each listed type by
/// delegating to its `DeserializeOwned` impl and overwriting `*self`.
///
/// Every listed type must implement `DeserializeOwned<E>` with an output that
/// can be assigned to `Self`.
///
/// All trait paths are fully qualified (`$crate::serdes::...`, `::std::...`),
/// so this exported macro does not depend on which names the call site has
/// imported.
#[macro_export]
macro_rules! impl_deserialize_for_copy {
    ($( $type:ty ),*) => {
        $(
            impl<E> $crate::serdes::Deserialize<E> for $type
            where
                E: ::std::error::Error + From<::std::io::Error>,
            {
                fn deserialize<R>(&mut self, reader: &mut R) -> Result<(), E>
                where
                    R: ::std::io::Read,
                {
                    *self = <Self as $crate::serdes::DeserializeOwned<E>>::deserialize(reader)?;
                    Ok(())
                }
            }
        )*
    };
}
/// Round-trips both boolean values through serialize/deserialize.
#[test]
fn test_bool_serdes() {
    for test_case in [false, true] {
        crate::check_serdes_consistency!(test_case);
    }
}
// Derive the in-place `Deserialize` impl for `bool` from its `DeserializeOwned` impl.
impl_deserialize_for_copy!(bool);
// Implements `Serialize`, `DeserializeOwned` and (through
// `impl_deserialize_for_copy!`) `Deserialize` for each listed integer type.
// Values are encoded as their fixed-width little-endian bytes.
macro_rules! impl_numeric_serdes {
($( $type:ty ),*) => {
$(
impl<E> Serialize<E> for $type
where
E: Error + From<std::io::Error>,
{
fn serialize<W>(&self, writer: &mut W) -> Result<(), E>
where
W: Write,
{
writer.write_all(&self.to_le_bytes())?;
Ok(())
}
}
impl<E> DeserializeOwned<E> for $type
where
E: Error + From<std::io::Error>,
{
type Output = $type;
fn deserialize<R>(reader: &mut R) -> Result<Self, E>
where
R: std::io::Read,
{
// Exactly `size_of::<Self>()` bytes are consumed.
let mut buf = [0; std::mem::size_of::<Self>()];
reader.read_exact(&mut buf)?;
Ok(Self::from_le_bytes(buf))
}
}
impl_deserialize_for_copy!($type);
)*
};
}
// Fixed-width impls for all built-in sized integer types.
impl_numeric_serdes!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128);
/// `Option<T>` is encoded as a one-byte presence flag (serialized `bool`)
/// followed by the payload when the flag is `true`.
impl<E, T> Serialize<E> for Option<T>
where
    E: Error + From<std::io::Error>,
    T: Serialize<E>,
{
    fn serialize<W>(&self, writer: &mut W) -> Result<(), E>
    where
        W: Write,
    {
        let present = self.is_some();
        present.serialize(writer)?;
        if let Some(value) = self {
            value.serialize(writer)?;
        }
        Ok(())
    }
}
/// Decodes an `Option`: reads the presence flag, then the payload if present.
///
/// The result wraps `T`'s `Output` type rather than `T` itself.
impl<E, T, O> DeserializeOwned<E> for Option<T>
where
    E: Error + From<std::io::Error>,
    T: DeserializeOwned<E, Output = O>,
{
    type Output = Option<O>;

    fn deserialize<R>(reader: &mut R) -> Result<Self::Output, E>
    where
        R: std::io::Read,
    {
        let present = <bool as DeserializeOwned<E>>::deserialize(reader)?;
        if !present {
            return Ok(None);
        }
        let payload = <T as DeserializeOwned<E>>::deserialize(reader)?;
        Ok(Some(payload))
    }
}
/// In-place decoding of an `Option`.
///
/// When the encoded value is present and `self` already holds a value, that
/// value is deserialized in place; otherwise a fresh value is constructed. An
/// absent encoded value resets `self` to `None`.
impl<E, T> Deserialize<E> for Option<T>
where
    E: Error + From<std::io::Error>,
    T: Deserialize<E> + DeserializeOwned<E, Output = T>,
{
    fn deserialize<R>(&mut self, reader: &mut R) -> Result<(), E>
    where
        R: std::io::Read,
    {
        let present = <bool as DeserializeOwned<E>>::deserialize(reader)?;
        if !present {
            *self = None;
            return Ok(());
        }
        match self.as_mut() {
            Some(existing) => existing.deserialize(reader)?,
            None => *self = Some(<T as DeserializeOwned<E>>::deserialize(reader)?),
        }
        Ok(())
    }
}
/// A slice is encoded as its length (fixed-width `u64`) followed by each
/// element in order.
impl<E, T> Serialize<E> for &[T]
where
    E: Error + From<std::io::Error>,
    T: Serialize<E>,
{
    fn serialize<W>(&self, writer: &mut W) -> Result<(), E>
    where
        W: Write,
    {
        let count = self.len() as u64;
        count.serialize(writer)?;
        self.iter().try_for_each(|item| item.serialize(writer))
    }
}
/// Decodes a vector: a `u64` element count followed by that many elements.
///
/// The result holds `T`'s `Output` type rather than `T` itself.
impl<E, T, O> DeserializeOwned<E> for Vec<T>
where
    E: Error + From<std::io::Error>,
    T: DeserializeOwned<E, Output = O>,
{
    type Output = Vec<O>;

    /// # Errors
    /// Returns `E` if the length prefix or any element cannot be read.
    fn deserialize<R>(reader: &mut R) -> Result<Self::Output, E>
    where
        R: std::io::Read,
    {
        // The length prefix comes straight from the input. Preallocating it
        // unchecked lets a corrupt or truncated file trigger a huge
        // allocation (or a capacity-overflow panic) before a single element
        // is read, so only a bounded amount is reserved up front and the
        // vector grows normally beyond that.
        const MAX_PREALLOC: u64 = 4096;

        let len = <u64 as DeserializeOwned<E>>::deserialize(reader)?;
        // Lossless cast: the value is at most `MAX_PREALLOC`.
        let mut result = Vec::with_capacity(len.min(MAX_PREALLOC) as usize);
        for _ in 0..len {
            result.push(<T as DeserializeOwned<E>>::deserialize(reader)?);
        }
        Ok(result)
    }
}
/// In-place decoding of a vector: existing contents are cleared (keeping the
/// allocation) and replaced by the decoded elements.
impl<E, T> Deserialize<E> for Vec<T>
where
    E: Error + From<std::io::Error>,
    T: Deserialize<E> + DeserializeOwned<E, Output = T>,
{
    /// # Errors
    /// Returns `E` if the length prefix or any element cannot be read. On
    /// error `self` holds the elements decoded so far.
    fn deserialize<R>(&mut self, reader: &mut R) -> Result<(), E>
    where
        R: std::io::Read,
    {
        // The length prefix comes straight from the input; reserving it
        // unchecked lets a corrupt or truncated file trigger a huge
        // allocation (or a capacity-overflow panic). Reserve a bounded amount
        // and let the vector grow normally beyond that.
        const MAX_PREALLOC: u64 = 4096;

        let len = <u64 as DeserializeOwned<E>>::deserialize(reader)?;
        self.clear();
        // Lossless cast: the value is at most `MAX_PREALLOC`.
        self.reserve(len.min(MAX_PREALLOC) as usize);
        for _ in 0..len {
            self.push(<T as DeserializeOwned<E>>::deserialize(reader)?);
        }
        Ok(())
    }
}
/// Round-trips `None` and several `Some` byte values.
#[test]
fn test_option_serdes() {
    let test_cases: [Option<u8>; 4] = [None, Some(0), Some(1), Some(255)];
    for test_case in test_cases {
        crate::check_serdes_consistency!(test_case);
    }
}
/// Asserts that a value survives a serialize -> deserialize round trip.
///
/// The value is serialized into a byte buffer with `std::io::Error` as the
/// error type, deserialized in place into a `Default` instance of the same
/// type, and compared with `assert_eq!`. The value's type therefore needs
/// `Serialize`, `Deserialize`, `Default`, `PartialEq` and `Debug`.
///
/// The expansion is a single block, so the macro can be used several times in
/// one scope. `$input` is evaluated once and only borrowed. Trait paths go
/// through `$crate`, so the call site needs no imports.
///
/// # Panics
/// Panics if serialization or deserialization fails, or if the round-tripped
/// value differs from the input.
#[macro_export]
macro_rules! check_serdes_consistency {
    ($input:expr) => {{
        // Unifies the type of the defaulted output with the input's type.
        fn hint_same_type<T>(_: &T, _: &T) {}

        let input = &$input;
        let mut buf = ::std::vec::Vec::new();
        $crate::serdes::Serialize::<::std::io::Error>::serialize(input, &mut buf).unwrap();
        let mut output = ::std::default::Default::default();
        hint_same_type(input, &output);
        $crate::serdes::Deserialize::<::std::io::Error>::deserialize(
            &mut output,
            &mut ::std::io::Cursor::new(&buf),
        )
        .unwrap();
        assert_eq!(*input, output);
    }};
}
/// Newtype around a primitive integer that selects the variable-length
/// (varint) encoding when serialized, instead of the fixed-width one used for
/// the bare integer.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct VarInt<N>(pub N)
where
N: PrimInt;
// Generates the serdes impls for `VarInt<$type>` plus a round-trip test.
//
// Arguments:
// * `$type`     - the wrapped integer type.
// * `$read_fn`  - the `VarintReader` method that decodes `$type`.
// * `$write_fn` - the `VarintWriter` method that encodes `$type`.
// * `$test_fn`  - name of the generated `#[test]` function.
macro_rules! gen_varint_serdes {
($type:ty, $read_fn:ident, $write_fn:ident, $test_fn:ident) => {
impl<E> Serialize<E> for VarInt<$type>
where
E: Error + From<std::io::Error>,
$type: PrimInt,
{
fn serialize<W>(&self, writer: &mut W) -> Result<(), E>
where
W: Write,
{
VarintWriter::$write_fn(writer, self.0).map_err(E::from)
}
}
impl<E> DeserializeOwned<E> for VarInt<$type>
where
E: Error + From<std::io::Error>,
$type: PrimInt,
{
type Output = VarInt<$type>;
fn deserialize<R>(reader: &mut R) -> Result<Self, E>
where
R: std::io::Read,
{
Ok(VarInt(VarintReader::$read_fn(reader).map_err(E::from)?))
}
}
impl_deserialize_for_copy!(VarInt<$type>);
#[cfg(test)]
#[test]
fn $test_fn() {
// Candidate values are listed as `i128`; those that do not fit in `$type`
// are skipped by the `try_into` check below.
let test_cases: Vec<i128> = vec![
-1,
-127,
-128,
-255,
-256,
-16383,
-16384,
-2097151,
-2097152,
-268435455,
-268435456,
0,
1,
127,
128,
255,
256,
16383,
16384,
2097151,
2097152,
268435455,
268435456,
34359738367,
34359738368,
4398046511103,
4398046511104,
562949953421311,
562949953421312,
72057594037927935,
72057594037927936,
9223372036854775807,
9223372036854775808,
];
for &test_case in test_cases.iter() {
if TryInto::<$type>::try_into(test_case).is_err() {
continue;
}
let mut buf = Vec::new();
Serialize::<std::io::Error>::serialize(&VarInt(test_case as $type), &mut buf)
.unwrap();
// Owned deserialization must reproduce the value ...
let result = <VarInt<$type> as DeserializeOwned<std::io::Error>>::deserialize(
&mut std::io::Cursor::new(&buf),
);
assert_eq!(VarInt(test_case as $type), result.unwrap());
// ... and so must in-place deserialization over an existing value.
let mut result = VarInt(0);
Deserialize::<std::io::Error>::deserialize(
&mut result,
&mut std::io::Cursor::new(&buf),
)
.unwrap();
assert_eq!(VarInt(test_case as $type), result);
}
}
};
}
// Varint serdes impls (and their round-trip tests) for every built-in sized
// integer type, unsigned first, then signed.
gen_varint_serdes!(u8, read_u8_varint, write_u8_varint, test_u8_varint_serdes);
gen_varint_serdes!(
u16,
read_u16_varint,
write_u16_varint,
test_u16_varint_serdes
);
gen_varint_serdes!(
u32,
read_u32_varint,
write_u32_varint,
test_u32_varint_serdes
);
gen_varint_serdes!(
u64,
read_u64_varint,
write_u64_varint,
test_u64_varint_serdes
);
gen_varint_serdes!(
u128,
read_u128_varint,
write_u128_varint,
test_u128_varint_serdes
);
gen_varint_serdes!(i8, read_i8_varint, write_i8_varint, test_i8_varint_serdes);
gen_varint_serdes!(
i16,
read_i16_varint,
write_i16_varint,
test_i16_varint_serdes
);
gen_varint_serdes!(
i32,
read_i32_varint,
write_i32_varint,
test_i32_varint_serdes
);
gen_varint_serdes!(
i64,
read_i64_varint,
write_i64_varint,
test_i64_varint_serdes
);
gen_varint_serdes!(
i128,
read_i128_varint,
write_i128_varint,
test_i128_varint_serdes
);

1
src/serdes/tree.rs Normal file
View file

@ -0,0 +1 @@