Signed-off-by: eternal-flame-AD <yume@yumechi.jp>
This commit is contained in:
ゆめ 2024-11-16 04:51:29 -06:00
commit f3f24c85ea
No known key found for this signature in database
15 changed files with 4263 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
/target

2943
Cargo.lock generated Normal file

File diff suppressed because it is too large Load diff

31
Cargo.toml Normal file
View file

@ -0,0 +1,31 @@
[package]
name = "yume-taishikan"
version = "0.1.0"
edition = "2021"
[dependencies]
chrono = { version = "0.4.38", features = ["serde"] }
clap = { version = "4.5.20", features = ["derive"], optional = true }
dashmap = "6.1.0"
derive_more = { version = "1.0.0", features = ["deref"] }
env_logger = { version = "0.11.5", optional = true }
futures = "0.3.31"
governor = "0.7.0"
lingua = "1.6.2"
log = "0.4.22"
regex = "1.11.1"
reqwest = { version = "0.12.9", features = ["json", "stream"] }
serde = { version = "1.0.214", features = ["derive"] }
serde_json = "1.0.132"
thiserror = "1.0.65"
tokio = { version = "1.41.0", features = ["rt", "rt-multi-thread", "macros", "net", "io-util"] }
[features]
bin = ["clap", "env_logger"]
clap = ["dep:clap"]
env_logger = ["dep:env_logger"]
[[bin]]
name = "yume-taishikan"
path = "src/main.rs"
required-features = ["bin"]

61
README.md Normal file
View file

@ -0,0 +1,61 @@
# Misskey Instances List
A Misskey instance listing generator, compliant with the official Misskey Hub data format, with adjusted score loadings and a focus on growing a diverse and healthy fediverse based on open-source software.
The following is a draft and WIP and has not been implemented yet.
## Differences
- Currently we will use the official Web UI but we may work on a custom one in the future.
- Language detection now uses [lingua](https://crates.io/crates/lingua).
- We respect no-bot tags when accessing instance and user history.
- All instances with a misskey API will be considered a misskey instance. If the instance advertises "misskey" or other popular forks in the software field, it will be considered a misskey instance.
Additionally, there is some fuzzy matching: we will probe `/api/endpoints` to check whether the instance has the necessary endpoints to be considered if any of the following holds:
- software field has "misskey" as substring
- software field starts with "miss"
- software field ends with "key" or "ski" or "fish"
- `misskey.io` is no longer the "root" instance; instead we will use a list of instances whose owners have agreed to share their federation data with us. PRs are welcome to add your instance to the list if you find that your instance or fediverse circle is not listed.
The requests-per-minute limit is customizable on a per-instance basis, and we will not increase the limit without additional confirmation from the instance owner.
- Instances that satisfy any of these conditions will be excluded:
- Not advertising a valid source code repository (either the official Misskey repository or a URL pointing to a Git repository that we will clone). If the source repository is not the official Misskey repository, we must be able to clone it (skipping blobs) within 1 minute.
- Instances with an `/api/charts/notes` not consistent with `/api/stats`.
- `originalUsersCount` is greater than `originalNotesCount`.
- Any of `originalUsersCount`, `originalNotesCount` is not positive.
- The source repository has had no commits for more than 6 months, or, if the official Misskey repository is advertised, the advertised version is NOT EXACTLY one released within the last 6 months.
## Loadings (testing)
We redesigned the loadings for `value` (ranking) field to give small, growing and well-maintained instances a better chance to be listed.
The new formula is:
Base score (sum):
- log10(originalUsersCount)
- log2(originalNotesCount / originalUsersCount)
- log2(npd15)
- 2 * openRegistrations
- 0.4 * emailRequiredForSignUp
- 5 * (serverRules.length > 0)
Adjustments (multipliers):
- 0.75 if (!icon)
- 0.5 if (lastUpdated < -45 days since last Misskey official release) (any push on source repository will count as an update)
## Running
You need:
- Nightly Rust Toolchain
- Firejail with the usually-included "git" profile
- Git with filtered cloning support (should be included in most distributions)

42
data/config.json Normal file
View file

@ -0,0 +1,42 @@
{
"blacklist": [
"^.+\\.ngrok\\.io$",
"^.+\\.loca\\.lt$",
"^.+\\.serveo\\.net$",
"^.+\\.trycloudflare\\.com$",
"^.+\\.pagekite\\.(?:net|com|me)$",
"^.+\\.localtunnel\\.me$",
"^.+\\.tunnelto\\.dev$",
"^.+\\.local$",
"^.+\\.localdomain$",
"^.+\\.localhost$",
"^.+\\.test$",
"^.+\\.example$",
"^.+\\.invalid$",
"^.+\\.onion$",
"^.+\\.lan$",
"^.+\\.cn$",
"^.+\\.click$",
"^.+\\.download$"
],
"instances": [
{
"hostname": "",
"root": false,
"include_blocks": false,
"rpm": {
"general": 20,
"instance_list": 0
}
},
{
"hostname": "mi.yumechi.jp",
"root": true,
"include_blocks": true,
"rpm": {
"general": 40,
"instance_list": 100
}
}
]
}

68
src/config.rs Normal file
View file

@ -0,0 +1,68 @@
use derive_more::Deref;
use regex::Regex;
use serde::{Deserialize, Deserializer};
pub static DEFAULT_CONFIG_JSON: &str = include_str!("../data/config.json");
/// Top-level configuration: host-name blacklist patterns plus per-instance settings.
#[derive(Deserialize, Debug, Clone)]
pub struct Config {
    /// Regular expressions; a host name matching any of them is blacklisted.
    pub blacklist: Vec<StrRegex>,
    /// Per-instance settings.
    pub instances: Vec<InstanceConfig>,
}

impl Config {
    /// Returns `true` when `hostname` matches at least one blacklist pattern.
    pub fn match_blacklist(&self, hostname: &str) -> bool {
        for pattern in &self.blacklist {
            if pattern.0.is_match(hostname) {
                return true;
            }
        }
        false
    }
}
/// Settings for one entry of the configuration's `instances` list.
#[derive(Deserialize, Debug, Clone)]
pub struct InstanceConfig {
/// Host name of the instance. `run` treats the entry with an empty hostname as the default configuration.
pub hostname: String,
/// Whether this instance is a root for discovery; `run` treats `None` as `false`.
pub root: Option<bool>,
// Whether to use this instance's block list to further filter instances; `run` treats `None` as `false`.
pub include_blocks: Option<bool>,
/// Request-rate limits; `run` falls back to the default entry's limits when absent.
pub rpm: Option<RequestPerMinute>,
}
/// Requests-per-minute limits. `MisskeyGovernor::new` builds no limiter for a value of `0`,
/// in which case the corresponding throttle call is denied.
#[derive(Deserialize, Debug, Clone, Copy)]
pub struct RequestPerMinute {
/// Limit applied through `throttle_general`.
pub general: u32,
/// Limit applied through `throttle_instance_list`.
pub instance_list: u32,
}
/// Newtype around [`Regex`] so that a regex can be deserialized from a plain string.
/// Dereferences to the inner [`Regex`].
#[derive(Debug, Clone, Deref)]
#[repr(transparent)]
pub struct StrRegex(Regex);
impl StrRegex {
/// Consumes the wrapper and returns the compiled regex.
pub fn into_inner(self) -> Regex {
self.0
}
}
impl<'de> Deserialize<'de> for StrRegex {
fn deserialize<D>(deserializer: D) -> Result<StrRegex, D::Error>
where
D: Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
#[allow(clippy::expect_used)]
Ok(StrRegex(Regex::new(&s).expect("Invalid regex")))
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The bundled default configuration must parse and contain data.
    #[test]
    fn test_deserialize_config() {
        let parsed = serde_json::from_str::<Config>(DEFAULT_CONFIG_JSON);
        let config = parsed.expect("Failed to deserialize");
        assert!(!config.blacklist.is_empty());
        assert!(!config.instances.is_empty());
    }
}

0
src/git/mod.rs Normal file
View file

327
src/http/misskey.rs Normal file
View file

@ -0,0 +1,327 @@
use std::{
borrow::Cow,
collections::{HashMap, HashSet},
fmt::Debug,
future::Future,
hash::{Hash, Hasher},
sync::Arc,
};
use governor::{DefaultDirectRateLimiter, Quota};
use reqwest::{Method, RequestBuilder};
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use crate::config::RequestPerMinute;
use super::ResponseExt;
/// A cloneable, thread-safe rate limiter with two independent budgets.
/// Each method returns a future that resolves to `Ok(())` once a request may proceed,
/// or to an error when the request must not be sent.
pub trait DirectThrottler: Send + Sync + Clone {
/// Error reported when a request is refused.
type ThrottlerError: Debug;
/// Gate for general API requests.
fn throttle_general(&self) -> impl Future<Output = Result<(), Self::ThrottlerError>> + Send;
/// Gate for instance-list requests.
fn throttle_instance_list(
&self,
) -> impl Future<Output = Result<(), Self::ThrottlerError>> + Send;
}
/// Pair of rate limiters, one per request category.
/// A `None` limiter means that category is disabled (see `MisskeyGovernor::new`).
pub struct MisskeyGovernor {
// Limiter for general API requests.
general: Option<DefaultDirectRateLimiter>,
// Limiter for instance-list requests.
instance_list: Option<DefaultDirectRateLimiter>,
}
impl MisskeyGovernor {
pub fn new(quota: RequestPerMinute) -> Self {
let general_rpm = quota.general;
let instance_list_rpm = quota.instance_list;
Self {
general: match general_rpm {
0 => None,
_ => {
#[allow(clippy::unwrap_used)]
let quota = Quota::per_minute(general_rpm.try_into().unwrap())
.allow_burst(1.try_into().unwrap());
Some(DefaultDirectRateLimiter::direct(quota))
}
},
instance_list: match instance_list_rpm {
0 => None,
_ => {
#[allow(clippy::unwrap_used)]
let quota = Quota::per_minute(instance_list_rpm.try_into().unwrap())
.allow_burst(1.try_into().unwrap());
Some(DefaultDirectRateLimiter::direct(quota))
}
},
}
}
}
/// Error returned by the `DirectThrottler` implementation for `Arc<MisskeyGovernor>`.
#[derive(Debug, thiserror::Error)]
pub enum MisskeyGovernorError {
/// The request category has no limiter configured, so the request is refused.
#[error("Request denied by rate limiter")]
Denied,
}
impl DirectThrottler for Arc<MisskeyGovernor> {
    type ThrottlerError = MisskeyGovernorError;

    /// Waits until the general limiter admits a request; denied when no limiter exists.
    fn throttle_general(&self) -> impl Future<Output = Result<(), Self::ThrottlerError>> + Send {
        async move {
            let Some(limiter) = &self.general else {
                return Err(MisskeyGovernorError::Denied);
            };
            limiter.until_ready().await;
            Ok(())
        }
    }

    /// Waits until the instance-list limiter admits a request; denied when no limiter exists.
    fn throttle_instance_list(
        &self,
    ) -> impl Future<Output = Result<(), Self::ThrottlerError>> + Send {
        async move {
            let Some(limiter) = &self.instance_list else {
                return Err(MisskeyGovernorError::Denied);
            };
            limiter.until_ready().await;
            Ok(())
        }
    }
}
/// HTTP client bound to one instance, with rate limiting supplied by `T`.
#[derive(Debug, Clone)]
pub struct MisskeyClient<'a, T: DirectThrottler + Clone> {
/// Underlying HTTP client used to send requests.
pub client: reqwest::Client,
// Rate limiter consulted before throttled requests.
throttle: T,
// Base of every request URL; request paths are appended to it.
base_url: Cow<'a, str>,
// Optional bearer token attached to each request when set.
bearer: Option<String>,
}
/// One entry of the list returned by `MisskeyClient::federation_instances`.
///
/// JSON keys are the camelCase forms of the field names.
#[derive(Deserialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct FederationInstance {
    pub id: String,
    pub host: String,
    pub first_retrieved_at: String,
    pub users_count: u64,
    pub notes_count: u64,
    pub is_not_responding: bool,
    pub is_suspended: bool,
    pub is_blocked: bool,
    pub suspension_state: String,
    pub software_name: Option<String>,
    pub name: Option<String>,
    pub description: Option<String>,
}
// Identity of a record is its `id` alone; every other field is ignored.
impl PartialEq for FederationInstance {
    fn eq(&self, other: &Self) -> bool {
        self.id.eq(&other.id)
    }
}

impl Eq for FederationInstance {}

// Hashes only `id`, so hashing agrees with the equality above.
impl Hash for FederationInstance {
    fn hash<H: Hasher>(&self, state: &mut H) {
        Hash::hash(&self.id, state);
    }
}
/// Request body for `MisskeyClient::federation_instances` (posted to `/api/federation/instances`).
// NOTE(review): there is no `skip_serializing_if`, so unset fields are sent as JSON `null` —
// confirm the server accepts `null` for every field left unset.
#[derive(Serialize, Debug, Default, Clone)]
pub struct ListFederationInstancesOptions {
/// Page size.
pub limit: Option<u64>,
/// Number of entries to skip.
pub offset: Option<u64>,
pub sort: Option<String>,
pub blocked: Option<bool>,
pub suspended: Option<bool>,
pub silenced: Option<bool>,
pub federating: Option<bool>,
pub subscribing: Option<bool>,
pub publishing: Option<bool>,
}
/// Response of `MisskeyClient::instance_metadata` (`/api/meta`).
#[derive(Deserialize, Debug)]
pub struct InstanceMetadata {
// The only field required to be present.
version: String,
// rest of the fields are optional; they are collected here as raw JSON values
#[serde(flatten)]
rest: HashMap<String, serde_json::Value>,
}
/// Response of `MisskeyClient::instance_stats` (`/api/stats`).
///
/// JSON keys are the camelCase forms of the field names.
#[derive(Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct InstanceStats {
    notes_count: u64,
    original_notes_count: u64,
    users_count: u64,
    original_users_count: u64,
    instances: u64,
    drive_usage_local: Option<u64>,
    drive_usage_remote: Option<u64>,
}
/// Chart resolution; serialized as the lowercase variant name (`"day"` / `"hour"`).
#[derive(Serialize, Debug, Clone, Copy)]
#[serde(rename_all = "lowercase")]
pub enum ChartSpan {
Day,
Hour,
}
/// Request body for `MisskeyClient::instance_chart_data` (`/api/charts/notes`).
#[derive(Serialize, Debug)]
pub struct InstanceNoteChartOptions {
/// Chart resolution.
pub span: ChartSpan,
/// Optional limit; serialized as `null` when unset. Presumably the number of data points — confirm.
pub limit: Option<u64>,
}
/// Response of `MisskeyClient::instance_chart_data`, split into `local` and `remote` series.
#[derive(Deserialize, Debug)]
pub struct InstanceChartData {
local: InstanceChartDataInner,
remote: InstanceChartDataInner,
}
/// One half (`local` or `remote`) of the notes chart.
// NOTE(review): presumably each vector holds one value per chart bucket — confirm against the API.
#[derive(Deserialize, Debug)]
pub struct InstanceChartDataInner {
total: Vec<u64>,
inc: Vec<u64>,
dec: Vec<u64>,
diffs: InstanceChartDataDiffs,
}
/// The `diffs` object of a notes chart, broken down by note kind.
#[derive(Deserialize, Debug)]
pub struct InstanceChartDataDiffs {
normal: Vec<u64>,
reply: Vec<u64>,
renote: Vec<u64>,
// JSON key is `withFile`.
#[serde(rename = "withFile")]
with_file: Vec<u64>,
}
/// Endpoint names that must all appear in an instance's `/api/endpoints` listing
/// for `compute_instance_info` to keep the instance.
pub(crate) static REQUIRED_ENDPOINTS: &[&str] = &["meta", "stats", "charts/notes"];
impl<'a, T: DirectThrottler> MisskeyClient<'a, T> {
    /// Creates a client for the instance at `base_url`.
    ///
    /// `base_url` may be a full URL (`https://example.com`) or a bare host name
    /// (`example.com`); a bare host name is contacted over HTTPS.
    pub fn new(client: reqwest::Client, throttle: T, base_url: impl Into<Cow<'a, str>>) -> Self {
        Self {
            client,
            throttle,
            base_url: base_url.into(),
            bearer: None,
        }
    }

    /// Returns the base URL exactly as it was passed to [`MisskeyClient::new`].
    pub fn base_url(&self) -> &str {
        &self.base_url
    }

    /// Sets the bearer token attached to every subsequent request.
    pub fn bearer(mut self, bearer: impl Into<String>) -> Self {
        self.bearer = Some(bearer.into());
        self
    }

    /// Joins the base URL and `path` with exactly one `/` between them, defaulting to
    /// `https://` when the base URL carries no scheme.
    fn endpoint_url(&self, path: &str) -> String {
        let base = self.base_url.trim_end_matches('/');
        // Callers pass paths such as "/api/meta"; without trimming, the joined URL would
        // contain a double slash ("host//api/meta").
        let path = path.trim_start_matches('/');
        if base.contains("://") {
            format!("{base}/{path}")
        } else {
            // A bare host name is not a valid absolute URL, so give it a scheme.
            format!("https://{base}/{path}")
        }
    }

    /// Waits for the general rate limiter, mapping a refusal to `JsonHTTPError::Throttled`.
    async fn wait_general(&self) -> Result<(), super::JsonHTTPError> {
        self.throttle
            .throttle_general()
            .await
            .map_err(|_| super::JsonHTTPError::Throttled)
    }

    /// Builds a request for `url` (a path relative to the base URL) with the common
    /// headers and, when configured, the bearer token.
    pub(crate) fn base_request(&self, method: Method, url: &str) -> RequestBuilder {
        let mut req = self
            .client
            .request(method, self.endpoint_url(url))
            .header("Accept", "application/json; charset=utf-8")
            .header("User-Agent", "YumeTaishiKan/0.1.0 JoinMisskey/alike"); // Included to respect existing block lists
        if let Some(ref bearer) = self.bearer {
            req = req.bearer_auth(bearer);
        }
        req
    }

    /// Same as [`Self::base_request`], with `body` attached as a JSON payload.
    pub(crate) fn base_request_json(
        &self,
        method: Method,
        url: &str,
        body: impl Serialize,
    ) -> RequestBuilder {
        self.base_request(method, url)
            .header("Content-Type", "application/json")
            .json(&body)
    }

    /// Sends a body-less POST to `url` and decodes at most `limit` bytes of JSON.
    ///
    /// This method does not consult the rate limiter; callers do that first.
    ///
    /// # Errors
    /// Fails on transport errors, non-success status codes, oversized bodies and invalid JSON.
    pub(crate) async fn empty_post<R: DeserializeOwned + Send>(
        &self,
        limit: usize,
        url: &str,
    ) -> Result<R, super::JsonHTTPError> {
        self.base_request(Method::POST, url)
            .send()
            .await?
            .check_status()?
            .safe_json_decode(limit)
            .await
    }

    /// Sends a POST with a JSON `body` to `url` and decodes at most `limit` bytes of JSON.
    ///
    /// This method does not consult the rate limiter; callers do that first.
    ///
    /// # Errors
    /// Fails on transport errors, non-success status codes, oversized bodies and invalid JSON.
    pub(crate) async fn json_post<P: Serialize, R: DeserializeOwned + Send>(
        &self,
        limit: usize,
        url: &str,
        body: P,
    ) -> Result<R, super::JsonHTTPError> {
        self.base_request_json(Method::POST, url, body)
            .send()
            .await?
            .check_status()?
            .safe_json_decode(limit)
            .await
    }

    /// Fetches the set of API endpoint names the instance exposes (general rate limit).
    // NOTE(review): the response is capped at 4 KiB — confirm this is large enough for a
    // full endpoint listing of a real instance.
    pub async fn endpoints(&self) -> Result<HashSet<String>, super::JsonHTTPError> {
        self.wait_general().await?;
        self.json_post(4 << 10, "/api/endpoints", serde_json::json!({"type": ""})) // for some reason very important..
            .await
    }

    /// Fetches one page of the instance's federation list (instance-list rate limit).
    pub async fn federation_instances(
        &self,
        options: &ListFederationInstancesOptions,
    ) -> Result<Vec<FederationInstance>, super::JsonHTTPError> {
        self.throttle
            .throttle_instance_list()
            .await
            .map_err(|_| super::JsonHTTPError::Throttled)?;
        self.json_post(1 << 20, "/api/federation/instances", options)
            .await
    }

    /// Fetches `/api/meta` (general rate limit).
    pub async fn instance_metadata(&self) -> Result<InstanceMetadata, super::JsonHTTPError> {
        // Rate-limited like `endpoints`, so the configured per-instance limit is honoured.
        self.wait_general().await?;
        self.empty_post(64 << 10, "/api/meta").await
    }

    /// Fetches `/api/stats` (general rate limit).
    pub async fn instance_stats(&self) -> Result<InstanceStats, super::JsonHTTPError> {
        self.wait_general().await?;
        self.empty_post(32 << 10, "/api/stats").await
    }

    /// Fetches `/api/charts/notes` (general rate limit).
    pub async fn instance_chart_data(
        &self,
        options: &InstanceNoteChartOptions,
    ) -> Result<InstanceChartData, super::JsonHTTPError> {
        self.wait_general().await?;
        self.json_post(128 << 10, "/api/charts/notes", options)
            .await
    }
}

115
src/http/mod.rs Normal file
View file

@ -0,0 +1,115 @@
use std::{future::Future, hash::Hash, sync::Arc};
use dashmap::DashMap;
use futures::{StreamExt, TryFutureExt, TryStreamExt};
use reqwest::{dns::Resolve, redirect::Policy};
use serde::de::DeserializeOwned;
pub mod misskey;
/// Concurrent map from a key to a cloneable client, creating clients on first use.
pub struct ClientMap<K: Hash + Eq, V: Clone> {
    clients: DashMap<K, V>,
}

impl<K: Hash + Eq, V: Clone> ClientMap<K, V> {
    /// Creates an empty map.
    pub fn new() -> Self {
        Self {
            clients: DashMap::new(),
        }
    }

    /// Returns a clone of the value stored under `key`, first inserting the result of
    /// `f` when the key is absent. `f` is not called when the key already exists.
    pub fn get_or_insert_with<F>(&self, key: K, f: F) -> V
    where
        F: FnOnce() -> V,
    {
        self.clients.entry(key).or_insert_with(f).value().clone()
    }
}

impl<K: Hash + Eq, V: Clone> Default for ClientMap<K, V> {
    /// Equivalent to [`ClientMap::new`].
    fn default() -> Self {
        Self::new()
    }
}
/// A DNS resolver that discards every resolved address that is not globally routable.
pub struct SafeResolver();

impl Resolve for SafeResolver {
    /// Resolves `name` through `tokio::net::lookup_host` and keeps only addresses for
    /// which `is_global()` holds. Lookup failures are logged and returned.
    fn resolve(&self, name: reqwest::dns::Name) -> reqwest::dns::Resolving {
        Box::pin(async move {
            // `lookup_host` needs a port, so one is attached to the host name.
            let lookup = tokio::net::lookup_host(format!("{}:80", name.as_str())).await;
            match lookup {
                Err(e) => {
                    log::error!("Failed to resolve {}: {}", name.as_str(), e);
                    Err(e.into())
                }
                Ok(addrs) => {
                    let global_only = addrs.filter(|addr| addr.ip().is_global());
                    Ok(Box::new(global_only)
                        as Box<dyn Iterator<Item = std::net::SocketAddr> + Send>)
                }
            }
        })
    }
}
/// Builds a `reqwest` client that resolves names through [`SafeResolver`], with a
/// 10 s connect timeout, a 20 s total timeout, at most 3 redirects and a 20 s TCP keepalive.
pub(crate) fn new_safe_client() -> reqwest::Result<reqwest::Client> {
    let seconds = std::time::Duration::from_secs;
    let resolver = Arc::new(SafeResolver());
    let builder = reqwest::Client::builder()
        .dns_resolver(resolver)
        .connect_timeout(seconds(10))
        .timeout(seconds(20))
        .redirect(Policy::limited(3))
        .tcp_keepalive(Some(seconds(20)));
    builder.build()
}
/// Errors produced while sending a request and decoding its JSON response.
#[derive(Debug, thiserror::Error)]
pub enum JsonHTTPError {
/// The rate limiter refused the request.
#[error("Governor rejected request")]
Throttled,
/// The server answered with a non-success status code.
#[error("Unexpected HTTP status code: {0}")]
UnexpectedCode(reqwest::StatusCode),
/// Error reported by `reqwest`.
#[error("HTTP error: {0}")]
Reqwest(#[from] reqwest::Error),
/// The body was not valid JSON for the expected type.
#[error("JSON error: {0}")]
Serde(#[from] serde_json::Error),
/// The body was larger than the caller's byte limit.
#[error("Response size limit exceeded")]
LimitExceeded,
}
/// Extension methods for handling HTTP responses.
pub trait ResponseExt {
/// Returns the response unchanged when its status is a success, an error otherwise.
fn check_status(self) -> Result<reqwest::Response, JsonHTTPError>;
/// Reads the body, refusing to read more than `limit` bytes, and decodes it as JSON.
fn safe_json_decode<R: DeserializeOwned + Send>(
self,
limit: usize,
) -> impl Future<Output = Result<R, JsonHTTPError>> + Send;
}
impl ResponseExt for reqwest::Response {
    /// Passes the response through on a success status; otherwise reports the status code.
    fn check_status(self) -> Result<reqwest::Response, JsonHTTPError> {
        let status = self.status();
        if !status.is_success() {
            return Err(JsonHTTPError::UnexpectedCode(status));
        }
        Ok(self)
    }

    /// Streams the body chunk by chunk, failing with `LimitExceeded` as soon as the
    /// chunks seen so far exceed `limit` bytes, then decodes the collected bytes as JSON.
    fn safe_json_decode<R: DeserializeOwned + Send>(
        self,
        limit: usize,
    ) -> impl Future<Output = Result<R, JsonHTTPError>> + Send {
        self.bytes_stream()
            .map(|chunk| chunk.map_err(JsonHTTPError::from))
            // The accumulator carries the collected bytes and the remaining byte budget.
            .try_fold(
                (Vec::<u8>::new(), limit),
                |(mut body, budget), chunk| async move {
                    match budget.checked_sub(chunk.len()) {
                        Some(left) => {
                            body.extend_from_slice(&chunk);
                            Ok((body, left))
                        }
                        None => Err(JsonHTTPError::LimitExceeded),
                    }
                },
            )
            .and_then(|(body, _)| {
                let decoded = serde_json::from_slice(&body).map_err(JsonHTTPError::from);
                futures::future::ready(decoded)
            })
    }
}

196
src/lib.rs Normal file
View file

@ -0,0 +1,196 @@
#![feature(ip)]
#![warn(clippy::unwrap_used, clippy::expect_used)]
use std::{
collections::HashMap,
fs::File,
io::{BufWriter, Write},
path::PathBuf,
sync::Arc,
};
use dashmap::DashSet;
use futures::{
stream::{self, select_all},
StreamExt, TryStreamExt,
};
use http::{misskey::MisskeyGovernor, ClientMap};
use logic::{analysis::compute_instance_info, discovery::do_instance_discovery};
use tokio::sync::Mutex;
pub mod config;
pub mod git;
pub mod http;
pub mod logic;
pub mod model;
/// Options accepted by `run`.
#[derive(Debug, Clone, Default)]
pub struct RunOptions {
// only run the first N pages for each root instance, good for testing
pub limit_pages: Option<u64>,
// only run selected instances
pub only_instance: Option<Vec<String>>,
// use custom config file; `run` parses the bundled default configuration when this is `None`
pub config: Option<config::Config>,
// path of the output file created by `run`
pub output: PathBuf,
}
/// Crate-level error type; it currently has no variants.
#[derive(Debug, thiserror::Error)]
pub enum Error {}
/// Runs the whole pipeline: collects blocked hosts from root instances that share their
/// block lists, discovers instances from all root instances, analyses every instance
/// that is neither blocked nor blacklisted, and writes the results as a JSON array to
/// `options.output`.
///
/// # Panics
/// Panics when the default configuration cannot be parsed or lacks a default entry
/// (empty hostname) with `rpm`, when an HTTP client cannot be created, or when the
/// output file cannot be created or written.
pub async fn run(options: RunOptions) {
    let config = options.config.unwrap_or_else(|| {
        serde_json::from_str(config::DEFAULT_CONFIG_JSON).expect("Failed to parse default config")
    });
    log::debug!("Config: {:?}", config);

    // The entry with an empty hostname provides the defaults for every other instance.
    let default_instance_config = config
        .instances
        .iter()
        .find(|i| i.hostname.is_empty())
        .expect("No default instance config found");
    let default_rpm = default_instance_config.rpm.expect("No default RPM found");
    let instance_config_overrides: HashMap<String, config::InstanceConfig> = config
        .instances
        .iter()
        .filter(|i| !i.hostname.is_empty())
        .map(|i| (i.hostname.clone(), i.clone()))
        .collect();

    // One client (and therefore one rate limiter) per host, shared by all phases.
    let cmap = ClientMap::new();
    let root_instances = config.instances.iter().filter(|i| i.root.unwrap_or(false));
    let root_instances_block_list = root_instances
        .clone()
        .filter(|i| i.include_blocks.unwrap_or(false));

    // find all blocked instances
    let blocked = DashSet::new();
    select_all(root_instances_block_list.map(|instance| {
        let config = instance_config_overrides
            .get(&instance.hostname)
            .unwrap_or(default_instance_config);
        let client = cmap.get_or_insert_with(instance.hostname.clone(), || {
            http::misskey::MisskeyClient::new(
                http::new_safe_client().expect("Failed to create client"),
                Arc::new(MisskeyGovernor::new(config.rpm.unwrap_or(default_rpm))),
                instance.hostname.clone(),
            )
        });
        Box::pin(do_instance_discovery(client, true))
    }))
    .try_for_each_concurrent(4, |instances| async {
        for instance in instances {
            blocked.insert(instance.host.clone());
        }
        Ok(())
    })
    .await
    .expect("Failed to fetch blocked instances");
    log::debug!(
        "Blocked instances: {:?}",
        blocked
            .iter()
            .map(|i| i.to_string().replace(".", "^"))
            .collect::<Vec<_>>()
    );

    // Discover candidates from every root instance, dropping blocked and blacklisted hosts.
    let included = DashSet::new();
    select_all(root_instances.map(|instance| {
        let this_config = instance_config_overrides
            .get(&instance.hostname)
            .unwrap_or(default_instance_config);
        let client = cmap.get_or_insert_with(instance.hostname.clone(), || {
            http::misskey::MisskeyClient::new(
                http::new_safe_client().expect("Failed to create client"),
                Arc::new(MisskeyGovernor::new(this_config.rpm.unwrap_or(default_rpm))),
                instance.hostname.clone(),
            )
        });
        Box::pin(
            do_instance_discovery(client, false)
                .map_ok(|mut instances| {
                    instances
                        .retain(|i| !blocked.contains(&i.host) && !config.match_blacklist(&i.host));
                    instances
                })
                .inspect_err(|e| {
                    log::error!("Failed to fetch instance {}: {:?}", instance.hostname, e)
                }),
        )
    }))
    .try_for_each_concurrent(4, |instances| async {
        for instance in instances {
            included.insert(instance.clone());
        }
        Ok(())
    })
    .await
    .expect("Failed to fetch instances");
    log::info!("Found {} instances", included.len());

    let mut output_file = File::create(options.output).expect("Failed to create output file");
    write!(output_file, "[").expect("Failed to write to output file");
    // The flag records whether the next element is the first one, so that elements can be
    // separated by commas and the file ends up as a valid JSON array.
    let output_state = Mutex::new((BufWriter::new(output_file), true));
    let output_state_ref = &output_state;
    stream::iter(included)
        .filter_map(|inst| async {
            let host = inst.host.clone();
            let this_config = instance_config_overrides
                .get(&host)
                .unwrap_or(default_instance_config);
            let client = cmap.get_or_insert_with(host.clone(), || {
                http::misskey::MisskeyClient::new(
                    http::new_safe_client().expect("Failed to create client"),
                    Arc::new(MisskeyGovernor::new(this_config.rpm.unwrap_or(default_rpm))),
                    host.clone(),
                )
            });
            match compute_instance_info(client, inst).await {
                Ok(info) => Some((host, info)),
                Err(e) => {
                    log::error!("Instance {} is excluded: {:?}", host, e);
                    None
                }
            }
        })
        .for_each(|(host, mut info)| async move {
            info.url = host; // prevent fake URLs
            let mut state = output_state_ref.lock().await;
            let (writer, is_first) = &mut *state;
            if !*is_first {
                write!(writer, ",").expect("Failed to write to output file");
            }
            *is_first = false;
            // Write through the buffered writer rather than the raw file underneath it.
            serde_json::to_writer(&mut *writer, &info).expect("Failed to write to output file");
        })
        .await;
    let (mut output_file, _) = output_state.into_inner();
    write!(output_file, "]").expect("Failed to write to output file");
    output_file.flush().expect("Failed to flush output file");
    log::info!("Done");
}

77
src/logic/analysis.rs Normal file
View file

@ -0,0 +1,77 @@
use std::future::Future;
use reqwest::StatusCode;
use crate::{
http::{
misskey::{self, FederationInstance, MisskeyClient, REQUIRED_ENDPOINTS},
JsonHTTPError,
},
model::{ExcludeReason, InstanceInfo},
};
/// Returns `true` when any whitespace-separated word of `s` is `nobot` or `no-bot`,
/// optionally prefixed with a single `#`, compared ASCII case-insensitively.
fn has_nobot_tag(s: &str) -> bool {
    for word in s.split_whitespace() {
        // Drop at most one leading '#', so "#nobot" matches but "##nobot" does not.
        let tag = word.strip_prefix('#').unwrap_or(word);
        if tag.eq_ignore_ascii_case("nobot") || tag.eq_ignore_ascii_case("no-bot") {
            return true;
        }
    }
    false
}
/// Case-sensitive heuristic: `true` when the name starts with `miss`, ends with `key`,
/// `ski` or `fish`, or contains `misskey` anywhere.
fn potentially_misskey(software_name: &str) -> bool {
    const SUFFIXES: [&str; 3] = ["key", "ski", "fish"];
    if software_name.starts_with("miss") {
        return true;
    }
    if SUFFIXES.iter().any(|&suffix| software_name.ends_with(suffix)) {
        return true;
    }
    software_name.contains("misskey")
}
/// Decides whether an instance found during discovery qualifies for the listing.
///
/// Checks, in order: a no-bot tag in the name or description, the software-name
/// heuristic, and the presence of every required endpoint in the instance's endpoint
/// listing. The first failing check yields the corresponding [`ExcludeReason`].
/// The analysis after these checks is not implemented yet (`todo!()`).
pub fn compute_instance_info<'c, T: misskey::DirectThrottler + 'c>(
    client: MisskeyClient<'c, T>,
    master_info: FederationInstance,
) -> impl Future<Output = Result<InstanceInfo, ExcludeReason>> + Send + 'c {
    async move {
        let opted_out = [
            master_info.name.as_deref(),
            master_info.description.as_deref(),
        ]
        .into_iter()
        .flatten()
        .any(has_nobot_tag);
        if opted_out {
            log::info!("Excluding instance {} due to no-bot tag", master_info.host);
            return Err(ExcludeReason::NoBotIndicated);
        }

        let looks_like_misskey = master_info
            .software_name
            .as_deref()
            .is_some_and(potentially_misskey);
        if !looks_like_misskey {
            log::info!(
                "Excluding instance {} due to non-Misskey software name",
                master_info.host
            );
            return Err(ExcludeReason::UnknownSoftwareName);
        }

        // probe API
        let endpoints = match client.endpoints().await {
            Ok(endpoints) => endpoints,
            Err(JsonHTTPError::UnexpectedCode(StatusCode::GONE)) => {
                return Err(ExcludeReason::InstanceGone)
            }
            Err(JsonHTTPError::UnexpectedCode(_)) => {
                return Err(ExcludeReason::UnknownSoftwareName)
            }
            Err(e) => {
                log::error!("Failed to probe instance {}: {:?}", master_info.host, e);
                return Err(ExcludeReason::NetworkError);
            }
        };

        let missing_endpoint = REQUIRED_ENDPOINTS
            .iter()
            .any(|&required| !endpoints.contains(required));
        if missing_endpoint {
            log::info!(
                "Excluding instance {} due to missing endpoints",
                master_info.host
            );
            return Err(ExcludeReason::APINotSupported);
        }

        todo!()
    }
}

38
src/logic/discovery.rs Normal file
View file

@ -0,0 +1,38 @@
use futures::{stream, Stream, StreamExt};
use crate::http::{
misskey::{self, FederationInstance, MisskeyClient},
JsonHTTPError,
};
/// Number of instances requested per page; also the step between page offsets.
const PAGE_SIZE: u64 = 50;
pub fn do_instance_discovery<'c, T: misskey::DirectThrottler + 'c>(
client: MisskeyClient<'c, T>,
blocked: bool,
) -> impl Stream<Item = Result<Vec<FederationInstance>, JsonHTTPError>> + 'c {
let base_url = client.base_url().to_string();
stream::iter((0..).step_by(PAGE_SIZE as usize))
.then(move |offset| {
let options = misskey::ListFederationInstancesOptions {
blocked: Some(blocked),
limit: Some(PAGE_SIZE),
offset: Some(offset),
..Default::default()
};
let client = client.clone();
async move {
let instances = client.federation_instances(&options).await?;
Ok(instances)
}
})
.take_while(move |instances| match instances {
Ok(instances) => futures::future::ready(!instances.is_empty()),
Err(e) => {
log::error!("Failed to fetch instances from {}: {:?}", base_url, e);
futures::future::ready(false)
}
})
}

2
src/logic/mod.rs Normal file
View file

@ -0,0 +1,2 @@
pub mod analysis;
pub mod discovery;

20
src/main.rs Normal file
View file

@ -0,0 +1,20 @@
use clap::Parser;
use yume_taishikan::RunOptions;
// Command-line options of the binary.
// Plain `//` comments are used on purpose: clap turns doc comments into help text,
// which would change the program's output.
#[derive(Parser)]
struct Opts {
// Path of the output file; `main` passes it on as `RunOptions::output`.
#[clap(short, long)]
output: String,
}
#[tokio::main]
async fn main() {
let opts = Opts::parse();
let runopts = RunOptions {
output: opts.output.into(),
..Default::default()
};
yume_taishikan::run(runopts).await;
}

342
src/model.rs Normal file
View file

@ -0,0 +1,342 @@
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
/// A value paired with an optional ETag string.
#[derive(Deserialize, Serialize, Debug)]
pub struct ETagged<T> {
/// ETag associated with `data`, if any.
pub etag: Option<String>,
/// The wrapped value.
pub data: T,
}
/// Reason why an instance is left out of the generated listing.
#[derive(Deserialize, Serialize, Debug, Clone)]
pub enum ExcludeReason {
/// The instance name or description carries a no-bot tag.
NoBotIndicated,
/// The advertised software name does not look like Misskey, or the API probe returned an unexpected status.
UnknownSoftwareName,
/// The API probe answered with HTTP 410 Gone.
InstanceGone,
/// The API probe failed for any other reason.
NetworkError,
/// A required API endpoint is missing.
APINotSupported,
SuspiciousStats,
// Vulnerable - not shown at all
// Blacklisted - not shown at all
MissingSourceUrl,
FailedLocatingSource,
SourceTooOld,
}
/// Extra per-instance data stored under `InstanceInfo::yume_extra` (camelCase keys in JSON).
#[derive(Deserialize, Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct YumeExtraInstanceInfo {
// NOTE(review): presumably the time of the last commit in the instance's source repository — confirm.
git_last_commit: DateTime<Utc>,
// NOTE(review): presumably the last time the instance answered a request — confirm.
last_reachable: DateTime<Utc>,
// Individual terms from which the ranking score is computed.
score_terms: ScoreTerms,
}
/// Inputs of the ranking score together with their transformed values.
/// Fields ending in `_trans` are additive terms; fields ending in `_trans_mult`
/// are multipliers (see `ScoreTerms::new` and `ScoreTerms::score`).
#[derive(Deserialize, Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct ScoreTerms {
pub original_users_count: u64,
/// `log10(original_users_count)`.
pub original_users_count_trans: f64,
pub original_notes_count: u64,
/// `original_notes_count / original_users_count`.
pub original_notes_over_users: f64,
/// `log2(original_notes_over_users)`.
pub original_notes_over_users_trans: f64,
pub npd15: f64,
/// `log2(npd15)`.
pub npd15_trans: f64,
pub open_registrations: bool,
/// `2.0` when registrations are open, else `0.0`.
pub open_registrations_trans: f64,
pub email_required_for_signup: bool,
/// `0.4` when an e-mail is required for sign-up, else `0.0`.
pub email_required_for_signup_trans: f64,
pub has_rules: bool,
/// `5.0` when the instance has rules, else `0.0`.
pub has_rules_trans: f64,
pub obsolete: bool,
/// `0.5` when obsolete, else `1.0`.
pub obsolete_trans_mult: f64,
pub missing_icon: bool,
/// `0.75` when the icon is missing, else `1.0`.
pub missing_icon_trans_mult: f64,
}
impl ScoreTerms {
    /// Computes every score term from the raw inputs.
    ///
    /// Counts are transformed logarithmically, boolean bonuses become fixed additive
    /// values, and boolean penalties become multipliers. No input validation is done:
    /// zero counts produce non-finite terms.
    pub fn new(
        original_users_count: u64,
        original_notes_count: u64,
        npd15: f64,
        open_registrations: bool,
        email_required_for_signup: bool,
        has_rules: bool,
        obsolete: bool,
        missing_icon: bool,
    ) -> Self {
        let users = original_users_count as f64;
        let notes_per_user = original_notes_count as f64 / users;
        // Additive bonus that applies only when the flag is set.
        let bonus = |flag: bool, value: f64| if flag { value } else { 0.0 };
        // Multiplier that applies only when the flag is set (neutral otherwise).
        let penalty = |flag: bool, factor: f64| if flag { factor } else { 1.0 };

        Self {
            original_users_count,
            original_users_count_trans: users.log10(),
            original_notes_count,
            original_notes_over_users: notes_per_user,
            original_notes_over_users_trans: notes_per_user.log2(),
            npd15,
            npd15_trans: npd15.log2(),
            open_registrations,
            open_registrations_trans: bonus(open_registrations, 2.0),
            email_required_for_signup,
            email_required_for_signup_trans: bonus(email_required_for_signup, 0.4),
            has_rules,
            has_rules_trans: bonus(has_rules, 5.0),
            obsolete,
            obsolete_trans_mult: penalty(obsolete, 0.5),
            missing_icon,
            missing_icon_trans_mult: penalty(missing_icon, 0.75),
        }
    }

    /// Returns the final score: the sum of all additive terms, scaled by both multipliers.
    pub fn score(&self) -> f64 {
        let base = self.original_users_count_trans
            + self.original_notes_over_users_trans
            + self.npd15_trans
            + self.open_registrations_trans
            + self.email_required_for_signup_trans
            + self.has_rules_trans;
        base * self.obsolete_trans_mult * self.missing_icon_trans_mult
    }
}
/// One entry of the generated instance listing (camelCase keys in JSON).
#[derive(Deserialize, Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct InstanceInfo {
/// Extra data specific to this generator.
pub yume_extra: YumeExtraInstanceInfo,
/// Overwritten by `run` with the host the instance was discovered under.
pub url: String,
pub langs: Vec<String>,
// NOTE(review): presumably the ranking score (the README calls `value` the ranking field) — confirm.
pub value: f64,
pub meta: InstanceMeta,
pub nodeinfo: NodeInfo,
pub stats: Stats,
pub npd15: f64,
pub name: String,
pub description: String,
pub is_alive: bool,
pub repo: String,
pub background: bool,
pub icon: bool,
pub banner: bool,
}
/// Instance statistics as stored in the listing (camelCase keys in JSON).
#[derive(Deserialize, Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct Stats {
pub notes_count: i64,
pub original_notes_count: i64,
pub users_count: i64,
pub original_users_count: i64,
pub reactions_count: i64,
pub instances: i64,
pub drive_usage_local: i64,
pub drive_usage_remote: i64,
}
/// NodeInfo document of an instance; JSON keys are the camelCase forms of the field names.
#[derive(Deserialize, Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct NodeInfo {
    pub version: String,
    pub software: Software,
    pub protocols: Vec<String>,
    pub services: Services,
    pub open_registrations: bool,
    pub usage: Usage,
    pub metadata: NodeInfoMetadata,
}
/// The `metadata` object of a NodeInfo document.
///
/// JSON keys are the camelCase forms of the field names. Deriving the keys from the
/// field names (instead of a hand-written rename per field) also corrects
/// `email_required_for_signup`, which was previously mapped to the unrelated key
/// `disableRegistrationReason`.
#[derive(Deserialize, Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct NodeInfoMetadata {
    pub node_name: String,
    pub node_description: String,
    pub node_admins: Vec<NodeInfoPerson>,
    pub maintainer: NodeInfoPerson,
    pub langs: Vec<String>,
    pub tos_url: String,
    pub privacy_policy_url: Option<String>,
    pub impressum_url: Option<String>,
    pub repository_url: String,
    pub feedback_url: String,
    pub disable_registration: bool,
    pub disable_local_timeline: bool,
    pub disable_global_timeline: bool,
    /// JSON key `emailRequiredForSignup`.
    pub email_required_for_signup: bool,
    pub enable_hcaptcha: bool,
    pub enable_recaptcha: bool,
    pub enable_mcaptcha: bool,
    pub enable_turnstile: bool,
    pub max_note_text_length: i64,
    pub enable_email: bool,
    pub enable_service_worker: bool,
    pub proxy_account_name: String,
    pub theme_color: String,
}
/// A person entry in NodeInfo metadata (used for admins and the maintainer).
#[derive(Deserialize, Serialize, Debug)]
pub struct NodeInfoPerson {
pub name: String,
pub email: Option<String>,
}
/// The `software` object of a NodeInfo document; only `name` is required.
#[derive(Deserialize, Serialize, Debug)]
pub struct Software {
pub name: String,
pub version: Option<String>,
pub homepage: Option<String>,
pub repository: Option<String>,
}
/// The `services` object of a NodeInfo document.
#[derive(Deserialize, Serialize, Debug)]
pub struct Services {
pub inbound: Vec<String>,
pub outbound: Vec<String>,
}
/// The `usage` object of a NodeInfo document (camelCase keys in JSON).
#[derive(Deserialize, Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct Usage {
pub users: Users,
pub local_posts: i64,
pub local_comments: i64,
}
/// User counts inside `Usage` (camelCase keys in JSON); the activity counts are optional.
#[derive(Deserialize, Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct Users {
pub total: i64,
pub active_halfyear: Option<i64>,
pub active_month: Option<i64>,
}
/// Instance metadata stored under `InstanceInfo::meta` (camelCase keys in JSON).
// NOTE(review): presumably mirrors the instance's `/api/meta` response — confirm. Every
// non-`Option` field must be present for deserialization to succeed.
#[derive(Deserialize, Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct InstanceMeta {
pub maintainer_name: String,
pub version: String,
pub name: String,
pub short_name: Option<String>,
pub uri: String,
pub description: String,
pub langs: Vec<String>,
pub tos_url: String,
pub repository_url: String,
pub feedback_url: String,
pub impressum_url: Option<String>,
pub privacy_policy_url: Option<String>,
pub disable_registration: bool,
pub email_required_for_signup: bool,
pub mcaptcha_instance_url: Option<String>,
pub sw_publickey: String,
pub theme_color: String,
pub mascot_image_url: String,
pub banner_url: String,
pub info_image_url: Option<String>,
pub server_error_image_url: Option<String>,
pub not_found_image_url: Option<String>,
pub icon_url: Option<String>,
pub background_image_url: Option<String>,
pub logo_image_url: Option<String>,
pub max_note_text_length: i64,
pub default_light_theme: Option<String>,
pub default_dark_theme: Option<String>,
pub well_known_websites: Vec<String>,
pub notes_per_one_ad: i64,
pub enable_email: bool,
pub enable_service_worker: bool,
pub translator_available: bool,
pub server_rules: Vec<String>,
pub policies: Policies,
pub media_proxy: String,
pub enable_url_preview: bool,
pub enable_skeb_status: bool,
pub cache_remote_files: bool,
pub cache_remote_sensitive_files: bool,
pub require_setup: bool,
pub proxy_account_name: String,
pub features: Features,
}
/// The `policies` object of `InstanceMeta` (camelCase keys in JSON).
// NOTE(review): every field is required here; an instance omitting any of them would fail
// to deserialize — confirm all supported software reports the full set.
#[derive(Deserialize, Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct Policies {
pub gtl_available: bool,
pub ltl_available: bool,
pub can_public_note: bool,
pub can_initiate_conversation: bool,
pub can_create_content: bool,
pub can_update_content: bool,
pub can_delete_content: bool,
pub can_purge_account: bool,
pub can_update_avatar: bool,
pub can_update_banner: bool,
pub mention_limit: i64,
pub can_invite: bool,
pub invite_limit: i64,
pub invite_limit_cycle: i64,
pub invite_expiration_time: i64,
pub can_manage_custom_emojis: bool,
pub can_manage_avatar_decorations: bool,
pub can_search_notes: bool,
pub can_use_translator: bool,
pub can_use_drive_file_in_sound_settings: bool,
pub can_use_reaction: bool,
pub can_hide_ads: bool,
pub drive_capacity_mb: i64,
pub always_mark_nsfw: bool,
pub skip_nsfw_detection: bool,
pub pin_limit: i64,
pub antenna_limit: i64,
pub antenna_notes_limit: i64,
pub word_mute_limit: i64,
pub webhook_limit: i64,
pub clip_limit: i64,
pub note_each_clips_limit: i64,
pub user_list_limit: i64,
pub user_each_user_lists_limit: i64,
pub rate_limit_factor: i64,
pub avatar_decoration_limit: i64,
pub mutual_link_section_limit: i64,
pub mutual_link_limit: i64,
}
/// The `features` object of `InstanceMeta` (camelCase keys in JSON).
/// The captcha flags exist in two spellings that map to distinct JSON keys,
/// e.g. `h_captcha` -> `hCaptcha` and `hcaptcha` -> `hcaptcha`.
#[derive(Deserialize, Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct Features {
pub local_timeline: bool,
pub global_timeline: bool,
pub registration: bool,
pub email_required_for_signup: bool,
pub h_captcha: bool,
pub hcaptcha: bool,
pub m_captcha: bool,
pub mcaptcha: bool,
pub re_captcha: bool,
pub recaptcha: bool,
pub turnstile: bool,
pub object_storage: bool,
pub service_worker: bool,
pub miauth: bool,
}