From 6deea63497263cc73a528d8a14de800798584957 Mon Sep 17 00:00:00 2001 From: Danilo Reyes Date: Thu, 21 Aug 2025 18:46:37 -0600 Subject: [PATCH] download_rust init --- src/download_rust/Cargo.toml | 31 ++ src/download_rust/src/args.rs | 97 ++++++ src/download_rust/src/config.rs | 44 +++ src/download_rust/src/download.rs | 474 +++++++++++++++++++++++++++++ src/download_rust/src/functions.rs | 127 ++++++++ src/download_rust/src/gallery.rs | 166 ++++++++++ src/download_rust/src/main.rs | 33 ++ src/download_rust/src/user.rs | 246 +++++++++++++++ 8 files changed, 1218 insertions(+) create mode 100644 src/download_rust/Cargo.toml create mode 100644 src/download_rust/src/args.rs create mode 100644 src/download_rust/src/config.rs create mode 100644 src/download_rust/src/download.rs create mode 100644 src/download_rust/src/functions.rs create mode 100644 src/download_rust/src/gallery.rs create mode 100644 src/download_rust/src/main.rs create mode 100644 src/download_rust/src/user.rs diff --git a/src/download_rust/Cargo.toml b/src/download_rust/Cargo.toml new file mode 100644 index 0000000..c6b4f7c --- /dev/null +++ b/src/download_rust/Cargo.toml @@ -0,0 +1,31 @@ +##! Cargo.toml +# +# This Cargo manifest defines the Rust version of the jawz download +# manager. It exposes a single binary named `rust_downloader` and +# pulls in a handful of third‐party crates to mirror the features of +# the original Python implementation. The chosen dependencies +# provide command line parsing (clap), configuration loading +# (serde/serde_yaml), regular expressions (regex), home directory +# discovery (dirs), shuffling (rand), logging (log/env_logger) and +# convenient error handling (anyhow). Versions are pegged to +# relatively conservative releases so the project will build cleanly +# against the NixOS 25.05 channel. 
+ +[package] +name = "rust_downloader" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +clap = { version = "4.4", features = ["derive"] } +serde = { version = "1.0", features = ["derive"] } +serde_yaml = "0.9" +regex = "1.10" +dirs = "5.0" +rand = { version = "0.8", features = ["std", "small_rng"] } +log = "0.4" +env_logger = "0.10" +once_cell = "1.17" +anyhow = "1.0" \ No newline at end of file diff --git a/src/download_rust/src/args.rs b/src/download_rust/src/args.rs new file mode 100644 index 0000000..5b831f6 --- /dev/null +++ b/src/download_rust/src/args.rs @@ -0,0 +1,97 @@ +//! Command line argument definitions. +//! +//! This module defines the [`Cli`] structure which holds the +//! command line arguments accepted by the application. It uses +//! [`clap`] derive macros to declare flags, options and positional +//! parameters and enforces compile time validation for the +//! enumerated scrapper types. A `--dry-run` flag has been added to +//! allow simulating the run without executing any external +//! commands. + +use clap::{ArgAction, Parser, ValueEnum}; + +/// The supported scraper categories. These values mirror the +/// behaviour of the original Python script and are used to select +/// which downstream logic executes. See `scrapper_manager` in +/// [`crate::download`] for details. +#[derive(ValueEnum, Clone, Debug, PartialEq, Eq)] +pub enum Scrapper { + /// Download from the user's push list + Push, + /// Download from the user's main list + Main, + /// Download from the user's Instagram list + Instagram, + /// Download from the user's Kemono list + Kemono, + /// Download from the comic list + Comic, + /// Download from the manga list + Manga, + /// Download from the webcomic list + Webcomic, +} + +/// Command line interface for the downloader. Deriving [`Parser`] +/// automatically generates argument parsing and help output. 
The +/// fields directly correspond to the command line options of the +/// original Python implementation with a few refinements: +/// +/// * `scrapper` is optional and uses the [`Scrapper`] enum for +/// compile time validation. +/// * `user` defaults to "everyone" when omitted. +/// * `input` accepts one or more strings and may be specified +/// multiple times on the command line. +/// * Flags use Rust style booleans rather than inverted names; `no +/// archive` and `no skip` have been inverted into `flag_archive` +/// and `flag_skip` with sensible defaults. +/// * A `--dry-run` flag has been introduced to simulate execution +/// without running external commands. +/// * Instagram `post_type` values default to all supported types. +#[derive(Parser, Debug, Clone)] +#[command(name = "Downloader", about = "Download images, galleries and videos from a wide array of websites.")] +pub struct Cli { + /// Selects the scraper to use. When omitted the program + /// interprets input links via `--input` instead. + #[arg(value_enum, index = 1)] + pub scrapper: Option, + + /// Selects the personal user list to process. Defaults to + /// "everyone" which processes all configured users. + #[arg(short = 'u', long = "user", default_value = "everyone")] + pub user: String, + + /// Downloads the provided links immediately instead of using a + /// preconfigured list. May be specified multiple times. + #[arg(short = 'i', long = "input", num_args = 1.., action = ArgAction::Append)] + pub input: Vec, + + /// Prints a numbered list of links and prompts for a selection. + #[arg(short = 'l', long = "list", action = ArgAction::SetTrue, default_value_t = false)] + pub flag_list: bool, + + /// Enables archiving of downloads to prevent duplicates. Use + /// `--no-archive` to disable. + #[arg(long = "no-archive", action = ArgAction::SetFalse, default_value_t = true)] + pub flag_archive: bool, + + /// Skips already downloaded items when true. Use `--no-skip` + /// to download entire galleries. 
+ #[arg(long = "no-skip", action = ArgAction::SetFalse, default_value_t = true)] + pub flag_skip: bool, + + /// Prints generated commands in addition to executing them. + #[arg(short = 'v', long = "verbose", action = ArgAction::SetTrue, default_value_t = false)] + pub flag_verbose: bool, + + /// Performs a dry run. Commands will be printed but never + /// executed. This flag takes precedence over `--verbose`. + #[arg(long = "dry-run", action = ArgAction::SetTrue, default_value_t = false)] + pub flag_dry_run: bool, + + /// Filters Instagram posts by type. When multiple values are + /// provided they will be joined by commas. The default + /// includes all supported types. + #[arg(short = 't', long = "type-post", num_args = 1.., action = ArgAction::Append, default_values_t = vec![String::from("posts"), String::from("reels"), String::from("stories"), String::from("highlights"), String::from("avatar")])] + pub post_type: Vec, +} \ No newline at end of file diff --git a/src/download_rust/src/config.rs b/src/download_rust/src/config.rs new file mode 100644 index 0000000..b113fba --- /dev/null +++ b/src/download_rust/src/config.rs @@ -0,0 +1,44 @@ +//! Configuration handling. +//! +//! This module is responsible for loading the YAML configuration +//! expected by the downloader. The configuration is read from +//! `~/.config/jawz/config.yaml` and deserialised into a +//! [`serde_yaml::Value`]. Consumers can then index into the value +//! to pull out fields as needed. A global constant is not used in +//! order to avoid surprises during testing and to make error +//! propagation explicit. + +use anyhow::{anyhow, Context, Result}; +use dirs::home_dir; +use serde_yaml::Value; +use std::fs; +use std::path::PathBuf; + +/// Loads the configuration file from the user's home directory. The +/// expected location is `$HOME/.config/jawz/config.yaml`. If the +/// file cannot be read or parsed a descriptive error is returned. 
+pub fn load_config_variables() -> Result { + let home = home_dir().ok_or_else(|| anyhow!("Could not determine home directory"))?; + let path: PathBuf = [home.to_str().unwrap_or(""), ".config/jawz/config.yaml"] + .iter() + .collect::(); + let content = fs::read_to_string(&path) + .with_context(|| format!("Failed to read configuration file from {}", path.display()))?; + let cfg: Value = serde_yaml::from_str(&content) + .with_context(|| format!("Failed to parse YAML in {}", path.display()))?; + Ok(cfg) +} + +/// Finds the index of a user by name. Returns `None` if no match +/// exists or if the configuration does not contain a `users` list. +pub fn get_user_index(name: &str, cfg: &Value) -> Option { + cfg.get("users")?.as_sequence()?.iter().enumerate().find_map(|(i, user)| { + let map = user.as_mapping()?; + let n = map.get(&Value::String("name".into()))?.as_str()?; + if n.eq_ignore_ascii_case(name) { + Some(i) + } else { + None + } + }) +} \ No newline at end of file diff --git a/src/download_rust/src/download.rs b/src/download_rust/src/download.rs new file mode 100644 index 0000000..95a1b23 --- /dev/null +++ b/src/download_rust/src/download.rs @@ -0,0 +1,474 @@ +//! High level download orchestration. +//! +//! This module coordinates the various helper modules to mirror the +//! behaviour of the original Python downloader. It exposes a +//! `run` function which is called from `main.rs` with the parsed +//! command line arguments and the loaded configuration. Where +//! possible iterators and guard clauses replace explicit loops to +//! improve clarity. 
+ +use crate::args::{Cli, Scrapper}; +use crate::config::{get_user_index, load_config_variables}; +use crate::functions::{append_line, list_lines, parse_link, quote, run}; +use crate::gallery::Gallery; +use crate::user::User; +use anyhow::{anyhow, Context, Result}; +use log::{debug, info}; +use rand::seq::SliceRandom; +use regex::Regex; +use serde_yaml::Value; +use std::fs; +use std::io::{self, Write}; +use std::path::Path; + +/// A simple struct representing a video download. It collects +/// command line arguments required to build a `yt-dlp` or +/// `stream-dl` command. The `dest` and `database` fields should be +/// prequoted. +#[derive(Default, Debug, Clone)] +struct Video { + use_archive: bool, + link: String, + dest: String, + database: String, +} + +/// Constructs the `-o include=...` argument for Instagram links. +/// When the provided link does not contain "instagram" an empty +/// string is returned. When multiple post types are supplied they +/// are joined with commas. +fn parse_instagram(link: &str, cli: &Cli) -> String { + if !link.contains("instagram") { + return String::new(); + } + if cli.post_type.is_empty() { + return String::new(); + } + let joined = if cli.post_type.len() > 1 { + cli.post_type.join(",") + } else { + cli.post_type.first().cloned().unwrap_or_default() + }; + format!(" -o include={}", quote(&joined)) +} + +/// Builds a command string for video downloads. The logic mirrors +/// the original Python `video_command` function. See the source +/// comments for more details. Logging of the command and link is +/// performed at the call site. +fn video_command(video: &Video) -> String { + let rgx_yt = Regex::new(r"https://(?:www\.)?youtube|https://youtu.be").expect("invalid regex"); + let rgx_music = Regex::new(r"https://music\.youtube.*").expect("invalid regex"); + // Handle special case for chaturbate: use stream-dl on the last + // path component only. 
+ if video.link.contains("chaturbate") { + let slug = video + .link + .trim_end_matches('/') + .rsplit('/') + .next() + .unwrap_or(""); + return format!("stream-dl {}", slug); + } + let mut command = String::from("yt-dlp"); + if rgx_yt.is_match(&video.link) { + command.push_str(" --embed-subs --embed-thumbnail"); + command.push_str(" --embed-metadata --embed-chapters"); + command.push_str(&format!(" -o {}", quote(&(video.dest.clone() + "/%(title)s.%(ext)s")))); + } else if rgx_music.is_match(&video.link) { + if video.use_archive { + command.push_str(&format!(" --download-archive {}", video.database)); + } + command.push_str(" --no-playlist --newline -x"); + command.push_str(" --audio-format best --add-metadata --audio-quality 0 -o"); + command.push_str(&format!(" {}", quote(&(video.dest.clone() + "/%(title)s.%(ext)s")))); + } else { + command.push_str(&format!(" -f mp4 -o {}", quote(&(video.dest.clone() + "/%(title)s.%(ext)s")))); + } + format!("{} {}", command, quote(&video.link)) +} + +/// Processes a gallery list (main, instagram or kemono) for a single +/// user. Builds and executes the appropriate `gallery-dl` command. +fn parse_gallery(list_name: &str, user: &User, cli: &Cli, cfg: &Value) -> Result<()> { + let mut gallery = Gallery::default(); + gallery.archive = cli.flag_archive; + // If skip is disabled (`flag_skip` false) then we enable skip + // through an option on gallery-dl. Otherwise we leave it empty. + gallery.skip_arg = if cli.flag_skip { String::new() } else { " -o skip=true".to_string() }; + gallery.dest = Some("download".to_string()); + gallery.list = Some(list_name.to_string()); + gallery.opt_args = parse_instagram(list_name, cli); + gallery.generate_command(Some(user), cfg, false)?; + gallery.run_command(cli.flag_dry_run, cli.flag_verbose)?; + Ok(()) +} + +/// Downloads manga or comics based on the provided category. The +/// `skip_arg` string contains the chapter range options and is +/// assembled by the caller. 
Only lines matching the category are +/// downloaded. +fn comic_manager(skip_arg: &str, category: &Scrapper, cfg: &Value, cli: &Cli) -> Result<()> { + let comic = cfg + .get("comic") + .ok_or_else(|| anyhow!("Missing 'comic' section in configuration"))?; + let list_path = comic + .get("comic-list") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow!("Missing 'comic-list' in comic configuration"))?; + let content = fs::read_to_string(list_path) + .with_context(|| format!("Failed to read comic list from {}", list_path))?; + let pattern = match category { + Scrapper::Manga => "manga|webtoon", + Scrapper::Comic => "readcomiconline", + _ => return Err(anyhow!("Invalid category for comic manager")), + }; + let re_cat = Regex::new(pattern).expect("Failed to compile comic regex"); + content + .lines() + .filter(|line| re_cat.is_match(line)) + .map(str::to_string) + .try_for_each(|link| { + let mut gallery = Gallery::default(); + gallery.archive = cli.flag_archive; + gallery.skip_arg = skip_arg.to_string(); + gallery.link = Some(link.trim().to_string()); + // Generate a comic command; pass `None` for user since + // comic downloads resolve their own directories + gallery.generate_command(None, cfg, true)?; + gallery.run_command(cli.flag_dry_run, cli.flag_verbose)?; + // Save the comic link to the master list + save_comic(&link, cfg)?; + Ok::<(), anyhow::Error>(()) + })?; + Ok(()) +} + +/// Prints a numbered list of webcomics to stdout and returns the +/// selected index. The user is prompted via stdin. Errors during +/// parsing or invalid selections are propagated. 
+fn print_webcomics(webcomics: &Value) -> Result { + let list = webcomics + .get("webcomics") + .and_then(|v| v.as_sequence()) + .ok_or_else(|| anyhow!("webcomic-list missing 'webcomics' array"))?; + for (index, entry) in list.iter().enumerate() { + let name = entry + .get("name") + .and_then(|v| v.as_str()) + .unwrap_or(""); + println!("{}", list_lines(index, name)); + } + print!("Select a webcomic: "); + io::stdout().flush()?; + let mut input = String::new(); + io::stdin().read_line(&mut input)?; + let choice: usize = input.trim().parse()?; + if choice >= list.len() { + return Err(anyhow!("Invalid selection {}", choice)); + } + Ok(choice) +} + +/// Handles the webcomic download flow. The configuration file +/// referenced by `comic.webcomic-list` is parsed and the user is +/// prompted to choose which webcomic to download. A `webcomix` +/// command is then assembled and executed. +fn webcomic_manager(cfg: &Value, cli: &Cli) -> Result<()> { + let comic = cfg + .get("comic") + .ok_or_else(|| anyhow!("Missing 'comic' section in configuration"))?; + let list_path = comic + .get("webcomic-list") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow!("Missing 'webcomic-list' in comic configuration"))?; + let webcomics_yaml: Value = serde_yaml::from_str(&fs::read_to_string(list_path)?) 
+ .with_context(|| format!("Failed to parse webcomic list at {}", list_path))?; + let idx = print_webcomics(&webcomics_yaml)?; + let list = webcomics_yaml + .get("webcomics") + .and_then(|v| v.as_sequence()) + .ok_or_else(|| anyhow!("webcomic-list missing 'webcomics' array"))?; + let entry = list.get(idx).ok_or_else(|| anyhow!("Invalid webcomic index"))?; + let rating = entry + .get("type") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow!("Missing 'type' in webcomic entry"))?; + let global = webcomics_yaml + .get("global") + .and_then(|v| v.as_mapping()) + .ok_or_else(|| anyhow!("Webcomic list missing 'global' section"))?; + let dest_key = format!("{}_directory", rating); + let dest = global + .get(&Value::String(dest_key.clone())) + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow!("Missing '{}' in webcomic global", dest_key))?; + let name = entry + .get("name") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow!("Missing 'name' in webcomic entry"))?; + let link = entry + .get("url") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow!("Missing 'url' in webcomic entry"))?; + let nxt_code = entry + .get("next_code") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow!("Missing 'next_code' in webcomic entry"))?; + let img_code = entry + .get("image_code") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow!("Missing 'image_code' in webcomic entry"))?; + info!("The webcomic is {}", dest); + let mut command = format!("cd {} && webcomix custom {}", quote(dest), quote(name)); + command.push_str(" --start-url "); + command.push_str("e(link)); + command.push_str(&format!(" --next-page-xpath={}", quote(nxt_code))); + command.push_str(&format!(" --image-xpath={}", quote(img_code))); + command.push_str(" -y --cbz"); + run(&command, cli.flag_dry_run, cli.flag_verbose) +} + +/// Appends a comic or manga link to the global comic list if it is +/// not already present. Links are normalised via `parse_link` to +/// avoid duplicates. Logs a message when skipping duplicates. 
+fn save_comic(link: &str, cfg: &Value) -> Result<()> { + let comic = cfg + .get("comic") + .ok_or_else(|| anyhow!("Missing 'comic' section in configuration"))?; + let list_path = comic + .get("comic-list") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow!("Missing 'comic-list' in comic configuration"))?; + let content = fs::read_to_string(list_path).unwrap_or_default().to_lowercase(); + let fixed = parse_link(link).to_lowercase(); + if content.contains(&fixed) { + info!("Graphic novel repeated, not saving"); + return Ok(()); + } + info!("New graphic novel, saving"); + append_line(Path::new(list_path), link) +} + +/// Handles the push list for a user. Links are classified into +/// gallery, comic, video or other categories using regular +/// expressions. Each category is processed appropriately. After +/// processing the push list is truncated. Logging mirrors the +/// original script. +fn push_manager(user: &User, cli: &Cli, cfg: &Value) -> Result<()> { + // Regular expressions used to classify links. These patterns + // mirror the ones in the original Python implementation. 
+ let rgx_gallery = Regex::new( + r"(?x) + (x\.com/\w+((?=.*media)|(?!.*status))) + |(men\.wikifeet) + |(furaffinity\.net/user/) + |((deviantart\.com/\w+(?!.*/art/))) + |(furaffinity\.net/gallery/) + |(furaffinity\.net/scraps/) + |(furaffinity\.net/favorites/) + |(instagram.com(?!/p/)/\w+) + |(e621\.net((?=/post/)|(?!/posts/))) + |(flickr\.com/photos/\w+/(?!\d+)) + |(tumblr\.com(?!/post/)) + |(kemono\.party/(fanbox|gumroad|patreon)(?!/user/\d+/post)) + |(blogspot\.com(?!/)) + |(rule34\.paheal\.net/post/(?!view)) + |(rule34\.xxx/index\.php\?page=post&s=(?!view)) + |(pixiv\.net/(en/)?((?=users)|(?!artwork))) + |(fanbox\.cc/@\w+(?!.*posts/\d+)) + |(reddit\.com/(user|u)) + |(baraag\.net/((@\w+)|(?!/\d+))) + |(pinterest\.com/(?!pin/\d+)) + |(redgifs\.com/(users|u|(?!watch))) + |(bsky\.app/profile/(?!.*?/post/)) + ", + ) + .expect("Failed to compile gallery regex"); + let rgx_video = Regex::new(r"youtu\.be|youtube|pornhub|xtube|xvideos|chaturbate").expect("Failed to compile video regex"); + let rgx_comic = Regex::new(r"readcomiconline|mangahere|mangadex|webtoons|manganato").expect("Failed to compile comic regex"); + + // Read the push list into memory + let push_path = user + .lists + .get("push") + .ok_or_else(|| anyhow!("Push list missing for user {}", user.name))?; + let lines = fs::read_to_string(push_path).unwrap_or_default(); + // Temporary storage for categories + let mut links_galleries: Vec = Vec::new(); + let mut links_videos: Vec = Vec::new(); + let mut links_comics: Vec = Vec::new(); + let mut links_other: Vec = Vec::new(); + // Classify each link exactly once + for line in lines.lines().map(str::trim).filter(|l| !l.is_empty()) { + if rgx_gallery.is_match(line) { + links_galleries.push(line.to_string()); + } else if rgx_video.is_match(line) { + links_videos.push(line.to_string()); + } else if rgx_comic.is_match(line) { + links_comics.push(line.to_string()); + } else { + links_other.push(line.to_string()); + } + } + // Process gallery links + for link in 
&links_galleries { + let mut gallery = Gallery::default(); + gallery.archive = cli.flag_archive; + gallery.skip_arg = if cli.flag_skip { String::new() } else { " -o skip=true".to_string() }; + gallery.link = Some(parse_link(link)); + gallery.dest = Some("download".to_string()); + gallery.opt_args = parse_instagram(link, cli); + gallery.generate_command(Some(user), cfg, false)?; + gallery.run_command(cli.flag_dry_run, cli.flag_verbose)?; + // Save link into master list to prevent duplicates + user.save_link(link)?; + } + // Process comic links + for link in &links_comics { + let skip_arg = if !cli.flag_skip { + "".to_string() + } else if link.contains("readcomiconline") { + " --chapter-range 1".to_string() + } else { + " --chapter-range 1-5".to_string() + }; + let mut gallery = Gallery::default(); + gallery.archive = cli.flag_archive; + gallery.skip_arg = skip_arg; + gallery.link = Some(link.to_string()); + gallery.generate_command(None, cfg, true)?; + gallery.run_command(cli.flag_dry_run, cli.flag_verbose)?; + save_comic(link, cfg)?; + } + // Process video links + for link in &links_videos { + let mut video = Video::default(); + video.use_archive = cli.flag_archive; + video.link = link.to_string(); + // Use the media directory for the user + if let Some(media_dir) = user.directories.get("media") { + video.dest = media_dir.to_string_lossy().to_string(); + } else { + video.dest = String::new(); + } + video.database = quote( + user + .dbs + .get("media") + .map(|p| p.to_string_lossy()) + .unwrap_or_default() + .as_ref(), + ); + let cmd = video_command(&video); + info!("{} {}", cmd, link); + run(&cmd, cli.flag_dry_run, cli.flag_verbose)?; + } + // Process other links + for link in &links_other { + info!("Other type of download {}", link); + let mut gallery = Gallery::default(); + gallery.archive = false; + gallery.skip_arg = " -o directory='[]'".to_string(); + gallery.link = Some(link.to_string()); + gallery.dest = Some("push".to_string()); + 
gallery.generate_command(Some(user), cfg, false)?; + gallery.run_command(cli.flag_dry_run, cli.flag_verbose)?; + } + // Truncate the push list + fs::write(push_path, "")?; + Ok(()) +} + +/// Manages the selected scraper for a single user. Delegates to +/// specialised functions based on the scraper type. The user's +/// `list_manager` is invoked up front to prepare per‑site lists. +fn scrapper_manager(user: &User, scrapper: &Scrapper, cli: &Cli, cfg: &Value) -> Result<()> { + user.list_manager()?; + match scrapper { + Scrapper::Main => parse_gallery("main", user, cli, cfg), + Scrapper::Instagram => parse_gallery("instagram", user, cli, cfg), + Scrapper::Kemono => parse_gallery("kemono", user, cli, cfg), + Scrapper::Push => push_manager(user, cli, cfg), + Scrapper::Comic => { + let skip_arg = if cli.flag_skip { + " --chapter-range 1" + } else { + "" + }; + comic_manager(skip_arg, scrapper, cfg, cli) + } + Scrapper::Manga => { + let skip_arg = if cli.flag_skip { + " --chapter-range 1-5" + } else { + "" + }; + comic_manager(skip_arg, scrapper, cfg, cli) + } + Scrapper::Webcomic => webcomic_manager(cfg, cli), + } +} + +/// Invokes the selected scraper for every configured user. Only +/// scrapers which operate on per‑user lists (main, instagram, +/// kemono and push) are executed; others are skipped. +fn scrap_everyone(scrapper: &Scrapper, cli: &Cli, cfg: &Value) -> Result<()> { + let users = cfg + .get("users") + .and_then(|v| v.as_sequence()) + .ok_or_else(|| anyhow!("No users configured"))?; + for user_entry in users.iter() { + let name = user_entry + .get("name") + .and_then(|v| v.as_str()) + .unwrap_or(""); + let idx = get_user_index(name, cfg) + .ok_or_else(|| anyhow!("User '{}' missing from configuration", name))?; + let user = User::new(idx, cfg)?; + info!("Scrapping {:?} for {}", scrapper, name); + scrapper_manager(&user, scrapper, cli, cfg)?; + } + Ok(()) +} + +/// Entry point for the download module. 
Decides how to dispatch +/// based on the presence or absence of a scrapper argument and +/// whether direct input links were provided. This function is +/// designed to be called from `main`. +pub fn run(cli: Cli, cfg: Value) -> Result<()> { + if let Some(scrapper) = &cli.scrapper { + let is_shared = matches!(scrapper, Scrapper::Push | Scrapper::Main | Scrapper::Instagram | Scrapper::Kemono); + if cli.user.eq_ignore_ascii_case("everyone") && is_shared { + return scrap_everyone(scrapper, &cli, &cfg); + } + // Otherwise operate on a single user + let user_name = &cli.user; + let idx = get_user_index(user_name, &cfg) + .ok_or_else(|| anyhow!("Unknown user '{}'", user_name))?; + let user = User::new(idx, &cfg)?; + return scrapper_manager(&user, scrapper, &cli, &cfg); + } + // No scrapper provided, process input links if present + if !cli.input.is_empty() { + // Determine which user should handle the push list. When + // called as an admin (`everyone` or `jawz`) we use the + // configuration for the user named "jawz". Otherwise we + // operate on the specified user. + let target = if cli.user.eq_ignore_ascii_case("everyone") || cli.user.eq_ignore_ascii_case("jawz") { + "jawz" + } else { + cli.user.as_str() + }; + let idx = get_user_index(target, &cfg) + .ok_or_else(|| anyhow!("Unknown user '{}'", target))?; + let user = User::new(idx, &cfg)?; + // Append each provided link to the user's push list + for link in cli.input.iter() { + user.append_list("push", &parse_link(link))?; + } + // Process the push list immediately + return push_manager(&user, &cli, &cfg); + } + Err(anyhow!("No scrapper selected and no input links provided")) +} \ No newline at end of file diff --git a/src/download_rust/src/functions.rs b/src/download_rust/src/functions.rs new file mode 100644 index 0000000..ed1b6ec --- /dev/null +++ b/src/download_rust/src/functions.rs @@ -0,0 +1,127 @@ +//! Miscellaneous helper functions. +//! +//! 
This module contains a variety of small helpers used throughout +//! the downloader. Where appropriate iterators and guard clauses are +//! employed to keep the code concise and expressive. Error +//! conditions are reported via [`anyhow::Error`]. + +use anyhow::{anyhow, Result}; +use log::{debug, info}; +use regex::Regex; +use std::fs::{self, File}; +use std::io::{self, BufRead, Write}; +use std::path::{Path, PathBuf}; +use std::process::Command; + +/// Ensures that a Twitter/X link ends in `/media` if it does not +/// already. The check is simple: if the string ends with +/// `"/media"` then the original string is returned, otherwise +/// `"/media"` is appended. +pub fn validate_x_link(line: &str) -> String { + if line.trim_end().ends_with("/media") { + line.to_string() + } else { + format!("{}/media", line.trim_end_matches('/')) + } +} + +/// Normalises certain links. At present this function only ensures +/// that X/Twitter links end with `/media`. If the pattern does not +/// match the link is returned unchanged. +pub fn parse_link(link: &str) -> String { + let re = Regex::new(r"(?x) + (?:x\.com/\w+/?(?!.*status)) + ") + .expect("Failed to compile regex"); + if re.is_match(link) { + let fixed = validate_x_link(link); + debug!("Processed link {}", fixed); + fixed + } else { + debug!("No modifications needed for the link {}", link); + link.to_string() + } +} + +/// Surrounds a string with double quotes. This mirrors the Python +/// `quote` helper and is useful when constructing shell commands. +pub fn quote(s: &str) -> String { + format!("\"{}\"", s) +} + +/// Recursively deletes all files and directories inside `directory` and +/// finally removes the directory itself. Missing directories are +/// ignored. Any failure during deletion results in an error. +pub fn clean_cache(directory: &Path) -> Result<()> { + if !directory.is_dir() { + return Ok(()); + } + for entry in fs::read_dir(directory)? 
{ + let entry = entry?; + let path = entry.path(); + if path.is_file() { + fs::remove_file(&path)?; + } else if path.is_dir() { + fs::remove_dir_all(&path)?; + } + } + fs::remove_dir(directory)?; + Ok(()) +} + +/// Runs a shell command. When `dry_run` is true the command is +/// printed and execution is skipped. When `verbose` is true the +/// command is printed prior to execution. The command is executed +/// via the system shell so that complex pipelines are permitted. +pub fn run(command: &str, dry_run: bool, verbose: bool) -> Result<()> { + if dry_run { + println!("{}", command); + return Ok(()); + } + if verbose { + println!("{}", command); + } + // Execute through the system shell. Use `sh -c` so that the + // command string is interpreted as a complete shell command. + let status = Command::new("sh").arg("-c").arg(command).status()?; + if !status.success() { + return Err(anyhow!("Command failed with status {}: {}", status, command)); + } + Ok(()) +} + +/// Formats a numbered list entry. Useful when printing selections to +/// the user. +pub fn list_lines(index: usize, line: &str) -> String { + format!("{}) {}", index, line) +} + +/// Reads all non-empty lines from a file, trimming trailing +/// whitespace. Returns an iterator over the lines. When the file +/// does not exist an empty vector is returned. Errors during file +/// access are propagated. +pub fn read_lines(file: &Path) -> Result> { + if !file.is_file() { + return Ok(vec![]); + } + let file = File::open(file)?; + let buf = io::BufReader::new(file); + let lines: Vec = buf + .lines() + .filter_map(|l| l.ok()) + .map(|l| l.trim_end().to_string()) + .filter(|l| !l.is_empty()) + .collect(); + Ok(lines) +} + +/// Writes a string to a file, creating the file if necessary and +/// appending a newline. Errors are propagated. 
+pub fn append_line(file: &Path, line: &str) -> Result<()> { + let mut f = fs::OpenOptions::new() + .create(true) + .append(true) + .open(file)?; + writeln!(f, "{}", line)?; + Ok(()) +} \ No newline at end of file diff --git a/src/download_rust/src/gallery.rs b/src/download_rust/src/gallery.rs new file mode 100644 index 0000000..5685421 --- /dev/null +++ b/src/download_rust/src/gallery.rs @@ -0,0 +1,166 @@ +//! Gallery command generator. +//! +//! The [`Gallery`] struct encapsulates the state required to build a +//! `gallery-dl` command. It exposes a method to generate the +//! command string based on user configuration and whether the +//! download is for a comic. Guard clauses are used extensively to +//! keep the logic easy to follow. + +use crate::config::load_config_variables; +use crate::functions::quote; +use crate::user::User; +use anyhow::{anyhow, Context, Result}; +use serde_yaml::Value; + +/// Represents a gallery download request. Fields are mutable so +/// callers can configure the desired behaviour before generating the +/// command string. +#[derive(Default, Debug, Clone)] +pub struct Gallery { + /// Whether to append a download archive to prevent duplicates + pub archive: bool, + /// Optional skip argument string. A non‑empty string starting + /// with a space will be appended verbatim to the command. + pub skip_arg: String, + /// The direct link to download. Mutually exclusive with `list`. + pub link: Option, + /// The name of the list to process. Mutually exclusive with + /// `link`. + pub list: Option, + /// The destination folder key (resolved via the user or comic + /// configuration). Ignored when `is_comic` is true and + /// `dest` is empty. + pub dest: Option, + /// Additional options passed verbatim to `gallery-dl` (e.g. + /// Instagram filters). + pub opt_args: String, + /// The generated command string. This field is populated by + /// `generate_command` and consumed by `run_command`. 
+    pub command: String,
+}
+
+impl Gallery {
+    /// Builds a gallery-dl command based on the current fields and
+    /// stores it in `self.command`.
+    ///
+    /// When `is_comic` is true the destination and archive database
+    /// are read from the `comic` section of the configuration and
+    /// `user` is not consulted for them. Otherwise `user` supplies
+    /// the destination directory, archive database and input list.
+    ///
+    /// A target is only appended when exactly one of `link` / `list`
+    /// is set; if both or neither are set the command is generated
+    /// without a target.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the `comic` section is missing (comic downloads),
+    /// when `user` is `None` for a non-comic download, when `dest`
+    /// names an unknown directory, or when the user has no gallery
+    /// database.
+    pub fn generate_command(
+        &mut self,
+        user: Option<&User>,
+        cfg: &Value,
+        is_comic: bool,
+    ) -> Result<()> {
+        // Determine directory, database and queue based on context
+        let (directory, database, queue): (String, String, String) = if is_comic {
+            let comic = cfg
+                .get("comic")
+                .ok_or_else(|| anyhow!("Missing 'comic' section in configuration"))?;
+            let download_dir = comic
+                .get("download-dir")
+                .and_then(|v| v.as_str())
+                .unwrap_or_default();
+            let database = comic
+                .get("database")
+                .and_then(|v| v.as_str())
+                .unwrap_or_default();
+            // When a list name is provided for comics look up
+            // `${list}-list` in the comic config
+            let queue = self
+                .list
+                .as_deref()
+                .and_then(|list_name| {
+                    let key = format!("{}-list", list_name);
+                    comic.get(key.as_str())
+                })
+                .and_then(|v| v.as_str())
+                .map(|s| quote(s))
+                .unwrap_or_default();
+            (quote(download_dir), quote(database), queue)
+        } else {
+            // Non-comic downloads must have an associated user
+            let user = user.ok_or_else(|| anyhow!("User is required for non comic downloads"))?;
+            // Resolve the destination only when one was requested;
+            // without `dest` no `--dest` flag is emitted below, so
+            // there is nothing to look up.
+            let directory = match self.dest.as_deref() {
+                Some(dest_key) => {
+                    let dir_path = user.directories.get(dest_key).ok_or_else(|| {
+                        anyhow!("Unknown destination '{}' for user {}", dest_key, user.name)
+                    })?;
+                    quote(dir_path.to_string_lossy().as_ref())
+                }
+                None => String::new(),
+            };
+            let db_path = user
+                .dbs
+                .get("gallery")
+                .ok_or_else(|| anyhow!("Missing gallery database for user {}", user.name))?;
+            let queue = self
+                .list
+                .as_deref()
+                .and_then(|list_name| user.lists.get(list_name))
+                .map(|p| quote(p.to_string_lossy().as_ref()))
+                .unwrap_or_default();
+            (
+                directory,
+                quote(db_path.to_string_lossy().as_ref()),
+                queue,
+            )
+        };
+
+        // Determine the sleep interval. When a user is provided use
+        // their configured value. Otherwise fall back to the first
+        // user's value from the configuration or zero when missing.
+        // `as_u64` maps a negative or non-integer value to zero
+        // rather than letting a cast wrap it to a huge number.
+        let user_sleep: u64 = match user {
+            Some(u) => u.sleep,
+            None => cfg
+                .get("users")
+                .and_then(|u| u.as_sequence())
+                .and_then(|seq| seq.first())
+                .and_then(|v| v.get("sleep"))
+                .and_then(|v| v.as_u64())
+                .unwrap_or(0),
+        };
+
+        let mut cmd = format!("gallery-dl --sleep {}", user_sleep);
+        // `skip_arg` and `opt_args` carry their own leading space
+        if !self.skip_arg.is_empty() {
+            cmd.push_str(&self.skip_arg);
+        }
+        if is_comic || self.dest.is_some() {
+            cmd.push_str(&format!(" --dest {}", directory));
+        }
+        if self.archive {
+            cmd.push_str(&format!(" --download-archive {}", database));
+        }
+        if !self.opt_args.is_empty() {
+            cmd.push_str(&self.opt_args);
+        }
+        // Append either a direct link or an input file list
+        match (self.link.as_deref(), self.list.as_ref()) {
+            (Some(link), None) if !link.is_empty() => {
+                cmd.push(' ');
+                cmd.push_str(&quote(link));
+            }
+            (None, Some(_)) if !queue.is_empty() => {
+                cmd.push_str(&format!(" -i {}", queue));
+            }
+            _ => {}
+        }
+        self.command = cmd;
+        Ok(())
+    }
+
+    /// Runs the previously generated command. If `dry_run` is true
+    /// the command is printed and not executed. When `verbose` is
+    /// true the command is printed prior to execution. The
+    /// underlying execution is delegated to [`crate::functions::run`].
+    pub fn run_command(&self, dry_run: bool, verbose: bool) -> Result<()> {
+        crate::functions::run(&self.command, dry_run, verbose)
+    }
+}
\ No newline at end of file
diff --git a/src/download_rust/src/main.rs b/src/download_rust/src/main.rs
new file mode 100644
index 0000000..2cb051a
--- /dev/null
+++ b/src/download_rust/src/main.rs
@@ -0,0 +1,33 @@
+//! Program entry point.
+//!
+//! This module wires together argument parsing, configuration
+//! loading, logging initialisation and the high level download
+//! orchestration. Errors are propagated via [`anyhow::Result`] and
+//! reported to stderr.
+
+mod args;
+mod config;
+mod download;
+mod functions;
+mod gallery;
+mod user;
+
+use anyhow::Result;
+use args::Cli;
+use clap::Parser;
+use env_logger;
+
+/// Parses the command line, loads the configuration and runs the
+/// download logic. A download failure is reported on stderr and the
+/// process exits with status 1.
+fn main() -> Result<()> {
+    // Initialise logging. `RUST_LOG` takes precedence; when it is
+    // not set the level defaults to "info". Plain
+    // `env_logger::init()` would default to errors only.
+    env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
+    // Parse command line arguments
+    let cli = Cli::parse();
+    // Load configuration from disk
+    let cfg = config::load_config_variables()?;
+    // Dispatch to the download logic. Exit non-zero on failure so
+    // that scripts and timers invoking the binary can detect it.
+    if let Err(err) = download::run(cli, cfg) {
+        eprintln!("error: {:#}", err);
+        std::process::exit(1);
+    }
+    Ok(())
+}
\ No newline at end of file
diff --git a/src/download_rust/src/user.rs b/src/download_rust/src/user.rs
new file mode 100644
index 0000000..4643314
--- /dev/null
+++ b/src/download_rust/src/user.rs
@@ -0,0 +1,246 @@
+//! User management.
+//!
+//! The `User` struct encapsulates per-user configuration and
+//! filesystem state. It derives its settings from the YAML
+//! configuration and provides methods for managing lists, caching
+//! directories and avoiding duplicate downloads. Iterators and guard
+//! clauses are used throughout to make intent clear.
+
+use crate::config::load_config_variables;
+use crate::functions::{append_line, clean_cache, parse_link, read_lines, validate_x_link};
+use anyhow::{anyhow, Context, Result};
+use log::error;
+use rand::seq::SliceRandom;
+use regex::Regex;
+use serde_yaml::{Mapping, Value};
+use std::collections::HashMap;
+use std::fs::{self, File};
+use std::path::{Path, PathBuf};
+
+/// Represents a user and all of the paths and lists associated with
+/// that user. The `User` is constructed from the global
+/// configuration and an index selecting one of the `users` entries.
+#[derive(Debug, Clone)]
+pub struct User {
+    /// The merged configuration for this user. User specific keys
+    /// override global settings.
+    pub config: Mapping,
+    /// The human readable name of the user.
+    pub name: String,
+    /// Number of seconds to sleep between operations.
+    pub sleep: u64,
+    /// Directories keyed by their logical purpose (e.g. "cache",
+    /// "lists", "downloads", "media", etc.).
+    pub directories: HashMap<String, PathBuf>,
+    /// Paths to the databases used for archiving downloads.
+    pub dbs: HashMap<String, PathBuf>,
+    /// Paths to various list files. See `list_manager` for details.
+    pub lists: HashMap<String, PathBuf>,
+}
+
+impl User {
+    /// Constructs a new user from the given index into the
+    /// configuration.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the configuration has no `users` sequence or no
+    /// `global` mapping, when `index` does not select a mapping, or
+    /// when the merged settings carry no string `name`.
+    pub fn new(index: usize, cfg: &Value) -> Result<Self> {
+        let users = cfg
+            .get("users")
+            .and_then(|u| u.as_sequence())
+            .ok_or_else(|| anyhow!("Configuration is missing a 'users' array"))?;
+        let user_cfg = users
+            .get(index)
+            .and_then(|v| v.as_mapping())
+            .ok_or_else(|| anyhow!("Invalid user index {}", index))?;
+        let global_cfg = cfg
+            .get("global")
+            .and_then(|v| v.as_mapping())
+            .ok_or_else(|| anyhow!("Configuration is missing a 'global' map"))?;
+
+        // Merge global into user specific settings. User values take
+        // precedence: start from one copy of the global map and
+        // overlay each user entry on top of it.
+        let mut merged: Mapping = global_cfg.clone();
+        for (k, v) in user_cfg.iter() {
+            merged.insert(k.clone(), v.clone());
+        }
+
+        let name = merged
+            .get("name")
+            .and_then(|v| v.as_str())
+            .ok_or_else(|| anyhow!("User configuration missing 'name'"))?
+            .to_string();
+        // A missing, negative or non-integer value means "no sleep";
+        // `as_u64` avoids wrapping a negative number into a huge one.
+        let sleep = merged
+            .get("sleep")
+            .and_then(|v| v.as_u64())
+            .unwrap_or(0);
+
+        // Build directory map from keys ending with '-dir'. Exactly
+        // one suffix is removed when storing the key and the value
+        // is stored as given in the configuration.
+        let mut directories: HashMap<String, PathBuf> = merged
+            .iter()
+            .filter_map(|(k, v)| {
+                let dir_name = k.as_str()?.strip_suffix("-dir")?;
+                let path_str = v.as_str()?;
+                Some((dir_name.to_string(), PathBuf::from(path_str)))
+            })
+            .collect();
+
+        // Append user name to cache and lists directories so each
+        // user gets a private sub-directory
+        if let Some(cache) = directories.get_mut("cache") {
+            *cache = cache.join(&name);
+        }
+        if let Some(lists_dir) = directories.get_mut("lists") {
+            *lists_dir = lists_dir.join(&name);
+        }
+
+        // Derive database file locations
+        let mut dbs = HashMap::new();
+        if let Some(db_dir) = directories.get("databases") {
+            dbs.insert(
+                "gallery".to_string(),
+                db_dir.join(format!("{}.sqlite3", name)),
+            );
+            dbs.insert(
+                "media".to_string(),
+                db_dir.join(format!("{}_ytdl.txt", name)),
+            );
+        }
+
+        // Derive list file locations
+        let mut lists = HashMap::new();
+        if let Some(lists_dir) = directories.get("lists") {
+            lists.insert("master".to_string(), lists_dir.join("watch.txt"));
+            lists.insert("push".to_string(), lists_dir.join("instant.txt"));
+        }
+        if let Some(cache_dir) = directories.get("cache") {
+            lists.insert("instagram".to_string(), cache_dir.join("instagram.txt"));
+            lists.insert("kemono".to_string(), cache_dir.join("kemono.txt"));
+            lists.insert("main".to_string(), cache_dir.join("main.txt"));
+        }
+
+        Ok(Self {
+            config: merged,
+            name,
+            sleep,
+            directories,
+            dbs,
+            lists,
+        })
+    }
+
+    /// Creates the necessary directory structure for this user. Any
+    /// pre-existing cache directory is cleared on a best-effort
+    /// basis (a failure is logged, not returned). Missing database
+    /// files and the master and push lists are touched into
+    /// existence.
+    ///
+    /// # Errors
+    ///
+    /// Fails when a directory or file cannot be created.
+    pub fn create_directories(&self) -> Result<()> {
+        // Clear the cache directory if it exists
+        if let Some(cache_dir) = self.directories.get("cache") {
+            if cache_dir.is_dir() {
+                if let Err(err) = clean_cache(cache_dir) {
+                    error!(
+                        "Could not clear cache directory {}: {}",
+                        cache_dir.display(),
+                        err
+                    );
+                }
+            }
+        }
+
+        // Create all directories; this also recreates the cache
+        // directory removed by the cleanup above
+        for dir in self.directories.values() {
+            fs::create_dir_all(dir)?;
+        }
+
+        // Touch database files if missing
+        for db_path in self.dbs.values() {
+            if !db_path.is_file() {
+                File::create(db_path)?;
+            }
+        }
+
+        // Touch master and push lists
+        for key in ["master", "push"] {
+            if let Some(path) = self.lists.get(key) {
+                if !path.is_file() {
+                    File::create(path)?;
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Appends a line to the specified list. The list name must be
+    /// one of the keys in the `lists` map. A newline is appended
+    /// automatically.
+    ///
+    /// # Errors
+    ///
+    /// Fails when `name` is not a known list or the write fails.
+    pub fn append_list(&self, name: &str, line: &str) -> Result<()> {
+        let path = self
+            .lists
+            .get(name)
+            .ok_or_else(|| anyhow!("Unknown list {} for user {}", name, self.name))?;
+        append_line(path, line)
+    }
+
+    /// Writes a link into its appropriate cache list based on simple
+    /// pattern matching. See the original Python implementation for
+    /// category definitions. This method uses guard clauses to keep
+    /// the matching logic obvious.
+    fn append_cache_list(&self, line: &str) -> Result<()> {
+        let lower = line.to_lowercase();
+        // Match on the host, not on the bare letter "x": a substring
+        // test would also catch e.g. kemono "fanbox" URLs and send
+        // them to the wrong list.
+        if Self::is_x_link(&lower) {
+            return self.append_list("main", &validate_x_link(line));
+        }
+        if lower.contains("kemono.party") {
+            return self.append_list("kemono", line);
+        }
+        if lower.contains("instagram") {
+            return self.append_list("instagram", line);
+        }
+        // default case
+        self.append_list("main", line)
+    }
+
+    /// Returns true when the (already lower-cased) link points at
+    /// `x.com` or one of its sub-domains. An optional `http://` or
+    /// `https://` scheme is skipped before the host is compared.
+    fn is_x_link(lower: &str) -> bool {
+        let trimmed = lower.trim_start();
+        let without_scheme = trimmed
+            .strip_prefix("https://")
+            .or_else(|| trimmed.strip_prefix("http://"))
+            .unwrap_or(trimmed);
+        // The host ends at the first path, query, fragment or port
+        // separator.
+        let host = without_scheme
+            .split(|c| matches!(c, '/' | '?' | '#' | ':'))
+            .next()
+            .unwrap_or("");
+        host == "x.com" || host.ends_with(".x.com")
+    }
+
+    /// Reads the master list, shuffles it and creates per-site cache
+    /// lists. Empty or duplicate lines are silently ignored.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the directories cannot be prepared, the master
+    /// list is not configured or unreadable, or a cache list cannot
+    /// be written.
+    pub fn list_manager(&self) -> Result<()> {
+        self.create_directories()?;
+        let master_path = self
+            .lists
+            .get("master")
+            .ok_or_else(|| anyhow!("Master list missing for user {}", self.name))?;
+        let mut master_content = read_lines(master_path)?;
+        // Drop repeated entries, keeping the first occurrence
+        let mut seen = std::collections::HashSet::new();
+        master_content.retain(|line| seen.insert(line.clone()));
+        // Shuffle the list so downloads run in a different order on
+        // every invocation
+        master_content.shuffle(&mut rand::thread_rng());
+        for line in master_content.iter().filter(|l| !l.is_empty()) {
+            self.append_cache_list(line)?;
+        }
+        Ok(())
+    }
+
+    /// Adds a link to the master list if it is not already present.
+    /// Normalisation of the link is performed via `parse_link` before
+    /// the check. Duplicates are logged and ignored.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the master list is not configured, cannot be read
+    /// or cannot be appended to.
+    pub fn save_link(&self, link: &str) -> Result<()> {
+        let master_path = self
+            .lists
+            .get("master")
+            .ok_or_else(|| anyhow!("Master list missing for user {}", self.name))?;
+        let fixed = parse_link(link);
+        // Compare whole lines, case-insensitively. A substring test
+        // would treat ".../foo" as already saved whenever
+        // ".../foobar" is in the list.
+        let needle = fixed.trim().to_lowercase();
+        let already_saved = read_lines(master_path)?
+            .iter()
+            .any(|saved| saved.trim().to_lowercase() == needle);
+        if already_saved {
+            log::info!("Gallery repeated, not saving");
+            return Ok(());
+        }
+        log::info!("New gallery, saving");
+        self.append_list("master", &fixed)
+    }
+}
\ No newline at end of file