NixOS/workstation/scripts/download/download.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rewrite of the download manager script,
intended to make it more modular
through the use of flags,
in order to avoid unnecessary modifications
to the config files.
It also follows POSIX and Python conventions more closely.
"""

import re
import time
import logging
import yaml
from functions import run
from functions import quote
from functions import list_lines
from functions import load_config_variables
from argparser import argparser
from gdl_classes import User

# GLOBAL VARIABLE SECTION
# Names of the external downloader binaries, defined once up front
BIN_GALLERY = "gallery-dl"
BIN_YOUTUBE = "yt-dlp"
# SKIP = "3"
CONFIGS = load_config_variables()

# Root logger writing "[file][LEVEL] function 'message'" through a stream handler
LOGGER = logging.getLogger()
LOGGER.setLevel(logging.INFO)
FORMATTER = logging.Formatter(
    "[%(filename)s][%(levelname)s] %(funcName)s '%(message)s'"
)
HANDLER = logging.StreamHandler()
HANDLER.setFormatter(FORMATTER)
LOGGER.addHandler(HANDLER)

# "everyone" is a default pseudo-user (used for things like download gallery),
# followed by the name of every user found in the configuration
USERS = ["everyone", *(entry["name"] for entry in CONFIGS["users"])]

ARGS = argparser(USERS)


def get_index(value: str) -> int:
    """Return the position of the user named ``value`` in CONFIGS["users"].

    The first entry whose "name" equals ``value`` wins. When no entry
    matches, -1 is returned.
    """
    candidates = (
        (position, entry["name"])
        for position, entry in enumerate(CONFIGS["users"])
        if entry["name"] == value
    )
    found = next(candidates, None)
    if found is None:
        return -1
    position, name = found
    LOGGER.debug("%s is %s", name, position)
    return position


def parse_gallery(gdl_list: str, user: User):
    """Hand a list of gallery links to download_gallery for one user.

    gdl_list is passed on as the download queue, together with the sleep
    time, download directory and archive taken from user. The skip option
    is only added when ARGS.flag_skip is not set.
    """
    # skip_arg = f" -A {SKIP}" if ARGS.flag_skip else ""
    if ARGS.flag_skip:
        skip_arg = ""
    else:
        skip_arg = " -o skip=true"
    LOGGER.debug(skip_arg)

    # No single link is given, the whole list goes in as the queue
    download_gallery(
        ARGS.flag_archive,
        skip_arg=skip_arg,
        link="",
        sleep=str(user.sleep),
        destination=quote(f"{user.dir_download}"),
        database=quote(f"{user.archive_gallery}"),
        queue=quote(gdl_list),
        opt_args=parse_instagram(gdl_list),
    )


def parse_instagram(link: str) -> str:
    """Build the extra "include" option used for instagram links.

    Returns an empty string when ``link`` does not contain "instagram".
    Otherwise returns " -o include=<types>", where <types> is the quoted
    ARGS.post_type, comma-joined first when it is a list.
    """
    if "instagram" not in link:
        return ""
    post_types = ARGS.post_type
    if isinstance(post_types, list):
        post_types = ",".join(post_types)
    include_option = f" -o include={quote(post_types)}"
    LOGGER.debug(include_option)
    return include_option


def parse_link(link: str) -> str:
    """Point twitter profile links at their media tab.

    A link matching the twitter profile pattern (no "status" after the
    user name) is returned ending in exactly one "/media". Every other
    link is returned untouched.
    """
    is_twitter_profile = re.search(r"(twitter\.com\/\w+(\/)?(?!.*status))", link)
    if is_twitter_profile:
        # Drop a trailing "/" or an existing "/media" suffix before appending,
        # so the suffix never ends up duplicated
        trimmed = re.sub(r"\/$|\/media(\/?)$", "", link)
        fixed_link = f"{trimmed}/media"
        LOGGER.debug("Processed link %s", fixed_link)
        return fixed_link
    LOGGER.debug("No modifications needed for the link %s", link)
    return link


def download_gallery(
    use_archive: bool,
    skip_arg: str = "",
    link: str = "",
    sleep: str = "0",
    destination: str = "",
    database: str = "",
    queue: str = "",
    opt_args: str = "",
):
    """Assemble the gallery-dl command line and run it.

    The command always starts with the binary and "--sleep". skip_arg and
    opt_args are appended verbatim, "--dest" only when destination is set
    and "--download-archive" only when use_archive is true. Either link
    (a single URL) or queue (passed with "-i") is appended; when both or
    neither are given, no target is added at all.
    """
    pieces = [f"{BIN_GALLERY} --sleep {sleep}", skip_arg]
    if destination != "":
        pieces.append(f" --dest {destination}")
    if use_archive:
        pieces.append(f" --download-archive {database}")
    pieces.append(opt_args)

    link_only = link != "" and queue == ""
    queue_only = queue != "" and link == ""
    if link_only:
        LOGGER.info("link: %s", quote(link))
        pieces.append(f" {link}")
    elif queue_only:
        LOGGER.info("queue: %s", queue)
        pieces.append(f" -i {queue}")

    command = "".join(pieces)
    LOGGER.debug(command)
    run(command, ARGS.flag_verbose)


def download_youtube(
    use_archive: bool,
    link: str = "",
    destination: str = "",
    database: str = "",
):
    """Pick the yt-dlp options that fit the link and run the download.

    use_archive -- when true, music links are recorded in ``database``
    link        -- the URL to download
    destination -- directory that receives "<title>.<ext>"
    database    -- download archive file, only used for music links

    Youtube links are downloaded with the default format, youtube music
    links are extracted as audio, chaturbate streams are retried for
    20 minutes and any other link is downloaded as mp4.
    """
    output = quote(destination + "/%(title)s.%(ext)s")

    if re.search(r"(https:\/\/youtube|https:\/\/www.youtube|https:\/\/youtu.be)", link):
        command = f"{BIN_YOUTUBE} -o {output}"

    elif re.search(r"(https:\/\/music.youtube.*)", link):
        command = BIN_YOUTUBE
        if use_archive:
            command += f" --download-archive {database}"
        command += (
            " --no-playlist --newline -x"
            " --audio-format best --add-metadata --audio-quality 0"
            f" -o {output}"
        )

    elif re.search(r"chaturbate", link):
        command = (
            f"{BIN_YOUTUBE} --hls-use-mpegts --prefer-ffmpeg -o {output} {link}"
        )
        # Re-runs the program every 30 seconds in case the stream goes private or dc
        for attempt in range(1, 41):  # For a 20 minute total
            run(command, ARGS.flag_verbose)
            time.sleep(30)
            LOGGER.info("waited for %s minutes", attempt * 30 / 60)
        # The stream was already handled by the loop above; without this
        # return a bare "yt-dlp <link>" would be run once more with no
        # output template or stream options
        return

    else:  # Any other video link, just do it generic
        command = f"{BIN_YOUTUBE} -f mp4 -o {output}"

    LOGGER.info("%s %s", command, link)
    run(f"{command} {link}", ARGS.flag_verbose)


def comic_manager(skip_arg: str, category: str):
    """Download every entry of the comic list that belongs to ``category``.

    "manga" selects links matching "manga|webtoon" and "comic" selects
    links matching "readcomiconline". Any other category uses an empty
    pattern, which matches every line. skip_arg is forwarded untouched
    to download_gallery.
    """
    category_patterns = {
        "manga": "manga|webtoon",
        "comic": "readcomiconline",
    }
    re_cat = category_patterns.get(category, "")

    with open(CONFIGS["comic"]["list"], encoding="utf-8") as list_comic:
        for graphic_novel in list(map(str.rstrip, list_comic)):
            # Only links of the requested category are downloaded
            if re.search(re_cat, graphic_novel):
                download_gallery(
                    ARGS.flag_archive,
                    skip_arg=skip_arg,
                    link=quote(graphic_novel),
                    sleep="0",
                    destination=CONFIGS["comic"]["download-directory"],
                    database=CONFIGS["comic"]["archive"],
                    queue="",
                    opt_args="",
                )
            else:
                LOGGER.debug("%s does not match regex espression", graphic_novel)


def webcomic_manager():
    """Let the user pick a webcomic from the YAML list and download it.

    The names under "Webcomics" are printed with their index, the index
    typed by the user selects one entry, and webcomix is run inside the
    directory configured under "Global" for that entry's "type".
    """
    with open(CONFIGS["comic"]["webcomic-list"], encoding="utf-8") as open_list:
        webcomic_file = yaml.safe_load(open_list)
    webcomics = webcomic_file["Webcomics"]

    # Show every available webcomic so the user can choose one by number
    for index, entry in enumerate(webcomics):
        print(list_lines(index, entry["name"]))

    chosen = webcomics[int(input("Select your comic: "))]
    # The type of the webcomic decides the directory it is downloaded into
    rating = chosen["type"]
    webcomic_category = webcomic_file["Global"][f"{rating}_directory"]
    LOGGER.debug("The webcomic is %s", webcomic_category)
    command = f"""cd {quote(webcomic_category)} && webcomix custom \
        {quote(chosen["name"])} \
        --start-url \
        {quote(chosen["url"])} \
        --next-page-xpath={quote(chosen["next_code"])} \
        --image-xpath={quote(chosen["image_code"])} \
        -y --cbz"""
    LOGGER.debug(command)
    run(command, ARGS.flag_verbose)


# Patterns of the links that are treated as galleries and sent to gallery-dl
_RE_GALLERY_LINKS = re.compile(
    r"(twitter\.com\/\w+((?=.*media)|(?!.*status)))"
    r"|(men\.wikifeet)"
    r"|(furaffinity\.net\/user\/)"
    r"|((deviantart\.com\/\w+(?!.*\/art\/)))"
    r"|(furaffinity\.net\/gallery\/)"
    r"|(furaffinity\.net\/scraps\/)"
    r"|(furaffinity\.net\/favorites\/)"
    r"|(instagram.com(?!\/p\/)\/\w+)"
    r"|(e621\.net((?=\/post\/)|(?!\/posts\/)))"
    r"|(flickr\.com\/photos\/\w+\/(?!\d+))"
    r"|(tumblr\.com(?!\/post\/))"
    r"|(kemono\.party\/(fanbox|gumroad|patreon)(?!\/user\/\d+\/post))"
    r"|(blogspot\.com(?!\/))"
    r"|(rule34\.paheal\.net\/post\/(?!view))"
    r"|(rule34\.xxx\/index\.php\?page\=post&s=(?!view))"
    r"|(pixiv\.net\/(en\/)?((?=users)|(?!artwork)))"
    r"|(reddit\.com\/(user|u))"
    r"|(baraag\.net\/((@\w+)|(?!\/\d+)))"
    r"|(pinterest\.com\/(?!pin\/\d+))"
    r"|(redgifs\.com\/(users|u|(?!watch)))",
)


def _record_link(list_path, entry: str) -> bool:
    """Append ``entry`` to a list file unless one of its lines already holds it.

    Whole lines are compared, ignoring case, surrounding whitespace and a
    trailing slash, so a link that is merely a prefix of a stored one
    (".../foo" against ".../foobar") is not mistaken for a duplicate.
    Returns True when the entry was appended, False when it was known.
    """

    def normalise(text: str) -> str:
        return text.strip().lower().rstrip("/")

    # "a+" creates the file when it is missing and always writes at the end
    with open(list_path, "a+", encoding="utf-8") as list_file:
        list_file.seek(0)
        known = {normalise(line) for line in list_file}
        if normalise(entry) in known:
            return False
        list_file.write(entry + "\n")
    return True


def _push_gallery(user: User, link: str):
    """Download a gallery link and keep it on the user's master list."""
    skip_arg = "" if ARGS.flag_skip else " -o skip=true"
    gallery_link = parse_link(link)
    instagram_arg = parse_instagram(link)
    LOGGER.debug("Skip: %s, link: %s", skip_arg, instagram_arg)
    download_gallery(
        ARGS.flag_archive,
        skip_arg,
        quote(gallery_link),
        f"{user.sleep}",
        quote(f"{user.dir_download}"),
        quote(f"{user.archive_gallery}"),
        "",
        instagram_arg,
    )
    # Record the gallery link, so it remains on the watch list
    if _record_link(user.list_master, gallery_link):
        LOGGER.info("New gallery, saving")
    else:
        LOGGER.info("Gallery repeated, not saving")


def _push_comic(link: str):
    """Download a comic/manga link and keep it on the comic list."""
    # The skip flag limits how many chapters are fetched, per site
    if not ARGS.flag_skip:
        skip_arg = ""
    elif re.search(r"readcomiconline", link):
        skip_arg = " --chapter-range 1"
    elif re.search(r"mangahere|webtoons", link):
        skip_arg = " --chapter-range 1-5"
    else:
        skip_arg = ""
    LOGGER.debug(skip_arg)

    download_gallery(
        ARGS.flag_archive,
        skip_arg,
        quote(link),
        "0",
        CONFIGS["comic"]["download-directory"],
        CONFIGS["comic"]["archive"],
        "",
        "",
    )
    # Add comic/manga link to the list
    if _record_link(CONFIGS["comic"]["list"], link):
        LOGGER.info("New graphic novel, saving")
    else:
        LOGGER.info("Graphic novel repeated, not saving")


def push_manager(user: User):
    """Process every link queued in the user's push list.

    The push list is read and then emptied. Each link is routed by its
    URL: video sites are collected and sent to download_youtube at the
    end, gallery links and comic/manga links are downloaded and recorded
    on their respective lists, and anything else is downloaded straight
    into the user's push directory.
    """
    with open(user.list_push, encoding="utf-8") as list_push:
        links = [line.rstrip() for line in list_push]
    # Flush the push list once, after its contents are safely in memory
    if links:
        with open(user.list_push, "w", encoding="utf-8"):
            pass

    # Stores any links that should use yt-dlp, they are downloaded last
    link_video_cache = []
    for link in links:
        if not link:
            # A blank line would otherwise be sent on as an empty link
            continue
        LOGGER.debug("Processing %s", link)
        # VIDEOS
        if re.search(r"youtu.be|youtube|pornhub|xtube|xvideos|chaturbate", link):
            LOGGER.debug("Matched type yt-dlp")
            link_video_cache.append(link)
        # Gallery links, these are added to the master list after downloading
        elif _RE_GALLERY_LINKS.search(link):
            LOGGER.debug("Matched type gallery-dl")
            _push_gallery(user, link)
        # Comic/manga links
        elif re.search(r"readcomiconline|mangahere|mangadex|webtoons", link):
            _push_comic(link)
        # Download generic links, the -o flag overwrites config file and
        # downloads the files into the root destination
        else:
            LOGGER.info("Other type of download %s", link)
            download_gallery(
                False,
                " -o directory='[]'",
                quote(link),
                "0",
                quote(str(user.dir_push)),
                "",
                "",
                "",
            )

    # Send the video links to yt-dlp
    for link in link_video_cache:
        download_youtube(
            ARGS.flag_archive,
            quote(link),
            f"{user.dir_media_download}",
            quote(f"{user.archive_media}"),
        )


def scrapper_manager(user: User):
    """Run the scrapper selected in ARGS.scrapper for ``user``.

    "gallery", "instagram" and "kemono" send the matching list of the
    user to parse_gallery, "push" processes the push list, "comic" and
    "manga" download the comic list (limited to the first chapters when
    ARGS.flag_skip is set) and "webcomic" starts the interactive webcomic
    download. Nothing happens when no scrapper, or an unknown one, was
    selected.
    """
    scrapper = ARGS.scrapper
    if not scrapper:  # Check if a scrapper was selected
        return

    # Names are compared with ==, a substring test such as
    # `scrapper in "push"` would also accept "p" or "us"
    if scrapper == "gallery":
        parse_gallery(f"{user.list_main}", user)
    elif scrapper == "instagram":
        parse_gallery(f"{user.list_instagram}", user)
    elif scrapper == "kemono":
        parse_gallery(f"{user.list_kemono}", user)
    elif scrapper == "push":
        push_manager(user)
    elif scrapper == "comic":
        skip_arg = " --chapter-range 1" if ARGS.flag_skip else ""
        LOGGER.debug(skip_arg)
        comic_manager(skip_arg, "comic")
    elif scrapper == "manga":
        skip_arg = " --chapter-range 1-5" if ARGS.flag_skip else ""
        LOGGER.debug(skip_arg)
        comic_manager(skip_arg, "manga")
    elif scrapper == "webcomic":
        webcomic_manager()


def main():
    """Decide what to do based on the parsed arguments.

    With a scrapper selected:
      * user "everyone" plus a push/gallery/instagram/kemono scrapper runs
        the scrapper once for every configured user;
      * comic/manga/webcomic scrappers always run with the lists of "jawz";
      * anything else runs for the selected user only.
    With links given instead, they are appended to the push list of the
    selected user ("everyone" falls back to "jawz") and the push list is
    processed. In verbose mode the links are only logged, not written.
    """
    if ARGS.scrapper:
        # The user name is compared with ==, a substring test such as
        # `ARGS.user in "everyone"` would also accept a user called "one"
        if ARGS.user == "everyone" and re.search(
            r"push|gallery|instagram|kemono", ARGS.scrapper
        ):
            for current_user in CONFIGS["users"]:
                user = User(get_index(current_user["name"]))
                user.list_manager()
                LOGGER.info("Scrapping %s for %s", ARGS.scrapper, current_user["name"])
                scrapper_manager(user)
        elif re.search(r"comic|manga|webcomic", ARGS.scrapper):
            user = User(get_index("jawz"))
            user.list_manager()
            LOGGER.info("Scrapping %s", ARGS.scrapper)
            scrapper_manager(user)
        else:
            # Create the lists to scrap
            user = User(get_index(ARGS.user))
            user.list_manager()
            scrapper_manager(user)
    elif ARGS.link:
        LOGGER.debug(ARGS.link)
        # "everyone" has no lists of its own, its links go to "jawz";
        # exact comparison keeps users such as "jawz2" on their own lists
        owner = "jawz" if ARGS.user in ("everyone", "jawz") else ARGS.user
        # Create the lists to scrap
        user = User(get_index(owner))
        user.list_manager()
        for arg_link in ARGS.link[0]:
            LOGGER.debug(arg_link)
            if ARGS.flag_verbose:
                LOGGER.debug(
                    "%s >> %s", quote(parse_link(arg_link)), quote(user.list_push)
                )
            else:
                with open(user.list_push, "a", encoding="utf-8") as open_file:
                    open_file.write(parse_link(arg_link) + "\n")
        push_manager(user)


# Only start the downloader when the file is executed as a script
if __name__ == "__main__":
    main()