Source code for scripts.classification.main_prepare_learning_sets

# %%
# This script takes the curated dataset (raw_learning_set) and prepares a refined learning set according to the taxonomy file.
# Each class is aggregated into a single target class according to the taxonomy file, with the taxonomic cut (e.g. order, family, genus) specified in the config file.
# The output structure is a directory tree with training and validation sets, each containing a folder for each class.

# The input directory tree is specified and curated by the user.
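# For orientation, a sketch of the directory layout this script assumes and produces.
# The taxon and class names are purely illustrative (hypothetical), not taken from the
# actual dataset:
#
#   <input_dir>/                 # curated raw_learning_set, one folder per queried taxon
#       baetis_sp/               # hypothetical taxon folder with clip files
#       gammarus_sp/             # hypothetical taxon folder with clip files
#       mixed/                   # optional folder with unsorted clips (test set)
#
#   <output_dir>/
#       trn_set/<class>/         # one folder per class at the chosen taxonomic cut
#       val_set/<class>/         # roughly 10% of each class (at least 1 file)
#       mixed_set/               # created only if <input_dir>/mixed exists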


import argparse
import shutil
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import yaml

from mzbsuite.utils import cfg_to_arguments

# %%


def main(args, cfg):
    """
    Main function to prepare the learning sets for the classification task.

    Parameters
    ----------
    args: argparse.Namespace
        Arguments parsed from the command line. Specifically:
        - input_dir: path to the directory containing the raw clips
        - output_dir: path to the directory where the learning sets will be saved
        - config_file: path to the config file with train / inference parameters
        - taxonomy_file: path to the taxonomy file indicating classes per level
        - verbose: boolean indicating whether to print progress to the console
    cfg: dict
        Dictionary containing the configuration parameters.

    Returns
    -------
    None. Saves the files for the different splits in the specified folders.
    """

    np.random.seed(cfg.glob_random_seed)

    # root of raw clip data
    root_data = Path(args.input_dir)

    outdir = Path(args.output_dir)
    outdir.mkdir(parents=True, exist_ok=True)

    # target folders definition
    target_trn = outdir / "trn_set/"
    target_val = outdir / "val_set/"

    # Check whether the trn_set and val_set subfolders already exist. If so, interrupt
    # the script: this prevents overwriting and prompts the user to specify a
    # different output directory.
    if target_trn.exists() or target_val.exists():
        raise ValueError(
            # print in red and back to normal
            f"\033[91m Output directory {outdir} already exists. Please specify a different output directory.\033[0m"
        )

    # Build the recoding dictionary: key is the current classification, value is the
    # target reclassification. Forward fill to carry the last valid entry across
    # levels, then subset to the desired column.
    mzb_taxonomy = pd.read_csv(Path(args.taxonomy_file))
    if "Unnamed: 0" in mzb_taxonomy.columns:
        mzb_taxonomy = mzb_taxonomy.drop(columns=["Unnamed: 0"])

    mzb_taxonomy = mzb_taxonomy.ffill(axis=1)
    recode_order = dict(
        zip(mzb_taxonomy["query"], mzb_taxonomy[cfg.lset_class_cut].str.lower())
    )

    if args.verbose:
        print(f"Cutting phylogenetic tree at {cfg.lset_class_cut}")

    # Copy every file of the curated learning set into its target class folder
    for s_fo in recode_order:
        target_folder = target_trn / recode_order[s_fo]
        target_folder.mkdir(exist_ok=True, parents=True)
        for file in list((root_data / s_fo).glob("*")):
            shutil.copy(file, target_folder)

    # Move out the validation set: a small one, 10% of each class or at least 1 file.
    size = cfg.lset_val_size
    trn_folds = [a.name for a in sorted(list(target_trn.glob("*")))]
    for s_fo in trn_folds:
        target_folder = target_val / s_fo
        target_folder.mkdir(exist_ok=True, parents=True)

        list_class = list((target_trn / s_fo).glob("*"))
        n_val_sam = np.max((1, np.ceil(0.1 * len(list_class))))
        # sample without replacement so the same file is not drawn (and moved) twice
        val_files = np.random.choice(list_class, int(n_val_sam), replace=False)

        if args.verbose:
            print(len(val_files), n_val_sam, len(list_class))

        for file in val_files:
            try:
                shutil.move(str(file), target_folder)
            except Exception:
                print(f"Could not move {str(file)} into {target_folder}")

    # Make a separate folder for the mixed set (the actual test set)
    if (root_data / "mixed").is_dir():
        target_tst = outdir / "mixed_set/"
        target_tst.mkdir(exist_ok=True, parents=True)
        for file in list((root_data / "mixed").glob("*")):
            shutil.copy(file, target_tst)
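# %%
# A minimal sketch of the recoding step in main(), using a hypothetical two-row
# taxonomy; the taxa and the "order" cut below are illustrative only, not taken from
# the real taxonomy file:
#
#   toy_taxonomy = pd.DataFrame(
#       {
#           "query": ["baetis_sp", "gammarus_sp"],
#           "order": ["Ephemeroptera", "Amphipoda"],
#           "family": ["Baetidae", None],
#           "genus": ["Baetis", None],
#       }
#   )
#   toy_taxonomy = toy_taxonomy.ffill(axis=1)  # fill missing levels from the left
#   recode = dict(zip(toy_taxonomy["query"], toy_taxonomy["order"].str.lower()))
#   # recode == {"baetis_sp": "ephemeroptera", "gammarus_sp": "amphipoda"}
#
# With cfg.lset_class_cut = "order", each query folder would be copied into
# trn_set/ephemeroptera or trn_set/amphipoda, respectively.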
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_file", type=str, required=True)
    parser.add_argument("--input_dir", type=str, required=True)
    parser.add_argument("--taxonomy_file", type=str, required=False, default=None)
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    with open(str(args.config_file), "r") as f:
        cfg = yaml.load(f, Loader=yaml.FullLoader)
    cfg = cfg_to_arguments(cfg)

    sys.exit(main(args, cfg))
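# %%
# Example invocation (a sketch; the paths are hypothetical placeholders, and the
# script can equally be run as a module or directly as a file):
#
#   python -m scripts.classification.main_prepare_learning_sets \
#       --config_file configs/classification.yaml \
#       --input_dir data/raw_learning_set \
#       --taxonomy_file data/taxonomy.csv \
#       --output_dir data/learning_sets \
#       --verbose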