{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://catalog.lintel.tools/schemas/schemastore/everyvoice-tts-toolkit-aligner-configuration/versions/0.2.json",
  "title": "DFAlignerConfig",
  "x-lintel": {
    "source": "https://raw.githubusercontent.com/EveryVoiceTTS/everyvoice/main/everyvoice/.schema/everyvoice-aligner-0.2.json",
    "sourceSha256": "73ef78fac02aadd52b7e5de8e86d6af30b1d4960da3d8c6f7a0ae3538323d5b8",
    "fileMatch": [
      "everyvoice-aligner.yaml",
      "everyvoice-aligner.json"
    ],
    "parsers": [
      "json",
      "yaml"
    ]
  },
  "type": "object",
  "properties": {
    "contact": {
      "allOf": [
        {
          "$ref": "#/$defs/ContactInformation"
        }
      ],
      "description": "EveryVoice requires a contact name and email to help prevent misuse. Please read our Guide <https://docs.everyvoice.ca/latest/> to understand more about the importance of misuse prevention with TTS."
    },
    "VERSION": {
      "default": "1.0",
      "title": "Version",
      "type": "string"
    },
    "model": {
      "allOf": [
        {
          "$ref": "#/$defs/DFAlignerModelConfig"
        }
      ],
      "description": "The model configuration settings."
    },
    "path_to_model_config_file": {
      "anyOf": [
        {
          "format": "file-path",
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "The path of a preprocessing configuration file.",
      "title": "Path To Model Config File"
    },
    "training": {
      "allOf": [
        {
          "$ref": "#/$defs/DFAlignerTrainingConfig"
        }
      ],
      "description": "The training configuration hyperparameters."
    },
    "path_to_training_config_file": {
      "anyOf": [
        {
          "format": "file-path",
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "The path of a preprocessing configuration file.",
      "title": "Path To Training Config File"
    },
    "preprocessing": {
      "allOf": [
        {
          "$ref": "#/$defs/PreprocessingConfig"
        }
      ],
      "description": "The preprocessing configuration, including information about audio settings."
    },
    "path_to_preprocessing_config_file": {
      "anyOf": [
        {
          "format": "file-path",
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "The path of a preprocessing configuration file.",
      "title": "Path To Preprocessing Config File"
    },
    "text": {
      "$ref": "#/$defs/TextConfig"
    },
    "path_to_text_config_file": {
      "anyOf": [
        {
          "format": "file-path",
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "title": "Path To Text Config File"
    }
  },
  "$defs": {
    "AdamOptimizer": {
      "$schema": "http://json-schema.org/draft-07/schema",
      "type": "object",
      "properties": {
        "learning_rate": {
          "default": 0.0001,
          "description": "The initial learning rate to use",
          "title": "Learning Rate",
          "type": "number"
        },
        "eps": {
          "default": 1e-8,
          "description": "Advanced. The value of optimizer constant Epsilon, used for numerical stability.",
          "title": "Eps",
          "type": "number"
        },
        "weight_decay": {
          "default": 0.01,
          "title": "Weight Decay",
          "type": "number"
        },
        "betas": {
          "default": [
            0.9,
            0.98
          ],
          "description": "Advanced. The values of the Adam Optimizer beta coefficients.",
          "maxItems": 2,
          "minItems": 2,
          "prefixItems": [
            {
              "type": "number"
            },
            {
              "type": "number"
            }
          ],
          "title": "Betas",
          "type": "array"
        },
        "name": {
          "default": "adam",
          "description": "The name of the optimizer to use.",
          "title": "Name",
          "type": "string"
        }
      },
      "title": "AdamOptimizer",
      "additionalProperties": false
    },
    "AdamWOptimizer": {
      "$schema": "http://json-schema.org/draft-07/schema",
      "type": "object",
      "properties": {
        "learning_rate": {
          "default": 0.0001,
          "description": "The initial learning rate to use",
          "title": "Learning Rate",
          "type": "number"
        },
        "eps": {
          "default": 1e-8,
          "description": "Advanced. The value of optimizer constant Epsilon, used for numerical stability.",
          "title": "Eps",
          "type": "number"
        },
        "weight_decay": {
          "default": 0.01,
          "title": "Weight Decay",
          "type": "number"
        },
        "betas": {
          "default": [
            0.9,
            0.98
          ],
          "description": "Advanced. The values of the AdamW Optimizer beta coefficients.",
          "maxItems": 2,
          "minItems": 2,
          "prefixItems": [
            {
              "type": "number"
            },
            {
              "type": "number"
            }
          ],
          "title": "Betas",
          "type": "array"
        },
        "name": {
          "default": "adamw",
          "description": "The name of the optimizer to use.",
          "title": "Name",
          "type": "string"
        }
      },
      "title": "AdamWOptimizer",
      "additionalProperties": false
    },
    "AudioConfig": {
      "$schema": "http://json-schema.org/draft-07/schema",
      "type": "object",
      "properties": {
        "min_audio_length": {
          "default": 0.4,
          "description": "The minimum length of an audio sample in seconds. Audio shorter than this will be ignored during preprocessing.",
          "title": "Min Audio Length",
          "type": "number"
        },
        "max_audio_length": {
          "default": 11.0,
          "description": "The maximum length of an audio sample in seconds. Audio longer than this will be ignored during preprocessing. Increasing the max_audio_length will result in larger memory usage. If you are running out of memory, consider lowering the max_audio_length.",
          "title": "Max Audio Length",
          "type": "number"
        },
        "max_wav_value": {
          "default": 32767.0,
          "description": "Advanced. The maximum value allowed to be in your wav files. For 16-bit audio, this should be (2**16)/2 - 1.",
          "title": "Max Wav Value",
          "type": "number"
        },
        "input_sampling_rate": {
          "default": 22050,
          "description": "The sampling rate describes the number of samples per second of audio. The 'input_sampling_rate' is with respect to your vocoder, or spec-to-wav model. This means that the spectrograms predicted by your text-to-spec model will also be calculated from audio at this sampling rate. If you change this value, your audio will automatically be re-sampled during preprocessing.",
          "title": "Input Sampling Rate",
          "type": "integer"
        },
        "output_sampling_rate": {
          "default": 22050,
          "description": "Advanced. The sampling rate describes the number of samples per second of audio. The 'output_sampling_rate' is with respect to your vocoder, or spec-to-wav model. This means that the wav files generated by your vocoder or spec-to-wav model will be at this sampling rate. If you change this value, you will also need to change the upsample rates in your vocoder. Your audio will automatically be re-sampled during preprocessing.",
          "title": "Output Sampling Rate",
          "type": "integer"
        },
        "alignment_sampling_rate": {
          "default": 22050,
          "description": "Advanced. The sampling rate describes the number of samples per second of audio. The 'alignment_sampling_rate' describes the sampling rate used when training an alignment model. If you change this value, your audio will automatically be re-sampled during preprocessing.",
          "title": "Alignment Sampling Rate",
          "type": "integer"
        },
        "target_bit_depth": {
          "default": 16,
          "description": "Advanced. This is the bit depth of each sample in your audio files.",
          "title": "Target Bit Depth",
          "type": "integer"
        },
        "n_fft": {
          "default": 1024,
          "description": "Advanced. This is the number of bins used by the Fast Fourier Transform (FFT).",
          "title": "FFT Size",
          "type": "integer"
        },
        "fft_window_size": {
          "default": 1024,
          "description": "Advanced. This is the window size used by the Fast Fourier Transform (FFT).",
          "title": "FFT Window Size",
          "type": "integer"
        },
        "fft_hop_size": {
          "default": 256,
          "description": "Advanced. This is the hop size for calculating the Short-Time Fourier Transform (STFT) which calculates a sequence of spectrograms from a single audio file. Another way of putting it is that the hop size is equal to the amount of non-intersecting samples from the audio in each spectrogram.",
          "title": "FFT Hop Size",
          "type": "integer"
        },
        "f_min": {
          "default": 0,
          "description": "Advanced. This is the minimum frequency for the lowest frequency bin when calculating the spectrogram.",
          "title": "Minimum Frequency",
          "type": "integer"
        },
        "f_max": {
          "default": 8000,
          "description": "Advanced. This is the maximum frequency for the highest frequency bin when calculating the spectrogram.",
          "title": "Maximum Frequency",
          "type": "integer"
        },
        "n_mels": {
          "default": 80,
          "description": "Advanced. This is the number of filters in the Mel-scale spaced filterbank.",
          "title": "Number of Mel bins",
          "type": "integer"
        },
        "spec_type": {
          "anyOf": [
            {
              "$ref": "#/$defs/AudioSpecTypeEnum"
            },
            {
              "type": "string"
            }
          ],
          "default": "mel-librosa",
          "description": "Advanced. Defines how to calculate the spectrogram. 'mel' uses the TorchAudio implementation for a Mel spectrogram. 'mel-librosa' uses Librosa's implementation. 'linear' calculates a non-Mel linear spectrogram and 'raw' calculates a complex-valued spectrogram. 'linear' and 'raw' are not currently supported by EveryVoice. We recommend using 'mel-librosa'.",
          "title": "Spec Type"
        },
        "vocoder_segment_size": {
          "default": 8192,
          "description": "Advanced. The vocoder, or spec-to-wav model is trained by sampling random fixed-size sections of the audio. This value specifies the number of samples in those sections.",
          "title": "Vocoder Segment Size",
          "type": "integer"
        }
      },
      "title": "AudioConfig",
      "additionalProperties": false
    },
    "AudioSpecTypeEnum": {
      "enum": [
        "mel",
        "mel-librosa",
        "linear",
        "raw"
      ],
      "title": "AudioSpecTypeEnum",
      "type": "string"
    },
    "ContactInformation": {
      "$schema": "http://json-schema.org/draft-07/schema",
      "type": "object",
      "properties": {
        "contact_name": {
          "description": "The name of the contact person or organization responsible for answering questions related to this model.",
          "title": "Contact Name",
          "type": "string"
        },
        "contact_email": {
          "description": "The email address of the contact person or organization responsible for answering questions related to this model.",
          "format": "email",
          "title": "Contact Email",
          "type": "string"
        }
      },
      "required": [
        "contact_name",
        "contact_email"
      ],
      "title": "ContactInformation",
      "additionalProperties": false
    },
    "DFAlignerExtractionMethod": {
      "enum": [
        "beam",
        "dijkstra"
      ],
      "title": "DFAlignerExtractionMethod",
      "type": "string"
    },
    "DFAlignerModelConfig": {
      "$schema": "http://json-schema.org/draft-07/schema",
      "type": "object",
      "properties": {
        "target_text_representation_level": {
          "allOf": [
            {
              "$ref": "#/$defs/TargetTrainingTextRepresentationLevel"
            }
          ],
          "default": "characters"
        },
        "lstm_dim": {
          "default": 512,
          "description": "The number of dimensions in the LSTM layers.",
          "title": "Lstm Dim",
          "type": "integer"
        },
        "conv_dim": {
          "default": 512,
          "description": "The number of dimensions in the convolutional layers.",
          "title": "Conv Dim",
          "type": "integer"
        }
      },
      "title": "DFAlignerModelConfig",
      "additionalProperties": false
    },
    "DFAlignerTrainingConfig": {
      "$schema": "http://json-schema.org/draft-07/schema",
      "type": "object",
      "properties": {
        "batch_size": {
          "default": 16,
          "description": "The number of samples to include in each batch when training. If you are running out of memory, consider lowering your batch_size.",
          "title": "Batch Size",
          "type": "integer"
        },
        "save_top_k_ckpts": {
          "default": 5,
          "description": "The number of checkpoints to save.",
          "title": "Save Top K Ckpts",
          "type": "integer"
        },
        "ckpt_steps": {
          "anyOf": [
            {
              "minimum": 0,
              "type": "integer"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "The interval (in steps) for saving a checkpoint. By default checkpoints are saved every epoch using the 'ckpt_epochs' hyperparameter",
          "title": "Ckpt Steps"
        },
        "ckpt_epochs": {
          "anyOf": [
            {
              "minimum": 0,
              "type": "integer"
            },
            {
              "type": "null"
            }
          ],
          "default": 1,
          "description": "The interval (in epochs) for saving a checkpoint. You can also save checkpoints after n steps by using 'ckpt_steps'",
          "title": "Ckpt Epochs"
        },
        "val_check_interval": {
          "anyOf": [
            {
              "type": "integer"
            },
            {
              "type": "number"
            },
            {
              "type": "null"
            }
          ],
          "default": 500,
          "description": "How often to check the validation set. Pass a float in the range [0.0, 1.0] to check after a fraction of the training epoch. Pass an int to check after a fixed number of training batches.",
          "title": "Val Check Interval"
        },
        "check_val_every_n_epoch": {
          "anyOf": [
            {
              "type": "integer"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Run validation after every n epochs. Defaults to 1, but if you have a small dataset you should change this to be larger to speed up training",
          "title": "Check Val Every N Epoch"
        },
        "max_epochs": {
          "default": 1000,
          "description": "Stop training after this many epochs",
          "title": "Max Epochs",
          "type": "integer"
        },
        "max_steps": {
          "default": 100000,
          "description": "Stop training after this many steps",
          "title": "Max Steps",
          "type": "integer"
        },
        "finetune_checkpoint": {
          "anyOf": [
            {
              "format": "path",
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Automatically resume training from a checkpoint loaded from this path.",
          "title": "Finetune Checkpoint"
        },
        "training_filelist": {
          "default": "path/to/your/preprocessed/training_filelist.psv",
          "description": "The path to a filelist containing samples belonging to your training set.",
          "format": "path",
          "title": "Training Filelist",
          "type": "string"
        },
        "validation_filelist": {
          "default": "path/to/your/preprocessed/validation_filelist.psv",
          "description": "The path to a filelist containing samples belonging to your validation set.",
          "format": "path",
          "title": "Validation Filelist",
          "type": "string"
        },
        "filelist_loader": {
          "description": "Advanced. The function to use to load the filelist.",
          "title": "Filelist Loader",
          "type": "string"
        },
        "logger": {
          "allOf": [
            {
              "$ref": "#/$defs/LoggerConfig"
            }
          ],
          "description": "The configuration for the logger."
        },
        "val_data_workers": {
          "default": 0,
          "description": "The number of CPU workers to use when loading data during validation.",
          "title": "Val Data Workers",
          "type": "integer"
        },
        "train_data_workers": {
          "default": 4,
          "description": "The number of CPU workers to use when loading data during training.",
          "title": "Train Data Workers",
          "type": "integer"
        },
        "optimizer": {
          "anyOf": [
            {
              "$ref": "#/$defs/AdamOptimizer"
            },
            {
              "$ref": "#/$defs/AdamWOptimizer"
            }
          ],
          "description": "Optimizer configuration settings.",
          "title": "Optimizer"
        },
        "binned_sampler": {
          "default": true,
          "description": "Use a binned length sampler",
          "title": "Binned Sampler",
          "type": "boolean"
        },
        "plot_steps": {
          "default": 1000,
          "description": "The maximum number of steps to plot",
          "title": "Plot Steps",
          "type": "integer"
        },
        "extraction_method": {
          "allOf": [
            {
              "$ref": "#/$defs/DFAlignerExtractionMethod"
            }
          ],
          "default": "dijkstra",
          "description": "The alignment extraction algorithm to use. 'beam' will be quicker but possibly less accurate than 'dijkstra'"
        }
      },
      "title": "DFAlignerTrainingConfig",
      "additionalProperties": false
    },
    "Dataset": {
      "$schema": "http://json-schema.org/draft-07/schema",
      "type": "object",
      "properties": {
        "label": {
          "default": "YourDataSet",
          "description": "A label for the source of data",
          "title": "Label",
          "type": "string"
        },
        "permissions_obtained": {
          "default": false,
          "description": "An attestation that permission has been obtained to use this data. You may not use EveryVoice to build a TTS system with data that you do not have permission to use and there are serious possible consequences for doing so. Finding data online does not constitute permission. The speaker should be aware and consent to their data being used in this way.",
          "title": "Permissions Obtained",
          "type": "boolean"
        },
        "data_dir": {
          "default": "/please/create/a/path/to/your/dataset/data",
          "description": "The path to the directory with your audio files.",
          "format": "path",
          "title": "Data Dir",
          "type": "string"
        },
        "filelist": {
          "default": "/please/create/a/path/to/your/dataset/filelist",
          "description": "The path to your dataset's filelist.",
          "format": "path",
          "title": "Filelist",
          "type": "string"
        },
        "filelist_loader": {
          "description": "Advanced. The file-loader function to use to load your dataset's filelist.",
          "title": "Filelist Loader",
          "type": "string"
        },
        "sox_effects": {
          "default": [
            [
              "channels",
              "1"
            ]
          ],
          "description": "Advanced. A list of SoX effects to apply to your audio prior to preprocessing. Run python -c 'import torchaudio; print(torchaudio.sox_effects.effect_names())' to see a list of supported effects.",
          "items": {},
          "title": "Sox Effects",
          "type": "array"
        }
      },
      "title": "Dataset",
      "additionalProperties": false
    },
    "LoggerConfig": {
      "$schema": "http://json-schema.org/draft-07/schema",
      "type": "object",
      "description": "The logger configures all the information needed for where to store your experiment's logs and checkpoints.\nThe structure of your logs will then be:\n<name> / <version> / <sub_dir>\n<sub_dir> will be generated by calling <sub_dir_callable> each time the LoggerConfig is constructed.",
      "properties": {
        "name": {
          "default": "BaseExperiment",
          "description": "The name of the experiment. The structure of your logs will be <name> / <version> / <sub_dir>.",
          "title": "Experiment Name",
          "type": "string"
        },
        "save_dir": {
          "default": "logs_and_checkpoints",
          "description": "The directory to save your checkpoints and logs to.",
          "format": "path",
          "title": "Save Dir",
          "type": "string"
        },
        "sub_dir_callable": {
          "description": "The function that generates a string to call your runs - by default this is a timestamp. The structure of your logs will be <name> / <version> / <sub_dir> where <sub_dir> is a timestamp.",
          "title": "Sub Dir Callable",
          "type": "string"
        },
        "version": {
          "default": "base",
          "description": "The version of your experiment. The structure of your logs will be <name> / <version> / <sub_dir>.",
          "title": "Version",
          "type": "string"
        }
      },
      "title": "LoggerConfig",
      "additionalProperties": false
    },
    "PreprocessingConfig": {
      "$schema": "http://json-schema.org/draft-07/schema",
      "type": "object",
      "properties": {
        "dataset": {
          "default": "YourDataSet",
          "description": "The name of the dataset.",
          "title": "Dataset",
          "type": "string"
        },
        "train_split": {
          "default": 0.9,
          "description": "The amount of the dataset to use for training. The rest will be used as validation. Hold some of the validation set out for a test set if you are performing experiments.",
          "maximum": 1.0,
          "minimum": 0.0,
          "title": "Train Split",
          "type": "number"
        },
        "dataset_split_seed": {
          "default": 1234,
          "description": "The seed to use when splitting the dataset into train and validation sets.",
          "title": "Dataset Split Seed",
          "type": "integer"
        },
        "save_dir": {
          "default": "preprocessed/YourDataSet",
          "description": "The directory to save preprocessed files to.",
          "format": "path",
          "title": "Save Dir",
          "type": "string"
        },
        "audio": {
          "allOf": [
            {
              "$ref": "#/$defs/AudioConfig"
            }
          ],
          "description": "Configuration settings for audio."
        },
        "path_to_audio_config_file": {
          "anyOf": [
            {
              "format": "file-path",
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "The path to an audio configuration file.",
          "title": "Path To Audio Config File"
        },
        "source_data": {
          "description": "A list of datasets.",
          "items": {
            "$ref": "#/$defs/Dataset"
          },
          "title": "Source Data",
          "type": "array"
        }
      },
      "title": "PreprocessingConfig",
      "additionalProperties": false
    },
    "Punctuation": {
      "properties": {
        "exclamations": {
          "default": [
            "!",
            "¡"
          ],
          "description": "Exclamation punctuation symbols used in your datasets. Replaces these symbols with <EXCL> internally.",
          "items": {
            "type": "string"
          },
          "title": "Exclamations",
          "type": "array"
        },
        "question_symbols": {
          "default": [
            "?",
            "¿"
          ],
          "description": "Question/interrogative punctuation symbols used in your datasets. Replaces these symbols with <QINT> internally.",
          "items": {
            "type": "string"
          },
          "title": "Question Symbols",
          "type": "array"
        },
        "quotemarks": {
          "default": [
            "\"",
            "'",
            "“",
            "”",
            "«",
            "»"
          ],
          "description": "Quotemark punctuation symbols used in your datasets. Replaces these symbols with <QUOTE> internally.",
          "items": {
            "type": "string"
          },
          "title": "Quotemarks",
          "type": "array"
        },
        "big_breaks": {
          "default": [
            ".",
            ":",
            ";"
          ],
          "description": "Punctuation symbols indicating a 'big break' used in your datasets. Replaces these symbols with <BB> internally.",
          "items": {
            "type": "string"
          },
          "title": "Big Breaks",
          "type": "array"
        },
        "small_breaks": {
          "default": [
            ",",
            "-",
            "—"
          ],
          "description": "Punctuation symbols indicating a 'small break' used in your datasets. Replaces these symbols with <SB> internally.",
          "items": {
            "type": "string"
          },
          "title": "Small Breaks",
          "type": "array"
        },
        "ellipsis": {
          "default": [
            "…"
          ],
          "description": "Punctuation symbols indicating an ellipsis used in your datasets. Replaces these symbols with <EPS> internally.",
          "items": {
            "type": "string"
          },
          "title": "Ellipsis",
          "type": "array"
        }
      },
      "title": "Punctuation",
      "type": "object"
    },
    "Symbols": {
      "type": "object",
      "properties": {
        "silence": {
          "default": [
            "<SIL>"
          ],
          "description": "The symbol(s) used to indicate silence.",
          "items": {
            "type": "string"
          },
          "title": "Silence",
          "type": "array"
        },
        "punctuation": {
          "allOf": [
            {
              "$ref": "#/$defs/Punctuation"
            }
          ],
          "description": "EveryVoice will combine punctuation and normalize it into a set of five permissible types of punctuation to help tractable training."
        }
      },
      "title": "Symbols",
      "additionalProperties": true
    },
    "TargetTrainingTextRepresentationLevel": {
      "enum": [
        "characters",
        "phones",
        "phonological_features"
      ],
      "title": "TargetTrainingTextRepresentationLevel",
      "type": "string"
    },
    "TextConfig": {
      "$schema": "http://json-schema.org/draft-07/schema",
      "type": "object",
      "properties": {
        "symbols": {
          "$ref": "#/$defs/Symbols"
        },
        "to_replace": {
          "type": "object",
          "default": {},
          "title": "To Replace",
          "additionalProperties": {
            "type": "string"
          }
        },
        "cleaners": {
          "items": {
            "type": "string"
          },
          "title": "Cleaners",
          "type": "array"
        },
        "g2p_engines": {
          "type": "object",
          "default": {},
          "description": "User defined or external G2P engines.\nSee <https://github.com/EveryVoiceTTS/everyvoice_g2p_template_plugin> to implement your own G2P.",
          "examples": [
            "{\"fr\": \"everyvoice_plugin_g2p4example.g2p\"}"
          ],
          "title": "External G2P",
          "additionalProperties": {
            "examples": [
              "everyvoice_plugin_g2p4example.g2p"
            ],
            "type": "string"
          }
        }
      },
      "title": "TextConfig",
      "additionalProperties": false
    }
  },
  "required": [
    "contact"
  ],
  "additionalProperties": false
}
