Update whisper xxl help text

2024-11-21 18:52:36 +01:00 · 2024-11-08 13:45:11 +01:00 · 2024-11-08 13:45:11 +01:00 · c46f838723
commit c46f838723
parent e7f38e60e0
1 changed files with 298 additions and 186 deletions
--- a/src/ui/Forms/AudioToText/WhisperAdvanced.resx
+++ b/src/ui/Forms/AudioToText/WhisperAdvanced.resx
@ -544,10 +544,10 @@ optional arguments:
  <data name="nikseTextBox1.Text" xml:space="preserve">
    <value>[--device DEVICE]
 [--verbose VERBOSE]
-[--task {transcribe,translate}]
+[--language_detection_threshold LANGUAGE_DETECTION_THRESHOLD]
-[--best_of BEST_OF]
+[--language_detection_segments LANGUAGE_DETECTION_SEGMENTS]
-[--beam_size BEAM_SIZE]
+[--temperature TEMPERATURE] [--best_of BEST_OF]
-[--patience PATIENCE]
+[--beam_size BEAM_SIZE] [--patience PATIENCE]
 [--length_penalty LENGTH_PENALTY]
 [--repetition_penalty REPETITION_PENALTY]
 [--no_repeat_ngram_size NO_REPEAT_NGRAM_SIZE]
@ -572,8 +572,7 @@ optional arguments:
 [--highlight_words HIGHLIGHT_WORDS]
 [--prepend_punctuations PREPEND_PUNCTUATIONS]
 [--append_punctuations APPEND_PUNCTUATIONS]
-[--threads THREADS]
+[--threads THREADS] [--version]
 [--version]
 [--vad_filter VAD_FILTER]
 [--vad_threshold VAD_THRESHOLD]
 [--vad_min_speech_duration_ms VAD_MIN_SPEECH_DURATION_MS]
@ -581,24 +580,17 @@ optional arguments:
 [--vad_min_silence_duration_ms VAD_MIN_SILENCE_DURATION_MS]
 [--vad_speech_pad_ms VAD_SPEECH_PAD_MS]
 [--vad_window_size_samples VAD_WINDOW_SIZE_SAMPLES]
-[--vad_dump]
+[--vad_method {silero_v3,silero_v4,pyannote_v3,pyannote_onnx_v3,auditok,webrtc}]
 [--vad_dump] [--nullify_non_speech]
 [--max_new_tokens MAX_NEW_TOKENS]
 [--chunk_length CHUNK_LENGTH]
 [--compute_type {default,auto,int8,int8_float16,int8_float32,int8_bfloat16,int16,float16,float32,bfloat16}]
-[--batch_recursive]
+[--batch_recursive] [--beep_off] [--skip]
-[--beep_off]
+[--checkcuda] [--print_progress] [--postfix]
-[--skip]
+[--check_files] [--PR163_off]
-[--checkcuda]
+[--hallucinations_list_off] [--alt_writer_off]
-[--print_progress]
+[--one_word {0,1,2}] [--sentence] [--standard]
-[--postfix]
+[--standard_asia] [--max_comma MAX_COMMA]
 [--check_files]
 [--PR163_off]
 [--hallucinations_list_off]
 [--one_word {0,1,2}]
 [--sentence]
 [--standard]
 [--standard_asia]
 [--max_comma MAX_COMMA]
 [--max_comma_cent {50,60,70,80,90,100}]
 [--max_gap MAX_GAP]
 [--max_line_width MAX_LINE_WIDTH]
@ -606,243 +598,363 @@ optional arguments:
 [--min_dist_to_end {0,4,5,6,7,8,9,10,11,12}]
 [--prompt_max {16,32,64,128,223}]
 [--reprompt {0,1,2}]
-[--prompt_reset_on_no_end {0,1,2}]
+[--prompt_reset_on_no_end {0,1,2}] [--ff_dump]
-[--ff_dump]
+[--ff_track {1,2,3,4,5,6}] [--ff_fc] [--ff_mp3]
-[--ff_track {1,2,3,4,5,6}]
+[--ff_sync] [--ff_rnndn_sh] [--ff_rnndn_xiph]
-[--ff_fc]
+[--ff_fftdn [0 - 97]] [--ff_tempo [0.5 - 2.0]]
-[--ff_mp3]
+[--ff_gate] [--ff_speechnorm] [--ff_loudnorm]
 [--ff_sync]
 [--ff_rnndn_sh]
 [--ff_rnndn_xiph]
 [--ff_fftdn
 [0 - 97]]
 [--ff_tempo
 [0.5 - 2.0]]
 [--ff_gate]
 [--ff_speechnorm]
 [--ff_loudnorm]
 [--ff_silence_suppress noise duration]
-[--ff_lowhighpass]
+[--ff_lowhighpass] [--ff_mdx_kim2]
-                          audio
+[--mdx_chunk MDX_CHUNK]
-[audio ...]
+[--mdx_device MDX_DEVICE] [--diarize]
 [--diarize_device DIARIZE_DEVICE]
 [--diarize_threads DIARIZE_THREADS]
 [--speaker SPEAKER]
 audio [audio ...]
 positional arguments:
-  audio                 audio file(s). You can enter a file wildcard, filelist (txt. m3u, m3u8, lst) or directory to
+  audio                 audio file(s). You can enter a file wildcard, filelist
-                        do batch processing. Note: non-media files in list or directory are filtered out by extension.
+                        (txt. m3u, m3u8, lst) or directory to do batch
                        processing. Note: non-media files in list or directory
                        are filtered out by extension.
 optional arguments:
  -h, --help            show this help message and exit
  --model MODEL, -m MODEL
                        name of the Whisper model to use (default: medium)
  --model_dir MODEL_DIR
-                        the path to save model files; uses C:\git\subtitleedit\src\ui\bin\Debug\Whisper\Purfview-
+                        the path to save model files; uses C:\git\subtitleedit
-                        Whisper-Faster\_models by default (default: None)
+                        \src\ui\bin\Debug\net48\Whisper\Purfview-Whisper-
                        Faster\_models by default (default: None)
  --device DEVICE, -d DEVICE
-                        Device to use. Default is 'cuda' if CUDA device is detected, else is 'cpu'. If CUDA GPU is a
+                        Device to use. Default is 'cuda' if CUDA device is
-                        second device then set 'cuda:1'. (default: cpu)
+                        detected, else is 'cpu'. If CUDA GPU is a second
                        device then set 'cuda:1'. (default: cpu)
  --output_dir OUTPUT_DIR, -o OUTPUT_DIR
-                        directory to save the outputs. By default the same folder where the executable file is or
+                        directory to save the outputs. By default the same
-                        where media file is if --batch_recursive=True. '.'- sets to the current folder. 'source' -
+                        folder where the executable file is or where media
-                        sets to where media file is. (default: default)
+                        file is if --batch_recursive=True. '.'- sets to the
                        current folder. 'source' - sets to where media file
                        is. (default: default)
  --output_format {lrc,txt,text,vtt,srt,tsv,json,all}, -f {lrc,txt,text,vtt,srt,tsv,json,all}
-                        format of the output file; if not specified srt will be produced (default: srt)
+                        format of the output file; if not specified srt will
                        be produced (default: srt)
  --verbose VERBOSE, -v VERBOSE
                        whether to print out debug messages (default: False)
  --task {transcribe,translate}
-                        whether to perform X-&gt;X speech recognition ('transcribe') or X-&gt;English translation
+                        whether to perform X-&gt;X speech recognition
-                        ('translate') (default: transcribe)
+                        ('transcribe') or X-&gt;English translation ('translate')
                        (default: transcribe)
  --language {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,yue,zh,Afrikaans,Albanian,Amharic,Arabic,Armenian,Assamese,Azerbaijani,Bashkir,Basque,Belarusian,Bengali,Bosnian,Breton,Bulgarian,Burmese,Cantonese,Castilian,Catalan,Chinese,Croatian,Czech,Danish,Dutch,English,Estonian,Faroese,Finnish,Flemish,French,Galician,Georgian,German,Greek,Gujarati,Haitian,Haitian Creole,Hausa,Hawaiian,Hebrew,Hindi,Hungarian,Icelandic,Indonesian,Italian,Japanese,Javanese,Kannada,Kazakh,Khmer,Korean,Lao,Latin,Latvian,Letzeburgesch,Lingala,Lithuanian,Luxembourgish,Macedonian,Malagasy,Malay,Malayalam,Maltese,Mandarin,Maori,Marathi,Moldavian,Moldovan,Mongolian,Myanmar,Nepali,Norwegian,Nynorsk,Occitan,Panjabi,Pashto,Persian,Polish,Portuguese,Punjabi,Pushto,Romanian,Russian,Sanskrit,Serbian,Shona,Sindhi,Sinhala,Sinhalese,Slovak,Slovenian,Somali,Spanish,Sundanese,Swahili,Swedish,Tagalog,Tajik,Tamil,Tatar,Telugu,Thai,Tibetan,Turkish,Turkmen,Ukrainian,Urdu,Uzbek,Valencian,Vietnamese,Welsh,Yiddish,Yoruba}, -l {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,yue,zh,Afrikaans,Albanian,Amharic,Arabic,Armenian,Assamese,Azerbaijani,Bashkir,Basque,Belarusian,Bengali,Bosnian,Breton,Bulgarian,Burmese,Cantonese,Castilian,Catalan,Chinese,Croatian,Czech,Danish,Dutch,English,Estonian,Faroese,Finnish,Flemish,French,Galician,Georgian,German,Greek,Gujarati,Haitian,Haitian Creole,Hausa,Hawaiian,Hebrew,Hindi,Hungarian,Icelandic,Indonesian,Italian,Japanese,Javanese,Kannada,Kazakh,Khmer,Korean,Lao,Latin,Latvian,Letzeburgesch,Lingala,Lithuanian,Luxembourgish,Macedonian,Malagasy,Malay,Malayalam,Maltese,Mandarin,Maori,Marathi,Moldavian,Moldovan,Mongolian,Myanmar,Nepali,Norwegian,Nynorsk,Occitan,Panjabi,Pashto,Persian,Polish,Portuguese,Punjabi,Pushto,Romanian,Russian,Sanskrit,Serbian,Shona,Sindhi,Sinhala,Sinhalese,Slovak,Slovenian,Somali,Spanish,Sundanese,Swahili,Swedish,Tagalog,Tajik,Tamil,Tatar,Telugu,Thai,Tibetan,Turkish,Turkmen,Ukrainian,Urdu,Uzbek,Valencian,Vietnamese,Welsh,Yiddish,Yoruba}
-                        language spoken in the audio, specify None to perform language detection (default: None)
+                        language spoken in the audio, specify None to perform
                        language detection (default: None)
  --language_detection_threshold LANGUAGE_DETECTION_THRESHOLD
-                        If the maximum probability of the language tokens is higher than this value, the language is
+                        If the maximum probability of the language tokens is
-                        detected. (default: None)
+                        higher than this value, the language is detected.
                        (default: None)
  --language_detection_segments LANGUAGE_DETECTION_SEGMENTS
-                        Number of segments/chunks to consider for the language detection. (default: 1)
+                        Number of segments/chunks to consider for the language
                        detection. (default: 1)
  --temperature TEMPERATURE
                        temperature to use for sampling (default: 0)
  --best_of BEST_OF, -bo BEST_OF
-                        number of candidates when sampling with non-zero temperature (default: 5)
+                        number of candidates when sampling with non-zero
                        temperature (default: 5)
  --beam_size BEAM_SIZE, -bs BEAM_SIZE
-                        number of beams in beam search, only applicable when temperature is zero (default: 5)
+                        number of beams in beam search, only applicable when
                        temperature is zero (default: 5)
  --patience PATIENCE, -p PATIENCE
-                        optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the
+                        optional patience value to use in beam decoding, as in
-                        default (1.0) is equivalent to conventional beam search (default: 2.0)
+                        https://arxiv.org/abs/2204.05424, the default (1.0) is
                        equivalent to conventional beam search (default: 2.0)
  --length_penalty LENGTH_PENALTY
-                        optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses
+                        optional token length penalty coefficient (alpha) as
-                        simple length normalization by default (default: 1.0)
+                        in https://arxiv.org/abs/1609.08144, uses simple
                        length normalization by default (default: 1.0)
  --repetition_penalty REPETITION_PENALTY
-                        Penalty applied to the score of previously generated tokens (set &gt; 1.0 to penalize). (default:
+                        Penalty applied to the score of previously generated
-                        1.0)
+                        tokens (set &gt; 1.0 to penalize). (default: 1.0)
  --no_repeat_ngram_size NO_REPEAT_NGRAM_SIZE
-                        Prevent repetitions of ngrams with this size (set 0 to disable). (default: 0)
+                        Prevent repetitions of ngrams with this size (set 0 to
                        disable). (default: 0)
  --suppress_blank SUPPRESS_BLANK
-                        Suppress blank outputs at the beginning of the sampling. (default: True)
+                        Suppress blank outputs at the beginning of the
                        sampling. (default: True)
  --suppress_tokens SUPPRESS_TOKENS
-                        comma-separated list of token ids to suppress during sampling; '-1' will suppress most special
+                        comma-separated list of token ids to suppress during
-                        characters except common punctuations (default: -1)
+                        sampling; '-1' will suppress most special characters
                        except common punctuations. `None` - will pass empty
                        list. (default: -1)
  --initial_prompt INITIAL_PROMPT, -prompt INITIAL_PROMPT
-                        optional text to provide context as a prompt for the first window. Use 'None' to disable it.
+                        optional text to provide context as a prompt for the
-                        Note: 'auto' and 'default' are experimental ~universal prompt presets, they work if --language
+                        first window. Use 'None' to disable it. Note: 'auto'
-                        is set. (default: auto)
+                        and 'default' are experimental ~universal prompt
-  --prefix PREFIX       Optional text to provide as a prefix for the first window (default: None)
+                        presets, they work if --language is set. (default:
                        auto)
  --prefix PREFIX       Optional text to provide as a prefix for the first
                        window (default: None)
  --condition_on_previous_text CONDITION_ON_PREVIOUS_TEXT, -condition CONDITION_ON_PREVIOUS_TEXT
-                        if True, provide the previous output of the model as a prompt for the next window; disabling
+                        if True, provide the previous output of the model as a
-                        may make the text inconsistent across windows, but the model becomes less prone to getting
+                        prompt for the next window; disabling may make the
-                        stuck in a failure loop. If disabled then you may want to disable --reprompt too. (default:
+                        text inconsistent across windows, but the model
-                        True)
+                        becomes less prone to getting stuck in a failure loop.
                        If disabled then you may want to disable --reprompt
                        too. (default: True)
  --prompt_reset_on_temperature PROMPT_RESET_ON_TEMPERATURE
-                        Resets prompt if temperature is above this value. Arg has effect only if
+                        Resets prompt if temperature is above this value. Arg
-                        condition_on_previous_text is True. (default: 0.5)
+                        has effect only if condition_on_previous_text is True.
                        (default: 0.5)
  --without_timestamps WITHOUT_TIMESTAMPS
                        Only sample text tokens. (default: False)
  --max_initial_timestamp MAX_INITIAL_TIMESTAMP
-                        The initial timestamp cannot be later than this. (default: 1.0)
+                        The initial timestamp cannot be later than this.
                        (default: 1.0)
  --temperature_increment_on_fallback TEMPERATURE_INCREMENT_ON_FALLBACK, -fallback TEMPERATURE_INCREMENT_ON_FALLBACK
-                        temperature to increase when falling back when the decoding fails to meet either of the
+                        temperature to increase when falling back when the
-                        thresholds below. To disable fallback set it to 'None'. (default: 0.2)
+                        decoding fails to meet either of the thresholds below.
                        To disable fallback set it to 'None'. (default: 0.2)
  --compression_ratio_threshold COMPRESSION_RATIO_THRESHOLD
-                        if the gzip compression ratio is higher than this value, treat the decoding as failed
+                        if the gzip compression ratio is higher than this
-                        (default: 2.4)
+                        value, treat the decoding as failed (default: 2.4)
  --logprob_threshold LOGPROB_THRESHOLD
-                        if the average log probability is lower than this value, treat the decoding as failed
+                        if the average log probability is lower than this
-                        (default: -1.0)
+                        value, treat the decoding as failed (default: -1.0)
  --no_speech_threshold NO_SPEECH_THRESHOLD
-                        if the probability of the &lt;|nospeech|&gt; token is higher than this value AND the decoding has
+                        if the probability of the &lt;|nospeech|&gt; token is higher
-                        failed due to 'logprob_threshold', consider the segment as silence (default: 0.6)
+                        than this value AND the decoding has failed due to
-  --v3_offsets_off      Disables custom offsets to the defaults of pseudo-vad thresholds when 'large-v3' models are in
+                        'logprob_threshold', consider the segment as silence
-                        use. Those offset were made to make 'large-v3' to hallucinate less by default. (default:
+                        (default: 0.6)
-                        False)
+  --v3_offsets_off      Disables custom offsets to the defaults of pseudo-vad
                        thresholds when 'large-v3' models are in use. Note:
                        Offsets made to combat 'large-v3' hallucinations.
                        (default: False)
  --hallucination_silence_threshold HALLUCINATION_SILENCE_THRESHOLD, -hst HALLUCINATION_SILENCE_THRESHOLD
-                        (Experimental) When word_timestamps is True, skip silent periods longer than this threshold
+                        (Experimental) When word_timestamps is True, skip
-                        (in seconds) when a possible hallucination is detected. Optimal value is somewhere between 2 -
+                        silent periods longer than this threshold (in seconds)
-                        8 seconds. Inactive if None. (default: None)
+                        when a possible hallucination is detected. Optimal
                        value is somewhere between 2 - 8 seconds. Inactive if
                        None. (default: None)
  --hallucination_silence_th_temp {0.0,0.2,0.5,0.8,1.0}, -hst_temp {0.0,0.2,0.5,0.8,1.0}
-                        (Experimental) Additional heuristic for '--hallucination_silence_threshold'. If temperature is
+                        (Experimental) Additional heuristic for '--
-                        higher that this threshold then consider segment as possible hallucination ignoring the hst
+                        hallucination_silence_threshold'. If temperature is
-                        score. Inactive if 1.0. (default: 1.0)
+                        higher that this threshold then consider segment as
-  --clip_timestamps CLIP_TIMESTAMPS
+                        possible hallucination ignoring the hst score.
-                        Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process. The
+                        Inactive if 1.0. (default: 1.0)
-                        last end timestamp defaults to the end of the file. VAD is auto-disabled. (default: 0)
+  --clip_timestamps CLIP_TIMESTAMPS, -clip CLIP_TIMESTAMPS
                        Comma-separated list start,end,start,end,...
                        timestamps (in seconds) of clips to process. The last
                        end timestamp defaults to the end of the file. VAD is
                        auto-disabled. (default: 0)
  --no_speech_strict_lvl {0,1,2}
-                        (experimental) Level of stricter actions when no_speech_prob &gt; 0.93. Use beam_size=5 if this
+                        (experimental) Level of stricter actions when
-                        is enabled. Options: 0 - Disabled (do nothing), 1 - Reset propmt (see
+                        no_speech_prob &gt; 0.93. Use beam_size=5 if this is
-                        condition_on_previous_text), 2 - Invalidate the cached encoder output (if no_speech_threshold
+                        enabled. Options: 0 - Disabled (do nothing), 1 - Reset
-                        is not None). Arg meant to combat cases where the model is getting stuck in a failure loop or
+                        propmt (see condition_on_previous_text), 2 -
-                        outputs nonsense (default: 0)
+                        Invalidate the cached encoder output (if
                        no_speech_threshold is not None). Arg meant to combat
                        cases where the model is getting stuck in a failure
                        loop or outputs nonsense (default: 0)
  --word_timestamps WORD_TIMESTAMPS, -wt WORD_TIMESTAMPS
-                        Extract word-level timestamps and refine the results based on them (default: True)
+                        Extract word-level timestamps and refine the results
                        based on them (default: True)
  --highlight_words HIGHLIGHT_WORDS, -hw HIGHLIGHT_WORDS
-                        underline each word as it is spoken AKA karaoke in srt and vtt output formats (default: False)
+                        underline each word as it is spoken AKA karaoke in srt
                        and vtt output formats (default: False)
  --prepend_punctuations PREPEND_PUNCTUATIONS
-                        if word_timestamps is True, merge these punctuation symbols with the next word (default:
+                        if word_timestamps is True, merge these punctuation
-                        "'“¿([{-)
+                        symbols with the next word (default: "'“¿([{-)
  --append_punctuations APPEND_PUNCTUATIONS
-                        if word_timestamps is True, merge these punctuation symbols with the previous word (default:
+                        if word_timestamps is True, merge these punctuation
                        symbols with the previous word (default:
                        "'.。,，!！?？:：”)]}、)
-  --threads THREADS     number of threads used for CPU inference; By default number of the real cores but no more that
+  --threads THREADS     number of threads used for CPU inference; By default
-                        4 (default: 0)
+                        number of the real cores but no more that 4 (default:
                        0)
  --version             Show Faster-Whisper's version number
  --vad_filter VAD_FILTER, -vad VAD_FILTER
-                        Enable the voice activity detection (VAD) to filter out parts of the audio without speech.
+                        Enable the voice activity detection (VAD) to filter
-                        (default: True)
+                        out parts of the audio without speech. (default: True)
  --vad_threshold VAD_THRESHOLD
-                        Probabilities above this value are considered as speech. (default: 0.45)
+                        Probabilities above this value are considered as
                        speech. (default: 0.45)
  --vad_min_speech_duration_ms VAD_MIN_SPEECH_DURATION_MS
-                        Final speech chunks shorter min_speech_duration_ms are thrown out. (default: 350)
+                        Final speech chunks shorter min_speech_duration_ms are
                        thrown out. (default: 350)
  --vad_max_speech_duration_s VAD_MAX_SPEECH_DURATION_S
-                        Maximum duration of speech chunks in seconds. Longer will be split at the timestamp of the
+                        Maximum duration of speech chunks in seconds. Longer
-                        last silence. (default: None)
+                        will be split at the timestamp of the last silence.
                        (default: None)
  --vad_min_silence_duration_ms VAD_MIN_SILENCE_DURATION_MS
-                        In the end of each speech chunk time to wait before separating it. (default: 3000)
+                        In the end of each speech chunk time to wait before
                        separating it. (default: 3000)
  --vad_speech_pad_ms VAD_SPEECH_PAD_MS
-                        Final speech chunks are padded by speech_pad_ms each side. (default: 900)
+                        Final speech chunks are padded by speech_pad_ms each
                        side. (default: 900)
  --vad_window_size_samples VAD_WINDOW_SIZE_SAMPLES
-                        Size of audio chunks fed to the silero VAD model. Values other than 512, 1024, 1536 may affect
+                        Size of audio chunks fed to the silero VAD model.
-                        model perfomance!!! (default: 1536)
+                        Values other than 512, 1024, 1536 may affect model
-  --vad_dump            Dumps VAD timings to a subtitle file for inspection. (default: False)
+                        perfomance!!! (default: 1536)
  --vad_method {silero_v3,silero_v4,pyannote_v3,pyannote_onnx_v3,auditok,webrtc}, --vad_alt_method {silero_v3,silero_v4,pyannote_v3,pyannote_onnx_v3,auditok,webrtc}
                        Read there - 'https://github.com/Purfview/whisper-
                        standalone-win/discussions/231'. (default: None)
  --vad_dump            Dumps VAD timings to a subtitle file for inspection.
                        Additionaly dumps various intermediate audio files for
                        debug. (default: False)
  --nullify_non_speech, -nullify
                        For dev experiments. (default: False)
  --max_new_tokens MAX_NEW_TOKENS
-                        Maximum number of new tokens to generate per-chunk. (default: None)
+                        Maximum number of new tokens to generate per-chunk.
                        (default: None)
  --chunk_length CHUNK_LENGTH
-                        The length of audio segments. If it is not None, it will overwrite the default chunk_length of
+                        The length of audio segments. If it is not None, it
-                        the FeatureExtractor. (default: None)
+                        will overwrite the default chunk_length of the
                        FeatureExtractor. (default: None)
  --compute_type {default,auto,int8,int8_float16,int8_float32,int8_bfloat16,int16,float16,float32,bfloat16}, -ct {default,auto,int8,int8_float16,int8_float32,int8_bfloat16,int16,float16,float32,bfloat16}
-                        Type of quantization to use (see https://opennmt.net/CTranslate2/quantization.html). (default:
+                        Type of quantization to use (see
-                        auto)
+                        https://opennmt.net/CTranslate2/quantization.html).
                        (default: auto)
  --batch_recursive, -br
-                        Enables recursive batch processing. Note: If set then it changes defaults of --output_dir.
+                        Enables recursive batch processing. Note: If set then
                        it changes defaults of --output_dir. (default: False)
  --beep_off            Disables the beep sound when operation is finished.
                        (default: False)
-  --beep_off            Disables the beep sound when operation is finished. (default: False)
+  --skip                Skips media file if subtitle exists. Works if input is
  --skip                Skips media file if subtitle exists. Works if input is wildcard or directory. (default: False)
  --checkcuda, -cc      Returns CUDA device count. (for Subtitle Edit's internal use)
  --print_progress, -pp
                        Prints progress bar instead of transcription. (default: False)
  --postfix             Adds language as a postfix to subtitle's filename. (default: False)
  --check_files         Checks input files for errors before passing all them for transcription. Works if input is
                        wildcard or directory. (default: False)
-  --PR163_off           (For dev experiments) Disables PR163. . (default: False)
+  --checkcuda, -cc      Returns CUDA device count. (for Subtitle Edit's
                        internal use)
  --print_progress, -pp
                        Prints progress bar instead of transcription.
                        (default: False)
  --postfix             Adds language as a postfix to subtitle's filename.
                        (default: False)
  --check_files         Checks input files for errors before passing all them
                        for transcription. Works if input is wildcard or
                        directory. (default: False)
  --PR163_off           (For dev experiments) Disables PR163. . (default:
                        False)
  --hallucinations_list_off
-                        (For dev experiments) Disables hallucinations_list, allows hallucinations added to prompt.
+                        (For dev experiments) Disables hallucinations_list,
-                        (default: False)
+                        allows hallucinations added to prompt. (default:
-  --one_word {0,1,2}    0) Disabled. 1) Outputs srt and vtt subtitles with one word per line. 2) As '1', plus removes
+                        False)
-                        whitespace and ensures &gt;= 50ms for sub lines. Note: VAD may slightly reduce the accuracy of
+  --alt_writer_off      (For dev experiments) Forcefully disables alternative
-                        timestamps on some lines. (default: 0)
+                        subs writer func. (default: False)
-  --sentence            Enables splitting lines to sentences for srt and vtt subs. Every sentence starts in the new
+  --one_word {0,1,2}    0) Disabled. 1) Outputs srt and vtt subtitles with one
-                        segment. By default meant to output whole sentence per line for better translations, but not
+                        word per line. 2) As '1', plus removes whitespace and
-                        limited to, read about '--max_...' parameters. Note: has no effect on 'highlight_words'.
+                        ensures &gt;= 50ms for sub lines. Note: VAD may slightly
-                        (default: False)
+                        reduce the accuracy of timestamps on some lines.
-  --standard            Quick hardcoded preset to split lines in standard way. 42 chars per 2 lines with
+                        (default: 0)
-                        max_comma_cent=70 and --sentence are activated automatically. (default: False)
+  --sentence            Enables splitting lines to sentences for srt and vtt
-  --standard_asia       Quick hardcoded preset to split lines in standard way for some Asian languages. 16 chars per 2
+                        subs. Every sentence starts in the new segment. By
-                        lines with max_comma_cent=80 and --sentence are activated automatically. (default: False)
+                        default meant to output whole sentence per line for
                        better translations, but not limited to, read about '
                        --max_...' parameters. Note: has no effect on
                        'highlight_words'. (default: False)
  --standard            Quick hardcoded preset to split lines in standard way.
                        42 chars per 2 lines with max_comma_cent=70 and
                        --sentence are activated automatically. (default:
                        False)
  --standard_asia       Quick hardcoded preset to split lines in standard way
                        for some Asian languages. 16 chars per 2 lines with
                        max_comma_cent=80 and --sentence are activated
                        automatically. (default: False)
  --max_comma MAX_COMMA
-                        (requires --sentence) After this line length a comma is treated as the end of sentence. Note:
+                        (requires --sentence) After this line length a comma
-                        disabled if it's over or equal to --max_line_width. (default: 250)
+                        is treated as the end of sentence. Note: disabled if
                        it's over or equal to --max_line_width. (default: 250)
  --max_comma_cent {50,60,70,80,90,100}
-                        (requires --sentence) Percentage of --max_line_width when it starts breaking the line after
+                        (requires --sentence) Percentage of --max_line_width
-                        comma. Note: 100 = disabled. (default: 100)
+                        when it starts breaking the line after comma. Note:
-  --max_gap MAX_GAP     (requires --sentence) Threshold for a gap length in seconds, longer gaps are treated as dots.
+                        100 = disabled. (default: 100)
-                        (default: 3.0)
+  --max_gap MAX_GAP     (requires --sentence) Threshold for a gap length in
                        seconds, longer gaps are treated as dots. (default:
                        3.0)
  --max_line_width MAX_LINE_WIDTH
-                        The maximum number of characters in a line before breaking the line. (default: 1000)
+                        The maximum number of characters in a line before
                        breaking the line. Note: It works with `--sentence`
                        too. (default: 1000)
  --max_line_count MAX_LINE_COUNT
-                        The maximum number of lines in one sub segment. (default: 1)
+                        The maximum number of lines in one sub segment. Note:
                        It works with `--sentence` too. (default: 1)
  --min_dist_to_end {0,4,5,6,7,8,9,10,11,12}
-                        (requires --sentence) If from words like 'the', 'Mr.' and ect. to the end of line distance is
+                        (requires --sentence) If from words like 'the', 'Mr.'
-                        less than set then it starts in a new line. Note: 0 = disabled. (default: 0)
+                        and ect. to the end of line distance is less than set
                        then it starts in a new line. Note: 0 = disabled.
                        (default: 0)
  --prompt_max {16,32,64,128,223}
-                        (experimental) The maximum size of prompt. (default: 223)
+                        (experimental) The maximum size of prompt. (default:
-  --reprompt {0,1,2}    (experimental) 0) Disabled. 1) Inserts initial_prompt after the prompt resets. 2) Ensures that
+                        223)
-                        initial_prompt is present in prompt for all windows/chunks. Note: auto-disabled if
+  --reprompt {0,1,2}    (experimental) 0) Disabled. 1) Inserts initial_prompt
-                        initial_prompt=None. It's similar to 'hotwords' feature. (default: 2)
+                        after the prompt resets. 2) Ensures that
                        initial_prompt is present in prompt for all
                        windows/chunks. Note: auto-disabled if
                        initial_prompt=None. It's similar to 'hotwords'
                        feature. (default: 2)
  --prompt_reset_on_no_end {0,1,2}
-                        (experimental) Resets prompt if there is no end of sentence in window/chunk. 0 - disabled, 1 -
+                        (experimental) Resets prompt if there is no end of
-                        looks for period, 2 - looks for period or comma. Note: it's auto-disabled if reprompt=0.
+                        sentence in window/chunk. 0 - disabled, 1 - looks for
-                        (default: 2)
+                        period, 2 - looks for period or comma. Note: it's
-  --ff_dump             Dumps pre-processed audio by the filters to the 16000Hz file and prevents deletion of some
+                        auto-disabled if reprompt=0. (default: 2)
  --ff_dump             Dumps pre-processed audio by the filters to the
                        16000Hz file and prevents deletion of some
                        intermediate audio files. (default: False)
  --ff_track {1,2,3,4,5,6}
-                        Audio track selector. 1 - selects the first audio track. (default: 1)
+                        Audio track selector. 1 - selects the first audio
-  --ff_fc               Selects only front-center channel (FC) to process. (default: False)
+                        track. (default: 1)
-  --ff_mp3              Audio filter: Conversion to MP3 and back. (default: False)
+  --ff_fc               Selects only front-center channel (FC) to process.
  --ff_sync             Audio filter: Stretch/squeeze samples to the given timestamps, with a maximum of 3600 samples
                        per second compensation. Input file must be container that support storing PTS like mp4,
                        mkv... (default: False)
  --ff_rnndn_sh         Audio filter: Suppress non-speech with GregorR's SH model using Recurrent Neural Networks.
                        Notes: It's more aggressive than Xiph, discards singing. (default: False)
  --ff_rnndn_xiph       Audio filter: Suppress non-speech with Xiph's original model using Recurrent Neural Networks.
                        (default: False)
-  --ff_fftdn [0 - 97]   Audio filter: General denoise with Fast Fourier Transform. Notes: 12 - normal strength, 0 -
+  --ff_mp3              Audio filter: Conversion to MP3 and back. (default:
-                        disabled. (default: 0)
+                        False)
  --ff_sync             Audio filter: Stretch/squeeze samples to the given
                        timestamps, with a maximum of 3600 samples per second
                        compensation. Input file must be container that
                        support storing PTS like mp4, mkv... (default: False)
  --ff_rnndn_sh         Audio filter: Suppress non-speech with GregorR's SH
                        model using Recurrent Neural Networks. Notes: It's
                        more aggressive than Xiph, discards singing. (default:
                        False)
  --ff_rnndn_xiph       Audio filter: Suppress non-speech with Xiph's original
                        model using Recurrent Neural Networks. (default:
                        False)
  --ff_fftdn [0 - 97]   Audio filter: General denoise with Fast Fourier
                        Transform. Notes: 12 - normal strength, 0 - disabled.
                        (default: 0)
  --ff_tempo [0.5 - 2.0]
-                        Audio filter: Adjust audio tempo. Values below 1.0 slows down audio, above - speeds up. 1.0 =
+                        Audio filter: Adjust audio tempo. Values below 1.0
-                        disabled. (default: 1.0)
+                        slows down audio, above - speeds up. 1.0 = disabled.
-  --ff_gate             Audio filter: Reduce lower parts of a signal. (default: False)
+                        (default: 1.0)
-  --ff_speechnorm       Audio filter: Extreme and fast speech amplification. (default: False)
+  --ff_gate             Audio filter: Reduce lower parts of a signal.
-  --ff_loudnorm         Audio filter: EBU R128 loudness normalization. (default: False)
+                        (default: False)
  --ff_speechnorm       Audio filter: Extreme and fast speech amplification.
                        (default: False)
  --ff_loudnorm         Audio filter: EBU R128 loudness normalization.
                        (default: False)
  --ff_silence_suppress noise duration
-                        Audio filter: Suppress quiet parts of audio. Takes two values. First value - noise tolerance
+                        Audio filter: Suppress quiet parts of audio. Takes two
-                        in decibels [-70 - 0] (0=disabled), second value - minimum silence duration in seconds [0.1 -
+                        values. First value - noise tolerance in decibels [-70
-                        10]. (default: [0, 3.0])
+                        - 0] (0=disabled), second value - minimum silence
-  --ff_lowhighpass      Audio filter: Pass 50Hz - 7800 band. sinc + afir. (default: False)</value>
+                        duration in seconds [0.1 - 10]. (default: [0, 3.0])
  --ff_lowhighpass      Audio filter: Pass 50Hz - 7800 band. sinc + afir.
                        (default: False)
  --ff_mdx_kim2         Audio filter: High quality vocals extraction. MDX-Net
                        'Kim_Vocal_2' model made by KimberleyJensen. Notes:
                        It's better than 'HT Demucs v4 FT', first in the
                        filters chain, recommended to use on movies and
                        series. (default: False)
  --mdx_chunk MDX_CHUNK
                        Chunk size in seconds for MDX-Net filter. Notes:
                        Smaller is using less memory, but can be slower and
                        produce a bit lower quality. (default: 15)
  --mdx_device MDX_DEVICE
                        Device to use for MDX-Net filter. Default is 'cuda' if
                        CUDA device is detected, else is 'cpu'. If CUDA GPU is
                        a second device then set 'cuda:1'. (default: cpu)
  --diarize             Enable diarization. (default: False)
  --diarize_device DIARIZE_DEVICE
                        Device to use for '--diarize'. Default is 'cuda' if
                        CUDA device is detected, else is 'cpu'. If CUDA GPU is
                        a second device then set 'cuda:1'. (default: cpu)
  --diarize_threads DIARIZE_THREADS
  --speaker SPEAKER     To replace 'SPEAKER' string with your own word.
                        (default: SPEAKER)</value>
  </data>
 </root>