mirror of
https://github.com/SubtitleEdit/subtitleedit.git
synced 2024-10-27 14:32:35 +01:00
Update Whisper command-line arguments help
This commit is contained in:
parent
687e19c4f1
commit
c425fa9178
@ -125,6 +125,7 @@ namespace Nikse.SubtitleEdit.Forms.AudioToText
|
||||
// textBoxCpp
|
||||
//
|
||||
this.textBoxCpp.Dock = System.Windows.Forms.DockStyle.Fill;
|
||||
this.textBoxCpp.FocusedColor = System.Drawing.Color.FromArgb(((int)(((byte)(0)))), ((int)(((byte)(120)))), ((int)(((byte)(215)))));
|
||||
this.textBoxCpp.Font = new System.Drawing.Font("Consolas", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, ((byte)(0)));
|
||||
this.textBoxCpp.Location = new System.Drawing.Point(3, 3);
|
||||
this.textBoxCpp.Multiline = true;
|
||||
@ -150,6 +151,7 @@ namespace Nikse.SubtitleEdit.Forms.AudioToText
|
||||
// textBoxConstMe
|
||||
//
|
||||
this.textBoxConstMe.Dock = System.Windows.Forms.DockStyle.Fill;
|
||||
this.textBoxConstMe.FocusedColor = System.Drawing.Color.FromArgb(((int)(((byte)(0)))), ((int)(((byte)(120)))), ((int)(((byte)(215)))));
|
||||
this.textBoxConstMe.Font = new System.Drawing.Font("Consolas", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, ((byte)(0)));
|
||||
this.textBoxConstMe.Location = new System.Drawing.Point(3, 3);
|
||||
this.textBoxConstMe.Multiline = true;
|
||||
@ -175,6 +177,7 @@ namespace Nikse.SubtitleEdit.Forms.AudioToText
|
||||
// textBoxOpenAI
|
||||
//
|
||||
this.textBoxOpenAI.Dock = System.Windows.Forms.DockStyle.Fill;
|
||||
this.textBoxOpenAI.FocusedColor = System.Drawing.Color.FromArgb(((int)(((byte)(0)))), ((int)(((byte)(120)))), ((int)(((byte)(215)))));
|
||||
this.textBoxOpenAI.Font = new System.Drawing.Font("Consolas", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, ((byte)(0)));
|
||||
this.textBoxOpenAI.Location = new System.Drawing.Point(3, 3);
|
||||
this.textBoxOpenAI.Multiline = true;
|
||||
@ -200,6 +203,7 @@ namespace Nikse.SubtitleEdit.Forms.AudioToText
|
||||
// textBoxPurfviewFasterWhisper
|
||||
//
|
||||
this.textBoxPurfviewFasterWhisper.Dock = System.Windows.Forms.DockStyle.Fill;
|
||||
this.textBoxPurfviewFasterWhisper.FocusedColor = System.Drawing.Color.FromArgb(((int)(((byte)(0)))), ((int)(((byte)(120)))), ((int)(((byte)(215)))));
|
||||
this.textBoxPurfviewFasterWhisper.Font = new System.Drawing.Font("Consolas", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, ((byte)(0)));
|
||||
this.textBoxPurfviewFasterWhisper.Location = new System.Drawing.Point(3, 3);
|
||||
this.textBoxPurfviewFasterWhisper.Multiline = true;
|
||||
|
@ -118,8 +118,7 @@
|
||||
<value>System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
|
||||
</resheader>
|
||||
<data name="textBoxCpp.Text" xml:space="preserve">
|
||||
<value>CPP:
|
||||
-t N, --threads N [4 ] number of threads to use during computation
|
||||
<value> -t N, --threads N [4 ] number of threads to use during computation
|
||||
-p N, --processors N [1 ] number of processors to use during computation
|
||||
-ot N, --offset-t N [0 ] time offset in milliseconds
|
||||
-on N, --offset-n N [0 ] segment index offset
|
||||
@ -127,14 +126,15 @@
|
||||
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
|
||||
-ml N, --max-len N [0 ] maximum segment length in characters
|
||||
-sow, --split-on-word [false ] split on word rather than on token
|
||||
-bo N, --best-of N [2 ] number of best candidates to keep
|
||||
-bs N, --beam-size N [-1 ] beam size for beam search
|
||||
-bo N, --best-of N [5 ] number of best candidates to keep
|
||||
-bs N, --beam-size N [5 ] beam size for beam search
|
||||
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
|
||||
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
|
||||
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
|
||||
-su, --speed-up [false ] speed up audio by x2 (reduced accuracy)
|
||||
-debug, --debug-mode [false ] enable debug mode (eg. dump log_mel)
|
||||
-tr, --translate [false ] translate from source language to english
|
||||
-di, --diarize [false ] stereo audio diarization
|
||||
-tdrz, --tinydiarize [false ] enable tinydiarize (requires a tdrz model)
|
||||
-nf, --no-fallback [false ] do not use temperature fallback while decoding
|
||||
-otxt, --output-txt [false ] output result in a text file
|
||||
-ovtt, --output-vtt [false ] output result in a vtt file
|
||||
@ -144,11 +144,19 @@
|
||||
-fp, --font-path [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video
|
||||
-ocsv, --output-csv [false ] output result in a CSV file
|
||||
-oj, --output-json [false ] output result in a JSON file
|
||||
-ojf, --output-json-full [false ] include more information in the JSON file
|
||||
-of FNAME, --output-file FNAME [ ] output file path (without file extension)
|
||||
-ps, --print-special [false ] print special tokens
|
||||
-pc, --print-colors [false ] print colors
|
||||
-pp, --print-progress [false ] print progress
|
||||
-nt, --no-timestamps [true ] do not print timestamps</value>
|
||||
-nt, --no-timestamps [false ] do not print timestamps
|
||||
-l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
|
||||
-dl, --detect-language [false ] exit after automatically detecting language
|
||||
--prompt PROMPT [ ] initial prompt
|
||||
-oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
|
||||
-ls, --log-score [false ] log best decoder scores of tokens
|
||||
-ng, --no-gpu [false ] disable GPU
|
||||
</value>
|
||||
</data>
|
||||
<data name="textBoxConstMe.Text" xml:space="preserve">
|
||||
<value>Const-me:
|
||||
@ -230,93 +238,82 @@
|
||||
</value>
|
||||
</data>
|
||||
<data name="textBoxPurfviewFasterWhisper.Text" xml:space="preserve">
|
||||
<value> --temperature TEMPERATURE
|
||||
<value> --device DEVICE, -d DEVICE
|
||||
device to use (default: cpu)
|
||||
--verbose VERBOSE, -v VERBOSE
|
||||
whether to print out debug messages (default: False)
|
||||
--task {transcribe,translate}
|
||||
whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate') (default: transcribe)
|
||||
--temperature TEMPERATURE
|
||||
temperature to use for sampling (default: 0)
|
||||
--best_of BEST_OF, -bo BEST_OF
|
||||
number of candidates when sampling with non-zero temperature (default: 5)
|
||||
--beam_size BEAM_SIZE, -bs BEAM_SIZE
|
||||
number of beams in beam search, only applicable when temperature is zero (default: 1)
|
||||
number of beams in beam search, only applicable when temperature is zero (default: 5)
|
||||
--patience PATIENCE, -p PATIENCE
|
||||
optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the
|
||||
default (1.0) is equivalent to conventional beam search (default: 1.0)
|
||||
optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search (default: 1.0)
|
||||
--length_penalty LENGTH_PENALTY
|
||||
optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses
|
||||
simple length normalization by default (default: 1.0)
|
||||
optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default (default: 1.0)
|
||||
--repetition_penalty REPETITION_PENALTY
|
||||
Penalty applied to the score of previously generated tokens (set > 1.0 to penalize). (default:
|
||||
1.0)
|
||||
Penalty applied to the score of previously generated tokens (set > 1.0 to penalize). (default: 1.0)
|
||||
--no_repeat_ngram_size NO_REPEAT_NGRAM_SIZE
|
||||
Prevent repetitions of ngrams with this size (set 0 to disable). (default: 0)
|
||||
--suppress_tokens SUPPRESS_TOKENS
|
||||
comma-separated list of token ids to suppress during sampling; '-1' will suppress most special
|
||||
characters except common punctuations (default: -1)
|
||||
comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations (default: -1)
|
||||
--initial_prompt INITIAL_PROMPT, -prompt INITIAL_PROMPT
|
||||
optional text to provide as a prompt for the first window. (default: None)
|
||||
optional text to provide as a prompt for the first window. Use 'None' to disable it (default: ,.?!)
|
||||
--condition_on_previous_text CONDITION_ON_PREVIOUS_TEXT
|
||||
if True, provide the previous output of the model as a prompt for the next window; disabling
|
||||
may make the text inconsistent across windows, but the model becomes less prone to getting
|
||||
stuck in a failure loop (default: True)
|
||||
if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop
|
||||
(default: True)
|
||||
--prompt_reset_on_temperature PROMPT_RESET_ON_TEMPERATURE
|
||||
Resets prompt if temperature is above this value. Arg has effect only if
|
||||
condition_on_previous_text is True. (default: 0.5)
|
||||
Resets prompt if temperature is above this value. Arg has effect only if condition_on_previous_text is True. (default: 0.5)
|
||||
--temperature_increment_on_fallback TEMPERATURE_INCREMENT_ON_FALLBACK
|
||||
temperature to increase when falling back when the decoding fails to meet either of the
|
||||
thresholds below. To disable fallback set it to 'None'. (default: 0.2)
|
||||
temperature to increase when falling back when the decoding fails to meet either of the thresholds below. To disable fallback set it to 'None'. (default: 0.2)
|
||||
--compression_ratio_threshold COMPRESSION_RATIO_THRESHOLD
|
||||
if the gzip compression ratio is higher than this value, treat the decoding as failed
|
||||
(default: 2.4)
|
||||
if the gzip compression ratio is higher than this value, treat the decoding as failed (default: 2.4)
|
||||
--logprob_threshold LOGPROB_THRESHOLD
|
||||
if the average log probability is lower than this value, treat the decoding as failed
|
||||
(default: -1.0)
|
||||
if the average log probability is lower than this value, treat the decoding as failed (default: -1.0)
|
||||
--no_speech_threshold NO_SPEECH_THRESHOLD
|
||||
if the probability of the <|nospeech|> token is higher than this value AND the decoding has
|
||||
failed due to `logprob_threshold`, consider the segment as silence (default: 0.6)
|
||||
if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence (default: 0.6)
|
||||
--no_speech_strict_lvl {0,1,2}
|
||||
Level of stricter actions when no_speech_prob > 0.93. Use beam_size=5 if this is enabled.
|
||||
Options: 0 - Disabled (do nothing), 1 - Reset propmt (see condition_on_previous_text), 2 -
|
||||
Invalidate the cached encoder output (if no_speech_threshold is not None). Arg meant to combat
|
||||
cases where the model is getting stuck in a failure loop or outputs nonsense (default: 0)
|
||||
--word_timestamps WORD_TIMESTAMPS
|
||||
(experimental) extract word-level timestamps and refine the results based on them (default:
|
||||
True)
|
||||
--highlight_words HIGHLIGHT_WORDS
|
||||
(experimental) Level of stricter actions when no_speech_prob > 0.93. Use beam_size=5 if this is enabled. Options: 0 - Disabled (do nothing), 1 - Reset prompt (see condition_on_previous_text), 2 - Invalidate the
|
||||
cached encoder output (if no_speech_threshold is not None). Arg meant to combat cases where the model is getting stuck in a failure loop or outputs nonsense (default: 0)
|
||||
--word_timestamps WORD_TIMESTAMPS, -wt WORD_TIMESTAMPS
|
||||
(experimental) extract word-level timestamps and refine the results based on them (default: True)
|
||||
--highlight_words HIGHLIGHT_WORDS, -hw HIGHLIGHT_WORDS
|
||||
underline each word as it is spoken AKA karaoke in srt and vtt output formats (default: False)
|
||||
--prepend_punctuations PREPEND_PUNCTUATIONS
|
||||
if word_timestamps is True, merge these punctuation symbols with the next word (default:
|
||||
"'“¿([{-)
|
||||
if word_timestamps is True, merge these punctuation symbols with the next word (default: "'“¿([{-)
|
||||
--append_punctuations APPEND_PUNCTUATIONS
|
||||
if word_timestamps is True, merge these punctuation symbols with the previous word (default:
|
||||
"'.。,,!!??::”)]}、)
|
||||
--threads THREADS number of threads used for CPU inference; By default number of the real cores but no more that
|
||||
4 (default: 0)
|
||||
if word_timestamps is True, merge these punctuation symbols with the previous word (default: "'.。,,!!??::”)]}、)
|
||||
--threads THREADS number of threads used for CPU inference; By default number of the real cores but no more than 4 (default: 0)
|
||||
--version Show Faster-Whisper's version number
|
||||
--vad_filter VAD_FILTER, -vad VAD_FILTER
|
||||
Enable the voice activity detection (VAD) to filter out parts of the audio without speech.
|
||||
(default: True)
|
||||
Enable the voice activity detection (VAD) to filter out parts of the audio without speech. (default: True)
|
||||
--vad_threshold VAD_THRESHOLD
|
||||
Probabilities above this value are considered as speech. (default: 0.45)
|
||||
--vad_min_speech_duration_ms VAD_MIN_SPEECH_DURATION_MS
|
||||
Final speech chunks shorter min_speech_duration_ms are thrown out. (default: 350)
|
||||
--vad_max_speech_duration_s VAD_MAX_SPEECH_DURATION_S
|
||||
Maximum duration of speech chunks in seconds. Longer will be split at the timestamp of the
|
||||
last silence. (default: None)
|
||||
Maximum duration of speech chunks in seconds. Longer will be split at the timestamp of the last silence. (default: None)
|
||||
--vad_min_silence_duration_ms VAD_MIN_SILENCE_DURATION_MS
|
||||
In the end of each speech chunk time to wait before separating it. (default: 3000)
|
||||
--vad_speech_pad_ms VAD_SPEECH_PAD_MS
|
||||
Final speech chunks are padded by speech_pad_ms each side. (default: 900)
|
||||
--vad_window_size_samples VAD_WINDOW_SIZE_SAMPLES
|
||||
Size of audio chunks fed to the silero VAD model. Values other than 512, 1024, 1536 may affect
|
||||
model perfomance!!! (default: 1536)
|
||||
Size of audio chunks fed to the silero VAD model. Values other than 512, 1024, 1536 may affect model performance!!! (default: 1536)
|
||||
--compute_type {default,auto,int8,int8_float16,int8_float32,int8_bfloat16,int16,float16,float32,bfloat16}, -ct {default,auto,int8,int8_float16,int8_float32,int8_bfloat16,int16,float16,float32,bfloat16}
|
||||
Type of quantization to use (see https://opennmt.net/CTranslate2/quantization.html). (default:
|
||||
default)
|
||||
Type of quantization to use (see https://opennmt.net/CTranslate2/quantization.html). (default: auto)
|
||||
--batch_recursive, -br
|
||||
Enables recursive batch processing. Note: If set then it changes defaults of --output_dir.
|
||||
(default: False)
|
||||
--beep_off Disables beep sound when operation is finished. (default: False)
|
||||
--skip Skips files if 'srt' subtitle exists. Works if input is wildcard or directory. (default:
|
||||
False)
|
||||
--checkcuda, -cc Check for CUDA devices. (for Subtitle Edit's internal use)
|
||||
Enables recursive batch processing. Note: If set then it changes defaults of --output_dir. (default: False)
|
||||
--beep_off Disables the beep sound when operation is finished. (default: False)
|
||||
--skip Skips files if 'srt' subtitle exists. Works if input is wildcard or directory. (default: False)
|
||||
--checkcuda, -cc Returns CUDA device count. (for Subtitle Edit's internal use)
|
||||
--print_progress, -pp
|
||||
Prints progress bar instead of transcription. (default: False)
|
||||
</value>
|
||||
--postfix Adds language as a postfix to subtitle's filename. (default: False)
|
||||
--one_word Outputs srt and vtt subtitles with one word per line. Note: VAD may slightly reduce the accuracy of timestamps on some lines. (default: False)
|
||||
--PR163_off Disables PR163. For dev experiments. (default: False)
|
||||
</value>
|
||||
</data>
|
||||
</root>
|
Loading…
Reference in New Issue
Block a user