diff --git a/src/libse/Common/Settings.cs b/src/libse/Common/Settings.cs index 01255521e..8c896cfea 100644 --- a/src/libse/Common/Settings.cs +++ b/src/libse/Common/Settings.cs @@ -476,6 +476,7 @@ namespace Nikse.SubtitleEdit.Core.Common public string WhisperLocation { get; set; } public string WhisperCtranslate2Location { get; set; } public string WhisperPurfviewFasterWhisperLocation { get; set; } + public string WhisperPurfviewFasterWhisperDefaultCmd { get; set; } public string WhisperXLocation { get; set; } public string WhisperStableTsLocation { get; set; } public string WhisperCppModelLocation { get; set; } @@ -710,6 +711,7 @@ namespace Nikse.SubtitleEdit.Core.Common VoskPostProcessing = true; WhisperChoice = Configuration.IsRunningOnWindows ? AudioToText.WhisperChoice.PurfviewFasterWhisper : AudioToText.WhisperChoice.OpenAi; WhisperDeleteTempFiles = true; + WhisperPurfviewFasterWhisperDefaultCmd = "--standard"; WhisperExtraSettings = ""; WhisperLanguageCode = "en"; WhisperAutoAdjustTimings = true; @@ -7128,6 +7130,12 @@ $HorzAlign = Center settings.Tools.WhisperPurfviewFasterWhisperLocation = subNode.InnerText; } + subNode = node.SelectSingleNode("WhisperPurfviewFasterWhisperDefaultCmd"); + if (subNode != null) + { + settings.Tools.WhisperPurfviewFasterWhisperDefaultCmd = subNode.InnerText; + } + subNode = node.SelectSingleNode("WhisperExtraSettings"); if (subNode != null) { @@ -12084,6 +12092,7 @@ $HorzAlign = Center textWriter.WriteElementString("WhisperLocation", settings.Tools.WhisperLocation); textWriter.WriteElementString("WhisperCtranslate2Location", settings.Tools.WhisperCtranslate2Location); textWriter.WriteElementString("WhisperPurfviewFasterWhisperLocation", settings.Tools.WhisperPurfviewFasterWhisperLocation); + textWriter.WriteElementString("WhisperPurfviewFasterWhisperDefaultCmd", settings.Tools.WhisperPurfviewFasterWhisperDefaultCmd); textWriter.WriteElementString("WhisperXLocation", settings.Tools.WhisperXLocation); textWriter.WriteElementString("WhisperStableTsLocation", settings.Tools.WhisperStableTsLocation); textWriter.WriteElementString("WhisperCppModelLocation", settings.Tools.WhisperCppModelLocation); diff --git a/src/ui/Forms/AudioToText/WhisperAdvanced.cs b/src/ui/Forms/AudioToText/WhisperAdvanced.cs index 8fdb239e2..368fbaa8e 100644 --- a/src/ui/Forms/AudioToText/WhisperAdvanced.cs +++ b/src/ui/Forms/AudioToText/WhisperAdvanced.cs @@ -73,6 +73,15 @@ namespace Nikse.SubtitleEdit.Forms.AudioToText } Configuration.Settings.Tools.WhisperExtraSettings = comboBoxWhisperExtra.Text; + + if (Configuration.Settings.Tools.WhisperChoice == WhisperChoice.PurfviewFasterWhisper) + { + if (Configuration.Settings.Tools.WhisperPurfviewFasterWhisperDefaultCmd != comboBoxWhisperExtra.Text) + { + Configuration.Settings.Tools.WhisperPurfviewFasterWhisperDefaultCmd = string.Empty; + } + } + DialogResult = DialogResult.OK; } diff --git a/src/ui/Forms/AudioToText/WhisperAdvanced.resx b/src/ui/Forms/AudioToText/WhisperAdvanced.resx index 254903b2c..91ad71b6f 100644 --- a/src/ui/Forms/AudioToText/WhisperAdvanced.resx +++ b/src/ui/Forms/AudioToText/WhisperAdvanced.resx @@ -238,85 +238,83 @@ - --device DEVICE, -d DEVICE - Device to use. Default is 'cuda' if CUDA device is detected, else is 'cpu'. If CUDA GPU is a second device then set 'cuda:1'. - --verbose VERBOSE, -v VERBOSE - whether to print out debug messages (default: False) - --task {transcribe,translate} - whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate') (default: transcribe) - --temperature TEMPERATURE - temperature to use for sampling (default: 0) - --best_of BEST_OF, -bo BEST_OF - number of candidates when sampling with non-zero temperature (default: 5) - --beam_size BEAM_SIZE, -bs BEAM_SIZE - number of beams in beam search, only applicable when temperature is zero (default: 5) - --patience PATIENCE, -p PATIENCE - optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search (default: 1.0) - --length_penalty LENGTH_PENALTY - optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default (default: 1.0) - --repetition_penalty REPETITION_PENALTY - Penalty applied to the score of previously generated tokens (set > 1.0 to penalize). (default: 1.0) - --no_repeat_ngram_size NO_REPEAT_NGRAM_SIZE - Prevent repetitions of ngrams with this size (set 0 to disable). (default: 0) - --suppress_tokens SUPPRESS_TOKENS - comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations (default: -1) - --initial_prompt INITIAL_PROMPT, -prompt INITIAL_PROMPT - optional text to provide as a prompt for the first window. Use 'None' to disable it (default: ,.?!) - --condition_on_previous_text CONDITION_ON_PREVIOUS_TEXT - if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop - (default: True) - --prompt_reset_on_temperature PROMPT_RESET_ON_TEMPERATURE - Resets prompt if temperature is above this value. Arg has effect only if condition_on_previous_text is True. (default: 0.5) - --temperature_increment_on_fallback TEMPERATURE_INCREMENT_ON_FALLBACK - temperature to increase when falling back when the decoding fails to meet either of the thresholds below. To disable fallback set it to 'None'. (default: 0.2) - --compression_ratio_threshold COMPRESSION_RATIO_THRESHOLD - if the gzip compression ratio is higher than this value, treat the decoding as failed (default: 2.4) - --logprob_threshold LOGPROB_THRESHOLD - if the average log probability is lower than this value, treat the decoding as failed (default: -1.0) - --no_speech_threshold NO_SPEECH_THRESHOLD - if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence (default: 0.6) - --no_speech_strict_lvl {0,1,2} - (experimental) Level of stricter actions when no_speech_prob > 0.93. Use beam_size=5 if this is enabled. Options: 0 - Disabled (do nothing), 1 - Reset propmt (see condition_on_previous_text), 2 - Invalidate the - cached encoder output (if no_speech_threshold is not None). Arg meant to combat cases where the model is getting stuck in a failure loop or outputs nonsense (default: 0) - --word_timestamps WORD_TIMESTAMPS, -wt WORD_TIMESTAMPS - (experimental) extract word-level timestamps and refine the results based on them (default: True) - --highlight_words HIGHLIGHT_WORDS, -hw HIGHLIGHT_WORDS - underline each word as it is spoken AKA karaoke in srt and vtt output formats (default: False) - --prepend_punctuations PREPEND_PUNCTUATIONS - if word_timestamps is True, merge these punctuation symbols with the next word (default: "'“¿([{-) - --append_punctuations APPEND_PUNCTUATIONS - if word_timestamps is True, merge these punctuation symbols with the previous word (default: "'.。,,!!??::”)]}、) - --threads THREADS number of threads used for CPU inference; By default number of the real cores but no more that 4 (default: 0) - --version Show Faster-Whisper's version number - --vad_filter VAD_FILTER, -vad VAD_FILTER - Enable the voice activity detection (VAD) to filter out parts of the audio without speech. (default: True) - --vad_threshold VAD_THRESHOLD - Probabilities above this value are considered as speech. (default: 0.45) - --vad_min_speech_duration_ms VAD_MIN_SPEECH_DURATION_MS - Final speech chunks shorter min_speech_duration_ms are thrown out. (default: 350) - --vad_max_speech_duration_s VAD_MAX_SPEECH_DURATION_S - Maximum duration of speech chunks in seconds. Longer will be split at the timestamp of the last silence. (default: None) - --vad_min_silence_duration_ms VAD_MIN_SILENCE_DURATION_MS - In the end of each speech chunk time to wait before separating it. (default: 3000) - --vad_speech_pad_ms VAD_SPEECH_PAD_MS - Final speech chunks are padded by speech_pad_ms each side. (default: 900) - --vad_window_size_samples VAD_WINDOW_SIZE_SAMPLES - Size of audio chunks fed to the silero VAD model. Values other than 512, 1024, 1536 may affect model perfomance!!! (default: 1536) - --compute_type {default,auto,int8,int8_float16,int8_float32,int8_bfloat16,int16,float16,float32,bfloat16}, -ct {default,auto,int8,int8_float16,int8_float32,int8_bfloat16,int16,float16,float32,bfloat16} - Type of quantization to use (see https://opennmt.net/CTranslate2/quantization.html). (default: auto) - --batch_recursive, -br - Enables recursive batch processing. Note: If set then it changes defaults of --output_dir. (default: False) - --beep_off Disables the beep sound when operation is finished. (default: False) - --skip Skips files if 'srt' subtitle exists. Works if input is wildcard or directory. (default: False) - --checkcuda, -cc Returns CUDA device count. (for Subtitle Edit's internal use) - --print_progress, -pp - Prints progress bar instead of transcription. (default: False) - --postfix Adds language as a postfix to subtitle's filename. (default: False) - --one_word {0,1,2} 0) Disabled. 1) Outputs srt and vtt subtitles with one word per line. 2) As `1`, plus removes - whitespace and ensures >= 50ms for sub lines. Note: VAD may slightly reduce the accuracy of - timestamps on some lines. (default: 0) - --check_files Checks input files for errors before passing all them for transcription. Works if input is - wildcard or directory. (default: False) - --PR163_off Disables PR163. For dev experiments. (default: False) + [--device DEVICE] +[--output_dir OUTPUT_DIR] +[--output_format {lrc,txt,text,vtt,srt,tsv,json,all}] +[--verbose VERBOSE] +[--task {transcribe,translate}] +[--temperature TEMPERATURE] +[--best_of BEST_OF] +[--beam_size BEAM_SIZE] +[--patience PATIENCE] +[--length_penalty LENGTH_PENALTY] +[--repetition_penalty REPETITION_PENALTY] +[--no_repeat_ngram_size NO_REPEAT_NGRAM_SIZE] +[--suppress_blank SUPPRESS_BLANK] +[--suppress_tokens SUPPRESS_TOKENS] +[--initial_prompt INITIAL_PROMPT] +[--prefix PREFIX] +[--condition_on_previous_text CONDITION_ON_PREVIOUS_TEXT] +[--prompt_reset_on_temperature PROMPT_RESET_ON_TEMPERATURE] +[--without_timestamps WITHOUT_TIMESTAMPS] +[--max_initial_timestamp MAX_INITIAL_TIMESTAMP] +[--temperature_increment_on_fallback TEMPERATURE_INCREMENT_ON_FALLBACK] +[--compression_ratio_threshold COMPRESSION_RATIO_THRESHOLD] +[--logprob_threshold LOGPROB_THRESHOLD] +[--no_speech_threshold NO_SPEECH_THRESHOLD] +[--hallucination_silence_threshold HALLUCINATION_SILENCE_THRESHOLD] +[--clip_timestamps CLIP_TIMESTAMPS] +[--no_speech_strict_lvl {0,1,2}] +[--word_timestamps WORD_TIMESTAMPS] +[--highlight_words HIGHLIGHT_WORDS] +[--prepend_punctuations PREPEND_PUNCTUATIONS] +[--append_punctuations APPEND_PUNCTUATIONS] +[--threads THREADS] +[--version] +[--vad_filter VAD_FILTER] +[--vad_threshold VAD_THRESHOLD] +[--vad_min_speech_duration_ms VAD_MIN_SPEECH_DURATION_MS] +[--vad_max_speech_duration_s VAD_MAX_SPEECH_DURATION_S] +[--vad_min_silence_duration_ms VAD_MIN_SILENCE_DURATION_MS] +[--vad_speech_pad_ms VAD_SPEECH_PAD_MS] +[--vad_window_size_samples VAD_WINDOW_SIZE_SAMPLES] +[--max_new_tokens MAX_NEW_TOKENS] +[--chunk_length CHUNK_LENGTH] +[--compute_type {default,auto,int8,int8_float16,int8_float32,int8_bfloat16,int16,float16,float32,bfloat16}] +[--batch_recursive] +[--beep_off] +[--skip] +[--checkcuda] +[--print_progress] +[--postfix] +[--check_files] +[--PR163_off] +[--hallucinations_list_off] +[--one_word {0,1,2}] +[--sentence] +[--standard] +[--standard_asia] +[--max_comma MAX_COMMA] +[--max_comma_cent {50,60,70,80,90,100}] +[--max_gap MAX_GAP] +[--max_line_width MAX_LINE_WIDTH] +[--max_line_count MAX_LINE_COUNT] +[--min_dist_to_end {0,4,5,6,7,8,9,10,11,12}] +[--prompt_max {16,32,64,128,223}] +[--reprompt {0,1,2}] +[--prompt_reset_on_no_end {0,1,2}] +[--ff_dump] +[--ff_mp3] +[--ff_sync] +[--ff_rnndn_sh] +[--ff_rnndn_xiph] +[--ff_fftdn [0 - 97]] +[--ff_tempo [0.5 - 2.0]] +[--ff_gate] +[--ff_speechnorm] +[--ff_loudnorm] +[--ff_silence_suppress noise duration] +[--ff_lowhighpass] + - + \ No newline at end of file diff --git a/src/ui/Forms/AudioToText/WhisperAudioToText.cs b/src/ui/Forms/AudioToText/WhisperAudioToText.cs index c183b14fc..2d316f286 100644 --- a/src/ui/Forms/AudioToText/WhisperAudioToText.cs +++ b/src/ui/Forms/AudioToText/WhisperAudioToText.cs @@ -130,6 +130,11 @@ namespace Nikse.SubtitleEdit.Forms.AudioToText labelEngine.Text = LanguageSettings.Current.AudioToText.Engine; labelEngine.Left = comboBoxWhisperEngine.Left - labelEngine.Width - 5; + if (Configuration.Settings.Tools.WhisperChoice == WhisperChoice.PurfviewFasterWhisper && !string.IsNullOrEmpty(Configuration.Settings.Tools.WhisperPurfviewFasterWhisperDefaultCmd)) + { + Configuration.Settings.Tools.WhisperExtraSettings = Configuration.Settings.Tools.WhisperPurfviewFasterWhisperDefaultCmd; + } + Init(); InitializeWhisperEngines(comboBoxWhisperEngine); } diff --git a/src/ui/Forms/AudioToText/WhisperDownload.cs b/src/ui/Forms/AudioToText/WhisperDownload.cs index 520714fe6..1658f7ed3 100644 --- a/src/ui/Forms/AudioToText/WhisperDownload.cs +++ b/src/ui/Forms/AudioToText/WhisperDownload.cs @@ -88,15 +88,16 @@ namespace Nikse.SubtitleEdit.Forms.AudioToText }; - private const string DownloadUrlPurfviewFasterWhisper = "https://github.com/Purfview/whisper-standalone-win/releases/download/faster-whisper/Whisper-Faster_r167.2_windows.zip"; + private const string DownloadUrlPurfviewFasterWhisper = "https://github.com/Purfview/whisper-standalone-win/releases/download/faster-whisper/Whisper-Faster_r186.1_windows.zip"; private static readonly string[] Sha512HashesPurfviewFasterWhisper = { - "a16e2b5460d7f4b0d45de3f0e07b231d58ad4c79d077ad6b9c84a4e2ced4bd1cd3a7d9f01689f1d847ec8ff59c8f81cb742fcf2b153291ed6f15ec8b27adb998", // r167.2 + "e78616511a92b21cb8ac82e23cdbd06f5b9310751e5f3fa940b5c48743b69bad130aaf6d629ae07c5388326f117be8f181b125ed04aacd23f1a80d8891be889b", // r186.1 }; private static readonly string[] OldSha512HashesPurfviewFasterWhisper = { + "a16e2b5460d7f4b0d45de3f0e07b231d58ad4c79d077ad6b9c84a4e2ced4bd1cd3a7d9f01689f1d847ec8ff59c8f81cb742fcf2b153291ed6f15ec8b27adb998", // r167.2 "1995feca9dd971eccfb41f8dc330d418a531e615cee56eac7cc053fd343fe5200f9e64e2b4feafdde49b018ac518d1ee1b244aedd32dcb84e3fb69c1035b8a4f", // r160.7 "10ac03f098f991fe9474430a7f44c6fe0574dfb88d37ea4a31b764c540337918c529c4eceaf0524e88975b11b771c61dd67501d2a59fe05008a10195d2768edf", // r160.6 "9d65922c41a8848e70f04af8deed7279f827264e1fa305c165849e391917713f0336eee07320b2c2cbb6191167953f4d6d1e23a378bfa5a4273c6065a0eba5b3", // r160.5 @@ -388,7 +389,8 @@ namespace Nikse.SubtitleEdit.Forms.AudioToText var hashVersion160_5 = "6983c90c96e47f53fb1451c1f0a32151ef144fe2e549affc7319d0c7666ea44dcbb0d7dc87ccdaaf0b3d8b2abe92060440e151495109f2681b99940f0eec5ad0"; var hashVersion160_6 = "f616a4fecfb40e74b3e096207f08fbe84a0d08ad872380cf2791eba8458ed854399de2d547be98bc35c65ce0b6959a149b981e745aa75876ffa8eb2fc6a8719e"; var hashVersion160_7 = "0f6b5b0a8d3d169ca7947866552dec30ac43406cda6b7e748c273ed78574087e330571925d8a36d48e5a3ea197d450be0289277677fdbad069038ac0788ea82e"; - return hash == hashVersion153 || hash == hashVersion160_3 || hash == hashVersion160_4 || hash == hashVersion160_5 || hash == hashVersion160_6 || hash == hashVersion160_7; + var hashVersion167_2 = "628dee27ab3030798c42983d0f544668f54e7c8d1c7a433b322b9c07286eedd10666d9b1f89764a75301b334cea9c7ad8bfbfeee00a98113b4730ee5cafe8812"; + return hash == hashVersion153 || hash == hashVersion160_3 || hash == hashVersion160_4 || hash == hashVersion160_5 || hash == hashVersion160_6 || hash == hashVersion160_7 || hash == hashVersion167_2; } if (whisperChoice == WhisperChoice.Cpp) diff --git a/src/ui/Forms/Options/Settings.cs b/src/ui/Forms/Options/Settings.cs index af42e0cf6..7c16c2051 100644 --- a/src/ui/Forms/Options/Settings.cs +++ b/src/ui/Forms/Options/Settings.cs @@ -44,7 +44,6 @@ namespace Nikse.SubtitleEdit.Forms.Options private List _rulesProfiles; private List _pluginShortcuts; private readonly bool _loading; - private readonly BackgroundWorker _shortcutsBackgroundWorker; private string _defaultLanguages; private static IEnumerable GetSubtitleFormats() => SubtitleFormat.AllSubtitleFormats.Where(format => !format.IsVobSubIndexFile).Select(format => format.FriendlyName); @@ -130,7 +129,6 @@ namespace Nikse.SubtitleEdit.Forms.Options "Network", "File type associations"}); - _shortcutsBackgroundWorker = new BackgroundWorker(); Init(); _loading = false; @@ -139,12 +137,7 @@ namespace Nikse.SubtitleEdit.Forms.Options public void Init() { - _shortcutsBackgroundWorker.DoWork += (sender, args) => - { - MakeShortcutsTreeView(LanguageSettings.Current.Settings); - }; - _shortcutsBackgroundWorker.RunWorkerAsync(); - + MakeShortcutsTreeView(LanguageSettings.Current.Settings); labelStatus.Text = string.Empty; _rulesProfiles = new List(Configuration.Settings.General.Profiles); var gs = Configuration.Settings.General; @@ -2508,12 +2501,6 @@ namespace Nikse.SubtitleEdit.Forms.Options case ShortcutsSection: section = panelShortcuts; Cursor = Cursors.WaitCursor; - while (_shortcutsBackgroundWorker.IsBusy) - { - System.Threading.Thread.Sleep(10); - Application.DoEvents(); - } - ShowShortcutsTreeView(); Cursor = Cursors.Default; break;