mirror of
https://github.com/SubtitleEdit/subtitleedit.git
synced 2024-11-25 04:33:04 +01:00
Update Purfview Faster Whisper to r186.1
This commit is contained in:
parent
bdfb3f1122
commit
4e165cc719
@ -476,6 +476,7 @@ namespace Nikse.SubtitleEdit.Core.Common
|
||||
public string WhisperLocation { get; set; }
|
||||
public string WhisperCtranslate2Location { get; set; }
|
||||
public string WhisperPurfviewFasterWhisperLocation { get; set; }
|
||||
public string WhisperPurfviewFasterWhisperDefaultCmd { get; set; }
|
||||
public string WhisperXLocation { get; set; }
|
||||
public string WhisperStableTsLocation { get; set; }
|
||||
public string WhisperCppModelLocation { get; set; }
|
||||
@ -710,6 +711,7 @@ namespace Nikse.SubtitleEdit.Core.Common
|
||||
VoskPostProcessing = true;
|
||||
WhisperChoice = Configuration.IsRunningOnWindows ? AudioToText.WhisperChoice.PurfviewFasterWhisper : AudioToText.WhisperChoice.OpenAi;
|
||||
WhisperDeleteTempFiles = true;
|
||||
WhisperPurfviewFasterWhisperDefaultCmd = "--standard";
|
||||
WhisperExtraSettings = "";
|
||||
WhisperLanguageCode = "en";
|
||||
WhisperAutoAdjustTimings = true;
|
||||
@ -7128,6 +7130,12 @@ $HorzAlign = Center
|
||||
settings.Tools.WhisperPurfviewFasterWhisperLocation = subNode.InnerText;
|
||||
}
|
||||
|
||||
subNode = node.SelectSingleNode("WhisperPurfviewFasterWhisperDefaultCmd");
|
||||
if (subNode != null)
|
||||
{
|
||||
settings.Tools.WhisperPurfviewFasterWhisperDefaultCmd = subNode.InnerText;
|
||||
}
|
||||
|
||||
subNode = node.SelectSingleNode("WhisperExtraSettings");
|
||||
if (subNode != null)
|
||||
{
|
||||
@ -12084,6 +12092,7 @@ $HorzAlign = Center
|
||||
textWriter.WriteElementString("WhisperLocation", settings.Tools.WhisperLocation);
|
||||
textWriter.WriteElementString("WhisperCtranslate2Location", settings.Tools.WhisperCtranslate2Location);
|
||||
textWriter.WriteElementString("WhisperPurfviewFasterWhisperLocation", settings.Tools.WhisperPurfviewFasterWhisperLocation);
|
||||
textWriter.WriteElementString("WhisperPurfviewFasterWhisperDefaultCmd", settings.Tools.WhisperPurfviewFasterWhisperDefaultCmd);
|
||||
textWriter.WriteElementString("WhisperXLocation", settings.Tools.WhisperXLocation);
|
||||
textWriter.WriteElementString("WhisperStableTsLocation", settings.Tools.WhisperStableTsLocation);
|
||||
textWriter.WriteElementString("WhisperCppModelLocation", settings.Tools.WhisperCppModelLocation);
|
||||
|
@ -73,6 +73,15 @@ namespace Nikse.SubtitleEdit.Forms.AudioToText
|
||||
}
|
||||
|
||||
Configuration.Settings.Tools.WhisperExtraSettings = comboBoxWhisperExtra.Text;
|
||||
|
||||
if (Configuration.Settings.Tools.WhisperChoice == WhisperChoice.PurfviewFasterWhisper)
|
||||
{
|
||||
if (Configuration.Settings.Tools.WhisperPurfviewFasterWhisperDefaultCmd != comboBoxWhisperExtra.Text)
|
||||
{
|
||||
Configuration.Settings.Tools.WhisperPurfviewFasterWhisperDefaultCmd = string.Empty;
|
||||
}
|
||||
}
|
||||
|
||||
DialogResult = DialogResult.OK;
|
||||
}
|
||||
|
||||
|
@ -238,85 +238,83 @@
|
||||
</value>
|
||||
</data>
|
||||
<data name="textBoxPurfviewFasterWhisper.Text" xml:space="preserve">
|
||||
<value> --device DEVICE, -d DEVICE
|
||||
Device to use. Default is 'cuda' if CUDA device is detected, else is 'cpu'. If CUDA GPU is a second device then set 'cuda:1'.
|
||||
--verbose VERBOSE, -v VERBOSE
|
||||
whether to print out debug messages (default: False)
|
||||
--task {transcribe,translate}
|
||||
whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate') (default: transcribe)
|
||||
--temperature TEMPERATURE
|
||||
temperature to use for sampling (default: 0)
|
||||
--best_of BEST_OF, -bo BEST_OF
|
||||
number of candidates when sampling with non-zero temperature (default: 5)
|
||||
--beam_size BEAM_SIZE, -bs BEAM_SIZE
|
||||
number of beams in beam search, only applicable when temperature is zero (default: 5)
|
||||
--patience PATIENCE, -p PATIENCE
|
||||
optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search (default: 1.0)
|
||||
--length_penalty LENGTH_PENALTY
|
||||
optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default (default: 1.0)
|
||||
--repetition_penalty REPETITION_PENALTY
|
||||
Penalty applied to the score of previously generated tokens (set > 1.0 to penalize). (default: 1.0)
|
||||
--no_repeat_ngram_size NO_REPEAT_NGRAM_SIZE
|
||||
Prevent repetitions of ngrams with this size (set 0 to disable). (default: 0)
|
||||
--suppress_tokens SUPPRESS_TOKENS
|
||||
comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations (default: -1)
|
||||
--initial_prompt INITIAL_PROMPT, -prompt INITIAL_PROMPT
|
||||
optional text to provide as a prompt for the first window. Use 'None' to disable it (default: ,.?!)
|
||||
--condition_on_previous_text CONDITION_ON_PREVIOUS_TEXT
|
||||
if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop
|
||||
(default: True)
|
||||
--prompt_reset_on_temperature PROMPT_RESET_ON_TEMPERATURE
|
||||
Resets prompt if temperature is above this value. Arg has effect only if condition_on_previous_text is True. (default: 0.5)
|
||||
--temperature_increment_on_fallback TEMPERATURE_INCREMENT_ON_FALLBACK
|
||||
temperature to increase when falling back when the decoding fails to meet either of the thresholds below. To disable fallback set it to 'None'. (default: 0.2)
|
||||
--compression_ratio_threshold COMPRESSION_RATIO_THRESHOLD
|
||||
if the gzip compression ratio is higher than this value, treat the decoding as failed (default: 2.4)
|
||||
--logprob_threshold LOGPROB_THRESHOLD
|
||||
if the average log probability is lower than this value, treat the decoding as failed (default: -1.0)
|
||||
--no_speech_threshold NO_SPEECH_THRESHOLD
|
||||
if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence (default: 0.6)
|
||||
--no_speech_strict_lvl {0,1,2}
|
||||
(experimental) Level of stricter actions when no_speech_prob > 0.93. Use beam_size=5 if this is enabled. Options: 0 - Disabled (do nothing), 1 - Reset prompt (see condition_on_previous_text), 2 - Invalidate the
|
||||
cached encoder output (if no_speech_threshold is not None). Arg meant to combat cases where the model is getting stuck in a failure loop or outputs nonsense (default: 0)
|
||||
--word_timestamps WORD_TIMESTAMPS, -wt WORD_TIMESTAMPS
|
||||
(experimental) extract word-level timestamps and refine the results based on them (default: True)
|
||||
--highlight_words HIGHLIGHT_WORDS, -hw HIGHLIGHT_WORDS
|
||||
underline each word as it is spoken AKA karaoke in srt and vtt output formats (default: False)
|
||||
--prepend_punctuations PREPEND_PUNCTUATIONS
|
||||
if word_timestamps is True, merge these punctuation symbols with the next word (default: "'“¿([{-)
|
||||
--append_punctuations APPEND_PUNCTUATIONS
|
||||
if word_timestamps is True, merge these punctuation symbols with the previous word (default: "'.。,,!!??::”)]}、)
|
||||
--threads THREADS number of threads used for CPU inference; By default number of the real cores but no more than 4 (default: 0)
|
||||
--version Show Faster-Whisper's version number
|
||||
--vad_filter VAD_FILTER, -vad VAD_FILTER
|
||||
Enable the voice activity detection (VAD) to filter out parts of the audio without speech. (default: True)
|
||||
--vad_threshold VAD_THRESHOLD
|
||||
Probabilities above this value are considered as speech. (default: 0.45)
|
||||
--vad_min_speech_duration_ms VAD_MIN_SPEECH_DURATION_MS
|
||||
Final speech chunks shorter than min_speech_duration_ms are thrown out. (default: 350)
|
||||
--vad_max_speech_duration_s VAD_MAX_SPEECH_DURATION_S
|
||||
Maximum duration of speech chunks in seconds. Longer will be split at the timestamp of the last silence. (default: None)
|
||||
--vad_min_silence_duration_ms VAD_MIN_SILENCE_DURATION_MS
|
||||
In the end of each speech chunk time to wait before separating it. (default: 3000)
|
||||
--vad_speech_pad_ms VAD_SPEECH_PAD_MS
|
||||
Final speech chunks are padded by speech_pad_ms each side. (default: 900)
|
||||
--vad_window_size_samples VAD_WINDOW_SIZE_SAMPLES
|
||||
Size of audio chunks fed to the silero VAD model. Values other than 512, 1024, 1536 may affect model performance!!! (default: 1536)
|
||||
--compute_type {default,auto,int8,int8_float16,int8_float32,int8_bfloat16,int16,float16,float32,bfloat16}, -ct {default,auto,int8,int8_float16,int8_float32,int8_bfloat16,int16,float16,float32,bfloat16}
|
||||
Type of quantization to use (see https://opennmt.net/CTranslate2/quantization.html). (default: auto)
|
||||
--batch_recursive, -br
|
||||
Enables recursive batch processing. Note: If set then it changes defaults of --output_dir. (default: False)
|
||||
--beep_off Disables the beep sound when operation is finished. (default: False)
|
||||
--skip Skips files if 'srt' subtitle exists. Works if input is wildcard or directory. (default: False)
|
||||
--checkcuda, -cc Returns CUDA device count. (for Subtitle Edit's internal use)
|
||||
--print_progress, -pp
|
||||
Prints progress bar instead of transcription. (default: False)
|
||||
--postfix Adds language as a postfix to subtitle's filename. (default: False)
|
||||
--one_word {0,1,2} 0) Disabled. 1) Outputs srt and vtt subtitles with one word per line. 2) As `1`, plus removes
|
||||
whitespace and ensures >= 50ms for sub lines. Note: VAD may slightly reduce the accuracy of
|
||||
timestamps on some lines. (default: 0)
|
||||
--check_files Checks input files for errors before passing all them for transcription. Works if input is
|
||||
wildcard or directory. (default: False)
|
||||
--PR163_off Disables PR163. For dev experiments. (default: False)</value>
|
||||
<value>[--device DEVICE]
|
||||
[--output_dir OUTPUT_DIR]
|
||||
[--output_format {lrc,txt,text,vtt,srt,tsv,json,all}]
|
||||
[--verbose VERBOSE]
|
||||
[--task {transcribe,translate}]
|
||||
[--temperature TEMPERATURE]
|
||||
[--best_of BEST_OF]
|
||||
[--beam_size BEAM_SIZE]
|
||||
[--patience PATIENCE]
|
||||
[--length_penalty LENGTH_PENALTY]
|
||||
[--repetition_penalty REPETITION_PENALTY]
|
||||
[--no_repeat_ngram_size NO_REPEAT_NGRAM_SIZE]
|
||||
[--suppress_blank SUPPRESS_BLANK]
|
||||
[--suppress_tokens SUPPRESS_TOKENS]
|
||||
[--initial_prompt INITIAL_PROMPT]
|
||||
[--prefix PREFIX]
|
||||
[--condition_on_previous_text CONDITION_ON_PREVIOUS_TEXT]
|
||||
[--prompt_reset_on_temperature PROMPT_RESET_ON_TEMPERATURE]
|
||||
[--without_timestamps WITHOUT_TIMESTAMPS]
|
||||
[--max_initial_timestamp MAX_INITIAL_TIMESTAMP]
|
||||
[--temperature_increment_on_fallback TEMPERATURE_INCREMENT_ON_FALLBACK]
|
||||
[--compression_ratio_threshold COMPRESSION_RATIO_THRESHOLD]
|
||||
[--logprob_threshold LOGPROB_THRESHOLD]
|
||||
[--no_speech_threshold NO_SPEECH_THRESHOLD]
|
||||
[--hallucination_silence_threshold HALLUCINATION_SILENCE_THRESHOLD]
|
||||
[--clip_timestamps CLIP_TIMESTAMPS]
|
||||
[--no_speech_strict_lvl {0,1,2}]
|
||||
[--word_timestamps WORD_TIMESTAMPS]
|
||||
[--highlight_words HIGHLIGHT_WORDS]
|
||||
[--prepend_punctuations PREPEND_PUNCTUATIONS]
|
||||
[--append_punctuations APPEND_PUNCTUATIONS]
|
||||
[--threads THREADS]
|
||||
[--version]
|
||||
[--vad_filter VAD_FILTER]
|
||||
[--vad_threshold VAD_THRESHOLD]
|
||||
[--vad_min_speech_duration_ms VAD_MIN_SPEECH_DURATION_MS]
|
||||
[--vad_max_speech_duration_s VAD_MAX_SPEECH_DURATION_S]
|
||||
[--vad_min_silence_duration_ms VAD_MIN_SILENCE_DURATION_MS]
|
||||
[--vad_speech_pad_ms VAD_SPEECH_PAD_MS]
|
||||
[--vad_window_size_samples VAD_WINDOW_SIZE_SAMPLES]
|
||||
[--max_new_tokens MAX_NEW_TOKENS]
|
||||
[--chunk_length CHUNK_LENGTH]
|
||||
[--compute_type {default,auto,int8,int8_float16,int8_float32,int8_bfloat16,int16,float16,float32,bfloat16}]
|
||||
[--batch_recursive]
|
||||
[--beep_off]
|
||||
[--skip]
|
||||
[--checkcuda]
|
||||
[--print_progress]
|
||||
[--postfix]
|
||||
[--check_files]
|
||||
[--PR163_off]
|
||||
[--hallucinations_list_off]
|
||||
[--one_word {0,1,2}]
|
||||
[--sentence]
|
||||
[--standard]
|
||||
[--standard_asia]
|
||||
[--max_comma MAX_COMMA]
|
||||
[--max_comma_cent {50,60,70,80,90,100}]
|
||||
[--max_gap MAX_GAP]
|
||||
[--max_line_width MAX_LINE_WIDTH]
|
||||
[--max_line_count MAX_LINE_COUNT]
|
||||
[--min_dist_to_end {0,4,5,6,7,8,9,10,11,12}]
|
||||
[--prompt_max {16,32,64,128,223}]
|
||||
[--reprompt {0,1,2}]
|
||||
[--prompt_reset_on_no_end {0,1,2}]
|
||||
[--ff_dump]
|
||||
[--ff_mp3]
|
||||
[--ff_sync]
|
||||
[--ff_rnndn_sh]
|
||||
[--ff_rnndn_xiph]
|
||||
[--ff_fftdn [0 - 97]]
|
||||
[--ff_tempo [0.5 - 2.0]]
|
||||
[--ff_gate]
|
||||
[--ff_speechnorm]
|
||||
[--ff_loudnorm]
|
||||
[--ff_silence_suppress noise duration]
|
||||
[--ff_lowhighpass]
|
||||
</value>
|
||||
</data>
|
||||
</root>
|
@ -130,6 +130,11 @@ namespace Nikse.SubtitleEdit.Forms.AudioToText
|
||||
labelEngine.Text = LanguageSettings.Current.AudioToText.Engine;
|
||||
labelEngine.Left = comboBoxWhisperEngine.Left - labelEngine.Width - 5;
|
||||
|
||||
if (Configuration.Settings.Tools.WhisperChoice == WhisperChoice.PurfviewFasterWhisper && !string.IsNullOrEmpty(Configuration.Settings.Tools.WhisperPurfviewFasterWhisperDefaultCmd))
|
||||
{
|
||||
Configuration.Settings.Tools.WhisperExtraSettings = Configuration.Settings.Tools.WhisperPurfviewFasterWhisperDefaultCmd;
|
||||
}
|
||||
|
||||
Init();
|
||||
InitializeWhisperEngines(comboBoxWhisperEngine);
|
||||
}
|
||||
|
@ -88,15 +88,16 @@ namespace Nikse.SubtitleEdit.Forms.AudioToText
|
||||
};
|
||||
|
||||
|
||||
private const string DownloadUrlPurfviewFasterWhisper = "https://github.com/Purfview/whisper-standalone-win/releases/download/faster-whisper/Whisper-Faster_r167.2_windows.zip";
|
||||
private const string DownloadUrlPurfviewFasterWhisper = "https://github.com/Purfview/whisper-standalone-win/releases/download/faster-whisper/Whisper-Faster_r186.1_windows.zip";
|
||||
|
||||
private static readonly string[] Sha512HashesPurfviewFasterWhisper =
|
||||
{
|
||||
"a16e2b5460d7f4b0d45de3f0e07b231d58ad4c79d077ad6b9c84a4e2ced4bd1cd3a7d9f01689f1d847ec8ff59c8f81cb742fcf2b153291ed6f15ec8b27adb998", // r167.2
|
||||
"e78616511a92b21cb8ac82e23cdbd06f5b9310751e5f3fa940b5c48743b69bad130aaf6d629ae07c5388326f117be8f181b125ed04aacd23f1a80d8891be889b", // r186.1
|
||||
};
|
||||
|
||||
private static readonly string[] OldSha512HashesPurfviewFasterWhisper =
|
||||
{
|
||||
"a16e2b5460d7f4b0d45de3f0e07b231d58ad4c79d077ad6b9c84a4e2ced4bd1cd3a7d9f01689f1d847ec8ff59c8f81cb742fcf2b153291ed6f15ec8b27adb998", // r167.2
|
||||
"1995feca9dd971eccfb41f8dc330d418a531e615cee56eac7cc053fd343fe5200f9e64e2b4feafdde49b018ac518d1ee1b244aedd32dcb84e3fb69c1035b8a4f", // r160.7
|
||||
"10ac03f098f991fe9474430a7f44c6fe0574dfb88d37ea4a31b764c540337918c529c4eceaf0524e88975b11b771c61dd67501d2a59fe05008a10195d2768edf", // r160.6
|
||||
"9d65922c41a8848e70f04af8deed7279f827264e1fa305c165849e391917713f0336eee07320b2c2cbb6191167953f4d6d1e23a378bfa5a4273c6065a0eba5b3", // r160.5
|
||||
@ -388,7 +389,8 @@ namespace Nikse.SubtitleEdit.Forms.AudioToText
|
||||
var hashVersion160_5 = "6983c90c96e47f53fb1451c1f0a32151ef144fe2e549affc7319d0c7666ea44dcbb0d7dc87ccdaaf0b3d8b2abe92060440e151495109f2681b99940f0eec5ad0";
|
||||
var hashVersion160_6 = "f616a4fecfb40e74b3e096207f08fbe84a0d08ad872380cf2791eba8458ed854399de2d547be98bc35c65ce0b6959a149b981e745aa75876ffa8eb2fc6a8719e";
|
||||
var hashVersion160_7 = "0f6b5b0a8d3d169ca7947866552dec30ac43406cda6b7e748c273ed78574087e330571925d8a36d48e5a3ea197d450be0289277677fdbad069038ac0788ea82e";
|
||||
return hash == hashVersion153 || hash == hashVersion160_3 || hash == hashVersion160_4 || hash == hashVersion160_5 || hash == hashVersion160_6 || hash == hashVersion160_7;
|
||||
var hashVersion167_2 = "628dee27ab3030798c42983d0f544668f54e7c8d1c7a433b322b9c07286eedd10666d9b1f89764a75301b334cea9c7ad8bfbfeee00a98113b4730ee5cafe8812";
|
||||
return hash == hashVersion153 || hash == hashVersion160_3 || hash == hashVersion160_4 || hash == hashVersion160_5 || hash == hashVersion160_6 || hash == hashVersion160_7 || hash == hashVersion167_2;
|
||||
}
|
||||
|
||||
if (whisperChoice == WhisperChoice.Cpp)
|
||||
|
@ -44,7 +44,6 @@ namespace Nikse.SubtitleEdit.Forms.Options
|
||||
private List<RulesProfile> _rulesProfiles;
|
||||
private List<PluginShortcut> _pluginShortcuts;
|
||||
private readonly bool _loading;
|
||||
private readonly BackgroundWorker _shortcutsBackgroundWorker;
|
||||
private string _defaultLanguages;
|
||||
|
||||
private static IEnumerable<string> GetSubtitleFormats() => SubtitleFormat.AllSubtitleFormats.Where(format => !format.IsVobSubIndexFile).Select(format => format.FriendlyName);
|
||||
@ -130,7 +129,6 @@ namespace Nikse.SubtitleEdit.Forms.Options
|
||||
"Network",
|
||||
"File type associations"});
|
||||
|
||||
_shortcutsBackgroundWorker = new BackgroundWorker();
|
||||
Init();
|
||||
_loading = false;
|
||||
|
||||
@ -138,13 +136,8 @@ namespace Nikse.SubtitleEdit.Forms.Options
|
||||
}
|
||||
|
||||
public void Init()
|
||||
{
|
||||
_shortcutsBackgroundWorker.DoWork += (sender, args) =>
|
||||
{
|
||||
MakeShortcutsTreeView(LanguageSettings.Current.Settings);
|
||||
};
|
||||
_shortcutsBackgroundWorker.RunWorkerAsync();
|
||||
|
||||
labelStatus.Text = string.Empty;
|
||||
_rulesProfiles = new List<RulesProfile>(Configuration.Settings.General.Profiles);
|
||||
var gs = Configuration.Settings.General;
|
||||
@ -2508,12 +2501,6 @@ namespace Nikse.SubtitleEdit.Forms.Options
|
||||
case ShortcutsSection:
|
||||
section = panelShortcuts;
|
||||
Cursor = Cursors.WaitCursor;
|
||||
while (_shortcutsBackgroundWorker.IsBusy)
|
||||
{
|
||||
System.Threading.Thread.Sleep(10);
|
||||
Application.DoEvents();
|
||||
}
|
||||
|
||||
ShowShortcutsTreeView();
|
||||
Cursor = Cursors.Default;
|
||||
break;
|
||||
|
Loading…
Reference in New Issue
Block a user