Updated OcrFixReplaceList.cs

Added other common Latin ligatures present in Unicode.

Also added the acute accent, which I've often seen used instead of the
apostrophe, either as an OCR error or because people mistake it for the
curly apostrophe.

Closes #1961
This commit is contained in:
aaaxx 2016-09-19 01:46:00 +02:00 committed by Waldi Ravens
parent 8aba3f8f17
commit 74c5c0a29e

View File

@@ -281,10 +281,18 @@ namespace Nikse.SubtitleEdit.Core.Dictionaries
{ {
if (Configuration.Settings.Tools.OcrFixUseHardcodedRules) if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
{ {
// common Latin ligatures from legacy encodings;
// Unicode includes them only for compatibility and discourages their use
word = word.Replace("ﬀ", "ff");
word = word.Replace("ﬁ", "fi"); word = word.Replace("ﬁ", "fi");
word = word.Replace("ﬂ", "fl");
word = word.Replace("ﬃ", "ffi");
word = word.Replace("ﬄ", "ffl");
word = word.Replace('ν', 'v'); // first 'v' is U+03BD GREEK SMALL LETTER NU word = word.Replace('ν', 'v'); // first 'v' is U+03BD GREEK SMALL LETTER NU
word = word.Replace('', '\''); word = word.Replace('', '\'');
word = word.Replace('`', '\''); word = word.Replace('`', '\'');
word = word.Replace('´', '\'');
word = word.Replace('', '\''); word = word.Replace('', '\'');
word = word.Replace('—', '-'); word = word.Replace('—', '-');
while(word.Contains("--")) while(word.Contains("--"))