dictionaries: automated XML upkeep

This commit is contained in:
Waldi Ravens 2019-05-26 03:23:51 +02:00
parent dcc28ab676
commit 0469c7f59f
27 changed files with 140 additions and 128 deletions

View File

@ -1,6 +1,9 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- This list contains names/words with specific casing - and specific to Danish only -->
<!-- This list contains names with specific casing - and specific to Danish only -->
<names>
<blacklist>
<name>Rabat</name>
</blacklist>
<name>Aquarium</name>
<name>Armfeldt</name>
<name>Asgård</name>
@ -85,7 +88,4 @@
<name>Wharton</name>
<name>Wylie</name>
<name>Yardley</name>
<blacklist>
<name>Rabat</name>
</blacklist>
</names>

View File

@ -1,4 +1,5 @@
<OCRFixReplaceList>
<?xml version="1.0" encoding="utf-8"?>
<OCRFixReplaceList>
<WholeWords>
<Word from="Haner" to="Han er" />
<Word from="JaveL" to="Javel" />
@ -633,10 +634,10 @@
<WordPart from="Ã" to="Å" />
<WordPart from="í" to="i" />
</PartialWords>
<PartialLines />
<WholeLines />
<PartialLinesAlways />
<PartialLines />
<BeginLines />
<EndLines />
<WholeLines />
<RegularExpressions />
</OCRFixReplaceList>

View File

@ -1,6 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- This list contains names/words with specific casing - and specific to German only -->
<!-- This list contains names with specific casing - and specific to German only -->
<names>
<blacklist />
<name>Abelard</name>
<name>Ada</name>
<name>Adal</name>

View File

@ -1,4 +1,5 @@
<OCRFixReplaceList>
<?xml version="1.0" encoding="utf-8"?>
<OCRFixReplaceList>
<WholeWords>
<Word from="/a" to="Ja" />
<Word from="/ch" to="Ich" />
@ -7047,12 +7048,12 @@
<WordPart from="fi" to="fi" />
<WordPart from="fl" to="fl" />
</PartialWords>
<WholeLines />
<PartialLinesAlways />
<PartialLines>
<!-- Partial lines - but whole words only -->
</PartialLines>
<PartialLinesAlways />
<BeginLines />
<EndLines />
<WholeLines />
<RegularExpressions />
</OCRFixReplaceList>

View File

@ -1,6 +1,11 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- This list contains names/words with specific casing - and specific to English only -->
<!-- This list contains names with specific casing - and specific to English only -->
<names>
<blacklist>
<name>Bill</name>
<name>Black</name>
<name>Male</name>
</blacklist>
<name>Aarav</name>
<name>Abbey</name>
<name>Abbie</name>
@ -1570,9 +1575,4 @@
<name>Zion</name>
<name>Zoie</name>
<name>Zuri</name>
<blacklist>
<name>Male</name>
<name>Bill</name>
<name>Black</name>
</blacklist>
</names>

View File

@ -2694,6 +2694,26 @@
<WordPart from=")'" to="y" />
<WordPart from="a" to="d" />
</PartialWords>
<WholeLines>
<!-- Whole lines - including -" etc -->
<Line from="H ey." to="Hey." />
<Line from="He)-" to="Hey." />
<Line from="N0." to="No." />
<Line from="-N0." to="-No." />
<Line from="Noll" to="No!!" />
<Line from="(G ROANS)" to="(GROANS)" />
<Line from="[G ROANS]" to="[GROANS]" />
<Line from="(M EOWS)" to="(MEOWS)" />
<Line from="[M EOWS]" to="[MEOWS]" />
<Line from="Uaughs]" to="[laughs]" />
<Line from="[chitte rs]" to="[chitters]" />
<Line from="Hil it!" to="Hit it!" />
<Line from="&lt;i&gt;Hil it!&lt;/i&gt;" to="&lt;i&gt;Hit it!&lt;/i&gt;" />
<Line from="ISIGHS]" to="[SIGHS]" />
</WholeLines>
<PartialLinesAlways>
<LinePart from="forbest act" to="for best act" />
</PartialLinesAlways>
<PartialLines>
<LinePart from=" /be " to=" I be " />
<LinePart from=" aren '1'" to=" aren't" />
@ -3014,9 +3034,6 @@
<LinePart from="you' re" to="you're" />
<LinePart from="You' ve " to="You've " />
</PartialLines>
<PartialLinesAlways>
<LinePart from="forbest act" to="for best act" />
</PartialLinesAlways>
<BeginLines>
<Beginning from="lgot it" to="I got it" />
<Beginning from="Don,t " to="Don't " />
@ -3164,23 +3181,6 @@
<Ending from=" i..." to=" I..." />
<Ending from=" L." to=" I." />
</EndLines>
<WholeLines>
<!-- Whole lines - including -" etc -->
<Line from="H ey." to="Hey." />
<Line from="He)-" to="Hey." />
<Line from="N0." to="No." />
<Line from="-N0." to="-No." />
<Line from="Noll" to="No!!" />
<Line from="(G ROANS)" to="(GROANS)" />
<Line from="[G ROANS]" to="[GROANS]" />
<Line from="(M EOWS)" to="(MEOWS)" />
<Line from="[M EOWS]" to="[MEOWS]" />
<Line from="Uaughs]" to="[laughs]" />
<Line from="[chitte rs]" to="[chitters]" />
<Line from="Hil it!" to="Hit it!" />
<Line from="&lt;i&gt;Hil it!&lt;/i&gt;" to="&lt;i&gt;Hit it!&lt;/i&gt;" />
<Line from="ISIGHS]" to="[SIGHS]" />
</WholeLines>
<RegularExpressions>
<RegEx find="([a-z]) Won't " replaceWith="$1 won't " />
<RegEx find=" L([,\r\n :;!?]+)" replaceWith=" I$1" />

View File

@ -1,6 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- This list contains names/words with specific casing - and specific to Spanish only -->
<!-- This list contains names with specific casing - and specific to Spanish only -->
<names>
<blacklist />
<name>Aang</name>
<name>Aarón</name>
<name>Abdulabri</name>
@ -1009,7 +1010,4 @@
<name>Zoroastro</name>
<name>Zuko</name>
<name>Zulú</name>
<blacklist>
<name></name>
</blacklist>
</names>

View File

@ -1,6 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- This list contains names/words with specific casing - and specific to Finnish only -->
<!-- This list contains names with specific casing - and specific to Finnish only -->
<names>
<blacklist />
<name>AA</name>
<name>Abbie</name>
<name>Abbylla</name>
@ -3072,7 +3073,4 @@
<name>Zoye</name>
<name>Zürichissa</name>
<name>Åkessonin</name>
<blacklist>
<name></name>
</blacklist>
</names>

View File

@ -1,4 +1,5 @@
<OCRFixReplaceList>
<?xml version="1.0" encoding="utf-8"?>
<OCRFixReplaceList>
<WholeWords>
<Word from="kellojo" to="kello jo" />
<Word from="onjo" to="on jo" />
@ -987,10 +988,6 @@
</WholeWords>
<PartialWordsAlways />
<PartialWords />
<PartialLines />
<PartialLinesAlways />
<BeginLines />
<EndLines />
<WholeLines>
<Line from="Katsokaa pa." to="Katsokaapa." />
<Line from="Mik!&#xD;&#xA;&quot;&quot;e“9iräı" to="Mik!&#xD;&#xA;-Hengitä!" />
@ -1028,5 +1025,9 @@
<Line from="Haluan kertoa jotai n" to="Haluan kertoa jotain" />
<Line from="I-Ialuatte" to="Haluatte" />
</WholeLines>
<PartialLinesAlways />
<PartialLines />
<BeginLines />
<EndLines />
<RegularExpressions />
</OCRFixReplaceList>

View File

@ -1,6 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- This list contains names/words with specific casing - and specific to French only -->
<!-- This list contains names with specific casing - and specific to French only -->
<names>
<blacklist />
<name>Abdon</name>
<name>Abdonie</name>
<name>Abdonise</name>
@ -808,7 +809,4 @@
<name>Zéphir</name>
<name>Zéphirin</name>
<name>Zoé</name>
<blacklist>
<name></name>
</blacklist>
</names>

View File

@ -1,4 +1,5 @@
<OCRFixReplaceList>
<?xml version="1.0" encoding="utf-8"?>
<OCRFixReplaceList>
<WholeWords>
<Word from="@immatriculation" to="d'immatriculation" />
<Word from="acquer" to="acquér" />
@ -246,13 +247,6 @@
</WholeWords>
<PartialWordsAlways />
<PartialWords />
<PartialLines>
<LinePart from=" I'" to=" l'" />
<LinePart from=" |'" to=" l'" />
</PartialLines>
<PartialLinesAlways />
<BeginLines />
<EndLines />
<WholeLines>
<Line from="&quot;D'ac:c:ord.&quot;" to="&quot;D'accord.&quot;" />
<Line from="“i QUÎ gagne, qui perd," to="ni qui gagne, qui perd," />
@ -266,5 +260,12 @@
<Line from="Peter H u nt." to="Peter Hunt." />
<Line from="&quot;C'est bien mieux dans Peau. &#xD;&#xA; &#xD;&#xA; On peut sfléclabousser, faire du bruit.&quot;" to="&quot;C'est bien mieux dans l'eau. &#xD;&#xA; &#xD;&#xA; On peut s'éclabousser, faire du bruit.&quot;" />
</WholeLines>
<PartialLinesAlways />
<PartialLines>
<LinePart from=" I'" to=" l'" />
<LinePart from=" |'" to=" l'" />
</PartialLines>
<BeginLines />
<EndLines />
<RegularExpressions />
</OCRFixReplaceList>

View File

@ -1,4 +1,5 @@
<OCRFixReplaceList>
<?xml version="1.0" encoding="utf-8"?>
<OCRFixReplaceList>
<WholeWords>
<Word from="andele" to="anđele" />
<Word from="andeli" to="anđeli" />
@ -1218,6 +1219,8 @@
</WholeWords>
<PartialWordsAlways />
<PartialWords />
<WholeLines />
<PartialLinesAlways />
<PartialLines>
<LinePart from="da nadjem" to="naći" />
<LinePart from="da nadjes" to="naći" />
@ -1255,10 +1258,8 @@
<LinePart from="znas sto" to="znaš što" />
<LinePart from="znaš sto" to="znaš što" />
</PartialLines>
<PartialLinesAlways />
<BeginLines />
<EndLines />
<WholeLines />
<RegularExpressions>
<RegEx find="adas(?!v)" replaceWith="adaš" />
<RegEx find="(?&lt;![Pp]r|[Nn])adje(?!(v|n(e|u[olt]))\b)" replaceWith="ađe" />

View File

@ -1,4 +1,5 @@
<OCRFixReplaceList>
<?xml version="1.0" encoding="utf-8"?>
<OCRFixReplaceList>
<WholeWords>
<Word from="()d" to="Od" />
<Word from="advokati" to="odvjetnici" />
@ -2228,6 +2229,8 @@
</WholeWords>
<PartialWordsAlways />
<PartialWords />
<WholeLines />
<PartialLinesAlways />
<PartialLines>
<LinePart from="Ako ej" to="Ako je" />
<LinePart from="ako ej" to="ako je" />
@ -2397,10 +2400,8 @@
<LinePart from="Želi da zna" to="Želi znati" />
<LinePart from="želi da zna" to="želi znati" />
</PartialLines>
<PartialLinesAlways />
<BeginLines />
<EndLines />
<WholeLines />
<RegularExpressions>
<!-- deklinacije imenica i konjugacije glagola -->
<RegEx find="([0-9])-ogodišnj" replaceWith="$1-godišnj" />

View File

@ -1,12 +1,13 @@
<OCRFixReplaceList>
<?xml version="1.0" encoding="utf-8"?>
<OCRFixReplaceList>
<WholeWords />
<PartialWordsAlways />
<PartialWords />
<PartialLines />
<WholeLines />
<PartialLinesAlways />
<PartialLines />
<BeginLines />
<EndLines />
<WholeLines />
<RegularExpressions>
<!-- nagy I-l javítások -->
<RegEx find="([\x41-\x5a\x61-\x7a\xc1-\xfc])II" replaceWith="$1ll" />

View File

@ -5,6 +5,7 @@ This file is case sensitive.
This file is generated/updated by Multi Translator
-->
<names>
<blacklist />
<name>1A</name>
<name>2 Chainz</name>
<name>2 Pac</name>
@ -2653,8 +2654,8 @@ This file is generated/updated by Multi Translator
<name>Iceland</name>
<name>Icelander</name>
<name>Icelandic</name>
<name>I'd</name>
<name>ID</name>
<name>I'd</name>
<name>Idaho</name>
<name>Idris</name>
<name>Idris Elba</name>

View File

@ -9,8 +9,8 @@
<word>fotograafje</word>
<word>gemaar</word>
<word>gps</word>
<word>hielenlikkerij</word>
<word>hielenlikkerij</word>
<word>hielenlikkerij</word>
<word>hostessen</word>
<word>inbak</word>
<word>insignificante</word>

View File

@ -1,4 +1,5 @@
<OCRFixReplaceList>
<?xml version="1.0" encoding="utf-8"?>
<OCRFixReplaceList>
<WholeWords>
<Word from="aandachtmag" to="aandacht mag" />
<Word from="agrariers" to="agrariërs" />
@ -110,12 +111,16 @@
<Word from="zonderjou" to="zonder jou" />
</WholeWords>
<PartialWordsAlways />
<PartialWords />
<PartialLines />
<PartialWords>
<!-- Will be used to check words not in dictionary -->
<!-- If new word(s) exists in spelling dictionary, it is (they are) accepted -->
<WordPart from="ij" to="ij" />
</PartialWords>
<WholeLines />
<PartialLinesAlways />
<PartialLines />
<BeginLines />
<EndLines />
<WholeLines />
<RegularExpressions>
<RegEx find="\blk(?=\p{Ll}{2})" replaceWith="Ik" />
<RegEx find="\bln(?=\p{Ll}{2})" replaceWith="In" />

View File

@ -1,6 +1,22 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- This list contains names/words with specific casing - and specific to Norwegian only -->
<!-- This list contains names with specific casing - and specific to Norwegian only -->
<names>
<blacklist>
<name>Ane</name>
<name>Ben</name>
<name>Bo</name>
<name>Dag</name>
<name>Fet</name>
<name>Fred</name>
<name>Gro</name>
<name>Hem</name>
<name>Jo</name>
<name>Per</name>
<name>Rune</name>
<name>Saga</name>
<name>Tom</name>
<name>Ål</name>
</blacklist>
<name>Aage</name>
<name>Aagot</name>
<name>Aase</name>
@ -2856,20 +2872,4 @@
<name>Åsne</name>
<name>Åsnes</name>
<name>Åsta</name>
<blacklist>
<name>Ane</name>
<name>Ben</name>
<name>Bo</name>
<name>Dag</name>
<name>Fet</name>
<name>Fred</name>
<name>Gro</name>
<name>Hem</name>
<name>Jo</name>
<name>Per</name>
<name>Rune</name>
<name>Saga</name>
<name>Tom</name>
<name>Ål</name>
</blacklist>
</names>

View File

@ -48,10 +48,10 @@
<WordPart from="Ã" to="Å" />
<WordPart from="í" to="i" />
</PartialWords>
<PartialLines />
<WholeLines />
<PartialLinesAlways />
<PartialLines />
<BeginLines />
<EndLines />
<WholeLines />
<RegularExpressions />
</OCRFixReplaceList>

View File

@ -1,4 +1,5 @@
<OCRFixReplaceList>
<?xml version="1.0" encoding="utf-8"?>
<OCRFixReplaceList>
<WholeWords />
<PartialWordsAlways />
<PartialWords>
@ -34,10 +35,10 @@
<WordPart from="Ã" to="Å" />
<WordPart from="í" to="i" />
</PartialWords>
<PartialLines />
<WholeLines />
<PartialLinesAlways />
<PartialLines />
<BeginLines />
<EndLines />
<WholeLines />
<RegularExpressions />
</OCRFixReplaceList>

View File

@ -1,4 +1,5 @@
<OCRFixReplaceList>
<?xml version="1.0" encoding="utf-8"?>
<OCRFixReplaceList>
<WholeWords>
<Word from="abitual" to="habitual" />
<Word from="àcerca" to="acerca" />
@ -445,6 +446,8 @@
</WholeWords>
<PartialWordsAlways />
<PartialWords />
<WholeLines />
<PartialLinesAlways />
<PartialLines>
<LinePart from="IN 6-E" to="N 6 E" />
<LinePart from="in tegrar-se" to="integrar-se" />
@ -474,10 +477,8 @@
<LinePart from="R egião" to="Região" />
<LinePart from="unsuficien temente" to="insuficientemente" />
</PartialLines>
<PartialLinesAlways />
<BeginLines />
<EndLines />
<WholeLines />
<RegularExpressions>
<!-- <RegEx find="\bi\b" replaceWith="I" /> just an example - do not use this regex -->
<RegEx find="([0-9]) +º" replaceWith="$1º" />

View File

@ -1,6 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- This list contains names/words with specific casing - and specific to Portuguese only -->
<!-- This list contains names with specific casing - and specific to Portuguese only -->
<names>
<blacklist />
<name>Aarão</name>
<name>Abdénago</name>
<name>Abedenego</name>

View File

@ -1,6 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- This list contains names/words with specific casing - and specific to Russian only -->
<!-- This list contains names with specific casing - and specific to Russian only -->
<names>
<blacklist />
<name>Абакум</name>
<name>Абакумович</name>
<name>Абакумовна</name>
@ -2891,7 +2892,4 @@
<name>Ярославна</name>
<name>Ярославович</name>
<name>Ярославовна</name>
<blacklist>
<name></name>
</blacklist>
</names>

View File

@ -1,4 +1,5 @@
<OCRFixReplaceList>
<?xml version="1.0" encoding="utf-8"?>
<OCRFixReplaceList>
<WholeWords>
<Word from="НЄЙ" to="НЕЙ" />
<Word from="ОРГЗНИЗМОБ" to="ОРГАНИЗМА" />
@ -248,10 +249,10 @@
<WordPart from="ШЗ" to="ША" />
<WordPart from="І\/І" to="М" />
</PartialWords>
<PartialLines />
<WholeLines />
<PartialLinesAlways />
<PartialLines />
<BeginLines />
<EndLines />
<WholeLines />
<RegularExpressions />
</OCRFixReplaceList>

View File

@ -1,4 +1,5 @@
<OCRFixReplaceList>
<?xml version="1.0" encoding="utf-8"?>
<OCRFixReplaceList>
<WholeWords>
<!-- Abreviaturas simples -->
<Word from="KBs" to="kB" />
@ -368,6 +369,11 @@
</WholeWords>
<PartialWordsAlways />
<PartialWords />
<WholeLines>
<!-- Todas las líneas -->
<Line from="No" to="No." />
</WholeLines>
<PartialLinesAlways />
<PartialLines>
<!-- Varios -->
<LinePart from="de gratis" to="gratis" />
@ -710,15 +716,10 @@
<LinePart from="misterl" to="misteri" />
<LinePart from="vivencl" to="vivenci" />
</PartialLines>
<PartialLinesAlways />
<BeginLines />
<EndLines>
<Ending from=".»." to="»." />
</EndLines>
<WholeLines>
<!-- Todas las líneas -->
<Line from="No" to="No." />
</WholeLines>
<RegularExpressions>
<!-- Abreviaturas compuestas -->
<RegEx find="\b[Ss](r|ra|rta)\b\.?" replaceWith="S$1." />

View File

@ -1,4 +1,5 @@
<!-- Credit goes to: MilanRS [http://www.prijevodi-online.org] -->
<?xml version="1.0" encoding="utf-8"?>
<!-- Credit goes to: MilanRS [http://www.prijevodi-online.org] -->
<OCRFixReplaceList>
<WholeWords>
<Word from="ču" to="ću" />
@ -59,6 +60,8 @@
<WordPart from="IVl" to="M" />
<WordPart from="lVl" to="M" />
</PartialWords>
<WholeLines />
<PartialLinesAlways />
<PartialLines>
<LinePart from="bi smo" to="bismo" />
<LinePart from="dali je" to="da li je" />
@ -97,10 +100,8 @@
<LinePart from="Svo vrijeme" to="Sve vrijeme" />
<LinePart from="Cijelo vrijeme" to="Sve vrijeme" />
</PartialLines>
<PartialLinesAlways />
<BeginLines />
<EndLines />
<WholeLines />
<RegularExpressions>
<RegEx find="ÄŤ" replaceWith="č" />
<RegEx find="Ä" replaceWith="č" />

View File

@ -440,13 +440,13 @@
<WordPart from="ejag" to="e jag" />
<WordPart from="ärp" to="är p" />
</PartialWords>
<PartialLines />
<WholeLines />
<PartialLinesAlways />
<PartialLines />
<BeginLines>
<Beginning from="Ln " to="In " />
<Beginning from="U ppfattat" to="Uppfattat" />
</BeginLines>
<EndLines />
<WholeLines />
<RegularExpressions />
</OCRFixReplaceList>