# Jan Labanowski, jkl@ccl.net, Jan. 10, 1992
# File koi8_tex.dat
# This is a transliteration data file for converting from KOI-8 as used
# by RELCOM (GOST 19768-74) to LaTeX
# The TeX transliteration sequences follow AMS cyrillic convention for
# WNCYR fonts with cyracc.def file
# To be used with translit.c program by Jan Labanowski. For a format of
# this file consult translit documentation
# ----------------------------------------------------------------------
# Header: version number and the delimiter characters used in this file
1 file version number
" "     # string delimiters
[ ]     # list delimiters
{ }     # regular expression delimiters
# starting sequence for LaTeX (kindly contributed by Oleg Zabluda,
# Wed May 26 22:20:38 1993)
"\documentstyle{article} \input cyracc.def \font\tencyr=wncyr10 \def\cyr{\tencyr\cyracc} \settowidth{\textwidth}{\cyr\hspace{90em}} \pagestyle{empty} \begin{document} "
# ending sequence
" \end{document} "
0       # number of input SHIFT sequences, only one set of input characters
2       # number of output SHIFT sequences, two sets of output characters
# SHIFT-OUT   SHIFT-IN
""            ""      # shift sequences for set 1 (Latin): nothing is emitted
"{\cyr "      "}"     # shift sequences for set 2: cyrillic enclosed in {\cyr ... }
# ----------------------------------------------------------------------
# conversion table
# inp_set  inp_seq  out_set  out_seq
# characters which are not in ASCII (and DEL) and not in KOI8 to *
0   [\0x7F-\0xA2\0xA4-\0xB2\0xB4-\0xBF]   0   "$\star$"
# dehyphenate words, e.g. con- (NL)cert is changed to concert(NL)
# Below is a complicated (?) regular expression. It joins a hyphenated
# word. It looks for one or more letters (saves them as substring 1)
# followed by a hyphen (which may be followed by zero or more spaces
# or tabs). The hyphen must be followed by a NewLine (characters 0A-0D hex
# are various new line sequences), which is saved as substring 2. Then it
# looks for zero or more tabs and spaces (at the beginning of the line).
# Then it looks for the rest of the hyphenated word and saves it as
# substring 3. The word may have punctuation attached. Then it looks again
# for some spaces or tabs.
# The substitute string junks all sequences which were not within (),
# i.e., hyphen and spaces/tabs, and inserts only the substrings but in a
# different order. The 1 (word beginning) is followed by 3 (word end) and
# followed by the NewLine. The {\2\1\3} would be equally good. The string is
# then returned back for processing (output code is -1). Note that since the
# input regular expression is very long, I chopped it into several lines by
# using \NL. If \ is followed by a white space, the \ and all white space
# which follows it are removed by the program. Be careful not to use
# "\white_space" in strings, lists or regular expressions. If you must,
# enter \ as a code (i.e., \0x5C).
# uncomment lines below if you want to dehyphenate
# 0 {([A-Za-z\0xA3\0xB3\0xC0-\0xFF]+)-[ \0x09]*([\0x0A-\0x0D]+)[ \0x09]*(\
#    [A-Za-z\0xA3\0xB3\0xC0-\0xFF,.?;:")'`!]+)[ \0x09]}
#    -1 {\1\3\2}
# All latin letters are converted to the same letters but with the output
# set 1
0   [A-Za-z]   1   [A-Za-z]     # Latin letters A-Z and a-z
# Add \\ before all NewLine sequences
0   {([\0x0B-\0x0D]*)\0x0A([\0x0B-\0x0D]*)}   0   {\\\\\1\0x0A\2}
# Convert all double spaces to protected LaTeX spaces. Note that the
# backslash is followed by a space here, and had to be entered as its code.
# The input string below contains exactly two spaces.
0   "  "   0   "{\0x5C \0x5C }"
# Quote some special TeX characters
# these do not require going out of {\cyr ....}
0   "["   0   "$[$"
0   "]"   0   "$]$"
0   "^"   0   "$\wedge$"
0   "{"   0   "$\lbrace$"
0   "}"   0   "$\rbrace$"
0   "~"   0   "$\sim$"
0   "\"   0   "$\backslash$"
0   "|"   0   "$\mid$"
0   "*"   0   "$\star$"
0   "<"   0   "$<$"
0   ">"   0   "$>$"
0   "$"   0   "\$"
0   "%"   0   "\%"
# these can be represented correctly only in Latin charset
0   "_"   1   "\_"
0   "&"   1   "\&"
0   "#"   1   "\#"
0   "@"   1   "@"
# ----------------------------------------------------------------------
# Cyrillic letters
# Two-letter inputs starting with T/t come first; {\cydot} is inserted
# after the T (per the row notes, to prevent the pair being read as C/c).
0   "\0xF4\0xFD"   2   "T{\cydot}Shch"   # to prevent C
0   "\0xF4\0xDD"   2   "T{\cydot}shch"   # to prevent C
0   "\0xD4\0xFD"   2   "t{\cydot}Shch"   # to prevent C
0   "\0xD4\0xDD"   2   "t{\cydot}shch"   # to prevent C
0   "\0xF4\0xFB"   2   "T{\cydot}Sh"     # to prevent C
0   "\0xF4\0xDB"   2   "T{\cydot}sh"     # to prevent C
0   "\0xD4\0xFB"   2   "t{\cydot}Sh"     # to prevent C
0   "\0xD4\0xDB"   2   "t{\cydot}sh"     # to prevent C
0   "\0xF4\0xF3"   2   "T{\cydot}S"      # to prevent C
0   "\0xF4\0xD3"   2   "T{\cydot}s"      # to prevent C
0   "\0xD4\0xF3"   2   "t{\cydot}S"      # to prevent c
0   "\0xD4\0xD3"   2   "t{\cydot}s"      # to prevent c
# In the two rows below \0o42 is the octal code of the double quote, so the
# output is backslash, double quote, letter.
0   "\0xA3"   2   "\\0o42e"       # small \"e (yo)
0   "\0xB3"   2   "\\0o42E"       # capital \"E (Yo)
# Capital letters
0   "\0xE1"   2   "A"
0   "\0xE2"   2   "B"
0   "\0xF7"   2   "V"
0   "\0xE7"   2   "G"
0   "\0xE4"   2   "D"
0   "\0xE5"   2   "E"
0   "\0xF6"   2   "Zh"
0   "\0xFA"   2   "Z"
0   "\0xE9"   2   "I"
0   "\0xEA"   2   "{\u I}"        # I kratkoje
0   "\0xEB"   2   "K"
0   "\0xEC"   2   "L"
0   "\0xED"   2   "M"
0   "\0xEE"   2   "N"
0   "\0xEF"   2   "O"
0   "\0xF0"   2   "P"
0   "\0xF2"   2   "R"
0   "\0xF3"   2   "S"
0   "\0xF4"   2   "T"
0   "\0xF5"   2   "U"
0   "\0xE6"   2   "F"
0   "\0xE8"   2   "Kh"
0   "\0xE3"   2   "C"
0   "\0xFE"   2   "Ch"
0   "\0xFB"   2   "Sh"
0   "\0xFD"   2   "Shch"
0   "\0xFF"   2   "{\Cdprime}"    # Tverdyj znak
0   "\0xF9"   2   "Y"
0   "\0xF8"   2   "{\Cprime}"     # Myagkij znak
0   "\0xFC"   2   "\`E"
0   "\0xE0"   2   "Yu"
0   "\0xF1"   2   "Ya"
# Small letters
0   "\0xC1"   2   "a"
0   "\0xC2"   2   "b"
0   "\0xD7"   2   "v"
0   "\0xC7"   2   "g"
0   "\0xC4"   2   "d"
0   "\0xC5"   2   "e"
0   "\0xD6"   2   "zh"
0   "\0xDA"   2   "z"
0   "\0xC9"   2   "i"
0   "\0xCA"   2   "{\u i}"
0   "\0xCB"   2   "k"
0   "\0xCC"   2   "l"
0   "\0xCD"   2   "m"
0   "\0xCE"   2   "n"
0   "\0xCF"   2   "o"
0   "\0xD0"   2   "p"
0   "\0xD2"   2   "r"
0   "\0xD3"   2   "s"
0   "\0xD4"   2   "t"
0   "\0xD5"   2   "u"
0   "\0xC6"   2   "f"
0   "\0xC8"   2   "kh"
0   "\0xC3"   2   "c"
0   "\0xDE"   2   "ch"
0   "\0xDB"   2   "sh"
0   "\0xDD"   2   "shch"
0   "\0xDF"   2   "{\cdprime}"
0   "\0xD9"   2   "y"
0   "\0xD8"   2   "{\cprime}"
0   "\0xDC"   2   "\`e"
0   "\0xC0"   2   "yu"
0   "\0xD1"   2   "ya"