Here is my code:
def digraph(chars):
als = "шжяеёющчШЖЯЕЁЮЩЧ"
new = {'sh':als[0],'zh':als[1],'ja':als[2],'je':als[3],'jo':als[4],
'ju':als[5],'sx':als[6],'ch':als[7],'Sh':als[8],'Zh':als[9],
'Ja':als[10],'Je':als[11],'Jo':als[12],'Ju':als[13],'Sx':als[14],
'Ch':als[15],'SH':als[8],'ZH':als[9],'JA':als[10],'JE':als[11],
'JO':als[12],'JU':als[13],'SX':als[14],'CH':als[15]}
try:
return new[chars]
except:
return "[Error]"
def trans_cyr(inp):
cyrillic = "абцдэфгхийклмнопрстувъыьзАБЦДЭФГХИЙКЛМНОПРСТУВЫЗ "
latin = "abcdefghijklmnoprstuv$y'zABCDEFGHIJKLMNOPRSTUVYZ "
digs = ['sh','zh','ja','je','jo','ju','sx','ch','Sh','Zh',
'Ja','Je','Jo','Ju','Sx','Ch','SH','ZH','JA','JE','JO','JU','SX',
'CH']
prevc = ""
for e, char in enumerate(inp):
if(prevc != ""):
comb = prevc + char
newdig = digraph(comb)
if(comb in digs):
print(newdig, end="")
prevc = ""
else:
pos = latin.index(char)
posp = latin.index(inp[e - 1])
if(inp[e-1] in "szjcSZJC"):
print(cyrillic[posp] + cyrillic[pos], end="")
prevc = ""
else:
prevc=""
continue
elif(char not in "szjcSZJC"):
try:
pos = latin.index(char)
print(cyrillic[pos], end="")
except:
print(char, end="")
else:
prevc = char
while True:
cyrinp = input("\n> ")
trans_cyr(cyrinp)
The code is supposed to transliterate the Latin alphabet to Cyrillic, first by getting each character from the input (if it is not 'szjc' or their uppercase equivalents), getting the position of it using the index() function and then acquiring a Cyrillic equivalent in the same position as the latin one. However, Cyrillic has letters such as Я, Е, Ё, Ю, Ж, Ш, Щ, Ч, which are digraphs (ya, ye, yo, yu, zh, sh, shch (sx), ch) and therefore cannot be transliterated by just one character. Thus what I do, is check whether the current letter is equal to any of 'szjcSZJC', and if it is then I do not print it but instead assign it the name prevc if the next character combined with prevc is in the array 'digs'. Everything works perfectly, if I type in 'jojajo' it will print "ёяё" just as it should, but - if there is an unfinished digraph (c without h, s without h, x, z without h, j without a, e, u, and o) then the next digraph does not become transliterated. Example: sjo : If I enter sjo, my expected output would be сё, but instead I am getting сйо. Is there any way I can fix this?
Edit:
I wrote this code:
while True:
cyrillic = "абцдэфгхийклмнопрстувъыьзАБЦДЭФГХИЙКЛМНОПРСТУВЫЗ "
latin = "abcdefghijklmnoprstuv$y'zABCDEFGHIJKLMNOPRSTUVYZ "
als = "шжяеёющчШЖЯЕЁЮЩЧ"
new = {'sh':als[0],'zh':als[1],'ja':als[2],'je':als[3],'jo':als[4],
'ju':als[5],'sx':als[6],'ch':als[7],'Sh':als[8],'Zh':als[9],
'Ja':als[10],'Je':als[11],'Jo':als[12],'Ju':als[13],'Sx':als[14],
'Ch':als[15],'SH':als[8],'ZH':als[9],'JA':als[10],'JE':als[11],
'JO':als[12],'JU':als[13],'SX':als[14],'CH':als[15]}
inp = input("\n> ") + " "
digraph = ""
prevc = ""
for e, char in enumerate(inp):
part_j = "jJ"
part_v = "aeouAEOU"
part_z = "zZ"
part_h = "hH"
part_s = "sS"
part_x = "hxHX"
part_c = "cC"
if((char in part_j and inp[e+1] in part_v) or (char in part_z and inp[e+1] in part_h) or (char in part_s and inp[e+1] in part_x) or (char in part_c and inp[e+1] in part_h)):
digraph = "yes"
else:
digraph = "no"
if((char in part_v and inp[e-1] in part_j) or (char in part_h and inp[e-1] in part_z) or (char in part_x and inp[e-1] in part_s) or (char in part_h and inp[e-1] in part_c)):
comb = inp[e-1] + char
dig = new[comb]
print(dig, end="")
elif(digraph == "yes"):
prevc = char
else:
try:
print(cyrillic[latin.index(char)],end="")
except:
print(char, end="")
which appears to have the same sort of logic as the answer I selected as correct, and it works :)
Here is a solution that uses the same code logic as your approach, but is more clearly written.
CYRILLIC = u"абцдэфгхийклмнопрстувъыьзАБЦДЭФГХИЙКЛМНОПРСТУВЫЗ "
LATIN = u"abcdefghijklmnoprstuv$y'zABCDEFGHIJKLMNOPRSTUVYZ "
DIGRAPHS = u"шжяеёющчШЖЯЕЁЮЩЧШЖЯЕЁЮЩЧ"
LATIN_DIGRAPHS = [u'sh', u'zh', u'ja', u'je', u'jo', u'ju', u'sx', u'ch',
u'Sh', u'Zh', u'Ja', u'Je', u'Jo', u'Ju', u'Sx', u'Ch',
u'SH', u'ZH', u'JA', u'JE', u'JO', u'JU', u'SX', u'CH']
MAPPING = dict(zip(list(LATIN) + LATIN_DIGRAPHS, CYRILLIC + DIGRAPHS))
DIGRAPH_FIRST_LETTER = u'szjcSZJC'
def latin_to_cyrillic(word):
translation = []
possible_digraph = False
for letter in word:
if possible_digraph:
combination = previous_letter + letter
if combination in LATIN_DIGRAPHS:
translation.append(MAPPING[combination])
possible_digraph = False
else:
translation.append(MAPPING[previous_letter])
if letter in DIGRAPH_FIRST_LETTER:
previous_letter = letter
else:
translation.append(letter)
possible_digraph = False
else:
if letter in DIGRAPH_FIRST_LETTER:
possible_digraph = True
previous_letter = letter
else:
translation.append(MAPPING[letter])
if possible_digraph:
translation.append(MAPPING[previous_letter])
return ''.join(translation)
print latin_to_cyrillic('sjo')
print latin_to_cyrillic('jojajo')
The logic is as follows.
Instead of finding the index of a letter in the latin alphabet and using that index for the cyrillic alphabet, you can simply use a dictionary that maps each letter from one language to the other. Simply create a list of all the latin symbols (single and digraphs) and another one for the cyrillic. You only have to ensure that the respective order of symbols for both is the same. dict(zip(alphabet1, alphabet2))
will then create a mapping for each letter of alphabet1
to that of the same index in alphabet2
.