Search code examples
python unicode char ord

How to combine two similar functions that convert between hiragana and katakana?


I have two functions that convert between katakana and hiragana and they look the same:

# Code-point distance between corresponding katakana and hiragana letters:
# U+30A1 (KATAKANA LETTER SMALL A) - U+3041 (HIRAGANA LETTER SMALL A) = 0x60.
katakana_minus_hiragana = 0x30a1 - 0x3041

def is_hirgana(char):
    """Return True when the first character of ``char`` is a hiragana letter.

    Only ``char[0]`` is inspected; its code point must lie strictly between
    0x3040 and 0x3097 (that is, U+3041 through U+3096).
    """
    code_point = ord(char[0])
    return 0x3040 < code_point < 0x3097

def is_katakana(char):
    """Return True when the first character of ``char`` is a katakana letter.

    Only ``char[0]`` is inspected; its code point must lie strictly between
    0x30a0 and 0x30f7 (that is, U+30A1 through U+30F6).
    """
    code_point = ord(char[0])
    return 0x30a0 < code_point < 0x30f7

def hiragana_to_katakana(hiragana_text):
    """Convert the leading run of hiragana in ``hiragana_text`` to katakana.

    Scanning stops at the first character that ``is_hirgana`` rejects; the
    rest of the input is not part of the result.

    Returns:
        A ``(converted_prefix, prefix_length)`` tuple.
    """
    converted = []
    for char in hiragana_text:
        if not is_hirgana(char):
            break
        converted.append(chr(ord(char) + katakana_minus_hiragana))
    return "".join(converted), len(converted)

def katakana_to_hiragana(katakana_text):
    """Convert the leading run of katakana in ``katakana_text`` to hiragana.

    Scanning stops at the first character that ``is_katakana`` rejects; the
    rest of the input is not part of the result.

    Returns:
        A ``(converted_prefix, prefix_length)`` tuple.
    """
    converted = []
    for char in katakana_text:
        if not is_katakana(char):
            break
        converted.append(chr(ord(char) - katakana_minus_hiragana))
    return "".join(converted), len(converted)

Is there a way to simplify hiragana_to_katakana() and katakana_to_hiragana() into a duck-type function or a super/meta function?

E.g. something like

def convert_hk_kh(text, charset_range, offset):
    """Shift the leading characters of ``text`` that fall inside a code-point range.

    Args:
        text: Input string, scanned from its first character.
        charset_range: ``(start, end)`` pair; a character qualifies when its
            code point is strictly between the two bounds.
        offset: Amount added to the code point of each qualifying character.

    Returns:
        A ``(shifted_prefix, prefix_length)`` tuple. Scanning stops at the
        first character outside the range.
    """
    lower, upper = charset_range
    shifted = []
    for char in text:
        if not lower < ord(char[0]) < upper:
            break
        shifted.append(chr(ord(char) + offset))
    return "".join(shifted), len(shifted)


def katakana_to_hiragana(katakana_text):
    """Convert the leading katakana run of ``katakana_text`` to hiragana.

    Delegates to ``convert_hk_kh`` with the open code-point interval
    (0x30a0, 0x30f7) and the negated katakana/hiragana distance, and returns
    its ``(converted_prefix, prefix_length)`` result.
    """
    katakana_bounds = (0x30a0, 0x30f7)
    to_hiragana_offset = -katakana_minus_hiragana
    return convert_hk_kh(katakana_text, katakana_bounds, to_hiragana_offset)


def hiragana_to_katakana(hiragana_text):
    """Convert the leading hiragana run of ``hiragana_text`` to katakana.

    Delegates to ``convert_hk_kh`` with the open code-point interval
    (0x3040, 0x3097) and the katakana/hiragana distance, and returns its
    ``(converted_prefix, prefix_length)`` result.
    """
    hiragana_bounds = (0x3040, 0x3097)
    to_katakana_offset = katakana_minus_hiragana
    return convert_hk_kh(hiragana_text, hiragana_bounds, to_katakana_offset)

Are there other pythonic ways to simplify the two functions that are very similar?

EDITED

There's also https://github.com/olsgaard/Japanese_nlp_scripts which seems to do the same thing with str.translate. Is that more efficient? More pythonic?


Solution

  • I'd do something like this:

    # Code-point distance between corresponding katakana and hiragana letters:
    # U+30A1 (KATAKANA LETTER SMALL A) - U+3041 (HIRAGANA LETTER SMALL A) = 0x60.
    KATAKANA_HIRGANA_SHIFT = 0x30a1 - 0x3041
    
    def shift_chars_prefix(text, amount, condition):
        """Shift the code points of the leading characters accepted by ``condition``.

        Args:
            text: Input string, scanned from its first character.
            amount: Value added to the code point of each accepted character.
            condition: Callable taking one character and returning a truthy
                value while the character should be converted.

        Returns:
            A ``(shifted_prefix, prefix_length)`` tuple. Scanning stops at the
            first character that ``condition`` rejects.
        """
        shifted = []

        for char in text:
            if not condition(char):
                break

            shifted.append(chr(ord(char) + amount))

        # Report the number of converted characters rather than the loop
        # index: the index is unbound for empty input and one short when
        # every character is accepted.
        return ''.join(shifted), len(shifted)
    
    def katakana_to_hiragana(text):
        """Convert the leading katakana run of ``text`` to hiragana.

        Returns whatever ``shift_chars_prefix`` returns for characters
        strictly between U+30A0 and U+30F7, shifted down by
        ``KATAKANA_HIRGANA_SHIFT``.
        """
        def in_katakana_range(c):
            return '\u30a0' < c < '\u30f7'

        return shift_chars_prefix(text, -KATAKANA_HIRGANA_SHIFT, in_katakana_range)
    
    def hiragana_to_katakana(text):
        """Convert the leading hiragana run of ``text`` to katakana.

        Returns whatever ``shift_chars_prefix`` returns for characters
        strictly between U+3040 and U+3097, shifted up by
        ``KATAKANA_HIRGANA_SHIFT``.
        """
        def in_hiragana_range(c):
            return '\u3040' < c < '\u3097'

        return shift_chars_prefix(text, KATAKANA_HIRGANA_SHIFT, in_hiragana_range)
    

    You can also use a regex if you don't need the length of the replaced prefix; these versions return the whole string with only the leading run converted:

    import re
    
    # Code-point distance between corresponding katakana and hiragana letters:
    # U+30A1 (KATAKANA LETTER SMALL A) - U+3041 (HIRAGANA LETTER SMALL A) = 0x60.
    KATAKANA_HIRGANA_SHIFT = 0x30a1 - 0x3041
    
    def shift_by(n):
        """Build a ``re.sub`` replacement callback that shifts code points by ``n``.

        The returned function takes a match object and returns the matched
        text with ``n`` added to the code point of every character.
        """
        def replacer(match):
            matched_text = match.group(0)
            shifted_chars = [chr(ord(c) + n) for c in matched_text]
            return ''.join(shifted_chars)

        return replacer
    
    def katakana_to_hiragana(text):
        """Replace the leading katakana run of ``text`` with hiragana.

        Only a run of U+30A1..U+30F6 anchored at the start of ``text`` is
        converted; the rest of the string is returned unchanged.
        """
        # Katakana code points are KATAKANA_HIRGANA_SHIFT above the matching
        # hiragana, so the shift must be negative to land in the hiragana block.
        return re.sub(r'^[\u30a1-\u30f6]+', shift_by(-KATAKANA_HIRGANA_SHIFT), text)
    
    def hiragana_to_katakana(text):
        """Replace the leading hiragana run of ``text`` with katakana.

        Only a run of U+3041..U+3096 anchored at the start of ``text`` is
        converted; the rest of the string is returned unchanged.
        """
        # Hiragana code points are KATAKANA_HIRGANA_SHIFT below the matching
        # katakana, so the shift must be positive to land in the katakana block.
        return re.sub(r'^[\u3041-\u3096]+', shift_by(KATAKANA_HIRGANA_SHIFT), text)