Search code examples
pythonregexpython-re

php preg_replace in python


I have a PHP function that cleans special characters from a string, and now I want to write an equivalent function in Python.

my php function:

/**
 * Transliterate common accented Latin letters and typographic punctuation
 * to plain ASCII equivalents.
 *
 * The input is trimmed first, then every pattern in the table is applied in
 * order by a single preg_replace() call. All patterns carry the /u modifier
 * so that pattern and subject are both treated as UTF-8.
 *
 * @param string $text UTF-8 encoded input.
 * @return string|null The cleaned string; preg_replace() yields null on
 *                     error (for example when $text is not valid UTF-8).
 */
function cleanString($text)
{
    $utf8 = array(
        '/[áàâãªä]/u'   =>   'a',
        '/[ÁÀÂÃÄ]/u'    =>   'A',
        '/[ÍÌÎÏ]/u'     =>   'I',
        '/[íìîï]/u'     =>   'i',
        '/[éèêë]/u'     =>   'e',
        '/[ÉÈÊË]/u'     =>   'E',
        '/[óòôõºö]/u'   =>   'o',
        '/[ÓÒÔÕÖ]/u'    =>   'O',
        '/[úùûü]/u'     =>   'u',
        '/[ÚÙÛÜ]/u'     =>   'U',
        '/ç/u'          =>   'c',
        '/Ç/u'          =>   'C',
        '/ñ/u'          =>   'n',
        '/Ñ/u'          =>   'N',
        '/–/u'          =>   '-', // Unicode dash to ASCII hyphen-minus
        '/[’‘‹›‚]/u'    =>   ' ', // Single quotation marks become a space
        '/[“”«»„]/u'    =>   ' ', // Double quotation marks become a space
        // Written as an escape so the invisible character cannot be lost or
        // silently turned into a regular space when the code is copied.
        '/\x{00A0}/u'   =>   ' ', // Non-breaking space (U+00A0, decimal 160)
    );
    return preg_replace(array_keys($utf8), array_values($utf8), trim($text));
}

I tried the following in Python:

def clean(text):
    # Attempted port of the PHP cleanString() shown above: map accented
    # letters and typographic punctuation to ASCII.
    # NOTE: the keys below still use PHP regex syntax (/.../ delimiters and a
    # trailing "u" modifier), which Python's re module does not use.
    utf8 = {
        '/[áàâãªä]/u'   :   'a',
        '/[ÁÀÂÃÄ]/u'    :   'A',
        '/[ÍÌÎÏ]/u'     :   'I',
        '/[íìîï]/u'     :   'i',
        '/[éèêë]/u'     :   'e',
        '/[ÉÈÊË]/u'     :   'E',
        '/[óòôõºö]/u'   :   'o',
        '/[ÓÒÔÕÖ]/u'    :   'O',
        '/[úùûü]/u'     :   'u',
        '/[ÚÙÛÜ]/u'     :   'U',
        '/ç/'           :   'c',
        '/Ç/'           :   'C',
        '/ñ/'           :   'n',
        '/Ñ/'           :   'N',
        '/–/'           :   '-', # UTF-8 hyphen to "normal" hyphen
        '/[’‘‹›‚]/u'    :   ' ', # Literally a single quote
        '/[“”«»„]/u'    :   ' ', # Double quote
        '/ /'           :   ' ', # nonbreaking space (U+00A0, decimal 160)
    }
    # re.sub() takes one pattern and one replacement, not collections of
    # them; passing the dict views here is what raises
    # "unhashable type: 'dict_keys'".
    return re.sub(utf8.keys(), utf8.values(), text.strip())

but it fails with the following error:

unhashable type: 'dict_keys'

Solution

  • Python's re.sub doesn't support array-style inputs the way PHP's preg_replace does. You need to iterate over the replacements instead, e.g.:

    def clean(text: str) -> str:
        """Transliterate accented letters and typographic punctuation to ASCII.

        Leading and trailing whitespace is stripped first; every pattern in
        the table is then applied in turn with ``re.sub``. The ``re`` module
        must be imported by the surrounding code.

        Args:
            text: The string to clean.

        Returns:
            The cleaned string.
        """
        utf8 = {
            '[áàâãªä]'   :   'a',
            '[ÁÀÂÃÄ]'    :   'A',
            '[ÍÌÎÏ]'     :   'I',
            '[íìîï]'     :   'i',
            '[éèêë]'     :   'e',
            '[ÉÈÊË]'     :   'E',
            '[óòôõºö]'   :   'o',
            '[ÓÒÔÕÖ]'    :   'O',
            '[úùûü]'     :   'u',
            '[ÚÙÛÜ]'     :   'U',
            'ç'          :   'c',
            'Ç'          :   'C',
            'ñ'          :   'n',
            'Ñ'          :   'N',
            '–'          :   '-', # Unicode dash to ASCII hyphen-minus
            '[’‘‹›‚]'    :   ' ', # Single quotation marks become a space
            '[“”«»„]'    :   ' ', # Double quotation marks become a space
            # Written as an escape so the invisible character cannot be lost
            # or turned into a regular space when the code is copied.
            '\u00a0'     :   ' ', # Non-breaking space (U+00A0, decimal 160)
        }
        text = text.strip()
        for pat, repl in utf8.items():
            # flags is passed by keyword: giving count/flags positionally is
            # deprecated since Python 3.13.
            text = re.sub(pat, repl, text, flags=re.UNICODE)
        return text
    

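    Note also that Python does not use delimiters around regexes, and flags such as u are passed to re.sub as an argument (re.U) rather than appended to the pattern. (In Python 3, str patterns are Unicode-aware by default, so re.U is optional.) I've adjusted your code to deal with those issues.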

    Sample usage:

    # Accented letters fold to their ASCII base letter; the '‹' quotation mark becomes a space.
    print(clean('ÂôÑ‹Î'))
    

    Output:

    AoN I