Search code examples
pythonencodingintegerfilenamesmemory-efficient

Python: Compactly and reversibly encode large integer as base64 or base16 having variable or fixed length


I want to compactly encode a large unsigned or signed integer having an arbitrary number of bits into a base64, base32, or base16 (hexadecimal) representation. The output will ultimately be used as a string which will be used as a filename, but this should be beside the point. I am using the latest Python 3.

This works but is far from compact:

>>> import base64, sys
>>> i: int = 2**62 - 3  # Can be signed or unsigned.
>>> b64: bytes =  base64.b64encode(str(i).encode()) # Not a compact encoding.
>>> len(b64), sys.getsizeof(b64)
(28, 61)

There is a prior question, now closed, whose answers are concerned strictly with inefficient representations. Note again that we don't want to use any strings or needlessly long sequences of bytes in this exercise. As such, this question is not a duplicate of that question.


Solution

  • This answer is motivated in part by disparate comments by Erik A., such as for this answer. The integer is first compactly converted to bytes, following which the bytes are encoded to a variable base.

    from typing import Callable, Optional
    import base64
    
    class IntBaseEncoder:
        """Reversibly encode an unsigned or signed integer into a customizable encoding of a variable or fixed length.

        The integer is first packed into big-endian bytes, and those bytes are then passed to the selected
        encoder function of the base64 module. Decoding reverses both steps.
        """
        # Ref: https://stackoverflow.com/a/54152763/
        def __init__(self, encoding: str, *, bits: Optional[int] = None, signed: bool = False):
            """
            :param encoding: Name of encoding from base64 module, e.g. b64, urlsafe_b64, b32, b16, etc. The functions
            ``base64.<encoding>encode`` and ``base64.<encoding>decode`` are looked up by this name.
            :param bits: Max bit length of int which is to be encoded. If specified, the encoding is of a fixed length,
            otherwise of a variable length.
            :param signed: If True, integers are considered signed, otherwise unsigned.
            :raises AttributeError: If the base64 module has no encode/decode function for ``encoding``.
            :raises ValueError: If ``bits`` is negative.
            """
            if bits is not None and bits < 0:
                raise ValueError(f'bits must be non-negative, got {bits}')
            self._decoder: Callable[[bytes], bytes] = getattr(base64, f'{encoding}decode')
            self._encoder: Callable[[bytes], bytes] = getattr(base64, f'{encoding}encode')
            # `signed` must be set before `_bytes_length` is called below, since that method reads it.
            self.signed: bool = signed
            # Compare against None rather than relying on truthiness, so that bits=0 still means "fixed length".
            self.bytes_length: Optional[int] = None if bits is None else self._bytes_length(2 ** bits - 1)

        def _bytes_length(self, i: int) -> int:
            """Return the number of bytes used to store ``i``, reserving one extra bit if integers are signed."""
            return (i.bit_length() + 7 + self.signed) // 8

        def encode(self, i: int) -> bytes:
            """
            Encode the integer ``i``.

            With a variable length, an unsigned 0 occupies zero bytes and therefore encodes to empty bytes.

            :param i: Integer to encode.
            :return: Encoded bytes, as returned by the selected base64 module encoder.
            :raises OverflowError: If ``i`` does not fit in the fixed length, or is negative while unsigned.
            """
            # A fixed length of 0 is valid, so test for None explicitly instead of using `or`.
            length = self._bytes_length(i) if self.bytes_length is None else self.bytes_length
            i_bytes = i.to_bytes(length, byteorder='big', signed=self.signed)
            return self._encoder(i_bytes)

        def decode(self, b64: bytes) -> int:
            """
            Decode bytes previously returned by :meth:`encode` back into the integer.

            :param b64: Encoded bytes, in whichever encoding this instance was initialized with.
            :return: The decoded integer.
            """
            i_bytes = self._decoder(b64)
            return int.from_bytes(i_bytes, byteorder='big', signed=self.signed)
    
    # Tests:
    import unittest
    
    class TestIntBaseEncoder(unittest.TestCase):
        """Round-trip and encoded-length checks for IntBaseEncoder over several base64 module encodings."""

        ENCODINGS = ('b85', 'b64', 'urlsafe_b64', 'b32', 'b16')

        def test_unsigned_with_variable_length(self):
            """Encoded length never shrinks as the unsigned value grows, and every value round-trips."""
            for encoding in self.ENCODINGS:
                with self.subTest(encoding=encoding):
                    encoder = IntBaseEncoder(encoding)
                    previous_length = 0
                    for i in range(1234):
                        encoded = encoder.encode(i)
                        self.assertGreaterEqual(len(encoded), previous_length)
                        self.assertEqual(i, encoder.decode(encoded))
                        # Remember this length so the next iteration is compared against it.
                        previous_length = len(encoded)

        def test_signed_with_variable_length(self):
            """Encoded length never shrinks as the magnitude grows, and every signed value round-trips."""
            # The byte length depends on int.bit_length(), which ignores the sign, so the values are
            # visited in order of increasing magnitude rather than from most negative to most positive.
            values = sorted(range(-1234, 1234), key=abs)
            for encoding in self.ENCODINGS:
                with self.subTest(encoding=encoding):
                    encoder = IntBaseEncoder(encoding, signed=True)
                    previous_length = 0
                    for i in values:
                        encoded = encoder.encode(i)
                        self.assertGreaterEqual(len(encoded), previous_length)
                        self.assertEqual(i, encoder.decode(encoded))
                        previous_length = len(encoded)

        def test_unsigned_with_fixed_length(self):
            """With a fixed bit width, every unsigned value up to the max has the same encoded length."""
            for encoding in self.ENCODINGS:
                with self.subTest(encoding=encoding):
                    for maxint in range(257):
                        encoder = IntBaseEncoder(encoding, bits=maxint.bit_length())
                        maxlen = len(encoder.encode(maxint))
                        for i in range(maxint + 1):
                            encoded = encoder.encode(i)
                            self.assertEqual(len(encoded), maxlen)
                            self.assertEqual(i, encoder.decode(encoded))

        def test_signed_with_fixed_length(self):
            """With a fixed bit width, every signed value in [-max, max] has the same encoded length."""
            for encoding in self.ENCODINGS:
                with self.subTest(encoding=encoding):
                    for maxint in range(257):
                        encoder = IntBaseEncoder(encoding, bits=maxint.bit_length(), signed=True)
                        maxlen = len(encoder.encode(maxint))
                        for i in range(-maxint, maxint + 1):
                            encoded = encoder.encode(i)
                            self.assertEqual(len(encoded), maxlen)
                            self.assertEqual(i, encoder.decode(encoded))
    
    # Run the unit tests above when this file is executed as a script.
    if __name__ == '__main__':
        unittest.main()
    

    If the output is used as a filename, initializing the encoder with the encoding 'urlsafe_b64' or even 'b16' is a safer choice, because the standard 'b64' alphabet includes '/', which cannot appear in a filename.

    Usage examples:

    # Variable length encoding
    >>> encoder = IntBaseEncoder('urlsafe_b64')
    >>> encoder.encode(12345)
    b'MDk='
    >>> encoder.decode(_)
    12345
    
    # Fixed length encoding
    >>> encoder = IntBaseEncoder('b16', bits=32)
    >>> encoder.encode(12345)
    b'00003039'
    >>> encoder.encode(123456789)
    b'075BCD15'
    >>> encoder.decode(_)
    123456789
    
    # Signed
    >>> encoder = IntBaseEncoder('b32', signed=True)
    >>> encoder.encode(-12345)
    b'Z7DQ===='
    >>> encoder.decode(_)
    -12345