Search code examples
python, python-dataclasses

Adding dataclass fields dynamically with dacite.from_dict


I am using dacite to transform a Python dictionary into a dataclass. Is there a way to dynamically add fields to a dataclass? Like in the example below, where the dataclass "Parameters" has defined only one timeseries "timeseriesA", but there might be additional ones (provided through the dictionary) that cannot be declared.

from dataclasses import asdict, dataclass
from typing import Dict, List, Optional

from dacite import from_dict

@dataclass(frozen=True)
class TimeSeries:
  """Immutable record holding a series name, its unit and optional samples."""

  name: str
  unit: str
  # No default is declared, so a value (a list of floats or None) must always be passed.
  data: Optional[List[float]]
  
@dataclass(frozen=True)
class Parameters:
  """Immutable container for the statically declared time series."""

  # The only series declared up front; there is no field for any other key.
  timeseriesA: TimeSeries
  
@dataclass(frozen=True)
class Data:
  """Top-level immutable record wrapping a ``Parameters`` instance."""

  parameters: Parameters

  @classmethod
  def fromDict(cls, data: Dict) -> 'Data':
    """Build an instance of ``cls`` from a plain dictionary via ``dacite.from_dict``."""
    return from_dict(cls, data)

  def toDict(self) -> Dict:
    """Return this instance converted to a plain dictionary.

    This has to be an instance method: ``dataclasses.asdict`` only accepts
    dataclass instances and raises ``TypeError`` when given the class itself.
    """
    return asdict(self)

  
def main() -> None:
  """Build a sample payload with two time series and load it into ``Data``."""
  series_a: Dict = {'name': 'nameA', 'unit': 'USD', 'data': [10, 20, 30, 40]}
  series_b: Dict = {'name': 'nameB', 'unit': 'EUR', 'data': [60, 30, 40, 50]}
  payload: Dict = {
    'parameters': {
      'timeseriesA': series_a,
      # No field named 'timeseriesB' is declared on ``Parameters``.
      'timeseriesB': series_b,
    },
  }

  loaded: Data = Data.fromDict(payload)

# Run the example only when this file is executed as a script.
if __name__ == '__main__':
  main()

In this example, "timeseriesB" will be ignored by dacite, but it should be added as a field of the "Parameters" dataclass.


Solution

  • In general, dynamically adding fields to a dataclass, after the class is defined, is not good practice. However, this does present a good use case for using a dict within a dataclass, due to the dynamic nature of fields in the source dict object.

    Here is a straightforward example of using a dict field to handle a dynamic mapping of keys in the source object. It uses dataclass-wizard, a JSON serialization library similar to dacite. The approach outlined below handles extraneous data in the dict object, such as timeseriesB.

    from __future__ import annotations
    
    from dataclasses import dataclass
    from dataclass_wizard import JSONWizard
    
    
    @dataclass(frozen=True)
    class Data(JSONWizard):
        """Immutable payload whose time series are stored in a mapping keyed by name.

        A mapping is used instead of one attribute per series so that keys
        not known in advance (for example ``timeseriesB``) can be held.
        """

        parameters: dict[str, TimeSeries]
    
    
    @dataclass(frozen=True)
    class TimeSeries:
        name: str
        unit: str
        data: list[float] | None
    
    
    # Sample payload: 'parameters' maps each series name to that series' attributes.
    data: dict = {
        'parameters': {
            'timeseriesA': {'name': 'nameA', 'unit': 'USD', 'data': [10, 20, 30, 40]},
            'timeseriesB': {'name': 'nameB', 'unit': 'EUR', 'data': [60, 30, 40, 50]},
        },
    }
    
    
    def main():
        """Deserialize the sample payload, then print one unit and the full repr."""
        # deserialize from dict
        parsed = Data.from_dict(data)
        series_b = parsed.parameters['timeseriesB']
        print(series_b.unit)  # EUR

        print(repr(parsed))
        # Data(parameters={'timeseriesA': TimeSeries(name='nameA', unit='USD', data=[10.0, 20.0, 30.0, 40.0]),
        #                  'timeseriesB': TimeSeries(name='nameB', unit='EUR', data=[60.0, 30.0, 40.0, 50.0])})
    
    
    # Run the example only when this file is executed as a script.
    if __name__ == '__main__':
        main()
    

    Admittedly, dataclass-wizard doesn't perform strict type checking like dacite; instead, it performs implicit type coercion where possible, such as converting a str to an annotated int. Perhaps as a result, it's much faster overall; another nice thing is that serialization is even slightly faster than the builtin dataclasses.asdict :-)

    Here are some quick tests:

    from dataclasses import asdict, dataclass
    from typing import Dict, List, Optional
    
    from dacite import from_dict
    from dataclass_wizard import JSONWizard
    from timeit import timeit
    
    
    @dataclass(frozen=True)
    class TimeSeries:
        """Immutable record holding a series name, its unit and optional samples."""

        name: str
        unit: str
        # No default is declared, so a value (a list of floats or None) must always be passed.
        data: Optional[List[float]]
    
    
    @dataclass(frozen=True)
    class Parameters:
        """Immutable container for the statically declared time series."""

        # The only series declared up front; there is no field for any other key.
        timeseriesA: TimeSeries
    
    
    @dataclass(frozen=True)
    class Data:
        """Top-level immutable record converted to and from dicts with dacite and ``asdict``."""

        parameters: Parameters

        @classmethod
        def fromDict(cls, data: Dict) -> 'Data':
            """Build an instance of ``cls`` from a plain dictionary via ``dacite.from_dict``."""
            return from_dict(cls, data)

        def toDict(self) -> Dict:
            """Return this instance converted to a plain dictionary via ``dataclasses.asdict``."""
            return asdict(self)
    
    
    @dataclass(frozen=True)
    class ParametersWizard:
        """dataclass-wizard counterpart of ``Parameters`` with a snake_case field name."""

        # renamed because default key transform is `camelCase` -> `snake_case`
        timeseries_a: TimeSeries
    
    
    @dataclass(frozen=True)
    class DataWizard(JSONWizard):
        """dataclass-wizard counterpart of ``Data``, wrapping a ``ParametersWizard``."""

        # enable debug mode in case of incorrect types etc.
        class _(JSONWizard.Meta):
            debug_enabled = True

        parameters: ParametersWizard
    
    
    # Sample payload: 'parameters' maps each series name to that series' attributes.
    data: Dict = {
        'parameters': {
            'timeseriesA': {'name': 'nameA', 'unit': 'USD', 'data': [10, 20, 30, 40]},
            'timeseriesB': {'name': 'nameB', 'unit': 'EUR', 'data': [60, 30, 40, 50]},
        },
    }
    
    
    def main():
        """Time dict -> dataclass and dataclass -> dict conversion for both libraries.

        Each statement is run ``n`` times with ``timeit`` and the total elapsed
        seconds are printed with three decimals.
        """
        n = 10_000

        print(f"From Dict:        {timeit('Data.fromDict(data)', globals=globals(), number=n):.3f}")
        print(f"From Dict (Wiz):  {timeit('DataWizard.from_dict(data)', globals=globals(), number=n):.3f}")

        data_1: Data = Data.fromDict(data)
        # Annotated with the class whose ``from_dict`` is called here.
        data_wiz: DataWizard = DataWizard.from_dict(data)

        # The timed statements refer to the two local instances by name, so
        # expose them next to the module globals in the timing namespace.
        g = {**globals(), 'data_1': data_1, 'data_wiz': data_wiz}

        print(f"To Dict:        {timeit('data_1.toDict()', globals=g, number=n):.3f}")
        print(f"To Dict (Wiz):  {timeit('data_wiz.to_dict()', globals=g, number=n):.3f}")
    
    
    # Run the benchmark only when this file is executed as a script.
    if __name__ == '__main__':
        main()
    

    Results, on my PC (Windows):

    From Dict:        1.663
    From Dict (Wiz):  0.059
    To Dict:        0.105
    To Dict (Wiz):  0.057