python class serialization deserialization python-dataclasses

Python: Allow object to be recreated from a serialised form without re-computing attributes

Say I have a class like this:

import json
from uuid import uuid4 as uuid

import requests
from bs4 import BeautifulSoup

class Link:
    """A web page that is given a random identifier and scraped on creation."""

    def __init__(self, url):
        """Record *url*, assign a fresh UUID string, and fetch the page text."""
        self.url, self.id = url, str(uuid())
        # scrape() reads self.url, so it has to run after the assignment above.
        self.content = self.scrape()

    def scrape(self):
        """Download ``self.url`` and return its text with the HTML markup removed."""
        html = requests.get(self.url).text
        return BeautifulSoup(html, features="html.parser").get_text()

Now, I can create a new instance of that class like so:

>>> my_link = Link("https://example.com")
>>> my_link.id
'ba664ae1-eb5d-4370-9276-93f8d9d03a0d'
>>> my_link.content
'Example Domain Example Domain This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission. More information...'

But what if I already have the id and content attributes stored in a database, and I want to recreate my object from its serialized form? I might have JSON like this:

{
    "url": "https://example.com",
    "id": "ba664ae1-eb5d-4370-9276-93f8d9d03a0d",
    "content": "Example Domain Example Domain This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission. More information..."
}

Obviously, I could pass through these options in __init__:

def __init__(self, url, id=None, content=None):
    """Initialise a link, reusing stored values where they are supplied.

    Each optional value is handled on its own: whichever one is omitted
    is computed, and whichever one is given is kept as-is.

    Args:
        url: Address of the page this link points to.
        id: Previously generated identifier. A fresh UUID string is
            created when this is ``None``.
        content: Previously scraped page text. The page is scraped
            when this is ``None``.
    """
    self.url = url
    # Compare against None instead of relying on truthiness: an empty
    # string is a legitimate stored value (e.g. a page with no text) and
    # must not cause the supplied id to be thrown away or the page to be
    # fetched again.
    self.id = str(uuid()) if id is None else id
    self.content = self.scrape() if content is None else content

But this becomes unwieldy with more and more attributes, for instance if I wanted to add a title field, or I wanted to change to a dataclass.

What's the recommended way to do this?

Solution

I would simplify your current __init__ method to be the "simple" version (that takes pre-computed values as arguments) and define new class methods that, for example, compute a new UUID to pass to __init__, extract data from a dict, or parse a JSON value to create the dict to extract data from.

class Link:
    """A web page's URL, identifier and extracted text.

    The constructor only stores the values it is given; it neither
    generates an identifier nor touches the network. Build instances
    through the alternate constructors: ``create`` for a brand-new link,
    ``from_dict`` / ``from_json`` to restore one from serialised data.
    """

    def __init__(self, url, str_uuid, content):
        """Store pre-computed values.

        Args:
            url: Address of the page.
            str_uuid: Identifier of the link, stored as ``self.id``.
            content: Text of the page, stored as ``self.content``.
        """
        self.url = url
        self.id = str_uuid
        self.content = content

    # Like most static methods, this may work just as well
    # as a regular function defined outside the class.
    @staticmethod
    def scrape(url):
        """Download *url* and return its text with the HTML markup removed."""
        response = requests.get(url).text
        # The keyword is ``features`` (plural); ``feature`` is not the
        # parser-selection argument of BeautifulSoup.
        soup = BeautifulSoup(response, features="html.parser")
        return soup.get_text()

    @classmethod
    def create(cls, url):
        """Create a new link: generate a UUID string and scrape *url*."""
        str_uuid = str(uuid())
        content = cls.scrape(url)

        return cls(url, str_uuid, content)

    @classmethod
    def from_dict(cls, data):
        """Rebuild a link from a mapping with 'url', 'id' and 'content' keys.

        Raises:
            KeyError: If any of the three keys is missing from *data*.
        """
        return cls(data['url'], data['id'], data['content'])

    @classmethod
    def from_json(cls, obj):
        """Rebuild a link from a JSON document.

        *obj* is the serialised JSON text (anything ``json.loads``
        accepts), not an already-parsed dict; use ``from_dict`` for that.
        """
        return cls.from_dict(json.loads(obj))

# Brand-new link: generates a UUID and scrapes the page.
l1 = Link.create("https://example.com")

# Serialised form, e.g. as loaded from a database.
json_obj = '''{
    "url": "https://example.com",
    "id": "ba664ae1-eb5d-4370-9276-93f8d9d03a0d",
    "content": "Example Domain Example Domain This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission. More information..."
}'''
# Restore from an already-parsed dict.
l2 = Link.from_dict(json.loads(json_obj))

d = json.loads(json_obj)
# from_json parses the text itself, so it takes the JSON string, not the dict.
l3 = Link.from_json(json_obj)

# The plain constructor takes the pre-computed values directly.
l4 = Link(d['url'], d['id'], d['content'])

If you only need the URL the first time you create the object, you don't need to save it as an attribute. If you want an existing object to be able to "rescrape" a link, add an additional instance method:

def rescrape(self):
    """Fetch the page at ``self.url`` again and replace ``self.content``."""
    refreshed = self.scrape(self.url)
    self.content = refreshed