Search code examples
python protocol-buffers

How to get outer key in a protobuf


I am reading data from 2 proto files:

file.proto: this is a wrapper

file2.proto: this has all the columns

file.proto:

syntax = "proto3";

package com.oracle;

import "file2.proto";

option go_package = "github.com/cle/sdk/go_sdk";

// This is the inbound message intended to inform the Oracle of new answers to be persisted
message AnswerUpdateRequest {
  // Entity (type + id) as defined in file2.proto; presumably the entity the
  // answers belong to.
  Entity entity = 1;
  // Zero or more Answer messages, as defined in file2.proto.
  repeated Answer answers = 2;
}

// This is the outbound message informing Oracle subscribers of new answers
message AnswersUpdated {
  // Entity (type + id) as defined in file2.proto; same field layout as
  // AnswerUpdateRequest.
  Entity entity = 1;
  // Zero or more Answer messages, as defined in file2.proto.
  repeated Answer answers = 2;
}

file2.proto:

syntax = "proto3";

package com.oracle;

import "google/protobuf/timestamp.proto";

option go_package = "github.com/embroker/oracle/sdk/go_sdk";

// Typed identifier: an entity kind plus a string id.
message Entity {
  // Kind of entity; see the nested Type enum.
  Type type = 1;
  // Identifier string; its format is not constrained by this schema.
  string id = 2;

  // Entity kinds. ORGANIZATION is 0, so it is the proto3 default value and is
  // what an unset `type` field reads as.
  enum Type {
    ORGANIZATION = 0;
    USER = 1;
    APPLICATION = 2;
  }
}

// Where an answer came from: a source kind plus a string id.
message AnswerSource {
  // Kind of source; see the nested Type enum.
  Type type = 1;
  // Identifier string; its format is not constrained by this schema.
  string id = 2;

  // Source kinds. UNKNOWN is 0, so it is the proto3 default value.
  enum Type {
    UNKNOWN = 0;
    USER = 1;
    DOCUMENT = 2;
    EXTERNAL = 3;
  }
}

// A single answer: a key, its source, two timestamps, a declared field type
// and a value.
message Answer {
  string key = 1;
  AnswerSource source = 2;
  // Both timestamps use the well-known google.protobuf.Timestamp type.
  google.protobuf.Timestamp provided_at = 3;
  google.protobuf.Timestamp received_at = 4;
  // Declared type of the answer; see the top-level AnswerFieldType enum.
  AnswerFieldType type = 5;
  Value value = 6;

  // Holder for the answer payload. The members live in a oneof, so at most
  // one of them is set at a time.
  message Value {
    oneof value {
      string text = 1;
      float decimal = 2;
      // ...
    }
  }
}

// Declared type of an Answer (used by Answer.type). Every value name carries
// the ANSWER_FIELD_TYPE_ prefix, and that full name is what appears in JSON
// output (e.g. "ANSWER_FIELD_TYPE_TEXT").
enum AnswerFieldType {
  ANSWER_FIELD_TYPE_UNSTRUCTURED = 0; // Can be useful for LLM purposes
  ANSWER_FIELD_TYPE_TEXT = 1;
  ANSWER_FIELD_TYPE_INTEGER = 2;
  ANSWER_FIELD_TYPE_BOOLEAN = 3;
  ANSWER_FIELD_TYPE_DECIMAL = 4;
  ANSWER_FIELD_TYPE_DATE = 5;
  ANSWER_FIELD_TYPE_ADDRESS = 6;
}

My python function to map to proto

import file.proto
import file2.proto
def create_answer_update_request(json_data):
    """Build an AnswerUpdateRequest from a parsed JSON dict and serialize it.

    The entity and the list of answers are read from under the
    "answerUpdateRequest" key of ``json_data`` and copied into the protobuf
    message.  The return value is the result of ``SerializeToString()`` on
    that AnswerUpdateRequest; the "answerUpdateRequest" key is only used to
    locate the input and is not part of the serialized message.
    """
    request = events_pb2.AnswerUpdateRequest()

    entity_json = json_data["answerUpdateRequest"]["entity"]
    request.entity.type = model_pb2.Entity.Type.Value(entity_json["type"])
    request.entity.id = entity_json["id"]

    for item in json_data["answerUpdateRequest"]["answers"]:
        answer = Answer()
        answer.key = item['key']

        source_json = item['source']
        answer.source.CopyFrom(
            AnswerSource(
                type=AnswerSource.Type.Value(source_json['type']),
                id=source_json['id'],
            )
        )

        # Both timestamps arrive as ISO-format strings and are handled alike.
        for stamp in ('provided_at', 'received_at'):
            parsed = datetime.fromisoformat(item[stamp])
            getattr(answer, stamp).FromDatetime(parsed)

        # The JSON carries the short type name; the enum value names are prefixed.
        answer.type = AnswerFieldType.Value(f"ANSWER_FIELD_TYPE_{item['type']}")
        answer.value.CopyFrom(Answer.Value(text=item['value']['text']))

        request.answers.append(answer)

    return request.SerializeToString()

While deserializing data I am not getting wrapper:

Expected output:

{
  "answerUpdateRequest": {
    "entity": {
      "type": "ORGANIZATION",
      "id": "UU12334ID"
    },
    "answers": [
      {
        "key": "legal_company_name",
        "source": {
          "type": "DOCUMENT",
          "id": "3ea20f68e73ec | DocumentType.application"
        },
        "provided_at": "2024-05-02T15:54:15.941988",
        "received_at": "2024-05-02T15:54:15.945350",
        "type": "TEXT",
        "value": {
          "text": "Cicne Law, LLC"
        }
      },
      {
        "key": "company_website_ind",
        "source": {
          "type": "DOCUMENT",
          "id": "3ea20440-83fb-43c0-b409-1dd8f68e73ec | DocumentType.application"
        },
        "provided_at": "2024-05-02T15:54:15.941988",
        "received_at": "2024-05-02T15:54:15.945365",
        "type": "BOOLEAN",
        "value": {
          "text": "Yes"
        }
      }
    ]
  }
  
}

Error: I am not getting "answerUpdateRequest" in the final output; everything else is working as expected. How do I get this wrapper key?


Solution

  • The Protobuf sources (schemas) you include reference package com.oracle.

    If these are indeed Oracle Protobuf sources, it would be better for you to generate using Oracle's public repo and reference them as 3rd-party sources.

    I think your code could be simplified:

    1. JSON is wrapped in "answerUpdateRequest" but the Message that includes this is missing
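    2. Enum values in the JSON must be either the numeric value or the exact enum value name from the schema (e.g. ANSWER_FIELD_TYPE_TEXT, not TEXT); the example below uses numbers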
    3. Per the other answer, the timestamps are invalid: values such as "2024-05-02T15:54:15.941988" lack a timezone designator (e.g. a trailing "Z")

    I create a wrapper Message:

    foo.proto:

    syntax = "proto3";
    
    package com.oracle;
    
    import "file.proto";
    
    // Wrapper whose only field holds the AnswerUpdateRequest from file.proto.
    // In the JSON output shown further down, this field appears as the outer
    // "answerUpdateRequest" key.
    message Foo {
        AnswerUpdateRequest answer_update_request = 1;
    }
    

    And:

    # Generate the Python modules (*_pb2.py) and type stubs (*_pb2.pyi) for all
    # three schemas into the current directory. ${PWD} is quoted so that a
    # working directory containing spaces is passed as a single argument.
    protoc \
    --python_out="${PWD}" \
    --pyi_out="${PWD}" \
    file.proto \
    file2.proto \
    foo.proto
    

    If you were to use the following tweaks to the JSON:

    # Request JSON used as parser input: enum fields are given as numbers and
    # every timestamp carries a trailing "Z".
    data = '''{
      "answerUpdateRequest": {
        "entity": {
          "type": 0,
          "id": "UU12334ID"
        },
        "answers": [
          {
            "key": "legal_company_name",
            "source": {
              "type": 2,
              "id": "3ea20f68e73ec | DocumentType.application"
            },
            "provided_at": "2024-05-02T15:54:15.941988Z",
            "received_at": "2024-05-02T15:54:15.945350Z",
            "type": 1,
            "value": {
              "text": "Cicne Law, LLC"
            }
          },
          {
            "key": "company_website_ind",
            "source": {
              "type": 2,
              "id": "3ea20440-83fb-43c0-b409-1dd8f68e73ec | DocumentType.application"
            },
            "provided_at": "2024-05-02T15:54:15.941988Z",
            "received_at": "2024-05-02T15:54:15.945365Z",
            "type": 3,
            "value": {
              "text": "Yes"
            }
          }
        ]
      }
    }
    '''
    

    Then:

    import json
    
    import foo_pb2
    import file_pb2
    import file2_pb2
    
    from google.protobuf import json_format
    
    # Parse the JSON text straight into the wrapper message. json_format.Parse
    # takes the raw string and returns the populated message, so a separate
    # json.loads() pass (whose result was never used) is not needed.
    m1 = json_format.Parse(data, foo_pb2.Foo())
    print(m1)
    

    Yields a protobuf message (!) (entity.type is omitted because it is the default value 0|ORGANIZATION):

    answer_update_request {
      entity {
        id: "UU12334ID"
      }
      answers {
        key: "legal_company_name"
        source {
          type: DOCUMENT
          id: "3ea20f68e73ec | DocumentType.application"
        }
        provided_at {
          seconds: 1714665255
          nanos: 941988000
        }
        received_at {
          seconds: 1714665255
          nanos: 945350000
        }
        type: ANSWER_FIELD_TYPE_TEXT
        value {
          text: "Cicne Law, LLC"
        }
      }
      answers {
        key: "company_website_ind"
        source {
          type: DOCUMENT
          id: "3ea20440-83fb-43c0-b409-1dd8f68e73ec | DocumentType.application"
        }
        provided_at {
          seconds: 1714665255
          nanos: 941988000
        }
        received_at {
          seconds: 1714665255
          nanos: 945365000
        }
        type: ANSWER_FIELD_TYPE_BOOLEAN
        value {
          text: "Yes"
        }
      }
    }
    

    And:

    import json
    
    import foo_pb2
    import file_pb2
    import file2_pb2
    
    from google.protobuf import json_format
    
    # Build the wrapper message with the entity populated up front.
    m2 = foo_pb2.Foo(
        answer_update_request=file_pb2.AnswerUpdateRequest(
          entity=file2_pb2.Entity(
              type=file2_pb2.Entity.ORGANIZATION,
              id="UU12334ID",
          ),
        ),
    )

    # First answer. The Timestamp fields are filled in afterwards from
    # RFC 3339 strings via FromJsonString.
    a1 = file2_pb2.Answer(
        key="legal_company_name",
        source=file2_pb2.AnswerSource(
            type=file2_pb2.AnswerSource.DOCUMENT,
            id="3ea20f68e73ec | DocumentType.application",
        ),
        type=file2_pb2.ANSWER_FIELD_TYPE_TEXT,
        value=file2_pb2.Answer.Value(
            text="Cicne Law, LLC",
        ),
    )
    a1.provided_at.FromJsonString("2024-05-02T15:54:15.941988Z")
    a1.received_at.FromJsonString("2024-05-02T15:54:15.945350Z")

    # Second answer, built the same way.
    a2 = file2_pb2.Answer(
        key="company_websited_ind",
        source=file2_pb2.AnswerSource(
            type=file2_pb2.AnswerSource.DOCUMENT,
            id="3ea20440-83fb-43c0-b409-1dd8f68e73ec | DocumentType.application",
        ),
        type=file2_pb2.ANSWER_FIELD_TYPE_BOOLEAN,
        value=file2_pb2.Answer.Value(
            text="Yes",
        ),
    )
    a2.provided_at.FromJsonString("2024-05-02T15:54:15.941988Z")
    a2.received_at.FromJsonString("2024-05-02T15:54:15.945350Z")

    m2.answer_update_request.answers.extend([
        a1,
        a2,
    ])
    # always_print_fields_with_no_presence=True makes default-valued fields
    # such as entity.type (0 / ORGANIZATION) appear in the JSON output.
    print(json_format.MessageToJson(m2,always_print_fields_with_no_presence=True))
    

    Yields the JSON:

    {
      "answerUpdateRequest": {
        "entity": {
          "id": "UU12334ID",
          "type": "ORGANIZATION"
        },
        "answers": [
          {
            "key": "legal_company_name",
            "source": {
              "type": "DOCUMENT",
              "id": "3ea20f68e73ec | DocumentType.application"
            },
            "providedAt": "2024-05-02T15:54:15.941988Z",
            "receivedAt": "2024-05-02T15:54:15.945350Z",
            "type": "ANSWER_FIELD_TYPE_TEXT",
            "value": {
              "text": "Cicne Law, LLC"
            }
          },
          {
            "key": "company_websited_ind",
            "source": {
              "type": "DOCUMENT",
              "id": "3ea20440-83fb-43c0-b409-1dd8f68e73ec | DocumentType.application"
            },
            "providedAt": "2024-05-02T15:54:15.941988Z",
            "receivedAt": "2024-05-02T15:54:15.945350Z",
            "type": "ANSWER_FIELD_TYPE_BOOLEAN",
            "value": {
              "text": "Yes"
            }
          }
        ]
      }
    }
    

    NOTE You'd need to revise AnswerFieldType to get values of TEXT instead of ANSWER_FIELD_TYPE_TEXT.