Search code examples
marklogic marklogic-dhf

MarkLogic Data Hub Framework 5.2.2 Mapping


I'm trying to learn the Data Hub Framework 5.2.2 and, as part of that, I'm implementing a small project. Could someone help me understand the points below?

  • What's the main use of creating steps (ingestion, mapping)? As part of the flow, we already define each step very clearly with its inputs and outputs. What is the need to create a step explicitly, and what purpose does it serve?
  • I'm trying to map the data using a mapping file, but the mapping is not applied; the same ingested document is loaded into the final database without being mapped. Please help me find where I have gone wrong.

ingestionmapping.flow.json

{
  "name": "ingestionmapping",
  "description": "This is the default flow containing all of the default steps",
  "batchSize": 100,
  "threadCount": 4,
  "options": {
    "sourceQuery": null
  },
  "steps": {
    "1": {
      "name": "csv-ingest-step-json",
      "description": "ingests json docs in JSON format to data-hub-STAGING",
      "stepDefinitionName": "productIngestion",
      "stepDefinitionType": "INGESTION",
      "customHook" : {
        "module" : "",
        "parameters" : { },
        "user" : "",
        "runBefore" : false
      },
      "batchSize" : 100,
      "threadCount" : 4,
      "fileLocations": {
        "inputFilePath": "input",
        "outputURIReplacement": ".*input*.,'/mapping-flow/json'",
        "inputFileType": "csv"
      },
      "options": {
        "targetDatabase": "data-hub-STAGING",
        "sourceQuery": "cts.collectionQuery([])",
        "permissions": "data-hub-operator,read,data-hub-operator,update",
        "outputFormat": "json",
        "collections": [
          "mapping-flow-ingestion-json"
        ],
        "headers": {
          "sources": [{"name":  "ingestion_only-flow"}],
          "createdOn" : "currentDateTime",
          "createdBy" : "currentUser"
        }
      }
    },
    "2": {
      "name": "mapping-step",
      "description": "This is the default mapping step",
      "stepDefinitionName": "productMapping",
      "stepDefinitionType": "MAPPING",
      "customHook" : {
        "module" : "",
        "parameters" : { },
        "user" : "",
        "runBefore" : false
      },
      "batchSize" : 100,
      "threadCount" : 4,
      "options": {
        "sourceDatabase": "data-hub-STAGING",
        "targetDatabase": "data-hub-FINAL",
        "sourceQuery": "cts.collectionQuery('mapping-flow-ingestion-json')",
        "permissions": "data-hub-operator,read,data-hub-operator,update",
        "outputFormat": "json",
        "collections": [
          "mapping-flow-mapping-json",
          "mdm-content"
        ],
        "targetEntity": "modifiedproduct",
        "mapping": {
          "name": "ingestionmapping-productMapping",
          "version": 1
        },
        "validateEntity": false
      }
    }
  }
}

Mapping file: ingestionmapping-productMapping-1.mapping.json

{
  "lang" : "zxx",
  "name" : "ingestionmapping-productMapping",
  "description" : "",
  "version" : 1,
  "targetEntityType" : "http://marklogic.com/modifiedproduct-0.0.1/modifiedproduct",
  "sourceContext" : "/",
  "sourceURI" : "/mapping-flow/json/....json",
  "properties" : {
    "mgame_id" : {
      "sourcedFrom" : "game_id"
    },
    "mSKU" : {
      "sourcedFrom" : "SKU"
    },
    "mtitle" : {
      "sourcedFrom" : "title"
    },
    "mprice" : {
      "sourcedFrom" : "price"
    },
    "mdescription" : {
      "sourcedFrom" : "description"
    },
    "myears_active" : {
      "sourcedFrom" : "years_active"
    },
    "mpublication_date" : {
      "sourcedFrom" : "publication_date"
    },
    "mplayers" : {
      "sourcedFrom" : "players"
    },
    "mage_range" : {
      "sourcedFrom" : "age_range"
    },
    "msetup_time" : {
      "sourcedFrom" : "setup_time"
    },
    "mplaying_time" : {
      "sourcedFrom" : "playing_time"
    },
    "mchance" : {
      "sourcedFrom" : "chance"
    },
    "mcategory" : {
      "sourcedFrom" : "category"
    },
    "mhas_extensions" : {
      "sourcedFrom" : "has_extensions"
    },
    "mhas_accessories" : {
      "sourcedFrom" : "has_accessories"
    },
    "mhas_apparel" : {
      "sourcedFrom" : "has_apparel"
    },
    "mpopularity_tier" : {
      "sourcedFrom" : "popularity_tier"
    },
    "mprobability_apparel" : {
      "sourcedFrom" : "probability_apparel"
    },
    "mprobability_accessories" : {
      "sourcedFrom" : "probability_accessories"
    },
    "mprobability_extensions" : {
      "sourcedFrom" : "probability_extensions"
    }
  }
}

Entity name: modifiedproduct, version: 0.0.1

I have tried many times to debug the issue but have not been able to find where it goes wrong. As a result, the same JSON is stored in the final database without using the mapping attributes.

folder structure: Folder structure screenshot

json file

{
"envelope": {
"headers": {
"sources": [
{
"name": "ingestion_only-flow"
}
], 
"createdOn": "2020-07-02T09:49:57.5876177+02:00", 
"createdBy": "admin", 
"createdUsingFile": "C:\\Users\\Jhansi\\IdeaProjects\\MarklogicDataHubFramework5.2\\input\\board_games.csv"
}, 
"triples": [
], 
"instance": {
"game_id": "1000130", 
"SKU": "177897644317", 
"title": "careful crack", 
"price": "24.95", 
"description": "", 
"years_active": "0", 
"publication_date": "0", 
"players": "2-4", 
"age_range": "", 
"setup_time": "< 5 minutes", 
"playing_time": "1 hour", 
"chance": "High", 
"category": "Board Game", 
"has_extensions": "False", 
"has_accessories": "True", 
"has_apparel": "False", 
"popularity_tier": "3", 
"probability_apparel": "0.3", 
"probability_accessories": "0.3", 
"probability_extensions": "0.3"
}, 
"attachments": null
}
}

Solution

  • Data Hub produces the desired mapping when MarkLogic Entity Services is properly deployed. (Notice the entity declaration in the mapped document; that is the key takeaway.)

    enter image description here

    https://docs.marklogic.com/datahub//flows/flow-definition.html#flow-definition__custom-step-settings

    stepDefinitionName: .....Tip: If you are customizing a default step type (ingestion, mapping, or mastering), leave the value as default-ingestion, default-mapping, or default-mastering....

    Once you have reviewed the above, please follow Data Hub best practice and correct the erroneous, manually written step definitions. The following shouldn’t happen if you use QuickStart to create the flow and steps, given your familiarity with MarkLogic Data Hub.

    "steps": {
        "1": {
    ……………
          "stepDefinitionName": "productIngestion",
          "stepDefinitionType": "INGESTION",
    ……………
    
        
    
    "2": {
          "name": "mapping-step",
    
          "stepDefinitionName": "productMapping",
          "stepDefinitionType": "MAPPING",
    …………
    
            "mapping": {
              "name": "ingestionmapping-productMapping",
    
    1. Please clean up your project structure and remove the contents of the step-definitions folder. Project structure example (the pink part):

    enter image description here

    2. A working example of the step definitions is below. When in doubt, please validate the step in QuickStart.
    {
      "name" : "ingestionmapping",
      "description" : "",
      "batchSize" : 100,
      "threadCount" : 4,
      "stopOnError" : false,
      "options" : { },
      "version" : 0,
      "steps" : {
        "1" : {
          "name" : "csv-ingest-step-json",
          "description" : "",
          "options" : {
            "additionalCollections" : [ ],
            "headers" : {
              "sources" : [ {
                "name" : "ingestionmapping"
              } ],
              "createdOn" : "currentDateTime",
              "createdBy" : "currentUser"
            },
            "sourceQuery" : "cts.collectionQuery([])",
            "collections" : [ "mapping-flow-ingestion-json" ],
            "permissions" : "data-hub-operator,read,data-hub-operator,update",
            "outputFormat" : "json",
            "targetDatabase" : "store-hub-STAGING"
          },
          "customHook" : {
            "module" : "",
            "parameters" : { },
            "user" : "",
            "runBefore" : false
          },
          "retryLimit" : 0,
          "batchSize" : 100,
          "threadCount" : 4,
          "stepDefinitionName" : "default-ingestion",
          "stepDefinitionType" : "INGESTION",
          "fileLocations" : {
            "inputFilePath" : "/mldhf/STORE/data/products/games",
            "inputFileType" : "csv",
            "outputURIReplacement" : ".*games*.,'/mapping-flow/json'",
            "separator" : ","
          }
        },
        "2" : {
          "name" : "mapping-step",
          "description" : "",
          "options" : {
            "additionalCollections" : [ ],
            "sourceQuery" : "cts.collectionQuery([\"mapping-flow-ingestion-json\"])",
            "mapping" : {
              "name" : "ingestionmapping-mapping-step",
              "version" : 1
            },
            "targetEntity" : "modifiedproduct",
            "sourceDatabase" : "store-hub-STAGING",
            "collections" : [ "mapping-flow-mapping-json", "mdm-content" ],
            "permissions" : "data-hub-operator,read,data-hub-operator,update",
            "validateEntity" : false,
            "sourceCollection" : "csv-ingest-step-json",
            "outputFormat" : "json",
            "targetDatabase" : "store-hub-FINAL"
          },
          "customHook" : {
            "module" : "",
            "parameters" : { },
            "user" : "",
            "runBefore" : false
          },
          "retryLimit" : null,
          "batchSize" : 100,
          "threadCount" : 4,
          "stepDefinitionName" : "entity-services-mapping",
          "stepDefinitionType" : "MAPPING"
        }
      }
    }