Search code examples
pentahopentaho-spoonpentaho-data-integration

transforming rows to headers in pentaho


I have a file which has entried like below

FIELD1,FIELD2
name,ABC
age,29
location,ZZ
name,XYZ
age,33
location,YY

I require the output to be as below. I tried the row-denormalization, but its not giving proper output

name,age,location
ABC,29,ZZ
XYZ,33,YY

Solution

  • Row de-normalizer can produce an output like this, but it requires some identifier for every entity in the input. And the input has to be sorted by this identifier.

    So you need first to transform output of your file to following structure:

    ID,FIELD1,FIELD2
    0,name,ABC
    0,age,29
    0,location,ZZ
    1,name,XYZ
    1,age,33
    1,location,YY
    

    One of the ways to achieve this is by combining Add sequence (start from 0) and User Defined Java Expression (Set the expression to ID / 3, if you always have exactly three rows corresponding to the same entity).

    Then you can use Row denormalizer as on the picture below.

    Your transformation will look like this:

    enter image description here

    The transformation steps xml (just copy it and paste on the transformation canvas):

    <?xml version="1.0" encoding="UTF-8"?>
    <transformation-steps>
    <steps>
      <step>
        <name>Data Grid</name>
        <type>DataGrid</type>
        <description/>
        <distribute>Y</distribute>
        <custom_distribution/>
        <copies>1</copies>
             <partitioning>
               <method>none</method>
               <schema_name/>
               </partitioning>
        <fields>
          <field>
            <name>FIELD1</name>
            <type>String</type>
            <format/>
            <currency/>
            <decimal/>
            <group/>
            <length>-1</length>
            <precision>-1</precision>
            <set_empty_string>N</set_empty_string>
          </field>
          <field>
            <name>FIELD2</name>
            <type>String</type>
            <format/>
            <currency/>
            <decimal/>
            <group/>
            <length>-1</length>
            <precision>-1</precision>
            <set_empty_string>N</set_empty_string>
          </field>
        </fields>
        <data>
          <line> <item>name</item><item>ABC</item> </line>
          <line> <item>age</item><item>29</item> </line>
          <line> <item>location</item><item>ZZ</item> </line>
          <line> <item>name</item><item>XYZ</item> </line>
          <line> <item>age</item><item>33</item> </line>
          <line> <item>location</item><item>YY</item> </line>
        </data>
         <cluster_schema/>
     <remotesteps>   <input>   </input>   <output>   </output> </remotesteps>    <GUI>
          <xloc>128</xloc>
          <yloc>64</yloc>
          <draw>Y</draw>
          </GUI>
        </step>
    
      <step>
        <name>Row denormaliser</name>
        <type>Denormaliser</type>
        <description/>
        <distribute>Y</distribute>
        <custom_distribution/>
        <copies>1</copies>
             <partitioning>
               <method>none</method>
               <schema_name/>
               </partitioning>
          <key_field>FIELD1</key_field>
          <group>
            <field>
              <name>ID</name>
              </field>
            </group>
          <fields>
            <field>
              <field_name>FIELD2</field_name>
              <key_value>name</key_value>
              <target_name>name</target_name>
              <target_type>String</target_type>
              <target_format/>
              <target_length>-1</target_length>
              <target_precision>-1</target_precision>
              <target_decimal_symbol/>
              <target_grouping_symbol/>
              <target_currency_symbol/>
              <target_null_string/>
              <target_aggregation_type>-</target_aggregation_type>
              </field>
            <field>
              <field_name>FIELD2</field_name>
              <key_value>age</key_value>
              <target_name>age</target_name>
              <target_type>Integer</target_type>
              <target_format/>
              <target_length>-1</target_length>
              <target_precision>-1</target_precision>
              <target_decimal_symbol/>
              <target_grouping_symbol/>
              <target_currency_symbol/>
              <target_null_string/>
              <target_aggregation_type>-</target_aggregation_type>
              </field>
            <field>
              <field_name>FIELD2</field_name>
              <key_value>location</key_value>
              <target_name>location</target_name>
              <target_type>String</target_type>
              <target_format/>
              <target_length>-1</target_length>
              <target_precision>-1</target_precision>
              <target_decimal_symbol/>
              <target_grouping_symbol/>
              <target_currency_symbol/>
              <target_null_string/>
              <target_aggregation_type>-</target_aggregation_type>
              </field>
            </fields>
         <cluster_schema/>
     <remotesteps>   <input>   </input>   <output>   </output> </remotesteps>    <GUI>
          <xloc>672</xloc>
          <yloc>64</yloc>
          <draw>Y</draw>
          </GUI>
        </step>
    
      <step>
        <name>Add sequence</name>
        <type>Sequence</type>
        <description/>
        <distribute>Y</distribute>
        <custom_distribution/>
        <copies>1</copies>
             <partitioning>
               <method>none</method>
               <schema_name/>
               </partitioning>
          <valuename>ID</valuename>
          <use_database>N</use_database>
          <connection/>
          <schema/>
          <seqname>SEQ_</seqname>
          <use_counter>Y</use_counter>
          <counter_name/>
          <start_at>0</start_at>
          <increment_by>1</increment_by>
          <max_value>999999999</max_value>
         <cluster_schema/>
     <remotesteps>   <input>   </input>   <output>   </output> </remotesteps>    <GUI>
          <xloc>272</xloc>
          <yloc>64</yloc>
          <draw>Y</draw>
          </GUI>
        </step>
    
      <step>
        <name>User Defined Java Expression</name>
        <type>Janino</type>
        <description/>
        <distribute>Y</distribute>
        <custom_distribution/>
        <copies>1</copies>
             <partitioning>
               <method>none</method>
               <schema_name/>
               </partitioning>
           <formula><field_name>ID</field_name>
    <formula_string>ID &#x2f; 3</formula_string>
    <value_type>Integer</value_type>
    <value_length>-1</value_length>
    <value_precision>-1</value_precision>
    <replace_field>ID</replace_field>
    </formula>
         <cluster_schema/>
     <remotesteps>   <input>   </input>   <output>   </output> </remotesteps>    <GUI>
          <xloc>432</xloc>
          <yloc>64</yloc>
          <draw>Y</draw>
          </GUI>
        </step>
    
    </steps>
    <order>
      <hop> <from>Data Grid</from><to>Add sequence</to><enabled>Y</enabled> </hop>
      <hop> <from>Add sequence</from><to>User Defined Java Expression</to><enabled>Y</enabled> </hop>
      <hop> <from>User Defined Java Expression</from><to>Row denormaliser</to><enabled>Y</enabled> </hop>
    </order>
    <notepads>
    </notepads>
    <step_error_handling>
    </step_error_handling>
    </transformation-steps>
    

    And finally, if you need it, you can get rid of the ID column by using Select values step.