Search code examples
regex pentaho kettle pentaho-data-integration

Pentaho Regex Evaluation


I tried my best to get a regex working on a file's content in PDI, but it gives me null values in the output. The regex works perfectly in the "Test regex" section of the Regex Evaluation step, but the preview does not show the same output.

Here's the file content:

I am expecting 1:19:18.637s in the output, but it is null.

Here's the sample code. It won't run on your local machine, but it will definitely give you an idea of what I am trying to achieve. Here's the code I am trying:

<?xml version="1.0" encoding="UTF-8"?>
<transformation-steps>
<steps>
  <!-- Row generator limited to one row with no fields defined; per the hop
       list its output goes to "Get File Names". -->
  <step>
    <name>Generate Rows</name>
    <type>RowGenerator</type>
    <description/>
    <distribute>Y</distribute>
    <custom_distribution/>
    <copies>1</copies>
    <partitioning>
      <method>none</method>
      <schema_name/>
    </partitioning>
    <fields/>
    <limit>1</limit>
    <never_ending>N</never_ending>
    <interval_in_ms>5000</interval_in_ms>
    <row_time_field>now</row_time_field>
    <last_time_field>FiveSecondsAgo</last_time_field>
    <cluster_schema/>
    <remotesteps>
      <input/>
      <output/>
    </remotesteps>
    <GUI>
      <xloc>318</xloc>
      <yloc>286</yloc>
      <draw>Y</draw>
    </GUI>
  </step>

  <!-- Lists files in ${DEVCI_DATA_HOME}/console_output/ whose names match the
       file mask; sub-folders are not included and the step is limited to 10.
       NOTE(review): the mask ".*txt" has no literal dot before "txt", so it
       accepts any name that ends in "txt"; confirm whether ".*\.txt" was
       intended. -->
  <step>
    <name>Get File Names</name>
    <type>GetFileNames</type>
    <description/>
    <distribute>Y</distribute>
    <custom_distribution/>
    <copies>1</copies>
    <partitioning>
      <method>none</method>
      <schema_name/>
    </partitioning>
    <filter>
      <filterfiletype>all_files</filterfiletype>
    </filter>
    <doNotFailIfNoFile>N</doNotFailIfNoFile>
    <rownum>N</rownum>
    <isaddresult>Y</isaddresult>
    <filefield>N</filefield>
    <rownum_field/>
    <filename_Field/>
    <wildcard_Field/>
    <exclude_wildcard_Field/>
    <dynamic_include_subfolders>N</dynamic_include_subfolders>
    <limit>10</limit>
    <file>
      <name>&#x24;&#x7b;DEVCI_DATA_HOME&#x7d;&#x2f;console_output&#x2f;</name>
      <filemask>.&#x2a;txt</filemask>
      <exclude_filemask/>
      <file_required>N</file_required>
      <include_subfolders>N</include_subfolders>
    </file>
    <cluster_schema/>
    <remotesteps>
      <input/>
      <output/>
    </remotesteps>
    <GUI>
      <xloc>438</xloc>
      <yloc>286</yloc>
      <draw>Y</draw>
    </GUI>
  </step>

  <!-- Adds one String field, issue_key_regex, populated from the variable
       ${issue_key_regex}; no trimming, length and precision unset (-1). -->
  <step>
    <name>Get Variables</name>
    <type>GetVariable</type>
    <description/>
    <distribute>Y</distribute>
    <custom_distribution/>
    <copies>1</copies>
    <partitioning>
      <method>none</method>
      <schema_name/>
    </partitioning>
    <fields>
      <field>
        <name>issue_key_regex</name>
        <variable>&#x24;&#x7b;issue_key_regex&#x7d;</variable>
        <type>String</type>
        <format/>
        <currency/>
        <decimal/>
        <group/>
        <length>-1</length>
        <precision>-1</precision>
        <trim_type>none</trim_type>
      </field>
    </fields>
    <cluster_schema/>
    <remotesteps>
      <input/>
      <output/>
    </remotesteps>
    <GUI>
      <xloc>438</xloc>
      <yloc>126</yloc>
      <draw>Y</draw>
    </GUI>
  </step>

  <!-- Loads each file's complete content into the String field "File content"
       (element_type "content") and writes the short file name to file_name.
       IsInFields is Y and DynamicFilenameField is "filename", so the path to
       read comes from the incoming "filename" field.
       NOTE(review): presumably the static <file> entry below is ignored in
       that mode; confirm before relying on the hard-coded Windows path. -->
  <step>
    <name>Load file content in memory</name>
    <type>LoadFileInput</type>
    <description/>
    <distribute>Y</distribute>
    <custom_distribution/>
    <copies>1</copies>
    <partitioning>
      <method>none</method>
      <schema_name/>
    </partitioning>
    <include>N</include>
    <include_field>full_file_path</include_field>
    <rownum>N</rownum>
    <addresultfile>N</addresultfile>
    <IsIgnoreEmptyFile>Y</IsIgnoreEmptyFile>
    <rownum_field/>
    <encoding/>
    <file>
      <name>C&#x3a;&#x5c;Users&#x5c;nikhil.karkare&#x5c;console_output&#x5c;star-lin64-build-feature_VMESH120_29.txt</name>
      <filemask/>
      <exclude_filemask/>
      <file_required>N</file_required>
      <include_subfolders>N</include_subfolders>
    </file>
    <fields>
      <field>
        <name>File content</name>
        <element_type>content</element_type>
        <type>String</type>
        <format/>
        <currency/>
        <decimal/>
        <group/>
        <length>-1</length>
        <precision>-1</precision>
        <trim_type>none</trim_type>
        <repeat>N</repeat>
      </field>
    </fields>
    <limit>0</limit>
    <IsInFields>Y</IsInFields>
    <DynamicFilenameField>filename</DynamicFilenameField>
    <shortFileFieldName>file_name</shortFileFieldName>
    <pathFieldName/>
    <hiddenFieldName/>
    <lastModificationTimeFieldName/>
    <uriNameFieldName/>
    <rootUriNameFieldName/>
    <extensionFieldName/>
    <cluster_schema/>
    <remotesteps>
      <input/>
      <output/>
    </remotesteps>
    <GUI>
      <xloc>938</xloc>
      <yloc>286</yloc>
      <draw>Y</draw>
    </GUI>
  </step>

  <!-- Derives build_id_from_regex from the "uri" field: capture group 1 is the
       file name (letters, digits, "_", "." and "-") that follows
       "console_output/" and precedes the ".txt" extension.
       The dot before "txt" is escaped so that it matches only a literal "."
       and not any character. -->
  <step>
    <name>Regex Evaluation 3</name>
    <type>RegexEval</type>
    <description/>
    <distribute>Y</distribute>
    <custom_distribution/>
    <copies>1</copies>
    <partitioning>
      <method>none</method>
      <schema_name/>
    </partitioning>
    <script><![CDATA[.*console_output\/([A-Za-z0-9_\.\-]+)\.txt]]></script>
    <matcher>uri</matcher>
    <resultfieldname/>
    <usevar>N</usevar>
    <allowcapturegroups>Y</allowcapturegroups>
    <replacefields>Y</replacefields>
    <canoneq>N</canoneq>
    <caseinsensitive>N</caseinsensitive>
    <comment>N</comment>
    <dotall>N</dotall>
    <multiline>N</multiline>
    <unicode>N</unicode>
    <unix>N</unix>
    <fields>
      <field>
        <name>build_id_from_regex</name>
        <type>String</type>
        <format/>
        <group/>
        <decimal/>
        <length>-1</length>
        <precision>-1</precision>
        <nullif/>
        <ifnull/>
        <trimtype>none</trimtype>
      </field>
    </fields>
    <cluster_schema/>
    <remotesteps>
      <input/>
      <output/>
    </remotesteps>
    <GUI>
      <xloc>1098</xloc>
      <yloc>286</yloc>
      <draw>Y</draw>
    </GUI>
  </step>

  <!-- Extracts the bracketed value at the end of the star-maven SUCCESS line
       (for example 1:19:18.637s) from "File content" into star_maven_time;
       the match flag is written to the "result" field.

       The step matches the pattern against the entire field value, and that
       value is a whole multi-line file. With dotall off, "." cannot cross a
       line break, so the pattern could never cover the full content and the
       capture field stayed null. Two changes address this:
         * dotall is enabled so the leading and trailing ".*" can span the
           lines before and after the line of interest;
         * the tokens [INFO], star-maven, SUCCESS and the bracketed value are
           joined with [^\r\n]* so they must all sit on one line, and the
           capture excludes "]" so it stops at the first closing bracket. -->
  <step>
    <name>Regex Evaluation 4</name>
    <type>RegexEval</type>
    <description/>
    <distribute>Y</distribute>
    <custom_distribution/>
    <copies>1</copies>
    <partitioning>
      <method>none</method>
      <schema_name/>
    </partitioning>
    <script><![CDATA[.*\[INFO\][^\r\n]*star-maven[^\r\n]*SUCCESS[^\r\n]*?\[([^\]\r\n]+)\].*]]></script>
    <matcher>File content</matcher>
    <resultfieldname>result</resultfieldname>
    <usevar>N</usevar>
    <allowcapturegroups>Y</allowcapturegroups>
    <replacefields>Y</replacefields>
    <canoneq>N</canoneq>
    <caseinsensitive>N</caseinsensitive>
    <comment>N</comment>
    <dotall>Y</dotall>
    <multiline>N</multiline>
    <unicode>N</unicode>
    <unix>N</unix>
    <fields>
      <field>
        <name>star_maven_time</name>
        <type>String</type>
        <format/>
        <group/>
        <decimal/>
        <length>-1</length>
        <precision>-1</precision>
        <nullif/>
        <ifnull/>
        <trimtype>none</trimtype>
      </field>
    </fields>
    <cluster_schema/>
    <remotesteps>
      <input/>
      <output/>
    </remotesteps>
    <GUI>
      <xloc>725</xloc>
      <yloc>124</yloc>
      <draw>Y</draw>
    </GUI>
  </step>

  <!-- Derives build_id_from_short_filename from the "short_filename" field:
       capture group 1 is the file name (letters, digits, "_", "." and "-")
       without its ".txt" extension.
       The dot before "txt" is escaped so that it matches only a literal "."
       and not any character. -->
  <step>
    <name>Regex Evaluation 6</name>
    <type>RegexEval</type>
    <description/>
    <distribute>Y</distribute>
    <custom_distribution/>
    <copies>1</copies>
    <partitioning>
      <method>none</method>
      <schema_name/>
    </partitioning>
    <script><![CDATA[([A-Za-z0-9_\.\-]+)\.txt]]></script>
    <matcher>short_filename</matcher>
    <resultfieldname/>
    <usevar>N</usevar>
    <allowcapturegroups>Y</allowcapturegroups>
    <replacefields>Y</replacefields>
    <canoneq>N</canoneq>
    <caseinsensitive>N</caseinsensitive>
    <comment>N</comment>
    <dotall>N</dotall>
    <multiline>N</multiline>
    <unicode>N</unicode>
    <unix>N</unix>
    <fields>
      <field>
        <name>build_id_from_short_filename</name>
        <type>String</type>
        <format/>
        <group/>
        <decimal/>
        <length>-1</length>
        <precision>-1</precision>
        <nullif/>
        <ifnull/>
        <trimtype>none</trimtype>
      </field>
    </fields>
    <cluster_schema/>
    <remotesteps>
      <input/>
      <output/>
    </remotesteps>
    <GUI>
      <xloc>558</xloc>
      <yloc>286</yloc>
      <draw>Y</draw>
    </GUI>
  </step>

  <!-- Looks up rows from "Regex Evaluation 3": the incoming build_id is
       compared with build_id_from_regex, and on a hit build_id_from_regex and
       "File content" are added to the row (no default values configured).
       NOTE(review): the "Data Grid" step defines build_id with a single empty
       item; if that is the key arriving here, the lookup may find nothing and
       "File content" would be null downstream. Verify the key source. -->
  <step>
    <name>Stream lookup 4</name>
    <type>StreamLookup</type>
    <description/>
    <distribute>Y</distribute>
    <custom_distribution/>
    <copies>1</copies>
    <partitioning>
      <method>none</method>
      <schema_name/>
    </partitioning>
    <from>Regex Evaluation 3</from>
    <input_sorted>N</input_sorted>
    <preserve_memory>Y</preserve_memory>
    <sorted_list>N</sorted_list>
    <integer_pair>N</integer_pair>
    <lookup>
      <key>
        <name>build_id</name>
        <field>build_id_from_regex</field>
      </key>
      <value>
        <name>build_id_from_regex</name>
        <rename>build_id_from_regex</rename>
        <default/>
        <type>String</type>
      </value>
      <value>
        <name>File content</name>
        <rename>File content</rename>
        <default/>
        <type>String</type>
      </value>
    </lookup>
    <cluster_schema/>
    <remotesteps>
      <input/>
      <output/>
    </remotesteps>
    <GUI>
      <xloc>578</xloc>
      <yloc>126</yloc>
      <draw>Y</draw>
    </GUI>
  </step>

  <!-- Terminal placeholder step; per the hop list it receives the output of
       "Regex Evaluation 4". -->
  <step>
    <name>Dummy &#x28;do nothing&#x29;</name>
    <type>Dummy</type>
    <description/>
    <distribute>Y</distribute>
    <custom_distribution/>
    <copies>1</copies>
    <partitioning>
      <method>none</method>
      <schema_name/>
    </partitioning>
    <cluster_schema/>
    <remotesteps>
      <input/>
      <output/>
    </remotesteps>
    <GUI>
      <xloc>1036</xloc>
      <yloc>120</yloc>
      <draw>Y</draw>
    </GUI>
  </step>

  <!-- Static data source: declares one String field, build_id, and one data
       line whose single item is empty. -->
  <step>
    <name>Data Grid</name>
    <type>DataGrid</type>
    <description/>
    <distribute>Y</distribute>
    <custom_distribution/>
    <copies>1</copies>
    <partitioning>
      <method>none</method>
      <schema_name/>
    </partitioning>
    <fields>
      <field>
        <name>build_id</name>
        <type>String</type>
        <format/>
        <currency/>
        <decimal/>
        <group/>
        <length>-1</length>
        <precision>-1</precision>
        <set_empty_string>N</set_empty_string>
      </field>
    </fields>
    <data>
      <line>
        <item/>
      </line>
    </data>
    <cluster_schema/>
    <remotesteps>
      <input/>
      <output/>
    </remotesteps>
    <GUI>
      <xloc>308</xloc>
      <yloc>126</yloc>
      <draw>Y</draw>
    </GUI>
  </step>

</steps>
<!-- Hops (all enabled). Two chains meet at "Stream lookup 4":
     file chain: Generate Rows, Get File Names, Regex Evaluation 6,
                 Load file content in memory, Regex Evaluation 3;
     main chain: Data Grid, Get Variables, Stream lookup 4,
                 Regex Evaluation 4, Dummy (do nothing). -->
<order>
  <hop>
    <from>Generate Rows</from>
    <to>Get File Names</to>
    <enabled>Y</enabled>
  </hop>
  <hop>
    <from>Get File Names</from>
    <to>Regex Evaluation 6</to>
    <enabled>Y</enabled>
  </hop>
  <hop>
    <from>Get Variables</from>
    <to>Stream lookup 4</to>
    <enabled>Y</enabled>
  </hop>
  <hop>
    <from>Load file content in memory</from>
    <to>Regex Evaluation 3</to>
    <enabled>Y</enabled>
  </hop>
  <hop>
    <from>Regex Evaluation 3</from>
    <to>Stream lookup 4</to>
    <enabled>Y</enabled>
  </hop>
  <hop>
    <from>Regex Evaluation 4</from>
    <to>Dummy &#x28;do nothing&#x29;</to>
    <enabled>Y</enabled>
  </hop>
  <hop>
    <from>Regex Evaluation 6</from>
    <to>Load file content in memory</to>
    <enabled>Y</enabled>
  </hop>
  <hop>
    <from>Stream lookup 4</from>
    <to>Regex Evaluation 4</to>
    <enabled>Y</enabled>
  </hop>
  <hop>
    <from>Data Grid</from>
    <to>Get Variables</to>
    <enabled>Y</enabled>
  </hop>
</order>
<!-- No canvas notes and no step error handling are defined. -->
<notepads/>
<step_error_handling/>
</transformation-steps>

Any suggestions will be appreciated. Thanks


Solution

  • Never mind. The regex I was using needed to be refined more according to the string I was trying to extract. Issue resolved.

    Conclusion: If the file content is megabytes in size and you want to extract a string from it, your regex should be very specific to that string (it should always be specific, by the way). It might give you the correct results when you test it in a regex tester or in the Test utility of the Regex Evaluation step, but when you run the transformation you will just see a null value. Rework the regex and keep refining it until you see the extracted string you want in the output.