Search code examples
regexkettlepentaho-data-integrationpdi

Regex - How to find file in dynamic folder name


Is it possible to find file where the folder that I looking to is dynamic using regex?

Path:

Folder/Folder/file

And the folder is name by year and month number.

E.g:

2018/10/File
2018/09/File
2018/01/File
2017/10/File

What I want is to find the file with condition is:

[CurrentYear]/[CurrentMonth]/File
[CurrentYear]/[LastMonth]/File

This thing is run in pentaho using Get File Name/Text File Input Step.

Or any other way to do it?

Assume current date is October 2018.


Solution

  • You might be able to find some RegExp for that, but you can also build this dynamic path in a previous KTR and pass it as a variable to execute.

    With the formula step you have date methods which you can extract information, and use calculator step to alter them. For example, you could use a formula step to get TODAY(), and use a calculator step to generate older/future dates, and extract the YEAR(), MONTH(), DAY() from those.

    <?xml version="1.0" encoding="UTF-8"?>
    <transformation>
      <info>
        <name>Range_of_dates</name>
        <description/>
        <extended_description/>
        <trans_version/>
        <trans_type>Normal</trans_type>
        <directory>&#x2f;</directory>
        <parameters>
        </parameters>
        <log>
          <trans-log-table>
            <connection/>
            <schema/>
            <table/>
            <size_limit_lines/>
            <interval/>
            <timeout_days/>
            <field>
              <id>ID_BATCH</id>
              <enabled>Y</enabled>
              <name>ID_BATCH</name>
            </field>
            <field>
              <id>CHANNEL_ID</id>
              <enabled>Y</enabled>
              <name>CHANNEL_ID</name>
            </field>
            <field>
              <id>TRANSNAME</id>
              <enabled>Y</enabled>
              <name>TRANSNAME</name>
            </field>
            <field>
              <id>STATUS</id>
              <enabled>Y</enabled>
              <name>STATUS</name>
            </field>
            <field>
              <id>LINES_READ</id>
              <enabled>Y</enabled>
              <name>LINES_READ</name>
              <subject/>
            </field>
            <field>
              <id>LINES_WRITTEN</id>
              <enabled>Y</enabled>
              <name>LINES_WRITTEN</name>
              <subject/>
            </field>
            <field>
              <id>LINES_UPDATED</id>
              <enabled>Y</enabled>
              <name>LINES_UPDATED</name>
              <subject/>
            </field>
            <field>
              <id>LINES_INPUT</id>
              <enabled>Y</enabled>
              <name>LINES_INPUT</name>
              <subject/>
            </field>
            <field>
              <id>LINES_OUTPUT</id>
              <enabled>Y</enabled>
              <name>LINES_OUTPUT</name>
              <subject/>
            </field>
            <field>
              <id>LINES_REJECTED</id>
              <enabled>Y</enabled>
              <name>LINES_REJECTED</name>
              <subject/>
            </field>
            <field>
              <id>ERRORS</id>
              <enabled>Y</enabled>
              <name>ERRORS</name>
            </field>
            <field>
              <id>STARTDATE</id>
              <enabled>Y</enabled>
              <name>STARTDATE</name>
            </field>
            <field>
              <id>ENDDATE</id>
              <enabled>Y</enabled>
              <name>ENDDATE</name>
            </field>
            <field>
              <id>LOGDATE</id>
              <enabled>Y</enabled>
              <name>LOGDATE</name>
            </field>
            <field>
              <id>DEPDATE</id>
              <enabled>Y</enabled>
              <name>DEPDATE</name>
            </field>
            <field>
              <id>REPLAYDATE</id>
              <enabled>Y</enabled>
              <name>REPLAYDATE</name>
            </field>
            <field>
              <id>LOG_FIELD</id>
              <enabled>Y</enabled>
              <name>LOG_FIELD</name>
            </field>
            <field>
              <id>EXECUTING_SERVER</id>
              <enabled>N</enabled>
              <name>EXECUTING_SERVER</name>
            </field>
            <field>
              <id>EXECUTING_USER</id>
              <enabled>N</enabled>
              <name>EXECUTING_USER</name>
            </field>
            <field>
              <id>CLIENT</id>
              <enabled>N</enabled>
              <name>CLIENT</name>
            </field>
          </trans-log-table>
          <perf-log-table>
            <connection/>
            <schema/>
            <table/>
            <interval/>
            <timeout_days/>
            <field>
              <id>ID_BATCH</id>
              <enabled>Y</enabled>
              <name>ID_BATCH</name>
            </field>
            <field>
              <id>SEQ_NR</id>
              <enabled>Y</enabled>
              <name>SEQ_NR</name>
            </field>
            <field>
              <id>LOGDATE</id>
              <enabled>Y</enabled>
              <name>LOGDATE</name>
            </field>
            <field>
              <id>TRANSNAME</id>
              <enabled>Y</enabled>
              <name>TRANSNAME</name>
            </field>
            <field>
              <id>STEPNAME</id>
              <enabled>Y</enabled>
              <name>STEPNAME</name>
            </field>
            <field>
              <id>STEP_COPY</id>
              <enabled>Y</enabled>
              <name>STEP_COPY</name>
            </field>
            <field>
              <id>LINES_READ</id>
              <enabled>Y</enabled>
              <name>LINES_READ</name>
            </field>
            <field>
              <id>LINES_WRITTEN</id>
              <enabled>Y</enabled>
              <name>LINES_WRITTEN</name>
            </field>
            <field>
              <id>LINES_UPDATED</id>
              <enabled>Y</enabled>
              <name>LINES_UPDATED</name>
            </field>
            <field>
              <id>LINES_INPUT</id>
              <enabled>Y</enabled>
              <name>LINES_INPUT</name>
            </field>
            <field>
              <id>LINES_OUTPUT</id>
              <enabled>Y</enabled>
              <name>LINES_OUTPUT</name>
            </field>
            <field>
              <id>LINES_REJECTED</id>
              <enabled>Y</enabled>
              <name>LINES_REJECTED</name>
            </field>
            <field>
              <id>ERRORS</id>
              <enabled>Y</enabled>
              <name>ERRORS</name>
            </field>
            <field>
              <id>INPUT_BUFFER_ROWS</id>
              <enabled>Y</enabled>
              <name>INPUT_BUFFER_ROWS</name>
            </field>
            <field>
              <id>OUTPUT_BUFFER_ROWS</id>
              <enabled>Y</enabled>
              <name>OUTPUT_BUFFER_ROWS</name>
            </field>
          </perf-log-table>
          <channel-log-table>
            <connection/>
            <schema/>
            <table/>
            <timeout_days/>
            <field>
              <id>ID_BATCH</id>
              <enabled>Y</enabled>
              <name>ID_BATCH</name>
            </field>
            <field>
              <id>CHANNEL_ID</id>
              <enabled>Y</enabled>
              <name>CHANNEL_ID</name>
            </field>
            <field>
              <id>LOG_DATE</id>
              <enabled>Y</enabled>
              <name>LOG_DATE</name>
            </field>
            <field>
              <id>LOGGING_OBJECT_TYPE</id>
              <enabled>Y</enabled>
              <name>LOGGING_OBJECT_TYPE</name>
            </field>
            <field>
              <id>OBJECT_NAME</id>
              <enabled>Y</enabled>
              <name>OBJECT_NAME</name>
            </field>
            <field>
              <id>OBJECT_COPY</id>
              <enabled>Y</enabled>
              <name>OBJECT_COPY</name>
            </field>
            <field>
              <id>REPOSITORY_DIRECTORY</id>
              <enabled>Y</enabled>
              <name>REPOSITORY_DIRECTORY</name>
            </field>
            <field>
              <id>FILENAME</id>
              <enabled>Y</enabled>
              <name>FILENAME</name>
            </field>
            <field>
              <id>OBJECT_ID</id>
              <enabled>Y</enabled>
              <name>OBJECT_ID</name>
            </field>
            <field>
              <id>OBJECT_REVISION</id>
              <enabled>Y</enabled>
              <name>OBJECT_REVISION</name>
            </field>
            <field>
              <id>PARENT_CHANNEL_ID</id>
              <enabled>Y</enabled>
              <name>PARENT_CHANNEL_ID</name>
            </field>
            <field>
              <id>ROOT_CHANNEL_ID</id>
              <enabled>Y</enabled>
              <name>ROOT_CHANNEL_ID</name>
            </field>
          </channel-log-table>
          <step-log-table>
            <connection/>
            <schema/>
            <table/>
            <timeout_days/>
            <field>
              <id>ID_BATCH</id>
              <enabled>Y</enabled>
              <name>ID_BATCH</name>
            </field>
            <field>
              <id>CHANNEL_ID</id>
              <enabled>Y</enabled>
              <name>CHANNEL_ID</name>
            </field>
            <field>
              <id>LOG_DATE</id>
              <enabled>Y</enabled>
              <name>LOG_DATE</name>
            </field>
            <field>
              <id>TRANSNAME</id>
              <enabled>Y</enabled>
              <name>TRANSNAME</name>
            </field>
            <field>
              <id>STEPNAME</id>
              <enabled>Y</enabled>
              <name>STEPNAME</name>
            </field>
            <field>
              <id>STEP_COPY</id>
              <enabled>Y</enabled>
              <name>STEP_COPY</name>
            </field>
            <field>
              <id>LINES_READ</id>
              <enabled>Y</enabled>
              <name>LINES_READ</name>
            </field>
            <field>
              <id>LINES_WRITTEN</id>
              <enabled>Y</enabled>
              <name>LINES_WRITTEN</name>
            </field>
            <field>
              <id>LINES_UPDATED</id>
              <enabled>Y</enabled>
              <name>LINES_UPDATED</name>
            </field>
            <field>
              <id>LINES_INPUT</id>
              <enabled>Y</enabled>
              <name>LINES_INPUT</name>
            </field>
            <field>
              <id>LINES_OUTPUT</id>
              <enabled>Y</enabled>
              <name>LINES_OUTPUT</name>
            </field>
            <field>
              <id>LINES_REJECTED</id>
              <enabled>Y</enabled>
              <name>LINES_REJECTED</name>
            </field>
            <field>
              <id>ERRORS</id>
              <enabled>Y</enabled>
              <name>ERRORS</name>
            </field>
            <field>
              <id>LOG_FIELD</id>
              <enabled>N</enabled>
              <name>LOG_FIELD</name>
            </field>
          </step-log-table>
          <metrics-log-table>
            <connection/>
            <schema/>
            <table/>
            <timeout_days/>
            <field>
              <id>ID_BATCH</id>
              <enabled>Y</enabled>
              <name>ID_BATCH</name>
            </field>
            <field>
              <id>CHANNEL_ID</id>
              <enabled>Y</enabled>
              <name>CHANNEL_ID</name>
            </field>
            <field>
              <id>LOG_DATE</id>
              <enabled>Y</enabled>
              <name>LOG_DATE</name>
            </field>
            <field>
              <id>METRICS_DATE</id>
              <enabled>Y</enabled>
              <name>METRICS_DATE</name>
            </field>
            <field>
              <id>METRICS_CODE</id>
              <enabled>Y</enabled>
              <name>METRICS_CODE</name>
            </field>
            <field>
              <id>METRICS_DESCRIPTION</id>
              <enabled>Y</enabled>
              <name>METRICS_DESCRIPTION</name>
            </field>
            <field>
              <id>METRICS_SUBJECT</id>
              <enabled>Y</enabled>
              <name>METRICS_SUBJECT</name>
            </field>
            <field>
              <id>METRICS_TYPE</id>
              <enabled>Y</enabled>
              <name>METRICS_TYPE</name>
            </field>
            <field>
              <id>METRICS_VALUE</id>
              <enabled>Y</enabled>
              <name>METRICS_VALUE</name>
            </field>
          </metrics-log-table>
        </log>
        <maxdate>
          <connection/>
          <table/>
          <field/>
          <offset>0.0</offset>
          <maxdiff>0.0</maxdiff>
        </maxdate>
        <size_rowset>10000</size_rowset>
        <sleep_time_empty>50</sleep_time_empty>
        <sleep_time_full>50</sleep_time_full>
        <unique_connections>N</unique_connections>
        <feedback_shown>Y</feedback_shown>
        <feedback_size>50000</feedback_size>
        <using_thread_priorities>Y</using_thread_priorities>
        <shared_objects_file/>
        <capture_step_performance>N</capture_step_performance>
        <step_performance_capturing_delay>1000</step_performance_capturing_delay>
        <step_performance_capturing_size_limit>100</step_performance_capturing_size_limit>
        <dependencies>
        </dependencies>
        <partitionschemas>
        </partitionschemas>
        <slaveservers>
        </slaveservers>
        <clusterschemas>
        </clusterschemas>
        <created_user>-</created_user>
        <created_date>2018&#x2f;10&#x2f;26 11&#x3a;34&#x3a;38.865</created_date>
        <modified_user>-</modified_user>
        <modified_date>2018&#x2f;10&#x2f;26 11&#x3a;34&#x3a;38.865</modified_date>
        <key_for_session_key/>
        <is_key_private>N</is_key_private>
      </info>
      <notepads>
      </notepads>
      <order>
        <hop>
          <from>Generate Rows</from>
          <to>Add sequence</to>
          <enabled>Y</enabled>
        </hop>
        <hop>
          <from>Add sequence</from>
          <to>Today</to>
          <enabled>Y</enabled>
        </hop>
        <hop>
          <from>Today</from>
          <to>Today - Number</to>
          <enabled>Y</enabled>
        </hop>
        <hop>
          <from>Today - Number</from>
          <to>Parts of the Date</to>
          <enabled>Y</enabled>
        </hop>
        <hop>
          <from>Parts of the Date</from>
          <to>2 Digit Month and Day</to>
          <enabled>Y</enabled>
        </hop>
        <hop>
          <from>2 Digit Month and Day</from>
          <to>Copy rows to result</to>
          <enabled>Y</enabled>
        </hop>
      </order>
      <step>
        <name>Generate Rows</name>
        <type>RowGenerator</type>
        <description/>
        <distribute>Y</distribute>
        <custom_distribution/>
        <copies>1</copies>
        <partitioning>
          <method>none</method>
          <schema_name/>
        </partitioning>
        <fields>
        </fields>
        <limit>60</limit>
        <never_ending>N</never_ending>
        <interval_in_ms>5000</interval_in_ms>
        <row_time_field>now</row_time_field>
        <last_time_field>FiveSecondsAgo</last_time_field>
        <cluster_schema/>
        <remotesteps>
          <input>
          </input>
          <output>
          </output>
        </remotesteps>
        <GUI>
          <xloc>48</xloc>
          <yloc>32</yloc>
          <draw>Y</draw>
        </GUI>
        </step>
    
      <step>
        <name>Add sequence</name>
        <type>Sequence</type>
        <description/>
        <distribute>Y</distribute>
        <custom_distribution/>
        <copies>1</copies>
        <partitioning>
          <method>none</method>
          <schema_name/>
        </partitioning>
          <valuename>valuename</valuename>
          <use_database>N</use_database>
          <connection/>
          <schema/>
          <seqname>SEQ_</seqname>
          <use_counter>Y</use_counter>
          <counter_name/>
          <start_at>-1</start_at>
          <increment_by>-1</increment_by>
          <max_value>999999999</max_value>
        <cluster_schema/>
        <remotesteps>
          <input>
          </input>
          <output>
          </output>
        </remotesteps>
        <GUI>
          <xloc>160</xloc>
          <yloc>32</yloc>
          <draw>Y</draw>
        </GUI>
        </step>
    
      <step>
        <name>Today</name>
        <type>Formula</type>
        <description/>
        <distribute>Y</distribute>
        <custom_distribution/>
        <copies>1</copies>
        <partitioning>
          <method>none</method>
          <schema_name/>
        </partitioning>
           <formula><field_name>today</field_name>
    <formula_string>TODAY&#x28;&#x29;</formula_string>
    <value_type>Date</value_type>
    <value_length>-1</value_length>
    <value_precision>-1</value_precision>
    <replace_field/>
    </formula>
        <cluster_schema/>
        <remotesteps>
          <input>
          </input>
          <output>
          </output>
        </remotesteps>
        <GUI>
          <xloc>256</xloc>
          <yloc>32</yloc>
          <draw>Y</draw>
        </GUI>
        </step>
    
      <step>
        <name>Today - Number</name>
        <type>Calculator</type>
        <description/>
        <distribute>Y</distribute>
        <custom_distribution/>
        <copies>1</copies>
        <partitioning>
          <method>none</method>
          <schema_name/>
        </partitioning>
        <calculation>
          <field_name>date_minus_days</field_name>
          <calc_type>ADD_DAYS</calc_type>
          <field_a>today</field_a>
          <field_b>valuename</field_b>
          <field_c/>
          <value_type>Date</value_type>
          <value_length>-1</value_length>
          <value_precision>-1</value_precision>
          <remove>N</remove>
          <conversion_mask>yyyy&#x2f;MM&#x2f;dd</conversion_mask>
          <decimal_symbol/>
          <grouping_symbol/>
          <currency_symbol/>
        </calculation>
        <cluster_schema/>
        <remotesteps>
          <input>
          </input>
          <output>
          </output>
        </remotesteps>
        <GUI>
          <xloc>352</xloc>
          <yloc>32</yloc>
          <draw>Y</draw>
        </GUI>
        </step>
    
      <step>
        <name>Parts of the Date</name>
        <type>Formula</type>
        <description/>
        <distribute>Y</distribute>
        <custom_distribution/>
        <copies>1</copies>
        <partitioning>
          <method>none</method>
          <schema_name/>
        </partitioning>
           <formula><field_name>Year</field_name>
    <formula_string>YEAR&#x28;&#x5b;date_minus_days&#x5d;&#x29;</formula_string>
    <value_type>Integer</value_type>
    <value_length>-1</value_length>
    <value_precision>-1</value_precision>
    <replace_field/>
    </formula>
           <formula><field_name>Month</field_name>
    <formula_string>MONTH&#x28;&#x5b;date_minus_days&#x5d;&#x29;</formula_string>
    <value_type>Integer</value_type>
    <value_length>-1</value_length>
    <value_precision>-1</value_precision>
    <replace_field/>
    </formula>
           <formula><field_name>Day</field_name>
    <formula_string>DAY&#x28;&#x5b;date_minus_days&#x5d;&#x29;</formula_string>
    <value_type>Integer</value_type>
    <value_length>-1</value_length>
    <value_precision>-1</value_precision>
    <replace_field/>
    </formula>
        <cluster_schema/>
        <remotesteps>
          <input>
          </input>
          <output>
          </output>
        </remotesteps>
        <GUI>
          <xloc>464</xloc>
          <yloc>32</yloc>
          <draw>Y</draw>
        </GUI>
        </step>
    
      <step>
        <name>2 Digit Month and Day</name>
        <type>SelectValues</type>
        <description/>
        <distribute>Y</distribute>
        <custom_distribution/>
        <copies>1</copies>
        <partitioning>
          <method>none</method>
          <schema_name/>
        </partitioning>
        <fields>        <select_unspecified>N</select_unspecified>
          <meta>        <name>Month</name>
            <rename>Month</rename>
            <type>String</type>
            <length>-2</length>
            <precision>-2</precision>
            <conversion_mask>00</conversion_mask>
            <date_format_lenient>false</date_format_lenient>
            <date_format_locale/>
            <date_format_timezone/>
            <lenient_string_to_number>false</lenient_string_to_number>
            <encoding/>
            <decimal_symbol/>
            <grouping_symbol/>
            <currency_symbol/>
            <storage_type/>
          </meta>      <meta>        <name>Day</name>
            <rename>Day</rename>
            <type>String</type>
            <length>-2</length>
            <precision>-2</precision>
            <conversion_mask>00</conversion_mask>
            <date_format_lenient>false</date_format_lenient>
            <date_format_locale/>
            <date_format_timezone/>
            <lenient_string_to_number>false</lenient_string_to_number>
            <encoding/>
            <decimal_symbol/>
            <grouping_symbol/>
            <currency_symbol/>
            <storage_type/>
          </meta>    </fields>    <cluster_schema/>
        <remotesteps>
          <input>
          </input>
          <output>
          </output>
        </remotesteps>
        <GUI>
          <xloc>592</xloc>
          <yloc>32</yloc>
          <draw>Y</draw>
        </GUI>
        </step>
    
      <step>
        <name>Copy rows to result</name>
        <type>RowsToResult</type>
        <description/>
        <distribute>Y</distribute>
        <custom_distribution/>
        <copies>1</copies>
        <partitioning>
          <method>none</method>
          <schema_name/>
        </partitioning>
        <cluster_schema/>
        <remotesteps>
          <input>
          </input>
          <output>
          </output>
        </remotesteps>
        <GUI>
          <xloc>720</xloc>
          <yloc>32</yloc>
          <draw>Y</draw>
        </GUI>
        </step>
    
      <step_error_handling>
      </step_error_handling>
      <slave-step-copy-partition-distribution>
      </slave-step-copy-partition-distribution>
      <slave_transformation>N</slave_transformation>
    </transformation>

    You can save this snippet to a .ktr file and open in PDI Spoon.

    After this, you need a Job that executes this first KTR, and passes the result to another, in this second KTR you can access the variables in the Get File Names step like you would any other variable:

    C:\${year}\${month}\File