Search code examples
hadoop, hive, oozie, oozie-coordinator

Oozie coordinator for historic dates


I want to run an Oozie coordinator for historic dates and pass the date as a parameter to a script in the workflow. How do I do that?

Can I set the start date to a date in the past? Will it catch up? And what frequency should I use?


Solution

  • Yes, when you submit a coordinator with a start date in the past, it catches up. It starts executing immediately, so setting concurrency=1 protects your cluster from heavy load. You can also set execution=LIFO if you want to process the newest files first. For more information, see http://oozie.apache.org/docs/3.3.2/CoordinatorFunctionalSpec.html

    I'm posting a modified sample from the answer to "How to schedule a sqoop action using oozie".

    Create a coordinator.xml file:

    <coordinator-app name="sample-coord"
                     xmlns="uri:oozie:coordinator:0.2"
                     frequency="${coord:days(7)}"
                     start="${start}"
                     end="${end}"
                     timezone="America/New_York">

        <!-- Limit the job to one running action at a time; the timeout value
             is supplied through the job properties. -->
        <controls>
            <timeout>${timeout}</timeout>
            <concurrency>1</concurrency>
        </controls>

        <!-- One dataset instance every 7 days, counted from the coordinator
             start, stored under data_path/YEAR/MONTH/DAY. -->
        <datasets>
            <dataset name="data"
                     frequency="${coord:days(7)}"
                     initial-instance="${start}"
                     timezone="America/New_York">
                <uri-template>${data_path}/${YEAR}/${MONTH}/${DAY}</uri-template>
                <!-- Empty done-flag: per the Oozie coordinator spec, the
                     directory's existence marks the instance as available. -->
                <done-flag/>
            </dataset>
        </datasets>

        <!-- Each action depends on the dataset instance selected by current(0). -->
        <input-events>
            <data-in name="data_in" dataset="data">
                <instance>${coord:current(0)}</instance>
            </data-in>
        </input-events>

        <!-- Launch the workflow and hand it the resolved input location
             as the "input" property. -->
        <action>
            <workflow>
                <app-path>${wf_application_path}</app-path>
                <configuration>
                    <property>
                        <name>input</name>
                        <value>${coord:dataIn('data_in')}</value>
                    </property>
                </configuration>
            </workflow>
        </action>
    </coordinator-app>
    

    Specify all properties used in the above file in coordinator.properties:

    # NameNode host and the HDFS URI derived from it.
    host=namenode01
    nameNode=hdfs://${host}:8020
    
    # HDFS directory of the deployed application; the coordinator application
    # path is set to the same location.
    wf_application_path=${nameNode}/oozie/deployments/example
    oozie.coord.application.path=${wf_application_path}
    
    # Root directory substituted into the dataset uri-template in coordinator.xml.
    data_path=${nameNode}/data
    
    # Values for the start/end attributes and the <timeout> control in
    # coordinator.xml. The trailing Z marks the timestamps as UTC.
    start=2013-08-01T01:00Z
    end=2013-08-19T23:59Z
    timeout=10
    

    Upload your coordinator.xml file to HDFS and then submit your coordinator job with something like:

    # Submit and start the coordinator using the properties file. The Oozie
    # server URL is taken from the OOZIE_URL environment variable unless
    # -oozie <url> is passed.
    oozie job -config coordinator.properties -run