Search code examples
pmml

Is it possible to add transformation code to pmml?


I run a linear regression code and generated a pmml. In my pmml code there are data types and regression table info. I have a pmml code like that

 <?xml version="1.0" encoding="utf-8" standalone="yes"?>
  <PMML version="4.2"
   xmlns="http://www.dmg.org/PMML-4_2"
   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <Header>
    <Application name="SAS(r)" version="9.4"/>
    <Timestamp>2016-05-27 09:58:01</Timestamp>
    </Header>
    <DataDictionary numberOfFields="10">
      <DataField name="Invoice" optype="continuous" dataType="double"/>
      <DataField name="ZINT_Cylinders" optype="continuous" dataType="double"/>
      <DataField name="ZINT_EngineSize" optype="continuous" dataType="double"/>
    </DataDictionary>
    <TransformationDictionary/>
    <RegressionModel functionName="regression" targetFieldName="Invoice" normalizationMethod="none">
      <MiningSchema>
        <MiningField name="ZINT_Cylinders" usageType="active" optype="continuous"/>
        <MiningField name="ZINT_EngineSize" usageType="active" optype="continuous"/>
      </MiningSchema>
      <Output>
        <OutputField name="P_Invoice" displayName="Predicted: Invoice" optype="continuous" dataType="double" targetField="Invoice" feature="predictedValue"/>
      </Output>
      <Targets>
        <Target field="Invoice" optype="continuous">
          <TargetValue defaultValue="30014.700935"/>
        </Target>
      </Targets>
      <LocalTransformations/>
      <RegressionTable intercept="-4919.70174">
        <NumericPredictor name="ZINT_Cylinders" coefficient="-0.007378626"/>
        <NumericPredictor name="ZINT_EngineSize" coefficient="-0.147331595"/>
      </RegressionTable>
    </RegressionModel>
  </PMML>

Before the regression step, I want to transform the my source data. For example I want to add these case when structure to my pmml. Is it possible to do this?

SELECT Invoice,
    CASE
        WHEN EngineSize < 2.9 THEN 20304.5142857143
        WHEN EngineSize < 4.1 THEN 30378.8789808917
        WHEN EngineSize >= 4.1 THEN 47119.9791666667
        ELSE 30014.7009345794 
    END AS ZINT_EngineSize,
    CASE
        WHEN Cylinders < 4.5 THEN 18349.4452554745
        WHEN Cylinders < 7 THEN 29472.3819095477
        WHEN Cylinders >= 7 THEN 48558.847826087
        ELSE 30014.7009345794 
    END AS ZINT_Cylinders
FROM MYSOURCE.MYTABLE;

Solution

  • You can use the Discretize transformation for the job:

    <DerivedField name="ZINT_EngineSize" dataType="double" optype="continuous">
      <Discretize field="EngineSize">
        <DiscretizeBin binValue="20304.5142857143">
          <!-- EngineSize < 2.9 -->
          <Interval closure="openOpen" rightMargin="2.9"/>
        </DiscretizeBin>
        <DiscretizeBin binValue="30378.8789808917">
          <!-- EngineSize >= 2.9 && EngineSize < 4.1 -->
          <Interval closure="closedOpen" leftMargin="2.9" rightMargin="4.1"/>
        </DiscretizeBin>
        <DiscretizeBin binValue="47119.9791666667">
          <!-- EngineSize >= 4.1 -->
          <Interval closure="closedOpen" leftMargin="4.1"/>
        </DiscretizeBin>
      </Discretize>
    </DerivedField>
    

    You could build a parser for SAS script, and automate the generation and embedding of Discretize elements using the JPMML-Model library.