Search code examples
xmlxpathpentaho

Get XPath for all td in a table using Pentaho


Is there any way to use Pentaho to parse the td elements of a table in an HTML page? Let's say I have this HTML content:

<html>
  <body>
    <table>
      <tr>
        <td>info1</td>
        <td>info2</td>
      </tr>
      <tr>
        <td>info3</td>
        <td>info4</td>
      </tr>
    </table>
  </body>
</html>
In Pentaho I am using the "Get data from XML" step with the following settings:
Content::
Loop XPath: /html/body/table/tr
Fields::
Name: tableData
XPath: td
The data information I would like to have is
info1 info2 info3 info4
in any kind of way.
Any help would be truly appreciated!


Solution

  • I solved it by reading my file row by row. Then I added a Pentaho "User Defined Java Class" step that uses XSLT to transform the table content into a new XML file. From that XML I was able to get the data needed to complete the task.
    Here is what I wrote in "User Defined Java Class":

    
    import java.util.*;
    import java.io.FileOutputStream;
    
    import javax.xml.transform.Transformer;
    import javax.xml.transform.TransformerFactory;
    
    // Positions of the three file-name fields within the input row.
    // processRow resolves each one via getInputRowMeta().indexOfValue(...),
    // using the step parameter with the matching name.
    private int infilenameIndex;    // field holding the path of the XML input file
    private int xsltfilenameIndex;  // field holding the path of the XSLT stylesheet
    private int outfilenameIndex;   // field holding the path of the output file
    
    
    /**
     * Runs one XSLT transformation per incoming row and passes the row on.
     *
     * Each row supplies three values: the XML input file, the XSLT stylesheet and
     * the output file. The input fields holding them are named by the step
     * parameters 'infilename', 'xsltfilename' and 'outfilename'; a parameter that
     * is not set falls back to the input field of the same name.
     *
     * @param smi step metadata handed in by Kettle (not used here)
     * @param sdi step data handed in by Kettle (not used here)
     * @return true after a row has been processed, false once there are no more rows
     * @throws KettleException if a required field is missing from the input row
     *                         or the transformation fails
     */
    public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
      Object[] r = getRow();
      if (r == null) {
        // No more input: tell downstream steps that this step is finished.
        setOutputDone();
        return false;
      }

      // One-time setup on the first row. The previous check was "first == false",
      // which is inverted from the usual Kettle idiom (the step's "first" flag
      // starts out true), so the field validation below never ran.
      if (first) {
        first = false;
        infilenameIndex = findFieldIndex("infilename");
        xsltfilenameIndex = findFieldIndex("xsltfilename");
        outfilenameIndex = findFieldIndex("outfilename");
      }

      // Read the values through the indices validated above, so the fields that
      // were checked are the fields that are actually used.
      String infilename = getInputRowMeta().getString(r, infilenameIndex);
      String xsltfilename = getInputRowMeta().getString(r, xsltfilenameIndex);
      String outfilename = getInputRowMeta().getString(r, outfilenameIndex);

      // Widen the row to the output layout; the incoming values are kept as they are.
      Object[] outputRowData = RowDataUtil.resizeArray(r, data.outputRowMeta.size());

      transform(infilename, xsltfilename, outfilename);

      putRow(data.outputRowMeta, outputRowData);

      return true;
    }

    /**
     * Finds the position of the input field selected by a step parameter.
     *
     * @param parameterName name of the step parameter that holds the field name;
     *                      also used as the field name when the parameter is not set
     * @return index of the field in the input row
     * @throws KettleException if the input row has no such field
     */
    private int findFieldIndex(String parameterName) throws KettleException {
      String fieldName = getParameter(parameterName);
      if (fieldName == null || fieldName.length() == 0) {
        // Parameter not configured: fall back to a field literally named like the
        // parameter, which is what the row values were read from before.
        fieldName = parameterName;
      }
      int index = getInputRowMeta().indexOfValue(fieldName);
      if (index < 0) {
        throw new KettleException("Field not found in the input row, check parameter '" + parameterName + "'!");
      }
      return index;
    }
    /**
     * Applies an XSLT stylesheet to an XML file and writes the result to a file.
     *
     * The three names are handed to the JAXP stream classes as system ids. Those
     * constructors only record the name, so a file that does not exist or cannot
     * be read is reported when the transformation itself runs, not when the
     * stream objects are created.
     *
     * @param infilename   XML document to transform
     * @param xsltfilename XSLT stylesheet to apply
     * @param outfilename  file that receives the transformation result
     * @throws KettleException if a name is null or empty, or if compiling the
     *                         stylesheet or running the transformation fails
     */
    public void transform(String infilename, String xsltfilename, String outfilename) throws KettleException {

        logBasic("");
        logBasic("Transformerar " +  infilename + " med " + xsltfilename + " till " + outfilename );
        logBasic("");

        // Fail early with a specific message when a name is missing. The former
        // try/catch blocks around the stream constructors could never trigger,
        // so these diagnostics were never emitted.
        if (infilename == null || infilename.length() == 0) {
           logError("Infil saknas " +  infilename);
           throw new KettleException("Infil saknas " + infilename);
        }
        if (xsltfilename == null || xsltfilename.length() == 0) {
           logError("Xsltfil saknas " +  xsltfilename);
           throw new KettleException("Xsltfil saknas " + xsltfilename);
        }
        if (outfilename == null || outfilename.length() == 0) {
           logError("Outfil saknas " +  outfilename);
           throw new KettleException("Outfil saknas " + outfilename);
        }

        javax.xml.transform.stream.StreamSource inss = new javax.xml.transform.stream.StreamSource(infilename);
        javax.xml.transform.stream.StreamSource xsltss = new javax.xml.transform.stream.StreamSource(xsltfilename);
        javax.xml.transform.stream.StreamResult outss = new javax.xml.transform.stream.StreamResult(outfilename);

        try {
            // Uses whichever XSLT implementation JAXP finds by default. To force
            // Saxon, create a net.sf.saxon.TransformerFactoryImpl here instead.
            TransformerFactory tFactory = TransformerFactory.newInstance();

            // Compile the stylesheet, then run the transformation.
            Transformer transformer = tFactory.newTransformer(xsltss);
            transformer.transform(inss, outss);
        }
        catch (Exception e) {
           // Keep the cause and say which files were involved, so the failing row
           // can be identified from the error alone.
           throw new KettleException("Could not transform " + infilename + " with " + xsltfilename + " to " + outfilename, e);
        }
    }