Search code examples
google-bigqueryudf

BigQuery UDF reducing all rows


I have the following UDF defined (note my table had an 'Id' and a 'Reading' object with subfield 'RawHex'):

// UDF definition
function hexdecode(row, emit) {
  emit({
    Id: row.Id,
    converted: decodeHelper(row.Reading.Raw)
  });
}

// Helper function with error handling
function decodeHelper(s) {
  try {
    return parseInt(s, 16);
  } catch (ex) {
    return s;
  }
}

// UDF registration
bigquery.defineFunction(
  'hexdecode',  // Name used to call the function from SQL

  ['Id', 'Reading.Raw'],  // Input column names

  // JSON representation of the output schema
  [{name: 'Id', type: 'STRING'},
   {name: 'converted', type: 'INTEGER'}],

  hexdecode  // The function reference
);

and my query is:

select Id, converted from 
hexdecode(
select r.Id, r.Reading.Raw from Example.TagRaw2 r
)

Generated from uploading the following JSON to the table Example.TagRaw2

{"Id":"ABC","Reading":{"Raw":"0004"}}
{"Id":"CDE","Reading":{"Raw":"000b"}}

I expected this to convert my column from Hex to Integer, but instead if returns a single entry that make almost no sense.

Any idea what I am doing wrong?

EDIT: I added an example JSON to upload to try and reproduce the problem.. but it now seems to work. I will update again if I can figure out what was originally wrong, but the UDF above seems to do exactly what I want.


Solution

  • Question - is your Reading object a repeated column, by any chance?

    If so, you'll need to do something like this:

    function hexdecode(r, emit) {
      for (var i = 0; i < r.reading.length; ++i) {
        emit({ tag: r.Id, num: parseInt(r.reading[i].Raw, 16) });
      }
    }
    
    bigquery.defineFunction(
      'hexdecode',
      ['Id', 'reading.Raw'],
      [{name: 'tag', type: 'string'},
       {name: 'num', type: 'integer'}],
      hexdecode
    );
    

    If you try to access r.reading.Raw, this will be trying to access the Raw property of a JavaScript array. This is perfectly valid JavaScript, but it's certainly not what you want since the value is undefined (null).

    One handy trick is to print out the JSON string of the input record to see what the UDF is seeing as input:

    bigquery.defineFunction(
      'jsonifyObj',
      ['Id', 'reading.Raw'],
      [{name: 'obj', type: 'string'}],
      function(r, emit) { emit({obj: JSON.stringify(r)}); }
    );
    

    This can help to debug problems. I suspect the issue in this case is that your data looks more like

    [{"Id":"ABC","Reading":[{"Raw":"0004"}, {"Raw": "00ff"}]},
     {"Id":"CDE","Reading":[{"Raw":"000b"}, {"Raw": "0012"}]}]
    

    -------- Update 2015-11-17 --------

    Your code has a few problems, commented NOTE below:

    function hexdecode(row, emit) {
     for (var i = 0; i < row.reading.length; ++i) {
       // NOTE: tag and row.Id are wrong, this must be TagId and row.TagId based on your input and output specifications
       emit({ tag: row.Id,
              times: row.reading[i].Timestamp,
              // NOTE: You're making a recursive call here!  You should be calling decodeHelper() not hexdecode().
              convert: hexdecode(row.reading[i].RawCO) });
      }
    }
    
    
    // Helper function with error handling
    function decodeHelper(s) {
      try {
        return parseInt(s, 16);
      } catch (ex) {
        return s;
      }
    }
    
    // UDF registration
    bigquery.defineFunction(
      'hexdecode',  // Name used to call the function from SQL
    
      ['TagId', 'reading.Timestamp', 'reading.RawCO'],  // Input column names
    
      // JSON representation of the output schema
      [{name: 'TagId', type: 'STRING'},
       {name: 'times', type: 'INTEGER'},
       {name: 'convert', type: 'INTEGER'}],
    
      hexdecode  // The function reference
    );
    

    Your nested select returns 0 rows, so I updated to the following SQL:

    select 
      TagID, times, convert 
    from  hexdecode(
    select r.TagId, r.Reading.Timestamp, r.Reading.RawCO from [table.Readings] r where r.Reading.RawCO is not NULL and r.Reading.PPM is
     not NULL
     and r.TagId = 'Tag_00000000' 
     )
    

    And here is the corrected code:

    function hexdecode(row, emit) {
      for (var i = 0; i < row.reading.length; ++i) {
        emit({TagId: row.TagId, times: row.reading[i].Timestamp, convert: decodeHelper(row.reading[i].RawCO)});
      }
    }
    
    // Helper function with error handling
    function decodeHelper(s) {
      try {
        return parseInt(s, 16);
      } catch (ex) {
        return s;
      }
    }
    
    // UDF registration
    bigquery.defineFunction(
      'hexdecode',  // Name used to call the function from SQL
    
      ['TagId', 'reading.Timestamp', 'reading.RawCO'],  // Input column names
    
      // JSON representation of the output schema
      [{name: 'TagId', type: 'STRING'},
       {name: 'times', type: 'INTEGER'},
       {name: 'convert', type: 'INTEGER'}],
    
      hexdecode  // The function reference
    );