Search code examples
Tags: node.js, reactjs, amazon-web-services, machine-learning, amazon-textract

Problem with text extraction from pdf using nodejs server and reactjs webpage


Below is my code for textractUtils.js:

const _ = require("lodash");
const aws = require("aws-sdk");
const config = require("./config");

// Apply the credentials and region from the local config module to the
// SDK's global configuration before any service client is created.
const { awsAccesskeyID, awsSecretAccessKey, awsRegion } = config;

aws.config.update({
  accessKeyId: awsAccesskeyID,
  secretAccessKey: awsSecretAccessKey,
  region: awsRegion
});

// Single Textract client shared by every call made from this module.
const textract = new aws.Textract();

/**
 * Builds the text of a block by concatenating the text of its CHILD blocks.
 *
 * WORD children contribute their `Text`; SELECTION_ELEMENT children whose
 * `SelectionStatus` is "SELECTED" contribute an "X". Pieces are separated by
 * a single space and the result is trimmed.
 *
 * @param {object} [result] - Block whose `Relationships` are walked. May be
 *   undefined/null (e.g. when no value block was found for a key).
 * @param {object} blocksMap - Lookup table of block Id -> block.
 * @returns {string} The assembled text, or "" when there is nothing to read.
 */
const getText = (result, blocksMap) => {
  const relationships = result == null ? undefined : result.Relationships;

  // A missing or null `Relationships` means the block has no children.
  if (!Array.isArray(relationships)) {
    return "";
  }

  let text = "";

  relationships.forEach(relationship => {
    if (relationship.Type !== "CHILD" || !Array.isArray(relationship.Ids)) {
      return;
    }

    relationship.Ids.forEach(childId => {
      const child = blocksMap[childId];

      // A child Id that is not in the lookup table is skipped instead of
      // crashing on `undefined.BlockType`.
      if (!child) {
        return;
      }

      if (child.BlockType === "WORD") {
        text += `${child.Text} `;
      } else if (
        child.BlockType === "SELECTION_ELEMENT" &&
        child.SelectionStatus === "SELECTED"
      ) {
        text += "X ";
      }
    });
  });

  return text.trim();
};

/**
 * Finds the value block that a key block points to.
 *
 * Walks the key block's VALUE relationships and returns the first referenced
 * block that exists in `valueMap`.
 *
 * @param {object} keyBlock - Key block whose `Relationships` are searched.
 * @param {object} valueMap - Lookup table of value-block Id -> block.
 * @returns {object|undefined} The matching value block, or undefined when the
 *   key block has no VALUE relationship that resolves in `valueMap`.
 */
const findValueBlock = (keyBlock, valueMap) => {
  const relationships = keyBlock == null ? undefined : keyBlock.Relationships;

  if (!Array.isArray(relationships) || valueMap == null) {
    return undefined;
  }

  const isKnownValueId = valueId =>
    Object.prototype.hasOwnProperty.call(valueMap, valueId);

  for (const relationship of relationships) {
    if (relationship.Type !== "VALUE" || !Array.isArray(relationship.Ids)) {
      continue;
    }

    // `find` inspects every Id until one matches. The previous `every`
    // callback returned undefined (falsy) on a non-match, which stopped the
    // scan after the first Id.
    const matchedId = relationship.Ids.find(isKnownValueId);
    if (matchedId !== undefined) {
      return valueMap[matchedId];
    }
  }

  return undefined;
};

/**
 * Produces a plain object mapping each key block's text to the text of the
 * value block it is related to.
 *
 * @param {object} keyMap - Key blocks indexed by Id.
 * @param {object} valueMap - Value blocks indexed by Id.
 * @param {object} blockMap - Every block indexed by Id, used to resolve text.
 * @returns {object} Key text -> value text. When two key blocks yield the
 *   same text, the later one overwrites the earlier one.
 */
const getKeyValueRelationship = (keyMap, valueMap, blockMap) => {
  const pairs = {};

  for (const keyBlock of _.values(keyMap)) {
    const matchedValue = findValueBlock(keyBlock, valueMap);
    const label = getText(keyBlock, blockMap);
    pairs[label] = getText(matchedValue, blockMap);
  }

  return pairs;
};

/**
 * Indexes a list of blocks by Id and separates the KEY_VALUE_SET blocks into
 * key blocks and value blocks.
 *
 * @param {object[]} blocks - Blocks to index.
 * @returns {{keyMap: object, valueMap: object, blockMap: object}} Three
 *   Id-indexed lookups: `keyMap` holds KEY_VALUE_SET blocks whose
 *   `EntityTypes` includes "KEY", `valueMap` holds the remaining
 *   KEY_VALUE_SET blocks, and `blockMap` holds every block.
 */
const getKeyValueMap = blocks => {
  const keyMap = {};
  const valueMap = {};
  const blockMap = {};

  blocks.forEach(block => {
    const { Id: id } = block;
    blockMap[id] = block;

    if (block.BlockType !== "KEY_VALUE_SET") {
      return;
    }

    // Any KEY_VALUE_SET block that is not tagged "KEY" is filed as a value.
    const bucket = _.includes(block.EntityTypes, "KEY") ? keyMap : valueMap;
    bucket[id] = block;
  });

  return { keyMap, valueMap, blockMap };
};

module.exports = async buffer => {
  const params = {
    Document: {
      /* required */
      Bytes: buffer
    },
    FeatureTypes: ["FORMS"]
  };

  const request = textract.analyzeDocument(params);
  const data = await request.promise();

  if (data && data.Blocks) {
    const { keyMap, valueMap, blockMap } = getKeyValueMap(data.Blocks);
    const keyValues = getKeyValueRelationship(keyMap, valueMap, blockMap);

    return keyValues;
  }

  // in case no blocks are found return undefined
  return undefined;
};

It works fine with images but not with PDFs (neither single-page nor multi-page). The following is the error I get when I run it with a PDF:

(node:2001) UnhandledPromiseRejectionWarning: UnsupportedDocumentException: Request has unsupported document format
    at Request.extractError (/home/<user>/textract-lab/node_modules/aws-sdk/lib/protocol/json.js:51:27)
    at Request.callListeners (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:106:20)
    at Request.emit (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:78:10)
    at Request.emit (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:683:14)
    at Request.transition (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:22:10)
    at AcceptorStateMachine.runTo (/home/<user>/textract-lab/node_modules/aws-sdk/lib/state_machine.js:14:12)
    at /home/<user>/textract-lab/node_modules/aws-sdk/lib/state_machine.js:26:10
    at Request.<anonymous> (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:38:9)
    at Request.<anonymous> (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:685:12)
    at Request.callListeners (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:116:18)
(node:2001) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). (rejection id: 1)
(node:2001) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.

I have tried an image without text, an image containing text, an image containing a table, a single-page PDF, and a multi-page PDF. I also have a conceptual doubt: since I have already imported the aws-sdk, why should I write extra code for PDFs, given that the aws-sdk for Textract handles documents in PDF, PNG, JPEG, and JPG formats? What changes do I have to make to textractUtils.js for it to process PDF files as well?


Solution

  • The AnalyzeDocument API operation only supports images in PNG or JPEG format. From the Textract documentation:

    Amazon Textract synchronous operations (DetectDocumentText and AnalyzeDocument) support the PNG and JPEG image formats. Asynchronous operations (StartDocumentTextDetection, StartDocumentAnalysis) also support the PDF file format.

    You should use the asynchronous operations to process your PDF documents. Alternatively, as a workaround, you can convert the PDF document into images in your code and then process those images with the synchronous API operations.