Search code examples
google-apps-scriptgoogle-sheetsautomationgoogle-drive-apigoogle-docs-api

Google App Script find text from Google Document next to key text


I have a PDF file saved in Google Drive. I want to find a piece of text in that file (e.g. USD), pick the value next to the found text (e.g. 167.1764), and insert it into my Google spreadsheet.

Below is the preview of my PDF File. Link to my PDF File. enter image description here

Here is the code I tried, but it fails to find the text and to reach the value that is next to it.

below is my code.

/**
 * Converts a PDF stored in Google Drive into a Google Doc via OCR and logs the
 * numeric value that follows the "USD" label in the extracted text.
 *
 * Requires the Advanced Drive Service to be enabled, because
 * `Drive.Files.insert` performs the OCR conversion.
 *
 * The full extracted text is logged first, followed by either the USD value
 * (as a string, e.g. "167.1764") or a "not found" message.
 *
 * @throws {Error} If '08-Sep-2021.pdf' does not exist in the folder.
 */
function extractTextFromPDF() {
  const folder = DriveApp.getFolderById('folderid');
  const candidates = folder.getFilesByName('08-Sep-2021.pdf');

  // Fail with a clear message instead of a TypeError on an undefined blob.
  if (!candidates.hasNext()) {
    throw new Error('File "08-Sep-2021.pdf" was not found in the folder.');
  }

  const blob = candidates.next().getBlob();
  const resource = {
    title: blob.getName(),
    mimeType: blob.getContentType(),
  };

  // Enable the Advanced Drive API Service
  const file = Drive.Files.insert(resource, blob, {ocr: true, ocrLanguage: 'en'});

  // Extract Text from PDF file
  const text = DocumentApp.openById(file.id).getBody().getText();
  Logger.log(text);

  // NOTE: the converted document stays in Drive; trash it once it is no longer
  // needed with DriveApp.getFileById(file.id).setTrashed(true).

  // The value sits after the "USD" label, separated by whitespace or a line
  // break. Matching on the plain text avoids walking document elements (the
  // previous element loop never advanced and therefore never terminated).
  const usdMatch = /\bUSD\b\s*([0-9]+(?:[.,][0-9]+)*)/.exec(text);
  if (usdMatch === null) {
    Logger.log('USD value not found');
    return;
  }

  Logger.log(usdMatch[1]);
}


Solution

  • With the help of RegEx you can extract this. I'm not the best with those patterns. But maybe somebody else can optimize so the split is not necessary. (here is a link).

    The code:

    /**
     * Converts a PDF stored in Google Drive into a temporary Google Doc, reads
     * the USD buying and selling rates from its text and prints them.
     *
     * Requires the Advanced Drive Service to be enabled, because
     * `Drive.Files.insert` performs the conversion. The temporary converted
     * file is always moved to the trash, even when extraction fails.
     *
     * @throws {Error} If '08-Sep-2021.pdf' does not exist in the folder, or if
     *     the USD rates cannot be located in the converted text.
     */
    function extractTextFromPDF() {
      const folder = DriveApp.getFolderById('1QVo_pxxx387WPH9Yx');
      const candidates = folder.getFilesByName('08-Sep-2021.pdf');

      // Fail with a clear message instead of a TypeError on an undefined blob.
      if (!candidates.hasNext()) {
        throw new Error('File "08-Sep-2021.pdf" was not found in the folder.');
      }

      const blob = candidates.next().getBlob();
      const resource = {
        title: blob.getName(),
        mimeType: blob.getContentType(),
      };

      // Enable the Advanced Drive API Service
      const file = Drive.Files.insert(resource, blob, {convert: true});

      try {
        // Extract Text from PDF file
        const text = DocumentApp.openById(file.id).getBody().getText();
        Logger.log(text);

        // Buying rate: the line right after "USD".
        const buyingMatch = /USD\n(.*?)$/gm.exec(text);
        // Selling rate: the line after the buying rate.
        const sellingMatch = /USD\n\s*\S*\n(.*?)$/gm.exec(text);
        if (buyingMatch === null || sellingMatch === null) {
          throw new Error('Could not find the USD rates in the converted document.');
        }

        console.log(buyingMatch[1].trim());
        console.log(sellingMatch[1].trim());
      } finally {
        // Remove the converted file, whether or not extraction succeeded.
        DriveApp.getFileById(file.id).setTrashed(true);
      }
    }