node.js file base64 powerpoint office-js

How do I extract text from an uploaded .pptx file in a web application?

I am currently working on a web application where users can upload .pptx files, and I need to extract all of the text from the uploaded file in the backend. I have the frontend part implemented where users can input their .pptx file, but I am struggling with how to process and extract text from this file in the backend.

Simplified frontend markup:

<!-- Upload form: a file input restricted to .pdf/.pptx plus a submit button that starts hidden. -->
<div id="uploadBox">
    <form id="uploadForm">
        <div id="dragText">Drag & Drop your file here</div>
        <div id="orText">or</div>
        <div class="input-group" id="selectButton">
            <input
                accept=".pdf, .pptx"
                type="file"
                id="input"
                name="file"
                on:change={() => {
                    // Simplified: the first selected file is read here but not used further in this snippet.
                    const fileInput = document.getElementById("input");
                    const selectedFile = fileInput.files[0];
                }}
            />
            <label for="input">Select file from your device</label>
        </div>
        <div id="fileNameDisplay"></div>
        <button id="submitButton" type="submit" style="display: none;">Begin</button>
    </form>
</div>

Frontend code that sends the .pptx file to the backend after encoding it as a base64 string:

        // Read the selected file into memory and base64-encode it so it can travel inside a JSON body.
        // `arrayBufferToBase64` is a helper defined elsewhere (not shown in this snippet).
        const fileArrayBuffer = await file.arrayBuffer();
        const base64File = arrayBufferToBase64(fileArrayBuffer);

        // POST the encoded file to the backend, authenticating with the token kept in localStorage.
        const response = await fetch(`${apiUrl}/api/getTextChunks`, {
          method: 'POST',
          headers: {
            Authorization: `Bearer ${localStorage.getItem('token')}`,
            'Content-Type': 'application/json'
          },
          body: JSON.stringify({ 
            file: base64File, 
            // Any MIME type other than application/pdf is reported to the backend as 'pptx'.
            fileType: fileType === "application/pdf" ? 'pdf' : 'pptx'
          }),
        });

However, I have no idea how to extract the text from the base64-encoded file once it reaches the backend (I'm using a Node.js backend).

I tried this (code doesn't work but logic makes sense):

import pdfjs from "pdfjs-dist-legacy";
import count from 'openai-gpt-token-counter';
import path from 'path';

pdfjs.GlobalWorkerOptions.workerSrc = path.join(process.cwd(), '/server/node_modules/pdfjs-dist-legacy/pdf.worker.js');

/**
 * Non-working attempt from the question: loads a presentation and walks
 * slides -> shapes -> paragraphs -> runs, reading each run's text.
 *
 * NOTE(review): `PPTX` is not imported in the visible snippet, so any reference
 * to it would throw a ReferenceError unless it is defined elsewhere.
 *
 * @param {*} pptxBuffer - Presentation data handed to `ppt.load` (presumably the
 *   raw .pptx bytes; confirm against the library actually used).
 * @returns {Promise<string|undefined>} The collected text, or `undefined` when
 *   constructing `PPTX.Composer` throws.
 */
async function getTextFromPPTX(pptxBuffer) {
    let powerpointText = '';
    // NOTE(review): `textChunks` is declared but never used in this function.
    const textChunks = [];
  
    // This reference sits outside the try block below, so a failure here is not caught.
    console.log('PPTX object:', PPTX);

    let ppt;
    try {
        ppt = new PPTX.Composer();
    } catch (e) {
        console.error('Error while creating new PPTX:', e);
        return;
    }
  
    // NOTE(review): `load`, `getSlides` and `getShapes` are assumed library methods; confirm they exist.
    await ppt.load(pptxBuffer);
  
    const slides = ppt.getSlides();
  
    slides.forEach((slide) => {
      const shapes = slide.getShapes();
  
      shapes.forEach((shape) => {
        // Only shapes that carry a text body with paragraphs are inspected.
        if (shape.textBody && shape.textBody.paragraphs) {
          shape.textBody.paragraphs.forEach((paragraph) => {
            paragraph.runs.forEach((run) => {
              const text = run.text;
              // NOTE(review): this assigns rather than appends, so only the last run's text is kept.
              powerpointText = text;
            });
          });
        }
      });
    });
  
    return powerpointText;
  }

export default getTextFromPPTX;

How should I approach this?

Solution

Import jszip and xmldom.
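Get the PPTX buffer from your PPTX file (if the file arrives base64-encoded, as in the question, decode it first with `Buffer.from(base64File, 'base64')`).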
Load the PPTX file as a ZIP archive.
Define the XML namespace for accessing text nodes in PowerPoint slides.
Create a variable, text, to accumulate extracted text.
Iterate through each slide, parse its XML, extract text, and concatenate it to text.
Return the concatenated text.

import JSZip from 'jszip';
import { DOMParser } from 'xmldom';

/**
 * Collects the text content of every descendant element matching a namespaced tag.
 *
 * @param {object} node - DOM document or element exposing `getElementsByTagNameNS`.
 * @param {string} tagName - Local name of the elements to collect (e.g. "t").
 * @param {string} namespaceURI - Namespace URI the elements must belong to.
 * @returns {string} Text contents separated by single spaces, trimmed at both ends.
 */
function getTextFromNodes(node, tagName, namespaceURI) {
  const matches = node.getElementsByTagNameNS(namespaceURI, tagName);
  const pieces = [];
  for (let idx = 0; idx < matches.length; idx += 1) {
    // Each entry carries its own trailing separator; the final trim removes the last one.
    pieces.push(`${matches[idx].textContent} `);
  }
  return pieces.join('').trim();
}

/**
 * Extracts the text of every slide in a .pptx file.
 *
 * The file is opened as a ZIP archive, each `ppt/slides/slideN.xml` entry is
 * parsed as XML, and the contents of its DrawingML `<a:t>` elements are
 * concatenated.
 *
 * Slide entries are discovered by pattern rather than by probing slide1,
 * slide2, ... in sequence, so a gap in the numbering (for example slide1.xml
 * followed by slide3.xml) no longer cuts the extraction short.
 *
 * @param {ArrayBuffer|Uint8Array|Buffer} arrayBuffer - Raw bytes of the .pptx
 *   file, in any binary form accepted by `JSZip#loadAsync`.
 * @returns {Promise<string>} Text of all slides, ordered by slide file number
 *   and separated by spaces. Returns an empty string if extraction fails; the
 *   error is logged rather than thrown.
 */
async function getTextFromPPTX(arrayBuffer) {
  try {
    const zip = new JSZip();
    await zip.loadAsync(arrayBuffer);

    const aNamespace = "http://schemas.openxmlformats.org/drawingml/2006/main";
    const slidePathPattern = /^ppt\/slides\/slide(\d+)\.xml$/;

    // Sort numerically on the captured slide number; a plain string sort
    // would place slide10.xml before slide2.xml.
    const slideFiles = zip
      .file(slidePathPattern)
      .map((file) => ({
        file,
        slideNumber: Number(file.name.match(slidePathPattern)[1]),
      }))
      .sort((a, b) => a.slideNumber - b.slideNumber);

    // One parser instance is enough for every slide.
    const parser = new DOMParser();

    // Promise.all preserves input order, so the result stays in slide order.
    const slideTexts = await Promise.all(
      slideFiles.map(async ({ file }) => {
        const slideXmlStr = await file.async('text');
        const xmlDoc = parser.parseFromString(slideXmlStr, 'application/xml');
        return getTextFromNodes(xmlDoc, "t", aNamespace);
      })
    );

    return slideTexts.join(' ').trim();
  } catch (err) {
    // Best-effort contract kept from the original: log and return an empty string.
    console.error('Error extracting text from PPTX:', err);
    return '';
  }
}