Search code examples
Tags: node.js, file, base64, powerpoint, office-js

How do I extract text from an uploaded .pptx file in a web application?


I am currently working on a web application where users can upload .pptx files, and I need to extract all of the text from the uploaded file in the backend. I have the frontend part implemented where users can input their .pptx file, but I am struggling with how to process and extract text from this file in the backend.

Simplified frontend markup:

<!--
  Upload form markup: a file input limited to .pdf and .pptx, a label that acts
  as the picker button, an empty #fileNameDisplay placeholder, and a submit
  button that is initially hidden (display: none).
  The on:change handler only reads the first selected file into a local
  `selectedFile`; nothing else is done with it in this snippet.
-->
<div id="uploadBox">
    <form id="uploadForm">
        <div id="dragText">Drag & Drop your file here</div>
        <div id="orText">or</div>
        <div class="input-group" id="selectButton">
            <input
                accept=".pdf, .pptx"
                type="file"
                id="input"
                name="file"
                on:change={() => {
                    const fileInput = document.getElementById("input");
                    const selectedFile = fileInput.files[0];
                }}
            />
            <label for="input">Select file from your device</label>
        </div>
        <div id="fileNameDisplay"></div>
        <button id="submitButton" type="submit" style="display: none;">Begin</button>
    </form>
</div>

Frontend code that converts the .pptx file to a base64 string (base64File) and sends it to the backend:

        // Read the selected file fully into memory, then base64-encode it
        // (arrayBufferToBase64 is a helper defined outside this snippet) so the
        // binary content can be carried inside a JSON request body.
        const fileArrayBuffer = await file.arrayBuffer();
        const base64File = arrayBufferToBase64(fileArrayBuffer);

        // Build the request piece by piece: endpoint, auth/JSON headers, payload.
        const endpoint = `${apiUrl}/api/getTextChunks`;
        const requestHeaders = {
          Authorization: `Bearer ${localStorage.getItem('token')}`,
          'Content-Type': 'application/json'
        };

        // Any MIME type other than PDF is reported to the backend as 'pptx'.
        let declaredType;
        if (fileType === "application/pdf") {
          declaredType = 'pdf';
        } else {
          declaredType = 'pptx';
        }
        const requestBody = JSON.stringify({
          file: base64File,
          fileType: declaredType
        });

        const response = await fetch(endpoint, {
          method: 'POST',
          headers: requestHeaders,
          body: requestBody,
        });

However, I have no idea how I would get the text from the base64file I send to the backend (I'm using a Node.js backend).

I tried this (code doesn't work but logic makes sense):

import pdfjs from "pdfjs-dist-legacy";
import count from 'openai-gpt-token-counter';
import path from 'path';

// Tell pdf.js where its worker script lives. The path is built from the
// process's current working directory, so it depends on where the server
// process is started from.
pdfjs.GlobalWorkerOptions.workerSrc = path.join(process.cwd(), '/server/node_modules/pdfjs-dist-legacy/pdf.worker.js');

/**
 * Collects the text of every run in every shape of every slide of a
 * presentation.
 *
 * Runs that belong to the same paragraph are concatenated directly (a run is
 * a slice of one paragraph, so no separator is inserted between them);
 * paragraphs are separated by a newline.
 *
 * NOTE(review): `PPTX` is not imported anywhere in the visible file; it must
 * be brought into scope by the surrounding module for this to work. The
 * Composer / getSlides / getShapes / textBody API used here is taken as-is
 * from the original code and has not been checked against a real library.
 *
 * @param {*} pptxBuffer - Presentation data, passed unchanged to `ppt.load`.
 * @returns {Promise<string|undefined>} The extracted text, or `undefined`
 *   when the `PPTX.Composer` instance could not be created (the error is
 *   logged).
 */
async function getTextFromPPTX(pptxBuffer) {
  let ppt;
  try {
    ppt = new PPTX.Composer();
  } catch (e) {
    console.error('Error while creating new PPTX:', e);
    return;
  }

  await ppt.load(pptxBuffer);

  // One entry per paragraph. Previously each run overwrote a single string,
  // so only the very last run of the deck was ever returned.
  const paragraphTexts = [];

  ppt.getSlides().forEach((slide) => {
    slide.getShapes().forEach((shape) => {
      if (!shape.textBody || !shape.textBody.paragraphs) {
        return;
      }

      shape.textBody.paragraphs.forEach((paragraph) => {
        // A paragraph without a `runs` list contributes an empty line
        // instead of throwing.
        const runs = paragraph.runs || [];
        paragraphTexts.push(runs.map((run) => run.text).join(''));
      });
    });
  });

  return paragraphTexts.join('\n');
}

export default getTextFromPPTX;

How should I approach this?


Solution

    1. Import jszip and xmldom.
    2. Get the PPTX Buffer from your PPTX file.
    3. Load the PPTX file as a ZIP archive.
    4. Define the XML namespace for accessing text nodes in PowerPoint slides.
    5. Create a variable, text, to accumulate extracted text.
    6. Iterate through each slide, parse its XML, extract text, and concatenate it to text.
    7. Return the concatenated text.
    import JSZip from 'jszip';
    import { DOMParser } from 'xmldom';
    
    /**
     * Gathers the text content of all descendant elements of `node` that
     * match a namespaced tag.
     *
     * Each match contributes its `textContent` followed by a single space;
     * the combined string is trimmed before it is returned.
     *
     * @param {object} node - Document or element exposing `getElementsByTagNameNS`.
     * @param {string} tagName - Local tag name to look up.
     * @param {string} namespaceURI - Namespace the tag belongs to.
     * @returns {string} Trimmed, space-separated text of the matching elements.
     */
    function getTextFromNodes(node, tagName, namespaceURI) {
      const matches = node.getElementsByTagNameNS(namespaceURI, tagName);
      const pieces = Array.from(matches, (element) => `${element.textContent} `);
      return pieces.join('').trim();
    }
    
    /**
     * Extracts the text of every slide in a .pptx archive.
     *
     * The archive is opened with JSZip, every `ppt/slides/slideN.xml` entry is
     * parsed as XML, and the content of all DrawingML `<a:t>` elements is
     * collected via `getTextFromNodes`. Slides are processed in ascending
     * numeric order of N.
     *
     * Slide entries are discovered by listing the archive rather than probing
     * slide1, slide2, ... until one is missing, so a gap in the numbering
     * (e.g. slide1.xml and slide3.xml only) no longer drops the later slides.
     *
     * @param {*} arrayBuffer - Archive data, passed unchanged to `JSZip#loadAsync`.
     * @returns {Promise<string>} Space-separated text of all slides, trimmed;
     *   an empty string if anything fails (the error is logged, not rethrown).
     */
    async function getTextFromPPTX(arrayBuffer) {
      const aNamespace = "http://schemas.openxmlformats.org/drawingml/2006/main";
      // No `g` flag: the pattern is reused for both listing and exec, and a
      // global regex would carry `lastIndex` state between calls.
      const slidePathPattern = /^ppt\/slides\/slide(\d+)\.xml$/;

      try {
        const zip = new JSZip();
        await zip.loadAsync(arrayBuffer);

        // Numeric sort so that slide10 comes after slide2, not after slide1.
        const slideEntries = zip
          .file(slidePathPattern)
          .map((file) => ({
            file,
            index: Number(slidePathPattern.exec(file.name)[1]),
          }))
          .sort((a, b) => a.index - b.index);

        // One parser is enough for all slides.
        const parser = new DOMParser();
        let text = '';

        for (const { file } of slideEntries) {
          const slideXmlStr = await file.async('text');
          const xmlDoc = parser.parseFromString(slideXmlStr, 'application/xml');
          text += `${getTextFromNodes(xmlDoc, "t", aNamespace)} `;
        }

        return text.trim();
      } catch (err) {
        console.error('Error extracting text from PPTX:', err);
        return '';
      }
    }