I am currently working on a web application where users can upload .pptx files, and I need to extract all of the text from the uploaded file in the backend. I have the frontend part implemented where users can input their .pptx file, but I am struggling with how to process and extract text from this file in the backend.
Simplified frontend markup:
<!-- Upload box: one form that lets the user pick a single .pdf or .pptx file. The drag-and-drop prompt is text only here; no drop handlers are visible in this snippet. -->
<div id="uploadBox">
<form id="uploadForm">
<div id="dragText">Drag & Drop your file here</div>
<div id="orText">or</div>
<div class="input-group" id="selectButton">
<!-- File picker limited to .pdf and .pptx. Its change handler only reads the first selected file into a local variable and does nothing further with it in this simplified snippet. -->
<input
accept=".pdf, .pptx"
type="file"
id="input"
name="file"
on:change={() => {
const fileInput = document.getElementById("input");
const selectedFile = fileInput.files[0];
}}
/>
<label for="input">Select file from your device</label>
</div>
<!-- Empty placeholder element; nothing in this snippet writes to it. -->
<div id="fileNameDisplay"></div>
<!-- The submit button is rendered hidden (display: none); the code that reveals it is not shown here. -->
<button id="submitButton" type="submit" style="display: none;">Begin</button>
</form>
</div>
Frontend code that sends the .pptx file to the backend after converting it to a Base64 string (`base64File`):
// Read the selected file fully into memory, then Base64-encode the bytes so
// they can be carried inside a JSON request body.
const fileArrayBuffer = await file.arrayBuffer();
const base64File = arrayBufferToBase64(fileArrayBuffer);

// Build the request pieces up front so the fetch call itself stays short.
const textChunksEndpoint = `${apiUrl}/api/getTextChunks`;
const requestHeaders = {
  Authorization: `Bearer ${localStorage.getItem('token')}`,
  'Content-Type': 'application/json',
};

// Any MIME type other than PDF is reported to the backend as 'pptx'.
let backendFileType = 'pptx';
if (fileType === "application/pdf") {
  backendFileType = 'pdf';
}
const requestBody = JSON.stringify({
  file: base64File,
  fileType: backendFileType,
});

const response = await fetch(textChunksEndpoint, {
  method: 'POST',
  headers: requestHeaders,
  body: requestBody,
});
However, I have no idea how to extract the text from the Base64-encoded file once it reaches the backend (I'm using a Node.js backend).
I tried the following (the code doesn't work, but it shows the logic I have in mind):
import pdfjs from "pdfjs-dist-legacy";
import count from 'openai-gpt-token-counter';
import path from 'path';
// Point pdf.js at its worker script. The path is built from the current working
// directory, so it depends on where the server process is started from.
pdfjs.GlobalWorkerOptions.workerSrc = path.join(process.cwd(), '/server/node_modules/pdfjs-dist-legacy/pdf.worker.js');
/**
 * Collects the text of every run in every shape on every slide of a deck and
 * returns it as one space-separated string.
 *
 * Relies on a `PPTX` binding being in scope; no import for it is visible in
 * this file.
 * NOTE(review): assumes `PPTX.Composer` instances expose `load()`,
 * `getSlides()`, `slide.getShapes()` and `shape.textBody.paragraphs[].runs[]`
 * exactly as used below — confirm against the library actually installed.
 *
 * @param {*} pptxBuffer - Deck data, passed unchanged to `ppt.load()`.
 * @returns {Promise<string|undefined>} All run texts joined by single spaces
 *   (`''` when the deck has no text), or `undefined` when the composer could
 *   not be constructed (the error is logged).
 */
async function getTextFromPPTX(pptxBuffer) {
  // Debug aid kept from the original: shows what `PPTX` resolves to.
  console.log('PPTX object:', PPTX);

  let ppt;
  try {
    ppt = new PPTX.Composer();
  } catch (e) {
    console.error('Error while creating new PPTX:', e);
    return;
  }
  await ppt.load(pptxBuffer);

  // Accumulate every run instead of overwriting a single variable, which
  // previously left only the last run's text in the result.
  const runTexts = [];
  ppt.getSlides().forEach((slide) => {
    slide.getShapes().forEach((shape) => {
      const paragraphs = shape.textBody && shape.textBody.paragraphs;
      if (!paragraphs) {
        return; // Shape carries no text body (e.g. a picture).
      }
      paragraphs.forEach((paragraph) => {
        // A paragraph without runs contributes nothing rather than throwing.
        (paragraph.runs || []).forEach((run) => {
          if (typeof run.text === 'string' && run.text !== '') {
            runTexts.push(run.text);
          }
        });
      });
    });
  });

  return runTexts.join(' ');
}
export default getTextFromPPTX;
How should I approach this?
import JSZip from 'jszip';
import { DOMParser } from 'xmldom';
/**
 * Gathers the text content of every element with the given namespaced tag
 * beneath `node` and returns it as one space-separated string.
 *
 * @param {Document|Element} node - Node to search under; must provide
 *   `getElementsByTagNameNS`.
 * @param {string} tagName - Local tag name to look for.
 * @param {string} namespaceURI - Namespace the tag belongs to.
 * @returns {string} Matched text contents separated by single spaces, with
 *   leading and trailing whitespace trimmed.
 */
function getTextFromNodes(node, tagName, namespaceURI) {
  const matches = node.getElementsByTagNameNS(namespaceURI, tagName);
  // Each piece keeps a trailing space as separator; the final trim drops the last one.
  const pieces = Array.from(matches, (match) => `${match.textContent} `);
  return pieces.join('').trim();
}
/**
 * Extracts all DrawingML text (`<a:t>` elements) from the slides of a .pptx
 * archive.
 *
 * Slide parts are found by listing every `ppt/slides/slideN.xml` entry in the
 * archive and sorting by N. Probing slide1, slide2, … until one is missing
 * would silently drop every slide after a numbering gap.
 * NOTE(review): slides are emitted in file-number order; confirm whether this
 * matches the presentation's on-screen order for the decks you handle.
 *
 * @param {ArrayBuffer|Uint8Array|Buffer} arrayBuffer - Raw bytes of the .pptx
 *   file, handed unchanged to `JSZip#loadAsync`. A Base64 string received from
 *   a client must be decoded first, e.g. `Buffer.from(base64File, 'base64')`.
 * @returns {Promise<string>} Text of all slides separated by spaces, or `''`
 *   when the archive cannot be read or parsed (the error is logged).
 */
async function getTextFromPPTX(arrayBuffer) {
  try {
    const zip = new JSZip();
    await zip.loadAsync(arrayBuffer);
    const aNamespace = "http://schemas.openxmlformats.org/drawingml/2006/main";

    // No `g` flag, so `exec` keeps no state between calls.
    const slidePathPattern = /^ppt\/slides\/slide(\d+)\.xml$/;
    const slideNumber = (entry) =>
      Number.parseInt(slidePathPattern.exec(entry.name)[1], 10);

    // Numeric sort: a plain string sort would put slide10 before slide2.
    const slideFiles = zip
      .file(slidePathPattern)
      .sort((a, b) => slideNumber(a) - slideNumber(b));

    const slideXmlStrings = await Promise.all(
      slideFiles.map((slideFile) => slideFile.async('text'))
    );

    // One parser is enough for all slides.
    const parser = new DOMParser();
    const slideTexts = slideXmlStrings.map((slideXmlStr) => {
      const xmlDoc = parser.parseFromString(slideXmlStr, 'application/xml');
      return getTextFromNodes(xmlDoc, "t", aNamespace);
    });

    return slideTexts.join(' ').trim();
  } catch (err) {
    // Best-effort contract: callers get an empty string rather than an exception.
    console.error('Error extracting text from PPTX:', err);
    return '';
  }
}