I have the alt text generated; I just need to add it somehow to the images under the Figure tag. A little background: I want to make my PDF accessible to the WCAG 2.1 AA standard, and I am using the Adobe auto-tag feature to tag the PDF. It tags the images as /Figure. I can extract the figures and generate alt text, but I can't find a way to embed or add that alt text to the image and make it WCAG 2.1 AA compliant. Ultimately, I also want to run this in an AWS Lambda function. Is there any way I could do so? Thank you!
I tried multiple open-source libraries (pikepdf, PyMuPDF, and some others) and also tried converting the PDF to HTML or XML, but the issue with that approach is that the PDF can't be converted back to exactly what it was. I also tried adding it directly in code, but the file becomes corrupt.
So I used pdf-lib in Node.js to solve this. Special thanks to KJ for pointing me down this path. I used the Adobe API for auto-tagging first and then used pdf-lib for post-processing to add the alt text, which I will be generating with an LLM. It was easier for me to reference each figure by its object ID (objid); otherwise, the normal ID would also work well for this. I am still working on it, and hopefully in a few weeks I will add a GitHub link in case anyone would like to give feedback or help contribute to this project!
const pdfLib = require('pdf-lib');
const fs = require('fs').promises;
/**
 * Adds alternate text to every tagged /Figure dictionary in the auto-tagged PDF
 * and writes the result to "your-pdf-document-accessible.pdf".
 *
 * Figures that already carry a non-empty /Alt entry are left untouched.
 * All failures are logged to the console; the returned promise does not reject.
 *
 * @returns {Promise<void>} Resolves once the output file is written (or an error was logged).
 */
async function modifyPDF() {
  const altTextsPath = 'output/ExtractTextTableInfoWithFiguresTablesRenditionsFromPDF/extract2024-07-30T14-39-57/structuredData.json';

  // Lazily loaded lookup table; stays undefined until the first figure needs it.
  let altTexts;

  /**
   * Reads and parses the alt-text JSON file.
   * NOTE(review): assumes the JSON is a flat object keyed by
   * "<objectNumber>-<generationNumber>" whose values are the alt-text strings — confirm.
   *
   * @returns {Promise<object>} The parsed lookup, or an empty object if the file is unreadable/invalid.
   */
  async function loadAltTexts() {
    try {
      const jsonFile = await fs.readFile(altTextsPath, 'utf8');
      const data = JSON.parse(jsonFile);
      return data !== null && typeof data === 'object' ? data : {};
    } catch (err) {
      console.error("Error reading JSON file:", err);
      return {};
    }
  }

  /**
   * Resolves the alt text for one figure, falling back to a placeholder
   * when the lookup has no usable entry for it.
   *
   * @param {string} objId - "<objectNumber>-<generationNumber>" of the figure dictionary.
   * @returns {Promise<string>} The alt text to embed.
   */
  async function generateAltText(objId) {
    if (altTexts === undefined) {
      // Load once instead of re-reading the file for every figure.
      altTexts = await loadAltTexts();
    }
    const hasEntry = Object.prototype.hasOwnProperty.call(altTexts, objId);
    const candidate = hasEntry ? altTexts[objId] : undefined;
    if (typeof candidate === 'string' && candidate.length > 0) {
      return candidate;
    }
    return `Generated alt text for object ${objId}`;
  }

  try {
    const pdfPath = '../output/AutotagPDF/what_is_fair_use_2018_Original.pdf.pdf';
    const pdfData = await fs.readFile(pdfPath);
    const pdfDoc = await pdfLib.PDFDocument.load(pdfData);

    // PDFName instances are pooled by pdf-lib, so identity comparison is safe.
    const structTypeKey = pdfLib.PDFName.of('S');
    const figureName = pdfLib.PDFName.of('Figure');
    const altKey = pdfLib.PDFName.of('Alt');
    const contentsKey = pdfLib.PDFName.of('Contents');

    // for...of (not forEach) so the async alt-text lookup can actually be awaited.
    for (const [pdfRef, pdfObject] of pdfDoc.context.enumerateIndirectObjects()) {
      if (!(pdfObject instanceof pdfLib.PDFDict)) {
        continue;
      }
      if (pdfObject.lookup(structTypeKey) !== figureName) {
        continue;
      }

      const objId = `${pdfRef.objectNumber}-${pdfRef.generationNumber}`;
      const existingAlt = pdfObject.lookup(altKey);
      const isPdfText =
        existingAlt instanceof pdfLib.PDFString ||
        existingAlt instanceof pdfLib.PDFHexString;
      const altText = isPdfText ? existingAlt.decodeText() : undefined;

      if (altText) {
        console.log(`Figure found with Object ID: ${objId}, Alt text already exists: ${altText}`);
        continue;
      }

      const newAltText = await generateAltText(objId);
      // Hex strings encode the text as UTF-16, so parentheses, backslashes and
      // non-ASCII characters cannot break the PDF string syntax.
      pdfObject.set(altKey, pdfLib.PDFHexString.fromText(newAltText));
      // NOTE(review): /Contents is kept because the original script wrote it;
      // it does not appear to be a standard structure-element key — confirm it is needed.
      pdfObject.set(contentsKey, pdfLib.PDFHexString.fromText(newAltText));
      console.log(`Updated Alt text for figure with Object ID: ${objId}`);
    }

    const pdfBytes = await pdfDoc.save();
    await fs.writeFile("your-pdf-document-accessible.pdf", pdfBytes);
    console.log("PDF modification complete. Output saved to your-pdf-document-accessible.pdf");
  } catch (err) {
    console.error("Error processing PDF:", err);
  }
}
// Kick off the script; any rejection coming out of modifyPDF is printed to stderr.
modifyPDF().catch((error) => {
  console.error(error);
});