Search code examples
google-apps-scriptgoogle-sheetspdfdocx

Error while merging docx files into single pdf using google apps script


I used a Google Apps script code proposed by Tanaike to merge multiple PDF files into one. My requirements are:

  1. Get file id
  2. Check if it is pdf or docx
  3. If it is docx, convert it into pdf and then merge it

Here is the modified snippet:

async function newMain() {
  // Retrieve PDF data.
 
  var destinationFolder = DriveApp.getFolderById("your-folder-id")
  var urls = sheet.getRange(2,2).getValue().toString().split(",");  //split the urls joined by commas 
  const ids = urls.map((url) => {
  const matches = url.match(/\/file\/d\/([^\/]+)\/edit/); //get file id from urls
  return matches ? [matches[1]] : [];
});

  const data = ids.map(([id]) => {
  const file = DriveApp.getFileById(id);
  const mimeType = file.getMimeType();

  // Check if the file is a DOCX
if (mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
  
  
   var blob = file.getBlob();
   var tempfile = Drive.Files.insert({}, blob, {convert:true});
   var id = tempfile["id"];
  
   var doc = DocumentApp.openById(id);
   var body = doc.getBody();
   var text = body.getText();
   var pdfFile = destinationFolder.createFile(`temp.pdf`, text, "application/pdf"); 

    // Get the PDF content
   const pdfData = pdfFile.getBlob().getBytes();
   Drive.Files.remove(id);
    // Remove the temporary PDF file
   DriveApp.getFileById(pdfFile.getId()).setTrashed(true);
   return new Uint8Array(pdfData);

  } else if (mimeType === 'application/pdf') {
    // File is already a PDF, fetch its content
    return new Uint8Array(file.getBlob().getBytes());
  } 
});



  // Load pdf-lib
  const cdnjs = "https://cdn.jsdelivr.net/npm/pdf-lib/dist/pdf-lib.min.js";
  eval(UrlFetchApp.fetch(cdnjs).getContentText().replace(/setTimeout\(.*?,.*?(\d*?)\)/g, "Utilities.sleep($1);return t();"));

  // Merge PDFs.
  const pdfDoc = await PDFLib.PDFDocument.create();
  for (let i = 0; i < data.length; i++) {
    const pdfData = await PDFLib.PDFDocument.load(data[i]);
    for (let j = 0; j < pdfData.getPageCount(); j++) {
      const [page] = await pdfDoc.copyPages(pdfData, [j]);
      pdfDoc.addPage(page);
    }
  }
  const bytes = await pdfDoc.save();

  // Create a PDF file.
  var file = DriveApp.createFile(Utilities.newBlob([...new Int8Array(bytes)], MimeType.PDF, "sample2.pdf"));
  file.moveTo(destinationFolder);
 
 }

when I run it, I receive the following error:

Error: Failed to parse PDF document (line:368 col:0 offset=18311): No PDF header found e
e
e.parseHeader
eval
eval
eval
eval
i
e.parseDocument

Due to my beginner skills, I am unable to resolve or improve it, any guidance in this regard would be much appreciated.


Solution

  • Modification points:

    • If your values of ids are valid file IDs of DOCX files, in your script, I think that pdfData is not the PDF data. In your script, the text data is retrieved and the retrieved text data is saved as a text file without the PDF data. I thought that this might be the reason for your current issue.

    In this case, how about the following modification?

    Modified script:

    async function newMain() {
      // Retrieve PDF data.
      var destinationFolder = DriveApp.getFolderById("your-folder-id")
      var urls = sheet.getRange(2, 2).getValue().toString().split(",");  //split the urls joined by commas 
      const ids = urls.map((url) => {
        const matches = url.match(/\/file\/d\/([^\/]+)\/edit/); //get file id from urls
        return matches ? [matches[1]] : [];
      });
    
    
      // --- I modifieid the below script.
      const data = ids.flatMap(([id]) => {
        const file = DriveApp.getFileById(id);
        const mimeType = file.getMimeType();
        if (mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
          var blob = file.getBlob();
          var tempfile = Drive.Files.insert({}, blob, { convert: true });
          const pdfData = DriveApp.getFileById(tempfile.id).getBlob().getBytes();
          Drive.Files.remove(tempfile.id);
          return new Uint8Array(pdfData);
        } else if (mimeType === 'application/pdf') {
          return new Uint8Array(file.getBlob().getBytes());
        }
        return [];
      });
      // ---
    
    
      // Load pdf-lib
      const cdnjs = "https://cdn.jsdelivr.net/npm/pdf-lib/dist/pdf-lib.min.js";
      eval(UrlFetchApp.fetch(cdnjs).getContentText().replace(/setTimeout\(.*?,.*?(\d*?)\)/g, "Utilities.sleep($1);return t();"));
    
      // Merge PDFs.
      const pdfDoc = await PDFLib.PDFDocument.create();
      for (let i = 0; i < data.length; i++) {
        const pdfData = await PDFLib.PDFDocument.load(data[i]);
        for (let j = 0; j < pdfData.getPageCount(); j++) {
          const [page] = await pdfDoc.copyPages(pdfData, [j]);
          pdfDoc.addPage(page);
        }
      }
      const bytes = await pdfDoc.save();
    
      // Create a PDF file.
      var file = DriveApp.createFile(Utilities.newBlob([...new Int8Array(bytes)], MimeType.PDF, "sample2.pdf"));
      file.moveTo(destinationFolder);
    }
    
    • In this modified script, if the values of ids are valid file IDs of DOCX files, the DOCX files are converted to Google Document, and the Google Document is converted to PDF data. And then, each PDF data is merged.

    • I added a process for the files without application/vnd.openxmlformats-officedocument.wordprocessingml.document and application/pdf, because I cannot know your actual file IDs.

    Note:

    • This modified script supposes that your values of ids are valid file IDs of DOCX files. Please be careful about this.