Search code examples
c#.netasp.net-core.net-corebase64

Determine if file is zip or docx/xlsx based on only base64 string


Is there any way to determine whether an uploaded file is a zip file or a docx/xlsx file based only on its base64 string? I am specifying this because I do not have the file extension.

So far I have only found solutions that also rely on the file extension, like this one:

     // First four bytes of the content that GetMimeType compares against: 'P', 'K', 3, 4.
     // ZIP archives start with this signature, and DOCX/XLSX files are ZIP containers,
     // so the signature alone cannot tell them apart.
     private static readonly byte[] ZIP_DOCX = { 80, 75, 3, 4 };

     /// <summary>
     /// Guesses a MIME type from the leading bytes of <paramref name="file"/>, using the
     /// extension of <paramref name="fileName"/> to tell a DOCX apart from a plain ZIP.
     /// </summary>
     /// <param name="file">Raw file content; must not be null.</param>
     /// <param name="fileName">Original file name. May be null or have no extension.</param>
     /// <returns>
     /// The DOCX MIME type when the content starts with the ZIP signature and the extension is
     /// ".docx" (any casing); "application/x-zip-compressed" for any other content that starts
     /// with the ZIP signature; "application/octet-stream" otherwise.
     /// </returns>
     public static string GetMimeType(byte[] file, string fileName)
     {
         // Generic fallback for content that does not start with the ZIP signature.
         string mime = "application/octet-stream";

         // Path.GetExtension returns null for a null path, so normalise to an empty string once.
         string extension = Path.GetExtension(fileName) ?? string.Empty;

         if (file.Take(4).SequenceEqual(ZIP_DOCX))
         {
             // Ordinal, case-insensitive comparison avoids the culture-sensitive ToUpper() round trip.
             mime = string.Equals(extension, ".docx", System.StringComparison.OrdinalIgnoreCase)
                 ? "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                 : "application/x-zip-compressed";
         }

         return mime;
     }

But as I said, this solution does not work for me, since I do not have the file's extension. Any ideas?


Solution

  • This sounded interesting, so I played around with it. It isn't based on a spec, so it is not super reliable: I just extracted one Excel file and one Word file and guessed at what the identifying characteristics might be. It seems to work.

    /// <summary>Classification returned by <see cref="FileKindDecoder.DetermineFileKind"/>.</summary>
    public enum FileKind
    {
        /// <summary>The data could not be read as a zip archive.</summary>
        NotZip,

        /// <summary>A zip archive that was not recognised as a spreadsheet or word-processing package.</summary>
        OtherZip,

        /// <summary>A zip archive whose content types mention "spreadsheetml".</summary>
        Xlsx,

        /// <summary>A zip archive whose content types mention "wordprocessingml".</summary>
        Docx
    }

    /// <summary>
    /// Heuristically classifies base64-encoded data as a non-zip file, a generic zip,
    /// an XLSX or a DOCX by inspecting the archive's "[Content_Types].xml" entry.
    /// </summary>
    public static class FileKindDecoder
    {
        /// <summary>
        /// Decodes <paramref name="base64"/> and determines what kind of file it holds.
        /// </summary>
        /// <param name="base64">Base64-encoded file content.</param>
        /// <returns>
        /// <see cref="FileKind.NotZip"/> when the bytes cannot be read as a zip archive;
        /// <see cref="FileKind.Xlsx"/> or <see cref="FileKind.Docx"/> when the first matching
        /// <c>Override</c> element in "[Content_Types].xml" has a content type containing
        /// "spreadsheetml" or "wordprocessingml" respectively;
        /// <see cref="FileKind.OtherZip"/> in every other case, including a missing or
        /// unreadable "[Content_Types].xml".
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="base64"/> is null.</exception>
        /// <exception cref="FormatException"><paramref name="base64"/> is not valid base64.</exception>
        public static FileKind DetermineFileKind(string base64)
        {
            var bytes = Convert.FromBase64String(base64);

            XElement? contentTypesXml;
            try
            {
                using var stream = new MemoryStream(bytes);
                using var zip = new ZipArchive(stream);

                // GetEntry must stay inside the try: the central directory is read lazily,
                // so a malformed directory surfaces here rather than in the constructor.
                var contentTypesEntry = zip.GetEntry(@"[Content_Types].xml");
                if (contentTypesEntry == null)
                    return FileKind.OtherZip;

                contentTypesXml = ReadXmlFromZip(contentTypesEntry);
            }
            catch (InvalidDataException)
            {
                return FileKind.NotZip;
            }

            if (contentTypesXml == null)
                return FileKind.OtherZip;

            XNamespace ns = @"http://schemas.openxmlformats.org/package/2006/content-types";
            if (contentTypesXml.Name != ns + "Types")
                return FileKind.OtherZip;

            foreach (var overrideElement in contentTypesXml.Elements(ns + "Override"))
            {
                var contentType = overrideElement.Attribute("ContentType")?.Value;
                if (contentType == null)
                    continue;
                if (contentType.Contains("spreadsheetml"))
                    return FileKind.Xlsx;
                if (contentType.Contains("wordprocessingml"))
                    return FileKind.Docx;
            }

            return FileKind.OtherZip;

            // Loads the entry as XML; returns null when the entry cannot be opened,
            // cannot be decompressed, or does not contain well-formed XML.
            static XElement? ReadXmlFromZip(ZipArchiveEntry entry)
            {
                try
                {
                    // Open() itself can throw InvalidDataException (e.g. unsupported
                    // compression method), so it belongs inside the try as well.
                    using var stream = entry.Open();
                    return XElement.Load(stream);
                }
                catch (XmlException)
                {
                    return null;
                }
                catch (InvalidDataException)
                {
                    // The archive is a zip, but this entry is unreadable: treat it as "other zip".
                    return null;
                }
            }
        }
    }
    

    A simplified approach that doesn't decompress any entry or parse any XML; it only looks at the names of the entries in the archive:

    /// <summary>Possible outcomes of <see cref="FileKindDecoder.DetermineFileKind"/>.</summary>
    public enum FileKind
    {
        /// <summary>The data could not be read as a zip archive.</summary>
        NotZip = 0,

        /// <summary>A zip archive that matched neither the DOCX nor the XLSX entry layout.</summary>
        OtherZip = 1,

        /// <summary>A zip archive containing "[Content_Types].xml" and "xl/workbook.xml".</summary>
        Xlsx = 2,

        /// <summary>A zip archive containing "[Content_Types].xml" and "word/document.xml".</summary>
        Docx = 3
    }

    /// <summary>
    /// Classifies base64-encoded data purely by the entry names found in the zip archive;
    /// no entry is decompressed and no XML is parsed.
    /// </summary>
    public static class FileKindDecoder
    {
        /// <summary>
        /// Decodes <paramref name="base64"/> and reports which kind of file it appears to be.
        /// </summary>
        /// <param name="base64">Base64-encoded file content.</param>
        /// <returns>
        /// <see cref="FileKind.NotZip"/> when reading the archive raises
        /// <see cref="InvalidDataException"/>; otherwise the kind implied by the entry names,
        /// with the DOCX check taking precedence over the XLSX check.
        /// </returns>
        public static FileKind DetermineFileKind(string base64)
        {
            try
            {
                byte[] payload = Convert.FromBase64String(base64);
                using var buffer = new MemoryStream(payload);
                using var archive = new ZipArchive(buffer);

                // Exact (case-sensitive) lookup of a single entry by its full name.
                bool HasEntry(string fullName) => archive.GetEntry(fullName) != null;

                // Without the content-types part the archive is not treated as an Office package.
                if (!HasEntry("[Content_Types].xml"))
                {
                    return FileKind.OtherZip;
                }

                if (HasEntry("word/document.xml"))
                {
                    return FileKind.Docx;
                }

                return HasEntry("xl/workbook.xml") ? FileKind.Xlsx : FileKind.OtherZip;
            }
            catch (InvalidDataException)
            {
                return FileKind.NotZip;
            }
        }
    }