I need to check all files (especially "*.docx") in a directory of about 10 GB and list the names of the documents that contain tables. For each file in the directory, I need to iterate through the document's elements to find out whether the opened document has a table. I need to get this done in C#. I come from a testing background, but I have been given a development task. Please help.
You can use the DocumentFormat.OpenXml NuGet package
to open .docx files and check whether each one contains a table.
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace ConsoleApp2
{
/// <summary>
/// Console tool that prints the paths of the .docx files in a directory whose
/// main document body contains at least one table.
/// </summary>
class Program
{
    /// <summary>
    /// Entry point. Scans a directory and writes the path of every .docx file
    /// that contains a table to standard output, one per line.
    /// </summary>
    /// <param name="args">
    /// Optional command-line arguments. When present, the first argument is used
    /// as the directory to scan; otherwise the placeholder path is used.
    /// </param>
    static void Main(string[] args)
    {
        // Prefer a directory given on the command line so the tool can be reused
        // without recompiling; keep the original placeholder as the fallback.
        var directory = (args != null && args.Length > 0) ? args[0] : "<path_to_directory>";

        var files = FindFilesWithTable(directory);
        foreach (var file in files)
        {
            Console.WriteLine(file);
        }
    }

    /// <summary>
    /// Finds the .docx files directly inside <paramref name="directory"/> whose
    /// main document body contains a table.
    /// </summary>
    /// <param name="directory">Directory to scan (subdirectories are not searched).</param>
    /// <returns>Paths of the files in which a table was found.</returns>
    /// <remarks>
    /// A file that cannot be opened or inspected is reported on the console and
    /// skipped, so one bad file does not abort the whole scan. Exceptions thrown
    /// by the directory listing itself are not caught here.
    /// </remarks>
    static List<string> FindFilesWithTable(string directory)
    {
        // filter all docx files
        var files = Directory.GetFiles(directory, "*.docx");
        var filesWithTable = new List<string>();
        foreach (var file in files)
        {
            try
            {
                if (HasTable(file))
                {
                    filesWithTable.Add(file);
                }
            }
            catch (Exception ex)
            {
                // Best-effort scan: report the failure and continue with the next file.
                Console.WriteLine("Cannot process {0}: {1}", file, ex.Message);
            }
        }
        return filesWithTable;
    }

    /// <summary>
    /// Opens a single .docx file read-only and checks its main document body for a table.
    /// </summary>
    /// <param name="file">Path of the .docx file to inspect.</param>
    /// <returns><c>true</c> when the body contains a table at any nesting depth.</returns>
    /// <exception cref="InvalidDataException">
    /// The package has no main document part, document, or body.
    /// </exception>
    static bool HasTable(string file)
    {
        // open file in read only mode
        using (WordprocessingDocument doc = WordprocessingDocument.Open(file, false))
        {
            var mainPart = doc.MainDocumentPart;
            if (mainPart == null || mainPart.Document == null || mainPart.Document.Body == null)
            {
                // Surface a clear reason instead of a NullReferenceException message.
                throw new InvalidDataException("The document has no main document body.");
            }

            // Descendants (not Elements) so that tables wrapped in other block-level
            // containers, such as content controls, are found as well as direct
            // children of the body.
            return mainPart.Document.Body.Descendants<Table>().Any();
        }
    }
}