Search code examples
c#genericsabstractderived-class

Combination of abstract classes and generics problem when declaring and instantiating


I am struggling with an app in C# that is a document review app that searches documents for a list of search terms. It needs to either take a PDF or WORD document and possibly others in the future.

I have an enum DocumentType:

public enum DocumentType
{
    NotSpecified,
    PDF,
    WORD

}

and a base class DocumentInterface<DocumentType>:

public abstract class DocumentInterface<DocumentType>
{

    private string _documentPath;
    private readonly DocumentType _document;
    private int _pageCount;
    private int _currentPage;
    
    // Path to the document being reviewed
    public string DocumentPath
    {
        get { return _documentPath; }
        set { _documentPath = value; }
    }
    
    // Document Object will need to be typecast in child classes
    public abstract DocumentType Document { get; set; }
    //{
    //  get { return _document; }
    //  set { _document = value; }
    //}
    
    // Count of pages in the review document
    public int PageCount
    {
        get { return _pageCount; }
        set { _pageCount = value; }
    }
    
    // to keep track of where we are up to in the document
    public int CurrentPage
    {
        get { return _currentPage; }
        set { _currentPage = value; }
    }
    
    // reports if there are any more pages after the current position.
    public bool HasMorePages { get { return _currentPage < _pageCount; } }
    
    
    public string GetNextPageContents()
    {
        // Makes sure that there is a next page and if so then uses the abstract method
        // to return the page contents.
        if (HasMorePages)
        {
            _currentPage++;
            return GetPageContents(_currentPage);
        }
                
        return string.Empty;
    }
    
    
    #region Constructor & Destructor
    
    public DocumentInterface(string documentpath)
    {
        _document = OpenDocument(documentpath);
    
        // Make sure we opened the document successfully
        if (_document != null)
        {
            _pageCount = GetPageCount();
            _currentPage = 0;
        }
    }
    
    // Class Destructor, must close the document when going out of scope.
    ~DocumentInterface()
    {
        // Abstract method, must be implemented by child classes.
        CloseDocument();
    }
    
    #endregion
    
    
    #region Abstract Methods
    
    // Abstract Class must be implemented by Document Type specific child classes
    public abstract DocumentType OpenDocument(string path);
    public abstract int GetPageCount();
    public abstract string GetPageContents(int pageNumber);
    public abstract void CloseDocument();
    
    #endregion

}

and a derived class DocumentInterfacePDF:

public class DocumentInterfacePDF : DocumentInterface<PdfDocument>
{
    // Implementation of the abstract base generic as a PdfDocument
    private PdfDocument _document;

    public override PdfDocument Document 
    { 
        get => (PdfDocument)_document; 
        set => _document = value as PdfDocument; 
    }

    /// <summary>
    /// 
    /// </summary>
    /// <param name="documentpath"></param>
    /// <exception cref="Exception"></exception>
    public override PdfDocument OpenDocument(string documentpath)
    {
        if (// All need to be true!                
            documentpath != null
            && !string.IsNullOrEmpty(documentpath)
            && ".pdf" == documentpath.Substring(documentpath.Length - 4).ToLower()
            )
        {
            // Open the PDF
            PdfReader reader = new PdfReader(documentpath);

            // return to base class to assign to _document
            return new PdfDocument(reader);
        }
        return null;
    }

    #region Base Class Overrides to implement as a iText7 PDF Interface

    /// <summary>
    /// Gets the number of pages in the PDF document
    /// </summary>
    /// <returns></returns>
    public override int GetPageCount()
    {
        // Return the Page Count
        return Document.GetNumberOfPages();
    }

    /// <summary>
    /// Gets the page contents for a specific page number
    /// </summary>
    /// <param name="pageNumber"></param>
    /// <returns></returns>
    public override string GetPageContents(int pageNumber)
    {
        // Set the default scanning extents for the PDF Reader
        // numbers are points which are 72 to the inch
        int A4_width  = 595;    // 210mm
        int A4_height = 842;    // 297mm
        int header    =  57;    //  20mm
        int footer    =  57;    //  20mm
        var rect = new Rectangle(0, header, A4_width, A4_height - footer);

        PdfPage page = Document.GetPage(pageNumber);

        // Read the page contents
        FilteredTextEventListener listener =
                new FilteredTextEventListener(
                new LocationTextExtractionStrategy(),
                new TextRegionEventFilter(rect)
                );

        // Return the page contents
        return PdfTextExtractor.GetTextFromPage(page, listener);
    }

    /// <summary>
    /// Closes the PDF Document
    /// </summary>
    public override void CloseDocument()
    {
        // Close the document
        Document.Close();
    }

    #endregion

    #region Constructor

    /// <summary>
    /// Constructor
    /// Call the base class constructor to setup everything in the predefined way.
    /// </summary>
    /// <param name="documentpath"></param>
    public DocumentInterfacePDF(string documentpath) : base(documentpath)
    {
        // all of the implementation is taken care of in the Base Class
    }

    #endregion
}

In my DocumentParser class I want to instantiate the document using the base class so that the derived type can be decided at runtime depending on the document type that the user selects.

internal class ReviewDocumentParser
{
    #region Properties

    private List<SearchTerm> _searchterms;
    private SearchResults _results;
    private string _documentPath;
    private string _searchTermsPath;
    private DocumentInterface<DocumentType> _documentInterface;

    public List<SearchTerm> SearchTerms
    {
        get { return _searchterms; }
        set { _searchterms = value; }
    }

    public SearchResults Results
    {
        get { return _results; }
        set { _results = value; }
    }

    public string DocumentPath
    {
        get { return _documentPath; }
        set { _documentPath = value; }
    }

    public string SearchTermsPath
    {
        get { return _searchTermsPath; }
        set { _searchTermsPath = value; }
    }

    private DocumentType _documentType;

    public DocumentType DocumentType
    {
        get { return _documentType; }
        set { _documentType = value; }
    }

    public DocumentInterface<DocumentType> DocumentInterface
    {
        get { return _documentInterface; }
        set { _documentInterface = value; }
    }

    #endregion

//... unnecessary code ommitted

    #region Constructor

    public ReviewDocumentParser(DocumentType documenttype, string documentpath, string searchtermspath)
    {
        _documentType = documenttype;
        _documentPath = documentpath;
        _searchTermsPath = searchtermspath;

        // Hook the Search Terms element up
        _results = new SearchResults(_searchTermsPath);

        switch (documenttype)
        {
            case DocumentType.PDF:
                _documentInterface = new DocumentInterfacePDF(_documentPath);
                break;
            case DocumentType.WORD:
                _documentInterface = new DocumentInterfaceWORD(_documentPath);
                break;
            case DocumentType.NotSpecified:
                throw new NoDocumentParserFoundException("No suitable document parser found.");
        }
    }

    #endregion

I'm trying to resolve the following error with no luck, I'm going round and round in circles replacing one problem with another. I'm starting to think I am trying to do something that isn't doable. I've been at a standstill for 24 hours now.

Error CS0029 Cannot implicitly convert type 'PDF_Reader_Test.Model.DocumentInterfacePDF' to 'PDF_Reader_Test.Model.DocumentInterface'

I've tried adding a type descriptor to the instantiation statement(s). PdfDocument is a type defined in the iText7 package that I am using to extract the text only from PDF documents.

case DocumentType.PDF:
    _documentInterface = new DocumentInterfacePDF<PdfDocument>(_documentPath);
    break;

But this just generates a different warning. I've also tried modifying the base and derived class definitions but I am just hitting bigger problems, I think they are (almost) right as they are.

I was expecting the base class to be able to have a derived class type assigned to it. I am wanting to define the broad behaviour in the base class so it is consistent and just override the bits that handle the different file types within the derived classes for PDF and Word. I haven't tackled the Word version yet.


Solution

  • So, there's several issues here, some were already mentioned in the comments.

    • Naming conventions: Don't call your base class "interface". It's not an interface, and interfaces are typically prefixed with "I". a better choice would be simply Document or possibly DocumentBase. Also, it is common practice to prefex generic parameters with T, so a better choice might be: class DocumentBase<TDocument>.

    • Disposing: As a rule of thumb, don't rely on finalizers (destructors). Your class should implement IDisposable. Then, if your underlying implementation[s] (e.g. PdfDocument) need disposing, you should do so in the Dispose() method.

    • ReviewDocumentParser: It's bad design to do all the expensive IO and document loading inside the constructor. Either, make the document not part of the parser's state, but create a method that returns the document implementation, or put the logic into a factory method.

    To directly address your problem: You are using DocumentType both as a type name, and as the name for your generic parameter, which is unlucky, because now you have accidentially declared DocumentInterface<DocumentType> _documentInterface with the enum type as generic argument, that is why you cannot assign DocumentInterfacePDF which has PdfDocument as generic argument.

    As you see, that's how conforming to proper naming conventions protects you against such mistakes.

    Furthermore, I would argue that you probably don't even need the generic parameter in the first place. You could keep that as an implementation detail of the derived type.

    Here's a simplified version how I might approach this. (Disclaimer: I am not familiar with itext, so it's only about the class design.)

    public abstract class Document : IDisposable
    {
        public abstract int PageCount { get; }
        
        public abstract string GetPageContents(int pageNumber);
        
        // [...]
    
        public abstract void Dispose();
    }
    
    public sealed class PdfDocument : Document
    {
        // full namespace to avoid name collisions
        private readonly iText.Kernel.Pdf.PdfReader _reader;
        private readonly iText.Kernel.Pdf.PdfDocument _document;
        private bool _disposed;
        
        public PdfDocument(string path)
        {
            // (maybe you want to put this into a static factory method)
            _reader = new (path);
            _document = new (_reader);
        }
        
        public override int PageCount => _document.GetNumberOfPages();
    
        public override string GetPageContents(int pageNumber)
        {
            // [...]
            return null;
        }
    
        public override void Dispose()
        {
            if (_disposed)
            {
                return;
            }
            _document.Close();
            _reader.Close();
            _disposed = true;
        }
    }
    
    internal class ReviewDocumentParser
    {
        public Document Parse(DocumentType documenttype, string documentpath, string searchtermspath)
        {
            // [...]
            switch (documenttype)
            {
                case DocumentType.PDF:
                    return new PdfDocument(_documentPath);
                    break;
                // [...]
            }
            return null;
        }
    }