Search code examples
pdfpoppler

How to access the topic name from PDFs using poppler?


I am using poppler and want to get the topic or heading for a particular page number. How can I do this with poppler?


Solution

  • This answer uses the glib API, since you didn't say which API you want.

    I'm pretty sure there is no topic/heading stored with a particular page. You have to walk the index, if there is one.

    Walk the index with backtracking. If you are lucky, each index node contains a PopplerActionGotoDest (check type!). You can grab the title from the PopplerAction object (gchar *title) and get the page number from the included PopplerDest (int page_num). page_num should be the first page of the section.

    Assuming your PDF has an index containing PopplerActionGotoDest objects, you simply walk it, checking the page_num of each node. If page_num > searched_num, go back one step. When you are at the correct parent, walk its children. This should give you the best match. I just wrote some code for it:

    gchar* getTitle(PopplerIndexIter *iter, int num, PopplerIndexIter *last,PopplerDocument *doc)
    {
        int cur_num = 0;
        int next;
        PopplerAction * action;
        PopplerDest * dest;
        gchar * title = NULL;
        PopplerIndexIter  * last_tmp;
    
        do
        {
                action = poppler_index_iter_get_action(iter);
                if (action->type != POPPLER_ACTION_GOTO_DEST) {
                    printf("No GOTO_DEST!\n");
                    return NULL;
                }
    
                //get page number of current node
                if (action->goto_dest.dest->type == POPPLER_DEST_NAMED) {
                    dest = poppler_document_find_dest (doc, action->goto_dest.dest->named_dest);
                    cur_num = dest->page_num;
                    poppler_dest_free(dest);
                } else {
                    cur_num = action->goto_dest.dest->page_num;
                }
                //printf("cur_num: %d, %d\n",cur_num,num);
    
                //free action, as we don't need it anymore
                poppler_action_free(action);
    
                //are there nodes following this one?
                last_tmp = poppler_index_iter_copy(iter);
                next = poppler_index_iter_next (iter);
    
                //descend
                if (!next || cur_num > num) {
                    if ((!next && cur_num < num) || cur_num == num) {
                        //descend current node
                        if (last) {
                            poppler_index_iter_free(last);
                        }
                        last = last_tmp;
                    }
                    //descend last node (backtracking)
                    if (last) {
                        /* Get the the action and do something with it */
                        PopplerIndexIter *child = poppler_index_iter_get_child (last);
                        gchar * tmp = NULL;
                        if (child) {
                            tmp = getTitle(child,num,last,doc);
                            poppler_index_iter_free (child);
                        } else {
                            action = poppler_index_iter_get_action(last);
                            if (action->type != POPPLER_ACTION_GOTO_DEST) {
                                tmp = NULL;
                            } else {
                                tmp = g_strdup (action->any.title);
                            }
                            poppler_action_free(action);
                            poppler_index_iter_free (last);
                        }
    
                        return tmp;
                    } else {
                        return NULL;
                    }
                }
    
                if (cur_num > num || (next && cur_num != 0)) {
                    // free last index_iter
                    if (last) {
                        poppler_index_iter_free(last);
                    }
                    last = last_tmp;
                }
        }
      while (next);
    
        return NULL;
    }
    

    getTitle gets called by:

        for (i = 0; i < num_pages; i++) {
                iter = poppler_index_iter_new (document);
                title = getTitle(iter,i,NULL,document);
                poppler_index_iter_free (iter);
    
                if (title) {
                    printf("title of %d: %s\n",i, title);
                    g_free(title);
                } else {
                    printf("%d: no title\n",i);
                }
        }