Search code examples
python, pdf, matplotlib

Efficient multipage PDF creation using matplotlib subplots in Python


I am new to Python and trying to visualize a huge amount of data in a single multipage pdf output file using matplotlib subplot figures with the matplotlib PdfPages backend. My problem is that I've found a bottleneck I don't know how to solve. Here is the code I have so far:

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Write 1000 pages to one PDF, each page holding a 2 x 3 grid of subplots.
# x1, y1, x2, y2 are the data series to plot; they are not defined in this
# snippet and must be supplied by the surrounding script.
with PdfPages("myfigures.pdf") as pdf:
    for i in range(1000):
        # Call plt.subplots() exactly once per page: every additional call
        # builds another complete figure that is never drawn on or saved.
        f, axarr = plt.subplots(2, 3)
        axarr[0, 0].plot(x1, y1)
        axarr[1, 0].plot(x2, y2)

        # Append this figure to the PDF as a new page.
        pdf.savefig(f)
        # Release every open figure before the next page is built.
        plt.close("all")

Creating a figure in each iteration of a for loop seems highly time-consuming, but if I put it outside the loop the previous figures do not get cleared when plotting the next. Other options I tried like clear() or clf() didn't work either, or ended up creating multiple different figures (when what I need is an array of subplots gathered and output as a single figure to pdf). Does anyone have an idea of how to implement this? And perhaps also make it faster?


Solution

  • Multipage PDF subplots with matplotlib

    Create an m-rows × n-cols array of subplot axes for each PDF page, and save (append) the page as soon as its array of subplots is completely full → then create a new page, repeat, etc.

    To contain large numbers of subplots as multipage output inside a single pdf, immediately start filling the first page with your plot(s), then you'll need to create a new page after detecting that the latest subplot addition in your iteration of plot generation has maxed out the available space in the current page's m-rows × n-cols subplot-array layout [i.e., an m × n matrix of subplots], as applicable.

    Here's a way to do it where the dimensions (m × n) controlling the number of subplots per page can easily be changed:

        import sys
    
        import matplotlib
        from matplotlib.backends.backend_pdf import PdfPages
        import matplotlib.pyplot as plt
        import numpy as np
    
    
        matplotlib.rcParams.update({"font.size": 6})

        # Dimensions for any m-rows × n-cols array of subplots / pg.
        m, n = 4, 5

        # Don't forget to indent after the with statement
        with PdfPages("auto_subplotting.pdf") as pdf:

            # Before beginning the iteration through all the data,
            # initialize the layout for the plots and create a
            # representation of the subplots that can be easily
            # iterated over for knowing when to create the next page
            # (and also for custom settings like partial axes labels).
            # squeeze=False keeps axarr two-dimensional even when m or n
            # is 1, so flattening it works for every layout.
            f, axarr = plt.subplots(
                m, n, sharex="col", sharey="row", squeeze=False
            )
            # Flat, row-major list: left to right, then top to bottom.
            subplots = list(axarr.flat)

            # To conserve needed plotting real estate,
            # only label the bottom row and leftmost subplots
            # as determined automatically using m and n
            splot_index = 0
            for s, splot in enumerate(subplots):
                splot.set_ylim(0, 0.15)
                splot.set_xlim(0, 50)
                last_row = s >= (m - 1) * n
                first_in_row = s % n == 0
                if last_row:
                    splot.set_xlabel("X-axis label")
                if first_in_row:
                    splot.set_ylabel("Y-axis label")

            # Iterate through each sample in the data
            for sample in range(33):

                # As a stand-in for real data, let's just make numpy take 100
                # random draws from a poisson distribution centered around
                # say ~25 and then display the outcome as a histogram.
                # `density=True` replaces the `normed=True` keyword, which
                # current Matplotlib releases no longer accept.
                scaled_y = np.random.randint(20, 30)
                random_data = np.random.poisson(scaled_y, 100)
                subplots[splot_index].hist(
                    random_data,
                    bins=12,
                    density=True,
                    fc=(0, 0, 0, 0),
                    lw=0.75,
                    ec="b",
                )

                # Keep collecting subplots (into the mpl-created array;
                # see: [1]) through the samples in the data and increment
                # a counter each time. The page will be full once the count
                # is equal to the product of the user-set dimensions
                # (i.e. m * n)
                splot_index += 1

                # Once an mxn number of subplots have been collected
                # you now have a full page's worth, and it's time to
                # close and save to pdf that page and re-initialize for a
                # new page possibly. We can basically repeat the same
                # exact code block used for the first layout
                # initialization, but with the addition of 3 new lines:
                #  +2 for creating & saving the just-finished pdf page,
                #  +1 more to reset the subplot index (back to zero)
                if splot_index == m * n:
                    pdf.savefig(f)
                    plt.close(f)
                    f, axarr = plt.subplots(
                        m, n, sharex="col", sharey="row", squeeze=False
                    )
                    subplots = list(axarr.flat)
                    splot_index = 0
                    for s, splot in enumerate(subplots):
                        splot.set_ylim(0, 0.15)
                        splot.set_xlim(0, 50)
                        last_row = s >= (m - 1) * n
                        first_in_row = s % n == 0
                        if last_row:
                            splot.set_xlabel("X-axis label")
                        if first_in_row:
                            splot.set_ylabel("Y-axis label")

            # Done!
            # But don't forget to save to pdf after the last page.
            # If the sample count was an exact multiple of m * n, the page
            # created after the last save is still empty, so it is skipped
            # instead of being appended as a blank trailing page.
            if splot_index > 0:
                pdf.savefig(f)
            plt.close(f)
    

    For any m×n layout, just change the values assigned to m and n. With the code above (where "m, n = 4, 5"), the 33 samples are laid out on 4×5 grids of subplots and written to a two-page PDF output file.

    References

    1. Link to matplotlib subplots official docs.

    Note: Unless your total number of samples is an exact multiple of m × n, the final page of the multipage PDF will contain some blank subplots. Their number is m × n minus the remainder of dividing the number of samples by m × n. E.g., say m=3 and n=4, so you get 3 rows of 4 subplots each, i.e. 12 per page; if you had, say, 20 samples, a two-page PDF with a total of 24 subplots would be created, with the last 4 subplots on the second page (the full bottom-most row in this hypothetical example) left empty.


    Using seaborn

    For a more advanced (& more "pythonic"*) extension of the implementation above, see below:

    The multipage handling should probably be simplified by creating a new_page function; it's better to not repeat code verbatim*, especially if you start customizing the plots in which case you won't want to have to mirror every change and type the same thing twice. A more customized aesthetic based off of seaborn and utilizing the available matplotlib parameters like shown below might be preferable too.

    Add a new_page function & some customizations for the subplot style:

        import matplotlib.pyplot as plt
        import numpy as np
        import random
        import seaborn as sns
    
        from matplotlib.backends.backend_pdf import PdfPages
    
        # Zero seaborn font scale: this erases labels for any blank plots
        # on the last page.
        sns.set(font_scale=0.0)

        # Page grid (m rows × n columns) and the number of samples to plot.
        # 37 % (m*n) = 13 plots land on the final page, leaving
        # (m*n) - 13 = 24 - 13 = 11 blank subplots there.
        m = 4
        n = 6
        datasize = 37

        # Custom color scheme / palette -- pick whatever colors you wish.
        ctheme = [
            "k",
            "gray",
            "magenta",
            "fuchsia",
            "#be03fd",
            "#1e488f",
            (0.44313725490196076, 0.44313725490196076, 0.88627450980392153),
            "#75bbfd",
            "teal",
            "lime",
            "g",
            (0.6666674, 0.6666663, 0.29078014184397138),
            "y",
            "#f1da7a",
            "tan",
            "orange",
            "maroon",
            "r",
        ]
        # One blended color per sample.
        colors = sns.blend_palette(ctheme, datasize)

        # Font size used for axis labels and subplot titles.
        fz = 7
    
    
        def new_page(m, n):
            """Create a blank page: a figure holding an m x n grid of styled axes.

            Resets the module-level ``splot_index`` counter to 0 so the caller
            starts filling the new page from its first subplot. Reads the
            module-level ``fz`` for the axis-label font size.

            Args:
                m: number of subplot rows on the page.
                n: number of subplot columns on the page.

            Returns:
                A ``(fig, subplots)`` tuple: the new figure and a flat,
                row-major list of its m * n axes.
            """
            global splot_index
            splot_index = 0
            # squeeze=False keeps axarr two-dimensional even when m or n is 1,
            # so the flattening below works for every layout.
            fig, axarr = plt.subplots(m, n, sharey="row", squeeze=False)
            fig.subplots_adjust(hspace=0.5, wspace=0.15)
            subplots = list(axarr.flat)
            for s, splot in enumerate(subplots):
                # The on/off flag is passed positionally because its keyword
                # name differs between Matplotlib versions (`b` vs `visible`).
                splot.grid(
                    True,
                    which="major",
                    color="gray",
                    linestyle="-",
                    alpha=0.25,
                    zorder=1,
                    lw=0.5,
                )
                splot.set_ylim(0, 0.15)
                splot.set_xlim(0, 50)
                # Only the bottom row gets x labels and only the leftmost
                # column gets y labels, to save plotting space.
                last_row = s >= (m - 1) * n
                first_in_row = s % n == 0
                if last_row:
                    splot.set_xlabel("X-axis label", labelpad=8, fontsize=fz)
                if first_in_row:
                    splot.set_ylabel("Y-axis label", labelpad=8, fontsize=fz)
            return (fig, subplots)
    
    
        # Plot `datasize` random histograms, m * n per page, into one PDF.
        # `splot_index` is the module-level counter that new_page() resets
        # to 0 each time it creates a page.
        with PdfPages("auto_subplotting_colors.pdf") as pdf:

            fig, subplots = new_page(m, n)

            # `range` instead of the Python 2-only `xrange`.
            for sample in range(datasize):
                splot = subplots[splot_index]
                splot_index += 1
                scaled_y = np.random.randint(20, 30)
                random_data = np.random.poisson(scaled_y, 100)
                # `density=True` replaces the `normed=True` keyword, which
                # current Matplotlib releases no longer accept.
                splot.hist(
                    random_data,
                    bins=12,
                    density=True,
                    zorder=2,
                    alpha=0.99,
                    fc="white",
                    lw=0.75,
                    ec=colors.pop(),
                )
                splot.set_title("Sample {}".format(sample + 1), fontsize=fz)
                # tick fontsize & spacing
                splot.xaxis.set_tick_params(pad=4, labelsize=6)
                splot.yaxis.set_tick_params(pad=4, labelsize=6)

                # make new page:
                if splot_index == m * n:
                    pdf.savefig(fig)
                    plt.close(fig)
                    fig, subplots = new_page(m, n)

            # Save the last page only if it holds at least one plot, but
            # always close the figure so it is released either way.
            if splot_index > 0:
                pdf.savefig(fig)
            plt.close(fig)