# Source code for py_doc.pdf

import fitz
import os
from py_doc import Image
import cv2
import numpy as np

class PDF:
    """
    A class for representing PDFs.

    The document is opened with PyMuPDF (``fitz``) when the object is created
    and kept in ``self.doc``. Pages converted by :meth:`convert_to_images` are
    cached in ``self.images``.

    :param name: The name of the document.
    :type name: str
    """

    def __init__(self, name) -> None:
        self.name = name
        self.doc = fitz.open(name)
        # Cache of Image objects; filled lazily by convert_to_images().
        self.images = []

    def get_name(self):
        """
        Get the name of the document.

        :return: The name of the document.
        :rtype: str
        """

        return self.name

    def _image_file_name(self, index):
        """
        Build the PNG file name used for page ``index`` of this document.

        The name is the PDF's base name without its extension, followed by a
        zero-padded page index, e.g. ``report_0003.png``.

        :param index: Zero-based page index.
        :type index: int

        :return: The file name (without directory).
        :rtype: str
        """

        # splitext only strips the final extension, so "my.report.pdf"
        # keeps its full stem ("my.report") instead of being cut at the
        # first dot.
        stem = os.path.splitext(os.path.basename(self.name))[0]
        return "%s_%04i.png" % (stem, index)

    def store_images_from_doc(self, output_path):
        """
        Turn a PDF into images and stores them on your local machine using the provided document from the constructor.

        Every page is rendered at 150 dpi and written as ``image_NNNN.png``,
        where ``NNNN`` is the zero-padded page number.

        :param output_path: The path of the folder where the images should be stored.
            The folder (including missing parents) is created if needed.
        :type output_path: string with folder name

        :return: None
        :rtype: None
        """

        # makedirs handles nested paths and an already existing folder.
        os.makedirs(output_path, exist_ok=True)
        for page in self.doc:
            pix = page.get_pixmap(dpi=150)
            pix.save(os.path.join(output_path, "image_%04i.png" % page.number))

    def store_images(self, output_path, images = None):
        """
        Turn a PDF into images and stores them on your local machine using the class attribute images if a list of images is not provided.

        Files are named after the PDF: ``<pdf name>_NNNN.png`` with ``NNNN``
        being the zero-padded position of the image in the list.

        :param output_path: The path of the folder where the images should be stored.
            The folder (including missing parents) is created if needed.
        :type output_path: string with folder name
        :param images: The images to store; each item must expose its pixel data
            as ``.bytes``. If None, the class attribute images is used, and it is
            populated through convert_to_images first when it is still empty.
        :type images: list or None

        :raises OSError: If an image could not be written.

        :return: None
        :rtype: None
        """

        # Fall back to the cached images, converting the document on demand.
        if images is None:
            images = self.images or self.convert_to_images()

        os.makedirs(output_path, exist_ok=True)
        for index, page in enumerate(images):
            target = os.path.join(output_path, self._image_file_name(index))
            # cv2.imwrite reports failure through its return value only.
            if not cv2.imwrite(target, page.bytes):
                raise OSError("Could not write image to %s" % target)

    def convert_to_images(self):
        """
        Turn a PDF into an array of Image objects.

        Each page is rendered at 150 dpi. The result replaces the class
        attribute images.

        :return: A list of images.
        :rtype: list
        """
        images = []
        for index, page in enumerate(self.doc):
            pix = page.get_pixmap(dpi=150)
            # View the raw samples as a (height, width, channels) array.
            # NOTE(review): PyMuPDF renders RGB by default while cv2.imwrite
            # expects BGR -- confirm that Image takes care of the channel order.
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
            images.append(Image("image_%04i.png" % index, img))

        self.images = images
        return self.images

    def draw_classifications(self, output_file):
        """
        Draw the bounding boxes on the images and merge them into a single PDF.

        The annotated images are first written to an ``images`` folder next to
        ``output_file`` and then placed, one per page and in page order, into a
        new PDF.

        :param output_file: The path of the PDF file to write.
        :type output_file: string

        :return: None
        :rtype: None
        """

        if not self.images:
            self.convert_to_images()
        image_list = [image.draw_classifications() for image in self.images]

        path = os.path.join(os.path.dirname(output_file), "images")
        self.store_images(path, image_list)

        doc = fitz.open()
        try:
            # Rebuild the names store_images just wrote instead of listing the
            # folder: os.listdir has no defined order and would also pick up
            # stale or unrelated files.
            for index in range(len(image_list)):
                file_path = os.path.join(path, self._image_file_name(index))
                # Open the picture as a document only to read its dimensions.
                with fitz.open(file_path) as img:
                    rect = img[0].rect
                page = doc.new_page(width = rect.width, height = rect.height)
                page.insert_image(rect, filename = file_path)
            doc.save(output_file)
        finally:
            doc.close()