Source code for py_doc.pdf

import fitz
import os
from py_doc import Image
import cv2
import numpy as np

class PDF:
    """
    A class for representing PDFs.

    :param name: The name of the document.
    :type name: str
    """

    # Resolution (dots per inch) used whenever a page is rasterized.
    _RENDER_DPI = 150

    def __init__(self, name) -> None:
        self.name = name
        # PyMuPDF document handle opened from ``name``.
        self.doc = fitz.open(name)
        # Cache of Image objects; filled by convert_to_images().
        self.images = []

    def get_name(self):
        """
        Get the name of the document.

        :return: The name of the document.
        :rtype: str
        """
        return self.name

    def _image_file_name(self, index):
        """
        Build the PNG file name used for the page at ``index``.

        The name is the document's base name without its extension, followed
        by a zero-padded page index, e.g. ``report_0003.png``.

        :param index: Zero-based position of the page.
        :type index: int
        :return: The file name (no directory component).
        :rtype: str
        """
        # splitext removes only the final extension, so a name such as
        # "my.report.pdf" keeps its full stem instead of collapsing to "my".
        stem = os.path.splitext(os.path.basename(self.name))[0]
        return "%s_%04i.png" % (stem, index)

    def store_images_from_doc(self, output_path):
        """
        Turn a PDF into images and stores them on your local machine using
        the provided document from the constructor.

        Each page is rendered at 150 dpi and saved as
        ``image_<page number>.png`` inside ``output_path``.

        :param output_path: The path of the folder where the images should
            be stored. It is created (including parents) if missing.
        :type output_path: string with folder name
        :return: None
        :rtype: None
        """
        os.makedirs(output_path, exist_ok=True)
        for page in self.doc:
            pix = page.get_pixmap(dpi=self._RENDER_DPI)
            pix.save(os.path.join(output_path, "image_%04i.png" % page.number))

    def store_images(self, output_path, images=None):
        """
        Turn a PDF into images and stores them on your local machine using
        the class attribute images if a list of images is not provided.

        :param output_path: The path of the folder where the images should
            be stored. It is created (including parents) if missing.
        :type output_path: string with folder name
        :param images: Objects exposing a ``bytes`` attribute that is handed
            to ``cv2.imwrite``. When ``None``, the cached ``self.images`` are
            used, converting the document first if the cache is empty.
        :type images: list or None
        :return: None
        :rtype: None
        """
        if images is None:
            images = self.images
            # Nothing cached yet: render the document pages first.
            if len(images) == 0:
                images = self.convert_to_images()

        os.makedirs(output_path, exist_ok=True)
        for index, page in enumerate(images):
            # NOTE(review): page.bytes is presumably the pixel array of the
            # Image object - confirm against py_doc.Image.
            target = os.path.join(output_path, self._image_file_name(index))
            cv2.imwrite(target, page.bytes)

    def convert_to_images(self):
        """
        Turn a PDF into an array of Image objects.

        Every page is rendered at 150 dpi and wrapped in an ``Image`` named
        ``image_<index>.png``. The result also replaces ``self.images``.

        :return: A list of images.
        :rtype: list
        """
        images = []
        for index, page in enumerate(self.doc):
            pix = page.get_pixmap(dpi=self._RENDER_DPI)
            # View the raw sample buffer as a (height, width, channels) array.
            # NOTE(review): PyMuPDF samples are presumably RGB while OpenCV
            # writes BGR - confirm the channel order is handled by Image.
            pixels = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                pix.h, pix.w, pix.n
            )
            images.append(Image("image_%04i.png" % index, pixels))
        self.images = images
        return self.images

    def draw_classifications(self, output_file):
        """
        Draw the bounding boxes on the images and merge them into a single
        PDF.

        The annotated pages are first written as PNG files into an
        ``images`` folder next to ``output_file`` and then assembled, in
        page order, into the output document.

        :param output_file: The path of the PDF file to write.
        :type output_file: string
        :return: None
        :rtype: None
        """
        if len(self.images) == 0:
            self.convert_to_images()
        annotated = [image.draw_classifications() for image in self.images]

        image_dir = os.path.join(os.path.dirname(output_file), "images")
        self.store_images(image_dir, annotated)

        merged = fitz.open()
        try:
            # Read back exactly the files written above, by index, rather
            # than listing the folder: a directory listing has no guaranteed
            # order and may contain leftovers from earlier runs.
            for index in range(len(annotated)):
                image_path = os.path.join(
                    image_dir, self._image_file_name(index)
                )
                img = fitz.open(image_path)
                try:
                    rect = img[0].rect
                    pdf_bytes = img.convert_to_pdf()
                finally:
                    img.close()

                img_pdf = fitz.open("pdf", pdf_bytes)
                try:
                    page = merged.new_page(
                        width=rect.width, height=rect.height
                    )
                    # One placement per page is enough; drawing the same
                    # image a second time only inflates the output file.
                    page.show_pdf_page(rect, img_pdf, 0)
                finally:
                    img_pdf.close()
            merged.save(output_file)
        finally:
            merged.close()