# Source code for py_doc.image

import cv2
import pytesseract
from py_doc.yolov7.document_detection import detect_document
from py_doc import utils
import os
import fitz

class Image:
    """
    A class for representing an image. Takes in a name and, optionally, the
    image data. If the image data is not provided, the name is used as the
    path to load the image from.

    :param name: The name of the image (used as the file path when ``bytes`` is omitted).
    :type name: str
    :param bytes: The already-loaded image data; when ``None`` the image is read with ``cv2.imread(name)``.
    :type bytes: numpy.ndarray, optional
    """

    # Label and drawing color for each detector class id, indexed by class id.
    # The color tuples are handed to OpenCV as-is (BGR order for images that
    # were loaded with cv2.imread).
    _CLASS_NAMES = ("text", "title", "list", "table", "figure")
    _CLASS_COLORS = (
        (0, 0, 255),
        (0, 255, 0),
        (255, 0, 0),
        (255, 255, 0),
        (0, 255, 255),
    )

    def __init__(self, name, bytes=None):
        self.name = name
        if bytes is None:
            # NOTE: cv2.imread returns None (it does not raise) when the file
            # cannot be read, so self.bytes may be None after this call.
            self.bytes = cv2.imread(name)
        else:
            self.bytes = bytes
        # Detection results are computed lazily and cached by get_bboxes().
        self.bboxes = []

    def get_name(self):
        """
        Get the name of the image.

        :return: The name of the image.
        :rtype: str
        """
        return self.name

    def get_bboxes(self):
        """
        Use an object detection model to get bounding boxes for titles, text, figures, lists, and tables in the image.

        The detector is only run while no boxes are cached; the result is
        stored on the instance and reused by later calls.

        :return: A list of bounding boxes of the image.
        :rtype: list
        """
        # len() rather than truthiness: the detector's return type is not
        # visible here and may be an array, whose truth value is ambiguous.
        if len(self.bboxes) == 0:
            self.bboxes = detect_document(self.bytes)
        return self.bboxes

    def draw_classifications(self, output_file=None):
        """
        Draw the bounding boxes on a copy of the image. Contains the option to save the drawn image to a file, otherwise it will return the drawn image.

        :param output_file: The output file to save the drawn image to.
        :type output_file: str, optional

        :return: If ``output_file`` is provided, a boolean reporting whether the image was written; otherwise a new :class:`Image` holding the drawn copy.
        :rtype: bool or Image
        """
        # Go through get_bboxes() so cached boxes are reused. The previous
        # code only bound its local list when the cache was empty and raised
        # NameError on every call made after get_bboxes().
        bboxes = self.get_bboxes()

        # Draw on a copy so the original pixel data stays untouched.
        drawn_image = self.bytes.copy()

        for bbox in bboxes:
            x1, y1, x2, y2 = (int(value) for value in bbox[:4])
            # Index 5 carries the class id; index 4 is skipped
            # (presumably the detection confidence - TODO confirm).
            class_id = int(bbox[5])
            color = self._CLASS_COLORS[class_id]
            cv2.rectangle(drawn_image, (x1, y1), (x2, y2), color, 2)
            cv2.putText(
                drawn_image,
                self._CLASS_NAMES[class_id],
                (x1, y1 - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.3,
                color,
                1,
            )

        if output_file is not None:
            # cv2.imwrite reports success as a boolean; pass it on instead of
            # claiming success unconditionally.
            return bool(cv2.imwrite(output_file, drawn_image))
        return Image(self.name, drawn_image)

    def get_text(self):
        """
        Get the text from the whole image using OCR (English).

        :return: The text from the image.
        :rtype: str
        """
        # The pixel data lives in self.bytes; there is no self.image attribute.
        return pytesseract.image_to_string(self.bytes, lang='eng')

    def get_text_from_bbox(self, bbox):
        """
        Get the text from the region of the image inside a bounding box using OCR (English).

        :param bbox: The bounding box to get the text from.
        :type bbox: list with 4 elements [x1, y1, x2, y2]

        :return: The text from the bounding box.
        :rtype: str
        """
        x1, y1, x2, y2 = utils.reformat_bbox(bbox)
        # Image arrays are indexed rows (y) first, then columns (x).
        cropped = self.bytes[y1:y2, x1:x2]
        return pytesseract.image_to_string(cropped, lang='eng')