import cv2
import pytesseract
from py_doc.yolov7.document_detection import detect_document
from py_doc import utils
import os
import fitz
class Image:
    """
    A class for representing an image. Takes in a name and, optionally, the image data.
    If the image data is not provided, the name is used as a path to load the image.

    :param name: The name of the image (used as the file path when ``bytes`` is omitted).
    :type name: str
    :param bytes: The already-loaded image data. When ``None``, the image is read with
        ``cv2.imread(name)``.
    :type bytes: numpy.ndarray, optional
    """

    # Labels and BGR drawing colors, both indexed by the class id stored at bbox[5].
    _CLASS_NAMES = ("text", "title", "list", "table", "figure")
    _CLASS_COLORS = ((0, 0, 255), (0, 255, 0), (255, 0, 0), (255, 255, 0), (0, 255, 255))

    def __init__(self, name, bytes=None):
        self.name = name
        if bytes is None:
            # NOTE: cv2.imread reports an unreadable path by returning None instead of
            # raising, so self.bytes can be None after this call.
            self.bytes = cv2.imread(name)
        else:
            self.bytes = bytes
        # Detection results are computed lazily and cached here by get_bboxes().
        self.bboxes = []

    def get_name(self):
        """
        Get the name of the image.

        :return: The name of the image.
        :rtype: str
        """
        return self.name

    def get_bboxes(self):
        """
        Use an object detection model to get bounding boxes for titles, text, figures,
        lists, and tables in the image. The result is cached on the instance, so the
        model only runs while no boxes are stored.

        :return: A list of bounding boxes of the image.
        :rtype: list
        """
        # len() rather than truthiness: the detector's return type is not visible here
        # and may be an array, for which ``not value`` is ambiguous.
        if len(self.bboxes) == 0:
            self.bboxes = detect_document(self.bytes)
        return self.bboxes

    def draw_classifications(self, output_file=None):
        """
        Draw the bounding boxes on a copy of the image. Contains the option to save the
        drawn image to a file, otherwise the drawn image is returned.

        Each bounding box is read as ``[x1, y1, x2, y2, <unused>, class_id]``; index 4 is
        not used here (presumably a confidence score - confirm against the detector).

        :param output_file: The output file to save the image to.
        :type output_file: str, optional
        :return: If ``output_file`` is provided, the success flag reported by
            ``cv2.imwrite``; otherwise a new :class:`Image` holding the drawn copy.
        :rtype: bool or Image
        """
        # Go through get_bboxes() so cached detections are reused; previously the local
        # list was only assigned when the cache was empty, which crashed on reuse.
        bboxes = self.get_bboxes()

        # Draw on a copy so the original pixels stay untouched for OCR.
        drawn_image = self.bytes.copy()
        for bbox in bboxes:
            x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
            class_id = int(bbox[5])
            color = self._CLASS_COLORS[class_id]
            cv2.rectangle(drawn_image, (x1, y1), (x2, y2), color, 2)
            # The label is placed 10 px above the top-left corner of the box.
            cv2.putText(
                drawn_image,
                self._CLASS_NAMES[class_id],
                (x1, y1 - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.3,
                color,
                1,
            )

        if output_file is not None:
            # Report the real outcome of the write instead of an unconditional True.
            return bool(cv2.imwrite(output_file, drawn_image))
        return Image(self.name, drawn_image)

    def get_text(self):
        """
        Get the text from the whole image using Tesseract OCR (English).

        :return: The text from the image.
        :rtype: str
        """
        # The pixel data lives in self.bytes; there is no ``image`` attribute.
        return pytesseract.image_to_string(self.bytes, lang='eng')

    def get_text_from_bbox(self, bbox):
        """
        Get the text from the region of the image inside a bounding box.

        :param bbox: The bounding box to get the text from.
        :type bbox: list with 4 elements [x1, y1, x2, y2]
        :return: The text from the bounding box.
        :rtype: str
        """
        # utils.reformat_bbox is defined elsewhere; it is expected to yield values
        # usable as slice indices (presumably ints - confirm).
        x1, y1, x2, y2 = utils.reformat_bbox(bbox)
        # Rows are indexed by y and columns by x.
        cropped = self.bytes[y1:y2, x1:x2]
        return pytesseract.image_to_string(cropped, lang='eng')