Skip to content

Commit 94063fe

Browse files
apply fixes
1 parent cdfd59d commit 94063fe

17 files changed

Lines changed: 160 additions & 163 deletions

mindee/extraction/common/extracted_image.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import io
22
from pathlib import Path
3-
from typing import Optional
3+
from typing import Optional, Union
44

55
from PIL import Image
66

@@ -46,7 +46,9 @@ def __init__(
4646
self._page_id = page_id
4747
self._element_id = 0 if element_id is None else element_id
4848

49-
def save_to_file(self, output_path: str, file_format: Optional[str] = None):
49+
def save_to_file(
50+
self, output_path: Union[Path, str], file_format: Optional[str] = None
51+
):
5052
"""
5153
Saves the document to a file.
5254

mindee/extraction/common/image_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def get_file_extension(file_format: str):
116116
def extract_multiple_images_from_source(
117117
input_source: LocalInputSource,
118118
page_id: int,
119-
polygons: Union[List[Polygon], List[List[Point]]],
119+
polygons: List[Union[Polygon, List[Point]]],
120120
) -> List[ExtractedImage]:
121121
"""
122122
Extracts elements from a page based on a list of bounding boxes.

mindee/extraction/pdf_extractor/extracted_pdf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from pathlib import Path
2-
from typing import BinaryIO
2+
from typing import BinaryIO, Union
33

44
import pypdfium2 as pdfium
55

@@ -31,7 +31,7 @@ def write_to_file(self, output_path: str):
3131
"""Deprecated. Use ``save_to_file`` instead."""
3232
self.save_to_file(output_path)
3333

34-
def save_to_file(self, output_path: str):
34+
def save_to_file(self, output_path: Union[Path, str]):
3535
"""
3636
Writes the contents of the current PDF object to a file.
3737

mindee/v2/__init__.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1-
from mindee.v2.file_operations.split import Split
2-
from mindee.v2.file_operations.crop import Crop
1+
from mindee.v2.file_operations.crop import (
2+
extract_crops,
3+
extract_single_crop,
4+
)
5+
from mindee.v2.file_operations.split import extract_splits
36
from mindee.v2.product.classification.classification_parameters import (
47
ClassificationParameters,
58
)
@@ -14,14 +17,16 @@
1417
from mindee.v2.product.split.split_response import SplitResponse
1518

1619
# Public API of ``mindee.v2``; every exported name is listed exactly once.
__all__ = [
    "extract_crops",
    "extract_splits",
    "extract_single_crop",
    "ClassificationResponse",
    "ClassificationParameters",
    "CropResponse",
    "CropParameters",
    "OCRResponse",
    "OCRParameters",
    "SplitResponse",
    "SplitParameters",
]
Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
from mindee.v2.file_operations.crop import Crop
2-
from mindee.v2.file_operations.split import Split
1+
from mindee.v2.file_operations.crop import (
2+
extract_crops,
3+
extract_single_crop,
4+
)
5+
from mindee.v2.file_operations.split import extract_splits
36

4-
__all__ = ["Crop", "Split"]
7+
# Public API of the file-operations package; every exported name is listed exactly once.
__all__ = ["extract_crops", "extract_splits", "extract_single_crop"]

mindee/v2/file_operations/crop.py

Lines changed: 42 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,69 +1,51 @@
1-
from typing import List
1+
from typing import List, Union
22

33
from mindee.error import MindeeError
44
from mindee.extraction import ExtractedImage, extract_multiple_images_from_source
5-
from mindee.geometry import Polygon
5+
from mindee.geometry import Point, Polygon
66
from mindee.input.sources.local_input_source import LocalInputSource
77
from mindee.parsing.v2.field import FieldLocation
8+
from mindee.v2.file_operations.crop_files import CropFiles
89
from mindee.v2.product.crop.crop_box import CropBox
910

1011

11-
class Crop:
12-
"""Crop operations for V2."""
13-
14-
@classmethod
15-
def extract_single_crop(
16-
cls, input_source: LocalInputSource, crop: FieldLocation
17-
) -> ExtractedImage:
18-
"""
19-
Extracts a single crop as complete PDFs from the document.
20-
21-
:param input_source: Local Input Source to extract sub-receipts from.
22-
:param crop: Crop to extract.
23-
:return: ExtractedImage.
24-
"""
25-
26-
return extract_multiple_images_from_source(
27-
input_source, crop.page, [crop.polygon]
28-
)[0]
29-
30-
@classmethod
31-
def extract_crops(
32-
cls, input_source: LocalInputSource, crops: List[CropBox]
33-
) -> List[ExtractedImage]:
34-
"""
35-
Extracts individual receipts from multi-receipts documents.
36-
37-
:param input_source: Local Input Source to extract sub-receipts from.
38-
:param crops: List of crops.
39-
:return: Individual extracted receipts as an array of ExtractedImage.
40-
"""
41-
images: List[ExtractedImage] = []
42-
if not crops:
43-
raise MindeeError("No possible candidates found for Crop extraction.")
44-
polygons: List[List[Polygon]] = [[] for _ in range(input_source.page_count)]
45-
for i, crop in enumerate(crops):
46-
polygons[crop.location.page].append(crop.location.polygon)
47-
for i, polygon in enumerate(polygons):
48-
images.extend(
49-
extract_multiple_images_from_source(
50-
input_source,
51-
i,
52-
polygon,
53-
)
12+
def extract_single_crop(
    input_source: LocalInputSource, crop: FieldLocation
) -> ExtractedImage:
    """
    Extract the image region described by a single field location.

    :param input_source: Local input source to extract the crop from.
    :param crop: Location (page and polygon) of the region to extract.
    :return: The first image returned by the extraction for that polygon.
    """
    # Explicitly typed so the one-element list matches the extractor's parameter type.
    single_polygon: List[Union[Polygon, List[Point]]] = [crop.polygon]
    extracted = extract_multiple_images_from_source(
        input_source, crop.page, single_polygon
    )
    return extracted[0]
25+
26+
27+
def extract_crops(input_source: LocalInputSource, crops: List[CropBox]) -> CropFiles:
    """
    Extract every crop of a document as an individual image.

    :param input_source: Local Input Source to extract the crops from.
    :param crops: List of crop boxes; each one's ``location`` supplies the page
        index and the polygon to extract.
    :return: The extracted images wrapped in a ``CropFiles`` list, grouped by
        ascending page index.
    :raises MindeeError: If ``crops`` is empty.
    """
    if not crops:
        raise MindeeError("No possible candidates found for Crop extraction.")
    # One bucket of polygons per page, so each page is handled by a single call.
    polygons: List[List[Union[Polygon, List[Point]]]] = [
        [] for _ in range(input_source.page_count)
    ]
    for crop in crops:
        polygons[crop.location.page].append(crop.location.polygon)
    images: List[ExtractedImage] = []
    # Pages without any crop are still passed through, with an empty polygon list.
    for page_id, page_polygons in enumerate(polygons):
        images.extend(
            extract_multiple_images_from_source(input_source, page_id, page_polygons)
        )
    return CropFiles(images)
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from pathlib import Path
2+
from typing import List, Union
3+
4+
from mindee.extraction import ExtractedImage
5+
6+
7+
class CropFiles(List[ExtractedImage]):
    """List of extracted crop images, with a helper to write them all to disk."""

    def save_all_to_disk(self, path: Union[Path, str]):
        """
        Save all extracted crops to disk.

        The directory is created (including missing parents) when needed. Files
        are named ``crop_000.jpg``, ``crop_001.jpg``, ... following list order.

        :param path: Directory to save the extracted crops to.
        """
        # ``Path(...)`` accepts both ``str`` and ``Path``, so no type check is needed.
        output_dir = Path(path)
        output_dir.mkdir(parents=True, exist_ok=True)
        for idx, crop in enumerate(self):
            crop.save_to_file(output_dir / f"crop_{idx:03}.jpg")

mindee/v2/file_operations/split.py

Lines changed: 26 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,33 @@
11
from typing import List, Union
22

33
from mindee.error import MindeeError
4-
from mindee.extraction import ExtractedPdf, PdfExtractor
4+
from mindee.extraction import PdfExtractor
55
from mindee.input.sources.local_input_source import LocalInputSource
6+
from mindee.v2.file_operations.split_files import SplitFiles
67
from mindee.v2.product.split.split_range import SplitRange
78

89

9-
class Split:
10-
"""Split operations for V2."""
11-
12-
@classmethod
13-
def extract_splits(
14-
cls,
15-
input_source: LocalInputSource,
16-
splits: Union[List[SplitRange], List[List[int]]],
17-
) -> List[ExtractedPdf]:
18-
"""
19-
Extracts splits as complete PDFs from the document.
20-
21-
:param input_source: Input source to split.
22-
:param splits: List of sub-lists of pages to keep.
23-
:return: A list of extracted invoices.
24-
"""
25-
pdf_extractor = PdfExtractor(input_source)
26-
page_groups = []
27-
for split in splits:
28-
if isinstance(split, SplitRange):
29-
lower_bound = split.page_range[0]
30-
upper_bound = split.page_range[1]
31-
else:
32-
lower_bound = split[0]
33-
upper_bound = split[1]
34-
page_groups.append(list(range(lower_bound, upper_bound + 1)))
35-
if len(splits) < 1:
36-
raise MindeeError("No indexes provided.")
37-
return pdf_extractor.extract_sub_documents(page_groups)
38-
39-
@classmethod
40-
def apply(
41-
cls, input_source: LocalInputSource, splits: List[SplitRange]
42-
) -> List[ExtractedPdf]:
43-
"""Split a document into multiple pages.
44-
45-
:param input_source: Input source to split.
46-
:param splits: List of splits.
47-
"""
48-
49-
return cls.extract_splits(input_source, splits)
10+
def extract_splits(
    input_source: LocalInputSource,
    splits: Union[List[SplitRange], List[List[int]]],
) -> SplitFiles:
    """
    Build one PDF per requested page range of the document.

    :param input_source: Input source to split.
    :param splits: Page ranges to extract, each given either as a ``SplitRange``
        or as a list whose first two items are the inclusive lower and upper
        page bounds.
    :return: The extracted sub-documents, wrapped in a ``SplitFiles`` list.
    :raises MindeeError: If ``splits`` is empty.
    """
    extractor = PdfExtractor(input_source)
    page_groups = []
    for entry in splits:
        # A SplitRange carries its bounds in ``page_range``; a plain list is the bounds.
        bounds = entry.page_range if isinstance(entry, SplitRange) else entry
        first_page = bounds[0]
        last_page = bounds[1]
        # Bounds are inclusive, hence the ``+ 1`` on the upper one.
        page_groups.append(list(range(first_page, last_page + 1)))
    if len(splits) == 0:
        raise MindeeError("No indexes provided.")
    sub_documents = extractor.extract_sub_documents(page_groups)
    return SplitFiles(sub_documents)
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from pathlib import Path
2+
from typing import List, Union
3+
4+
from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
5+
6+
7+
class SplitFiles(List[ExtractedPdf]):
    """List of extracted split PDFs, with a helper to write them all to disk."""

    def save_all_to_disk(self, path: Union[str, Path]):
        """
        Write every extracted split into a directory.

        The directory is created (including missing parents) when needed. Files
        are named ``split_000.pdf``, ``split_001.pdf``, ... following list order.

        :param path: Path to save the extracted splits to
        """
        target_dir = Path(path) if isinstance(path, str) else path
        target_dir.mkdir(parents=True, exist_ok=True)
        for position, extracted_pdf in enumerate(self):
            file_name = f"split_{position:03}.pdf"
            extracted_pdf.save_to_file(target_dir / file_name)

mindee/v2/product/crop/crop_box.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def __init__(self, server_response: StringDict):
1919
def __str__(self) -> str:
2020
return f"* :Location: {self.location}\n :Object Type: {self.object_type}"
2121

22-
def apply_to_file(self, input_source: LocalInputSource) -> ExtractedImage:
22+
def extract_from_file(self, input_source: LocalInputSource) -> ExtractedImage:
2323
"""
2424
Extract this crop box from a file and return a single extracted image.
2525

0 commit comments

Comments
 (0)