diff --git a/CHANGELOG.md b/CHANGELOG.md
index cfd75988dc..1aacfa1067 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -85,11 +85,11 @@
 
 ### Fixes
 
+* **Use password** to load PDF with all modes
 * **V2 elements without first parent ID can be parsed**
 * **Fix missing elements when layout element parsed in V2 ontology**
 * updated **unstructured-inference** to be **0.8.1** in requirements/extra-pdf-image.in
 
-
 ## 0.16.2
 
 ### Enhancements
diff --git a/example-docs/pdf/password.pdf b/example-docs/pdf/password.pdf
new file mode 100644
index 0000000000..21bd55d500
Binary files /dev/null and b/example-docs/pdf/password.pdf differ
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index cea6b44129..64799f80b1 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1515,3 +1515,44 @@ def test_document_to_element_list_sets_category_depth_titles():
     assert elements[1].metadata.category_depth == 2
     assert elements[2].metadata.category_depth is None
     assert elements[3].metadata.category_depth == 0
+
+
+@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
+@pytest.mark.parametrize(
+    ("strategy", "origin"),
+    # NOTE: `origin` is carried along with each strategy but is not asserted on by this test;
+    # each strategy is listed exactly once so no combination is executed twice.
+    [
+        (PartitionStrategy.FAST, {"pdfminer"}),
+        (PartitionStrategy.HI_RES, {"yolox", "pdfminer", "ocr_tesseract"}),
+        (PartitionStrategy.OCR_ONLY, {"ocr_tesseract"}),
+    ],
+)
+def test_partition_pdf_with_password(
+    file_mode,
+    strategy,
+    origin,
+    filename=example_doc_path("pdf/password.pdf"),
+):
+    """partition_pdf decrypts a password-protected PDF for every strategy and input mode."""
+
+    def _assert_decrypted(result):
+        # The encrypted fixture must partition into exactly one element with the known text.
+        assert len(result) == 1
+        assert result[0].text == "File with password"
+
+    if file_mode == "filename":
+        result = pdf.partition_pdf(filename=filename, strategy=strategy, password="password")
+        _assert_decrypted(result)
+    elif file_mode == "rb":
+        with open(filename, "rb") as f:
+            result = pdf.partition_pdf(file=f, strategy=strategy, password="password")
+        _assert_decrypted(result)
+    else:
+        with open(filename, "rb") as test_file, SpooledTemporaryFile() as spooled_temp_file:
+            spooled_temp_file.write(test_file.read())
+            spooled_temp_file.seek(0)
+            result = pdf.partition_pdf(
+                file=spooled_temp_file, strategy=strategy, password="password"
+            )
+        _assert_decrypted(result)
diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py
index 50ceaa1187..489e2cb6b1 100644
--- a/unstructured/partition/image.py
+++ b/unstructured/partition/image.py
@@ -32,6 +32,7 @@ def partition_image(
     starting_page_number: int = 1,
     extract_forms: bool = False,
     form_extraction_skip_tables: bool = True,
+    password: Optional[str] = None,
     **kwargs: Any,
 ) -> list[Element]:
     """Parses an image into a list of interpreted elements.
@@ -91,6 +92,8 @@ def partition_image(
         (results in adding FormKeysValues elements to output).
     form_extraction_skip_tables
         Whether the form extraction logic should ignore regions designated as Tables.
+    password
+        The password to decrypt the PDF file.
""" exactly_one(filename=filename, file=file) @@ -113,5 +116,6 @@ def partition_image( starting_page_number=starting_page_number, extract_forms=extract_forms, form_extraction_skip_tables=form_extraction_skip_tables, + password=password, **kwargs, ) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index f87812d40b..206c3156c8 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -12,7 +12,8 @@ import numpy as np import wrapt from pdfminer import psparser -from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox +from pdfminer.layout import LTChar, LTContainer, LTImage, LTItem, LTTextBox +from pdfminer.pdftypes import PDFObjRef from pdfminer.utils import open_filename from pi_heif import register_heif_opener from PIL import Image as PILImage @@ -142,6 +143,7 @@ def partition_pdf( starting_page_number: int = 1, extract_forms: bool = False, form_extraction_skip_tables: bool = True, + password: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Parses a pdf document into a list of interpreted elements. 
@@ -222,6 +224,7 @@ def partition_pdf( starting_page_number=starting_page_number, extract_forms=extract_forms, form_extraction_skip_tables=form_extraction_skip_tables, + password=password, **kwargs, ) @@ -243,6 +246,7 @@ def partition_pdf_or_image( starting_page_number: int = 1, extract_forms: bool = False, form_extraction_skip_tables: bool = True, + password: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Parses a pdf or image document into a list of interpreted elements.""" @@ -271,6 +275,7 @@ def partition_pdf_or_image( languages=languages, metadata_last_modified=metadata_last_modified or last_modified, starting_page_number=starting_page_number, + password=password, **kwargs, ) pdf_text_extractable = any( @@ -320,6 +325,7 @@ def partition_pdf_or_image( starting_page_number=starting_page_number, extract_forms=extract_forms, form_extraction_skip_tables=form_extraction_skip_tables, + password=password, **kwargs, ) out_elements = _process_uncategorized_text_elements(elements) @@ -345,6 +351,7 @@ def partition_pdf_or_image( is_image=is_image, metadata_last_modified=metadata_last_modified or last_modified, starting_page_number=starting_page_number, + password=password, **kwargs, ) out_elements = _process_uncategorized_text_elements(elements) @@ -358,6 +365,7 @@ def extractable_elements( languages: Optional[list[str]] = None, metadata_last_modified: Optional[str] = None, starting_page_number: int = 1, + password:Optional[str] = None, **kwargs: Any, ) -> list[list[Element]]: if isinstance(file, bytes): @@ -368,6 +376,7 @@ def extractable_elements( languages=languages, metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, + password=password, **kwargs, ) @@ -378,6 +387,7 @@ def _partition_pdf_with_pdfminer( languages: list[str], metadata_last_modified: Optional[str], starting_page_number: int = 1, + password:Optional[str] = None, **kwargs: Any, ) -> list[list[Element]]: """Partitions a PDF using PDFMiner instead of using a 
layoutmodel. Used for faster @@ -401,6 +411,7 @@ def _partition_pdf_with_pdfminer( languages=languages, metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, + password=password, **kwargs, ) @@ -411,6 +422,7 @@ def _partition_pdf_with_pdfminer( languages=languages, metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, + password=password, **kwargs, ) @@ -425,6 +437,7 @@ def _process_pdfminer_pages( metadata_last_modified: Optional[str], annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD, starting_page_number: int = 1, + password: Optional[str] = None, **kwargs, ) -> list[list[Element]]: """Uses PDFMiner to split a document into pages and process them.""" @@ -432,7 +445,8 @@ def _process_pdfminer_pages( elements = [] for page_number, (page, page_layout) in enumerate( - open_pdfminer_pages_generator(fp), start=starting_page_number + open_pdfminer_pages_generator(fp, password=password), + start=starting_page_number, ): width, height = page_layout.width, page_layout.height @@ -554,6 +568,7 @@ def _partition_pdf_or_image_local( extract_forms: bool = False, form_extraction_skip_tables: bool = True, pdf_hi_res_max_pages: Optional[int] = None, + password:Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Partition using package installed locally""" @@ -590,10 +605,12 @@ def _partition_pdf_or_image_local( is_image=is_image, model_name=hi_res_model_name, pdf_image_dpi=pdf_image_dpi, + password=password, ) extracted_layout, layouts_links = ( - process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi) + process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi, + password=password) if pdf_text_extractable else ([], []) ) @@ -633,6 +650,7 @@ def _partition_pdf_or_image_local( ocr_mode=ocr_mode, pdf_image_dpi=pdf_image_dpi, ocr_layout_dumper=ocr_layout_dumper, + password=password, ) else: inferred_document_layout = process_data_with_model( @@ -640,13 +658,14 @@ 
def _partition_pdf_or_image_local( is_image=is_image, model_name=hi_res_model_name, pdf_image_dpi=pdf_image_dpi, + password=password, ) if hasattr(file, "seek"): file.seek(0) extracted_layout, layouts_links = ( - process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) + process_data_with_pdfminer(file=file, dpi=pdf_image_dpi, password=password) if pdf_text_extractable else ([], []) ) @@ -688,6 +707,7 @@ def _partition_pdf_or_image_local( ocr_mode=ocr_mode, pdf_image_dpi=pdf_image_dpi, ocr_layout_dumper=ocr_layout_dumper, + password=password, ) final_document_layout = clean_pdfminer_inner_elements(final_document_layout) @@ -834,6 +854,7 @@ def _partition_pdf_or_image_with_ocr( is_image: bool = False, metadata_last_modified: Optional[str] = None, starting_page_number: int = 1, + password: Optional[str] = None, **kwargs: Any, ): """Partitions an image or PDF using OCR. For PDFs, each page is converted @@ -858,7 +879,8 @@ def _partition_pdf_or_image_with_ocr( elements.extend(page_elements) else: for page_number, image in enumerate( - convert_pdf_to_images(filename, file), start=starting_page_number + convert_pdf_to_images(filename, file, password=password), + start=starting_page_number ): page_elements = _partition_pdf_or_image_with_ocr_from_image( image=image, @@ -1142,6 +1164,24 @@ def document_to_element_list( page_elements.extend(element) translation_mapping.extend([(layout_element, el) for el in element]) continue + + # TODO(klaijan) - isalnum() only works with A-Z, a-z and 0-9 + # will need to switch to some pattern matching once we support more languages + if not word: + isalnum = char.isalnum() + if word and char.isalnum() != isalnum: + isalnum = char.isalnum() + words.append( + {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index}, + ) + word = "" + + if len(word) == 0: + start_index = text_len + index + x1 = character.x0 + y2 = height - character.y0 + x2 = character.x1 + y1 = height - character.y1 else: element.metadata.links = ( diff --git 
a/unstructured/partition/pdf_image/analysis/bbox_visualisation.py b/unstructured/partition/pdf_image/analysis/bbox_visualisation.py index ecd7f722bf..b3752b0f22 100644 --- a/unstructured/partition/pdf_image/analysis/bbox_visualisation.py +++ b/unstructured/partition/pdf_image/analysis/bbox_visualisation.py @@ -546,6 +546,7 @@ def __init__( draw_grid: bool = False, resize: Optional[float] = None, format: str = "png", + password: Optional[str] = None, ): self.draw_caption = draw_caption self.draw_grid = draw_grid @@ -554,6 +555,7 @@ def __init__( self.format = format self.drawers = [] self.file = file + self.password = password super().__init__(filename, save_dir) @@ -678,6 +680,7 @@ def load_source_image(self) -> Generator[Image.Image, None, None]: file=self.file, output_folder=temp_dir, path_only=True, + password=self.password, ) except Exception as ex: # noqa: E722 print( diff --git a/unstructured/partition/pdf_image/analysis/tools.py b/unstructured/partition/pdf_image/analysis/tools.py index 3000f08db6..ba42a432a9 100644 --- a/unstructured/partition/pdf_image/analysis/tools.py +++ b/unstructured/partition/pdf_image/analysis/tools.py @@ -66,6 +66,7 @@ def save_analysis_artifiacts( draw_caption: bool = True, resize: Optional[float] = None, format: str = "png", + password: Optional[str] = None, ): """Save the analysis artifacts for a given file. Loads some settings from the environment configuration. @@ -82,6 +83,7 @@ def save_analysis_artifiacts( draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source) resize: Output image resize value. If not provided, the image will not be resized. format: The format for analyzed pages with bboxes drawn on them. Default is 'png'. + password (optional): The password to decrypt the PDF file. 
""" if not filename: filename = _generate_filename(is_image) @@ -109,6 +111,7 @@ def save_analysis_artifiacts( draw_caption=draw_caption, resize=resize, format=format, + password=password, ) for layout_dumper in layout_dumpers: @@ -125,6 +128,7 @@ def render_bboxes_for_file( draw_caption: bool = True, resize: Optional[float] = None, format: str = "png", + password: Optional[str] = None, ): """Render the bounding boxes for a given layout dimp file. To be used for analysis after the partition is performed for @@ -144,6 +148,7 @@ def render_bboxes_for_file( draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source) resize: Output image resize value. If not provided, the image will not be resized. format: The format for analyzed pages with bboxes drawn on them. Default is 'png'. + password (optional): The password to decrypt the PDF file. """ filename_stem = Path(filename).stem is_image = not Path(filename).suffix.endswith("pdf") @@ -183,6 +188,7 @@ def render_bboxes_for_file( draw_caption=draw_caption, resize=resize, format=format, + password=password, ) for drawer in layout_drawers: diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index f6b81dd2e4..92df9a701f 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -37,6 +37,7 @@ def process_data_with_ocr( ocr_mode: str = OCRMode.FULL_PAGE.value, pdf_image_dpi: int = 200, ocr_layout_dumper: Optional[OCRLayoutDumper] = None, + password: Optional[str] = None, ) -> "DocumentLayout": """ Process OCR data from a given data and supplement the output DocumentLayout @@ -64,6 +65,8 @@ def process_data_with_ocr( - ocr_layout_dumper (OCRLayoutDumper, optional): The OCR layout dumper to save the OCR layout. + - password (optional): The password to decrypt the PDF file. + Returns: DocumentLayout: The merged layout information obtained after OCR processing. 
""" @@ -84,6 +87,7 @@ def process_data_with_ocr( ocr_mode=ocr_mode, pdf_image_dpi=pdf_image_dpi, ocr_layout_dumper=ocr_layout_dumper, + password=password, ) return merged_layouts @@ -100,6 +104,7 @@ def process_file_with_ocr( ocr_mode: str = OCRMode.FULL_PAGE.value, pdf_image_dpi: int = 200, ocr_layout_dumper: Optional[OCRLayoutDumper] = None, + password: Optional[str] = None, ) -> "DocumentLayout": """ Process OCR data from a given file and supplement the output DocumentLayout @@ -124,6 +129,8 @@ def process_file_with_ocr( - pdf_image_dpi (int, optional): DPI (dots per inch) for processing PDF images. Defaults to 200. + - password (optional): The password to decrypt the PDF file. + Returns: DocumentLayout: The merged layout information obtained after OCR processing. """ @@ -157,6 +164,7 @@ def process_file_with_ocr( dpi=pdf_image_dpi, output_folder=temp_dir, paths_only=True, + userpw=password, ) image_paths = cast(List[str], _image_paths) for i, image_path in enumerate(image_paths): diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py index a809c7f76d..fa24f7a537 100644 --- a/unstructured/partition/pdf_image/pdf_image_utils.py +++ b/unstructured/partition/pdf_image/pdf_image_utils.py @@ -58,6 +58,7 @@ def convert_pdf_to_image( dpi: int = 200, output_folder: Optional[Union[str, PurePath]] = None, path_only: bool = False, + password:Optional[str] = None, ) -> Union[List[Image.Image], List[str]]: """Get the image renderings of the pdf pages using pdf2image""" @@ -71,6 +72,7 @@ def convert_pdf_to_image( dpi=dpi, output_folder=output_folder, paths_only=path_only, + userpw=password, ) else: images = pdf2image.convert_from_path( @@ -125,6 +127,7 @@ def save_elements( is_image: bool = False, extract_image_block_to_payload: bool = False, output_dir_path: str | None = None, + password:Optional[str] = None, ): """ Saves specific elements from a PDF as images either to a directory or embeds them in the @@ 
-167,6 +170,7 @@ def save_elements( pdf_image_dpi, output_folder=temp_dir, path_only=True, + password=password, ) image_paths = cast(List[str], _image_paths) @@ -389,15 +393,16 @@ def convert_pdf_to_images( filename: str = "", file: Optional[bytes | IO[bytes]] = None, chunk_size: int = 10, + password:Optional[str] = None, ) -> Iterator[Image.Image]: # Convert a PDF in small chunks of pages at a time (e.g. 1-10, 11-20... and so on) exactly_one(filename=filename, file=file) if file is not None: f_bytes = convert_to_bytes(file) - info = pdf2image.pdfinfo_from_bytes(f_bytes) + info = pdf2image.pdfinfo_from_bytes(f_bytes, userpw=password) else: f_bytes = None - info = pdf2image.pdfinfo_from_path(filename) + info = pdf2image.pdfinfo_from_path(filename, userpw=password) total_pages = info["Pages"] for start_page in range(1, total_pages + 1, chunk_size): @@ -407,12 +412,14 @@ def convert_pdf_to_images( f_bytes, first_page=start_page, last_page=end_page, + userpw=password, ) else: chunk_images = pdf2image.convert_from_path( filename, first_page=start_page, last_page=end_page, + userpw=password, ) for image in chunk_images: diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 91a3e689f2..69974870e6 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -35,12 +35,14 @@ def process_file_with_pdfminer( filename: str = "", dpi: int = 200, + password: Optional[str] = None, ) -> tuple[List[List["TextRegion"]], List[List]]: with open_filename(filename, "rb") as fp: fp = cast(BinaryIO, fp) extracted_layout, layouts_links = process_data_with_pdfminer( file=fp, dpi=dpi, + password=password, ) return extracted_layout, layouts_links @@ -49,6 +51,7 @@ def process_file_with_pdfminer( def process_data_with_pdfminer( file: Optional[Union[bytes, BinaryIO]] = None, dpi: int = 200, + password:Optional[str]=None, ) -> 
tuple[List[List["TextRegion"]], List[List]]: """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the pdf pages using pdf2image""" @@ -62,7 +65,8 @@ def process_data_with_pdfminer( layouts_links = [] # Coefficient to rescale bounding box to be compatible with images coef = dpi / 72 - for page_number, (page, page_layout) in enumerate(open_pdfminer_pages_generator(file)): + for page_number, (page, page_layout) in ( + enumerate(open_pdfminer_pages_generator(file, password=password))): width, height = page_layout.width, page_layout.height text_layout = [] diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index 23332745e6..0c1ed0ca05 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -1,6 +1,6 @@ import os import tempfile -from typing import BinaryIO, List, Tuple +from typing import BinaryIO, List, Tuple, Optional from pdfminer.converter import PDFPageAggregator from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine @@ -73,6 +73,7 @@ def rect_to_bbox( @requires_dependencies(["pikepdf", "pypdf"]) def open_pdfminer_pages_generator( fp: BinaryIO, + password:Optional[str]=None, ): """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.""" @@ -84,7 +85,7 @@ def open_pdfminer_pages_generator( with tempfile.TemporaryDirectory() as tmp_dir_path: tmp_file_path = os.path.join(tmp_dir_path, "tmp_file") try: - pages = PDFPage.get_pages(fp) + pages = PDFPage.get_pages(fp, password=password) # Detect invalid dictionary construct for entire PDF for i, page in enumerate(pages): try: