Unstructured-IO · pprados · Oct 15, 2024 · Oct 16, 2024 · Oct 24, 2024 · Oct 24, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -85,11 +85,11 @@
 
 ### Fixes
 
+* **Use password** to load password-protected PDFs with all partitioning strategies
 * **V2 elements without first parent ID can be parsed**
 * **Fix missing elements when layout element parsed in V2 ontology**
 * updated **unstructured-inference** to be **0.8.1** in requirements/extra-pdf-image.in
 
-
 ## 0.16.2
 
 ### Enhancements

diff --git a/example-docs/pdf/password.pdf b/example-docs/pdf/password.pdf
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1515,3 +1515,51 @@ def test_document_to_element_list_sets_category_depth_titles():
     assert elements[1].metadata.category_depth == 2
     assert elements[2].metadata.category_depth is None
     assert elements[3].metadata.category_depth == 0
+
+@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
+@pytest.mark.parametrize(
+    ("strategy", "origin"),
+    # NOTE(review): `origin` (presumably the expected detection origins) is never
+    # asserted in the test body -- confirm and either add a check or drop it.
+    [
+        (PartitionStrategy.FAST, {"pdfminer"}),
+        (PartitionStrategy.HI_RES, {"yolox", "pdfminer", "ocr_tesseract"}),
+        (PartitionStrategy.OCR_ONLY, {"ocr_tesseract"}),
+    ],
+)
+def test_partition_pdf_with_password(
+    file_mode,
+    strategy,
+    origin,
+    filename=example_doc_path("pdf/password.pdf"),
+):
+    """A password-protected PDF is partitioned for every strategy and input mode.
+
+    `file_mode` selects how the document is handed to `partition_pdf`: by path
+    ("filename"), as an open binary file ("rb"), or as a `SpooledTemporaryFile`
+    ("spool").
+    """
+
+    def _test(result):
+        # The fixture is expected to yield exactly one element with this text.
+        assert len(result) == 1
+        assert result[0].text == "File with password"
+
+    if file_mode == "filename":
+        result = pdf.partition_pdf(
+            filename=filename, strategy=strategy, password="password"
+        )
+        _test(result)
+    elif file_mode == "rb":
+        with open(filename, "rb") as f:
+            result = pdf.partition_pdf(file=f, strategy=strategy, password="password")
+            _test(result)
+    else:
+        # Context managers ensure both the source file and the spooled copy are closed.
+        with open(filename, "rb") as test_file, SpooledTemporaryFile() as spooled_temp_file:
+            spooled_temp_file.write(test_file.read())
+            spooled_temp_file.seek(0)
+            result = pdf.partition_pdf(
+                file=spooled_temp_file, strategy=strategy, password="password"
+            )
+            _test(result)
diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py
@@ -32,6 +32,7 @@ def partition_image(
     starting_page_number: int = 1,
     extract_forms: bool = False,
     form_extraction_skip_tables: bool = True,
+    password:Optional[str]=None,
     **kwargs: Any,
 ) -> list[Element]:
     """Parses an image into a list of interpreted elements.
@@ -91,6 +92,8 @@ def partition_image(
         (results in adding FormKeysValues elements to output).
     form_extraction_skip_tables
         Whether the form extraction logic should ignore regions designated as Tables.
+    password
+        The password to decrypt the PDF file.
     """
     exactly_one(filename=filename, file=file)
 
@@ -113,5 +116,6 @@ def partition_image(
         starting_page_number=starting_page_number,
         extract_forms=extract_forms,
         form_extraction_skip_tables=form_extraction_skip_tables,
+        password=password,
         **kwargs,
     )
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -12,7 +12,8 @@
 import numpy as np
 import wrapt
 from pdfminer import psparser
-from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
+from pdfminer.layout import LTChar, LTContainer, LTImage, LTItem, LTTextBox
+from pdfminer.pdftypes import PDFObjRef
 from pdfminer.utils import open_filename
 from pi_heif import register_heif_opener
 from PIL import Image as PILImage
@@ -142,6 +143,7 @@ def partition_pdf(
     starting_page_number: int = 1,
     extract_forms: bool = False,
     form_extraction_skip_tables: bool = True,
+    password: Optional[str] = None,
     **kwargs: Any,
 ) -> list[Element]:
     """Parses a pdf document into a list of interpreted elements.
@@ -222,6 +224,7 @@ def partition_pdf(
         starting_page_number=starting_page_number,
         extract_forms=extract_forms,
         form_extraction_skip_tables=form_extraction_skip_tables,
+        password=password,
         **kwargs,
     )
 
@@ -243,6 +246,7 @@ def partition_pdf_or_image(
     starting_page_number: int = 1,
     extract_forms: bool = False,
     form_extraction_skip_tables: bool = True,
+    password: Optional[str] = None,
     **kwargs: Any,
 ) -> list[Element]:
     """Parses a pdf or image document into a list of interpreted elements."""
@@ -271,6 +275,7 @@ def partition_pdf_or_image(
                 languages=languages,
                 metadata_last_modified=metadata_last_modified or last_modified,
                 starting_page_number=starting_page_number,
+                password=password,
                 **kwargs,
             )
             pdf_text_extractable = any(
@@ -320,6 +325,7 @@ def partition_pdf_or_image(
                 starting_page_number=starting_page_number,
                 extract_forms=extract_forms,
                 form_extraction_skip_tables=form_extraction_skip_tables,
+                password=password,
                 **kwargs,
             )
             out_elements = _process_uncategorized_text_elements(elements)
@@ -345,6 +351,7 @@ def partition_pdf_or_image(
                 is_image=is_image,
                 metadata_last_modified=metadata_last_modified or last_modified,
                 starting_page_number=starting_page_number,
+                password=password,
                 **kwargs,
             )
             out_elements = _process_uncategorized_text_elements(elements)
@@ -358,6 +365,7 @@ def extractable_elements(
     languages: Optional[list[str]] = None,
     metadata_last_modified: Optional[str] = None,
     starting_page_number: int = 1,
+    password:Optional[str] = None,
     **kwargs: Any,
 ) -> list[list[Element]]:
     if isinstance(file, bytes):
@@ -368,6 +376,7 @@ def extractable_elements(
         languages=languages,
         metadata_last_modified=metadata_last_modified,
         starting_page_number=starting_page_number,
+        password=password,
         **kwargs,
     )
 
@@ -378,6 +387,7 @@ def _partition_pdf_with_pdfminer(
     languages: list[str],
     metadata_last_modified: Optional[str],
     starting_page_number: int = 1,
+    password:Optional[str] = None,
     **kwargs: Any,
 ) -> list[list[Element]]:
     """Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
@@ -401,6 +411,7 @@ def _partition_pdf_with_pdfminer(
                 languages=languages,
                 metadata_last_modified=metadata_last_modified,
                 starting_page_number=starting_page_number,
+                password=password,
                 **kwargs,
             )
 
@@ -411,6 +422,7 @@ def _partition_pdf_with_pdfminer(
             languages=languages,
             metadata_last_modified=metadata_last_modified,
             starting_page_number=starting_page_number,
+            password=password,
             **kwargs,
         )
 
@@ -425,14 +437,16 @@ def _process_pdfminer_pages(
     metadata_last_modified: Optional[str],
     annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
     starting_page_number: int = 1,
+    password: Optional[str] = None,
     **kwargs,
 ) -> list[list[Element]]:
     """Uses PDFMiner to split a document into pages and process them."""
 
     elements = []
 
     for page_number, (page, page_layout) in enumerate(
-        open_pdfminer_pages_generator(fp), start=starting_page_number
+        open_pdfminer_pages_generator(fp, password=password),
+            start=starting_page_number,
     ):
         width, height = page_layout.width, page_layout.height
 
@@ -554,6 +568,7 @@ def _partition_pdf_or_image_local(
     extract_forms: bool = False,
     form_extraction_skip_tables: bool = True,
     pdf_hi_res_max_pages: Optional[int] = None,
+    password:Optional[str] = None,
     **kwargs: Any,
 ) -> list[Element]:
     """Partition using package installed locally"""
@@ -590,10 +605,12 @@ def _partition_pdf_or_image_local(
             is_image=is_image,
             model_name=hi_res_model_name,
             pdf_image_dpi=pdf_image_dpi,
+            password=password,
         )
 
         extracted_layout, layouts_links = (
-            process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
+            process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi,
+                                       password=password)
             if pdf_text_extractable
             else ([], [])
         )
@@ -633,20 +650,22 @@ def _partition_pdf_or_image_local(
             ocr_mode=ocr_mode,
             pdf_image_dpi=pdf_image_dpi,
             ocr_layout_dumper=ocr_layout_dumper,
+            password=password,
         )
     else:
         inferred_document_layout = process_data_with_model(
             file,
             is_image=is_image,
             model_name=hi_res_model_name,
             pdf_image_dpi=pdf_image_dpi,
+            password=password,
         )
 
         if hasattr(file, "seek"):
             file.seek(0)
 
         extracted_layout, layouts_links = (
-            process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
+            process_data_with_pdfminer(file=file, dpi=pdf_image_dpi, password=password)
             if pdf_text_extractable
             else ([], [])
         )
@@ -688,6 +707,7 @@ def _partition_pdf_or_image_local(
             ocr_mode=ocr_mode,
             pdf_image_dpi=pdf_image_dpi,
             ocr_layout_dumper=ocr_layout_dumper,
+            password=password,
         )
 
     final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
@@ -834,6 +854,7 @@ def _partition_pdf_or_image_with_ocr(
     is_image: bool = False,
     metadata_last_modified: Optional[str] = None,
     starting_page_number: int = 1,
+    password: Optional[str] = None,
     **kwargs: Any,
 ):
     """Partitions an image or PDF using OCR. For PDFs, each page is converted
@@ -858,7 +879,8 @@ def _partition_pdf_or_image_with_ocr(
             elements.extend(page_elements)
     else:
         for page_number, image in enumerate(
-            convert_pdf_to_images(filename, file), start=starting_page_number
+            convert_pdf_to_images(filename, file, password=password),
+                start=starting_page_number
         ):
             page_elements = _partition_pdf_or_image_with_ocr_from_image(
                 image=image,
@@ -1142,6 +1164,24 @@ def document_to_element_list(
                 page_elements.extend(element)
                 translation_mapping.extend([(layout_element, el) for el in element])
                 continue
+
+                # TODO(klaijan) - isalnum() only works with A-Z, a-z and 0-9
+                # will need to switch to some pattern matching once we support more languages
+                if not word:
+                    isalnum = char.isalnum()
+                if word and char.isalnum() != isalnum:
+                    isalnum = char.isalnum()
+                    words.append(
+                        {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index},
+                    )
+                    word = ""
+
+                if len(word) == 0:
+                    start_index = text_len + index
+                    x1 = character.x0
+                    y2 = height - character.y0
+                    x2 = character.x1
+                    y1 = height - character.y1
             else:
 
                 element.metadata.links = (

diff --git a/unstructured/partition/pdf_image/analysis/bbox_visualisation.py b/unstructured/partition/pdf_image/analysis/bbox_visualisation.py
@@ -546,6 +546,7 @@ def __init__(
         draw_grid: bool = False,
         resize: Optional[float] = None,
         format: str = "png",
+        password: Optional[str] = None,
     ):
         self.draw_caption = draw_caption
         self.draw_grid = draw_grid
@@ -554,6 +555,7 @@ def __init__(
         self.format = format
         self.drawers = []
         self.file = file
+        self.password = password
 
         super().__init__(filename, save_dir)
 
@@ -678,6 +680,7 @@ def load_source_image(self) -> Generator[Image.Image, None, None]:
                         file=self.file,
                         output_folder=temp_dir,
                         path_only=True,
+                        password=self.password,
                     )
                 except Exception as ex:  # noqa: E722
                     print(

diff --git a/unstructured/partition/pdf_image/analysis/tools.py b/unstructured/partition/pdf_image/analysis/tools.py
@@ -66,6 +66,7 @@ def save_analysis_artifiacts(
     draw_caption: bool = True,
     resize: Optional[float] = None,
     format: str = "png",
+    password: Optional[str] = None,
 ):
     """Save the analysis artifacts for a given file. Loads some settings from
     the environment configuration.
@@ -82,6 +83,7 @@ def save_analysis_artifiacts(
         draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source)
         resize: Output image resize value. If not provided, the image will not be resized.
         format: The format for analyzed pages with bboxes drawn on them. Default is 'png'.
+        password (optional): The password to decrypt the PDF file.
     """
     if not filename:
         filename = _generate_filename(is_image)
@@ -109,6 +111,7 @@ def save_analysis_artifiacts(
             draw_caption=draw_caption,
             resize=resize,
             format=format,
+            password=password,
         )
 
         for layout_dumper in layout_dumpers:
@@ -125,6 +128,7 @@ def render_bboxes_for_file(
     draw_caption: bool = True,
     resize: Optional[float] = None,
     format: str = "png",
+    password: Optional[str] = None,
 ):
     """Render the bounding boxes for a given layout dimp file.
     To be used for analysis after the partition is performed for
@@ -144,6 +148,7 @@ def render_bboxes_for_file(
         draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source)
         resize: Output image resize value. If not provided, the image will not be resized.
         format: The format for analyzed pages with bboxes drawn on them. Default is 'png'.
+        password (optional): The password to decrypt the PDF file.
     """
     filename_stem = Path(filename).stem
     is_image = not Path(filename).suffix.endswith("pdf")
@@ -183,6 +188,7 @@ def render_bboxes_for_file(
             draw_caption=draw_caption,
             resize=resize,
             format=format,
+            password=password,
         )
 
         for drawer in layout_drawers: