diff options
author | 2022-11-13 23:46:45 +0530 | |
---|---|---|
committer | 2022-11-13 23:46:45 +0530 | |
commit | 9468226a9e2e2ab8cdd599f1d8538e860ca86120 (patch) | |
tree | 0a77ada226d6db80639f96b438bf83e4e756edb5 /env/lib/python3.10/site-packages/pikepdf/_methods.py | |
download | idcard-9468226a9e2e2ab8cdd599f1d8538e860ca86120.tar.gz idcard-9468226a9e2e2ab8cdd599f1d8538e860ca86120.tar.bz2 idcard-9468226a9e2e2ab8cdd599f1d8538e860ca86120.zip |
id card generator
Diffstat (limited to 'env/lib/python3.10/site-packages/pikepdf/_methods.py')
-rw-r--r-- | env/lib/python3.10/site-packages/pikepdf/_methods.py | 1340 |
1 files changed, 1340 insertions, 0 deletions
diff --git a/env/lib/python3.10/site-packages/pikepdf/_methods.py b/env/lib/python3.10/site-packages/pikepdf/_methods.py new file mode 100644 index 0000000..25e1d95 --- /dev/null +++ b/env/lib/python3.10/site-packages/pikepdf/_methods.py @@ -0,0 +1,1340 @@ +# SPDX-FileCopyrightText: 2022 James R. Barlow +# SPDX-License-Identifier: MPL-2.0 + +"""Implement some features in Python and monkey-patch them onto C++ classes. + +In several cases the implementation of some higher levels features might as +well be in Python. Fortunately we can attach Python methods to C++ class +bindings after the fact. + +We can also move the implementation to C++ if desired. +""" + +from __future__ import annotations + +import datetime +import mimetypes +import shutil +from collections.abc import KeysView, MutableMapping +from decimal import Decimal +from io import BytesIO +from pathlib import Path +from subprocess import run +from tempfile import NamedTemporaryFile +from typing import BinaryIO, Callable, ItemsView, Iterator, TypeVar, ValuesView +from warnings import warn + +from . 
import Array, Dictionary, Name, Object, Page, Pdf, Stream +from ._augments import augment_override_cpp, augments +from ._qpdf import ( + AccessMode, + AttachedFile, + AttachedFileSpec, + Attachments, + NameTree, + NumberTree, + ObjectStreamMode, + Rectangle, + StreamDecodeLevel, + StreamParser, + Token, + _ObjectMapping, +) +from .models import Encryption, EncryptionInfo, Outline, PdfMetadata, Permissions +from .models.metadata import decode_pdf_date, encode_pdf_date + +# pylint: disable=no-member,unsupported-membership-test,unsubscriptable-object +# mypy: ignore-errors + +__all__ = [] + +Numeric = TypeVar('Numeric', int, float, Decimal) + + +def _single_page_pdf(page) -> bytes: + """Construct a single page PDF from the provided page in memory.""" + pdf = Pdf.new() + pdf.pages.append(page) + bio = BytesIO() + pdf.save(bio) + bio.seek(0) + return bio.read() + + +def _mudraw(buffer, fmt) -> bytes: + """Use mupdf draw to rasterize the PDF in the memory buffer.""" + # mudraw cannot read from stdin so NamedTemporaryFile is required + with NamedTemporaryFile(suffix='.pdf') as tmp_in: + tmp_in.write(buffer) + tmp_in.seek(0) + tmp_in.flush() + + proc = run( + ['mudraw', '-F', fmt, '-o', '-', tmp_in.name], + capture_output=True, + check=True, + ) + return proc.stdout + + +@augments(Object) +class Extend_Object: + def _ipython_key_completions_(self): + if isinstance(self, (Dictionary, Stream)): + return self.keys() + return None + + def emplace(self, other: Object, retain=(Name.Parent,)): + """Copy all items from other without making a new object. + + Particularly when working with pages, it may be desirable to remove all + of the existing page's contents and emplace (insert) a new page on top + of it, in a way that preserves all links and references to the original + page. (Or similarly, for other Dictionary objects in a PDF.) + + Any Dictionary keys in the iterable *retain* are preserved. By default, + /Parent is retained. 
+ + When a page is assigned (``pdf.pages[0] = new_page``), only the + application knows if references to the original page are + still valid. For example, a PDF optimizer might restructure a page + object into another visually similar one, and references would be valid; + but for a program that reorganizes page contents such as an N-up + compositor, references may not be valid anymore. + + This method takes precautions to ensure that child objects in common + with ``self`` and ``other`` are not inadvertently deleted. + + Example: + >>> pdf.pages[0].objgen + (16, 0) + >>> pdf.pages[0].emplace(pdf.pages[1]) + >>> pdf.pages[0].objgen + (16, 0) # Same object + + .. versionchanged:: 2.11.1 + Added the *retain* argument. + """ + if not self.same_owner_as(other): + raise TypeError("Objects must have the same owner for emplace()") + + # .keys() returns strings, so make all strings + retain = {str(k) for k in retain} + self_keys = set(self.keys()) + other_keys = set(other.keys()) + + assert all(isinstance(k, str) for k in (retain | self_keys | other_keys)) + + del_keys = self_keys - other_keys - retain + for k in (k for k in other_keys if k not in retain): + self[k] = other[k] # pylint: disable=unsupported-assignment-operation + for k in del_keys: + del self[k] # pylint: disable=unsupported-delete-operation + + def _type_check_write(self, filter_, decode_parms): + if isinstance(filter_, list): + filter_ = Array(filter_) + filter_ = filter_.wrap_in_array() + + if isinstance(decode_parms, list): + decode_parms = Array(decode_parms) + elif decode_parms is None: + decode_parms = Array([]) + else: + decode_parms = decode_parms.wrap_in_array() + + if not all(isinstance(item, Name) for item in filter_): + raise TypeError( + "filter must be: pikepdf.Name or pikepdf.Array([pikepdf.Name])" + ) + if not all( + (isinstance(item, Dictionary) or item is None) for item in decode_parms + ): + raise TypeError( + "decode_parms must be: pikepdf.Dictionary or " + 
"pikepdf.Array([pikepdf.Dictionary])" + ) + if len(decode_parms) != 0 and len(filter_) != len(decode_parms): + raise ValueError( + f"filter ({repr(filter_)}) and decode_parms " + f"({repr(decode_parms)}) must be arrays of same length" + ) + if len(filter_) == 1: + filter_ = filter_[0] + if len(decode_parms) == 0: + decode_parms = None + elif len(decode_parms) == 1: + decode_parms = decode_parms[0] + return filter_, decode_parms + + def write( + self, + data: bytes, + *, + filter: Name | Array | None = None, + decode_parms: Dictionary | Array | None = None, + type_check: bool = True, + ): # pylint: disable=redefined-builtin + """ + Replace stream object's data with new (possibly compressed) `data`. + + `filter` and `decode_parms` describe any compression that is already + present on the input `data`. For example, if your data is already + compressed with the Deflate algorithm, you would set + ``filter=Name.FlateDecode``. + + When writing the PDF in :meth:`pikepdf.Pdf.save`, + pikepdf may change the compression or apply compression to data that was + not compressed, depending on the parameters given to that function. It + will never change lossless to lossy encoding. + + PNG and TIFF images, even if compressed, cannot be directly inserted + into a PDF and displayed as images. + + Args: + data: the new data to use for replacement + filter: The filter(s) with which the + data is (already) encoded + decode_parms: Parameters for the + filters with which the object is encode + type_check: Check arguments; use False only if you want to + intentionally create malformed PDFs. + + If only one `filter` is specified, it may be a name such as + `Name('/FlateDecode')`. If there are multiple filters, then array + of names should be given. + + If there is only one filter, `decode_parms` is a Dictionary of + parameters for that filter. If there are multiple filters, then + `decode_parms` is an Array of Dictionary, where each array index + is corresponds to the filter. 
+ """ + if type_check and filter is not None: + filter, decode_parms = self._type_check_write(filter, decode_parms) + + self._write(data, filter=filter, decode_parms=decode_parms) + + +@augments(Pdf) +class Extend_Pdf: + def _repr_mimebundle_( + self, include=None, exclude=None + ): # pylint: disable=unused-argument + """ + Present options to IPython or Jupyter for rich display of this object. + + See https://ipython.readthedocs.io/en/stable/config/integrating.html#rich-display + """ + bio = BytesIO() + self.save(bio) + bio.seek(0) + + data = {'application/pdf': bio.read()} + return data + + @property + def docinfo(self) -> Dictionary: + """ + Access the (deprecated) document information dictionary. + + The document information dictionary is a brief metadata record that can + store some information about the origin of a PDF. It is deprecated and + removed in the PDF 2.0 specification (not deprecated from the + perspective of pikepdf). Use the ``.open_metadata()`` API instead, which + will edit the modern (and unfortunately, more complicated) XMP metadata + object and synchronize changes to the document information dictionary. + + This property simplifies access to the actual document information + dictionary and ensures that it is created correctly if it needs to be + created. + + A new, empty dictionary will be created if this property is accessed + and dictionary does not exist. (This is to ensure that convenient code + like ``pdf.docinfo[Name.Title] = "Title"`` will work when the dictionary + does not exist at all.) + + You can delete the document information dictionary by deleting this property, + ``del pdf.docinfo``. Note that accessing the property after deleting it + will re-create with a new, empty dictionary. + + .. versionchanged: 2.4 + Added support for ``del pdf.docinfo``. 
+ """ + if Name.Info not in self.trailer: + self.trailer.Info = self.make_indirect(Dictionary()) + return self.trailer.Info + + @docinfo.setter + def docinfo(self, new_docinfo: Dictionary): + if not new_docinfo.is_indirect: + raise ValueError( + "docinfo must be an indirect object - use Pdf.make_indirect" + ) + self.trailer.Info = new_docinfo + + @docinfo.deleter + def docinfo(self): + if Name.Info in self.trailer: + del self.trailer.Info + + def open_metadata( + self, + set_pikepdf_as_editor: bool = True, + update_docinfo: bool = True, + strict: bool = False, + ) -> PdfMetadata: + """ + Open the PDF's XMP metadata for editing. + + There is no ``.close()`` function on the metadata object, since this is + intended to be used inside a ``with`` block only. + + For historical reasons, certain parts of PDF metadata are stored in + two different locations and formats. This feature coordinates edits so + that both types of metadata are updated consistently and "atomically" + (assuming single threaded access). It operates on the ``Pdf`` in memory, + not any file on disk. To persist metadata changes, you must still use + ``Pdf.save()``. + + Example: + >>> with pdf.open_metadata() as meta: + meta['dc:title'] = 'Set the Dublic Core Title' + meta['dc:description'] = 'Put the Abstract here' + + Args: + set_pikepdf_as_editor: Automatically update the metadata ``pdf:Producer`` + to show that this version of pikepdf is the most recent software to + modify the metadata, and ``xmp:MetadataDate`` to timestamp the update. + Recommended, except for testing. + + update_docinfo: Update the standard fields of DocumentInfo + (the old PDF metadata dictionary) to match the corresponding + XMP fields. The mapping is described in + :attr:`PdfMetadata.DOCINFO_MAPPING`. Nonstandard DocumentInfo + fields and XMP metadata fields with no DocumentInfo equivalent + are ignored. 
+ + strict: If ``False`` (the default), we aggressively attempt + to recover from any parse errors in XMP, and if that fails we + overwrite the XMP with an empty XMP record. If ``True``, raise + errors when either metadata bytes are not valid and well-formed + XMP (and thus, XML). Some trivial cases that are equivalent to + empty or incomplete "XMP skeletons" are never treated as errors, + and always replaced with a proper empty XMP block. Certain + errors may be logged. + """ + return PdfMetadata( + self, + pikepdf_mark=set_pikepdf_as_editor, + sync_docinfo=update_docinfo, + overwrite_invalid_xml=not strict, + ) + + def open_outline(self, max_depth: int = 15, strict: bool = False) -> Outline: + """ + Open the PDF outline ("bookmarks") for editing. + + Recommend for use in a ``with`` block. Changes are committed to the + PDF when the block exits. (The ``Pdf`` must still be opened.) + + Example: + >>> with pdf.open_outline() as outline: + outline.root.insert(0, OutlineItem('Intro', 0)) + + Args: + max_depth: Maximum recursion depth of the outline to be + imported and re-written to the document. ``0`` means only + considering the root level, ``1`` the first-level + sub-outline of each root element, and so on. Items beyond + this depth will be silently ignored. Default is ``15``. + strict: With the default behavior (set to ``False``), + structural errors (e.g. reference loops) in the PDF document + will only cancel processing further nodes on that particular + level, recovering the valid parts of the document outline + without raising an exception. When set to ``True``, any such + error will raise an ``OutlineStructureError``, leaving the + invalid parts in place. + Similarly, outline objects that have been accidentally + duplicated in the ``Outline`` container will be silently + fixed (i.e. reproduced as new objects) or raise an + ``OutlineStructureError``. 
+ """ + return Outline(self, max_depth=max_depth, strict=strict) + + def make_stream(self, data: bytes, d=None, **kwargs) -> Stream: + """ + Create a new pikepdf.Stream object that is attached to this PDF. + + See: + :meth:`pikepdf.Stream.__new__` + + """ + return Stream(self, data, d, **kwargs) + + def add_blank_page( + self, *, page_size: tuple[Numeric, Numeric] = (612.0, 792.0) + ) -> Page: + """ + Add a blank page to this PDF. + + If pages already exist, the page will be added to the end. Pages may be + reordered using ``Pdf.pages``. + + The caller may add content to the page by modifying its objects after creating + it. + + Args: + page_size (tuple): The size of the page in PDF units (1/72 inch or 0.35mm). + Default size is set to a US Letter 8.5" x 11" page. + """ + for dim in page_size: + if not (3 <= dim <= 14400): + raise ValueError('Page size must be between 3 and 14400 PDF units') + + page_dict = Dictionary( + Type=Name.Page, + MediaBox=Array([0, 0, page_size[0], page_size[1]]), + Contents=self.make_stream(b''), + Resources=Dictionary(), + ) + page_obj = self.make_indirect(page_dict) + self._add_page(page_obj, first=False) + return Page(page_obj) + + def close(self) -> None: + """ + Close a ``Pdf`` object and release resources acquired by pikepdf. + + If pikepdf opened the file handle it will close it (e.g. when opened with a file + path). If the caller opened the file for pikepdf, the caller must close the file. + ``with`` blocks will call close on exit. + + pikepdf lazily loads data from PDFs, so some :class:`pikepdf.Object` may + implicitly depend on the :class:`pikepdf.Pdf` being open. This is always the + case for :class:`pikepdf.Stream` but can be true for any object. Do not close + the `Pdf` object if you might still be accessing content from it. 
+ + When an ``Object`` is copied from one ``Pdf`` to another, the ``Object`` is copied into + the destination ``Pdf`` immediately, so after accessing all desired information + from the source ``Pdf`` it may be closed. + + .. versionchanged:: 3.0 + In pikepdf 2.x, this function actually worked by resetting to a very short + empty PDF. Code that relied on this quirk may not function correctly. + """ + self._close() + if getattr(self, '_tmp_stream', None): + self._tmp_stream.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + @property + def allow(self) -> Permissions: + """ + Report permissions associated with this PDF. + + By default these permissions will be replicated when the PDF is + saved. Permissions may also only be changed when a PDF is being saved, + and are only available for encrypted PDFs. If a PDF is not encrypted, + all operations are reported as allowed. + + pikepdf has no way of enforcing permissions. + """ + results = {} + for field in Permissions._fields: + results[field] = getattr(self, '_allow_' + field) + return Permissions(**results) + + @property + def encryption(self) -> EncryptionInfo: + """ + Report encryption information for this PDF. + + Encryption settings may only be changed when a PDF is saved. + """ + return EncryptionInfo(self._encryption_data) + + def check(self) -> list[str]: + """ + Check if PDF is well-formed. + + Similar to ``qpdf --check``. 
+ """ + + class DiscardingParser(StreamParser): + def __init__(self): # pylint: disable=useless-super-delegation + super().__init__() # required for C++ + + def handle_object(self, *_args): + pass + + def handle_eof(self): + pass + + problems: list[str] = [] + + self._decode_all_streams_and_discard() + + discarding_parser = DiscardingParser() + for page in self.pages: + page.parse_contents(discarding_parser) + + for warning in self.get_warnings(): + problems.append("WARNING: " + warning) + + return problems + + def save( + self, + filename_or_stream: Path | str | BinaryIO | None = None, + *, + static_id: bool = False, + preserve_pdfa: bool = True, + min_version: str | tuple[str, int] = "", + force_version: str | tuple[str, int] = "", + fix_metadata_version: bool = True, + compress_streams: bool = True, + stream_decode_level: StreamDecodeLevel | None = None, + object_stream_mode: ObjectStreamMode = ObjectStreamMode.preserve, + normalize_content: bool = False, + linearize: bool = False, + qdf: bool = False, + progress: Callable[[int], None] = None, + encryption: Encryption | bool | None = None, + recompress_flate: bool = False, + deterministic_id: bool = False, + ) -> None: + """ + Save all modifications to this :class:`pikepdf.Pdf`. + + Args: + filename_or_stream: Where to write the output. If a file + exists in this location it will be overwritten. + If the file was opened with ``allow_overwriting_input=True``, + then it is permitted to overwrite the original file, and + this parameter may be omitted to implicitly use the original + filename. Otherwise, the filename may not be the same as the + input file, as overwriting the input file would corrupt data + since pikepdf using lazy loading. + + static_id: Indicates that the ``/ID`` metadata, normally + calculated as a hash of certain PDF contents and metadata + including the current time, should instead be set to a static + value. Only use this for debugging and testing. 
Use + ``deterministic_id`` if you want to get the same ``/ID`` for + the same document contents. + preserve_pdfa: Ensures that the file is generated in a + manner compliant with PDF/A and other stricter variants. + This should be True, the default, in most cases. + + min_version: Sets the minimum version of PDF + specification that should be required. If left alone QPDF + will decide. If a tuple, the second element is an integer, the + extension level. If the version number is not a valid format, + QPDF will decide what to do. + force_version: Override the version recommended by QPDF, + potentially creating an invalid file that does not display + in old versions. See QPDF manual for details. If a tuple, the + second element is an integer, the extension level. + fix_metadata_version: If ``True`` (default) and the XMP metadata + contains the optional PDF version field, ensure the version in + metadata is correct. If the XMP metadata does not contain a PDF + version field, none will be added. To ensure that the field is + added, edit the metadata and insert a placeholder value in + ``pdf:PDFVersion``. If XMP metadata does not exist, it will + not be created regardless of the value of this argument. + + object_stream_mode: + ``disable`` prevents the use of object streams. + ``preserve`` keeps object streams from the input file. + ``generate`` uses object streams wherever possible, + creating the smallest files but requiring PDF 1.5+. + + compress_streams: Enables or disables the compression of + stream objects in the PDF that are created without specifying + any compression setting. Metadata is never compressed. + By default this is set to ``True``, and should be except + for debugging. Existing streams in the PDF or streams will not + be modified. To decompress existing streams, you must set + both ``compress_streams=False`` and ``stream_decode_level`` + to the desired decode level (e.g. ``.generalized`` will + decompress most non-image content). 
+ + stream_decode_level: Specifies how + to encode stream objects. See documentation for + :class:`pikepdf.StreamDecodeLevel`. + + recompress_flate: When disabled (the default), qpdf does not + uncompress and recompress streams compressed with the Flate + compression algorithm. If True, pikepdf will instruct qpdf to + do this, which may be useful if recompressing streams to a + higher compression level. + + normalize_content: Enables parsing and reformatting the + content stream within PDFs. This may make debugging PDFs easier. + + linearize: Enables creating linear or "fast web view", + where the file's contents are organized sequentially so that + a viewer can begin rendering before it has the whole file. + As a drawback, it tends to make files larger. + + qdf: Save output in QDF mode. QDF mode is a special output + mode in QPDF to allow editing of PDFs in a text editor. Use + the program ``fix-qdf`` to convert back to a standard + PDF. + + progress: Specify a callback function that is called + as the PDF is written. The function will be called with an + integer between 0-100 as the sole parameter, the progress + percentage. This function may not access or modify the PDF + while it is being written, or data corruption will almost + certainly occur. + + encryption: If ``False`` + or omitted, existing encryption will be removed. If ``True`` + encryption settings are copied from the originating PDF. + Alternately, an ``Encryption`` object may be provided that + sets the parameters for new encryption. + + deterministic_id: Indicates that the ``/ID`` metadata, normally + calculated as a hash of certain PDF contents and metadata + including the current time, should instead be computed using + only deterministic data like the file contents. At a small + runtime cost, this enables generation of the same ``/ID`` if + the same inputs are converted in the same way multiple times. + Does not work for encrypted files. 
+ + Raises: + PdfError + ForeignObjectError + ValueError + + You may call ``.save()`` multiple times with different parameters + to generate different versions of a file, and you *may* continue + to modify the file after saving it. ``.save()`` does not modify + the ``Pdf`` object in memory, except possibly by updating the XMP + metadata version with ``fix_metadata_version``. + + .. note:: + + :meth:`pikepdf.Pdf.remove_unreferenced_resources` before saving + may eliminate unnecessary resources from the output file if there + are any objects (such as images) that are referenced in a page's + Resources dictionary but never called in the page's content stream. + + .. note:: + + pikepdf can read PDFs with incremental updates, but always + coalesces any incremental updates into a single non-incremental + PDF file when saving. + + .. versionchanged:: 2.7 + Added *recompress_flate*. + + .. versionchanged:: 3.0 + Keyword arguments now mandatory for everything except the first + argument. + """ + if not filename_or_stream and getattr(self, '_original_filename', None): + filename_or_stream = self._original_filename + if not filename_or_stream: + raise ValueError( + "Cannot save to original filename because the original file was " + "not opening using Pdf.open(..., allow_overwriting_input=True). " + "Either specify a new destination filename/file stream or open " + "with allow_overwriting_input=True. If this Pdf was created using " + "Pdf.new(), you must specify a destination object since there is " + "no original filename to save to." 
+ ) + self._save( + filename_or_stream, + static_id=static_id, + preserve_pdfa=preserve_pdfa, + min_version=min_version, + force_version=force_version, + fix_metadata_version=fix_metadata_version, + compress_streams=compress_streams, + stream_decode_level=stream_decode_level, + object_stream_mode=object_stream_mode, + normalize_content=normalize_content, + linearize=linearize, + qdf=qdf, + progress=progress, + encryption=encryption, + samefile_check=getattr(self, '_tmp_stream', None) is None, + recompress_flate=recompress_flate, + deterministic_id=deterministic_id, + ) + + @staticmethod + def open( + filename_or_stream: Path | str | BinaryIO, + *, + password: str | bytes = "", + hex_password: bool = False, + ignore_xref_streams: bool = False, + suppress_warnings: bool = True, + attempt_recovery: bool = True, + inherit_page_attributes: bool = True, + access_mode: AccessMode = AccessMode.default, + allow_overwriting_input: bool = False, + ) -> Pdf: + """ + Open an existing file at *filename_or_stream*. + + If *filename_or_stream* is path-like, the file will be opened for reading. + The file should not be modified by another process while it is open in + pikepdf, or undefined behavior may occur. This is because the file may be + lazily loaded. Despite this restriction, pikepdf does not try to use any OS + services to obtain an exclusive lock on the file. Some applications may + want to attempt this or copy the file to a temporary location before + editing. This behaviour changes if *allow_overwriting_input* is set: the whole + file is then read and copied to memory, so that pikepdf can overwrite it + when calling ``.save()``. + + When this function is called with a stream-like object, you must ensure + that the data it returns cannot be modified, or undefined behavior will + occur. + + Any changes to the file must be persisted by using ``.save()``. 
+ + If *filename_or_stream* has ``.read()`` and ``.seek()`` methods, the file + will be accessed as a readable binary stream. pikepdf will read the + entire stream into a private buffer. + + ``.open()`` may be used in a ``with``-block; ``.close()`` will be called when + the block exits, if applicable. + + Whenever pikepdf opens a file, it will close it. If you open the file + for pikepdf or give it a stream-like object to read from, you must + release that object when appropriate. + + Examples: + >>> with Pdf.open("test.pdf") as pdf: + ... + + >>> pdf = Pdf.open("test.pdf", password="rosebud") + + Args: + filename_or_stream: Filename or Python readable and seekable file + stream of PDF to open. + password: User or owner password to open an + encrypted PDF. If the type of this parameter is ``str`` + it will be encoded as UTF-8. If the type is ``bytes`` it will + be saved verbatim. Passwords are always padded or + truncated to 32 bytes internally. Use ASCII passwords for + maximum compatibility. + hex_password: If True, interpret the password as a + hex-encoded version of the exact encryption key to use, without + performing the normal key computation. Useful in forensics. + ignore_xref_streams: If True, ignore cross-reference + streams. See qpdf documentation. + suppress_warnings: If True (default), warnings are not + printed to stderr. Use :meth:`pikepdf.Pdf.get_warnings()` to + retrieve warnings. + attempt_recovery: If True (default), attempt to recover + from PDF parsing errors. + inherit_page_attributes: If True (default), push attributes + set on a group of pages to individual pages + access_mode: If ``.default``, pikepdf will + decide how to access the file. Currently, it will always + selected stream access. To attempt memory mapping and fallback + to stream if memory mapping failed, use ``.mmap``. Use + ``.mmap_only`` to require memory mapping or fail + (this is expected to only be useful for testing). 
Applications + should be prepared to handle the SIGBUS signal on POSIX in + the event that the file is successfully mapped but later goes + away. + allow_overwriting_input: If True, allows calling ``.save()`` + to overwrite the input file. This is performed by loading the + entire input file into memory at open time; this will use more + memory and may reduce performance, especially when the opened + file will not be modified. + + Raises: + pikepdf.PasswordError: If the password failed to open the + file. + pikepdf.PdfError: If for other reasons we could not open + the file. + TypeError: If the type of ``filename_or_stream`` is not + usable. + FileNotFoundError: If the file was not found. + + Note: + When *filename_or_stream* is a stream and the stream is located on a + network, pikepdf assumes that the stream is using buffering and read caches + to achieve reasonable performance. Streams that fetch data over a network + in response to every read or seek request, no matter how small, will + perform poorly. It may be easier to download a PDF from the network to + temporary local storage (such as ``io.BytesIO``), manipulate it, and + then re-upload it. + + .. versionchanged:: 3.0 + Keyword arguments now mandatory for everything except the first + argument. + """ + if isinstance(filename_or_stream, bytes) and filename_or_stream.startswith( + b'%PDF-' + ): + warn( + "It looks like you called with Pdf.open(data) with a bytes-like object " + "containing a PDF. This will probably fail because this function " + "expects a filename or opened file-like object. Instead, please use " + "Pdf.open(BytesIO(data))." 
)

        tmp_stream, original_filename = None, False
        if allow_overwriting_input:
            try:
                Path(filename_or_stream)
            except TypeError as error:
                raise ValueError(
                    '"allow_overwriting_input=True" requires "open" first argument '
                    'to be a file path'
                ) from error
            original_filename = Path(filename_or_stream)
            with open(original_filename, 'rb') as pdf_file:
                tmp_stream = BytesIO()
                shutil.copyfileobj(pdf_file, tmp_stream)
        pdf = Pdf._open(
            tmp_stream or filename_or_stream,
            password=password,
            hex_password=hex_password,
            ignore_xref_streams=ignore_xref_streams,
            suppress_warnings=suppress_warnings,
            attempt_recovery=attempt_recovery,
            inherit_page_attributes=inherit_page_attributes,
            access_mode=access_mode,
        )
        pdf._tmp_stream = tmp_stream
        pdf._original_filename = original_filename
        return pdf


# Python-side additions for the _ObjectMapping binding.
@augments(_ObjectMapping)
class Extend_ObjectMapping:
    def get(self, key, default=None) -> Object:
        """Return ``self[key]``, or *default* if the lookup raises KeyError."""
        try:
            return self[key]
        except KeyError:
            return default


def check_is_box(obj) -> None:
    """Validate that *obj* can be used as a PDF rectangle (page box).

    The check is done in two steps: first *obj* itself is asked via its
    ``is_rectangle`` attribute; if that attribute is missing or falsy, *obj*
    is converted with ``Array(obj)`` and the resulting array is asked instead.

    Args:
        obj: The candidate box value.

    Raises:
        ValueError: If neither *obj* nor ``Array(obj)`` reports being a
            rectangle, or if converting *obj* to an ``Array`` fails for any
            reason (the original exception is chained as the cause).
    """
    try:
        if obj.is_rectangle:
            return
    except AttributeError:
        # Not a pikepdf object (e.g. a plain Python sequence); try converting.
        pass

    try:
        pdfobj = Array(obj)
        if pdfobj.is_rectangle:
            return
    except Exception as e:
        raise ValueError("object is not a rectangle") from e

    raise ValueError("object is not a rectangle")


# Python-side additions for the Page binding: box properties, resource and
# overlay helpers, and dictionary-style access that forwards to ``self.obj``.
@augments(Page)
class Extend_Page:
    @property
    def mediabox(self):
        """Return page's /MediaBox, in PDF units."""
        return self._get_mediabox(True)

    @mediabox.setter
    def mediabox(self, value):
        # Reject anything that is not a rectangle before touching the page.
        check_is_box(value)
        self.obj['/MediaBox'] = value

    @property
    def cropbox(self):
        """Return page's effective /CropBox, in PDF units.

        If the /CropBox is not defined, the /MediaBox is returned.
        """
        return self._get_cropbox(True, False)

    @cropbox.setter
    def cropbox(self, value):
        # Reject anything that is not a rectangle before touching the page.
        check_is_box(value)
        self.obj['/CropBox'] = value

    @property
    def trimbox(self):
        """Return page's effective /TrimBox, in PDF units.

        If the /TrimBox is not defined, the /CropBox is returned (and if
        /CropBox is not defined, /MediaBox is returned).
        """
        return self._get_trimbox(True, False)

    @trimbox.setter
    def trimbox(self, value):
        # Reject anything that is not a rectangle before touching the page.
        check_is_box(value)
        self.obj['/TrimBox'] = value

    @property
    def images(self) -> _ObjectMapping:
        """Return all regular images associated with this page.

        This method does not recurse into Form XObjects and does not
        attempt to find inline images.
        """
        return self._images

    @property
    def resources(self) -> Dictionary:
        """Return this page's resources dictionary."""
        return self.obj['/Resources']

    def add_resource(
        self,
        res: Object,
        res_type: Name,
        name: Name | None = None,
        *,
        prefix: str = '',
        replace_existing: bool = True,
    ) -> Name:
        """Add a new resource to the page's Resources dictionary.

        If the Resources dictionaries do not exist, they will be created.

        Args:
            res: The object to insert into the resources dictionary.
            res_type: Should be one of the following Resource dictionary types:
                ExtGState, ColorSpace, Pattern, Shading, XObject, Font, Properties.
            name: The name of the object. If omitted, a random name will be
                generated with enough randomness to be globally unique.
            prefix: A prefix for the name of the object. Allows conveniently
                namespacing when using random names, e.g. prefix="Im" for images.
                Mutually exclusive with name parameter.
            replace_existing: If the name already exists in one of the resource
                dictionaries, remove it.

        Returns:
            The name under which *res* was stored in the *res_type* dictionary.

        Raises:
            TypeError: If the page has a /Resources entry that is not a
                dictionary.
            ValueError: If both *name* and a non-empty *prefix* are given, or
                if *name* already exists in one of the resource dictionaries
                and *replace_existing* is false.

        Example:
            >>> resource_name = pdf.pages[0].add_resource(formxobj, Name.XObject)

        .. versionadded:: 2.3

        .. versionchanged:: 2.14
            If *res* does not belong to the same `Pdf` that owns this page,
            a copy of *res* is automatically created and added instead. In previous
            versions, it was necessary to change for this case manually.

        .. versionchanged:: 4.3.0
            Returns the name of the overlay in the resources dictionary instead
            of returning None.
        """
        if Name.Resources not in self.obj:
            self.obj.Resources = Dictionary()
        elif not isinstance(self.obj.Resources, Dictionary):
            raise TypeError("Page /Resources exists but is not a dictionary")
        resources = self.obj.Resources

        if res_type not in resources:
            resources[res_type] = Dictionary()

        if name is not None and prefix:
            raise ValueError("Must specify one of name= or prefix=")
        if name is None:
            name = Name.random(prefix=prefix)

        # The name is checked against every resource sub-dictionary, not only
        # the one for res_type.
        for res_dict in resources.as_dict().values():
            if not isinstance(res_dict, Dictionary):
                continue
            if name in res_dict:
                if replace_existing:
                    del res_dict[name]
                else:
                    raise ValueError(f"Name {name} already exists in page /Resources")

        resources[res_type][name] = res.with_same_owner_as(self.obj)
        return name

    def _over_underlay(
        self,
        other,
        rect: Rectangle | None,
        under: bool,
        push_stack: bool,
        shrink: bool,
        expand: bool,
    ) -> Name:
        """Shared implementation of :meth:`add_overlay` and :meth:`add_underlay`.

        Args:
            other: A ``Page``, a ``Dictionary`` whose /Type is /Page, or a
                ``Stream`` whose /Type is /XObject and /Subtype is /Form.
            rect: Target rectangle; if None, ``Rectangle(self.trimbox)`` is used.
            under: If true the placement content is prepended to the page's
                contents, otherwise it is appended.
            push_stack: If true, the existing contents are first wrapped in
                ``q`` ... ``Q`` operators.
            shrink: Passed to ``calc_form_xobject_placement`` as ``allow_shrink``.
            expand: Passed to ``calc_form_xobject_placement`` as ``allow_expand``.

        Returns:
            The resource name under which the Form XObject was registered.

        Raises:
            TypeError: If *other* is none of the accepted kinds of object.
        """
        formx = None
        if isinstance(other, Page):
            formx = other.as_form_xobject()
        elif isinstance(other, Dictionary) and other.get(Name.Type) == Name.Page:
            formx = Page(other).as_form_xobject()
        elif (
            isinstance(other, Stream)
            and other.get(Name.Type) == Name.XObject
            and other.get(Name.Subtype) == Name.Form
        ):
            formx = other

        if formx is None:
            raise TypeError(
                "other object is not something we can convert to Form XObject"
            )

        if rect is None:
            rect = Rectangle(self.trimbox)

        formx_placed_name = self.add_resource(formx, Name.XObject)
        cs = self.calc_form_xobject_placement(
            formx, formx_placed_name, rect, allow_shrink=shrink, allow_expand=expand
        )

        # The q/Q pair is added before the placement content, so the placement
        # content ends up outside the wrapped original contents.
        if push_stack:
            self.contents_add(b'q\n', prepend=True)  # prepend q
            self.contents_add(b'Q\n', prepend=False)  # i.e. append Q

        self.contents_add(cs, prepend=under)
        self.contents_coalesce()
        return formx_placed_name

    def add_overlay(
        self,
        other: Object | Page,
        rect: Rectangle | None = None,
        *,
        push_stack: bool = True,
        shrink: bool = True,
        expand: bool = True,
    ) -> Name:
        """Overlay another object on this page.

        Overlays will be drawn after all previous content, potentially drawing on top
        of existing content.

        Args:
            other: A Page or Form XObject to render as an overlay on top of this
                page.
            rect: The PDF rectangle (in PDF units) in which to draw the overlay.
                If omitted, this page's trimbox, cropbox or mediabox (in that order)
                will be used.
            push_stack: If True (default), push the graphics stack of the existing
                content stream to ensure that the overlay is rendered correctly.
                Officially PDF limits the graphics stack depth to 32. Most
                viewers will tolerate more, but excessive pushes may cause problems.
                Multiple content streams may also be coalesced into a single content
                stream where this parameter is True, since the PDF specification
                permits PDF writers to coalesce streams as they see fit.
            shrink: If True (default), allow the object to shrink to fit inside the
                rectangle. The aspect ratio will be preserved.
            expand: If True (default), allow the object to expand to fit inside the
                rectangle. The aspect ratio will be preserved.

        Returns:
            The name of the Form XObject that contains the overlay.

        .. versionadded:: 2.14

        .. versionchanged:: 4.0.0
            Added the *push_stack* parameter. Previously, this method behaved
            as if *push_stack* were False.

        .. versionchanged:: 4.2.0
            Added the *shrink* and *expand* parameters. Previously, this method
            behaved as if ``shrink=True, expand=False``.

        .. versionchanged:: 4.3.0
            Returns the name of the overlay in the resources dictionary instead
            of returning None.
        """
        return self._over_underlay(
            other,
            rect,
            under=False,
            push_stack=push_stack,
            expand=expand,
            shrink=shrink,
        )

    def add_underlay(
        self,
        other: Object | Page,
        rect: Rectangle | None = None,
        *,
        shrink: bool = True,
        expand: bool = True,
    ) -> Name:
        """Underlay another object beneath this page.

        Underlays will be drawn before all other content, so they may be overdrawn
        partially or completely.

        There is no *push_stack* parameter for this function, since adding an
        underlay can be done without manipulating the graphics stack.

        Args:
            other: A Page or Form XObject to render as an underlay underneath this
                page.
            rect: The PDF rectangle (in PDF units) in which to draw the underlay.
                If omitted, this page's trimbox, cropbox or mediabox (in that order)
                will be used.
            shrink: If True (default), allow the object to shrink to fit inside the
                rectangle. The aspect ratio will be preserved.
            expand: If True (default), allow the object to expand to fit inside the
                rectangle. The aspect ratio will be preserved.

        Returns:
            The name of the Form XObject that contains the underlay.

        .. versionadded:: 2.14

        .. versionchanged:: 4.2.0
            Added the *shrink* and *expand* parameters. Previously, this method
            behaved as if ``shrink=True, expand=False``. Fixed issue with wrong
            page rect being selected.
        """
        return self._over_underlay(
            other, rect, under=True, push_stack=False, expand=expand, shrink=shrink
        )

    def contents_add(self, contents: Stream | bytes, *, prepend: bool = False):
        """Append or prepend to an existing page's content stream.

        Args:
            contents: An existing content stream to append or prepend.
            prepend: Prepend if true, append if false (default).

        .. versionadded:: 2.14
        """
        return self._contents_add(contents, prepend=prepend)

    def __getattr__(self, name):
        """Forward attribute reads that the Page itself lacks to ``self.obj``."""
        return getattr(self.obj, name)

    # NOTE(review): augment_override_cpp presumably marks this as a deliberate
    # replacement of the method supplied by the C++ binding - confirm in
    # _augments.
    @augment_override_cpp
    def __setattr__(self, name, value):
        """Set *name* on the Page if its class defines it, else on ``self.obj``."""
        if hasattr(self.__class__, name):
            object.__setattr__(self, name, value)
        else:
            setattr(self.obj, name, value)

    @augment_override_cpp
    def __delattr__(self, name):
        """Delete *name* from the Page if its class defines it, else from ``self.obj``."""
        if hasattr(self.__class__, name):
            object.__delattr__(self, name)
        else:
            delattr(self.obj, name)

    def __getitem__(self, key):
        """Return ``self.obj[key]``."""
        return self.obj[key]

    def __setitem__(self, key, value):
        """Set ``self.obj[key]`` to *value*."""
        self.obj[key] = value

    def __delitem__(self, key):
        """Delete ``self.obj[key]``."""
        del self.obj[key]

    def __contains__(self, key):
        """Return whether *key* is in ``self.obj``."""
        return key in self.obj

    def get(self, key, default=None):
        """Return ``self[key]``, or *default* if the lookup raises KeyError."""
        try:
            return self[key]
        except KeyError:
            return default

    def emplace(self, other: Page, retain=(Name.Parent,)):
        """Call ``emplace`` on this page's object with *other*'s object.

        Args:
            other: The page whose underlying object is emplaced onto this one.
            retain: Passed through unchanged as the ``retain`` argument.
        """
        return self.obj.emplace(other.obj, retain=retain)

    def __repr__(self):
        """Return the repr of ``self.obj`` relabelled as a Page.

        The first occurrence of ``Dictionary`` is replaced by ``Page`` and the
        first occurrence of ``(Type="/Page")`` is removed.
        """
        return (
            repr(self.obj)
            .replace('Dictionary', 'Page', 1)
            .replace('(Type="/Page")', '', 1)
        )

    def _repr_mimebundle_(self, include=None, exclude=None):
        """Return a MIME bundle for rich display of this page.

        Args:
            include: If truthy, only MIME types contained in it are produced.
            exclude: If truthy, MIME types contained in it are not produced.

        Returns:
            A dict that may contain ``'application/pdf'`` (a single-page PDF
            of this page) and ``'image/png'`` (that PDF rendered by mudraw).
        """
        data = {}
        bundle = {'application/pdf', 'image/png'}
        if include:
            bundle = {k for k in bundle if k in include}
        if exclude:
            bundle = {k for k in bundle if k not in exclude}
        # The single-page PDF is built even if only the PNG was requested,
        # because the PNG is rendered from it.
        pagedata = _single_page_pdf(self.obj)
        if 'application/pdf' in bundle:
            data['application/pdf'] = pagedata
        if 'image/png' in bundle:
            # Best effort: the PNG is silently omitted if mudraw is missing.
            # NOTE(review): _mudraw runs the subprocess with check=True, so a
            # non-zero exit raises subprocess.CalledProcessError, which is not
            # caught here - confirm that propagating it is intended.
            try:
                data['image/png'] = _mudraw(pagedata, 'png')
            except (FileNotFoundError, RuntimeError):
                pass
        return data


# Python-side additions for the Token binding.
@augments(Token)
class Extend_Token:
    def __repr__(self):
        """Return a repr showing the token's ``type_`` and ``raw_value``."""
        return f'pikepdf.Token({self.type_}, {self.raw_value})'


# Python-side additions for the Rectangle binding.
@augments(Rectangle)
class Extend_Rectangle:
    def __repr__(self):
        """Return a repr showing the four corner coordinates."""
        return f'pikepdf.Rectangle({self.llx}, {self.lly}, {self.urx}, {self.ury})'

    def __hash__(self):
        """Hash the rectangle by its ``(llx, lly, urx, ury)`` tuple."""
        return hash((self.llx, self.lly, self.urx, self.ury))


# Python-side additions for the Attachments binding, giving it the
# MutableMapping interface on top of the binding's filespec methods.
@augments(Attachments)
class Extend_Attachments(MutableMapping):
    def __getitem__(self, k: str) -> AttachedFileSpec:
        """Return the file specification stored under *k*.

        Raises:
            KeyError: If ``_get_filespec(k)`` returns None.
        """
        filespec = self._get_filespec(k)
        if filespec is None:
            raise KeyError(k)
        return filespec

    def __setitem__(self, k: str, v: AttachedFileSpec) -> None:
        """Add or replace the file specification stored under *k*.

        If *v* has no filename, its filename is set to *k* first (this
        modifies *v*).
        """
        if not v.filename:
            v.filename = k
        return self._add_replace_filespec(k, v)

    def __delitem__(self, k: str) -> None:
        """Remove the file specification stored under *k*."""
        return self._remove_filespec(k)

    def __len__(self):
        """Return the number of file specifications."""
        return len(self._get_all_filespecs())

    def __iter__(self) -> Iterator[str]:
        """Iterate over the result of ``_get_all_filespecs()``.

        NOTE(review): presumably that is a mapping keyed by attachment name,
        so this yields the names - confirm against the binding.
        """
        yield from self._get_all_filespecs()

    def __repr__(self):
        """Return a repr stating how many files are attached."""
        return f"<pikepdf._qpdf.Attachments with {len(self)} attached files>"


# Python-side additions for the AttachedFileSpec binding.
@augments(AttachedFileSpec)
class Extend_AttachedFileSpec:
    @staticmethod
    def from_filepath(pdf: Pdf, path: Path | str, *, description: str = ''):
        """Construct a file specification from a file path.

        This function will automatically add a creation and modified date
        using the file system, and a MIME type inferred from the file's extension.

        If the data required for the attach is in memory, use
        :meth:`pikepdf.AttachedFileSpec` instead.

        Args:
            pdf: The Pdf to attach this file specification to.
            path: A file path for the file to attach to this Pdf.
            description: An optional description. May be shown to the user in
                PDF viewers.

        Returns:
            A new ``AttachedFileSpec`` holding the file's bytes, its base name
            as the filename, the guessed MIME type (empty string if none could
            be guessed) and the two dates.
        """
        mime, _ = mimetypes.guess_type(str(path))
        if mime is None:
            mime = ''
        if not isinstance(path, Path):
            path = Path(path)

        stat = path.stat()
        # NOTE(review): st_ctime is the creation time on Windows but the time
        # of the last metadata change on most Unix systems, so creation_date
        # may not be a true creation time there. fromtimestamp() without a tz
        # argument yields naive local-time datetimes.
        return AttachedFileSpec(
            pdf,
            path.read_bytes(),
            description=description,
            filename=str(path.name),
            mime_type=mime,
            creation_date=encode_pdf_date(
                datetime.datetime.fromtimestamp(stat.st_ctime)
            ),
            mod_date=encode_pdf_date(datetime.datetime.fromtimestamp(stat.st_mtime)),
        )

    def __repr__(self):
        """Return a repr with the description and, if set, the filename."""
        if self.filename:
            return (
                f"<pikepdf._qpdf.AttachedFileSpec for {self.filename!r}, "
                f"description {self.description!r}>"
            )
        return f"<pikepdf._qpdf.AttachedFileSpec description {self.description!r}>"


# Python-side additions for the AttachedFile binding: datetime-valued views of
# the binding's ``_creation_date`` / ``_mod_date`` values.
@augments(AttachedFile)
class Extend_AttachedFile:
    @property
    def creation_date(self) -> datetime.datetime | None:
        """Return the decoded ``_creation_date``, or None if it is empty."""
        if not self._creation_date:
            return None
        return decode_pdf_date(self._creation_date)

    @creation_date.setter
    def creation_date(self, value: datetime.datetime):
        self._creation_date = encode_pdf_date(value)

    @property
    def mod_date(self) -> datetime.datetime | None:
        """Return the decoded ``_mod_date``, or None if it is empty."""
        if not self._mod_date:
            return None
        return decode_pdf_date(self._mod_date)

    @mod_date.setter
    def mod_date(self, value: datetime.datetime):
        self._mod_date = encode_pdf_date(value)

    def read_bytes(self) -> bytes:
        """Return the result of ``read_bytes()`` on the underlying ``obj``."""
        return self.obj.read_bytes()

    def __repr__(self):
        """Return a repr with the object id, size, MIME type and both dates."""
        return (
            f'<pikepdf._qpdf.AttachedFile objid={self.obj.objgen} size={self.size} '
            f'mime_type={self.mime_type} creation_date={self.creation_date} '
            f'mod_date={self.mod_date}>'
        )


# Python-side additions for the NameTree binding: mapping views built from
# ``_as_map()`` plus the generic MutableMapping mixin methods.
@augments(NameTree)
class Extend_NameTree:
    def keys(self):
        """Return a ``KeysView`` over the mapping returned by ``_as_map()``."""
        return KeysView(self._as_map())

    def values(self):
        """Return a ``ValuesView`` over the mapping returned by ``_as_map()``."""
        return ValuesView(self._as_map())

    def items(self):
        """Return an ``ItemsView`` over the mapping returned by ``_as_map()``."""
        return ItemsView(self._as_map())

    # Reuse the mixin implementations from MutableMapping directly, since this
    # class does not inherit from it.
    get = MutableMapping.get
    pop = MutableMapping.pop
    popitem = MutableMapping.popitem
    clear = MutableMapping.clear
    update = MutableMapping.update
    setdefault = MutableMapping.setdefault


MutableMapping.register(NameTree)


# Python-side additions for the NumberTree binding: dict-style views built
# from ``_as_map()`` and the generic mixin methods taken from MutableMapping.
@augments(NumberTree)
class Extend_NumberTree:
    def keys(self):
        """Return a ``KeysView`` over the mapping returned by ``_as_map()``."""
        mapping = self._as_map()
        return KeysView(mapping)

    def values(self):
        """Return a ``ValuesView`` over the mapping returned by ``_as_map()``."""
        mapping = self._as_map()
        return ValuesView(mapping)

    def items(self):
        """Return an ``ItemsView`` over the mapping returned by ``_as_map()``."""
        mapping = self._as_map()
        return ItemsView(mapping)

    # Read-only lookup helper borrowed from the MutableMapping mixins.
    get = MutableMapping.get

    # Mutating helpers borrowed from the MutableMapping mixins.
    pop = MutableMapping.pop
    popitem = MutableMapping.popitem
    clear = MutableMapping.clear
    update = MutableMapping.update
    setdefault = MutableMapping.setdefault


# Declare NumberTree a virtual subclass so isinstance(x, MutableMapping) holds.
MutableMapping.register(NumberTree)