Source code for niklib.data.pdf

# Public API of this module: the names exported by ``from ... import *``.
__all__ = [
    'PDFIO', 'XFAPDF', 'ExampleXFA'
]

# core
import xml.etree.ElementTree as et
import PyPDF2 as pypdf
import re
# ours: data
from niklib.data import functional
from niklib.data.constant import ExampleDocTypes
# helpers
from enum import Enum
from typing import Any
import logging


# module-level logger, named after this module via ``__name__``
logger = logging.getLogger(__name__)


class PDFIO:
    """Base class for dealing with PDF files.

    For each mode of PDF, let's say XFA files, one needs to extend this class
    and implement methods like :func:`extract_raw_content` to generate a
    string of the content of the PDF in a format that can be used by the
    other classes (e.g. ``XML``).

    For instance, see :class:`XFAPDF` for the extension of this class.
    """

    def __init__(self) -> None:
        pass

    def extract_raw_content(self, pdf_path: str) -> str:
        """Extracts unprocessed data from a PDF file.

        Subclasses are expected to override this method.

        Args:
            pdf_path (str): Path to the pdf file

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def find_in_dict(self, needle: Any, haystack: Any) -> Any:
        """Looks for the value of a key inside a nested dictionary.

        The search is depth-first and follows the iteration order of each
        dictionary; the first match wins. Keys whose lookup raises an
        ordinary exception are skipped.

        Args:
            needle (Any): Key to look for
            haystack (Any): Dictionary to look in. Can be a dict inside
                another dict

        Returns:
            Any: The value of key ``needle``, or ``None`` when the key is not
            found. A nested match whose value is ``None`` is treated as
            "not found" and the search continues.
        """
        # Iterate over a snapshot of the keys: item access on a lazily
        # resolving mapping may insert entries, and mutating a dict while
        # iterating its live key view raises ``RuntimeError``.
        for key in list(haystack.keys()):
            try:
                value = haystack[key]
            except Exception:
                # Best effort: an unreadable entry must not abort the search.
                # ``Exception`` (not a bare ``except``) so that
                # KeyboardInterrupt / SystemExit still propagate.
                continue
            if key == needle:
                return value
            if isinstance(value, dict):
                found = self.find_in_dict(needle, value)
                if found is not None:
                    return found
        return None
class XFAPDF(PDFIO):
    """Contains functions and utility tools for dealing with XFA PDF documents.

    Note:
        Developers should subclass this class and override
        :meth:`clean_xml_for_csv` for their own specific XFA data.
    """

    def __init__(self) -> None:
        super().__init__()

    def extract_raw_content(self, pdf_path: str) -> str:
        """Extracts RAW content of XFA PDF files which are in XML format.

        Args:
            pdf_path (str): path to the pdf file

        Reference:
            - https://towardsdatascience.com/how-to-extract-data-from-pdf-forms-using-python-10b5e5f26f70

        Returns:
            str: XFA object of the pdf file in XML format. Note that this is
            the result of ``str()`` applied to the raw payload, so (assuming
            the payload is ``bytes``) it carries the ``b'...'`` wrapper and
            escaped newlines that :meth:`clean_xml_for_csv` later removes.

        Raises:
            AttributeError: If no ``/XFA`` entry is found, since
                :meth:`find_in_dict` then returns ``None``.
        """
        # Keep every access to the reader inside the ``with`` block so the
        # file is still open while objects are read, and is always closed
        # afterwards (the handle used to be leaked).
        with open(pdf_path, 'rb') as pdf_object:
            pdf = pypdf.PdfFileReader(pdf_object)
            xfa = self.find_in_dict('/XFA', pdf.resolved_objects)
            # `datasets` keyword contains filled forms in XFA array; the
            # entry right after it is the object holding the data
            datasets_position = xfa.index('datasets') + 1
            xml = xfa[datasets_position].get_object().get_data()

        # Deliberately ``str()`` and not ``.decode()``: downstream cleaning
        # expects the textual form produced here.
        xml = str(xml)
        logger.info(f'XFA content of {pdf_path} extracted as XML.')
        return xml

    def clean_xml_for_csv(self, xml: str, mode: Enum) -> str:
        """Cleans the XML file extracted from XFA forms.

        Since each form has its own format and issues, this method needs to
        be implemented uniquely for each unique file/form which needs to be
        specified using argument ``mode`` that can be populated from
        :class:`niklib.data.constant.ExampleDocTypes`.

        Args:
            xml (str): XML content
            mode (Enum): mode of the document defined in
                :class:`niklib.data.constant.ExampleDocTypes`

        Returns:
            str: cleaned XML content to be used in CSV file

        Raises:
            NotImplementedError: Always, until overridden by a subclass.
        """
        raise NotImplementedError

    def flatten_dict(self, d: dict) -> dict:
        """Takes a (nested) multilevel dictionary and flattens it.

        The final keys are ``key.key...`` and values are the leaf values of
        dictionary. Delegates to ``functional.flatten_dict``.

        Args:
            d (dict): A dictionary

        References:
            * https://stackoverflow.com/a/67744709/18971263

        Returns:
            dict: A flattened dictionary
        """
        return functional.flatten_dict(d=d)

    def xml_to_flattened_dict(self, xml: str) -> dict:
        """Takes a (nested) XML and converts it to a flattened dictionary.

        The final keys are ``key.key...`` and values are the leaf values of
        XML tree. Delegates to ``functional.xml_to_flattened_dict``.

        Args:
            xml (str): A XML string

        Returns:
            dict: A flattened dictionary
        """
        return functional.xml_to_flattened_dict(xml=xml)
class ExampleXFA(XFAPDF):
    """Handles Canada XFA PDF files."""

    def __init__(self) -> None:
        super().__init__()

    @staticmethod
    def _strip_bytes_repr(xml: str) -> str:
        """Removes the residue of a ``str(bytes)`` conversion from ``xml``.

        Drops every ``b'\\n`` marker, every single quote and every literal
        backslash-``n`` sequence (two characters, not a real newline).
        """
        xml = re.sub(r"b'\\n", '', xml)
        xml = re.sub(r"'", '', xml)
        xml = re.sub(r"\\n", '', xml)
        return xml

    def clean_xml_for_csv(self, xml: str, mode: Enum) -> str:
        """Hardcoded cleaning of Example XFA XML files to be XML compatible with CSV.

        Args:
            xml (str): XML content
            mode (Enum): mode of the document defined in
                :class:`niklib.data.constant.ExampleDocTypes`

        Returns:
            str: cleaned XML content to be used in CSV file. For a ``mode``
            that is not handled below, ``xml`` is returned unchanged.
        """
        if mode == ExampleDocTypes.CANADA_5257E:
            # remove bad characters
            xml = self._strip_bytes_repr(xml)

            # remove 9000 lines of redundant info for '5257e' doc; tolerate
            # documents that do not carry a `LOVFile` child at all
            root = et.fromstring(xml)
            for junk in root.findall('LOVFile'):
                root.remove(junk)

            # Serialize straight to `str`. Going through
            # `str(et.tostring(..., encoding='utf8'))` yields the *repr* of a
            # bytes object, whose quoting (b'...' vs b"...") and backslash
            # escaping depend on the payload, so stripping it back out with
            # regexes is unreliable. `encoding='unicode'` also emits no XML
            # declaration, so there is nothing to strip afterwards.
            xml = et.tostring(root, encoding='unicode', method='xml')
            # same post-processing as before: no single quotes, no line
            # breaks (nor the indentation that follows them)
            xml = re.sub(r"'", '', xml)
            xml = re.sub(r"\n[ ]*", '', xml)
        elif mode == ExampleDocTypes.CANADA_5645E:
            # remove bad characters
            xml = self._strip_bytes_repr(xml)

        logger.info('Finished cleaning XML content.')
        return xml