# Source code for serveliza.mixins.pdf

# builtin libraries
import sys

# internal modules
from .pdf_processors import PROCESSORS

# from pdfminer.high_level import extract_pages

# Disable warnings emitted by the processor libraries.
# The filter is installed only when ``sys.warnoptions`` is empty, i.e. when
# no warning options were passed to the interpreter, so an explicit user
# configuration is left untouched.
# NOTE(review): ``simplefilter("ignore")`` is not scoped to the processor
# libraries; it silences warnings for the whole process -- confirm intended.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")


class PDFProcessorMixin(*PROCESSORS):
    '''
    Mixin that gives an instance the ability to process PDF files with
    one of several libraries.

    The library is selected by assigning its name to :attr:`processor`
    (presumably from the constructor of the class using this mixin, through
    an argument of the same name). Doing so binds :attr:`process_pdf
    <.PDFProcessorMixin.process_pdf>` and :attr:`process_pdf_page
    <.PDFProcessorMixin.process_pdf_page>` to the ``processor_<name>`` and
    ``processor_<name>_page`` methods looked up on the instance; those
    methods are expected to be supplied by the ``PROCESSORS`` base classes.

    PDF processors available:

    - `pdftotext <https://github.com/jalan/pdftotext>`_
      (0.1.0 release) with :meth:`processor_pdftotext
      <.PDFProcessorMixin.processor_pdftotext>`
    - `pdfminersix <https://pdfminersix.readthedocs.io/>`_
      (0.1.0 release) with :meth:`processor_pdfminersix
      <.PDFProcessorMixin.processor_pdfminersix>`
    '''

    # Name of the selected processor; 'pdftotext' is the class-level default.
    _processor = 'pdftotext'
    # Open temporary file, if any; closed and reset by ``process_pdf``.
    _tmp_file = None

    # Supported processor names mapped to their project URL.
    processor_ref = {
        'pdftotext': 'https://github.com/jalan/pdftotext',  # 0.1.0
        'pdfminersix': 'https://pdfminersix.readthedocs.io/',  # 0.1.0
        # dev note: add processors ref here {name:url}.
    }

    @property
    def processor(self):
        '''
        Processor (library) to extract text from pdf file.

        Assigning a name listed in :attr:`processor_ref` rebinds the
        document-level and page-level processing methods to the ones of
        that library.

        :raises TypeError: if the assigned value is not a key of
            :attr:`processor_ref`.
        :raises AttributeError: if the instance has no
            ``processor_<name>`` or ``processor_<name>_page`` method.
        '''
        return self._processor

    @processor.setter
    def processor(self, value):
        if value not in self.processor_ref:
            # The original read ``self.processors_ref`` here, which does not
            # exist, so callers got AttributeError instead of TypeError.
            raise TypeError(
                f'{str(value)} must be a available '
                'processor: ' + ','.join(self.processor_ref))

        # Resolve both methods before touching any state, so a missing
        # method does not leave the instance half-configured.
        process_pdf = getattr(self, 'processor_' + value)
        process_pdf_page = getattr(self, 'processor_' + value + '_page')

        self._processor = value
        self._process_pdf = process_pdf
        self._process_pdf_page = process_pdf_page
        self._processor_name = value

    @property
    def process_pdf(self):
        '''
        Property that returns the method corresponding to the PDF file
        processor configured through :attr:`processor`.

        Accessing it first closes and clears any temporary file still
        held in ``_tmp_file``. In this class the underlying method is
        bound only by the :attr:`processor` setter.

        >>> obj.process_pdf(*args)
        '''
        if self._tmp_file:
            self._tmp_file.close()  # ensure closing file.
            self._tmp_file = None
        return self._process_pdf

    @property
    def process_pdf_page(self):
        '''
        Property that returns the page-level method
        (``processor_<name>_page``) of the configured PDF processor.

        In this class the underlying method is bound only by the
        :attr:`processor` setter.

        >>> obj.process_pdf_page(*args)
        '''
        return self._process_pdf_page