# Source code for serveliza.mixins.pdf_processors

# PDF processors libraries:
# --------------------------
import pdftotext
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator


class PdftotextMixin:
    '''Mixin exposing the processor hooks backed by ``pdftotext``.'''

    def processor_pdftotext(self, pathfile):
        '''
        Load a PDF with `pdftotext <https://github.com/jalan/pdftotext>`_.

        The file is opened in binary mode and the handle is stored on
        ``self._tmp_file``. It is *not* closed by this method.

        :param pathfile: location of the PDF file; it is passed through
            ``str()`` before being opened.
        :returns: the ``pdftotext.PDF`` object built from the open file,
            without any further processing.

        Example::

            obj.processor_pdftotext('/path/to/file.pdf')
        '''
        source_path = str(pathfile)
        # The handle is kept on the instance rather than closed here.
        self._tmp_file = open(source_path, 'rb')
        document = pdftotext.PDF(self._tmp_file)
        return document

    def processor_pdftotext_page(self, page):
        '''
        Return ``page`` unchanged.

        pdftotext pages need no per-page processing, so this hook is a
        pass-through.
        '''
        return page
class PdfminersixMixin:
    '''Mixin exposing the processor hooks backed by ``pdfminer``.'''

    def processor_pdfminersix(self, pathfile):
        '''
        Open a PDF and collect its pages with ``PDFPage.get_pages``.

        The file is opened in binary mode and the handle is stored on
        ``self._tmp_file``. It is *not* closed by this method.

        :param pathfile: location of the PDF file; it is passed through
            ``str()`` before being opened.
        :returns: a list holding every item yielded by
            ``PDFPage.get_pages`` for the opened file.
        '''
        source_path = str(pathfile)
        # NOTE(review): presumably the returned pages still read from this
        # handle when they are interpreted later -- confirm before closing
        # it any earlier.
        self._tmp_file = open(source_path, 'rb')
        return list(PDFPage.get_pages(self._tmp_file))

    def processor_pdfminersix_page(self, page):
        '''
        Run one page through a ``PDFPageInterpreter`` and return the result.

        A new resource manager, a ``PDFPageAggregator`` device configured
        with default ``LAParams()``, and an interpreter are created on
        every call.

        :param page: a page as produced by :meth:`processor_pdfminersix`.
        :returns: whatever the aggregator's ``get_result()`` yields after
            the page has been processed.
        '''
        manager = PDFResourceManager()
        layout_params = LAParams()
        aggregator = PDFPageAggregator(manager, laparams=layout_params)
        PDFPageInterpreter(manager, aggregator).process_page(page)
        return aggregator.get_result()
# Processor mixin classes defined in this module, in declaration order.
PROCESSORS = [
    PdftotextMixin,
    PdfminersixMixin,
]