# Source code for serveliza.roll.roll

from datetime import datetime as dt
from datetime import timedelta
from pathlib import Path
from pandas import pandas as pd
import numpy as np


from serveliza.mixins.pdf import PDFProcessorMixin
from serveliza.utils import pdf as pdf_utils
from .parsers import RollParser
from .adapters import RollAdapter
from .printer import RollPrinter
from .memorizer import RollMemorizer
from .exporter import RollExporter


# Zero-initialised duration for each analysis stage, in the order in which
# the stages are executed for every sheet.
DURATIONS_SCHEMA = dict.fromkeys(
    ('processing', 'adapting', 'parsing', 'memorizing', 'exporting'),
    timedelta())


[docs]class ElectoralRoll(PDFProcessorMixin):
    '''ElectoralRoll

    :class:`ElectoralRoll <.ElectoralRoll>` allows to \
    instantiate an electoral roll of the chilean Electoral Service \
    (*Servicio Electoral de Chile*, SERVEL) from PDF files.

    Different parameters are handled in the constructor of this class. In \
    itself it handles the *source* parameter that determines the path \
    where to recognize pdf files (it can be a directory path or a file), \
    the *recursive* parameter that determines whether to search in the \
    root of the defined path or in each of its subdirectories, and \
    the *auto* parameter that determines if the extraction is automatic. \
    It also inherits parameters from :class:`PDFProcessorMixin \
    <.PDFProcessorMixin>` (*processor*). Likewise, in the constructor \
    it instantiates other nested classes by routing their parameters.

    :param str source: The source path of pdf files.
    :param bool auto: Run the extraction at instantiation.
    :param bool recursive: Determines if the search for pdf files in the \
        delivered source is recursive or is only for the root of the \
        indicated directory.
    :param str processor: Processor to use (default='pdftotext', see \
        more in :class:`PDFProcessorMixin <.PDFProcessorMixin>`).
    :param bool memorize: Storage data in memory of instance (default=True, \
        see more in :class:`RollMemorizer <.RollMemorizer>`).
    :param bool export: If export data in csv file (default=False, \
        see more in :class:`RollExporter <.RollExporter>`).
    :param str output: Directory to store the data in csv file(s) (\
        see more in :class:`RollExporter <.RollExporter>`).
    :param str mode: Determines the data export mode in files. If it \
        is *unified* (default) it creates a single csv file with the data,\
         or if it is *separated* into several according to communal or \
        regional criteria (see more in :class:`RollExporter <.RollExporter>`).
    :param str mode_sep: Criteria for separating files in export in \
        separate mode (*commune* or *region*, default="commune", \
        see more in :class:`RollExporter <.RollExporter>`).
    :param bool random_suffix: Determines whether exported files have a \
        random text string appended to the end (see more in \
        :class:`RollExporter <.RollExporter>`).
    :param bool summary: Determines whether to generate a summary file of \
        the export and the extracted data (see more in :class:`RollExporter \
        <.RollExporter>`).

    Anyway, only the *source* parameter is required:

    >>> obj = ElectoralRoll(source='/path/to/the/pdf/file(s)')
    >>> obj.run()  # Start the analysis and extraction of data.

    Setting the parameter *auto* to true in the constructor will \
    automatically start the :meth:`run <.ElectoralRoll.run>` method:

    >>> obj = ElectoralRoll(source='/path/to/the/pdf/file(s)', auto=True)

    See the :meth:`run <.ElectoralRoll.run>` method for a better \
    understanding of this class.
    '''

    # Collaborator classes used by this class. ``inner_class_printer``,
    # ``inner_class_memorizer`` and ``inner_class_exporter`` are instantiated
    # once in the constructor; ``inner_class_parser`` and
    # ``inner_class_adapter`` are instantiated for every sheet handled by
    # ``run_file``.
    inner_class_parser = RollParser
    inner_class_adapter = RollAdapter
    inner_class_printer = RollPrinter
    inner_class_memorizer = RollMemorizer
    inner_class_exporter = RollExporter

    # Operational methods
    # --------------------
[docs]    def run(self):
        '''
        :meth:`ElectoralRoll.run <.ElectoralRoll.run>` is the main method \
        within the class logic that executes the complete flow of data \
        analysis and extraction:

        * Iterate over the found files, ordered by size from smallest \
        to largest, executing the :meth:`run_file <.ElectoralRoll.run_file>` \
        method with the file, its index and the total.
        * It iterates on each page of each file:
            * *Processing* it with the library determined in the processor \
            property and defined in the constructor (see more in \
            :class:`PDFProcessorMixin <.PDFProcessorMixin>`).
            * *Adapting* the rendered page if required by the application \
            and the processor used (see more in :class:`RollAdapter \
            <.RollAdapter>`).
            * *Analyzing* the content text to extract its data (see more \
            in :class:`RollParser <.RollParser>`).
            * *Memorizing* your data in a consolidated data stored in the \
            memorizer. Its execution can be skipped by setting the \
            *memorizer* parameter to false in the constructor (see more \
            in :class:`RollMemorizer <.RollMemorizer>`).
            * *Exporting* your data to one or more csv files depending on \
            how the exporter is configured. Its execution can be activated \
            by defining the *export* parameter as true in the constructor \
            (see more in :class:`RollExporter <RollExporter>`).
        * The printer (:class:`RollPrinter <.RollPrinter>` instance) is \
        executed in each part of the flow and it determines if and how it \
        prints on the screen (as declared in the constructor).

        >>> roll.run()
        '''
        started = dt.now()
        self._metadata['analysis']['started'] = started
        files = self._metadata['files']
        self.printer.run_started(started, files)
        files = [x[1] for x in sorted(
            files.items(), key=lambda x: x[1]['bytes'])]
        for idx, file in enumerate(files):
            self.run_file(file, idx, len(files))
        finalized = dt.now()
        self._metadata['analysis']['finalized'] = finalized
        summary = self.exporter.export_summary(self.rid, self.metadata)
        if summary:
            self._metadata['exported_to'].append(summary)
        self._is_runned = True
        self.printer.run_finalized(finalized, self.metadata)

[docs]    def run_file(self, file, file_num, file_total):
        '''
        :param dict file: data of file
        :param int file_num: the number of file to analize.
        :param int file_total: the total of files to analize.
            this param and before is needed for printer.

        The :meth:`run_file <.ElectoralRoll.run_file>` method is called by \
        the :meth:`run <.ElectoralRoll.run>` method and iterates on each page \
        of each file:

            * *Processing* it with the library determined in the processor \
            property and defined in the constructor (see more in \
            :class:`PDFProcessorMixin <.PDFProcessorMixin>`).
            * *Adapting* the rendered page if required by the application \
            and the processor used (see more in :class:`RollAdapter \
            <.RollAdapter>`).
            * *Analyzing* the content text to extract its data (see more \
            in :class:`RollParser <.RollParser>`).
            * *Memorizing* your data in a consolidated data stored in the \
            memorizer. Its execution can be skipped by setting the \
            *memorizer* parameter to false in the constructor (see more \
            in :class:`RollMemorizer <.RollMemorizer>`).
            * *Exporting* your data to one or more csv files depending on \
            how the exporter is configured. Its execution can be activated \
            by defining the *export* parameter as true in the constructor \
            (see more in :class:`RollExporter <RollExporter>`).

        Stores metadatas of the extraction of each file.
        '''

        def get_progress(self, rid, files, sheets):
            metadata = self.metadata['rolls'].get(rid, {})
            if not metadata:
                metadata['entries'] = {'total': 0, 'errors': 0}
            return {
                'entries': metadata['entries']['total'],
                'errors': metadata['entries']['errors'],
                'files': files, 'sheets': sheets,
                'duration': dt.now() - self.metadata['analysis']['started'],
                }

        def duration_wrapper(self, file, stage, method, args):
            init = dt.now()
            method = getattr(self, method)
            result = method(*args)
            duration = dt.now() - init
            self._metadata['files'][file]['durations'][stage] += duration
            self._metadata['analysis']['durations'][stage] += duration
            return result

        def update_file_metadata(parsed, metadata):
            if not metadata:
                metadata['rid'] = parsed.metadata['rid']
                attributes = ['roll', 'year', 'region',
                              'province', 'commune']
                for attr in attributes:
                    metadata[attr] = parsed.header[attr]
                metadata['entries'] = {'total': 0, 'rescue': 0, 'errors': 0}
                declared = parsed.header.get('total_sheets', False)
                if declared:
                    metadata['entries']['declared'] = declared
            entries = parsed.metadata['entries']
            for meta in entries:
                metadata['entries'][meta] += entries[meta]
            return metadata

        # pre-processing
        init = dt.now()
        pdf = self.process_pdf(file['absolute'])
        total_sheets = len(pdf)  # number of pages.
        file_metadata = {}
        rid = None
        self.printer.run_file_start(file, file_num)
        for idx, sheet in enumerate(pdf):
            # printing
            progress = get_progress(
                self, rid, (file_num, file_total), (idx+1, total_sheets))
            self.printer.run_file_progress(progress)
            # processing
            processed = duration_wrapper(
                self, file['name'], 'processing', 'process_pdf_page', [sheet])
            # adapting
            adapted = duration_wrapper(
                self, file['name'], 'adapting', 'inner_class_adapter',
                [processed, self.processor]).sheet
            # parsingd
            parsed = duration_wrapper(
                self, file['name'], 'parsing', 'sheet_parse', [adapted])
            # memorizing
            duration_wrapper(self, file['name'], 'memorizing',
                             'sheet_memorize', [parsed])
            # exporting
            exported = duration_wrapper(
                self, file['name'], 'exporting', 'sheet_export', [parsed])
            if exported:
                if 'exported_to' not in self.metadata:
                    self._metadata['exported_to'] = []
                if exported not in self._metadata['exported_to']:
                    self._metadata['exported_to'].append(exported)
            # update file metadata
            file_metadata = update_file_metadata(parsed, file_metadata)
            rid = file_metadata['rid']
        file_metadata['duration'] = dt.now() - init
        self._metadata['files'][file['name']].update(file_metadata)
        self.printer.run_file_end(file_metadata)

[docs]    def sheet_parse(self, sheet, *args, **kwargs):
        '''
        :param str sheet: sheet in string.
        :return: instance of :class:`RollParser <.RollParser>`.

        Method that calls the class defined in the :attr:`inner_class_parser \
        <.ElectoralRoll.inner_class_parser>` class attribute, initializing it \
        with the sheet argument.
        '''
        return self.inner_class_parser(sheet, *args, **kwargs)

[docs]    def sheet_memorize(self, parsed, *args, **kwargs):
        '''
        :param str parsed: instance of :class:`RollParser <.RollParser>`.

        Method that routes a parsed page to the :meth:`memorize \
        <.RollMemorizer.memorize>` method of the memorizer.
        '''
        return self.memorizer.memorize(parsed, *args, **kwargs)

[docs]    def sheet_export(self, parsed, *args, **kwargs):
        '''
        :param str parsed: instance of :class:`RollParser <.RollParser>`.
        :return: Absolute path of the csv file where the data was exported.

        Method that routes a parsed page to the :meth:`export_sheet \
        <.RollExporter.export_sheet>` method of the exporter.
        '''
        return self.exporter.export_sheet(parsed, *args, **kwargs)

    @property
    def printer(self):
        '''
        :return: inner instance of :class:`RollPrinter <.RollPrinter>`.

        Read-only access to the :class:`RollPrinter <.RollPrinter>` object \
        instantiated in the constructor.

        >>> roll.printer.__class__
        serveliza.roll.printer.RollPrinter
        '''
        return self._printer

    @property
    def memorizer(self):
        '''
        :return: inner instance of :class:`RollMemorizer <.RollMemorizer>`.

        Read-only access to the :class:`RollMemorizer <.RollMemorizer>` \
        object instantiated in the constructor.

        >>> roll.memorizer.__class__
        serveliza.roll.memorizer.RollMemorizer
        '''
        return self._memorizer

    @property
    def exporter(self):
        '''
        :return: inner instance of :class:`RollExporter <.RollExporter>`.

        Read-only access to the :class:`RollExporter <.RollExporter>` \
        object instantiated in the constructor.

        >>> roll.exporter.__class__
        serveliza.roll.exporter.RollExporter
        '''
        return self._exporter

    # Operational properties
    # -----------------------
    @property
    def is_runned(self):
        '''
        :return: boolean.

        Boolean property that indicates whether the instance has completed \
        the :meth:`run <.ElectoralRoll.run>` method or not.

        >>> roll.is_runned
        True  # or False
        '''
        return self._is_runned

    @property
    def metadata(self):
        '''
        :return: dictionary with all metadata.

        Property that stores the analysis metadata.
        It integrates the metadata of each electoral register detected \
        in the analysis.

        >>> roll.is_runned
        False
        >>> roll.metadata
        {'files': {'filename.pdf': {'name': 'filename.pdf',
           'bytes': 10000,
           'relative': 'relative/path/filename.pdf',
           'absolute': '/absolute/path/filename.pdf',
           'mtime': datetime.datetime(...),
           'atime': datetime.datetime(...),
           'durations': {'processing': datetime.timedelta(0),
            'adapting': datetime.timedelta(0),
            'parsing': datetime.timedelta(0),
            'memorizing': datetime.timedelta(0),
            'exporting': datetime.timedelta(0)}}},
         'analysis': {'started': None,
          'finalized': None,
          'durations': {'processing': datetime.timedelta(0),
           'adapting': datetime.timedelta(0),
           'parsing': datetime.timedelta(0),
           'memorizing': datetime.timedelta(0),
           'exporting': datetime.timedelta(0)}},
         'rolls': {}}
        >>> roll.run()
        >>> roll.metadata
        {'files': {'filename.pdf': {'name': 'filename.pdf',
           ...
           'rid': 'RID-XXXX',
           'roll': 'PADRON ELECTORAL X - ELECCIONES X XXXX',
           'year': XXXX,
           'region': 'REGION',
           'province': 'PROVINCE',
           'commune': 'COMMUNE',
           'entries': {'total': 999, 'rescue': 0, 'errors': 0},
           'duration': datetime.timedelta(...)}},
         'analysis': {'started': datetime.datetime(...) ...},
         'rolls': {'RID-XXXX: {'roll': 'PADRON ELECTORAL X - ELE...',
           'year': XXXX,
           'regions': ['REGION', ...],
           'communes': ['COMMUNE', ...],
           'provinces': ['PROVINCE', ...],
           'nulls': {'total': 0},
           'entries': {'total': 999, 'rescue': 0, 'errors': 0}}}}
        '''
        stored = {'rolls': {}}
        for k, v in self.memorizer.storage.items():
            if 'metadata' in v:
                stored['rolls'][k] = v['metadata']
        return {**self._metadata, **stored}

    @property
    def rid(self):
        '''
        :return: string with first roll identifier.

        Property that returns the identifier of the electoral roll analyzed.
        If it will return only the first identifier detected, this should \
        not cause inconvenience unless pdf files from different electoral \
        rolls are loaded.

        >>> roll.rid
        'RID-XXXX'

        If the instance did not run, it returns None.
        '''
        if self.memorizer.storage:
            rids = [x for x in self.memorizer.storage]
            return rids[0]

    @property
    def roll(self):
        '''
        :return: name of electoral roll.

        Property that returns the full name of electoral roll analyzed.

        >>> roll.roll
        'PADRON ELECTORAL X - ELECCIONES X XXXX'

        Internaly use the :attr:`rid <.ElectoralRoll.rid>` property. If the \
        instance did not run, it returns None.
        '''
        if self.memorizer.storage:
            return self.memorizer.storage[self.rid]['metadata']['roll']

    @property
    def entries(self):
        '''
        :return: list of data entries of memorizer.

        Property that accesses the data entries of the electoral roll \
        analyzed. The data is stored in the :class:`RollMemorizer \
        <.RollMemorizer>` instance.

        >>> roll.entries
        [[...]...]

        Internaly use the :attr:`rid <.ElectoralRoll.rid>` property. If the \
        instance did not run, it returns None.
        '''
        storage = self.memorizer.storage
        if storage and 'entries' in storage[self.rid]:
            return self.memorizer.storage[self.rid]['entries']

    @property
    def fields(self):
        '''
        :return: list of fields of electoral roll.

        Property that returns the fields of the electoral roll analyzed.

        >>> roll.fields
        ['nombre',
         'c-identidad',
         'sex',
         'region',
         'provincia',
         'comuna',
         'domicilio-electoral',
         'circunscripcion',
         'mesa',
         'reference']

        Internaly use the :attr:`rid <.ElectoralRoll.rid>` property. If the \
        instance did not run, it returns None.
        '''
        storage = self.memorizer.storage
        if storage and 'fields' in storage[self.rid]:
            return self.memorizer.storage[self.rid]['fields']

    @property
    def errors(self):
        '''
        :return: list of errors found.

        Property that exposes the errors kept by the memorizer. Errors are \
        dictionaries with data to keep track of; the purpose of registering \
        them is to improve the development of serveliza.

        >>> roll.errors
        [...]
        '''
        return self.memorizer.errors

    @property
    def to_dataframe(self):
        '''
        :return: Pandas DataFrame instance.
        :raises UserWarning: You need to run the application before \
            converting the result to Pandas DataFrame.

        Property that returns the electoral roll data in a new Pandas \
        `DataFrame`_ instance.

        .. _DataFrame: https://pandas.pydata.org/pandas-docs/stable/\
            reference/api/pandas.DataFrame.html
        '''
        if not self.is_runned:
            raise UserWarning('You need to run the application before '
                              'converting the result to Pandas DataFrame.')
        return pd.DataFrame(np.array(self.entries), columns=self.fields)

    @property
    def source(self):
        '''
        :return: list of paths to valid pdf files.
        :raises TypeError: (on assignment) source param must be string or \
            list.
        :raises TypeError: (on assignment) source doesnt have valid PDF \
            files.

        Property that stores paths of pdf files obtained from a list or \
        string with file paths or directories.

        >>> roll.source
        ['relative/path/to/file.pdf']

        The source is loaded in the constructor through the parameter \
        of the same name. More files can be added afterwards through the \
        property setter, which appends the files it finds to the ones \
        already stored:

        >>> roll.source = ['path/to/file.pdf', '/path/to/dir']
        >>> roll.source = '/path/to/dir/or/file.pdf'
        '''
        return self._source

    @source.setter
    def source(self, source):
        paths = []
        if isinstance(source, str):
            paths.append(source)
        elif isinstance(source, list):
            paths += source
        else:
            raise TypeError('source param must be string or list.')
        files = []
        for path in paths:
            if isinstance(path, str) and pdf_utils.is_valid_pdf(path):
                files += [path]
            elif isinstance(path, str) and Path(path).is_dir():
                files += self.printer.init_search(
                    pdf_utils.get_all_pdf_in_path, [path, self.recursive])
        if not files:
            raise TypeError('Source doesnt have valid PDF files.')
        self._source += files
        meta_files = pdf_utils.get_metadata_from_pdfs(files)
        self.printer.init_founded(meta_files)
        for file in meta_files:
            meta_files[file]['durations'] = DURATIONS_SCHEMA
        self._metadata['files'].update(meta_files)

    @property
    def recursive(self):
        '''
        :return: boolean.

        Property that determines if the search for pdf files in the \
        delivered source is recursive or is only for the root of the \
        indicated directory.
        '''
        return self._recursive

    def __init__(self, source, auto=False, *args, **kwargs):
        '''
        :param source: path, or list of paths, to pdf files and/or \
            directories.
        :param bool auto: when true, :meth:`run <.ElectoralRoll.run>` is \
            executed at the end of the construction.
        :param kwargs: *processor* and *recursive* are read here; the whole \
            set of keyword arguments is also handed to the printer, the \
            memorizer and the exporter.
        '''
        processor = kwargs.get('processor', self.processor)
        self.processor = processor
        self._printer = self.inner_class_printer(**kwargs)
        self._memorizer = self.inner_class_memorizer(**kwargs)
        self._exporter = self.inner_class_exporter(**kwargs)
        self._metadata = {'files': {}}
        self._is_runned = False
        self._recursive = bool(kwargs.get('recursive', False))
        self._source = []
        # The source setter needs the printer, the recursive flag and the
        # metadata container, so it runs only after they exist.
        self.source = source
        self._metadata['analysis'] = {
            'started': None,
            'finalized': None,
            # Own copy of the schema: the counters are updated in place while
            # running, and the module-level dictionary must stay at zero.
            'durations': dict(DURATIONS_SCHEMA)}
        if auto:
            self.printer.init_auto()
            self.run()

    def __repr__(self):
        # The textual representation is delegated to the printer.
        return self.printer.repr(self)
# Source code for serveliza.roll.roll
#
# serveliza
#
# Navigation
#
# Related Topics