Source code for niklib.data.logic

# Public API of this module: the two classes defined below.
__all__ = ['Logics', 'ExampleLogics']

# core
from functools import reduce
import pandas as pd
import numpy as np
# helpers
from typing import Callable, cast


class Logics:
    """Applies logics on different type of data resulting in summarized, expanded, or transformed data

    Methods here are implemented in the way that can be used as ``Pandas.agg_``
    function over :class:`pandas.Series` using ``functools.reduce_``.

    Note:
        This is constructed based on domain knowledge hence is designed
        for a specific purpose based on application.

        For demonstration purposes, see following methods of this class:

            - :meth:`count_previous_residency_country`
            - :meth:`count_rel`
            - :meth:`count_foreign_family_resident`

        In this class these methods only raise :class:`NotImplementedError`;
        they have to be implemented by subclasses. See:

            - :meth:`ExampleLogics.count_previous_residency_country`
            - :meth:`ExampleLogics.count_rel`
            - :meth:`ExampleLogics.count_foreign_family_resident`

    .. _Pandas.agg: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.agg.html
    .. _functools.reduce: https://docs.python.org/3/library/functools.html#functools.reduce
    """

    def __init__(self, dataframe: pd.DataFrame = None) -> None:
        """Init class by setting dataframe globally

        Args:
            dataframe (:class:`pandas.DataFrame`, optional): The dataframe that
                functions of this class will be used over its series, i.e.
                ``Logics.*(series)``. Defaults to None.
        """
        self.df = dataframe

    def __check_df(self, func: str) -> None:
        """Checks that :attr:`df` is initialized when function with the name ``func`` is being called

        Args:
            func (str): The name of the function that operates over :attr:`df`

        Raises:
            TypeError: If :attr:`df` is not initialized
        """
        if self.df is None:
            raise TypeError(
                f'`df` attribute cannot be `None` when using "{func}".')

    def reset_dataframe(self, dataframe: pd.DataFrame) -> None:
        """Takes a new dataframe and replaces the old one

        Note:
            This should be used when the dataframe is modified outside of
            functions provided in this class. E.g.::

                my_df: pd.DataFrame = ...
                logics = Logics(dataframe=my_df)
                my_df = third_party_tools(my_df)
                # now update df in logics
                logics.reset_dataframe(dataframe=my_df)

        Args:
            dataframe (:class:`pandas.DataFrame`): The new dataframe
        """
        self.df = dataframe

    def add_agg_column(
        self,
        aggregator: Callable,
        agg_column_name: str,
        columns: list
    ) -> pd.DataFrame:
        """Aggregate columns and adds it to the original dataframe using an aggregator function

        The aggregator is applied row-wise (``axis=1``) over the selected
        ``columns`` and its result is stored in :attr:`df` under
        ``agg_column_name``.

        Args:
            aggregator (Callable): A function that takes multiple columns of a
                series and reduces it. Any callable is accepted, including
                ones without a ``__name__`` (e.g. :func:`functools.partial`).
            agg_column_name (str): The name of new aggregated column
            columns (list): Name of columns to be aggregated (i.e. input
                to ``aggregator``)

        Note:
            Although this function updates the dataframe the class was
            initialized with *inplace*, the user must update the main
            dataframe outside of this class to make sure it can be used
            via different tools. Simply put::

                my_df: pd.DataFrame = ...
                logics = Logics(dataframe=my_df)
                my_df = logics.add_agg_column(...)
                my_df = third_party_tools(my_df)
                # now update df in logics
                logics.reset_dataframe(dataframe=my_df)
                # aggregate again...
                my_df = logics.add_agg_column(...)

        Raises:
            TypeError: If :attr:`df` is not initialized

        Returns:
            :class:`pandas.DataFrame`: Updated dataframe that contains
            aggregated data
        """
        # check self.df is initialized
        self.__check_df(func=self.add_agg_column.__name__)
        self.df = cast(pd.DataFrame, self.df)  # for mypy only

        # The aggregated values are written straight into the target column,
        # so no rename keyed on ``aggregator.__name__`` is needed: such a
        # rename fails for callables without ``__name__`` and would turn an
        # unrelated column sharing the aggregator's name into a duplicate.
        self.df[agg_column_name] = self.df[columns].agg(aggregator, axis=1)

        # return updated dataframe to be used outside of this class
        return self.df

    def count_previous_residency_country(self, series: pd.Series) -> int:
        """Counts the number of previous country of resident

        Args:
            series (:class:`pandas.Series`): Pandas Series to be processed

        Raises:
            NotImplementedError: Always; subclasses provide the logic

        Returns:
            int: Result of counting
        """
        raise NotImplementedError

    def count_rel(self, series: pd.Series) -> int:
        """Counts the number of items for the given relationship

        Args:
            series (:class:`pandas.Series`): Pandas Series to be processed

        Raises:
            NotImplementedError: Always; subclasses provide the logic

        Returns:
            int: Result of counting
        """
        raise NotImplementedError

    def count_foreign_family_resident(self, series: pd.Series) -> int:
        """Counts the number of family members that are living in a foreign country

        Args:
            series (:class:`pandas.Series`): Pandas Series to be processed

        Raises:
            NotImplementedError: Always; subclasses provide the logic

        Returns:
            int: Result of counting
        """
        raise NotImplementedError
class ExampleLogics(Logics):
    """Customize and extend logics defined in :class:`Logics` for an Example (Canada) dataset

    Each ``count_*`` method folds a :class:`pandas.Series` into a single
    integer with :func:`functools.reduce`, starting from ``0``, so an empty
    series yields ``0``.
    """

    def __init__(self, dataframe: pd.DataFrame = None) -> None:
        """Init class by forwarding ``dataframe`` to :class:`Logics`

        Args:
            dataframe (:class:`pandas.DataFrame`, optional): The dataframe
                stored on the instance. Defaults to None.
        """
        super().__init__(dataframe)

    def count_previous_residency_country(self, series: pd.Series) -> int:
        """Counts the number of previous residency by counting non-zero periods of residency

        When ``*.Period == 0``, then we can say that the person has no
        residency. This way one just needs to count non-zero periods.

        Every element that is not equal to zero adds one to the result,
        whatever the length of ``series`` is. Missing values are not treated
        specially: anything that compares unequal to ``0`` is counted.

        Args:
            series (:class:`pandas.Series`): Pandas Series to be processed
                containing residency periods

        Returns:
            int: Result of counting
        """

        def counter(period):
            return np.sum(period != 0)

        # Accumulate from 0 (same shape as ``count_rel``) so the result is a
        # real count for any number of periods, not only for exactly two.
        return int(
            reduce(lambda total, period: total + counter(period), series, 0))

    def count_rel(self, series: pd.Series) -> int:
        """Counts the number of people for the given relationship, e.g. siblings.

        Every element that is not equal to zero adds one to the result.

        Args:
            series (:class:`pandas.Series`): Pandas Series to be processed

        Returns:
            int: Result of counting
        """

        def counter(value):
            return np.sum(value != 0.)

        return int(
            reduce(lambda total, value: total + counter(value), series, 0))

    def count_foreign_family_resident(self, series: pd.Series) -> int:
        """Counts the number of family members that are long distance resident

        This is being done by only checking the literal value ``'foreign'``
        in the ``'*Addr'`` columns (address columns).

        Args:
            series (:class:`pandas.Series`): Pandas Series to be processed
                containing the residency state/province in string.
                Only elements equal to ``'foreign'`` are counted here; the
                original note says that, in practice, any string different
                from applicant's province is counted as a difference
                (presumably such values are mapped to ``'foreign'`` before
                this method is used -- verify against the caller).

        Examples:
            >>> import pandas as pd
            >>> from niklib.data.logic import ExampleLogics
            >>> f = ExampleLogics().count_foreign_family_resident
            >>> s = pd.Series(['alborz', 'alborz', 'alborz', None, 'foreign', None, 'gilan', 'isfahan', None])
            >>> f(s)
            1
            >>> s1 = pd.Series(['foreign', 'foreign', 'alborz', 'fars'])
            >>> f(s1)
            2
            >>> s2 = pd.Series([None, None, 'alborz', 'fars'])
            >>> f(s2)
            0

        Returns:
            int: Result of counting
        """

        def counter(address):
            return np.sum(np.isin([address], ['foreign']))  # type: ignore

        # ``int`` keeps the return value a builtin integer, matching the
        # annotation and the plain ``1``/``2``/``0`` shown in the examples.
        return int(
            reduce(lambda total, address: total + counter(address), series, 0))  # type: ignore