Source code for niklib.models.preprocessors.helpers

# helpers
import logging
from typing import Iterator, List, Union

# core
import numpy as np
import pandas as pd

# ours: models
from niklib.models.preprocessors import ColumnTransformer
from niklib.models.preprocessors import OneHotEncoder


# module-level logger named after this module (``__name__``); no handlers or
# levels are set here
logger = logging.getLogger(__name__)


def preview_column_transformer(
    column_transformer: ColumnTransformer,
    original: np.ndarray,
    transformed: np.ndarray,
    df: pd.DataFrame,
    random_state: Union[int, np.random.Generator] = np.random.default_rng(),
    **kwargs
) -> Iterator[pd.DataFrame]:
    """Preview transformed data next to original one obtained via ``ColumnTransformer``

    This is a generator: nothing (including the sampling of rows) is executed
    until it is iterated.

    When the transformation is not ``OneHotEncoder``, the transformed data is
    previewed next to the original data in a pandas dataframe. But when the
    transformation is ``OneHotEncoder``, seeing only 0s and 1s is neither clean
    nor informative, so nothing is yielded for that transformer and the
    following information is logged (at ``WARNING`` level) instead:

        - The number of columns affected by transformation
        - The number of unique values in all of affected columns
        - The number of newly produced columns

    Iteration stops as soon as the ``'remainder'`` entry of
    ``column_transformer.output_indices_`` is reached.

    Args:
        column_transformer (ColumnTransformer): A *fitted* column transformer;
            ``output_indices_``, ``transformers`` and the private ``_columns``
            attribute are read from it.
        original (:class:`numpy.ndarray`): Original data, 2d. Rows are sampled
            from it by position.
        transformed (:class:`numpy.ndarray`): Transformed data, 2d, indexed
            with the same sampled row positions as ``original``. For every
            previewed (non one-hot) transformer, the number of produced columns
            must equal the number of its input columns, otherwise the reshape
            fails.
        df (:class:`pandas.DataFrame`): A dataframe that hosts the ``original``
            data. Used to extract column names and unique values for logging
            information about the transformations done.
        random_state (Union[int, :class:`numpy.random.Generator`], optional):
            A seed value or instance of :class:`numpy.random.Generator` for
            sampling. Defaults to :func:`numpy.random.default_rng()`; note that
            this default generator is created once, when the function is
            defined, and is therefore shared between calls.
        **kwargs: Additional arguments as follows:

            * ``n_samples`` (int): Number of rows to draw (without
              replacement). Defaults to 1.

    Yields:
        :class:`pandas.DataFrame`: One preview dataframe per non one-hot
        transformer, with interleaved ``<col>_og`` / ``<col>_tf`` columns, i.e.
        of shape ``(n_samples, 2 * n_columns_of_that_transformer)``. When
        ``n_samples == 1`` the dataframe is transposed for readability.
    """

    # extract kwargs
    n_samples = kwargs.get('n_samples', 1)

    # just aliases for shorter lines
    ct = column_transformer

    # set rng (numpy integer seeds are accepted as well as builtin ints)
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.default_rng(random_state)

    # generate sample indices
    sample_indices = random_state.choice(
        original.shape[0], size=n_samples, replace=False
    )
    # column vector, so it broadcasts against the list of column indices
    sample_indices = sample_indices.reshape(-1, 1)

    # loop through each transform (over subset of columns) and preview it
    for idx, k in enumerate(ct.output_indices_):
        # 'remainder' is not transformed, so end of the loop
        if k == 'remainder':
            return

        # get indices of the transformed columns
        transformed_columns_slice: Union[list, slice] = ct.output_indices_[k]
        if isinstance(transformed_columns_slice, slice):
            # materialize the slice as explicit positions
            transformed_columns_indices = list(
                range(transformed_columns_slice.stop)[transformed_columns_slice]
            )
        else:
            transformed_columns_indices = transformed_columns_slice

        # get indices of the original columns
        # NOTE(review): relies on the private ``_columns`` attribute and
        # assumes its entries are integer positions -- confirm against caller
        columns_indices = ct._columns[idx]
        # get names of the original columns
        columns_indices_names = df.columns.values[columns_indices]

        # preview transformed and original data side by side
        if not isinstance(ct.transformers[idx][1], OneHotEncoder):
            n_rows = sample_indices.shape[0]
            n_cols = len(columns_indices)

            # compare the values of the transformed and the original columns
            # (reshape guarantees 2d even for a single row / single column)
            original_sample = original[
                sample_indices, columns_indices
            ].reshape(n_rows, n_cols)
            transformed_sample = transformed[
                sample_indices, transformed_columns_indices
            ].reshape(n_rows, n_cols)

            # interleave: even columns hold original, odd hold transformed
            sample = np.empty(shape=(n_rows, n_cols * 2))
            sample[:, ::2] = original_sample
            sample[:, 1::2] = transformed_sample

            preview_cols: List[str] = []
            for c in columns_indices_names:
                preview_cols.extend([f'{c}_og', f'{c}_tf'])
            preview_df = pd.DataFrame(sample, columns=preview_cols)

            # yield the previews
            if n_samples == 1:
                # just better visuals for single sample
                yield preview_df.T
            else:
                yield preview_df

        # show info about onehot encoder changes
        else:
            count_uniques = df.iloc[:, columns_indices].nunique().sum()
            logger.warning(f'For "{ct.transformers[idx][0]}" transformer: ')
            logger.warning(f'{len(columns_indices)} columns are affected. ')
            logger.warning(
                f'On selected columns, {count_uniques} unique values exist. '
                f'It is expected to have {count_uniques - len(columns_indices)}'
                f' new columns and '
                f'{len(transformed_columns_indices) - len(columns_indices)}'
                f' columns are newly produced.\n'
            )