Source code for niklib.models.preprocessors.helpers
# helpers
import logging
from typing import Iterator, List, Union
# core
import numpy as np
import pandas as pd
# ours: models
from niklib.models.preprocessors import ColumnTransformer
from niklib.models.preprocessors import OneHotEncoder
# configure logging
logger = logging.getLogger(__name__)
[docs]
def preview_column_transformer(
    column_transformer: ColumnTransformer,
    original: np.ndarray,
    transformed: np.ndarray,
    df: pd.DataFrame,
    random_state: Union[int, np.random.Generator] = np.random.default_rng(),
    **kwargs
) -> Iterator[pd.DataFrame]:
    """Preview transformed data next to original one obtained via ``ColumnTransformer``

    When the transformation is not a ``OneHotEncoder``, randomly sampled rows of
    the transformed columns are previewed next to the same rows of the original
    columns in a pandas dataframe.

    When the transformation is a ``OneHotEncoder``, seeing only 0s and 1s is
    neither clean nor informative, so no dataframe is produced for it. Instead,
    the following information is logged (at warning level):

        - The number of columns affected by transformation
        - The number of unique values in all of affected columns
        - The number of newly produced columns

    Transformers are matched to their input columns by *name* rather than by
    their position in ``output_indices_``, because the order of
    ``output_indices_`` is not guaranteed to follow ``transformers`` (see the
    comment in the body). The ``'remainder'`` entry and transformers that
    produce no output columns are skipped.

    Note:
        This is a generator function: nothing (including argument validation)
        runs until the returned iterator is consumed.

    Args:
        column_transformer (ColumnTransformer): A fitted column transformer. Its
            ``output_indices_``, ``transformers`` and ``_columns`` attributes
            are read.
        original (:class:`numpy.ndarray`): Original data. Must have the same
            number of rows as ``transformed``.
        transformed (:class:`numpy.ndarray`): Transformed data. Must have the
            same number of rows as ``original``.
        df (:class:`pandas.DataFrame`): A dataframe that hosts the ``original``
            data. Used to extract column names and unique values for logging
            information about the transformations done.
        random_state (Union[int, :class:`numpy.random.Generator`], optional): A seed value or
            instance of :class:`numpy.random.Generator` for sampling. The default
            generator is created once, when the function is defined, and is
            therefore shared between calls that rely on the default.
        **kwargs: Additional arguments as follows:

            * ``n_samples`` (int): Number of samples to draw. Defaults to 1.

    Raises:
        ValueError: If ``original`` and ``transformed`` do not have the same
            number of rows. NumPy also raises ``ValueError`` if a
            non-one-hot transformer outputs a different number of columns than
            it consumed (the side-by-side layout needs a 1-to-1 mapping), or if
            ``n_samples`` exceeds the number of rows.

    Yields:
        :class:`pandas.DataFrame`:
            One preview dataframe for each non-one-hot transformer. Columns
            alternate ``<name>_og`` (original) and ``<name>_tf`` (transformed),
            so it has twice as many columns as the transformer consumed and
            ``n_samples`` rows. When ``n_samples == 1`` the dataframe is
            transposed for better visuals.
    """
    # extract kwargs
    n_samples = kwargs.get('n_samples', 1)
    # just an alias for shorter lines
    ct = column_transformer

    # rows are sampled by position from both arrays, so they must line up
    if original.shape[0] != transformed.shape[0]:
        raise ValueError(
            f'"original" and "transformed" must have the same number of rows, '
            f'got {original.shape[0]} and {transformed.shape[0]}.'
        )

    # set rng (numpy integer scalars are valid seeds too)
    rng = random_state
    if isinstance(rng, (int, np.integer)):
        rng = np.random.default_rng(rng)

    # generate sample indices as a column vector so that they broadcast
    # against the list of column indices when fancy-indexing below
    sample_indices = rng.choice(
        original.shape[0],
        size=n_samples,
        replace=False
    )
    sample_indices = sample_indices.reshape(-1, 1)

    # ``output_indices_`` lists transformers that produced output first and only
    # then the ones that produced none (e.g. ``'drop'``), so a position in it is
    # not a valid index into ``transformers``/``_columns``; look positions up
    # by transformer name instead
    transformer_positions = {
        spec[0]: position for position, spec in enumerate(ct.transformers)
    }

    # loop through each transform (over subset of columns) and preview it
    for name in ct.output_indices_:
        # 'remainder' is not transformed, so there is nothing to preview
        if name == 'remainder' or name not in transformer_positions:
            continue
        idx = transformer_positions[name]

        # get indices of the transformed columns
        output_columns: Union[list, slice] = ct.output_indices_[name]
        if isinstance(output_columns, slice):
            transformed_columns_indices = list(
                range(output_columns.stop)[output_columns]
            )
        else:
            transformed_columns_indices = output_columns
        # transformers without any output column cannot be previewed
        if len(transformed_columns_indices) == 0:
            continue

        # get indices of the original columns
        # NOTE(review): assumes these are integer positions (not names/masks),
        #   as they index both ``original`` and ``df.columns`` -- confirm
        columns_indices = ct._columns[idx]
        # get names of the original columns
        columns_indices_names = df.columns.values[columns_indices]

        if not isinstance(ct.transformers[idx][1], OneHotEncoder):
            # preview transformed and original data side by side
            n_rows = sample_indices.shape[0]
            n_columns = len(columns_indices)
            # reshape to 2d; this also enforces the 1-to-1 column mapping
            original_sample = original[
                sample_indices,
                columns_indices
            ].reshape(n_rows, n_columns)
            transformed_sample = transformed[
                sample_indices,
                transformed_columns_indices
            ].reshape(n_rows, n_columns)

            sample = _interleave_columns(original_sample, transformed_sample)
            preview_cols: List[str] = [
                f'{c}{suffix}'
                for c in columns_indices_names
                for suffix in ('_og', '_tf')
            ]
            preview_df = pd.DataFrame(sample, columns=preview_cols)

            # yield the previews (transposed: better visuals for single sample)
            yield preview_df.T if n_samples == 1 else preview_df
        else:
            # show info about onehot encoder changes
            count_uniques = df.iloc[:, columns_indices].nunique().sum()
            logger.warning(f'For "{name}" transformer: ')
            logger.warning(f'{len(columns_indices)} columns are affected. ')
            logger.warning(
                f'On selected columns, {count_uniques} unique values exist. '
                f'It is expected to have {count_uniques - len(columns_indices)}'
                f' new columns and '
                f'{len(transformed_columns_indices) - len(columns_indices)}'
                f' columns are newly produced.\n'
            )


def _interleave_columns(left: np.ndarray, right: np.ndarray) -> np.ndarray:
    """Return a 2d array whose columns alternate between ``left`` and ``right``.

    A float buffer is tried first; if either input holds values that cannot be
    stored as floats (e.g. strings), an ``object`` buffer is used instead so
    that non-numeric columns can still be previewed.

    Args:
        left (:class:`numpy.ndarray`): 2d array placed at even column positions
        right (:class:`numpy.ndarray`): 2d array of the same shape as ``left``,
            placed at odd column positions

    Returns:
        :class:`numpy.ndarray`: Array of shape ``(left.shape[0], 2 * left.shape[1])``
    """
    shape = (left.shape[0], left.shape[1] * 2)
    interleaved = np.empty(shape=shape)
    try:
        interleaved[:, ::2] = left
        interleaved[:, 1::2] = right
    except (TypeError, ValueError):
        # values are not representable as floats; keep them as python objects
        interleaved = np.empty(shape=shape, dtype=object)
        interleaved[:, ::2] = left
        interleaved[:, 1::2] = right
    return interleaved