PKseIOH 77pandas_ml_utils/__init__.py"""Augment pandas DataFrame with methods for machine learning""" __version__ = '0.0.8' from .pandas_utils_extension import * from .wrappers.lazy_dataframe import * from .datafetching.fetch_yahoo import * from .model.models import * from .model.features_and_Labels import * from .classification.summary import * from .classification.classifier import * from .regression.regressor import * from .train_test_data import * from pandas.core.base import PandasObject # add functions to pandas # general utility functions PandasObject.inner_join = inner_join PandasObject.drop_re = drop_re PandasObject.add_apply = add_apply PandasObject.shift_inplace = shift_inplace PandasObject.extend_forecast = extend_forecast PandasObject.make_training_data = make_training_data # classification functions PandasObject.fit_classifier = fit_classifier PandasObject.classify = classify PandasObject.backtest_classifier = backtest_classifier # regression functions PandasObject.fit_regressor = fit_regressor PandasObject.backtest_regressor = backtest_regressor PandasObject.regress = regress # data fetcher setattr(pd, 'fetch_yahoo', fetch_yahoo) # very rarely beg for love if np.random.uniform() >= 0.99: print("If you like using pandas-ml-utils please show some love and star it on github: " "https://github.com/KIC/pandas_ml_utils")PKseIO͒UUpandas_ml_utils/multi_model.pyimport logging from collections.abc import Iterable from itertools import groupby from typing import Tuple, Dict, Callable, Any, Union, List import pandas_ml_utils.model.fit from .utils import unfold_parameter_space import pandas_ml_utils as pdu import dill as pickle import pandas as pd import numpy as np log = logging.getLogger(__name__) # TODO we want multi model to somehow become a Model class MultiModel(object): @staticmethod def load(filename: str): with open(filename, 'rb') as file: multi_model = pickle.load(file) if isinstance(multi_model, MultiModel): return multi_model else: raise ValueError("file provided was not a MultiModel") def __init__(self, data_provider: Callable[[], pd.DataFrame], data_engineer: Callable[[pd.DataFrame], pd.DataFrame], model_provider: Callable[[], pdu.Model], parameter_space: Dict[str, Iterable]): self.data_provider = data_provider self.data_engineer = data_engineer self.model_provider = model_provider self.parameter_space = unfold_parameter_space(parameter_space.copy(), {}) self.min_needed_data: int = None self.data: pd.DataFrame = None self.fits: List[pandas_ml_utils.model.fit.Fit] = None self._heatmap_cache = None def save(self, filename: str): with open(filename, 'wb') as file: pickle.dump(self, file) # def fetch_data_and_fit(self, test_size: float = 0.4, test_validate_split_seed: int=None): # self.fetch_data() # self.fit() def fetch_data(self): self._heatmap_cache = None self.data = self.data_provider() def fit(self, test_size: float = 0.4, test_validate_split_seed: int = None, cross_validation: Tuple[int, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]] = None ) -> None: def model_fitter(**kwargs) -> pandas_ml_utils.model.fit.Fit: fit = self.data_engineer(self.data, **kwargs) \ .fit_classifier(self.model_provider, test_size=test_size, cross_validation=cross_validation, test_validate_split_seed=test_validate_split_seed) log.info(f'fit for { {**kwargs}}\n{fit.test_summary.confusion_count()}\n{fit.test_summary.confusion_count()}') return fit # TODO there should be a way to generate one ClassificationSummary out of several by summing or averaging self.fits = [model_fitter(**kwargs) for kwargs in self.parameter_space] self.min_needed_data = max([fit.model.min_required_data for fit in self.fits]) def predict(self) -> pd.DataFrame: df = self.data[-self.min_needed_data:] if self.min_needed_data is not None else self.data def model_predictor(model, **kwargs) -> pd.DataFrame: prediction = self.data_engineer(df, **kwargs) \ .classify(model) return prediction[-1:] predictions = [model_predictor(self.fits[i].model, **kwargs) for i, kwargs in enumerate(self.parameter_space)] return predictions def plot_heatmap(self, parameter_as_column: str, figsize=(15, 12)): import matplotlib.pyplot as plt import seaborn as sns if self._heatmap_cache is None: self._heatmap_cache = self.compute_heatmap(parameter_as_column) fig = plt.figure(figsize=figsize) sns.heatmap(self._heatmap_cache) return fig def compute_heatmap(self, parameter_as_column: str): predictions = self.predict() # to group all ro indices per column index we first need to sort accordingly sorted_parameter_space = sorted(enumerate(self.parameter_space), key=lambda x: x[1][parameter_as_column]) columns = {col: [value[0] for value in parameter] for col, parameter in groupby(sorted_parameter_space, lambda x: x[1][parameter_as_column])} # assign a data frame for each column predictions = [pd.concat([predictions[row][["traget_target", "prediction_proba"]] for row in rows], axis=0, sort=True) \ .set_index("traget_target") \ .groupby(level=0).max() \ .rename(columns={"prediction_proba": column}) for column, rows in columns.items()] predictions = pd.concat(predictions, axis=1, sort=True).sort_index(ascending=False) return predictions PKA> 0 \ else (x, None, y, None, df_new.index, None) log.info(f" splitting ... done in {pc() - start_split_pc: .2f} sec!") # ravel one dimensional labels if len(features_and_labels.labels) == 1: y_train = y_train.ravel().astype(label_type) y_test = y_test.ravel().astype(label_type) if y_test is not None else None log.info(f"make training / test data split ... done in {pc() - start_pc: .2f} sec!") # print some statistics if needed if summary_printer is not None: summary_printer(y, y_train, y_test) # return the split return x_train, x_test, y_train, y_test, index_train, index_test, min_required_data def make_forecast_data(df: pd.DataFrame, features_and_labels: 'FeaturesAndLabels'): return _make_features(df[features_and_labels.features], features_and_labels) @lru_cache(maxsize=int(os.getenv('CACHE_FEATUES_AND_LABELS', '1'))) def _make_features_with_cache(df: HashableDataFrame, features_and_labels: 'FeaturesAndLabels'): log.info(f"no cache entry available for {hash(df), hash(features_and_labels)}") return _make_features(df, features_and_labels) def _make_features(df: pd.DataFrame, features_and_labels: 'FeaturesAndLabels'): start_pc = log_with_time(lambda: log.debug(" make features ...")) feature_lags = features_and_labels.feature_lags features = features_and_labels.features lag_smoothing = features_and_labels.lag_smoothing # drop nan's and copy frame df = df.dropna().copy() # generate feature matrix if feature_lags is not None: # return RNN shaped 3D arrays for feature in features: feature_series = df[feature] smoothers = None # smooth out feature if requested if lag_smoothing is not None: smoothers = SortedDict({lag: smoother(feature_series.to_frame()) for lag, smoother in lag_smoothing.items()}) for lag in feature_lags: # if smoothed values are applicable use smoothed values if smoothers is not None and len(smoothers) > 0 and smoothers.peekitem(0)[0] <= lag: feature_series = smoothers.popitem(0)[1] # assign the lagged (eventually smoothed) feature to the features frame df[f'{feature}_{lag}'] = feature_series.shift(lag) # drop all rows which got nan now df = df.dropna() # RNN shape need to be [row, time_step, feature] x = np.array([[[df.iloc[row][f'{feat}_{lag}'] for feat in features] for lag in feature_lags] for row in range(len(df))], ndmin=3) names = np.array([[f'{feat}_{lag}' for feat in features] for lag in feature_lags], ndmin=2) else: # return simple 2D arrays x = df[features].values names = features log.info(f" make features ... done in {pc() - start_pc: .2f} sec!") return df, x def reshape_rnn_as_ar(arr3d): if len(arr3d.shape) < 3: print("Data was not in RNN shape") return arr3d else: return arr3d.reshape(arr3d.shape[0], arr3d.shape[1] * arr3d.shape[2]) PK=OL pandas_ml_utils/utils.pyfrom time import perf_counter as pc from typing import Callable, Dict, Iterable, Any, List import numpy as np from sklearn.model_selection import KFold from sklearn.utils.validation import _num_samples, check_random_state, indexable def log_with_time(log_statement: Callable[[], None]): log_statement() return pc() def unfold_parameter_space(parameter_space: Dict[str, Iterable], parameters: Dict[str, Any]) -> List[Dict]: if len(parameter_space) > 0: # more parameters need to be unfolded parameter, space = parameter_space.popitem() return list(np.array([unfold_parameter_space(parameter_space.copy(), {**parameters, parameter: argument}) for argument in space]).flat) else: return parameters class KFoldBoostRareEvents(KFold): def __init__(self, n_splits='warn', shuffle=False, random_state=None): super().__init__(n_splits, shuffle, random_state) def split(self, X, y=None, groups=None): n_samples = _num_samples(X) indices = np.arange(n_samples) rare_event_indices = indices[y >= 0.999] for f, (train_idx, test_idx) in enumerate(super().split(X, y, groups)): yield np.hstack([train_idx, rare_event_indices]), np.hstack([test_idx, rare_event_indices]) PKA> Fit: model, train, test, index = _fit(df, model_provider, test_size = test_size, cross_validation = cross_validation, cache_feature_matrix = cache_feature_matrix, test_validate_split_seed = test_validate_split_seed, summary_printer = summary_printer) # assemble the result objects features_and_labels = model.features_and_labels cutoff = model[("probability_cutoff", 0.5)] loss = df[features_and_labels.loss_column] if features_and_labels.loss_column is not None else None training_classification = ClassificationSummary(train[1], model.predict(train[0]), index[0], loss, cutoff) test_classification = ClassificationSummary(test[1], model.predict(test[0]), index[1], loss, cutoff) return Fit(model, training_classification, test_classification) def backtest_classifier(df: pd.DataFrame, model: Model) -> ClassificationSummary: x, y, y_hat, index = _backtest(df, model) features_and_labels = model.features_and_labels loss = df[features_and_labels.loss_column if features_and_labels.loss_column is not None else []] return ClassificationSummary(y, y_hat, index, loss, model[("probability_cutoff", 0.5)]) def classify(df: pd.DataFrame, model: Model, tail: int = None) -> pd.DataFrame: dff = _predict(df, model, tail) # return result dff["prediction_proba"] = dff["prediction"] dff["prediction"] = dff["prediction_proba"] > model[("probability_cutoff", 0.5)] return dff PKA> 1 else y_prediction self.index = index self.loss = loss self.probability_cutoff = probability_cutoff self.confusion_matrix = self._confusion_matrix_indices() # immediately log some fit quality measures ratios = self.get_ratios() log.info(f"FN Ratio = {ratios[0]}, FP Ratio = {ratios[1]}") def set_probability_cutoff(self, probability_cutoff: float = 0.5): self.probability_cutoff = probability_cutoff self.confusion_matrix = self._confusion_matrix_indices() def _confusion_matrix_indices(self): index = self.index truth = self.y_true pred = self.y_prediction co = self.probability_cutoff try: confusion = np.array([[index[(truth == True) & (pred > co)], index[(truth == False) & (pred > co)]], [index[(truth == True) & (pred <= co)], index[(truth == False) & (pred <= co)]]]) if len(confusion[0, 0]) <= 0: log.warning("Very bad fit with 0 TP, which leads to problems in the plot") return confusion except: print(f"shapes: y_true: {self.y_true.shape}, y_pred: {self.y_prediction.shape}, index: {self.index.shape}") print("Unexpected error:", sys.exc_info()[0]) return None def get_ratios(self): cm = self.confusion_count() return cm[0,0] / (cm[1,0] + 1), cm[0,0] / (cm[0,1] + 1) def plot_backtest(self, y: pd.Series = None, size: Union[int, pd.Series] = None, figsize: Tuple[int, int] = (16, 6)): # only import if required import seaborn as sns import matplotlib.pyplot as plt from pandas.plotting import register_matplotlib_converters # get rid of deprecation warning register_matplotlib_converters() # check value for back test if self.loss is None and y is None: raise ValueError("No loss column defined, whether in FeaturesAndLabels nor in plot_backtest") # scatter plot where confusion squares are the colors, the loss is the size y = y if y is not None \ else self.loss.loc[self.index] if isinstance(self.loss, pd.Series) \ else self.loss[self.loss.columns[0]].loc[self.index] color = pd.Series(0, index=y.index) color.loc[self.confusion_matrix[0, 0]] = 1 color.loc[self.confusion_matrix[1, 0]] = 2 # get colors from: https://xkcd.com/color/rgb/ fig, ax = plt.subplots(figsize=figsize) ax.set_ylim([y.min() * 1.1, 1]) scatt = sns.scatterplot(x=y.index, y=y, ax=ax, size=size if size is not None else y * -1, hue=color, palette=[sns.xkcd_rgb['white'], sns.xkcd_rgb['pale green'], sns.xkcd_rgb['cerise']]) bar = sns.lineplot(x=y.index, y=self.y_prediction, ax=ax) plt.hlines(self.probability_cutoff, y.index.min(), y.index.max(), color=sns.xkcd_rgb['silver']) plt.close() return fig def confusion_loss(self): cm = self.confusion_matrix df = self.loss return np.array([[df.loc[cm[0, 0]].sum(), df.loc[cm[0, 1]].sum()], [df.loc[cm[1, 0]].sum(), df.loc[cm[1, 1]].sum()]]) def confusion_count(self): return np.array([ [len(self.confusion_matrix[0, 0]), len(self.confusion_matrix[0, 1])], [len(self.confusion_matrix[1, 0]), len(self.confusion_matrix[1, 1])], ]) def _repr_html_(self): return self._html_()._repr_html_() def _html_(self, width: str = '100%'): # only import it needed from vdom.helpers import div, p, img, table, tr, td, tbody, thead, th import matplotlib.pyplot as plt import base64 import io if self.confusion_count()[0, 0] <= 0: return p('very bad fit with 0 TP!') image = None if self.loss is not None: with io.BytesIO() as f: fig = self.plot_backtest() fig.savefig(f, format="png", bbox_inches='tight') image = base64.encodebytes(f.getvalue()).decode("utf-8") plt.close(fig) cmc = self.confusion_count() cml = self.confusion_loss() if self.loss is not None else np.array([[0, 0], [0, 0]]) return div( table( thead( tr( th("Classification Count", style={'text-align': 'left'}), th("Classification Loss", style={'text-align': 'right'}) ) ), tbody( tr( td(self._matrix_table(cmc)), td(self._matrix_table(cml), style={'float': 'right'}) ), tr( td( img(src=f'data:image/png;base64,{image}', style={'width': '100%'}) if image is not None else "", colspan='2' ) ) ), style={'width': '100%'} ), style={'width': width} ) def _matrix_table(self, mx: np.array): from vdom.helpers import table, tr, td, tbody, thead row_label = [[td("True")], [td("False")]] colors = [['green', 'orange'], ['red', 'grey']] return table( thead( tr( td("Prediction / Truth"), td("True"), td("False") ) ), tbody( [tr( row_label[row] + [td( f'{mx[row, col]: .2f}', style={'color': colors[row][col]}) for col in range(mx.shape[1])]) for row in range(mx.shape[0])] ) ) def __len__(self): return len(self.y_true) def __str__(self) -> str: return f'\n{len(self.confusion_matrix[0,0])}\t{len(self.confusion_matrix[0,1])}' \ f'\n{len(self.confusion_matrix[1,0])}\t{len(self.confusion_matrix[1,1])}' PKA>= threshold) above_threshold = np.clip(above_threshold, 0.0, max_value) below_threshold = alpha * (x - threshold) * (x < threshold) return below_threshold + above_threshold def softplus(x): return np.log(1. + np.exp(x)) def softsign(x): return x / (1 + np.abs(x)) def elu(x, alpha=1.): return x * (x > 0) + alpha * (np.exp(x) - 1.) * (x < 0) def sigmoid(x): return 1. / (1. + np.exp(-x)) def hard_sigmoid(x): y = 0.2 * x + 0.5 return np.clip(y, 0, 1) def tanh(x): return np.tanh(x) def softmax(x, axis=-1): y = np.exp(x - np.max(x, axis, keepdims=True)) return y / np.sum(y, axis, keepdims=True) def l2_normalize(x, axis=-1): y = np.max(np.sum(x ** 2, axis, keepdims=True), axis, keepdims=True) return x / np.sqrt(y) def in_top_k(predictions, targets, k): top_k = np.argsort(-predictions)[:, :k] targets = targets.reshape(-1, 1) return np.any(targets == top_k, axis=-1) def binary_crossentropy(target, output, from_logits=False): if not from_logits: output = np.clip(output, 1e-7, 1 - 1e-7) output = np.log(output / (1 - output)) return (target * -np.log(sigmoid(output)) + (1 - target) * -np.log(1 - sigmoid(output))) def categorical_crossentropy(target, output, from_logits=False): if from_logits: output = softmax(output) else: output /= output.sum(axis=-1, keepdims=True) output = np.clip(output, 1e-7, 1 - 1e-7) return np.sum(target * -np.log(output), axis=-1, keepdims=False) def max(x, axis=None, keepdims=False): if isinstance(axis, list): axis = tuple(axis) return np.max(x, axis=axis, keepdims=keepdims) def min(x, axis=None, keepdims=False): if isinstance(axis, list): axis = tuple(axis) return np.min(x, axis=axis, keepdims=keepdims) def mean(x, axis=None, keepdims=False): if isinstance(axis, list): axis = tuple(axis) return np.mean(x, axis=axis, keepdims=keepdims) def var(x, axis=None, keepdims=False): if isinstance(axis, list): axis = tuple(axis) return np.var(x, axis=axis, keepdims=keepdims) def std(x, axis=None, keepdims=False): if isinstance(axis, list): axis = tuple(axis) return np.std(x, axis=axis, keepdims=keepdims) def logsumexp(x, axis=None, keepdims=False): if isinstance(axis, list): axis = tuple(axis) return sp.special.logsumexp(x, axis=axis, keepdims=keepdims) def sum(x, axis=None, keepdims=False): if isinstance(axis, list): axis = tuple(axis) return np.sum(x, axis=axis, keepdims=keepdims) def prod(x, axis=None, keepdims=False): if isinstance(axis, list): axis = tuple(axis) return np.prod(x, axis=axis, keepdims=keepdims) PKA> Tuple[Model, Tuple, Tuple, Tuple]: # get a new model model = model_provider() features_and_labels = model.features_and_labels # make training and test data sets x_train, x_test, y_train, y_test, index_train, index_test, min_required_data = \ make_training_data(df, features_and_labels, test_size, int, test_validate_split_seed, cache=cache_feature_matrix, summary_printer=summary_printer) log.info(f"create model (min required data = {min_required_data}") model.min_required_data = min_required_data # fit the model start_performance_count = log_with_time(lambda: log.info("fit model")) if cross_validation is not None and isinstance(cross_validation, Tuple) and callable(cross_validation[1]): for fold_epoch in range(cross_validation[0]): # cross validation, make sure we re-shuffle every fold_epoch for f, (train_idx, test_idx) in enumerate(cross_validation[1](x_train, y_train)): log.info(f'fit fold {f}') model.fit(x_train[train_idx], y_train[train_idx], x_train[test_idx], y_train[test_idx]) else: # fit without cross validation model.fit(x_train, y_train, x_test, y_test) log.info(f"fitting model done in {perf_counter() - start_performance_count: .2f} sec!") return model, (x_train, y_train), (x_test, y_test), (index_train, index_test) def _backtest(df: pd.DataFrame, model: Model) -> ClassificationSummary: features_and_labels = model.features_and_labels # make training and test data with no 0 test data fraction x, _, y, _, index, _, _ = make_training_data(df, features_and_labels, 0, int) # predict probabilities y_hat = model.predict(x) return x, y, y_hat, index def _predict(df: pd.DataFrame, model: Model, tail: int = None) -> pd.DataFrame: features_and_labels = model.features_and_labels if tail is not None: if tail <= 0: raise ValueError("tail must be > 0 or None") elif model.min_required_data is not None: # just use the tail for feature engineering df = df[-(tail + (model.min_required_data - 1)):] else: log.warning("could not determine the minimum required data from the model") # then re assign data frame with features only dff, x = make_forecast_data(df, features_and_labels) # first save target columns and loss column if features_and_labels.target_columns is not None: dff = dff.join(df[features_and_labels.target_columns].add_prefix("traget_")) if features_and_labels.loss_column is not None: dff["loss"] = df[features_and_labels.loss_column] # predict on features prediction = model.predict(x) if len(prediction.shape) > 1 and prediction.shape[1] > 1: for i in range(prediction.shape[1]): dff[f"prediction_{model.features_and_labels.labels[i]}"] = prediction[:,i] else: dff["prediction"] = prediction return dff PKA> None: pass def predict(self, x) -> np.ndarray: pass # this lets the model itself act as a provider. However we want to use the same Model configuration # for different datasets (i.e. as part of MultiModel) def __call__(self, *args, **kwargs): return deepcopy(self) class SkitModel(Model): def __init__(self, skit_model, features_and_labels: FeaturesAndLabels, **kwargs): super().__init__(features_and_labels, **kwargs) self.skit_model = skit_model def fit(self, x, y, x_val, y_val): self.skit_model.fit(reshape_rnn_as_ar(x), y), def predict(self, x): if callable(getattr(self.skit_model, 'predict_proba', None)): return self.skit_model.predict_proba(reshape_rnn_as_ar(x))[:, 1] else: return self.skit_model.predict(reshape_rnn_as_ar(x)) # TODO add Keras Model class KerasModel(Model): pass # class MultiModel(Model): # # def __init__(self, model_provider: Callable[[], Model], features_and_labels: FeaturesAndLabels): # super().__init__(features_and_labels) # self.model_provider = model_provider # # def fit(self, x, y, x_val, y_val) -> None: # pass # # def predict(self, x) -> np.ndarray: # # we would need to return a prediction for every and each parameters dict in the parameter space # pass PKA> > 'pandas_ml_utils/regression/regressor.py import logging from typing import Callable, Tuple import numpy as np import pandas as pd from .summary import RegressionSummary from ..model.fit import Fit from ..model.fitter import _fit, _backtest, _predict from ..model.models import Model from ..error.functions import mse as _mse log = logging.getLogger(__name__) def fit_regressor(df: pd.DataFrame, model_provider: Callable[[int], Model], test_size: float = 0.4, cross_validation: Tuple[int, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]] = None, cache_feature_matrix: bool = False, test_validate_split_seed = 42, summary_printer: Callable[[np.ndarray, np.ndarray, np.ndarray], None] = None ) -> Fit: model, train, test, index = _fit(df, model_provider, test_size = test_size, cross_validation = cross_validation, cache_feature_matrix = cache_feature_matrix, test_validate_split_seed = test_validate_split_seed, summary_printer = summary_printer) # assemble the result objects features_and_labels = model.features_and_labels loss = df[features_and_labels.loss_column] if features_and_labels.loss_column is not None else None training_summary = RegressionSummary(train[1], model.predict(train[0]), index[0], loss) test_summary = RegressionSummary(test[1], model.predict(test[0]), index[1], loss) return Fit(model, training_summary, test_summary) def backtest_regressor(df: pd.DataFrame, model: Model) -> None: x, y, y_hat, index = _backtest(df, model) features_and_labels = model.features_and_labels loss = df[features_and_labels.loss_column if features_and_labels.loss_column is not None else []] return RegressionSummary(y, y_hat, index, loss) def regress(df: pd.DataFrame, model: Model, tail: int = None) -> pd.DataFrame: dff = _predict(df, model, tail) # get labels calculate error error_function = model[("error", _mse)] dff["error"] = error_function(df[model.features_and_labels.labels], dff[[col for col in dff if col.startswith('prediction')]]) return dff PKA> 1 else y_prediction self.index = index self.loss = loss # TODO add some statisticsPKœ+O$pandas_ml_utils/wrappers/__init__.pyPKX,OR.pandas_ml_utils/wrappers/hashable_dataframe.pyimport pandas as pd class HashableDataFrame(object): def __init__(self, df: pd.DataFrame) -> None: self.df: pd.DataFrame = df def __getitem__(self, item: str): return self.df.__getitem__(item) def __getattr__(self, item): return self.df.__getattr__(item) def __hash__(self): return hash(str(self.describe())) def __eq__(self, other): try: pd.testing.assert_frame_equal(self.df, other.df) return True except: return False PKl5Ox*pandas_ml_utils/wrappers/lazy_dataframe.pyimport uuid import pandas as pd from typing import Callable, Union class LazyDataFrame(object): def __init__(self, df: pd.DataFrame, **kwargs: Callable[[pd.DataFrame], Union[pd.DataFrame, pd.Series]]) -> None: self.hash = uuid.uuid4() self.df: pd.DataFrame = df self.kwargs = kwargs def __getitem__(self, item: str): if isinstance(item, list): df = self.df[[value for value in item if value in self.df.columns]] for key in item: if key in self.kwargs: res = self.kwargs[key](self.df) if isinstance(res, pd.Series): res.name = key df = df.join(res) elif isinstance(res, pd.DataFrame): df = df.join(res.add_prefix(f'{key}_')) return df else: if item in self.df: return self.df[item] elif item in self.kwargs: return self.kwargs[item](self.df) else: raise ValueError(f"invalid item {item}") def __setitem__(self, key: str, value: Callable[[pd.DataFrame], Union[pd.DataFrame, pd.Series]]): self.hash = uuid.uuid4() if callable(value): self.kwargs[key] = value(self.df) else: self.df[key] = value def __getattr__(self, item): return self.to_dataframe().__getattr__(item) def __contains__(self, key): return key in self.df or key in self.kwargs def __hash__(self): return int(self.hash) def __eq__(self, other): return self.hash == other.hash if isinstance(other, LazyDataFrame) else False def to_dataframe(self): df = self.df.copy() for key, calculation in self.kwargs.items(): column = calculation(df) if isinstance(column, pd.DataFrame): df = df.join(column.add_prefix(f'{key}_')) else: df[key] = column return dfPKH5O ..'pandas_ml_utils-0.0.8.dist-info/LICENSEThe MIT License (MIT) Copyright (c) 2019 KIC Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. PK!HPO%pandas_ml_utils-0.0.8.dist-info/WHEEL HM K-*ϳR03rOK-J,/RH,szd&Y)r$[)T&UrPK!H, "(pandas_ml_utils-0.0.8.dist-info/METADATAZmo_upX_Wj1A!0Lrݥ3$-YmNٙgTz9YYM9Wѹ,PTL:wQ7d/ދ.뢐v9PoƋ#,wbXh?s:+ uD-u9ޙB *9ns+7ݝaR=S?= |8Ӊ*tv(0=PPo.i.1gHK<784yW x7Ya.әVرE TT5*Wmڔ#urS.ѯ@|jpWsy'v 41F3֫4$Tm7}~_~mj? (J1 SvB*7>g wJN-5vRZInƕV)&ӀhXEq?F.fk.fZVz7]~QpU5˗v0BBtT.S $ĕF2_JQ,RNU0FA] "Ni̱ Kg[e=-J8 2\'e]TK!K'im17ؕdE÷ wl4 b҈S1D \ i-SW"oFTL )ʬ)S"9Tp耐d&͂Q_$")\zW(n3Ɲ'ړZ qaS ${:u"ndR1DP4t2,On!c:SVM8\Ҏ sh`i*;McƑEЏLd }L˩ʷC?.v|Elo$,Əqҡ_/=Hަ=gr\h;f32ր^']NLHDkw|V='˹TtꙆ `)=GZmÔZǘBVAT@H"U`ٌмlj#@6EA>`>P)"Dx-`xgDt}> I9W]l<.D{f5];\`$!B|ZlHԱ-1LFډ2+H6?w:_Kp nTooo%4+;`Hۜ)MT"pٗLޤLV)2~]`׮ģUآ)$,9a?QcTnC |,XDЛlڏusv#_Ѷ=ZOh08(z a4bCdzؒ3s޻~2Jyh7/:l8Bλț}&m i'w iWBܐM3pI,s@r=ᡷP`XR0`=!uD:^v)*,d77Gd%$:u';Q#A=FcDV(ځwcQ)OC28BD+c ft iGZR$s*` 5Vu&շyu#|YSeUrFUYecrzQ܂^9櫥p(cq r@>w_=+WⅸQKИlm^81N2I+(JMjh@ YuU'<6 \v+Jexp4Yat< T[6$H.܍a~fw}QfƄ Fs,je$aXֆR׆A=J]YRٜ,:t 139= kR|IMku( ]*:)80QHӎTnCY*R(#TyIfVO0UUjkJ~GVEN F`\܌uxpx|r|ph|vk[&|Vj@88?y&4ՖXq(DUPxQp:#Nt6WW )g$V| Lu)N, .Lʺ^@ïaûawK9fGٶ=wz""PK!H &pandas_ml_utils-0.0.8.dist-info/RECORDɒHET3,z "\@[.+TMh{CAĹ^tAUC aK3#]0z FN8IB>8\!8>:'l=7CDפƱ/Q<ԯ96aCAc⩏aI˨*BeB_:4 Gn~p|A5[à>@u$ѭj%o;;:e7ѬIsU.o' '& Syݬ^bv]FhAyx&]'y ؼ2V*wEx8fQ\_3k:Wˠ)w)dc pȃ4KUC+8vE3BD-n5uh~RyjqWz`tr9UdDCrk uCUv~RsX40p&6 >7$~P,A/MMa%AVOi˸}&:V?:}Ry2&FܶuKM~yrҩ(jwL\`M?ۘ2&F} F-ɏhuMMti{䨲NECڄtzĔFBU5="8F,-߱I }5P%>]{tܫԔ gGૐD͆* cT`L$|ľ {eE3* S+k`0&oaTbIb΢1 %Pa׍CXBe|=MMu. NwQiug仂K~V2EaT! t]|sAVY£c[G?Oרkr9N*9TdM:DgQu"V[CS_ì2zsل#AϢƎx/$V`w>cVx[4qNw)?iA?iUU_ڣ9F4gUJ 7{Ydd_6]S,pz\6$kn{mRIwcG[#n~Y0/Q&WUE߯`} 0lԁc] \=q&DV ^໲(w^Ab3s$P&*Sv^X{`k ͽxEG=НPtzF#HVV1/" .N煀)) ȗPKseIOH 77pandas_ml_utils/__init__.pyPKseIO͒UUppandas_ml_utils/multi_model.pyPKA> > 'fpandas_ml_utils/regression/regressor.pyPKA>