# ==== pandas_ml_utils/__init__.py ====
"""Augment pandas DataFrame with methods for machine learning"""
__version__ = '0.0.9'

from .pandas_utils_extension import *
from .wrappers.lazy_dataframe import *
from .datafetching.fetch_yahoo import *
from .model.models import *
from .model.features_and_Labels import *
from .model.selection import *
from .classification.summary import *
from .classification.classifier import *
from .reinforcement.agent import *
from .regression.regressor import *
from .train_test_data import *

from pandas.core.base import PandasObject

# add functions to pandas

# general utility functions
PandasObject.inner_join = inner_join
PandasObject.drop_re = drop_re
PandasObject.add_apply = add_apply
PandasObject.shift_inplace = shift_inplace
PandasObject.extend_forecast = extend_forecast
PandasObject.make_training_data = make_training_data

# feature selection
PandasObject.filtration = filtration

# classification functions
PandasObject.fit_classifier = fit_classifier
PandasObject.classify = classify
PandasObject.backtest_classifier = backtest_classifier

# regression functions
PandasObject.fit_regressor = fit_regressor
PandasObject.backtest_regressor = backtest_regressor
PandasObject.regress = regress

# reinforcement learning
PandasObject.fit_agent = fit_agent
PandasObject.backtest_agent = backtest_agent
PandasObject.agent_take_action = agent_take_action

# data fetcher
setattr(pd, 'fetch_yahoo', fetch_yahoo)
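# ---- usage sketch (illustrative; not shipped in the archive) ----------------
# Importing the package monkey-patches the methods above onto every DataFrame.
# The column names and the FeaturesAndLabels constructor arguments are
# assumptions for the sake of the example; features_and_Labels.py itself was
# not recovered from this archive.
import numpy as np
import pandas as pd
import pandas_ml_utils as pdu
from sklearn.linear_model import LogisticRegression

df = pd.DataFrame({"featureA": np.random.normal(size=100),
                   "labelA": np.random.randint(0, 2, size=100)})

# fit_classifier accepts any model provider; a Model instance works because
# calling it returns a deep copy of itself (see model/models.py)
fit = df.fit_classifier(pdu.SkitModel(LogisticRegression(),
                                      pdu.FeaturesAndLabels(features=["featureA"],
                                                            labels=["labelA"])))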
# ==== pandas_ml_utils/multi_model.py ====
import logging
from collections.abc import Iterable
from itertools import groupby
from typing import Tuple, Dict, Callable, Any, Union, List

import dill as pickle
import numpy as np
import pandas as pd

import pandas_ml_utils as pdu
import pandas_ml_utils.model.fit
from .utils import unfold_parameter_space

log = logging.getLogger(__name__)


# TODO we want multi model to somehow become a Model
class MultiModel(object):

    @staticmethod
    def load(filename: str):
        with open(filename, 'rb') as file:
            multi_model = pickle.load(file)
            if isinstance(multi_model, MultiModel):
                return multi_model
            else:
                raise ValueError("file provided was not a MultiModel")

    def __init__(self,
                 data_provider: Callable[[], pd.DataFrame],
                 data_engineer: Callable[[pd.DataFrame], pd.DataFrame],
                 model_provider: Callable[[], pdu.Model],
                 parameter_space: Dict[str, Iterable]):
        self.data_provider = data_provider
        self.data_engineer = data_engineer
        self.model_provider = model_provider
        self.parameter_space = unfold_parameter_space(parameter_space.copy(), {})
        self.min_needed_data: int = None
        self.data: pd.DataFrame = None
        self.fits: List[pandas_ml_utils.model.fit.Fit] = None
        self._heatmap_cache = None

    def save(self, filename: str):
        with open(filename, 'wb') as file:
            pickle.dump(self, file)

    # def fetch_data_and_fit(self, test_size: float = 0.4, test_validate_split_seed: int = None):
    #     self.fetch_data()
    #     self.fit()

    def fetch_data(self):
        self._heatmap_cache = None
        self.data = self.data_provider()

    def fit(self,
            test_size: float = 0.4,
            test_validate_split_seed: int = None,
            cross_validation: Tuple[int, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]] = None
            ) -> None:

        def model_fitter(**kwargs) -> pandas_ml_utils.model.fit.Fit:
            fit = self.data_engineer(self.data, **kwargs) \
                .fit_classifier(self.model_provider,
                                test_size=test_size,
                                cross_validation=cross_validation,
                                test_validate_split_seed=test_validate_split_seed)

            log.info(f'fit for { {**kwargs} }\n{fit.test_summary.confusion_count()}')
            return fit

        # TODO there should be a way to generate one ClassificationSummary out
        #  of several, by summing or averaging
        self.fits = [model_fitter(**kwargs) for kwargs in self.parameter_space]
        self.min_needed_data = max([fit.model.min_required_data for fit in self.fits])

    def predict(self) -> List[pd.DataFrame]:
        df = self.data[-self.min_needed_data:] if self.min_needed_data is not None else self.data

        def model_predictor(model, **kwargs) -> pd.DataFrame:
            prediction = self.data_engineer(df, **kwargs).classify(model)
            return prediction[-1:]

        predictions = [model_predictor(self.fits[i].model, **kwargs)
                       for i, kwargs in enumerate(self.parameter_space)]
        return predictions

    def plot_heatmap(self, parameter_as_column: str, figsize=(15, 12)):
        import matplotlib.pyplot as plt
        import seaborn as sns

        if self._heatmap_cache is None:
            self._heatmap_cache = self.compute_heatmap(parameter_as_column)

        fig = plt.figure(figsize=figsize)
        sns.heatmap(self._heatmap_cache, annot=True, cmap=plt.cm.Reds)
        return fig

    def compute_heatmap(self, parameter_as_column: str):
        predictions = self.predict()

        # to group all row indices per column index we first need to sort accordingly
        sorted_parameter_space = sorted(enumerate(self.parameter_space),
                                        key=lambda x: x[1][parameter_as_column])
        columns = {col: [value[0] for value in parameter]
                   for col, parameter in groupby(sorted_parameter_space,
                                                 lambda x: x[1][parameter_as_column])}

        # assemble one data frame per column
        predictions = [pd.concat([predictions[row][["target_target", "prediction_proba"]] for row in rows], axis=0, sort=True)
                           .set_index("target_target")
                           .groupby(level=0).max()
                           .rename(columns={"prediction_proba": column})
                       for column, rows in columns.items()]

        predictions = pd.concat(predictions, axis=1, sort=True).sort_index(ascending=False)
        return predictions
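# ---- usage sketch (illustrative; not shipped in the archive) ----------------
# A MultiModel fits one classifier per point of the unfolded parameter space.
# The data_engineer receives the raw frame plus one parameter combination and
# returns the engineered frame; `load_my_frame`, `my_model_provider` and the
# "price" column are hypothetical stand-ins.
mm = MultiModel(data_provider=lambda: load_my_frame(),
                data_engineer=lambda df, lag: df.assign(lagged=df["price"].shift(lag)),
                model_provider=my_model_provider,
                parameter_space={"lag": [1, 2, 3]})
mm.fetch_data()
mm.fit(test_size=0.3)
predictions = mm.predict()   # one single-row frame per parameter combination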
# ==== pandas_ml_utils/train_test_data.py ====
# NOTE: the imports and the first half of make_training_data did not survive
# the archive extraction; the signature below is inferred from the call sites
# in model/fitter.py, and the train/test split line is reconstructed around
# its surviving "else" branch.
import logging
import os
from functools import lru_cache
from time import perf_counter as pc
from typing import Callable

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sortedcontainers import SortedDict

from .utils import log_with_time
from .wrappers.hashable_dataframe import HashableDataFrame

log = logging.getLogger(__name__)


def make_training_data(df: pd.DataFrame,
                       features_and_labels: 'FeaturesAndLabels',
                       test_size: float = 0.4,
                       label_type=int,
                       seed: int = None,
                       cache: bool = False,
                       summary_printer: Callable[[np.ndarray, np.ndarray, np.ndarray], None] = None):
    # ... (not recovered: the feature/label assembly that defines start_pc,
    #      start_split_pc, df_new, x, y and min_required_data) ...

    # split into training and test data; with test_size == 0 everything
    # becomes training data (the sklearn call is an assumption)
    x_train, x_test, y_train, y_test, index_train, index_test = \
        train_test_split(x, y, df_new.index, test_size=test_size, random_state=seed) if test_size > 0 \
        else (x, None, y, None, df_new.index, None)

    log.info(f"  splitting ... done in {pc() - start_split_pc: .2f} sec!")

    # ravel one dimensional labels
    if len(features_and_labels.labels) == 1:
        y_train = y_train.ravel().astype(label_type)
        y_test = y_test.ravel().astype(label_type) if y_test is not None else None

    log.info(f"make training / test data split ... done in {pc() - start_pc: .2f} sec!")

    # print some statistics if needed
    if summary_printer is not None:
        summary_printer(y, y_train, y_test)

    # return the split
    # log.debug(f"{len(x_train)}, {len(x_test)}, {len(y_train)}, {len(y_test)}, "
    #           f"{len(index_train)}, {len(index_test)}, {min_required_data}")
    return x_train, x_test, y_train, y_test, index_train, index_test, min_required_data


def make_forecast_data(df: pd.DataFrame, features_and_labels: 'FeaturesAndLabels'):
    return _make_features(df[features_and_labels.features], features_and_labels)


@lru_cache(maxsize=int(os.getenv('CACHE_FEATUES_AND_LABELS', '1')))
def _make_features_with_cache(df: HashableDataFrame, features_and_labels: 'FeaturesAndLabels'):
    log.info(f"no cache entry available for {hash(df), hash(features_and_labels)}")
    return _make_features(df, features_and_labels)


def _make_features(df: pd.DataFrame, features_and_labels: 'FeaturesAndLabels'):
    start_pc = log_with_time(lambda: log.debug(" make features ..."))

    feature_lags = features_and_labels.feature_lags
    features = features_and_labels.features
    lag_smoothing = features_and_labels.lag_smoothing

    # drop nan's and copy frame
    df = df.dropna().copy()

    # generate feature matrix
    if feature_lags is not None:
        # return RNN shaped 3D arrays
        for feature in features:
            feature_series = df[feature]
            smoothers = None

            # smooth out the feature if requested
            if lag_smoothing is not None:
                smoothers = SortedDict({lag: smoother(feature_series.to_frame())
                                        for lag, smoother in lag_smoothing.items()})

            for lag in feature_lags:
                # if smoothed values are applicable use the smoothed values
                if smoothers is not None and len(smoothers) > 0 and smoothers.peekitem(0)[0] <= lag:
                    feature_series = smoothers.popitem(0)[1]

                # assign the lagged (and eventually smoothed) feature to the features frame
                df[f'{feature}_{lag}'] = feature_series.shift(lag)

        # drop all rows which now contain nan
        df = df.dropna()

        # RNN shape needs to be [row, time_step, feature]
        x = np.array([[[df.iloc[row][f'{feat}_{lag}'] for feat in features]
                       for lag in feature_lags]
                      for row in range(len(df))], ndmin=3)

        names = np.array([[f'{feat}_{lag}' for feat in features] for lag in feature_lags], ndmin=2)
    else:
        # return simple 2D arrays
        x = df[features].values
        names = features

    log.info(f" make features ... done in {pc() - start_pc: .2f} sec!")
    return df, x


def reshape_rnn_as_ar(arr3d):
    if len(arr3d.shape) < 3:
        print("Data was not in RNN shape")
        return arr3d
    else:
        return arr3d.reshape(arr3d.shape[0], arr3d.shape[1] * arr3d.shape[2])
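# ---- usage sketch (illustrative; not shipped in the archive) ----------------
# reshape_rnn_as_ar flattens the RNN-shaped [row, time_step, feature] tensor
# produced by _make_features into the 2D matrix a classic (autoregressive)
# sklearn model expects.
import numpy as np

x_rnn = np.arange(24).reshape(4, 3, 2)   # 4 rows, 3 lags, 2 features
x_ar = reshape_rnn_as_ar(x_rnn)
assert x_ar.shape == (4, 6)              # lags and features are concatenated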
# ==== pandas_ml_utils/utils.py ====
from time import perf_counter as pc
from typing import Callable, Dict, Iterable, Any, List

import numpy as np
from sklearn.model_selection import KFold
from sklearn.utils.validation import _num_samples, check_random_state, indexable


def log_with_time(log_statement: Callable[[], None]):
    log_statement()
    return pc()


def unfold_parameter_space(parameter_space: Dict[str, Iterable], parameters: Dict[str, Any]) -> List[Dict]:
    if len(parameter_space) > 0:
        # more parameters need to be unfolded
        parameter, space = parameter_space.popitem()
        return list(np.array([unfold_parameter_space(parameter_space.copy(), {**parameters, parameter: argument})
                              for argument in space]).flat)
    else:
        return parameters


class KFoldBoostRareEvents(KFold):

    def __init__(self, n_splits='warn', shuffle=False, random_state=None):
        super().__init__(n_splits, shuffle, random_state)

    def split(self, X, y=None, groups=None):
        n_samples = _num_samples(X)
        indices = np.arange(n_samples)
        rare_event_indices = indices[y >= 0.999]

        # append the rare events to every train and test fold
        for f, (train_idx, test_idx) in enumerate(super().split(X, y, groups)):
            yield np.hstack([train_idx, rare_event_indices]), np.hstack([test_idx, rare_event_indices])
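# ---- usage sketch (illustrative; not shipped in the archive) ----------------
# unfold_parameter_space expands a dict of iterables into the full cross
# product of parameter combinations, one dict per combination:
space = unfold_parameter_space({"lag": [1, 2], "window": [10, 20]}, {})
assert len(space) == 4
# e.g. [{'window': 10, 'lag': 1}, {'window': 10, 'lag': 2},
#       {'window': 20, 'lag': 1}, {'window': 20, 'lag': 2}]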
# ==== pandas_ml_utils/classification/classifier.py ====
# NOTE: the head of this module was lost in extraction; the imports and the
# fit_classifier signature are inferred from the regression and reinforcement
# counterparts, which are structured identically.
import logging
from typing import Callable, Tuple

import numpy as np
import pandas as pd

from .summary import ClassificationSummary
from ..model.fit import Fit
from ..model.fitter import _fit, _backtest, _predict
from ..model.models import Model

log = logging.getLogger(__name__)


def fit_classifier(df: pd.DataFrame,
                   model_provider: Callable[[int], Model],
                   test_size: float = 0.4,
                   cross_validation: Tuple[int, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]] = None,
                   cache_feature_matrix: bool = False,
                   test_validate_split_seed=42,
                   summary_printer: Callable[[np.ndarray, np.ndarray, np.ndarray], None] = None
                   ) -> Fit:
    model, train, test, index = _fit(df,
                                     model_provider,
                                     test_size=test_size,
                                     cross_validation=cross_validation,
                                     cache_feature_matrix=cache_feature_matrix,
                                     test_validate_split_seed=test_validate_split_seed,
                                     summary_printer=summary_printer)

    # assemble the result objects
    features_and_labels = model.features_and_labels
    cutoff = model[("probability_cutoff", 0.5)]
    loss = df[features_and_labels.loss_column] if features_and_labels.loss_column is not None else None

    training_classification = ClassificationSummary(train[1], model.predict(train[0]), index[0], loss, cutoff)
    test_classification = ClassificationSummary(test[1], model.predict(test[0]), index[1], loss, cutoff)
    return Fit(model, training_classification, test_classification)


def backtest_classifier(df: pd.DataFrame, model: Model) -> ClassificationSummary:
    x, y, y_hat, index = _backtest(df, model)

    features_and_labels = model.features_and_labels
    loss = df[features_and_labels.loss_column if features_and_labels.loss_column is not None else []]
    return ClassificationSummary(y, y_hat, index, loss, model[("probability_cutoff", 0.5)])


def classify(df: pd.DataFrame, model: Model, tail: int = None) -> pd.DataFrame:
    dff = _predict(df, model, tail)

    # return the raw probability plus the cut-off boolean decision
    dff["prediction_proba"] = dff["prediction"]
    dff["prediction"] = dff["prediction_proba"] > model[("probability_cutoff", 0.5)]
    return dff


# ==== pandas_ml_utils/classification/summary.py ====
# NOTE: the head of this module was also lost; the constructor is
# reconstructed around its surviving tail ("... 1 else y_prediction").
import logging
import sys
from typing import Tuple, Union

import numpy as np
import pandas as pd

from ..model.summary import Summary

log = logging.getLogger(__name__)


class ClassificationSummary(Summary):

    def __init__(self,
                 y_true: np.ndarray,
                 y_prediction: np.ndarray,
                 index: np.ndarray,
                 loss: pd.Series = None,
                 probability_cutoff: float = 0.5):
        self.y_true = y_true
        # flatten predictions that come as a multi-dimensional column vector
        # (reconstructed; the original condition ended in "... > 1 else y_prediction")
        self.y_prediction = y_prediction.ravel() if len(y_prediction.shape) > 1 else y_prediction
        self.index = index
        self.loss = loss
        self.probability_cutoff = probability_cutoff
        self.confusion_matrix = self._confusion_matrix_indices()

        # immediately log some fit quality measures
        ratios = self.get_ratios()
        log.info(f"FN Ratio = {ratios[0]}, FP Ratio = {ratios[1]}")

    def set_probability_cutoff(self, probability_cutoff: float = 0.5):
        self.probability_cutoff = probability_cutoff
        self.confusion_matrix = self._confusion_matrix_indices()

    def _confusion_matrix_indices(self):
        index = self.index
        truth = self.y_true
        pred = self.y_prediction
        co = self.probability_cutoff

        try:
            confusion = np.array([[index[(truth == True) & (pred > co)], index[(truth == False) & (pred > co)]],
                                  [index[(truth == True) & (pred <= co)], index[(truth == False) & (pred <= co)]]])

            if len(confusion[0, 0]) <= 0:
                log.warning("Very bad fit with 0 TP, which leads to problems in the plot")

            return confusion
        except Exception:
            print(f"shapes: y_true: {self.y_true.shape}, y_pred: {self.y_prediction.shape}, index: {self.index.shape}")
            print("Unexpected error:", sys.exc_info()[0])
            return None

    def get_ratios(self):
        cm = self.confusion_count()
        return cm[0, 0] / (cm[1, 0] + 1), cm[0, 0] / (cm[0, 1] + 1)

    def plot_backtest(self,
                      y: pd.Series = None,
                      size: Union[int, pd.Series] = None,
                      figsize: Tuple[int, int] = (16, 6)):
        # only import if required
        import seaborn as sns
        import matplotlib.pyplot as plt
        from pandas.plotting import register_matplotlib_converters

        # get rid of deprecation warning
        register_matplotlib_converters()

        # check value for back test
        if self.loss is None and y is None:
            raise ValueError("No loss column defined, neither in FeaturesAndLabels nor in plot_backtest")

        # scatter plot where the confusion squares are the colors and the loss is the size
        y = y if y is not None \
            else self.loss.loc[self.index] if isinstance(self.loss, pd.Series) \
            else self.loss[self.loss.columns[0]].loc[self.index]

        color = pd.Series(0, index=y.index)
        color.loc[self.confusion_matrix[0, 0]] = 1
        color.loc[self.confusion_matrix[1, 0]] = 2

        # get colors from: https://xkcd.com/color/rgb/
        fig, ax = plt.subplots(figsize=figsize)
        ax.set_ylim([y.min() * 1.1, 1])

        scatt = sns.scatterplot(x=y.index,
                                y=y,
                                ax=ax,
                                size=size if size is not None else y * -1,
                                hue=color,
                                palette=[sns.xkcd_rgb['white'], sns.xkcd_rgb['pale green'], sns.xkcd_rgb['cerise']])

        bar = sns.lineplot(x=y.index, y=self.y_prediction, ax=ax)
        plt.hlines(self.probability_cutoff, y.index.min(), y.index.max(), color=sns.xkcd_rgb['silver'])
        plt.close()
        return fig

    def confusion_loss(self):
        cm = self.confusion_matrix
        df = self.loss
        return np.array([[df.loc[cm[0, 0]].sum(), df.loc[cm[0, 1]].sum()],
                         [df.loc[cm[1, 0]].sum(), df.loc[cm[1, 1]].sum()]])

    def confusion_count(self):
        return np.array([
            [len(self.confusion_matrix[0, 0]), len(self.confusion_matrix[0, 1])],
            [len(self.confusion_matrix[1, 0]), len(self.confusion_matrix[1, 1])],
        ])

    def _repr_html_(self):
        return self._html_()._repr_html_()

    def _html_(self, width: str = '100%'):
        # only import if needed
        from vdom.helpers import div, p, img, table, tr, td, tbody, thead, th
        import matplotlib.pyplot as plt
        import base64
        import io

        if self.confusion_count()[0, 0] <= 0:
            return p('very bad fit with 0 TP!')

        image = None
        if self.loss is not None:
            with io.BytesIO() as f:
                fig = self.plot_backtest()
                fig.savefig(f, format="png", bbox_inches='tight')
                image = base64.encodebytes(f.getvalue()).decode("utf-8")
                plt.close(fig)

        cmc = self.confusion_count()
        cml = self.confusion_loss() if self.loss is not None else np.array([[0, 0], [0, 0]])

        return div(
            table(
                thead(
                    tr(
                        th("Classification Count", style={'text-align': 'left'}),
                        th("Classification Loss", style={'text-align': 'right'})
                    )
                ),
                tbody(
                    tr(
                        td(self._matrix_table(cmc)),
                        td(self._matrix_table(cml), style={'float': 'right'})
                    ),
                    tr(
                        td(
                            img(src=f'data:image/png;base64,{image}', style={'width': '100%'}) if image is not None else "",
                            colspan='2'
                        )
                    )
                ),
                style={'width': '100%'}
            ),
            style={'width': width}
        )

    def _matrix_table(self, mx: np.array):
        from vdom.helpers import table, tr, td, tbody, thead

        row_label = [[td("True")], [td("False")]]
        colors = [['green', 'orange'],
                  ['red', 'grey']]

        return table(
            thead(
                tr(
                    td("Prediction / Truth"),
                    td("True"),
                    td("False")
                )
            ),
            tbody(
                [tr(row_label[row] + [td(f'{mx[row, col]: .2f}', style={'color': colors[row][col]})
                                      for col in range(mx.shape[1])])
                 for row in range(mx.shape[0])]
            )
        )

    def __len__(self):
        return len(self.y_true)

    def __str__(self) -> str:
        return f'\n{len(self.confusion_matrix[0, 0])}\t{len(self.confusion_matrix[0, 1])}' \
               f'\n{len(self.confusion_matrix[1, 0])}\t{len(self.confusion_matrix[1, 1])}'
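# ---- usage sketch (illustrative; not shipped in the archive) ----------------
# A toy illustration of the confusion layout (assuming the reconstructed
# constructor above): rows are predicted True/False, columns are actual
# True/False, each cell holding the DataFrame indices falling into it.
import numpy as np

cs = ClassificationSummary(y_true=np.array([True, False, True, False]),
                           y_prediction=np.array([0.9, 0.8, 0.2, 0.1]),
                           index=np.arange(4),
                           loss=None,
                           probability_cutoff=0.5)
print(cs.confusion_count())   # [[1 1]   TP FP
                              #  [1 1]]  FN TN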
# ==== (module path lost in extraction: NumPy implementations of Keras-style ops) ====
# NOTE: the head of this module and the head of relu were truncated; relu is
# reconstructed from its surviving tail and matches the well-known Keras
# numpy-backend formulation.
import numpy as np
import scipy as sp
import scipy.special  # needed for sp.special.logsumexp


def relu(x, alpha=0., max_value=None, threshold=0.):
    if max_value is None:
        max_value = np.inf
    above_threshold = x * (x >= threshold)
    above_threshold = np.clip(above_threshold, 0.0, max_value)
    below_threshold = alpha * (x - threshold) * (x < threshold)
    return below_threshold + above_threshold


def softplus(x):
    return np.log(1. + np.exp(x))


def softsign(x):
    return x / (1 + np.abs(x))


def elu(x, alpha=1.):
    return x * (x > 0) + alpha * (np.exp(x) - 1.) * (x < 0)


def sigmoid(x):
    return 1. / (1. + np.exp(-x))


def hard_sigmoid(x):
    y = 0.2 * x + 0.5
    return np.clip(y, 0, 1)


def tanh(x):
    return np.tanh(x)


def softmax(x, axis=-1):
    y = np.exp(x - np.max(x, axis, keepdims=True))
    return y / np.sum(y, axis, keepdims=True)


def l2_normalize(x, axis=-1):
    y = np.max(np.sum(x ** 2, axis, keepdims=True), axis, keepdims=True)
    return x / np.sqrt(y)


def in_top_k(predictions, targets, k):
    top_k = np.argsort(-predictions)[:, :k]
    targets = targets.reshape(-1, 1)
    return np.any(targets == top_k, axis=-1)


def binary_crossentropy(target, output, from_logits=False):
    if not from_logits:
        output = np.clip(output, 1e-7, 1 - 1e-7)
        output = np.log(output / (1 - output))
    return (target * -np.log(sigmoid(output)) +
            (1 - target) * -np.log(1 - sigmoid(output)))


def categorical_crossentropy(target, output, from_logits=False):
    if from_logits:
        output = softmax(output)
    else:
        output /= output.sum(axis=-1, keepdims=True)
    output = np.clip(output, 1e-7, 1 - 1e-7)
    return np.sum(target * -np.log(output), axis=-1, keepdims=False)


# NOTE: the following reductions intentionally shadow the Python builtins,
# mirroring a backend-style API
def max(x, axis=None, keepdims=False):
    if isinstance(axis, list):
        axis = tuple(axis)
    return np.max(x, axis=axis, keepdims=keepdims)


def min(x, axis=None, keepdims=False):
    if isinstance(axis, list):
        axis = tuple(axis)
    return np.min(x, axis=axis, keepdims=keepdims)


def mean(x, axis=None, keepdims=False):
    if isinstance(axis, list):
        axis = tuple(axis)
    return np.mean(x, axis=axis, keepdims=keepdims)


def var(x, axis=None, keepdims=False):
    if isinstance(axis, list):
        axis = tuple(axis)
    return np.var(x, axis=axis, keepdims=keepdims)


def std(x, axis=None, keepdims=False):
    if isinstance(axis, list):
        axis = tuple(axis)
    return np.std(x, axis=axis, keepdims=keepdims)


def logsumexp(x, axis=None, keepdims=False):
    if isinstance(axis, list):
        axis = tuple(axis)
    return sp.special.logsumexp(x, axis=axis, keepdims=keepdims)


def sum(x, axis=None, keepdims=False):
    if isinstance(axis, list):
        axis = tuple(axis)
    return np.sum(x, axis=axis, keepdims=keepdims)


def prod(x, axis=None, keepdims=False):
    if isinstance(axis, list):
        axis = tuple(axis)
    return np.prod(x, axis=axis, keepdims=keepdims)
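# ---- usage sketch (illustrative; not shipped in the archive) ----------------
# quick numeric sanity checks of the helpers above
import numpy as np

assert np.isclose(sigmoid(0.0), 0.5)
assert np.allclose(softmax(np.array([[1.0, 2.0, 3.0]])).sum(axis=-1), 1.0)
assert np.isclose(relu(-3.0), 0.0) and np.isclose(relu(3.0), 3.0)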
# ==== pandas_ml_utils/model/fitter.py ====
# NOTE: the imports and the head of _fit were lost in extraction; the
# signature is inferred from its callers in classifier.py, regressor.py and
# agent.py (only the return annotation survived).
import logging
from time import perf_counter
from typing import Callable, Tuple

import numpy as np
import pandas as pd

from .models import Model
from ..train_test_data import make_training_data, make_forecast_data
from ..utils import log_with_time

log = logging.getLogger(__name__)


def _fit(df: pd.DataFrame,
         model_provider: Callable[[int], Model],
         test_size: float = 0.4,
         cross_validation: Tuple[int, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]] = None,
         cache_feature_matrix: bool = False,
         test_validate_split_seed=42,
         summary_printer: Callable[[np.ndarray, np.ndarray, np.ndarray], None] = None
         ) -> Tuple[Model, Tuple, Tuple, Tuple]:
    # get a new model
    model = model_provider()
    features_and_labels = model.features_and_labels

    # make training and test data sets
    x_train, x_test, y_train, y_test, index_train, index_test, min_required_data = \
        make_training_data(df,
                           features_and_labels,
                           test_size,
                           label_type=features_and_labels.label_type,
                           seed=test_validate_split_seed,
                           cache=cache_feature_matrix,
                           summary_printer=summary_printer)

    log.info(f"create model (min required data = {min_required_data})")
    model.min_required_data = min_required_data

    # fit the model
    start_performance_count = log_with_time(lambda: log.info("fit model"))
    if cross_validation is not None and isinstance(cross_validation, Tuple) and callable(cross_validation[1]):
        for fold_epoch in range(cross_validation[0]):
            # cross validation, make sure we re-shuffle every fold_epoch
            for f, (train_idx, test_idx) in enumerate(cross_validation[1](x_train, y_train)):
                log.info(f'fit fold {f}')
                model.fit(x_train[train_idx], y_train[train_idx],
                          x_train[test_idx], y_train[test_idx],
                          index_train[train_idx], index_train[test_idx])
    else:
        # fit without cross validation
        model.fit(x_train, y_train, x_test, y_test, index_train, index_test)

    log.info(f"fitting model done in {perf_counter() - start_performance_count: .2f} sec!")
    return model, (x_train, y_train), (x_test, y_test), (index_train, index_test)


def _backtest(df: pd.DataFrame, model: Model) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    features_and_labels = model.features_and_labels

    # make training data with a 0 test data fraction
    x, _, y, _, index, _, _ = make_training_data(df, features_and_labels, 0, int)

    # predict probabilities
    y_hat = model.predict(x)
    return x, y, y_hat, index


def _predict(df: pd.DataFrame, model: Model, tail: int = None) -> pd.DataFrame:
    features_and_labels = model.features_and_labels

    if tail is not None:
        if tail <= 0:
            raise ValueError("tail must be > 0 or None")
        elif model.min_required_data is not None:
            # just use the tail for feature engineering
            df = df[-(tail + (model.min_required_data - 1)):]
        else:
            log.warning("could not determine the minimum required data from the model")

    # then re-assign the data frame with features only
    dff, x = make_forecast_data(df, features_and_labels)

    # first save the target columns and the loss column
    if features_and_labels.target_columns is not None:
        dff = dff.join(df[features_and_labels.target_columns].add_prefix("target_"))

    if features_and_labels.loss_column is not None:
        dff["loss"] = df[features_and_labels.loss_column]

    prediction = model.predict(x)
    if len(prediction.shape) > 1 and prediction.shape[1] > 1:
        for i in range(prediction.shape[1]):
            dff[f"prediction_{model.features_and_labels.labels[i]}"] = prediction[:, i]
    else:
        dff["prediction"] = prediction

    return dff
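# ---- usage sketch (illustrative; not shipped in the archive) ----------------
# cross_validation is a (repetitions, splitter) tuple: _fit re-runs the
# splitter `repetitions` times over the training fold. A sketch using the
# KFoldBoostRareEvents splitter from utils.py:
from pandas_ml_utils.utils import KFoldBoostRareEvents

cv = (1, KFoldBoostRareEvents(n_splits=5, shuffle=True).split)
# df.fit_classifier(model_provider, cross_validation=cv)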
# ==== pandas_ml_utils/model/models.py ====
from __future__ import annotations

import logging
from copy import deepcopy
from typing import List, Callable, Tuple

import dill as pickle
import numpy as np

from .features_and_Labels import FeaturesAndLabels
from ..train_test_data import reshape_rnn_as_ar
from ..reinforcement.gym import RowWiseGym

log = logging.getLogger(__name__)


class Model(object):

    @staticmethod
    def load(filename: str):
        with open(filename, 'rb') as file:
            model = pickle.load(file)
            if isinstance(model, Model):
                return model
            else:
                raise ValueError("Deserialized pickle was not a Model!")

    def __init__(self, features_and_labels: FeaturesAndLabels, **kwargs):
        self.features_and_labels = features_and_labels
        self.min_required_data: int = None
        self.kwargs = kwargs

    def __getitem__(self, item):
        # a (key, default) tuple falls back to the default
        if isinstance(item, tuple) and len(item) == 2:
            return self.kwargs[item[0]] if item[0] in self.kwargs else item[1]
        else:
            return self.kwargs[item] if item in self.kwargs else None

    def save(self, filename: str):
        with open(filename, 'wb') as file:
            pickle.dump(self, file)

    def fit(self, x, y, x_val, y_val, df_index_train, df_index_test) -> None:
        pass

    def predict(self, x) -> np.ndarray:
        pass

    # this lets the model itself act as a model provider. However, we want to
    # use the same Model configuration for different datasets (i.e. as part of
    # a MultiModel)
    def __call__(self, *args, **kwargs):
        return deepcopy(self)


class SkitModel(Model):

    def __init__(self, skit_model, features_and_labels: FeaturesAndLabels, **kwargs):
        super().__init__(features_and_labels, **kwargs)
        self.skit_model = skit_model

    def fit(self, x, y, x_val, y_val, df_index_train, df_index_test):
        self.skit_model.fit(reshape_rnn_as_ar(x), y)

    def predict(self, x):
        if callable(getattr(self.skit_model, 'predict_proba', None)):
            return self.skit_model.predict_proba(reshape_rnn_as_ar(x))[:, 1]
        else:
            return self.skit_model.predict(reshape_rnn_as_ar(x))


class KerasModel(Model):
    from typing import TYPE_CHECKING
    if TYPE_CHECKING:
        from keras.models import Model as KModel

    def __init__(self,
                 keras_compiled_model_provider: Callable[[], KModel],
                 features_and_labels: FeaturesAndLabels,
                 callbacks: List[Callable],
                 **kwargs):
        super().__init__(features_and_labels, **kwargs)
        self.keras_model_provider = keras_compiled_model_provider
        self.keras_model = keras_compiled_model_provider()
        self.callbacks = callbacks
        self.history = None

    def fit(self, x, y, x_val, y_val, df_index_train, df_index_test) -> None:
        self.history = self.keras_model.fit(x, y, validation_data=(x_val, y_val), callbacks=self.callbacks)

    def predict(self, x) -> np.ndarray:
        return self.keras_model.predict(x)

    def __call__(self, *args, **kwargs):
        return KerasModel(self.keras_model_provider,
                          self.features_and_labels,
                          self.callbacks,
                          **self.kwargs)


# class MultiModel(Model):
#
#     def __init__(self, model_provider: Callable[[], Model], features_and_labels: FeaturesAndLabels):
#         super().__init__(features_and_labels)
#         self.model_provider = model_provider
#
#     def fit(self, x, y, x_val, y_val) -> None:
#         pass
#
#     def predict(self, x) -> np.ndarray:
#         # we would need to return a prediction for every and each parameters
#         # dict in the parameter space
#         pass


class OpenAiGymModel(Model):
    from typing import TYPE_CHECKING
    if TYPE_CHECKING:
        from rl.core import Agent

    def __init__(self,
                 agent_provider: Callable[[Tuple, int], Agent],
                 features_and_labels: FeaturesAndLabels,
                 action_reward_functions: List[Callable[[np.ndarray], float]],
                 reward_range: Tuple[int, int],
                 observation_range: Tuple[int, int] = None,
                 episodes: int = 1000,
                 **kwargs):
        super().__init__(features_and_labels, **kwargs)
        self.agent_provider = agent_provider
        self.action_reward_functions = action_reward_functions
        self.reward_range = reward_range
        self.observation_range = observation_range
        self.episodes = episodes
        self.agent = agent_provider(features_and_labels.shape()[0], len(action_reward_functions))

        # some history
        self.keras_train_history = None
        self.keras_test_history = None
        self.history = ()

    def fit(self, x, y, x_val, y_val, df_index_train, df_index_test):
        mm = (min([x.min(), x_val.min()]), max([x.max(), x_val.max()])) \
            if self.observation_range is None else self.observation_range

        training_gym = RowWiseGym((df_index_train, x, y), self.features_and_labels,
                                  self.action_reward_functions, mm, self.reward_range)
        test_gym = RowWiseGym((df_index_test, x_val, y_val), self.features_and_labels,
                              self.action_reward_functions, mm, self.reward_range)

        self.keras_train_history = self.agent.fit(training_gym, nb_steps=len(x) * self.episodes)
        # self.keras_test_history = self.agent.test(test_gym, nb_episodes=1)
        # clarification needed what test actually does: https://github.com/keras-rl/keras-rl/issues/342
        test_gym = self._forward_gym(test_gym)

        self.history = (training_gym.get_history(), test_gym.get_history())

    def back_test(self, index, x, y):
        mm = (x.min(), x.max()) if self.observation_range is None else self.observation_range
        gym = RowWiseGym((index, x, y), self.features_and_labels,
                         self.action_reward_functions, mm, self.reward_range)
        return self._forward_gym(gym).get_history()

    def predict(self, x):
        return [self.agent.forward(x[r]) for r in range(len(x))]

    def __call__(self, *args, **kwargs):
        return OpenAiGymModel(self.agent_provider,
                              self.features_and_labels,
                              self.action_reward_functions,
                              self.reward_range,
                              self.observation_range,
                              self.episodes,
                              **self.kwargs)

    def _forward_gym(self, gym):
        done = False
        state = gym.reset()
        while not done:
            state, reward, done, _ = gym.step(self.agent.forward(state))
        return gym
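# ---- usage sketch (illustrative; not shipped in the archive) ----------------
# Wrapping an sklearn estimator. The FeaturesAndLabels arguments are an
# assumption (its module was not recovered); note that a Model instance can be
# passed wherever a model provider is expected, because calling it returns a
# deep copy of itself.
from sklearn.neighbors import KNeighborsClassifier

provider = SkitModel(KNeighborsClassifier(),
                     FeaturesAndLabels(features=["featureA"], labels=["labelA"]))
fresh_copy = provider()   # Model.__call__ -> deepcopy(self)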
# ==== pandas_ml_utils/model/selection.py ====
import logging
from typing import List

import numpy as np
import pandas as pd

# from .features_and_Labels import FeaturesAndLabels

log = logging.getLogger(__name__)


def filtration(df: pd.DataFrame,
               label_columns: List[str],
               correlation_threshold: float = 0.5,
               minimum_features: int = 1,
               figsize=(12, 10)):
    correlation_mx = df.corr()
    log.info(correlation_mx)

    try:
        # only import if needed and only plot if the libraries are available
        import matplotlib.pyplot as plt
        import seaborn as sns

        fig = plt.figure(figsize=figsize)
        sns.heatmap(correlation_mx, annot=True, cmap=plt.cm.Reds)
    except ImportError:
        fig = None

    # select the features most correlated with the target values
    for label in label_columns:
        target_vec = abs(correlation_mx[label])
        features = target_vec[target_vec > correlation_threshold].drop(label_columns)
        print(f"\nfeatures with correlation > {correlation_threshold} to {label}")
        print(features)

        # then eliminate features with a high mutual correlation
        while len(features) > minimum_features and df[features.index].corr().max().values.max() > correlation_threshold:
            correlation_mx = df[features.index].corr().values
            index = np.unravel_index(correlation_mx.argmax(), correlation_mx.shape)
            features = features.drop(features.index[index[0]])

        print(f"\nfiltered features with correlation > {correlation_threshold} to {label}")
        print(features)

    return fig
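# ---- usage sketch (illustrative; not shipped in the archive) ----------------
# filtration is attached to DataFrame in __init__.py, so feature pre-selection
# reads as a method call; the frame and its column names are made up here.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.normal(size=(100, 3)), columns=["f1", "f2", "labelA"])
fig = df.filtration(label_columns=["labelA"], correlation_threshold=0.5)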
# ==== pandas_ml_utils/model/summary.py ====
class Summary(object):

    def _repr_html_(self):
        pass

    def _html_(self, width: str = '100%'):
        pass


# ==== pandas_ml_utils/regression/regressor.py ====
import logging
from typing import Callable, Tuple

import numpy as np
import pandas as pd

from .summary import RegressionSummary
from ..model.fit import Fit
from ..model.fitter import _fit, _backtest, _predict
from ..model.models import Model
from ..error.functions import mse as _mse

log = logging.getLogger(__name__)


def fit_regressor(df: pd.DataFrame,
                  model_provider: Callable[[int], Model],
                  test_size: float = 0.4,
                  cross_validation: Tuple[int, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]] = None,
                  cache_feature_matrix: bool = False,
                  test_validate_split_seed=42,
                  summary_printer: Callable[[np.ndarray, np.ndarray, np.ndarray], None] = None
                  ) -> Fit:
    model, train, test, index = _fit(df,
                                     model_provider,
                                     test_size=test_size,
                                     cross_validation=cross_validation,
                                     cache_feature_matrix=cache_feature_matrix,
                                     test_validate_split_seed=test_validate_split_seed,
                                     summary_printer=summary_printer)

    # assemble the result objects
    features_and_labels = model.features_and_labels
    loss = df[features_and_labels.loss_column] if features_and_labels.loss_column is not None else None

    training_summary = RegressionSummary(train[1], model.predict(train[0]), index[0], loss)
    test_summary = RegressionSummary(test[1], model.predict(test[0]), index[1], loss)
    return Fit(model, training_summary, test_summary)


def backtest_regressor(df: pd.DataFrame, model: Model) -> RegressionSummary:
    x, y, y_hat, index = _backtest(df, model)

    features_and_labels = model.features_and_labels
    loss = df[features_and_labels.loss_column if features_and_labels.loss_column is not None else []]
    return RegressionSummary(y, y_hat, index, loss)


def regress(df: pd.DataFrame, model: Model, tail: int = None) -> pd.DataFrame:
    dff = _predict(df, model, tail)

    # get the labels and calculate the error
    error_function = model[("error", _mse)]
    dff["error"] = error_function(df[model.features_and_labels.labels],
                                  dff[[col for col in dff if col.startswith('prediction')]])
    return dff


# ==== pandas_ml_utils/regression/summary.py ====
# NOTE: the head of this module was lost in extraction; the constructor is
# reconstructed to mirror ClassificationSummary (its surviving tail ended in
# "... 1 else y_prediction").
import numpy as np
import pandas as pd

from ..model.summary import Summary


class RegressionSummary(Summary):

    def __init__(self, y_true: np.ndarray, y_prediction: np.ndarray, index: np.ndarray, loss: pd.Series = None):
        self.y_true = y_true
        self.y_prediction = y_prediction.ravel() if len(y_prediction.shape) > 1 else y_prediction
        self.index = index
        self.loss = loss
        # TODO add some statistics


# ==== pandas_ml_utils/reinforcement/__init__.py ====
# (empty)


# ==== pandas_ml_utils/reinforcement/agent.py ====
import logging
from typing import Callable, Tuple

import numpy as np
import pandas as pd

from ..model.fit import Fit
from ..model.fitter import _fit, _predict
from ..model.models import Model
from ..train_test_data import make_training_data
from .summary import ReinforcementSummary

log = logging.getLogger(__name__)


def fit_agent(df: pd.DataFrame,
              model_provider: Callable[[int], Model],
              test_size: float = 0.4,
              cross_validation: Tuple[int, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]] = None,
              cache_feature_matrix: bool = False,
              test_validate_split_seed=42,
              summary_printer: Callable[[np.ndarray, np.ndarray, np.ndarray], None] = None
              ) -> Fit:
    model, train, test, index = _fit(df,
                                     model_provider,
                                     test_size=test_size,
                                     cross_validation=cross_validation,
                                     cache_feature_matrix=cache_feature_matrix,
                                     test_validate_split_seed=test_validate_split_seed,
                                     summary_printer=summary_printer)

    train_targets = df[model.features_and_labels.target_columns].loc[index[0]]
    test_targets = df[model.features_and_labels.target_columns].loc[index[1]]

    training_classification = ReinforcementSummary(train_targets, model.history[0])
    test_classification = ReinforcementSummary(test_targets, model.history[1])
    return Fit(model, training_classification, test_classification)


def backtest_agent(df: pd.DataFrame, model: Model) -> ReinforcementSummary:
    features_and_labels = model.features_and_labels

    # make training data with a 0 test data fraction
    x, _, y, _, index, _, _ = make_training_data(df, features_and_labels, 0, int)
    targets = df[model.features_and_labels.target_columns]

    back_test_history = model.back_test(index, x, y)
    return ReinforcementSummary(targets, back_test_history)


def agent_take_action(df: pd.DataFrame, model: Model, tail: int = None) -> pd.DataFrame:
    dff = _predict(df, model, tail)
    return dff
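# ---- usage sketch (illustrative; not shipped in the archive) ----------------
# regress() resolves its error function through model[("error", _mse)], so a
# custom metric can be injected via the model kwargs; `mae` here is a
# hypothetical stand-in for the default mean-squared-error.
import numpy as np

def mae(y_true, y_pred):
    return np.abs(np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)).mean()

# model = SkitModel(some_regressor, features_and_labels, error=mae)
# df.regress(model)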
# ==== pandas_ml_utils/reinforcement/gym.py ====
import gym
import numpy as np
import pandas as pd
from gym import spaces
from typing import Tuple, Callable, List

from ..model.features_and_Labels import FeaturesAndLabels

INIT_ACTION = -1


class RowWiseGym(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self,
                 environment: Tuple[np.ndarray, np.ndarray, np.ndarray],
                 features_and_labels: FeaturesAndLabels,
                 action_reward_functions: List[Callable[[np.ndarray], float]],
                 observation_range: Tuple[int, int],
                 reward_range: Tuple[int, int]):
        super().__init__()
        self.environment = environment
        self.reward_range = reward_range
        self.action_reward_functions = action_reward_functions

        # start at the beginning of the frame
        self.state = 0

        # define spaces
        self.action_space = spaces.Discrete(len(action_reward_functions))
        self.observation_space = spaces.Box(low=observation_range[0],
                                            high=observation_range[1],
                                            shape=features_and_labels.shape()[0],
                                            dtype=np.float16)

        # define history
        self.reward_history = []
        self.action_history = []

    def reset(self):
        # reset the state of the environment to an initial state and reset the history
        self.reward_history = []
        self.action_history = []
        return self.step(INIT_ACTION)[0]

    def step(self, action):
        # execute one time step within the environment
        if action != INIT_ACTION:
            reward = self.action_reward_functions[action](self.environment[2][self.state])
            self.reward_history.append(reward)
            self.action_history.append(action)
            self.state += 1
        else:
            reward = 0
            self.state = 0

        done = self.state >= len(self.environment[1])
        obs = self.environment[1][self.state if not done else 0]
        return obs, reward, done, {}

    def render(self, mode='human', close=False):
        print(f"reward: {sum(self.reward_history)}")

    def get_history(self):
        return pd.DataFrame({"reward_history": self.reward_history,
                             "action_history": self.action_history},
                            index=self.environment[0]).sort_index()


# ==== pandas_ml_utils/reinforcement/summary.py ====
import logging
import sys
from typing import Tuple, Union

import numpy as np
import pandas as pd

from ..model.summary import Summary

log = logging.getLogger(__name__)


class ReinforcementSummary(Summary):

    def __init__(self, target: pd.DataFrame, agent_history: pd.DataFrame):
        self.df = target.join(agent_history).sort_index()

    def get_data_frame(self):
        return self.df

    def _repr_html_(self):
        return self._html_()._repr_html_()

    def _html_(self, width: str = '100%'):
        # only import if needed
        from vdom.helpers import div, p, img, table, tr, td, tbody, thead, th
        import matplotlib.pyplot as plt
        import base64
        import io

        return div(p("TODO"))


# ==== pandas_ml_utils/wrappers/__init__.py ====
# (empty)


# ==== pandas_ml_utils/wrappers/hashable_dataframe.py ====
import pandas as pd


class HashableDataFrame(object):

    def __init__(self, df: pd.DataFrame) -> None:
        self.df: pd.DataFrame = df

    def __getitem__(self, item: str):
        return self.df.__getitem__(item)

    def __getattr__(self, item):
        return self.df.__getattr__(item)

    def __hash__(self):
        # the hash is derived from the frame's summary statistics
        return hash(str(self.describe()))

    def __eq__(self, other):
        try:
            pd.testing.assert_frame_equal(self.df, other.df)
            return True
        except Exception:
            return False


# ==== pandas_ml_utils/wrappers/lazy_dataframe.py ====
import uuid
import pandas as pd
from typing import Callable, Union


class LazyDataFrame(object):

    def __init__(self, df: pd.DataFrame, **kwargs: Callable[[pd.DataFrame], Union[pd.DataFrame, pd.Series]]) -> None:
        self.hash = uuid.uuid4()
        self.df: pd.DataFrame = df
        self.kwargs = kwargs

    def __getitem__(self, item: str):
        if isinstance(item, list):
            df = self.df[[value for value in item if value in self.df.columns]]

            for key in item:
                if key in self.kwargs:
                    res = self.kwargs[key](self.df)
                    if isinstance(res, pd.Series):
                        res.name = key
                        df = df.join(res)
                    elif isinstance(res, pd.DataFrame):
                        df = df.join(res.add_prefix(f'{key}_'))

            return df
        else:
            if item in self.df:
                return self.df[item]
            elif item in self.kwargs:
                return self.kwargs[item](self.df)
            else:
                raise ValueError(f"invalid item {item}")

    def __setitem__(self, key: str, value: Callable[[pd.DataFrame], Union[pd.DataFrame, pd.Series]]):
        self.hash = uuid.uuid4()
        if callable(value):
            # store the callable itself; it is evaluated lazily on access
            self.kwargs[key] = value
        else:
            self.df[key] = value

    def __getattr__(self, item):
        return self.to_dataframe().__getattr__(item)

    def __contains__(self, key):
        return key in self.df or key in self.kwargs

    def __hash__(self):
        return int(self.hash)

    def __eq__(self, other):
        return self.hash == other.hash if isinstance(other, LazyDataFrame) else False

    def to_dataframe(self):
        df = self.df.copy()

        for key, calculation in self.kwargs.items():
            column = calculation(df)

            if isinstance(column, pd.DataFrame):
                df = df.join(column.add_prefix(f'{key}_'))
            else:
                df[key] = column

        return df
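# ---- usage sketch (illustrative; not shipped in the archive) ----------------
# LazyDataFrame registers derived columns as callables and only evaluates them
# when the frame is materialized, so the same recipe can be re-applied cheaply
# as new rows arrive; the "price" column is made up here.
import pandas as pd

ldf = LazyDataFrame(pd.DataFrame({"price": [10.0, 11.0, 12.0]}),
                    sma_2=lambda df: df["price"].rolling(2).mean())
print(ldf.to_dataframe())   # contains "price" plus the lazily computed "sma_2"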
# ==== pandas_ml_utils-0.0.9.dist-info/LICENSE ====
# The MIT License (MIT)
#
# Copyright (c) 2019 KIC
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.