PK]6Ol$$pandas_ml_utils/__init__.py"""Augment pandas data frames with methods for machine learning""" __version__ = '0.0.3' from .pandas_extensions import * from .classifier import * from .lazy_dataframe import * from .fetch_yahoo import * from .data_objects import * from .classifier_models import * from .features_and_Labels import * from pandas.core.base import PandasObject # add functions to pandas PandasObject.hashable = hashable PandasObject.inner_join = inner_join PandasObject.drop_re = drop_re PandasObject.add_apply = add_apply PandasObject.shift_inplace = shift_inplace PandasObject.extend_forecast = extend_forecast PandasObject.make_training_data = make_training_data PandasObject.fit_classifier = fit_classifier PandasObject.classify = classify PandasObject.backtest = backtest setattr(pd, 'fetch_yahoo', fetch_yahoo) PKl5OEz'pandas_ml_utils/classification_plots.pyimport inspect import logging import sys from typing import List, Tuple, Callable, Iterable, Dict, Union from matplotlib import pyplot as plt import seaborn as sns import numpy as np import pandas as pd from .multi_model import MultiModel log = logging.getLogger(__name__) def plot_forecast_heatmap(df: pd.DataFrame, multi_model: MultiModel, parameter_as_column: str): # we need a data frame with the target values as row index and the forecast periods as columns # if we have one data frame (which is effectively one row) per "target" we just need to concatenete them # and join the same frames as new column for each forecast period passPKl5O+OT944pandas_ml_utils/classifier.pyimport logging from time import perf_counter from typing import Callable, Tuple import numpy as np import pandas as pd from sklearn.model_selection import KFold from pandas_ml_utils.train_test_data import make_training_data, make_forecast_data from pandas_ml_utils.utils import log_with_time from .classifier_models import Model from .data_objects import ClassificationSummary, Fit log = logging.getLogger(__name__) def fit_classifier(df: pd.DataFrame, model_provider: Callable[[int], Model], test_size: float = 0.4, number_of_cross_validation_splits: int = None, cache_feature_matrix: bool = False, test_validate_split_seed = 42, summary_printer: Callable[[np.ndarray, np.ndarray, np.ndarray], None] = None ) -> Tuple[Model, ClassificationSummary, ClassificationSummary]: # get a new model model = model_provider() features_and_labels = model.features_and_labels # make training and test data sets x_train, x_test, y_train, y_test, index_train, index_test, min_required_data, names = \ make_training_data(df, features_and_labels, test_size, int, test_validate_split_seed, cache=cache_feature_matrix, summary_printer=summary_printer) log.info(f"create model (min required data = {min_required_data}") model.min_required_data = min_required_data # fit the model start_performance_count = log_with_time(lambda: log.info("fit model")) if number_of_cross_validation_splits is not None: # cross validation cv = KFold(n_splits = number_of_cross_validation_splits) folds = cv.split(x_train, y_train) for f, (train_idx, test_idx) in enumerate(folds): log.info(f'fit fold {f}') model.fit(x_train[train_idx], y_train[train_idx], x_train[test_idx], y_train[test_idx]) else: # fit without cross validation model.fit(x_train, y_train, x_test, y_test) log.info(f"fitting model done in {perf_counter() - start_performance_count: .2f} sec!") # assemble the result objects pc = features_and_labels.probability_cutoff loss = df[features_and_labels.loss_column] if features_and_labels.loss_column is not None else None training_classification = ClassificationSummary(y_train, model.predict(x_train), index_train, loss, pc) test_classification = ClassificationSummary(y_test, model.predict(x_test), index_test, loss, pc) return Fit(model, training_classification, test_classification) def backtest(df: pd.DataFrame, model: Model) -> ClassificationSummary: features_and_labels = model.features_and_labels # make training and test data with no 0 test data fraction x, _, y, _, index, _, _, names = make_training_data(df, features_and_labels, 0, int) # predict probabilities y_hat = model.predict(x) loss = df[features_and_labels.loss_column if features_and_labels.loss_column is not None else []] return ClassificationSummary(y, y_hat, index, loss, features_and_labels.probability_cutoff) def classify(df: pd.DataFrame, model: Model, tail: int = None) -> pd.DataFrame: features_and_labels = model.features_and_labels if tail is not None: if tail <= 0: raise ValueError("tail must be > 0 or None") elif model.min_required_data is not None: # just use the tail for feature engineering df = df[-(tail + (model.min_required_data - 1)):] else: log.warning("could not determine the minimum required data from the model") # first save target columns target = df[features_and_labels.target_columns] if features_and_labels.target_columns is not None else None loss = df[features_and_labels.loss_column] if features_and_labels.loss_column is not None else None # then re assign data frame with features only dff, x, _ = make_forecast_data(df, features_and_labels) # predict on features prediction = model.predict(x) pc = features_and_labels.probability_cutoff # return result dff["prediction"] = prediction > pc dff["prediction_proba"] = prediction dff["target"] = target dff["loss"] = loss return dff PK6L6O~-@$pandas_ml_utils/classifier_models.pyimport logging import dill as pickle import numpy as np from copy import deepcopy from typing import Callable from .train_test_data import reshape_rnn_as_ar from .features_and_Labels import FeaturesAndLabels log = logging.getLogger(__name__) class Model(object): @staticmethod def load(filename: str): with open(filename, 'rb') as file: model = pickle.load(file) if isinstance(model, Model): return model else: raise ValueError("Deserialized pickle was not a Model!") def __init__(self, features_and_labels: FeaturesAndLabels): self.features_and_labels = features_and_labels self.min_required_data: int = None def save(self, filename: str): with open(filename, 'wb') as file: pickle.dump(self, file) def fit(self, x, y, x_val, y_val) -> None: pass def predict(self, x) -> np.ndarray: pass # this lets the model itself act as a provider. However we want to use the same Model configuration # for different datasets (i.e. as part of MultiModel) def __call__(self, *args, **kwargs): return deepcopy(self) class SkitModel(Model): def __init__(self, skit_model, features_and_labels: FeaturesAndLabels): super().__init__(features_and_labels) self.skit_model = skit_model def fit(self, x, y, x_val, y_val): self.skit_model.fit(reshape_rnn_as_ar(x), y), def predict(self, x): return self.skit_model.predict_proba(reshape_rnn_as_ar(x))[:, 1] # class MultiModel(Model): # # def __init__(self, model_provider: Callable[[], Model], features_and_labels: FeaturesAndLabels): # super().__init__(features_and_labels) # self.model_provider = model_provider # # def fit(self, x, y, x_val, y_val) -> None: # pass # # def predict(self, x) -> np.ndarray: # # we would need to return a prediction for every and each parameters dict in the parameter space # pass PK]6O@3"3"pandas_ml_utils/data_objects.pyimport inspect import logging import sys from typing import List, Tuple, Callable, Iterable, Dict, Union import numpy as np import pandas as pd from .classifier_models import Model log = logging.getLogger(__name__) class ClassificationSummary(object): def __init__(self, y_true: np.ndarray, y_prediction: np.ndarray, index: np.ndarray, loss: pd.Series = None, probability_cutoff: float = 0.5): self.y_true = y_true self.y_prediction = y_prediction.ravel() if len(y_prediction.shape) > 1 else y_prediction self.index = index self.loss = loss self.probability_cutoff = probability_cutoff self.confusion_matrix = self._confusion_matrix_indices() # immediately log some fit quality measures ratios = self.get_ratios() log.info(f"FN Ratio = {ratios[0]}, FP Ratio = {ratios[1]}") def set_probability_cutoff(self, probability_cutoff: float = 0.5): self.probability_cutoff = probability_cutoff self.confusion_matrix = self._confusion_matrix_indices() def _confusion_matrix_indices(self): index = self.index truth = self.y_true pred = self.y_prediction co = self.probability_cutoff try: confusion = np.array([[index[(truth == True) & (pred > co)], index[(truth == False) & (pred > co)]], [index[(truth == True) & (pred <= co)], index[(truth == False) & (pred <= co)]]]) if len(confusion[0, 0]) <= 0: log.warning("Very bad fit with 0 TP, which leads to problems in the plot") return confusion except: print(f"shapes: y_true: {self.y_true.shape}, y_pred: {self.y_prediction.shape}, index: {self.index.shape}") print("Unexpected error:", sys.exc_info()[0]) return None def get_ratios(self): cm = self.confusion_count() return cm[0,0] / (cm[1,0] + 1), cm[0,0] / (cm[0,1] + 1) def plot_backtest(self, y: pd.Series = None, size: Union[int, pd.Series] = None, figsize: Tuple[int, int] = (16, 6)): # only import if required import seaborn as sns import matplotlib.pyplot as plt from pandas.plotting import register_matplotlib_converters # get rid of deprecation warning register_matplotlib_converters() # check value for back test if self.loss is None and y is None: raise ValueError("No loss column defined, whether in FeaturesAndLabels nor in plot_backtest") # scatter plot where confusion squares are the colors, the loss is the size y = y if y is not None \ else self.loss.loc[self.index] if isinstance(self.loss, pd.Series) \ else self.loss[self.loss.columns[0]].loc[self.index] color = pd.Series(0, index=y.index) color.loc[self.confusion_matrix[0, 0]] = 1 color.loc[self.confusion_matrix[1, 0]] = 2 # get colors from: https://xkcd.com/color/rgb/ fig, ax = plt.subplots(figsize=figsize) ax.set_ylim([y.min() * 1.1, 1]) scatt = sns.scatterplot(x=y.index, y=y, ax=ax, size=size if size is not None else y * -1, hue=color, palette=[sns.xkcd_rgb['white'], sns.xkcd_rgb['pale green'], sns.xkcd_rgb['cerise']]) bar = sns.lineplot(x=y.index, y=self.y_prediction, ax=ax) plt.hlines(self.probability_cutoff, y.index.min(), y.index.max(), color=sns.xkcd_rgb['silver']) plt.close() return fig def confusion_loss(self): cm = self.confusion_matrix df = self.loss return np.array([[df.loc[cm[0, 0]].sum(), df.loc[cm[0, 1]].sum()], [df.loc[cm[1, 0]].sum(), df.loc[cm[1, 1]].sum()]]) def confusion_count(self): return np.array([ [len(self.confusion_matrix[0, 0]), len(self.confusion_matrix[0, 1])], [len(self.confusion_matrix[1, 0]), len(self.confusion_matrix[1, 1])], ]) def _repr_html_(self): return self._html_()._repr_html_() def _html_(self, width: str = '100%'): # only import it needed from vdom.helpers import div, p, img, table, tr, td, tbody, thead, th import matplotlib.pyplot as plt import base64 import io if self.confusion_count()[0, 0] <= 0: return p('very bad fit with 0 TP!') image = None if self.loss is not None: with io.BytesIO() as f: fig = self.plot_backtest() fig.savefig(f, format="png", bbox_inches='tight') image = base64.encodebytes(f.getvalue()).decode("utf-8") plt.close(fig) cmc = self.confusion_count() cml = self.confusion_loss() if self.loss is not None else np.array([[0, 0], [0, 0]]) return div( table( thead( tr( th("Classification Count", style={'text-align': 'left'}), th("Classification Loss", style={'text-align': 'right'}) ) ), tbody( tr( td(self._matrix_table(cmc)), td(self._matrix_table(cml), style={'float': 'right'}) ), tr( td( img(src=f'data:image/png;base64,{image}', style={'width': '100%'}) if image is not None else "", colspan='2' ) ) ), style={'width': '100%'} ), style={'width': width} ) def _matrix_table(self, mx: np.array): from vdom.helpers import table, tr, td, tbody, thead row_label = [[td("True")], [td("False")]] colors = [['green', 'orange'], ['red', 'grey']] return table( thead( tr( td("Prediction / Truth"), td("True"), td("False") ) ), tbody( [tr( row_label[row] + [td( f'{mx[row, col]: .2f}', style={'color': colors[row][col]}) for col in range(mx.shape[1])]) for row in range(mx.shape[0])] ) ) def __len__(self): return len(self.y_true) def __str__(self) -> str: return f'\n{len(self.confusion_matrix[0,0])}\t{len(self.confusion_matrix[0,1])}' \ f'\n{len(self.confusion_matrix[1,0])}\t{len(self.confusion_matrix[1,1])}' class Fit(object): def __init__(self, model: Model, training_classification: ClassificationSummary, test_classification: ClassificationSummary): self.model = model self.training_classification = training_classification self.test_classification = test_classification def set_probability_cutoff(self, probability_cutoff: float = 0.5): self.training_classification.set_probability_cutoff(probability_cutoff) self.test_classification.set_probability_cutoff(probability_cutoff) def values(self): return self.model, self.training_classification, self.test_classification def _repr_html_(self): return self._html_()._repr_html_() def _html_(self): # only import it needed from vdom.helpers import div, table, tr, td, tbody, thead, th model = self.model.__repr__() if model is None: model = str(self.model) return div( table( thead( tr( th("Training Data", style={'text-align': 'left'}), th("Test Data", style={'text-align': 'right'}) ) ), tbody( tr( td(self.training_classification._html_()), td(self.test_classification._html_()) ), tr( td( model, colspan="2" ) ) ), style={'width': '100%'} ), style={'width': '100%', 'float': 'left'} ) PKl5O`,,&pandas_ml_utils/features_and_Labels.pyimport inspect import logging from typing import List, Callable, Iterable, Dict import pandas as pd log = logging.getLogger(__name__) class FeaturesAndLabels(object): def __init__(self, features: List[str], labels: List[str], target_columns: List[str] = None, loss_column: str = None, feature_lags: Iterable[int] = None, lag_smoothing: Dict[int, Callable[[pd.Series], pd.Series]] = None, probability_cutoff: float = 0.5): self.features = features self.labels = labels self.target_columns = target_columns self.loss_column = loss_column self.feature_lags = feature_lags self.lag_smoothing = lag_smoothing self.probability_cutoff = probability_cutoff self.len_feature_lags = sum(1 for _ in feature_lags) if feature_lags is not None else 1 self.expanded_feature_length = len(features) * self.len_feature_lags if feature_lags is not None else len(features) log.info(f'number of features, lags and total: {self.len_features()}') def len_features(self): return len(self.features), self.len_feature_lags, self.expanded_feature_length def len_labels(self): return len(self.labels) def __repr__(self): return f'FeaturesAndLabels({self.features},{self.labels},{self.target_columns},{self.loss_column},' \ f'{self.feature_lags},{self.lag_smoothing},{self.probability_cutoff}) #{len(self.features)} ' \ f'features expand to {self.expanded_feature_length}' def __hash__(self): return hash(self.__id__()) def __eq__(self, other): return self.__id__() == other.__id__() def __id__(self): import dill # only import if really needed smoothers = {feature: inspect.getsource(smoother) for feature, smoother in self.lag_smoothing.items()} return f'{self.features},{self.labels},{dill.dumps(self.feature_lags)},{smoothers}' def __str__(self): return self.__repr__() PKl5Opandas_ml_utils/fetch_yahoo.pyimport logging import traceback import pandas as pd import cachetools.func def inner_join(df, join: pd.DataFrame, prefix: str = ''): return pd.merge(df, join.add_prefix(prefix), left_index=True, right_index=True, how='inner', sort=True) @cachetools.func.ttl_cache(maxsize=1, ttl=10 * 60) def fetch_yahoo(period='max', **kwargs: str): import yfinance as yf df = None for k, v in kwargs.items(): px = f'{k}_' df_ = None # bloody skew index does not have any data on yahoo if v == '^SKEW': df_ = pd.read_csv('http://www.cboe.com/publish/scheduledtask/mktdata/datahouse/skewdailyprices.csv', skiprows=1, parse_dates=True, index_col='Date') \ .drop(['Unnamed: 2', 'Unnamed: 3'], axis=1) else: ticker = yf.Ticker(v) try: # first ty to append the most recent data df_ = ticker.history(period="1d", interval='1d')[-1:].combine_first(ticker.history(period=period)) except: traceback.print_exc() logging.warning('failed to add yf.Ticker({v}).history(period="1d", interval="1d")[-1:] fallback to hist only!') df_ = ticker.history(period=period) # print some statistics logging.info(f'number of rows for {k} = {len(df_)}, from {df_.index[0]} to {df_.index[-1]} period={period}') if df is None: df = df_.add_prefix(px) else: df = df.inner_join(df_, prefix=px) # print some statistics logging.info(f'number of rows for joined dataframe = {len(df)}, from {df.index[0]} to {df.index[-1]}') return df PKl5Ox!pandas_ml_utils/lazy_dataframe.pyimport uuid import pandas as pd from typing import Callable, Union class LazyDataFrame(object): def __init__(self, df: pd.DataFrame, **kwargs: Callable[[pd.DataFrame], Union[pd.DataFrame, pd.Series]]) -> None: self.hash = uuid.uuid4() self.df: pd.DataFrame = df self.kwargs = kwargs def __getitem__(self, item: str): if isinstance(item, list): df = self.df[[value for value in item if value in self.df.columns]] for key in item: if key in self.kwargs: res = self.kwargs[key](self.df) if isinstance(res, pd.Series): res.name = key df = df.join(res) elif isinstance(res, pd.DataFrame): df = df.join(res.add_prefix(f'{key}_')) return df else: if item in self.df: return self.df[item] elif item in self.kwargs: return self.kwargs[item](self.df) else: raise ValueError(f"invalid item {item}") def __setitem__(self, key: str, value: Callable[[pd.DataFrame], Union[pd.DataFrame, pd.Series]]): self.hash = uuid.uuid4() if callable(value): self.kwargs[key] = value(self.df) else: self.df[key] = value def __getattr__(self, item): return self.to_dataframe().__getattr__(item) def __contains__(self, key): return key in self.df or key in self.kwargs def __hash__(self): return int(self.hash) def __eq__(self, other): return self.hash == other.hash if isinstance(other, LazyDataFrame) else False def to_dataframe(self): df = self.df.copy() for key, calculation in self.kwargs.items(): column = calculation(df) if isinstance(column, pd.DataFrame): df = df.join(column.add_prefix(f'{key}_')) else: df[key] = column return dfPKl5OTpandas_ml_utils/multi_model.pyimport logging from collections.abc import Iterable from itertools import groupby from typing import Tuple, Dict, Callable, Any, Union, List from .utils import unfold_parameter_space import pandas_ml_utils as pdu import dill as pickle import pandas as pd import numpy as np log = logging.getLogger(__name__) class MultiModel(object): @staticmethod def load(filename: str): with open(filename, 'rb') as file: multi_model = pickle.load(file) if isinstance(multi_model, MultiModel): return multi_model else: raise ValueError("file provided was not a MultiModel") def __init__(self, data_provider: Callable[[], pd.DataFrame], data_engineer: Callable[[pd.DataFrame], pd.DataFrame], model_provider: Callable[[], pdu.Model], parameter_space: Dict[str, Iterable]): self.data_provider = data_provider self.data_engineer = data_engineer self.model_provider = model_provider self.parameter_space = unfold_parameter_space(parameter_space.copy(), {}) self.min_needed_data: int = None self.data: pd.DataFrame = None self.fits: List[pdu.Fit] = None def save(self, filename: str): with open(filename, 'wb') as file: pickle.dump(self, file) # def fetch_data_and_fit(self, test_size: float = 0.4, test_validate_split_seed: int=None): # self.fetch_data() # self.fit() def fetch_data(self): self.data = self.data_provider() def fit(self, test_size: float = 0.4, test_validate_split_seed: int = None) -> None: def model_fitter(**kwargs) -> pdu.Fit: fit = self.data_engineer(self.data, **kwargs) \ .fit_classifier(self.model_provider, test_size=test_size, test_validate_split_seed=test_validate_split_seed) log.info(f'fit for { {**kwargs}}\n{fit.training_classification.confusion_count()}\n{fit.test_classification.confusion_count()}') return fit # TODO there should be a way to generate one ClassificationSummary out of several by summing or averaging self.fits = [model_fitter(**kwargs) for kwargs in self.parameter_space] self.min_needed_data = max([fit.model.min_required_data for fit in self.fits]) def predict(self) -> pd.DataFrame: df = self.data[-self.min_needed_data:] if self.min_needed_data is not None else self.data def model_predictor(model, **kwargs) -> pd.DataFrame: prediction = self.data_engineer(df, **kwargs) \ .classify(model) return prediction[-1:] predictions = [model_predictor(self.fits[i].model, **kwargs) for i, kwargs in enumerate(self.parameter_space)] return predictions def plot_heatmap(self, parameter_as_column: str): import seaborn as sns sns.heatmap(self.compute_heatmap(parameter_as_column)) def compute_heatmap(self, parameter_as_column: str): predictions = self.predict() # to group all ro indices per column index we first need to sort accordingly sorted_parameter_space = sorted(enumerate(self.parameter_space), key=lambda x: x[1][parameter_as_column]) columns = {col: [value[0] for value in parameter] for col, parameter in groupby(sorted_parameter_space, lambda x: x[1][parameter_as_column])} # assign a data frame for each column predictions = [pd.concat([predictions[row][["target", "prediction_proba"]] for row in rows], axis=0, sort=True) \ .set_index("target") \ .groupby(level=0).max() \ .rename(columns={"prediction_proba": column}) for column, rows in columns.items()] predictions = pd.concat(predictions, axis=1, sort=True).sort_index(ascending=False) return predictions PKl5O3Q7$pandas_ml_utils/pandas_extensions.pyimport logging import re from typing import Union, Callable import pandas as pd from wrappers.hashable_dataframe import HashableDataFrame log = logging.getLogger(__name__) def hashable(df): return HashableDataFrame(df) def add_apply(df, **kwargs: Callable[[pd.DataFrame], Union[pd.Series, pd.DataFrame]]): df2 = pd.DataFrame() for k, v in kwargs.items(): df2[k] = v(df) return df.join(df2) def shift_inplace(df, **kwargs: int): for k, v in kwargs.items(): df[k] = df[k].shift(v) return df def drop_re(df, *args: str): drop_list = [] for regex in args: drop_list.extend(list(filter(re.compile(regex).match, df.columns))) return df.drop(drop_list, axis=1) def extend_forecast(df, periods: int): df_ext = pd.DataFrame(index=pd.date_range(df.index[-1], periods=periods+1, closed='right')) return pd.concat([df, df_ext], axis=0, sort=True) PKl5OtEuupandas_ml_utils/plots.pyimport inspect import logging import sys from typing import List, Tuple, Callable, Iterable, Dict, Union from matplotlib import pyplot as plt import seaborn as sns import numpy as np import pandas as pd from .classifier_models import Model log = logging.getLogger(__name__) def plot_forecast_heatmap(df: pd.DataFrame, model: Model): passPKl5O J"pandas_ml_utils/train_test_data.pyimport os from functools import lru_cache import numpy as np import pandas as pd import logging from time import perf_counter as pc from sortedcontainers import SortedDict from typing import Type, Iterable, List, Callable, Dict from wrappers.hashable_dataframe import HashableDataFrame from pandas_ml_utils.utils import log_with_time log = logging.getLogger(__name__) def make_backtest_data(df: pd.DataFrame, features_and_labels: 'FeaturesAndLabels', label_type: Type = int): return make_training_data(df, features_and_labels, 0, label_type) def make_training_data(df: pd.DataFrame, features_and_labels: 'FeaturesAndLabels', test_size: float = 0.4, label_type: Type = int, seed: int = 42, cache: bool = False, summary_printer: Callable[[np.ndarray, np.ndarray, np.ndarray], None] = None): # only import if this method is needed from sklearn.model_selection import train_test_split # use only feature and label columns start_pc = log_with_time(lambda: log.debug("make training / test data split ...")) df = df[set(features_and_labels.features + features_and_labels.labels)] # create features and re-assign data frame with all nan rows dropped df_new, x, names = _make_features_with_cache(HashableDataFrame(df), features_and_labels) if cache else \ _make_features(df, features_and_labels) # calculate the minimum required data min_required_data = len(df) - len(df_new) + 1 # assign labels y = df_new[features_and_labels.labels].values # split training and test data start_split_pc = log_with_time(lambda: log.debug(" splitting ...")) x_train, x_test, y_train, y_test, index_train, index_test = \ train_test_split(x, y, df_new.index, test_size=test_size, random_state=seed) if test_size > 0 \ else (x, None, y, None, df_new.index, None) log.info(f" splitting ... done in {pc() - start_split_pc: .2f} sec!") # ravel one dimensional labels if len(features_and_labels.labels) == 1: y_train = y_train.ravel().astype(label_type) y_test = y_test.ravel().astype(label_type) if y_test is not None else None log.info(f"make training / test data split ... done in {pc() - start_pc: .2f} sec!") # print some statistics if needed if summary_printer is not None: summary_printer(y, y_train, y_test) # return the split return x_train, x_test, y_train, y_test, index_train, index_test, min_required_data, (names, features_and_labels.labels) def make_forecast_data(df: pd.DataFrame, features_and_labels: 'FeaturesAndLabels'): return _make_features(df[features_and_labels.features], features_and_labels) @lru_cache(maxsize=int(os.getenv('CACHE_FEATUES_AND_LABELS', '1'))) def _make_features_with_cache(df: HashableDataFrame, features_and_labels: 'FeaturesAndLabels'): log.info(f"no cache entry available for {hash(df), hash(features_and_labels)}") return _make_features(df, features_and_labels) def _make_features(df: pd.DataFrame, features_and_labels: 'FeaturesAndLabels'): start_pc = log_with_time(lambda: log.debug(" make features ...")) feature_lags = features_and_labels.feature_lags features = features_and_labels.features lag_smoothing = features_and_labels.lag_smoothing # drop nan's and copy frame df = df.dropna().copy() # generate feature matrix if feature_lags is not None: # return RNN shaped 3D arrays for feature in features: feature_series = df[feature] smoothers = None # smooth out feature if requested if lag_smoothing is not None: smoothers = SortedDict({lag: smoother(feature_series.to_frame()) for lag, smoother in lag_smoothing.items()}) for lag in feature_lags: # if smoothed values are applicable use smoothed values if smoothers is not None and len(smoothers) > 0 and smoothers.peekitem(0)[0] <= lag: feature_series = smoothers.popitem(0)[1] # assign the lagged (eventually smoothed) feature to the features frame df[f'{feature}_{lag}'] = feature_series.shift(lag) # drop all rows which got nan now df = df.dropna() # RNN shape need to be [row, time_step, feature] x = np.array([[[df.iloc[row][f'{feat}_{lag}'] for feat in features] for lag in feature_lags] for row in range(len(df))], ndmin=3) names = np.array([[f'{feat}_{lag}' for feat in features] for lag in feature_lags], ndmin=2) else: # return simple 2D arrays x = df[features].values names = features log.info(f" make features ... done in {pc() - start_pc: .2f} sec!") return df, x, names def reshape_rnn_as_ar(arr3d): if len(arr3d.shape) < 3: print("Data was not in RNN shape") return arr3d else: return arr3d.reshape(arr3d.shape[0], arr3d.shape[1] * arr3d.shape[2]) PKl5OLIpandas_ml_utils/utils.pyfrom time import perf_counter as pc from typing import Callable, Dict, Iterable, Any, List import numpy as np def log_with_time(log_statement: Callable[[], None]): log_statement() return pc() def unfold_parameter_space(parameter_space: Dict[str, Iterable], parameters: Dict[str, Any]) -> List[Dict]: if len(parameter_space) > 0: # more parameters need to be unfolded parameter, space = parameter_space.popitem() return list(np.array([unfold_parameter_space(parameter_space.copy(), {**parameters, parameter: argument}) for argument in space]).flat) else: return parameters PKH5O ..'pandas_ml_utils-0.0.3.dist-info/LICENSEThe MIT License (MIT) Copyright (c) 2019 KIC Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. PK!HPO%pandas_ml_utils-0.0.3.dist-info/WHEEL HM K-*ϳR03rOK-J,/RH,szd&Y)r$[)T&UrPK!H+ (pandas_ml_utils-0.0.3.dist-info/METADATAYmS#>c*M၅K/ajj2234o*=OK3dUHV_|N‰Xo3QULyv*тd7ލ˺(蠞t =1;4S.BL&P!Lr)LiA_)˜`ggE8NtN`;*QT%dz_΢̀@[Pe:2$1)NÐ?EG&FU'Dy}w;8mget kDI؈B_AU}'S~xrN,9w2וWإLG}0WxHYhMJu6cq2MpUD*&x<<@1]@rIr|a& &:{bڈŷRb+ \!a X5M7r~ޠs!ٻ DmJ%@CWc (ES[1J/ k'GcٺQ!`Vsĸ0B2oל60@> M'[gvljcΑEi1^9*AzX;aJE7ta,ЯƗ,lw]ʮ9 >Ǔ܆ߏ{ێU?})hvy[ڈTռGxe⽒F!qIP^^)6g | EU4a8)X5ׯ=?o(>ؼNYUwz-uM43ʌ=j2d@2e8D<8&GQӔqD/JrGR/2ƱtRZXIVx3?eRTyJr &N81C1<5I( KP!pb,C qUN{CP$jkY,Yi\Ȓ &!Τf듙'ֹ04yN%^(cNLW枓:/gY G uXxl t (ۡkQtI6L뮭ˋ_7Cart%F~XW\rVSwW'igm}h~3vJϧz1EBaT!x dد,2B rrQG&tH :q-8*ҚLTZq4AլOggtJF'Η=j qυ#s[sA3*(DԺ 9r"8Yb:e4o'$uHEJi(mT~Ė\ނr>:}~\MjwqY%j v.0Q B|fvF,Ȕ.# M2aA$dgo ~7\m0SF9}ܫ[ƩyUwL7ѶY]6 <ƾT$-e9 ־hwU(h(n: TF0(RnMz,Ƞ/lڍ<̽rh۴MvWDw8G3WeN[G-( Ao5(ﴛ(R'|_!ө8]%lN HR]MM̉;aI+ ,<>xMWuC1(ٖf ~m>{AX7_=ʻCz9;:_Kр|$_gCPxe74$2褐R墚A{Z_7/!E[=f KC hJoz1?4>;Ad]OV!%t]*\8-lЩ:_X1?Al_}8Ò,QHF9 G ]pct%O\r`"]p%.svZa\|nP >`iV ct[¶c| mެs`E>W\r V-գz)ێuB8;wyGP h#렶aa}h|S ybC/sKʱs0yG+Ds/~iV֕8=D^xNsf!/ dY!EkiUy4bFPK!HtPHD&pandas_ml_utils-0.0.3.dist-info/RECORD˒8}:,da6`6*0`dn'3rd*iԑW NP B"TTC譙8cQQtZj p>[֬mCN2&ZĘuR3x/N֨5hIeM M˘{OHi|=hLyOUUr29{$8Yiu0#9licaS [N'aT_niS!v7{/6nBt{S:X^s? O9]Sbw)EcFy3YH\J ZGU냃?X/xY 7JocqG{kЄ>KW‡:} jLx ~3\es4]xPK]6Ol$$pandas_ml_utils/__init__.pyPKl5OEz']pandas_ml_utils/classification_plots.pyPKl5O+OT944kpandas_ml_utils/classifier.pyPK6L6O~-@$pandas_ml_utils/classifier_models.pyPK]6O@3"3" pandas_ml_utils/data_objects.pyPKl5O`,,&xBpandas_ml_utils/features_and_Labels.pyPKl5OJpandas_ml_utils/fetch_yahoo.pyPKl5Ox!Qpandas_ml_utils/lazy_dataframe.pyPKl5OT,Zpandas_ml_utils/multi_model.pyPKl5O3Q7$?jpandas_ml_utils/pandas_extensions.pyPKl5OtEuunpandas_ml_utils/plots.pyPKl5O J"opandas_ml_utils/train_test_data.pyPKl5OLIpandas_ml_utils/utils.pyPKH5O ..'pandas_ml_utils-0.0.3.dist-info/LICENSEPK!HPO%pandas_ml_utils-0.0.3.dist-info/WHEELPK!H+ (pandas_ml_utils-0.0.3.dist-info/METADATAPK!HtPHD&pandas_ml_utils-0.0.3.dist-info/RECORDPKC