Source code for retentioneering.analysis.model

from __future__ import print_function
from datetime import datetime
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import os
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split
from retentioneering.analysis.utils import _check_folder, get_all_agg
from retentioneering.visualization import plot


def _str_agg(x):
    return ' '.join(x)


[docs]def create_filter(data, n_folds=None): """ Creates interested events filter with histogram :param data: data from BQ or your own (clickstream). Should have at least three columns: `event_name`, `event_timestamp` and `user_pseudo_id` :param n_folds: number of folds in histogram (it affects how far technical events should be from users` events) :type data: pd.DataFrame :type n_folds: int :return: set """ all_events = set(data.event_name) x = data.groupby('event_name').user_pseudo_id.count() if n_folds is None: n_folds = (x.shape[0] // 200) + 1 hist = np.histogram(x, bins=n_folds) while (hist[0] == 0).sum() > 0: x = x[x < hist[1][n_folds - 2]] x = x[x > hist[1][0]] hist = np.histogram(x, bins=n_folds) return all_events - set(x.index)
class Model:
    """
    Base model for classification of users by their event trajectories.

    The trajectory of each user is turned into one space-separated string of
    events, vectorized (tf-idf), and a binary classifier is fitted against a
    label derived from the last event of the trajectory.
    """

    def __init__(self, data, target_event, settings, event_filter=None, n_start_events=None,
                 emb_type='tf-idf', ngram_range=(1, 3), emb_dims=None, embedder=None):
        """
        Inits users classifier

        :param data: data from BQ or your own (clickstream). Should have at least three
            columns: `event_name`, `event_timestamp` and `user_pseudo_id`
        :param target_event: name of event which signalize target function
            (e.g. for prediction of lost users it'll be `lost`)
        :param settings: experiment config (can be empty dict here)
        :param event_filter: list of events that is wanted to use in analysis
        :param n_start_events: length of users trajectory from start
        :param emb_type: type of embedding (now only `tf-idf` is supported)
        :param ngram_range: The lower and upper boundary of the range of n-values for
            different n-grams to be extracted. All values of n such that
            min_n <= n <= max_n will be used.
        :param emb_dims: it is needed only when use emb_type different from `tf-idf`
        :param embedder: pretrained model for event embedding (now only `tf-idf` is supported)
        :type data: pd.DataFrame
        :type target_event: str
        :type settings: dict
        :type event_filter: list or other iterable
        :type n_start_events: int
        :type emb_type: str
        :type ngram_range: tuple[int]
        :type emb_dims: int
        :type embedder: model or None
        """
        self._source_data = data
        # NOTE: target_event is effectively required here — without it
        # _prepare_dataset() adds no `target` column and the next lines fail.
        self.data = self._prepare_dataset(data, target_event, event_filter, n_start_events)
        self.features = self.data.event_name.values
        self.target = self.data.target.values
        self.users = self.data.user_pseudo_id.values
        self.emb_type = emb_type
        self._embedder = embedder
        self.ngram_range = ngram_range
        self.emb_dims = emb_dims
        # Validation metrics; populated by fit_model() -> _validate().
        self.roc_auc_score = None
        self.average_precision_score = None
        self.roc_c = None
        self.prec_rec = None
        self._check_folder(settings)
        self._model_type = None
        self.model = None
        # Fit a fresh vectorizer on first use unless a pretrained embedder was given.
        self._fit_vec = embedder is None

    def _get_vectors(self, sample):
        # Lazily fit the embedder on the first sample seen, then transform.
        if self._fit_vec:
            self._fit_vectors(sample)
        return self._embedder.transform(sample)

    def _fit_vectors(self, sample):
        # Currently only tf-idf is supported; other emb_type values are ignored.
        if self.emb_type == 'tf-idf':
            from sklearn.feature_extraction.text import TfidfVectorizer
            tfidf = TfidfVectorizer(ngram_range=self.ngram_range)
            self._embedder = tfidf.fit(sample)
            self._fit_vec = False

    @staticmethod
    def _prepare_dataset(df, target_event=None, event_filter=None, n_start_events=None):
        """
        Aggregate the clickstream into one row per user.

        :param df: clickstream with `event_name`, `event_timestamp`, `user_pseudo_id`
        :param target_event: if given, the last event of each trajectory defines the
            boolean `target` column and is removed from the features
        :param event_filter: events to keep before aggregation
        :param n_start_events: truncate each trajectory to its first n events
        :return: DataFrame with `user_pseudo_id`, `event_name` (space-joined string)
            and, when target_event is given, `target`
        :rtype: pd.DataFrame
        """
        if event_filter is not None:
            df = df[df.event_name.isin(event_filter)]
        df = df.sort_values('event_timestamp')
        train = df.groupby('user_pseudo_id').event_name.agg(' '.join)
        train = train.reset_index(None)
        train.event_name = train.event_name.apply(lambda x: x.split())
        if target_event is not None:
            # The trajectory's final event is the label; drop it from the features
            # so the classifier cannot trivially read the answer.
            train['target'] = train.event_name.apply(lambda x: x[-1] == target_event)
            train.event_name = train.event_name.apply(lambda x: x[:-1])
        if n_start_events:
            train.event_name = train.event_name.apply(lambda x: ' '.join(x[:n_start_events]))
        else:
            train.event_name = train.event_name.apply(lambda x: ' '.join(x))
        return train

    def _prepare_data(self):
        # Hold out 20% of users for validation; fixed seed keeps splits reproducible.
        x_train, x_test, y_train, y_test = train_test_split(
            self.features, self.target, test_size=0.2, random_state=42)
        # The vectorizer is fitted on the training split only (first call fits).
        x_train_vec = self._get_vectors(x_train)
        x_test_vec = self._get_vectors(x_test)
        return x_train_vec, x_test_vec, y_train, y_test

    def _validate(self, x_test_vec, y_test):
        # Score the held-out split, print headline metrics, cache the curves and plot them.
        preds = self.model.predict_proba(x_test_vec)
        self.roc_auc_score = roc_auc_score(y_test, preds[:, 1])
        self.average_precision_score = average_precision_score(y_test, preds[:, 1])
        print('ROC-AUC: {:.4f}'.format(self.roc_auc_score))
        print('PR-AUC: {:.4f}'.format(self.average_precision_score))
        self.roc_c = roc_curve(y_test, preds[:, 1])
        self.prec_rec = precision_recall_curve(y_test, preds[:, 1])
        self.plot()

    def fit_model(self, model_type='logit'):
        """
        Fits classifier

        :param model_type: type of model (now only logit is supported)
        :return: None
        :type model_type: str
        """
        self._model_type = model_type
        x_train_vec, x_test_vec, y_train, y_test = self._prepare_data()
        if model_type == 'logit':
            from sklearn.linear_model import LogisticRegression
            # liblinear must be requested explicitly: newer sklearn defaults to
            # lbfgs, which rejects penalty='l1'.
            lr = LogisticRegression(penalty='l1', solver='liblinear')
            lr.fit(x_train_vec, y_train)
            self.model = lr
        self._validate(x_test_vec, y_test)

    def predict_proba(self, sample):
        """
        Predicts probability of sample

        :param sample: sample of users vectorized tracks (e.g. with `tf-idf` transform)
        :return: probabilities of different classes as list of
            `[not target_event probability, target_event probability]`
        :type sample: np.ndarray or pd.DataFrame
        :rtype: np.ndarray
        """
        return self.model.predict_proba(sample)

    def build_important_track(self):
        """
        Finds the most important tracks for definition of target_event

        :return: most important edges in graph
        :rtype: pd.DataFrame
        """
        if self._model_type == 'logit':
            imp = self.model.coef_
        else:
            raise NotImplementedError('Sorry, but only logit available in model type')
        if self.emb_type == 'tf-idf':
            imp = self._embedder.inverse_transform(imp)[0]
        # Expand every n-gram into consecutive (from, to) edges; unigrams become
        # (event, None) so they still appear in the result.
        edges = []
        for ngram in imp:
            tokens = ngram.split()
            if len(tokens) == 1:
                edges.append([tokens[0], None])
            else:
                for k in range(1, len(tokens)):
                    edges.append([tokens[k - 1], tokens[k]])
        return pd.DataFrame(edges).drop_duplicates()

    @staticmethod
    def _get_tsne(sample):
        # np.asarray: TSNE rejects the np.matrix that .todense() returns.
        return TSNE().fit_transform(np.asarray(sample.todense()))

    def plot_projections(self, sample=None, target=None, ids=None):
        """
        Plots tsne projection of users trajectories

        :param sample: sample of trajectories
        :param target: by which column data should be splitted (if target is None
            then probabilities of `target_event` is highlighted)
        :param ids: list of user_ids for visualization in plot
        :type sample: np.ndarray of pd.DataFrame
        :type target: str
        :type ids: list or other iterable
        :return: None
        """
        if sample is None:
            self._plot_proj_sample(self.features, self.target, self.users)
        else:
            self._plot_proj_sample(sample, target, ids)

    def _plot_proj_sample(self, sample, target=None, ids=None):
        # Project the vectorized sample to 2D and cache everything needed for a
        # later rectangular selection via _get_data_from_plot().
        pre_vec = self._get_vectors(sample)
        vec = self._get_tsne(pre_vec)
        self._cached_tsne = vec
        self._tsne_sample = sample
        if ids is not None:
            # ndarray (not list) so boolean masks work in _get_data_from_plot().
            self._tsne_users = np.asarray(ids)
            txt = np.array(['user_id: {}'.format(i) for i in ids])
        else:
            txt = None
        if target is not None:
            # One trace per class value.
            figs = [
                go.Scatter(
                    x=vec[target == value][:, 0],
                    y=vec[target == value][:, 1],
                    name=str(value),
                    mode='markers',
                    text=list(txt[target == value]) if txt is not None else txt,
                )
                for value in np.unique(target)
            ]
        else:
            # Single trace colored by the predicted target probability.
            probs = self.predict_proba(pre_vec)[:, 1]
            if txt is not None:
                txt = [i + ',\n prob: {}'.format(j) for i, j in zip(txt, probs)]
            else:
                txt = ['prob: {}'.format(i) for i in probs]
            figs = [go.Scatter(
                x=vec[:, 0],
                y=vec[:, 1],
                mode='markers',
                marker=dict(
                    color=probs,
                    colorscale='YlGnBu',
                    showscale=True
                ),
                text=txt,
            )]
        py.init_notebook_mode()
        py.iplot(figs)
        filename = os.path.join(
            self.export_folder,
            'tsne_{}.html'.format(datetime.now().strftime('%Y-%m-%dT%H_%M_%S_%f')))
        py.plot(figs, filename=filename, auto_open=False)

    def _get_data_from_plot(self, bbox):
        # Map a rectangular selection in TSNE space back to the raw clickstream rows
        # of the users inside it.
        if getattr(self, '_tsne_users', None) is None:
            raise AttributeError(
                'No cached user ids: call plot_projections() with ids before selecting clusters')
        bbox = np.array(bbox)
        left = bbox[:, 0].min()
        right = bbox[:, 0].max()
        up = bbox[:, 1].max()
        bot = bbox[:, 1].min()
        res = self._cached_tsne
        fil = (res[:, 1] < up) & (res[:, 1] > bot) & (res[:, 0] < right) & (res[:, 0] > left)
        users = self._tsne_users[fil]
        data = self._source_data[self._source_data.user_pseudo_id.isin(users)]
        return data

    def plot_cluster_track(self, bbox):
        """
        Plots graph for users in selected area

        :param bbox: coordinates of top-left and bottom-right angles of area
        :type bbox: List[List[float]]
        :return: None
        """
        data = self._get_data_from_plot(bbox)
        data_agg = get_all_agg(data, ['trans_count'])
        plot.plot_graph(data_agg, 'trans_count', {'export_folder': self.export_folder})

    def _check_folder(self, settings):
        # Delegates to the module-level _check_folder helper (global lookup, not
        # this method) to ensure the export folder exists, and remembers its path.
        settings = _check_folder(settings)
        self.export_folder = settings['export_folder']

    def plot(self):
        """
        Plot metrics of model

        :return: None
        """
        fig, ax = plt.subplots(1, 2)
        fig.set_size_inches(15, 5)
        ax[0].plot(self.roc_c[0], self.roc_c[1])
        ax[0].set_title('ROC curve')
        ax[1].plot(self.prec_rec[1], self.prec_rec[0])
        ax[1].set_title('Precision-Recall curve')
        # plt.grid() only touched the current axes; enable the grid on both panels.
        for axis in ax:
            axis.grid(True)
        filename = os.path.join(
            self.export_folder,
            # Colon/space-free timestamp keeps the filename valid on Windows and
            # consistent with the tsne_*.html naming scheme.
            'scores_{}.png'.format(datetime.now().strftime('%Y-%m-%dT%H_%M_%S_%f')))
        fig.savefig(filename)