Source code for retentioneering.analysis.calculate

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from retentioneering.analysis.utils import prepare_dataset
from retentioneering.visualization import plot


def calculate_frequency_hist(df, settings, target_events=None, make_plot=True, save=True, plot_name=None,
                             figsize=(8, 5)):
    """
    Count how often each event occurs in the clickstream and optionally draw a bar plot

    :param df: clickstream data. The columns `event_name` and `event_timestamp` are always read;
        `user_pseudo_id` is read when `target_events` is given
    :param settings: experiment config, forwarded unchanged to ``plot.bars``
    :param target_events: event name(s) used as a user filter: when given, only events of users
        who have at least one of these events are counted. A single string is treated as a
        one-element list
    :param make_plot: if True, draw the bar plot via ``plot.bars``
    :param save: forwarded to ``plot.bars``
    :param plot_name: forwarded to ``plot.bars``
    :param figsize: width, height in inches, forwarded to ``plot.bars``
    :type df: pd.DataFrame
    :type settings: dict
    :type target_events: Union[tuple, list, str, None]
    :type make_plot: bool
    :type save: bool
    :type plot_name: str
    :type figsize: tuple
    :return: one row per event with columns `event_name` and `event_timestamp`
        (the count of non-null timestamps), ordered by count descending
    :rtype: pd.DataFrame
    """
    if isinstance(target_events, str):
        target_events = [target_events]

    if target_events is not None:
        # Keep the whole history of every user that produced a target event.
        has_target = df['event_name'].isin(target_events)
        target_users = df['user_pseudo_id'][has_target].unique()
        df = df[df['user_pseudo_id'].isin(target_users)]

    per_event = df.groupby('event_name', as_index=False)['event_timestamp'].count()
    nodes_hist = per_event.sort_values('event_timestamp', ascending=False)

    if not make_plot:
        return nodes_hist

    plot.bars(nodes_hist['event_name'].values, nodes_hist['event_timestamp'].values, settings,
              save=save, plot_name=plot_name, figsize=figsize)
    return nodes_hist
def calculate_frequency_map(df, settings, target_events=None, plot_name=None, make_plot=True, save=True,
                            figsize_hist=(8, 5), figsize_heatmap=(10, 15)):
    """
    Calculate frequency of each event for each user from input clickstream and plot a heatmap

    :param df: data from BQ or your own (clickstream). Should have at least three columns:
        `event_name`, `event_timestamp` and `user_pseudo_id`. The input frame is not modified
    :param settings: experiment config, forwarded to the plotting functions
    :param target_events: name(s) of the target event. When given, only users having at least one
        of these events are kept, and the target events themselves are excluded from the
        columns of the result. A single string is treated as a one-element list. Target
        events that do not occur in the data are ignored
    :param plot_name: name of file with graph plot; the same value is forwarded to both the
        bar plot and the heatmap (NOTE(review): confirm the plot helpers do not overwrite
        one file with the other)
    :param make_plot: plot stats or not
    :param save: True if the graph should be saved
    :param figsize_hist: width, height in inches for bar plot with events
    :param figsize_heatmap: width, height in inches for heatmap
    :type df: pd.DataFrame
    :type settings: dict
    :type target_events: Union[tuple, list, str, None]
    :type plot_name: str
    :type make_plot: bool
    :type save: bool
    :type figsize_hist: tuple
    :type figsize_heatmap: tuple
    :return: per-user event counts: index is `user_pseudo_id`, columns are event names ordered
        by overall frequency (most frequent first), rows sorted descending by those columns
    :rtype: pd.DataFrame
    """
    if isinstance(target_events, str):
        target_events = [target_events]
    if target_events is not None:
        users = df.user_pseudo_id[df.event_name.isin(target_events)].unique()
        df = df[df.user_pseudo_id.isin(users)]

    # Encode every event name as a zero-padded numeric token so that CountVectorizer's
    # default tokenizer (which needs 2+ word characters) sees each event as exactly one token,
    # whatever characters the real event name contains.
    event_to_idx = {event_name: str(i).zfill(3) for i, event_name in enumerate(df.event_name.unique())}
    idx_to_event = {idx: event_name for event_name, idx in event_to_idx.items()}

    # Work on an encoded copy; `df` keeps the original names for the histogram below.
    encoded = df.assign(event_name=df.event_name.map(event_to_idx))

    # A requested target that never occurs in the data has no token; skip it instead of
    # failing with KeyError.
    encoded_targets = [event_to_idx[t] for t in target_events or [] if t in event_to_idx]

    # presumably returns one row per user with `event_name` holding the user's
    # whitespace-separated event tokens -- verify against prepare_dataset
    data = prepare_dataset(encoded, encoded_targets)

    cv = CountVectorizer()
    counts = cv.fit_transform(data.event_name.values).toarray()
    # `get_feature_names` was removed in scikit-learn 1.2; prefer the replacement and
    # fall back for older installations.
    if hasattr(cv, 'get_feature_names_out'):
        tokens = cv.get_feature_names_out()
    else:
        tokens = cv.get_feature_names()
    cols = [idx_to_event[token] for token in tokens]
    x = pd.DataFrame(counts, columns=cols, index=data.user_pseudo_id)

    # Users were already filtered above, hence target_events=None here.
    nodes_hist = calculate_frequency_hist(df=df, settings=settings, target_events=None,
                                          make_plot=make_plot, save=save, plot_name=plot_name,
                                          figsize=figsize_hist)

    # Order columns by overall event frequency and drop the target events themselves.
    sorted_cols = nodes_hist.event_name[~nodes_hist.event_name.isin(target_events or [])].values
    x = x.loc[:, sorted_cols]
    x = x.sort_values(list(sorted_cols), ascending=False)

    if make_plot:
        plot.heatmap(x.values, sorted_cols, settings=settings, save=save, plot_name=plot_name,
                     figsize=figsize_heatmap)
    return x