import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from retentioneering.analysis.utils import prepare_dataset
from retentioneering.visualization import plot
def calculate_frequency_hist(df, settings, target_events=None,
                             make_plot=True, save=True, plot_name=None, figsize=(8, 5)):
"""
Calculate frequency of each event from input clickstream and plot a barplot
:param df: data from BQ or your own (clickstream). Should have at least three columns: `event_name`,
`event_timestamp` and `user_pseudo_id`
:param settings: experiment config (can be empty dict here)
:param target_events: name of event which signalize target function
(e.g. for prediction of lost users it'll be `lost`)
:param make_plot: plot stats or not
:param save: True if the graph should be saved
:param plot_name: name of file with graph plot
:param figsize: width, height in inches. If not provided, defaults to rcParams["figure.figsize"] = [6.4, 4.8]
:type df: pd.DataFrame
:type settings: dict
:type target_events: Union[tuple, list, str, None]
:type make_plot: bool
:type save: bool
:type plot_name: str
:type figsize: tuple
:return: pd.DataFrame
"""
    if isinstance(target_events, str):
        target_events = [target_events]
    if target_events is not None:
        # keep only users who triggered at least one of the target events
        users = df.user_pseudo_id[df.event_name.isin(target_events)].unique()
        df = df[df.user_pseudo_id.isin(users)]
    # count occurrences of each event and sort by descending frequency
    nodes_hist = (df.groupby('event_name', as_index=False)
                  .event_timestamp.count()
                  .sort_values('event_timestamp', ascending=False))
    if make_plot:
        plot.bars(nodes_hist.event_name.values, nodes_hist.event_timestamp.values, settings,
                  save=save, plot_name=plot_name, figsize=figsize)
    return nodes_hist
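
# A minimal usage sketch (not part of the original module; the toy DataFrame below
# is an assumption for illustration only):
#
#     events = pd.DataFrame({
#         'event_name': ['main', 'catalog', 'cart', 'lost', 'main', 'lost'],
#         'event_timestamp': [1, 2, 3, 4, 5, 6],
#         'user_pseudo_id': ['u1', 'u1', 'u1', 'u1', 'u2', 'u2'],
#     })
#     hist = calculate_frequency_hist(events, settings={}, target_events='lost',
#                                     make_plot=False, save=False)
#     # hist has one row per event, sorted by how often the event occurs
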
def calculate_frequency_map(df, settings, target_events=None, plot_name=None,
                            make_plot=True, save=True, figsize_hist=(8, 5), figsize_heatmap=(10, 15)):
"""
Calculate frequency of each event for each user from input clickstream and plot a heatmap
:param df: data from BQ or your own (clickstream). Should have at least three columns: `event_name`,
`event_timestamp` and `user_pseudo_id`
:param settings: experiment config (can be empty dict here)
:param target_events: name of event which signalize target function
(e.g. for prediction of lost users it'll be `lost`)
:param plot_name: name of file with graph plot
:param make_plot: plot stats or not
:param save: True if the graph should be saved
:param figsize_hist: width, height in inches for bar plot with events. If None, defaults to rcParams["figure.figsize"] = [6.4, 4.8]
:param figsize_heatmap: width, height in inches for heatmap. If None, defaults to rcParams["figure.figsize"] = [6.4, 4.8]
:type df: pd.DataFrame
:type settings: dict
:type target_events: Union[tuple, list, str, None]
:type plot_name: str
:type make_plot: bool
:type save: bool
:type figsize_hist: tuple
:type figsize_heatmap: tuple
:return: pd.DataFrame
"""
    if isinstance(target_events, str):
        target_events = [target_events]
    df = df.copy()
    if target_events is not None:
        # keep only users who triggered at least one of the target events
        users = df.user_pseudo_id[df.event_name.isin(target_events)].unique()
        df = df[df.user_pseudo_id.isin(users)]
    # temporarily encode event names as zero-padded indices so that
    # CountVectorizer treats each event as a single, unambiguous token
    event_to_idx = {event_name: str(i).zfill(3) for i, event_name in enumerate(df.event_name.unique())}
    idx_to_event = {i: event_name for event_name, i in event_to_idx.items()}
    df.event_name = df.event_name.map(event_to_idx)
    data = prepare_dataset(df, [event_to_idx[t] for t in target_events or []])
    # count per-user event frequencies from the aggregated event sequences
    cv = CountVectorizer()
    x = cv.fit_transform(data.event_name.values).todense()
    cols = [idx_to_event[c] for c in cv.get_feature_names()]
    x = pd.DataFrame(x, columns=cols, index=data.user_pseudo_id)
    # restore the original event names and reuse the histogram to order the columns
    df.event_name = df.event_name.map(idx_to_event)
    nodes_hist = calculate_frequency_hist(df=df, settings=settings, target_events=None,
                                          make_plot=make_plot, save=save, plot_name=plot_name,
                                          figsize=figsize_hist)
    sorted_cols = nodes_hist.event_name[~nodes_hist.event_name.isin(target_events or [])].values
    x = x.loc[:, sorted_cols]
    x = x.sort_values(list(sorted_cols), ascending=False)
    if make_plot:
        plot.heatmap(x.values, sorted_cols, settings=settings, save=save, plot_name=plot_name,
                     figsize=figsize_heatmap)
    return x
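
# A minimal usage sketch for the per-user map (not part of the original module;
# it reuses the hypothetical `events` DataFrame from the sketch above and assumes
# `prepare_dataset` accepts that toy clickstream as-is):
#
#     freq_map = calculate_frequency_map(events, settings={}, target_events='lost',
#                                        make_plot=False, save=False)
#     # freq_map is indexed by user_pseudo_id, with one column per non-target event,
#     # columns ordered by overall event frequency
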