Source code for retentioneering.analysis.weighter

import pandas as pd
import numpy as np
from tqdm import tqdm


def _calc_weights(counts, target_mechanics, mechanics_events):
    # Per (user, session, event) row: event_count if the event belongs to the
    # mechanic's target events, else 0
    counts[target_mechanics] = counts.event_name.isin(mechanics_events) * counts.event_count
    return counts


def _diff(x):
    # x holds per-mechanic sets of event names, sorted so the target set comes
    # first; return target events minus non-target events (set difference)
    if x.shape[0] == 1:
        return x.iloc[0]
    return x.iloc[0] - x.iloc[1]


def _get_mech_events(mechanics_events, mode):
    # Keep only the rows for the requested mode ('session' or 'full')
    sub_mech = mechanics_events[mechanics_events['mode'] == mode]
    # Collect event names into a set per (mechanic, target) pair
    x = sub_mech.groupby(['mechanics', 'target']).event_name.agg(set).reset_index()
    # Sort so the target=True set comes first within each mechanic
    x = x.sort_values('target', ascending=False)
    # Map each mechanic to its target events minus its non-target events
    mex = x.groupby('mechanics').event_name.agg(_diff).to_dict()
    return mex
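

# Illustrative sketch (not part of the original module): the expected shape of
# the `mechanics_events` table consumed by _get_mech_events. The mechanic and
# event names below are invented for demonstration.
def _example_get_mech_events():
    mechanics_events = pd.DataFrame({
        'mechanics': ['battle', 'battle', 'battle'],
        'mode': ['session', 'session', 'session'],
        'target': [True, True, False],
        'event_name': ['battle_start', 'battle_win', 'app_open'],
    })
    # Expected result: {'battle': {'battle_start', 'battle_win'}}
    return _get_mech_events(mechanics_events, mode='session')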


def calc_all_norm_mech(data, mechanics_events, mode='session', duration_thresh=1, len_thresh=None):
    """
    Calculates the weights of different mechanics in users' sessions.

    :param data: clickstream data with columns `user_pseudo_id`, `session`
        (rank of the user's session), `event_name` and `event_timestamp`
    :param mechanics_events: table of target events per mechanic
        (as returned by `mechanics_enrichment`)
    :param mode: if `session`, calculates weights over sessions; if `full`,
        over the user's full history
    :param duration_thresh: session duration threshold (in seconds) for
        dropping technical (ping) sessions
    :param len_thresh: threshold on the number of events in a session for
        dropping technical (ping) sessions
    :return: session description with the weight of each mechanic

    :type data: pd.DataFrame
    :type mechanics_events: pd.DataFrame
    :type mode: str
    :type duration_thresh: float
    :type len_thresh: int
    :rtype: pd.DataFrame
    """
    counts = data.groupby(['user_pseudo_id', 'session', 'event_name']).size().rename('event_count')
    counts = counts.reset_index()
    mex = _get_mech_events(mechanics_events=mechanics_events, mode=mode)
    for mech_name, mech_events in mex.items():
        counts = _calc_weights(counts, mech_name, mech_events)
    counts = counts.drop(columns='event_name').groupby(['user_pseudo_id', 'session'], as_index=False).sum()
    tmp = data.groupby(['user_pseudo_id', 'session']).event_timestamp.max().rename('session_end').reset_index()
    counts = counts.merge(tmp, on=['user_pseudo_id', 'session'])
    tmp = data.groupby(['user_pseudo_id', 'session']).event_timestamp.min().rename('session_start').reset_index()
    counts = counts.merge(tmp, on=['user_pseudo_id', 'session'])
    counts['session_duration'] = (counts.session_end - counts.session_start) / np.timedelta64(1, 's')
    if duration_thresh is not None:
        counts = counts.loc[counts.session_duration >= duration_thresh].copy()
    if len_thresh is not None:
        counts = counts.loc[counts.event_count >= len_thresh].copy()
    # Normalize each mechanic's weight by the row-wise maximum
    # (the epsilon avoids division by zero)
    norm = (counts[list(mex.keys())].max(axis=1).values + 1e-20).reshape(-1, 1)
    counts[list(mex.keys())] = counts[list(mex.keys())].values / norm
    return counts
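

# Usage sketch for calc_all_norm_mech (hypothetical data: the user, session,
# event and timestamp values are invented, but the column names match the
# code above).
def _example_calc_all_norm_mech():
    data = pd.DataFrame({
        'user_pseudo_id': ['u1'] * 4,
        'session': [1, 1, 1, 1],
        'event_name': ['battle_start', 'battle_win', 'shop_open', 'app_close'],
        'event_timestamp': pd.to_datetime([
            '2020-01-01 10:00:00', '2020-01-01 10:01:00',
            '2020-01-01 10:02:00', '2020-01-01 10:03:00',
        ]),
    })
    mechanics_events = pd.DataFrame({
        'mechanics': ['battle', 'shop'],
        'mode': ['session', 'session'],
        'target': [True, True],
        'event_name': ['battle_win', 'shop_open'],
    })
    # Returns one row per (user, session) with normalized 'battle' and 'shop'
    # weight columns
    return calc_all_norm_mech(data, mechanics_events, mode='session')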
def _calc_stats(data, target):
    # Flag target events; `target` may be a single event name or a collection
    if isinstance(target, str):
        data['is_target'] = data.event_name == target
    else:
        data['is_target'] = data.event_name.isin(target)
    # Per user-session: number of target events, total events, target frequency
    grouped = data.groupby('user_ses').agg({'is_target': 'sum', 'event_name': 'count'})
    grouped['freq'] = grouped.is_target / grouped.event_name
    return grouped.reset_index()


def _get_anom(src, data, q=.99, mode='top', q2=.99):
    if mode == 'top':
        # Events of users whose target frequency falls in the top quantile
        thresh = np.quantile(data.freq.values, q=q)
        users = data[data.freq >= thresh].user_ses.values
        return set(src[src.user_ses.isin(users)].event_name)
    # 'los' mode: events of users who never reach the target (freq == 0)
    zero_freq = data.freq == 0
    if not any(zero_freq):
        return set()
    users = data[zero_freq].user_ses.values
    tmp = src[src.user_ses.isin(users)]
    return _top_event_loosers(tmp, q2)


def _top_event_loosers(tmp, q=.99):
    if tmp.shape[0] == 0:
        return set()
    # Events of the heaviest non-converting users (by event count)
    nuq = tmp.groupby('user_ses').event_name.count()
    thresh = np.quantile(nuq.values, q=q)
    users = nuq[nuq >= thresh].index.values
    return set(tmp[tmp.user_ses.isin(users)].event_name)


def _get_clean_event_list(data, target, q=.99, q2=.99, session_mode=True):
    # Group either by user-session or by user only
    if session_mode:
        data['user_ses'] = data['user_pseudo_id'] + data['session'].astype(str)
    else:
        data['user_ses'] = data['user_pseudo_id']
    res = _calc_stats(data.copy(), target)
    top = _get_anom(data, res, q)
    los = _get_anom(data, res, q2=q2, mode='los')
    return list(top - los), top, los


def _get_df_results(top, los, mode):
    res_df_top = pd.DataFrame(list(top), columns=['event_name'])
    res_df_top['target'] = True
    res_df_los = pd.DataFrame(list(los), columns=['event_name'])
    res_df_los['target'] = False
    res_df = pd.concat([res_df_top, res_df_los], ignore_index=True)
    res_df['mode'] = mode
    return res_df.reset_index(drop=True)


def _build_df(data, target, mex, q=.99, q2=.99):
    res = pd.DataFrame(columns=['event_name', 'target', 'mode'])
    if 'session' in data.columns:
        # Session-level statistics
        good, top, los = _get_clean_event_list(data, target, q=q, q2=q2, session_mode=True)
        res = _get_df_results(top, los, 'session')
    # Full-history statistics
    good, top, los = _get_clean_event_list(data, target, q=q, q2=q2, session_mode=False)
    res = pd.concat([res, _get_df_results(top, los, 'full')], ignore_index=True)
    res['mechanics'] = mex
    return res.reset_index(drop=True)
def mechanics_enrichment(data, mechanics, id_col, event_col, q=.99, q2=.99):
    """
    Enriches the list of target events specific to each mechanic.

    :param data: clickstream data with columns `user_pseudo_id`, `event_name`
        and, optionally, `session` (rank of the user's session)
    :param mechanics: table of the form `[id_col, event_col]`, where `id_col`
        is the column with the mechanic name and `event_col` is the column
        with target events specific to that mechanic
    :param id_col: name of the column with the mechanic name
    :param event_col: name of the column with target events specific to that
        mechanic
    :param q: quantile for the frequency of target events
    :param q2: quantile for the frequency of target events of other mechanics
    :return: table of mechanics and their target events

    :type data: pd.DataFrame
    :type mechanics: pd.DataFrame
    :type q: float in the interval (0, 1)
    :type q2: float in the interval (0, 1)
    :rtype: pd.DataFrame
    """
    mechanics_map = mechanics.groupby(id_col)[event_col].agg(list).to_dict()
    mechanics_events = []
    for key, val in tqdm(mechanics_map.items(), total=len(mechanics_map)):
        mechanics_events.append(_build_df(data, val, key, q=q, q2=q2))
    mechanics_events = pd.concat(mechanics_events)
    return mechanics_events
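

# End-to-end sketch (hypothetical column names 'mechanic' and 'event' for the
# mechanics table; the clickstream values are invented for demonstration).
# mechanics_enrichment builds the per-mechanic target-event table that
# calc_all_norm_mech consumes.
def _example_mechanics_enrichment():
    clickstream = pd.DataFrame({
        'user_pseudo_id': ['u1', 'u1', 'u2', 'u2'],
        'session': [1, 1, 1, 1],
        'event_name': ['battle_start', 'battle_win', 'app_open', 'app_close'],
    })
    mechanics = pd.DataFrame({
        'mechanic': ['battle', 'battle'],
        'event': ['battle_start', 'battle_win'],
    })
    # Returns a table with columns [event_name, target, mode, mechanics]
    return mechanics_enrichment(clickstream, mechanics,
                                id_col='mechanic', event_col='event')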