Source code for retentioneering.analysis.cluster

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from sklearn.neighbors import NearestNeighbors
from retentioneering.visualization import plot


[docs]def cluster_users(countmap, n_clusters=None, clusterer=None): """ Cluster users based on input dataframe and return DataFrame with `user_pseudo_id` and `cluster` for each user :param countmap: input dataframe, should have user_id in index. All fields will be features in clustering algorithm :param n_clusters: supposed number of clusters, could be None :param clusterer: clustering algorithm. Should have fit_predict function :type countmap: pd.DataFrame :type n_clusters: int :type clusterer: func :return: pd.DataFrame """ users_clusters = pd.DataFrame(countmap.index.values, columns=['user_pseudo_id']) if clusterer is None: if n_clusters: clusterer = KMeans(n_clusters=n_clusters) else: nn = NearestNeighbors(metric='cosine') nn.fit(countmap.values) dists = nn.kneighbors(countmap.values, 2)[0][:, 1] eps = np.percentile(dists, 99) clusterer = DBSCAN(eps=eps, metric='cosine') users_clusters['cluster'] = clusterer.fit_predict(countmap) return users_clusters
[docs]def add_cluster_of_users(data, users_clusters, how='left'): """ Add cluster of each user to clickstream data :param data: data from BQ or your own (clickstream). Should have at least one column: `user_pseudo_id` :param users_clusters: DataFrame with `user_pseudo_id` and `cluster` for each user :param how: argument to pass in pd.merge function :type data: pd.DataFrame :type users_clusters: pd.DataFrame :type how: str :return: pd.DataFrame """ df = data.merge(users_clusters, how=how, on=['user_pseudo_id']) return df
[docs]def calculate_cluster_stats(data, users_clusters, settings, target_events=('lost', 'passed'), make_plot=True, plot_count=2, save=True, plot_name=None, figsize=(10, 5)): """ Plot pie-chart with distribution of target events in clusters :param data: data from BQ or your own (clickstream). Should have at least three columns: `event_name`, `event_timestamp` and `user_pseudo_id` :param users_clusters: DataFrame with `user_pseudo_id` and `cluster` for each user :param settings: experiment config (can be empty dict here) :param target_events: name of event which signalize target function (e.g. for prediction of lost users it'll be `lost`) :param make_plot: plot stats or not :param plot_count: number of plots for output :param save: True if the graph should be saved :param plot_name: name of file with graph plot :param figsize: width, height in inches. If None, defaults to rcParams["figure.figsize"] = [6.4, 4.8] :type data: pd.DataFrame :type users_clusters: pd.DataFrame :type settings: dict :type target_events: list or tuple :type make_plot: bool :type plot_count: int :type plot_name: str :type figsize: tuple :return: np.array """ data = data.loc[data.event_name.isin(target_events), :] data = add_cluster_of_users(data, users_clusters, how='inner') main_classes = (users_clusters.groupby('cluster').size() .sort_values(ascending=False) .iloc[:plot_count].index.values) groups = (data.groupby(['cluster', 'event_name']).size() .unstack('event_name') .reindex(index=main_classes, columns=target_events) .fillna(0) .values .astype(int)) if make_plot: plot.cluster_stats(groups, labels=target_events, settings=settings, plot_count=plot_count, figsize=figsize, save=save, plot_name=plot_name) return groups