Stuck on an issue?

Lightrun Answers was designed to reduce the constant googling that comes with debugging 3rd party libraries. It collects links to all the places you might be looking at while hunting down a tough bug.

And, if you’re still stuck at the end, we’re happy to hop on a call to see how we can help out.

Duplicates found in y_true list

See original GitHub issue

Current Version : 0.4.0

In the function print_metrics_ranking, I have added the following lines:

        # Lay out one row per user: the user id first, then that user's
        # recommended items (10 columns) or ground-truth items (4 columns).
        rec_columns = ['user'] + [f'rec_{pos}' for pos in range(1, 11)]
        true_columns = ['user'] + [f'true_{pos}' for pos in range(1, 5)]

        rec_list = pd.DataFrame.from_dict(y_reco_list, orient="index")
        rec_list = rec_list.reset_index()
        rec_list.columns = rec_columns

        true_list = pd.DataFrame.from_dict(y_true_list, orient="index")
        true_list = true_list.reset_index()
        true_list.columns = true_columns

        # Dump both tables so they can be compared by eye.
        print(true_list)
        print(rec_list)

I have noticed the following from the printed dataframe:

In y_true_list, a user can have more than one item, and in one example the same item number is repeated. An example output is shown below: 9089: array('I', [7, 7, 7]) 9023: array('I', [17, 14]). Is it possible to have more than one item per user, and to have duplicate items for a specific user?

A full version of the updated function can be found below:

def print_metrics_ranking(self, df, metrics, y_prob=None, y_true=None,
                          y_reco_list=None, y_true_list=None, users=None,
                          k=10, train=True):
    """Print the requested metrics and collect evaluation results.

    In training mode only the log loss is printed and ``df`` is returned
    unchanged. In evaluation mode every requested metric is printed, the
    values are appended to ``df`` as one new row, and the per-user
    recommendation / ground-truth tables are appended to
    ``self.rec_results`` / ``self.true_results``.

    Args:
        df: DataFrame that accumulates one row of metric values per
            evaluation call.
        metrics: Iterable of metric names. Recognised names are
            "log_loss"/"loss", "balanced_accuracy", "roc_auc", "pr_auc",
            "precision", "recall", "map" and "ndcg"; others are ignored.
        y_prob: Predicted scores handed to the point-wise metric functions.
        y_true: True labels handed to the point-wise metric functions.
        y_reco_list: Mapping of user -> recommended items (presumably
            ``k`` items per user; verify against the caller).
        y_true_list: Mapping of user -> ground-truth items. Users may have
            different numbers of items; shorter rows are padded by pandas.
        users: Users handed to the ranking metric functions.
        k: Cut-off handed to the ranking metric functions.
        train: Selects training mode (True) or evaluation mode (False).

    Returns:
        Tuple ``(df, self.true_results, self.rec_results)``.
    """

    def _wide_frame(per_user, prefix):
        # One row per user. Value columns are labelled <prefix>_1..<prefix>_N
        # from the frame's real width, so any k and any number of
        # ground-truth items per user is accepted instead of failing on a
        # fixed-length list of column names.
        frame = pd.DataFrame.from_dict(per_user, orient="index").reset_index()
        frame.columns = ["user"] + [
            f"{prefix}_{pos}" for pos in range(1, frame.shape[1])
        ]
        return frame

    eval_df = pd.DataFrame()

    if train:
        for m in metrics:
            if m in ["log_loss", "loss"]:
                log_loss_ = log_loss(y_true, y_prob, eps=1e-7)
                print(f"\t train log_loss: {log_loss_:.4f}")
                # NOTE(review): this value is stored but never merged into
                # ``df`` in training mode - confirm whether that is intended.
                eval_df['log_loss'] = [log_loss_]
        return df, self.true_results, self.rec_results

    for m in metrics:
        if m in ["log_loss", "loss"]:
            log_loss_ = log_loss(y_true, y_prob, eps=1e-7)
            print(f"\t eval log_loss: {log_loss_:.4f}")
            eval_df['log_loss_'] = [log_loss_]

        elif m == "balanced_accuracy":
            y_pred = np.round(y_prob)
            accuracy = balanced_accuracy_score(y_true, y_pred)
            print(f"\t eval balanced accuracy: {accuracy:.4f}")
            eval_df['accuracy'] = [round(accuracy, 4)]

        elif m == "roc_auc":
            roc_auc = roc_auc_score(y_true, y_prob)
            print(f"\t eval roc_auc: {roc_auc:.4f}")
            eval_df['roc_auc'] = [round(roc_auc, 4)]

        elif m == "pr_auc":
            precision, recall, _ = precision_recall_curve(y_true, y_prob)
            pr_auc = auc(recall, precision)
            # Printed like every other metric; the value was previously
            # recorded silently.
            print(f"\t eval pr_auc: {pr_auc:.4f}")
            eval_df['pr_auc'] = [round(pr_auc, 4)]

        elif m == "precision":
            precision_all = precision_at_k(y_true_list, y_reco_list,
                                           users, k)
            print(f"\t eval precision@{k}: {precision_all:.4f}")
            eval_df['precision_all'] = [round(precision_all, 4)]

        elif m == "recall":
            recall_all = recall_at_k(y_true_list, y_reco_list, users, k)
            print(f"\t eval recall@{k}: {recall_all:.4f}")
            eval_df['recall_all'] = [round(recall_all, 4)]

        elif m == "map":
            map_all = map_at_k(y_true_list, y_reco_list, users, k)
            print(f"\t eval map@{k}: {map_all:.4f}")
            eval_df['map_all'] = [round(map_all, 4)]

        elif m == "ndcg":
            ndcg_all = ndcg_at_k(y_true_list, y_reco_list, users, k)
            print(f"\t eval ndcg@{k}: {ndcg_all:.4f}")
            eval_df['ndcg_all'] = [round(ndcg_all, 4)]

    # Append this evaluation's metrics as a new row of the running table.
    df = pd.concat([df, eval_df], ignore_index=True)

    # The per-user tables only exist when ranking data was supplied; skip
    # them otherwise so point-wise-only evaluations do not crash on None.
    if y_reco_list is not None:
        self.rec_results = pd.concat(
            [self.rec_results, _wide_frame(y_reco_list, "rec")],
            ignore_index=True,
        )
    if y_true_list is not None:
        self.true_results = pd.concat(
            [self.true_results, _wide_frame(y_true_list, "true")],
            ignore_index=True,
        )

    return df, self.true_results, self.rec_results