Commit 17135b6c authored by ALVES Guilherme's avatar ALVES Guilherme

New argument added to indicate which explainer should be employed

parent f8975b96
......@@ -2,6 +2,23 @@
This project is an extension of LimeOut [1]. It aims to tackle process fairness in classification while keeping (or improving) the accuracy level.
More precisely, ExpOut incorporates different explainers.
Classifiers available:
* Multilayer Perceptron
* Logistic Regression
* Random Forest
* Bagging
* AdaBoost
* Gaussian Mixture
* Gradient Boosting
Explainers available:
* LIME
* Anchors
# Example
`runner --data german.data --trainsize 0.8 --algo mlp --cat_features 0 2 3 5 6 8 9 11 13 14 16 18 19 --drop 8 18 19`
# References
[1] Vaishnavi Bhargava, Miguel Couceiro, Amedeo Napoli. LimeOut: An Ensemble Approach To Improve Process Fairness. 2020. ⟨hal-02864059v2⟩
......
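Since this commit adds an explainer-selection argument, the README example would presumably be extended with the new `--exp` and `--max_features` flags (the values shown here are illustrative; `lime` and `anchors` are the options recognised by `exp_parser` further down in this diff, and 10 matches the default `max_features` in the runner):

`runner --data german.data --trainsize 0.8 --algo mlp --exp anchors --max_features 10 --cat_features 0 2 3 5 6 8 9 11 13 14 16 18 19 --drop 8 18 19`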
......@@ -3,22 +3,43 @@ from collections import Counter
from anchor import anchor_tabular
import lime_global
import pandas as pd
import numpy as np
def features_contributions(model, train, feature_names, class_names, categorical_features, categorical_names, kernel_width=3):
def fairness_eval(model, train, max_features, sensitive_features, feature_names, class_names, categorical_features, categorical_names):
_, sp_obj = lime_global.features_contributions(model.prob, train, feature_names, class_names, categorical_features, categorical_names, kernel_width)
_, sp_obj = lime_global.features_contributions(model.prob, train, feature_names, max_features, class_names, categorical_features, categorical_names)
indices = sp_obj.indices
a_explainer = anchor_tabular.AnchorTabularExplainer(class_names,feature_names,train,categorical_names=categorical_names)
non_empty_anchors = 0
counter = Counter()
for i in indices:
exp = a_explainer.explain_instance(train[i], model.predict, threshold=0.95)
print(i,'%.2f' % exp.precision(),' %.2f' % exp.coverage(), '%s' % (' AND '.join(exp.names())))
a1 = Counter(exp.exp_map['feature'])
counter.update(a1)
print(i,'%.2f' % exp.precision(),' %.2f' % exp.coverage(), ' (class %s)' % exp.exp_map['prediction'], '%s' % (' AND '.join(exp.names())))
features = exp.exp_map['feature']
if len(features) > 0:
a1 = Counter(features)
non_empty_anchors += 1
counter.update(a1)
is_fair = True
i = 0
ans_data = []
for key, value in sorted(counter.items(), key=lambda x: x[1], reverse=True):
print(feature_names[key],"\t",value)
ans_data1 = [feature_names[key],value/non_empty_anchors]
ans_data.append(ans_data1)
if i < max_features and key in sensitive_features:
is_fair = False
i += 1
df = pd.DataFrame(ans_data, columns = ["Feature", "Frequency"])
print(df.iloc[(-np.abs(df['Frequency'].values)).argsort()])
return is_fair, ans_data
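For reference, a self-contained sketch of the decision rule implemented by the anchor-based `fairness_eval` above (the helper name `anchors_fairness_check` is hypothetical, and the real function also prints the per-instance anchors and the frequency table): a model is flagged process-unfair when a sensitive feature index appears among the `max_features` most frequent anchor features.

```python
from collections import Counter

def anchors_fairness_check(anchor_feature_counts, sensitive_features, max_features):
    """Return False (unfair) if a sensitive feature ranks in the top anchor features."""
    ranked = sorted(anchor_feature_counts.items(), key=lambda x: x[1], reverse=True)
    for rank, (feature_idx, _count) in enumerate(ranked):
        if rank < max_features and feature_idx in sensitive_features:
            return False
    return True

# Toy example: sensitive feature 8 dominates the anchors -> judged unfair.
counts = Counter({8: 5, 2: 3, 0: 1})
print(anchors_fairness_check(counts, sensitive_features={8, 18, 19}, max_features=10))  # False
```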
......@@ -107,8 +107,6 @@ class Model:
self.encoders = encoders
self.features_to_remove = features_to_remove
# TODO :
"""
Args:
models: a list of trained classifiers
......
......@@ -7,7 +7,7 @@ from sklearn.model_selection import train_test_split
from sklearn.neural_network._multilayer_perceptron import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from anchor_global import features_contributions
from anchor_global import fairness_eval
from core import load_data, Model, evaluation, \
find_threshold, remove, ensemble_out, train_classifier
......@@ -20,6 +20,7 @@ from core import load_data, Model, evaluation, \
def main():
train_size = 0.8
test_size = 0.2
max_features = 10
algo = MLPClassifier
print(algo.__name__)
......@@ -39,7 +40,7 @@ def main():
accuracy = evaluation(original_model.prob(test), labels_test, threshold_1)
print(accuracy)
features_contributions(original_model, train, feature_names, class_names, all_categorical_features, categorical_names)
fairness_eval(original_model, train, max_features, to_drop, feature_names, class_names, all_categorical_features, categorical_names)
print("###########\nExpOut ensemble's model \n###########")
ensemble = ensemble_out(algo,to_drop,train,labels_train, all_categorical_features)
......@@ -49,7 +50,7 @@ def main():
accuracy = evaluation(ensemble.prob(test), labels_test, threshold_2)
print(accuracy)
features_contributions(ensemble, train, feature_names, class_names, all_categorical_features, categorical_names)
fairness_eval(ensemble, train, max_features, to_drop, feature_names, class_names, all_categorical_features, categorical_names)
if __name__ == "__main__":
......
......@@ -8,11 +8,17 @@ import numpy as np
from lime import lime_tabular, submodular_pick
def features_contributions(model, train, feature_names, class_names, categorical_features, categorical_names, kernel_width=3):
def features_contributions(predict_fn, train, feature_names, max_features, class_names, categorical_features, categorical_names, kernel_width=3):
explainer = lime_tabular.LimeTabularExplainer(train,feature_names=feature_names,class_names=class_names,categorical_features=categorical_features,categorical_names=categorical_names,kernel_width=kernel_width)
sp_obj = submodular_pick.SubmodularPick(explainer, train, model, sample_size=5, num_features=5, num_exps_desired=5)
sp_obj = submodular_pick.SubmodularPick(explainer, train, predict_fn, sample_size=500, num_features=max_features, num_exps_desired=10)
return explainer, sp_obj
def fairness_eval(model, train, max_features, sensitive_features, feature_names, class_names, categorical_features, categorical_names):
_, sp_obj = features_contributions(model.prob, train, feature_names, max_features, class_names, categorical_features, categorical_names)
a = Counter()
for i in sp_obj.V:
......@@ -20,16 +26,22 @@ def features_contributions(model, train, feature_names, class_names, categorical
a1 = Counter(dict(exp.local_exp[1]))
a.update(a1)
is_fair = True
counter = 0
ans_data = []
for key in a:
ans_data1 = []
ans_data1.append(feature_names[key])
ans_data1.append(a[key])
ans_data1 = [feature_names[key],a[key]]
ans_data.append(ans_data1)
if counter < max_features and key in sensitive_features:
is_fair = False
counter += 1
# print(feature_names[key] )
df = pd.DataFrame(ans_data, columns = ["Feature", "Contribution"])
sumdf = df['Contribution'].sum()
df['Contribution'] = df['Contribution']
# sumdf = df['Contribution'].sum()
# df['Contribution'] = df['Contribution']
print(df.iloc[(-np.abs(df['Contribution'].values)).argsort()])
return explainer, sp_obj
\ No newline at end of file
return is_fair, ans_data
\ No newline at end of file
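A minimal standalone sketch of the submodular-pick call used in `features_contributions` above, run on a toy scikit-learn dataset (the dataset, classifier, and the smaller `sample_size` are illustrative; the project itself passes its own training data and `model.prob` with `sample_size=500`):

```python
from collections import Counter

from lime import lime_tabular, submodular_pick
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier

data = load_breast_cancer()
clf = RandomForestClassifier(random_state=0).fit(data.data, data.target)

explainer = lime_tabular.LimeTabularExplainer(
    data.data,
    feature_names=list(data.feature_names),
    class_names=list(data.target_names),
    kernel_width=3)

# Submodular pick over a small sample, same call pattern as features_contributions.
sp_obj = submodular_pick.SubmodularPick(
    explainer, data.data, clf.predict_proba,
    sample_size=20, num_features=5, num_exps_desired=5)

# Aggregate per-feature contributions of the picked explanations, as in
# fairness_eval (label 1 is the class explained by default).
contributions = Counter()
for exp in sp_obj.sp_explanations:
    contributions.update(dict(exp.local_exp[1]))
print(contributions.most_common())
```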
......@@ -7,16 +7,18 @@ from sklearn.ensemble._forest import RandomForestClassifier
from sklearn.ensemble._gb import GradientBoostingClassifier
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.mixture._gaussian_mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.neural_network._multilayer_perceptron import MLPClassifier
from sklearn.svm._classes import SVC
from sklearn.model_selection import train_test_split
import anchor_global
import lime_global
from anchor_global import features_contributions
# from anchor_global import features_contributions
from core import load_data, train_classifier, Model, evaluation, \
find_threshold, remove, ensemble_out
find_threshold, ensemble_out
def main(source_name, train_size, to_drop, all_categorical_features, algo):
def main(source_name, train_size, to_drop, all_categorical_features, max_features, algo, exp):
data, labels, class_names, feature_names, categorical_names = load_data(source_name, all_categorical_features)
train, test, labels_train, labels_test = train_test_split(data, labels, train_size=train_size, test_size=1-train_size, random_state=2)
......@@ -30,17 +32,19 @@ def main(source_name, train_size, to_drop, all_categorical_features, algo):
accuracy = evaluation(original_model.prob(test), labels_test, threshold_1)
print(accuracy)
features_contributions(original_model, train, feature_names, class_names, all_categorical_features, categorical_names)
print("###########\nExpOut ensemble's model \n###########")
ensemble = ensemble_out(algo,to_drop,train,labels_train, all_categorical_features)
threshold_2 = find_threshold(ensemble.prob(train), labels_train)
accuracy = evaluation(ensemble.prob(test), labels_test, threshold_2)
print(accuracy)
features_contributions(ensemble, train, feature_names, class_names, all_categorical_features, categorical_names)
is_fair ,_ = exp(original_model, train, max_features, to_drop, feature_names, class_names, all_categorical_features, categorical_names)
if not is_fair:
print("###########\nExpOut ensemble's model \n###########")
ensemble = ensemble_out(algo,to_drop,train,labels_train, all_categorical_features)
threshold_2 = find_threshold(ensemble.prob(train), labels_train)
accuracy = evaluation(ensemble.prob(test), labels_test, threshold_2)
print(accuracy)
is_fair ,_ = exp(ensemble, train, max_features, to_drop, feature_names, class_names, all_categorical_features, categorical_names)
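The change above only builds the ExpOut ensemble when the original model fails the fairness check of the chosen explainer. A schematic sketch of that gating, with stub functions standing in for the project's `fairness_eval` and `ensemble_out` (not the actual API):

```python
def fairness_eval_stub(model, sensitive_features):
    # Stand-in for lime_global / anchor_global fairness_eval: returns (is_fair, data).
    return not any(f in model["top_features"] for f in sensitive_features), []

def ensemble_out_stub(model, sensitive_features):
    # Stand-in for core.ensemble_out: rebuild the model without the sensitive features.
    return {"top_features": [f for f in model["top_features"]
                             if f not in sensitive_features]}

sensitive = {8, 18, 19}
model = {"top_features": [8, 2, 0]}      # original model leans on sensitive feature 8
is_fair, _ = fairness_eval_stub(model, sensitive)
if not is_fair:                          # build the ensemble only when needed
    model = ensemble_out_stub(model, sensitive)
    is_fair, _ = fairness_eval_stub(model, sensitive)
print(is_fair)  # True once the dependence on sensitive features is gone
```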
def algo_parser(algo_str):
......@@ -65,61 +69,35 @@ def algo_parser(algo_str):
return SVC
else:
return None
# AdaBoostClassifier
# BaggingClassifier
# BayesianGaussianMixture
# BernoulliNB
# CalibratedClassifierCV
# CategoricalNB
# ClassifierChain
# ComplementNB
# DecisionTreeClassifier
# DummyClassifier
# ExtraTreeClassifier
# ExtraTreesClassifier
# GaussianMixture
# GaussianNB
# GaussianProcessClassifier
# GradientBoostingClassifier
# GridSearchCV
# HistGradientBoostingClassifier
# KNeighborsClassifier
# LabelPropagation
# LabelSpreading
# LinearDiscriminantAnalysis
# LogisticRegression
# LogisticRegressionCV
# MLPClassifier
# MultiOutputClassifier
# MultinomialNB
# NuSVC
# OneVsRestClassifier
# Pipeline
# QuadraticDiscriminantAnalysis
# RFE
# RFECV
# RadiusNeighborsClassifier
# RandomForestClassifier
# RandomizedSearchCV
# SGDClassifier
# SVC
# StackingClassifier
# VotingClassifier
def exp_parser(algo_str):
algo = algo_str.lower()
if algo == "lime":
return lime_global.fairness_eval
elif algo == "anchors":
return anchor_global.fairness_eval
elif algo == "shap":
return None
else:
return None
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Description')
parser = argparse.ArgumentParser(description='ExpOut')
parser.add_argument('--data')
parser.add_argument('--trainsize', type=float)
parser.add_argument('--algo')
parser.add_argument('--exp')
parser.add_argument('--max_features', type=int)
parser.add_argument('-cat_features', '--cat_features', action='store', dest='cat_features_list', type=int, nargs='*', default=[], help="Examples: -i ")
parser.add_argument('-drop', '--drop', action='store', dest='drop_list', type=int, nargs='*', default=[], help="Examples: -i ")
args = parser.parse_args()
now = datetime.datetime.now()
print(now.year,'-', now.month,'-', now.day,',', now.hour,':', now.minute,':', now.second)
print(now.year,'-', now.month,'-', now.day,',', now.hour,':', now.minute,':', now.second,sep='')
main(args.data, args.trainsize, args.drop_list, args.cat_features_list, algo_parser(args.algo))
\ No newline at end of file
main(args.data, args.trainsize, args.drop_list, args.cat_features_list, args.max_features, algo_parser(args.algo), exp_parser(args.exp))
\ No newline at end of file