
Commit 0050dff9 authored by BERNIER Fabien

[+] Module stuff

parent 943f6879
import sys; sys.path.extend(['..'])
import datetime

import numpy as np
import pandas as pd

from fixout.core import FixOut, evaluation, Model, evaluation_fixed_threshold
from examples.datasets_param import *
from fairmodels import ModelProb, FairnessObject
from runner import algo_parser, exp_parser
def one_experiment(source_name, sep, train_size, to_drop, all_categorical_features,
                   max_features, algo, exp, sampling_size, threshold_exp, p_groups, p_label):
    original_f_metrics = {}
    ensemble_f_metrics = {}
    for i in range(1):
        fixout = FixOut(source_name, sep, train_size, to_drop, all_categorical_features,
                        algo, exp, max_features, sampling_size, seed=i, threshold=threshold_exp)
        actual_sensitive, is_fair1, contributions_original, accuracy_original, threshold_orig = fixout.is_fair()
        print("~~~~~~~~~\nOriginal model DONE\n~~~~~~~~~")
        if not is_fair1:
            is_fair2, contributions_ensemble, accuracy_ensemble, threshold_ens = fixout.ensemble_out(actual_sensitive)
            print("Ensemble model DONE\n~~~~~~~~~")
        else:
            print("Original model is FAIR")
            print("~~~~~~~~~")
        print("#########\nRESULTS", i, "BEGIN\n#########")
        print("Original model accuracy", accuracy_original, threshold_orig)
        print(contributions_original)
        if is_fair1:
            continue  # nothing more to report for a fair original model
        print("Ensemble model accuracy", accuracy_ensemble, threshold_ens)
        ensemble = fixout.ensemble
        # evaluate each sub-model (one per dropped sensitive feature) at the ensemble threshold
        for f in range(len(actual_sensitive)):
            submodel = Model([ensemble.models[f]], [ensemble.encoders[f]], [actual_sensitive[f]])
            # accuracy, threshold = evaluation(submodel.prob(fixout.test), fixout.labels_test)
            accuracy = evaluation_fixed_threshold(submodel.prob(fixout.test), fixout.labels_test, threshold_ens)
            print("Ensemble Submodel accuracy", accuracy, threshold_ens)
        # the last sub-model is the one with all sensitive features dropped
        submodel = Model([ensemble.models[len(actual_sensitive)]], [ensemble.encoders[len(actual_sensitive)]], [actual_sensitive])
        # accuracy = evaluation_fixed_threshold(submodel.prob(fixout.test), fixout.labels_test, threshold_orig)
        accuracy, threshold = evaluation(submodel.prob(fixout.test), fixout.labels_test)
        print("Ensemble Submodel accuracy", accuracy, threshold)
        print(contributions_ensemble)
        print("#########\nRESULTS", i, "END\n#########")
        #############
        y = np.array([[value] for value in fixout.labels_test])
        res = np.concatenate((y, fixout.test[:, actual_sensitive]), axis=1)
        sensitive_features_names = list(np.array(fixout.feature_names)[actual_sensitive])
        true_labels = pd.DataFrame(res, columns=['y_true'] + sensitive_features_names)
        true_labels.set_index(sensitive_features_names, inplace=True)
        print("\n\n****************\nFAIRNESS METRICS\n****************")
        test_input = pd.DataFrame(fixout.test.astype(np.uint8), columns=fixout.feature_names)
        original_predictions = ModelProb(fixout.original_model.predict_proba(fixout.test)[:, 1], threshold=0.5, name="Original")
        fixout_predictions = ModelProb(fixout.ensemble.predict_proba(fixout.test)[:, 1], threshold=0.5, name="Ensemble")
        fobject = FairnessObject(
            model_probs=[original_predictions, fixout_predictions],
            y=fixout.labels_test,
            protected=test_input.statussex,
            privileged=0
        )
        fplt = fobject.plot()
        fplt.show()
        densityplt = fobject.plot_density()
        densityplt.show()
        hmplt = fobject.plot_fairness_heatmap()
        hmplt.show()
        radarplt = fobject.plot_fairness_radar()
        radarplt.show()
        print(fobject.parity_loss_metric_data)
"""
for sen_feature in sensitive_features_names:
dp,eq,ea,aod,fpr_diff,di = fairness_metrics_eval(fixout.original_model.prob(fixout.test), true_labels, sen_feature, p_groups[str(sen_feature)], p_label)
if sen_feature not in original_f_metrics:
res = []
original_f_metrics[sen_feature] = res
original_f_metrics[sen_feature].append((dp,eq,ea,aod,fpr_diff,di))
for sen_feature in sensitive_features_names:
dp,eq,ea,aod,fpr_diff,di = fairness_metrics_eval(fixout.ensemble.prob(fixout.test), true_labels, sen_feature, p_groups[sen_feature], p_label)
if sen_feature not in ensemble_f_metrics:
res = []
ensemble_f_metrics[sen_feature] = res
ensemble_f_metrics[sen_feature].append((dp,eq,ea,aod,fpr_diff,di))
print("ORIGINAL MODEL")
for k,v in original_f_metrics.items():
dp_l,eq_l,ea_l,eo_l,fpr_diff_l,di_l = [dp for dp,_,_,_,_,_ in v], [eq for _,eq,_,_,_,_ in v], [ea for _,_,ea,_,_,_ in v], [eo for _,_,_,eo,_,_ in v], [fpr_diff for _,_,_,_,fpr_diff,_ in v], [di for _,_,_,_,_,di in v]
print("Sensitive Feature:", k)
print(np.mean(dp_l),np.mean(eq_l),np.mean(ea_l),np.mean(eo_l),np.mean(fpr_diff_l),np.mean(di_l))
print(np.std(dp_l),np.std(eq_l),np.std(ea_l),np.std(eo_l),np.std(fpr_diff_l),np.std(di_l))
print("FIXOUT ENSEMBLE MODEL")
for k,v in ensemble_f_metrics.items():
dp_l,eq_l,ea_l,eo_l,fpr_diff_l,di_l = [dp for dp,_,_,_,_,_ in v], [eq for _,eq,_,_,_,_ in v], [ea for _,_,ea,_,_,_ in v], [eo for _,_,_,eo,_,_ in v], [fpr_diff for _,_,_,_,fpr_diff,_ in v], [di for _,_,_,_,_,di in v]
print("Sensitive Feature:", k)
print(np.mean(dp_l),np.mean(eq_l),np.mean(ea_l),np.mean(eo_l),np.mean(fpr_diff_l),np.mean(di_l))
print(np.std(dp_l),np.std(eq_l),np.std(ea_l),np.std(eo_l),np.std(fpr_diff_l),np.std(di_l))
"""
def main(general_param, data_param):
    # for t in [0.1, 0.2, 0.3, 0.4, 0.5]:
    #     print('threshold =', t)
    #     param["threshold"] = t
    one_experiment(data_param["source_name"], data_param["sep"], general_param["train_size"],
                   data_param["sensitive"], data_param["all_categorical_features"],
                   general_param["max_features"], algo_parser(general_param["algo"]),
                   exp_parser(general_param["exp"]), general_param["sample_size"],
                   general_param["threshold"], data_param['priv_group'], data_param['pos_label'])

general_parameters = {
    "train_size": 0.7,
    "algo": "logreg",
    "exp": "lime",
    "max_features": 10,
    "sample_size": 500,
    "threshold": None
}
if __name__ == "__main__":
    dataset_param = german
    now = datetime.datetime.now()
    print(now.year, '-', now.month, '-', now.day, ',', now.hour, ':', now.minute, ':', now.second, sep='')
    print("Data Parameters:", dataset_param)
    print("General Parameters:", general_parameters)
    main(general_parameters, dataset_param)
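
For reference, the script reads the keys below from each entry of examples.datasets_param (here german). A hypothetical minimal entry, with illustrative values only, might look like this:

# Hypothetical sketch of a datasets_param entry; the real values live in
# examples/datasets_param.py (path, separator and indexes are illustrative).
german = {
    "source_name": "datasets/german.data",       # assumed data file location
    "sep": " ",                                  # column separator
    "sensitive": [8, 18, 19],                    # indexes of features to drop/watch (illustrative)
    "all_categorical_features": [0, 2, 3, 5],    # indexes of categorical features (illustrative)
    "priv_group": {"statussex": 1},              # privileged group per sensitive feature name
    "pos_label": 1,                              # positive class label
}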
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import pandas as pd  # missing in the original snippet, needed for read_csv below

from fixout.lime_tabular_global import TabularExplainer
from fixout.core_tabular import EnsembleOutTabular
from fixout.utils import columns_preprocessers, transform_categorical_names

# load the data and convert it to numpy
data = pd.read_csv("/home/fabien/Documents/Orpa/fixout-demo/datasets/adult.data")
features = data.columns[:-1]
X = data.drop(columns="Target").to_numpy()
y = data["Target"].to_numpy()

# preprocess the data with respect to their type (categorical/numerical)
categorical = [1, 3, 5, 6, 7, 8, 9, 13]
categories = transform_categorical_names(X, categorical, feature_names=features)
ct = columns_preprocessers(X, categorical, categorical_preprocesser=OneHotEncoder())

# split the data and train a model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lr = LogisticRegression()
model = make_pipeline(ct, lr)
model.fit(X_train, y_train)
print("Original score:", model.score(X_test, y_test))

# explain the original model
explainer_original = TabularExplainer(model.predict_proba, X_train, categorical_features=categorical)
explainer_original.global_explanation(n_samples=200)
for i, contrib in explainer_original.get_top_k(k=10):
    print(features[i], '\t', contrib)

# make an ensemble
ensemble = EnsembleOutTabular(model, sensitive_features=(5, 8, 9))
ensemble.fit(X_train, y_train)
print("Ensemble score:", ensemble.score(X_test, y_test))

# explain the ensemble
explainer_ensemble = TabularExplainer(ensemble.predict_proba, X_train, categorical_features=categorical)
explainer_ensemble.global_explanation(n_samples=200)
for i, contrib in explainer_ensemble.get_top_k(k=10):  # was explainer_original: fixed to report the ensemble
    print(features[i], '\t', contrib)
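
In the Adult data loaded above, indexes (5, 8, 9) correspond to the marital-status, race and sex columns. If the header names are known, the indexes can also be looked up instead of hard-coded; a short sketch, assuming the same data DataFrame and illustrative column names:

# Look up the sensitive column indexes by name instead of hard-coding them
# (the names below are assumptions about the CSV header, not taken from this repo).
sensitive_names = ["Marital Status", "Race", "Sex"]
sensitive_idx = tuple(data.columns.get_loc(name) for name in sensitive_names)
ensemble_by_name = EnsembleOutTabular(model, sensitive_features=sensitive_idx)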
"""
Implements the main procedures to build fairer ensembles, e.g. feature drop out, model training, ensemble bulding
"""
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from sklearn.metrics import accuracy_score
from sklearn.base import ClassifierMixin
from fixout.lime_tabular_global import TabularExplainer
from copy import deepcopy
import numpy as np
class FixOutTabular:
def __init__(self, model, X, sensitives, explainer="lime", top_k=10, sampling_size=None):
if type(explainer) == str :
if explainer == "lime" :
self.explainer = TabularExplainer(model.predict_proba, X)
else :
self.explainer = explainer
self.model = model
self.X = X
self.sensitives = sensitives
self.actual_sensitive = []
self.top_k = top_k
if sampling_size is None :
self.sampling_size = len(X)//10
else :
self.sampling_size = sampling_size
def explain(self, X):
exp = self.explainer.global_explanation(X)
top = sorted(enumerate(exp), key=lambda x:x[1], reverse=True)[:self.top_k]
top_features = set(map(lambda x:x[0], top))
actual_sensitive = top_features & set(self.sensitives)
self.actual_sensitive = list(actual_sensitive)
self.is_fair = len(actual_sensitive) < 2
return top
def get_ensemble(self):
if len(self.actual_sensitive) < 2 :
return None
return EnsembleOutTabular(self.model, sensitive_features=self.actual_sensitive)
class EnsembleOutTabular(ClassifierMixin):
    """Class for ensemble models.

    Keeps one copy of the base classifier per dropped sensitive feature (plus one
    copy with all of them dropped) and averages their predicted probabilities.

    Args:
        base_model: the classifier to replicate (copied and re-fitted per sub-model)
        sensitive_features: indexes of the features to drop out
        auto_threshold: if True, tune the decision threshold on the training data
    """

    def __init__(self, base_model, sensitive_features=(), auto_threshold=False):
        self.models = [base_model]
        # one sub-model per sensitive feature, plus one dropping all of them
        self.sensitives = [[i] for i in sensitive_features] \
            + ([list(sensitive_features)] if len(sensitive_features) > 1 else [])
        self.threshold = 0.5
        self.dtype = np.uint8
        self.auto_threshold = auto_threshold

    def fit(self, X, y):
        try:
            self.dtype = y.dtype
        except AttributeError:
            pass
        base_model = self.models.pop(0)
        for features in self.sensitives:
            X_train = np.array(X)
            X_train[:, features] = 0  # drop out the sensitive feature(s)
            model = deepcopy(base_model)
            model.fit(X_train, y)
            self.models.append(model)
        if self.auto_threshold:
            self._adjust_threshold(X, y)

    def _adjust_threshold(self, X, y):
        thresholds = np.arange(0, 1, 0.001)
        # score each candidate threshold on the training data
        scores = [accuracy_score(y, self.predict(X, threshold=t)) for t in thresholds]
        self.threshold = thresholds[np.argmax(scores)]

    def predict_proba(self, X):
        """Returns the probability of each class label, averaged over the sub-models."""
        probs = []
        for model, features in zip(self.models, self.sensitives):
            X_ = np.array(X)
            X_[:, features] = 0
            probs.append(model.predict_proba(X_))
        return np.array(probs).mean(axis=0)

    def predict(self, X, threshold=None):
        if threshold is None:
            threshold = self.threshold
        return (self.predict_proba(X)[:, 1] > threshold).astype(self.dtype)
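
A minimal usage sketch of EnsembleOutTabular on synthetic data (the data and the LogisticRegression base model are illustrative assumptions):

# Minimal sketch: synthetic data, logistic regression as the base model (assumptions).
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = rng.integers(0, 5, size=(200, 6)).astype(float)
y = rng.integers(0, 2, size=200)

ens = EnsembleOutTabular(LogisticRegression(), sensitive_features=(2, 4), auto_threshold=True)
ens.fit(X, y)                    # trains 3 sub-models: drop 2, drop 4, drop both
print(ens.predict_proba(X[:3]))  # averaged class probabilities, shape (3, 2)
print(ens.predict(X[:3]))        # thresholded labels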
@@ -21,7 +21,7 @@ class FixOutText:
         self.exp = exp
         self.X = X
-        le= LabelEncoder()
+        le = LabelEncoder()
         self.y = le.fit_transform(y)
         self.class_names = le.classes_
"""
Implements LIME_Global. Verifies if sensitives features have high contributions.
"""
import numpy as np
from fixout.lime.lime_tabular import LimeTabularExplainer
class TabularExplainer:
def __init__(self, predict_fn, X_data, **kwargs):
self.predict_fn = predict_fn
self.X_data = X_data
self.explainer = LimeTabularExplainer(X_data, **kwargs)
self.explanations = None
def explain_instances(self, X):
explanations = []
for instance in X:
exp = self.explainer.explain_instance(
instance,
self.predict_fn,
num_features=len(instance)
).local_exp[1]
local_exp = np.zeros((1, len(exp)))
for i, v in exp:
local_exp[0,i] = v
explanations.append(local_exp)
return np.concatenate(explanations)
def global_explanation(self, X_data=None, n_samples=5000):
if X_data is None:
X_data = self.X_data
I = np.arange(len(X_data))
np.random.shuffle(I)
I = I[:n_samples]
X = X_data[I]
explanations = self.explain_instances(X)
self.explanations = np.mean(explanations, axis=0)
return self
def get_top_k(self, k=10):
if self.explanations is None:
return
top = sorted(enumerate(self.explanations), key=lambda x: abs(x[1]), reverse=True)[:k]
return top
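
Note that global_explanation samples instances with NumPy's global random state, so explanations vary between runs; seeding makes them reproducible. A sketch, assuming a fitted model and its training matrix X_train:

# Reproducible global explanation: seed the global RNG before sampling
# (model and X_train are assumed to come from earlier training code).
np.random.seed(42)
explainer = TabularExplainer(model.predict_proba, X_train)
explainer.global_explanation(n_samples=200)
for i, contrib in explainer.get_top_k(k=5):
    print(i, contrib)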
# fixout/utils.py
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder


def columns_preprocessers(X, categorical_indexes, categorical_preprocesser,
                          numerical_preprocesser=None, remainder="passthrough"):
    """Builds and fits a ColumnTransformer for the categorical columns, plus an
    optional second one for the numerical columns."""
    ct = ColumnTransformer([("ohe", categorical_preprocesser, categorical_indexes)], remainder=remainder)
    ct.fit(X)
    if numerical_preprocesser is not None:
        numerical = list(set(range(len(X[0]))) - set(categorical_indexes))
        scaler = ColumnTransformer([("scaler", numerical_preprocesser, numerical)], remainder=remainder)
        scaler.fit(X)
        return ct, scaler
    return ct


def transform_categorical_names(X, categorical_indexes, feature_names=None):
    """Label-encodes the categorical columns of X in place and returns a mapping
    from feature name (or index) to the original category names."""
    le = LabelEncoder()
    names = {}
    for i in categorical_indexes:
        X[:, i] = le.fit_transform(X[:, i])
        k = i if feature_names is None else feature_names[i]
        names[k] = le.classes_
    return names
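
A short sketch of how the two helpers combine on a toy matrix (data and indexes are illustrative):

# Illustrative use of the two helpers on a toy object-dtype matrix.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["a", 1.0], ["b", 2.0], ["a", 3.0]], dtype=object)
names = transform_categorical_names(X, [0], feature_names=["letter", "value"])
print(names)  # {'letter': array(['a', 'b'], dtype=object)}

ct = columns_preprocessers(X, [0], categorical_preprocesser=OneHotEncoder())
print(ct.transform(X))  # one-hot column(s) followed by the passthrough column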