
Commit 62f52680 authored by ALVES Guilherme
Removed unnecessary files. Updated datasets parameters file.

parent 35ce914a
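# --- adult.data experiment: trains an MLPClassifier, runs LIME-based fairness
# --- evaluation (lime_global.fairness_eval), then builds an ExpOut ensemble with
# --- sex, race and marital status dropped and compares accuracies over 5 seeds.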
import datetime
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble._forest import RandomForestClassifier
from sklearn.ensemble._gb import GradientBoostingClassifier
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network._multilayer_perceptron import MLPClassifier
from sklearn.svm._classes import SVC
from sklearn.tree import DecisionTreeClassifier

from core import load_data, Model, evaluation, \
    find_threshold, remove, ensemble_out, train_classifier
from lime_global import fairness_eval

import os
import numpy as np

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
np.random.seed(10)


def main(seed):
    train_size = 0.8
    max_features = 4
    sample_size = 500

    algo = MLPClassifier
    print(algo.__name__)

    source_name = 'adult.data'
    to_drop = [5, 8, 9]  # sex, race, marital status
    all_categorical_features = [1, 3, 5, 6, 7, 8, 9, 13]

    data, labels, class_names, feature_names, categorical_names = load_data(source_name, all_categorical_features, delimiter=',')
    train, test, labels_train, labels_test = train_test_split(data, labels, train_size=train_size, test_size=1 - train_size, random_state=seed)

    print("Features")
    for i in range(len(feature_names)):
        f_name = feature_names[i]
        print(i, f_name, end=' ')
        if i in all_categorical_features:
            print("[c]", end=' ')  # categorical feature
        if i in to_drop:
            print("[s]", end=' ')  # sensitive feature
        print('')

    model, encoder = train_classifier(algo, train, test, labels_train, [], all_categorical_features)
    original_model = Model([model], [encoder], [[]])
    threshold_1 = find_threshold(original_model.prob(train), labels_train)
    print("Original model OK")
    fairness_eval(original_model, train, max_features, to_drop, feature_names, class_names, all_categorical_features, categorical_names, sample_size)

    ensemble = ensemble_out(algo, to_drop, train, test, labels_train, all_categorical_features)
    threshold_2 = find_threshold(ensemble.prob(train), labels_train)
    print("ExpOut ensemble's model OK")

    accuracy_original = evaluation(original_model.prob(test), labels_test, threshold_1)
    print("accuracy_original", accuracy_original)
    accuracy_ensemble = evaluation(ensemble.prob(test), labels_test, threshold_2)
    print("accuracy_ensemble", accuracy_ensemble)

    # debug: per-submodel accuracy of the ensemble
    for i in range(len(ensemble.models)):
        m = Model([ensemble.models[i]], [ensemble.encoders[i]], [ensemble.features_to_remove[i]])
        print("accuracy_m", i, ' ', evaluation(m.prob(test), labels_test, threshold_2), sep='')

    fairness_eval(ensemble, train, max_features, to_drop, feature_names, class_names, all_categorical_features, categorical_names, sample_size)


if __name__ == "__main__":
    now = datetime.datetime.now()
    print('adult mlp (100,100,)\n', now.year, '-', now.month, '-', now.day, ',', now.hour, ':', now.minute, ':', now.second, sep='')
    for i in range(5):
        print("experiment i=", i)
        main(i)
\ No newline at end of file
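# --- bank.data experiment: same pipeline with an SVC and anchor-based explanations
# --- (anchor_global.fairness_eval); threshold and accuracy reporting is commented out.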
import datetime
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble._forest import RandomForestClassifier
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network._multilayer_perceptron import MLPClassifier
from sklearn.svm._classes import SVC
from sklearn.tree import DecisionTreeClassifier

from anchor_global import fairness_eval
from core import load_data, Model, evaluation, \
    find_threshold, remove, ensemble_out, train_classifier


def main(seed):
    train_size = 0.8
    max_features = 10

    algo = SVC
    print(algo.__name__)

    source_name = 'bank.data'
    to_drop = [0, 2]  # age, marital
    all_categorical_features = [1, 2, 3, 4, 6, 7, 8, 10, 15]

    data, labels, class_names, feature_names, categorical_names = load_data(source_name, all_categorical_features, delimiter=";")
    train, test, labels_train, labels_test = train_test_split(data, labels, train_size=train_size, test_size=1 - train_size, random_state=seed)

    model, encoder = train_classifier(algo, train, test, labels_train, [], all_categorical_features)
    original_model = Model([model], [encoder], [[]])
    # threshold_1 = find_threshold(original_model.prob(train), labels_train)
    print("Original model OK")
    fairness_eval(original_model, train, max_features, to_drop, feature_names, class_names, all_categorical_features, categorical_names)

    ensemble = ensemble_out(algo, to_drop, train, test, labels_train, all_categorical_features)
    # threshold_2 = find_threshold(ensemble.prob(train), labels_train)
    print("ExpOut ensemble's model OK")

    # accuracy_original = evaluation(original_model.prob(test), labels_test, threshold_1)
    # print("accuracy_original", accuracy_original)
    # accuracy_ensemble = evaluation(ensemble.prob(test), labels_test, threshold_2)
    # print("accuracy_ensemble", accuracy_ensemble)

    fairness_eval(ensemble, train, max_features, to_drop, feature_names, class_names, all_categorical_features, categorical_names)


if __name__ == "__main__":
    now = datetime.datetime.now()
    print('bank\n', now.year, '-', now.month, '-', now.day, ',', now.hour, ':', now.minute, ':', now.second, sep='')
    for i in range(1):
        print("experiment i=", i)
        main(i)
\ No newline at end of file
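# --- german.data experiment: 30 random train/test splits comparing the accuracy
# --- of the original model against the ExpOut ensemble; it imports
# --- anchor_global.features_contributions but only prints accuracies.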
import datetime
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble._forest import RandomForestClassifier
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network._multilayer_perceptron import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from anchor_global import features_contributions
from core import load_data, train_classifier, Model, evaluation, \
    find_threshold, remove, ensemble_out, random_state


def main():
    train_size = 0.8
    test_size = 0.2

    algo = MLPClassifier
    print(algo.__name__)

    source_name = 'german.data'
    feature_names = ['existingchecking', 'duration', 'credithistory', 'purpose', 'creditamount', 'savings', 'employmentsince', 'installmentrate', 'statussex', 'otherdebtors', 'residencesince', 'property', 'age', 'otherinstallmentplans', 'housing', 'existingcredits', 'job', 'peopleliable', 'telephone', 'foreignworker', 'classification']
    to_drop = [8, 18, 19]  # statussex, telephone, foreignworker
    all_categorical_features = [0, 2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 18, 19]

    data, labels, class_names, feature_names, categorical_names = load_data(source_name, feature_names, all_categorical_features)

    for i in range(30):
        train, test, labels_train, labels_test = train_test_split(data, labels, train_size=train_size, test_size=test_size, random_state=i)

        model, encoder = train_classifier(algo, train, labels_train, [], all_categorical_features)
        original_model = Model([model], [encoder], [[]])
        threshold_1 = find_threshold(original_model.prob(train), labels_train)
        accuracy1 = evaluation(original_model.prob(test), labels_test, threshold_1)

        ensemble = ensemble_out(algo, to_drop, train, labels_train, all_categorical_features)
        threshold_2 = find_threshold(ensemble.prob(train), labels_train)
        accuracy2 = evaluation(ensemble.prob(test), labels_test, threshold_2)

        print(i, accuracy1, accuracy2)


if __name__ == "__main__":
    now = datetime.datetime.now()
    print(now.year, '-', now.month, '-', now.day, ',', now.hour, ':', now.minute, ':', now.second)
    main()
\ No newline at end of file
@@ -57,11 +57,12 @@ class FixOut:
         model, encoder = train_classifier(self.algo, self.train, self.test, self.labels_train, [], self.all_categorical_f, self.seed)
         self.original_model = Model([model],[encoder],[[]])
-        accuracy, _ = evaluation(self.original_model.prob(self.test), self.labels_test)
+        # accuracy = evaluation_fixed_threshold(self.original_model.prob(self.test), self.labels_test,0.5)
+        accuracy, threshold = evaluation(self.original_model.prob(self.test), self.labels_test)
         actual_sensitive, is_fair_flag ,ans_data, explainer = self.exp(self.original_model, self.train, self.max_features, self.sensitive_f, self.feature_names, self.class_names, self.all_categorical_f, self.categorical_names, self.sampling_size, self.threshold)
-        return actual_sensitive, is_fair_flag, ans_data, accuracy
+        return actual_sensitive, is_fair_flag, ans_data, accuracy, threshold

     def ensemble_out(self, actual_sensitive):
@@ -95,12 +96,13 @@ class FixOut:
         self.ensemble = Model(models,encoders,features_to_remove)
-        accuracy, _ = evaluation(self.ensemble.prob(self.test), self.labels_test)
+        accuracy, threshold = evaluation(self.ensemble.prob(self.test), self.labels_test)
+        # accuracy = evaluation_fixed_threshold(self.ensemble.prob(self.test), self.labels_test, 0.5)
-        # _, is_fair_flag ,ans_data, explainer = self.exp(self.ensemble, self.train, self.max_features, actual_sensitive, self.feature_names, self.class_names, self.all_categorical_f, self.categorical_names, self.sampling_size, self.threshold)
-        #
-        # return is_fair_flag, ans_data, accuracy
-        return False, None, accuracy
+        _, is_fair_flag ,ans_data, explainer = self.exp(self.ensemble, self.train, self.max_features, actual_sensitive, self.feature_names, self.class_names, self.all_categorical_f, self.categorical_names, self.sampling_size, self.threshold)
+        return is_fair_flag, ans_data, accuracy, threshold
+        # return False, None, accuracy

 class Model:
@@ -253,6 +255,11 @@ def evaluation(probs, true_labels):
     accuracy = accuracy_score(true_labels, to_labels(probs, thresholds[ix]))
     return accuracy, thresholds[ix]

+def evaluation_fixed_threshold(probs, true_labels, threshold):
+    probs = probs[:, 1]
+    return accuracy_score(true_labels, to_labels(probs, threshold))

 def fairness_metrics_eval(probs, true_labels, protected_attr, p_group, p_label):
     _probs = probs[:, 1]
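For context on these hunks: evaluation(probs, true_labels) now returns the accuracy together with the decision threshold it selected, and the new evaluation_fixed_threshold reuses a threshold chosen elsewhere. A minimal sketch of the threshold search this implies, assuming to_labels simply binarizes the positive-class probabilities and that the candidate grid is a guess:

import numpy as np
from sklearn.metrics import accuracy_score

def to_labels(pos_probs, threshold):
    # binarize positive-class probabilities at the given cut-off
    return (pos_probs >= threshold).astype(int)

def evaluation_sketch(probs, true_labels):
    # probs has shape (n, 2); column 1 is the positive-class probability,
    # matching the probs[:, 1] slicing in the real code
    pos_probs = probs[:, 1]
    thresholds = np.arange(0.0, 1.01, 0.01)  # assumed candidate grid
    scores = [accuracy_score(true_labels, to_labels(pos_probs, t)) for t in thresholds]
    ix = int(np.argmax(scores))
    return scores[ix], thresholds[ix]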
@@ -44,8 +44,11 @@ hdma = {
     # "all_categorical_features" : [0,1,2,3,5,10,12,13,14,19,20], # 15,16
     "sensitive" : [1,2,3], # "derived_race","derived_sex","derived_ethnicity"
     "priv_group" : {
+        "derived_sex" : 1, # Male ('Female', 'Male')
+        "derived_race" : 6, # White ('Black or African American', 'White', 'Asian', 'Native Hawaiian or Other Pacific Islander', 'American Indian or Alaska Native', '2 or more minority races', 'Race Not Available')
+        "derived_ethnicity" : 0 # Ethnicity Not Available ('Ethnicity Not Available', 'Hispanic or Latino', 'Joint', 'Not Hispanic or Latino')
     },
-    "pos_label" : 0
+    "pos_label" : 1 # high-priced
 }

 nypd_sqf = {
@@ -61,8 +64,10 @@ lsac = {
     "all_categorical_features" : [3,4,5],
     "sensitive" : [4,5],
     "priv_group" : {
+        "sex" : 1, # "Male" (0 means Female)
+        "race" : 2 # "White" (0 means Black and 1 means "Other")
     },
-    "pos_label" : 0
+    "pos_label" : 1 # Passed (0 is Failed_or_not_attempted)
 }

 default = {
@@ -71,8 +76,10 @@ default = {
     "all_categorical_features" : [1,2,3],
     "sensitive" : [1,3], # sex, marriage,
     "priv_group" : {
+        "SEX" : 0, # "1" male (X2: Gender (1 = male; 2 = female).)
+        "MARRIAGE" : 2 # "2" single (X4: Marital status (1 = married; 2 = single; 3 = others).)
     },
-    "pos_label" : 0
+    "pos_label" : 1 # This research employed a binary variable, default payment (Yes = 1, No = 0), as the response variable
 }

 comcrime = {
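These priv_group / pos_label entries feed the aif360 metrics (see fairness_metrics_eval above and the test script further down). Roughly, a dataset entry is consumed like this hypothetical wiring, with y_true/y_pred standing in for held-out labels and predictions indexed by the protected attribute:

import numpy as np
import pandas as pd
from aif360.sklearn.metrics.metrics import statistical_parity_difference

cfg = {"priv_group": {"SEX": 0}, "pos_label": 1}  # e.g. the `default` entry

# toy labels indexed by the protected attribute, as aif360's sklearn API expects
y_true = pd.DataFrame({'y_true': [1, 1, 0, 0], 'SEX': [0, 0, 1, 1]}).set_index('SEX')
y_pred = np.array([1, 0, 0, 1])

print(statistical_parity_difference(y_true, y_pred, prot_attr='SEX',
                                    priv_group=cfg["priv_group"]["SEX"],
                                    pos_label=cfg["pos_label"]))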
 import datetime

-from core import FixOut, evaluation, Model, fairness_metrics_eval
+from core import FixOut, evaluation, Model, fairness_metrics_eval, \
+    evaluation_fixed_threshold
 from datasets_param import *
+from runner import algo_parser, exp_parser

 import numpy as np
 import pandas as pd

-from runner import algo_parser, exp_parser

 def one_experiment(source_name, sep, train_size, to_drop, all_categorical_features, max_features, algo, exp, sampling_size, threshold_exp, p_groups, p_label):
     original_f_metrics = {}
     ensemble_f_metrics = {}

-    for i in range(0,2):
+    for i in range(0,20):
         fixout = FixOut(source_name, sep, train_size, to_drop, all_categorical_features, algo, exp, max_features, sampling_size, seed=i, threshold=threshold_exp)

-        actual_sensitive, is_fair1, contributions_original, accuracy_original = fixout.is_fair()
+        actual_sensitive, is_fair1, contributions_original, accuracy_original, threshold_orig = fixout.is_fair()
         print("~~~~~~~~~\nOriginal model DONE\n~~~~~~~~~")

         if not is_fair1:
-            is_fair2, contributions_ensemble, accuracy_ensemble = fixout.ensemble_out(actual_sensitive)
+            is_fair2, contributions_ensemble, accuracy_ensemble, threshold_ens = fixout.ensemble_out(actual_sensitive)
             print("Ensemble model DONE\n~~~~~~~~~")
         else:
             print("Original model is FAIR")
             print("~~~~~~~~~")

         print("#########\nRESULTS",i,"BEGIN\n#########")
-        print("Original model accuracy", accuracy_original)
+        print("Original model accuracy", accuracy_original, threshold_orig)
         print(contributions_original)

-        if is_fair1:
-            continue
-        print("Ensemble model accuracy", accuracy_ensemble)
+        if not is_fair1:
+            print("Ensemble model accuracy", accuracy_ensemble, threshold_ens)

             ensemble = fixout.ensemble
             for f in range(len(actual_sensitive)):
                 submodel = Model([ensemble.models[f]],[ensemble.encoders[f]],[actual_sensitive[f]])
-                accuracy, threshold = evaluation(submodel.prob(fixout.test), fixout.labels_test)
-                print("Ensemble Submodel accuracy", accuracy, threshold)
+                # accuracy, threshold = evaluation(submodel.prob(fixout.test), fixout.labels_test)
+                accuracy = evaluation_fixed_threshold(submodel.prob(fixout.test), fixout.labels_test, threshold_ens)
+                print("Ensemble Submodel accuracy", accuracy, threshold_ens)

             submodel = Model([ensemble.models[len(actual_sensitive)]],[ensemble.encoders[len(actual_sensitive)]],[actual_sensitive])
+            # accuracy = evaluation_fixed_threshold(submodel.prob(fixout.test), fixout.labels_test,threshold_orig)
             accuracy, threshold = evaluation(submodel.prob(fixout.test), fixout.labels_test)
             print("Ensemble Submodel accuracy", accuracy, threshold)
from aif360.datasets.adult_dataset import AdultDataset
from aif360.sklearn.metrics.metrics import disparate_impact_ratio, \
    equal_opportunity_difference, statistical_parity_difference, theil_index, \
    average_odds_difference, difference, generalized_fpr, specificity_score
from aif360.sklearn.utils import check_groups
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd


def main():
    ex = np.matrix([[1,1],[1,1],[0,0],[0,0]])
    y_true = pd.DataFrame(ex, columns=['y_true','x1'])
    y_true.set_index(['x1'], inplace=True)
    y_pred = np.array([1,1,0,0])
    probs = np.array([[0.2,0.8],[0.1,0.9],[0.6,0.4],[0.7,0.3]])
    p_class = 1

    print(statistical_parity_difference(y_true, y_pred, prot_attr='x1', priv_group=1, pos_label=p_class))  # Demographic Parity
    print(equal_opportunity_difference(y_true, y_pred, prot_attr='x1'))  # Equal Opportunity
    print(average_odds_difference(y_true, y_pred, prot_attr='x1'))  # Equal Odds ???
    print(disparate_impact_ratio(y_true, y_pred, prot_attr='x1'))  # Disparate Impact
    print(difference(accuracy_score, y_true, y_pred, prot_attr='x1'))
    theil_index(np.array([1,1,0,0]))  # Theil Index

    __difference_fpr(y_true, y_pred, prot_attr='x1', priv_group=1, pos_label=p_class)
    # print(difference(generalized_fpr, np.array([1,1,0,0]), probs[:,p_class], prot_attr='x1', pos_label=p_class))
    # print(difference(generalized_fpr, y_true, probs[:,p_class], prot_attr='x1'))
    # generalized_fpr(y_true, probs, pos_label=1), true_labels, probs)


def __difference_fpr(y_true, y_pred, prot_attr=None, priv_group=1, pos_label=1):
    print("true")
    groups, _ = check_groups(y_true, prot_attr)
    idx = (groups == priv_group)
    n_y_true = y_true['y_true'].to_numpy()
    priv_true = n_y_true[idx]
    unpriv_true = n_y_true[~idx]
    priv_pred = y_pred[idx]
    unpriv_pred = y_pred[~idx]
    tn, fp, fn, tp = confusion_matrix(priv_true, priv_pred).ravel()
    # NOTE: this second call overwrites the privileged-group counts above
    tn, fp, fn, tp = confusion_matrix(unpriv_true, unpriv_pred).ravel()
    #
    # u = generalized_fpr(unpriv, probs, pos_label)
    # p = generalized_fpr(priv, probs, pos_label)


if __name__ == "__main__":
    main()
# data = AdultDataset()
# # data = datasets.CompasDataset()
#
# X,y = data.features, data.labels
#
# res = np.concatenate((y,data.protected_attributes),axis=1)
#
# y_true = pd.DataFrame(res,columns=['y_true']+data.protected_attribute_names)
# y_true.set_index(data.protected_attribute_names, inplace=True)
#
# # Equal Accuracy
# # Equal Odds
#
# rf = RandomForestClassifier()
# rf.fit(X, y)
#
# y_pred = rf.predict(X)
#
# print(statistical_parity_difference(y_true, y_pred, prot_attr='sex')) # Demographic Parity
# print(equal_opportunity_difference(y_true, y_pred, prot_attr='sex')) # Equal Opportunity
# print(average_odds_difference(y_true, y_pred, prot_attr='sex')) # Equal Odds ???
# # theil_index() # Theil Index
#
# print("###########")
#
# print(statistical_parity_difference(y_true, y_pred, prot_attr='race')) # Demographic Parity
# print(equal_opportunity_difference(y_true, y_pred, prot_attr='race')) # Equal Opportunity
# print(average_odds_difference(y_true, y_pred, prot_attr='race')) # Equal Odds ???
# print(disparate_impact_ratio(y_true, y_pred, prot_attr='race')) # Disparate Impact
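# --- german.data LIME experiment: single split with a fixed global seed
# --- (global_seed = 2); the fairness_eval calls are commented out, so only
# --- accuracies are reported.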
import datetime
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble._forest import RandomForestClassifier
from sklearn.ensemble._gb import GradientBoostingClassifier
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network._multilayer_perceptron import MLPClassifier
from sklearn.svm._classes import SVC
from sklearn.tree import DecisionTreeClassifier

from core import load_data, Model, evaluation, \
    find_threshold, remove, ensemble_out, train_classifier
from lime_global import fairness_eval
from decimal import Decimal

global_seed = 2

# categorical_features1 = [0,2,3,5,6,8,10,12,13,15,17,18]
# categorical_features2 = [0,2,3,5,6,8,9,11,13,14,16,18]
# categorical_features3 = [0,2,3,5,6,8,9,11,13,14,16,18]
# categorical_features4 = [0,2,3,5,6,8,10,12,13,15]


def main(seed):
    train_size = 0.7
    test_size = 0.3
    max_features = 10
    sample_size = 500

    algo = MLPClassifier  # hidden_layer_sizes=(100,100,)
    print(algo.__name__)

    source_name = 'german.data'
    to_drop = [8, 18, 19]
    all_categorical_features = [0, 2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 18, 19]

    data, labels, class_names, feature_names, categorical_names = load_data(source_name, all_categorical_features)
    train, test, labels_train, labels_test = train_test_split(data, labels, train_size=train_size, test_size=test_size, random_state=global_seed)

    model, encoder = train_classifier(algo, train, test, labels_train, [], all_categorical_features)
    original_model = Model([model], [encoder], [[]])
    threshold_1 = find_threshold(original_model.prob(test), labels_test)
    print("Original model OK")
    # fairness_eval(original_model, train, max_features, to_drop, feature_names, class_names, all_categorical_features, categorical_names, sample_size)

    ensemble = ensemble_out(algo, to_drop, train, test, labels_train, all_categorical_features)
    threshold_2 = find_threshold(ensemble.prob(test), labels_test)
    print("ExpOut ensemble's model OK")

    accuracy_original = evaluation(original_model.prob(test), labels_test, threshold_1)
    print("accuracy_original", accuracy_original)
    accuracy_ensemble = evaluation(ensemble.prob(test), labels_test, threshold_2)
    print("accuracy_ensemble", accuracy_ensemble)
    # fairness_eval(ensemble, train, max_features, to_drop, feature_names, class_names, all_categorical_features, categorical_names, sample_size)


if __name__ == "__main__":
    now = datetime.datetime.now()
    print('german MLP(100,100,) solver=lbfgs \n', now.year, '-', now.month, '-', now.day, ',', now.hour, ':', now.minute, ':', now.second, sep='')
    for i in range(1):
        print("experiment i=", i)
        main(i)
\ No newline at end of file
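# --- pre-processing script: encodes one training batch for the full feature set,
# --- one per individually dropped feature, and one with all of to_drop removed,
# --- then pickles everything for reuse.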
import pickle

from sklearn.model_selection._split import train_test_split

from core import load_data, pre_process, remove


def main(params):
    to_drop = params['remove_features']
    all_categorical_f = params['all_categorical_features']

    data, labels, class_names, feature_names, categorical_names = load_data(params['source_name'], all_categorical_f, delimiter=params['delimiter'])
    train, test, labels_train, labels_test = train_test_split(data, labels, train_size=params['train_size'], test_size=1 - params['train_size'], random_state=2)

    batchs = []

    # full feature set
    train_res, labels_train_res, encoder = pre_process(train, test, labels_train, [], all_categorical_f)
    batchs.append((train_res, labels_train_res, encoder))

    # each sensitive feature dropped individually
    for feature in to_drop:
        categorical_features = remove([feature], all_categorical_f)
        train_res, labels_train_res, encoder = pre_process(train, test, labels_train, [feature], categorical_features)
        batchs.append((train_res, labels_train_res, encoder))

    # all sensitive features dropped at once
    categorical_features = remove(to_drop, all_categorical_f)
    train_res, labels_train_res, encoder = pre_process(train, test, labels_train, to_drop, categorical_features)
    batchs.append((train_res, labels_train_res, encoder))

    res = {
        "batchs" : batchs,
        "labels_test" : labels_test,
        "class_names" : class_names,
        "to_drop" : to_drop,
        "all_categorical_f" : all_categorical_f,
        "feature_names" : feature_names,
        "categorical_names" : categorical_names
    }
    pickle.dump(res, open(params['source_name'] + ".p", "wb"))


if __name__ == "__main__":
    adult = {
        "source_name" : "adult.data",
        "all_categorical_features" : [1, 3, 5, 6, 7, 8, 9, 13],
        "delimiter" : ",",
        "remove_features" : [5, 8, 9],
        "train_size" : 0.8
    }
    main(adult)
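# The dump can be reloaded the obvious way (the file name follows the
# source_name + ".p" convention above):
#
#     import pickle
#     with open("adult.data.p", "rb") as f:
#         res = pickle.load(f)
#     print(len(res["batchs"]), "pre-processed batches")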
#--data adult.data
#--trainsize 0.8
#--algo logreg
#--max_features 10
#--cat_features 1 3 5 6 7 8 9 13
#--drop 5 8 9
#--exp lime
#--samplesize 50
#--sep ,
\ No newline at end of file
@@ -14,7 +14,7 @@ import anchor_global
 from core import FixOut
 import lime_global
 import shap_global
-from sklearn.tree.tree import DecisionTreeClassifier
+from sklearn.tree import DecisionTreeClassifier

 def main(source_name, sep, train_size, to_drop, all_categorical_features, max_features, algo, exp, sampling_size):