Commit f8975b96 authored by ALVES Guilherme's avatar ALVES Guilherme
Browse files

First commit

Main functions and classes
parent 71e27fe9
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>expout</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.python.pydev.PyDevBuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.python.pydev.pythonNature</nature>
</natures>
</projectDescription>
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?eclipse-pydev version="1.0"?><pydev_project>
<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
<path>/${PROJECT_DIR_NAME}</path>
</pydev_pathproperty>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python interpreter</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
</pydev_project>
This diff is collapsed.
This diff is collapsed.
from collections import Counter
from anchor import anchor_tabular
import lime_global
def features_contributions(model, train, feature_names, class_names, categorical_features, categorical_names, kernel_width=3):
    """Print anchor explanations for the instances selected by LIME's
    submodular pick, then a per-feature count of how often each feature
    appears in the produced anchors (most frequent first).
    """
    # Let LIME's submodular pick choose which training instances to explain.
    _, sp_obj = lime_global.features_contributions(model.prob, train, feature_names, class_names, categorical_features, categorical_names, kernel_width)
    picked = sp_obj.indices
    a_explainer = anchor_tabular.AnchorTabularExplainer(class_names, feature_names, train, categorical_names=categorical_names)
    feature_counts = Counter()
    for i in picked:
        exp = a_explainer.explain_instance(train[i], model.predict, threshold=0.95)
        print(i,'%.2f' % exp.precision(),' %.2f' % exp.coverage(), '%s' % (' AND '.join(exp.names())))
        feature_counts.update(Counter(exp.exp_map['feature']))
    # Report features by anchor frequency, most used first.
    for key, value in sorted(feature_counts.items(), key=lambda kv: kv[1], reverse=True):
        print(feature_names[key],"\t",value)
"""
Implements the main procedures to build fairer ensembles, e.g. feature drop out, model training, ensemble building
"""
import os
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.preprocessing._encoders import OneHotEncoder
from sklearn.preprocessing._label import LabelEncoder
import numpy as np
import pandas as pd
os.environ['KMP_DUPLICATE_LIB_OK']='True'
# np.random.seed(10)
def load_data(source_name, categorical_features, feature_names=None, delimiter=' '):
    """
    Loads data from a text file source.

    Parameters
        source_name: path of the data file
        categorical_features: indices of the categorical columns in X
        feature_names: optional column names; when None they are taken from
            the file's header row
        delimiter: field separator passed to the CSV reader

    Returns
        data: feature matrix as floats (categoricals label-encoded)
        labels: class labels encoded to 0..n-1 (taken from the last column)
        class_names: original class label values
        feature_names: feature column names (label column removed)
        categorical_names: {feature index: original category values}
    """
    # BUG FIX: the delimiter parameter was previously ignored — both read_csv
    # calls hardcoded ' ' — so non-space-separated files could not be loaded.
    if feature_names is not None:
        data = pd.read_csv(source_name, names=feature_names, delimiter=delimiter)
    else:
        # No names supplied: read them from the header row.
        data = pd.read_csv(source_name, header=0, delimiter=delimiter)
        feature_names = data.columns.values.tolist()
    data = data.to_numpy()
    # The last column holds the class label; encode it to integer classes.
    labels = data[:, -1]
    le = LabelEncoder()
    le.fit(labels)
    labels = le.transform(labels)
    class_names = le.classes_
    data = data[:, :-1]
    feature_names = feature_names[:-1]
    # Label-encode every categorical feature, keeping the original values
    # so explanations can show human-readable category names.
    categorical_names = {}
    for feature in categorical_features:
        le = LabelEncoder()
        le.fit(data[:, feature])
        data[:, feature] = le.transform(data[:, feature])
        categorical_names[feature] = le.classes_
    data = data.astype(float)
    return data, labels, class_names, feature_names, categorical_names
def train_classifier(algo, data, labels, remove_features, categorical_features):
    """Drop the given feature columns, one-hot encode the categoricals,
    rebalance the classes with SMOTE and fit a fresh `algo` classifier.

    Returns the fitted model together with its fitted encoder.
    """
    reduced = np.delete(data, remove_features, axis=1)
    encoder = ColumnTransformer(
        [('one_hot_encoder', OneHotEncoder(categories='auto'), categorical_features)],
        remainder='passthrough'
    )
    encoded_train = encoder.fit_transform(reduced)
    # Oversample the minority class so the classifier sees balanced data.
    oversampler = SMOTE(sampling_strategy='auto')
    # NOTE(review): imblearn renamed fit_sample -> fit_resample in 0.6;
    # confirm the pinned imbalanced-learn version before upgrading.
    balanced_X, balanced_y = oversampler.fit_sample(encoded_train, labels)
    model = algo()
    model.fit(balanced_X, balanced_y)
    return model, encoder
def ensemble_out(algo, to_drop, train, labels_train, all_categorical_features):
    """
    Implements ENSEMBLE_Out.

    Trains one classifier per single feature in `to_drop` (that feature
    removed), plus one final classifier with every feature in `to_drop`
    removed at once, and wraps them all in a Model ensemble.

    Parameters
        algo: class of a classification algorithm
        to_drop: list of features that must be dropped
        train: X
        labels_train: y
        all_categorical_features: list of indices of categorical features
    """
    models, encoders, features_to_remove = [], [], []
    # One singleton drop set per feature, then all of them together.
    drop_sets = [[feature] for feature in to_drop] + [to_drop]
    for dropped in drop_sets:
        # Re-index the categorical features for the reduced feature matrix.
        kept_categorical = remove(dropped, all_categorical_features)
        clf, enc = train_classifier(algo, train, labels_train, dropped, kept_categorical)
        models.append(clf)
        encoders.append(enc)
        features_to_remove.append(dropped)
    return Model(models, encoders, features_to_remove)
class Model:
    """Ensemble of trained classifiers.

    Each member classifier is paired with the fitted encoder that prepares
    its input and with the list of feature indices that must be deleted
    from X before that classifier can be applied.

    Args:
        models: a list of trained classifiers
        encoders: a list of fitted encoders (i-th encoder belongs to the i-th model)
        features_to_remove: per-model lists of column indices to drop from X
    """

    def __init__(self, models, encoders, features_to_remove):
        self.models = models
        self.encoders = encoders
        self.features_to_remove = features_to_remove

    def prob(self, X):
        """
        Returns probability for each class label, averaged over all members.
        """
        member_probs = []
        for clf, enc, dropped in zip(self.models, self.encoders, self.features_to_remove):
            # Apply the member's feature-removal mapping, then its encoder.
            reduced = np.delete(X, dropped, axis=1)
            member_probs.append(clf.predict_proba(enc.transform(reduced)).astype(float))
        return sum(member_probs) / len(self.models)

    def predict(self, X):
        """
        Returns a class label for each instance (positive-class prob >= 0.5).
        """
        positive = self.prob(X)[:, 1]
        return to_labels(positive, 0.5)
def remove(target_list, categorical_features):
    """
    Returns the indices of categorical features that must be kept.

    Drops every categorical feature listed in target_list and shifts the
    remaining indices down to account for the deleted columns, so the
    result indexes the reduced feature matrix. Assumes target_list is in
    ascending order.
    """
    kept = categorical_features
    for shift, target in enumerate(target_list):
        # Index of the target column after the previous deletions.
        pivot = target - shift
        kept = [idx if idx < pivot else idx - 1 for idx in kept if idx != pivot]
    return kept
def to_labels(pos_probs, threshold):
    """Binarize probabilities: 1 where pos_probs >= threshold, else 0."""
    hits = pos_probs >= threshold
    return hits.astype('int')
def find_threshold(train_probs, labels_train):
    """Grid-search (step 0.001) the decision threshold that maximizes the
    F1 score of the positive class on the training set."""
    positive = train_probs[:, 1]
    candidates = np.arange(0, 1, 0.001)
    # Score every candidate threshold and keep the best one.
    f1_per_threshold = [f1_score(labels_train, to_labels(positive, t)) for t in candidates]
    best = np.argmax(f1_per_threshold)
    return candidates[best]
def evaluation(probs, labels_test, threshold):
    """Return the accuracy obtained when the positive-class probabilities
    are thresholded at `threshold`."""
    predicted = to_labels(probs[:, 1], threshold)
    return accuracy_score(labels_test, predicted)
# def train_classifier(algo, data, labels, remove_features, categorical_features):
#
# data = np.delete(data, remove_features, axis = 1)
#
# encoder = ColumnTransformer(
# [('one_hot_encoder', OneHotEncoder(categories='auto'), categorical_features)],
# remainder='passthrough'
# )
#
# train, test, labels_train, labels_test = train_test_split(data, labels, train_size=train_size, test_size=test_size, random_state=random_state)
#
# sm = SMOTE(sampling_strategy='auto')
# train_res, labels_train_res = sm.fit_sample(train, labels_train)
#
# data = []
# data = np.concatenate([train_res, test])
#
# encoder = encoder.fit(train_res)
# encoded_train = encoder.transform(train_res)
#
# model = algo()
# model.fit(encoded_train.toarray(), labels_train_res)
#
# return model, encoder
# def train_classifier_original(algo, data, labels, remove_features, categorical_features):
#
# data = np.delete(data, remove_features, axis = 1)
#
# encoder = ColumnTransformer(
# [('one_hot_encoder', OneHotEncoder(categories='auto'), categorical_features)],
# remainder='passthrough'
# )
# encoder.fit(data)
#
# train, test, labels_train, labels_test = train_test_split(data, labels, train_size=train_size, test_size=test_size, random_state=random_state)
#
# sm = SMOTE(sampling_strategy='auto')
# train_res, labels_train_res = sm.fit_sample(train, labels_train)
#
# data = []
# data = np.concatenate([train_res, test])
#
# encoder = encoder.fit(data)
# encoded_train = encoder.transform(train_res)
#
# model = algo()
# model.fit(encoded_train.toarray(), labels_train_res)
#
# return model, encoder
This diff is collapsed.
import datetime
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble._forest import RandomForestClassifier
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network._multilayer_perceptron import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from anchor_global import features_contributions
from core import load_data, Model, evaluation, \
find_threshold, remove, ensemble_out, train_classifier
# categorical_features1 = [0,2,3,5,6,8,10,12,13,15,17,18]
# categorical_features2 = [0,2,3,5,6,8,9,11,13,14,16,18]
# categorical_features3 = [0,2,3,5,6,8,9,11,13,14,16,18]
# categorical_features4 = [0,2,3,5,6,8,10,12,13,15]
def main():
    """Train the baseline MLP and the ExpOut ensemble on german.data,
    print accuracy for both, and print global feature contributions."""
    train_size = 0.8
    test_size = 0.2
    algo = MLPClassifier
    print(algo.__name__)
    source_name = 'german.data'
    # Indices of the sensitive features to drop in the ensemble.
    to_drop = [8, 18, 19]
    all_categorical_features = [0, 2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 18, 19]
    data, labels, class_names, feature_names, categorical_names = load_data(source_name, all_categorical_features)
    train, test, labels_train, labels_test = train_test_split(data, labels, train_size=train_size, test_size=test_size, random_state=2)
    print("###########\nOriginal model \n###########")
    baseline_clf, baseline_enc = train_classifier(algo, train, labels_train, [], all_categorical_features)
    baseline = Model([baseline_clf], [baseline_enc], [[]])
    baseline_threshold = find_threshold(baseline.prob(train), labels_train)
    baseline_accuracy = evaluation(baseline.prob(test), labels_test, baseline_threshold)
    print(baseline_accuracy)
    features_contributions(baseline, train, feature_names, class_names, all_categorical_features, categorical_names)
    print("###########\nExpOut ensemble's model \n###########")
    ensemble = ensemble_out(algo, to_drop, train, labels_train, all_categorical_features)
    ensemble_threshold = find_threshold(ensemble.prob(train), labels_train)
    ensemble_accuracy = evaluation(ensemble.prob(test), labels_test, ensemble_threshold)
    print(ensemble_accuracy)
    features_contributions(ensemble, train, feature_names, class_names, all_categorical_features, categorical_names)
if __name__ == "__main__":
    # Log a coarse start timestamp before running the experiment.
    ts = datetime.datetime.now()
    print(ts.year,'-', ts.month,'-', ts.day,',', ts.hour,':', ts.minute,':', ts.second)
    main()
\ No newline at end of file
"""
Implements LIME_Global. Verifies if sensitives features have high contributions.
"""
from collections import Counter
import pandas as pd
import numpy as np
from lime import lime_tabular, submodular_pick
def features_contributions(model, train, feature_names, class_names, categorical_features, categorical_names, kernel_width=3):
    """Run LIME with submodular pick and print the aggregated per-feature
    contributions, sorted by absolute magnitude.

    Parameters
        model: callable returning class probabilities (e.g. Model.prob)
        train: training feature matrix used both to fit the explainer and
            as the pool of instances for the submodular pick
        feature_names / class_names: display names for features and classes
        categorical_features / categorical_names: categorical column indices
            and their original category values
        kernel_width: LIME kernel width

    Returns
        (explainer, sp_obj): the LimeTabularExplainer and SubmodularPick objects.
    """
    explainer = lime_tabular.LimeTabularExplainer(train,feature_names=feature_names,class_names=class_names,categorical_features=categorical_features,categorical_names=categorical_names,kernel_width=kernel_width)
    sp_obj = submodular_pick.SubmodularPick(explainer, train, model, sample_size=5, num_features=5, num_exps_desired=5)
    # Sum the local weights (class index 1) over all picked explanations.
    contributions = Counter()
    for i in sp_obj.V:
        exp = sp_obj.explanations[i]
        contributions.update(Counter(dict(exp.local_exp[1])))
    rows = [[feature_names[key], weight] for key, weight in contributions.items()]
    df = pd.DataFrame(rows, columns=["Feature", "Contribution"])
    # NOTE(review): the original computed the column sum (`sumdf`) and then
    # assigned the Contribution column to itself — dead code, possibly a
    # half-finished normalization (Contribution / sum). Removed here; restore
    # a real normalization if that was the intent.
    print(df.iloc[(-np.abs(df['Contribution'].values)).argsort()])
    return explainer, sp_obj
\ No newline at end of file
import argparse
import datetime
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble._bagging import BaggingClassifier
from sklearn.ensemble._forest import RandomForestClassifier
from sklearn.ensemble._gb import GradientBoostingClassifier
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.mixture._gaussian_mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.neural_network._multilayer_perceptron import MLPClassifier
from sklearn.svm._classes import SVC
from anchor_global import features_contributions
from core import load_data, train_classifier, Model, evaluation, \
find_threshold, remove, ensemble_out
def main(source_name, train_size, to_drop, all_categorical_features, algo):
    """Train the baseline model and the ExpOut ensemble on `source_name`,
    print accuracy for both, then print global feature contributions.

    Parameters
        source_name: path of the dataset file
        train_size: fraction of samples used for training (the rest is test)
        to_drop: indices of (sensitive) features the ensemble drops
        all_categorical_features: indices of categorical features
        algo: classifier class to instantiate
    """
    data, labels, class_names, feature_names, categorical_names = load_data(source_name, all_categorical_features)
    train, test, labels_train, labels_test = train_test_split(data, labels, train_size=train_size, test_size=1-train_size, random_state=2)
    print("###########\nOriginal model \n###########")
    baseline_clf, baseline_enc = train_classifier(algo, train, labels_train, [], all_categorical_features)
    baseline = Model([baseline_clf], [baseline_enc], [[]])
    baseline_threshold = find_threshold(baseline.prob(train), labels_train)
    baseline_accuracy = evaluation(baseline.prob(test), labels_test, baseline_threshold)
    print(baseline_accuracy)
    features_contributions(baseline, train, feature_names, class_names, all_categorical_features, categorical_names)
    print("###########\nExpOut ensemble's model \n###########")
    ensemble = ensemble_out(algo, to_drop, train, labels_train, all_categorical_features)
    ensemble_threshold = find_threshold(ensemble.prob(train), labels_train)
    ensemble_accuracy = evaluation(ensemble.prob(test), labels_test, ensemble_threshold)
    print(ensemble_accuracy)
    features_contributions(ensemble, train, feature_names, class_names, all_categorical_features, categorical_names)
def algo_parser(algo_str):
    """Map a (case-insensitive) CLI algorithm name to its scikit-learn
    class; returns None for an unrecognized name."""
    table = {
        "mlp": MLPClassifier,
        "logreg": LogisticRegression,
        "rf": RandomForestClassifier,
        "ada": AdaBoostClassifier,
        "bagging": BaggingClassifier,
        "gaussianmixture": GaussianMixture,
        "gradient": GradientBoostingClassifier,
        "svm": SVC,
    }
    return table.get(algo_str.lower())
# AdaBoostClassifier
# BaggingClassifier
# BayesianGaussianMixture
# BernoulliNB
# CalibratedClassifierCV
# CategoricalNB
# ClassifierChain
# ComplementNB
# DecisionTreeClassifier
# DummyClassifier
# ExtraTreeClassifier
# ExtraTreesClassifier
# GaussianMixture
# GaussianNB
# GaussianProcessClassifier
# GradientBoostingClassifier
# GridSearchCV
# HistGradientBoostingClassifier
# KNeighborsClassifier
# LabelPropagation
# LabelSpreading
# LinearDiscriminantAnalysis
# LogisticRegression
# LogisticRegressionCV
# MLPClassifier
# MultiOutputClassifier
# MultinomialNB
# NuSVC
# OneVsRestClassifier
# Pipeline
# QuadraticDiscriminantAnalysis
# RFE
# RFECV
# RadiusNeighborsClassifier
# RandomForestClassifier
# RandomizedSearchCV
# SGDClassifier
# SVC
# StackingClassifier
# VotingClassifier
if __name__ == "__main__":
    # Command-line interface: dataset path, train split, algorithm name,
    # categorical feature indices, and the features to drop.
    cli = argparse.ArgumentParser(description='Description')
    cli.add_argument('--data')
    cli.add_argument('--trainsize', type=float)
    cli.add_argument('--algo')
    cli.add_argument('-cat_features', '--cat_features', action='store', dest='cat_features_list', type=int, nargs='*', default=[], help="Examples: -i ")
    cli.add_argument('-drop', '--drop', action='store', dest='drop_list', type=int, nargs='*', default=[], help="Examples: -i ")
    opts = cli.parse_args()
    # Log a coarse start timestamp before running the experiment.
    ts = datetime.datetime.now()
    print(ts.year,'-', ts.month,'-', ts.day,',', ts.hour,':', ts.minute,':', ts.second)
    main(opts.data, opts.trainsize, opts.drop_list, opts.cat_features_list, algo_parser(opts.algo))
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment