import re
from time import time

import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

from core_text import FixOutText

hsdata = pd.read_csv("datasets/hate_speech.csv")
class_names = np.array(["ok", "hate speech"])

stopwords = stopwords.words("english")
stopwords.extend(["#ff", "ff", "rt"])

stemmer = PorterStemmer()


def preprocess(text_string):
    """
    Accepts a text string and:
    1) strips URLs
    2) collapses runs of whitespace into a single space
    3) strips @-mentions

    This gives us standardized text without caring about the specific
    URLs or people mentioned.
    """
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                       r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text


def tokenize(tweet):
    """Removes punctuation and excess whitespace, lowercases, and stems
    the tweet. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
    tokens = [stemmer.stem(t) for t in tweet.split()]
    return tokens


def basic_tokenize(tweet):
    """Same as tokenize but without the stemming."""
    tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
    return tweet.split()


# Build a part-of-speech tag string for every tweet.
tweet_tags = []
for tweet in hsdata.tweet:
    tokens = basic_tokenize(preprocess(tweet))
    tags = pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    tag_str = " ".join(tag_list)
    tweet_tags.append(tag_str)

# Binary target: class 2 is "neither"; everything else counts as hate/offensive speech.
X = hsdata.tweet.to_numpy()
y = (hsdata["class"] != 2).to_numpy(dtype=np.uint8)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords,
    use_idf=True,
    smooth_idf=False,
    norm=None,
    decode_error='replace',
    max_features=10000,
    min_df=5,
    max_df=0.75
)
lr = LogisticRegression(class_weight='balanced')

# training the model
model = make_pipeline(vectorizer, lr)
model.fit(X_train, y_train)

# evaluating the model
pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, pred))


# ~~~~~~~~~~~~~~~~~~~~~~~~~~ checking parity ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

def get_dataset_predictions(dataset_filename):
    variant = pd.read_csv(dataset_filename)
    tweets = variant["tweet"].to_numpy()
    for i in range(tweets.shape[0]):
        # Drop the first and last character of each tweet string.
        tweets[i] = tweets[i][1:-1]
    groups = variant["group"]
    return model.predict(tweets), tweets, groups


def check_parity(preds, tweets, groups, word=None):
    if word is not None:
        # Restrict the comparison to tweets containing the given word.
        I = list(map(lambda t: word in t, tweets))
        groups = groups[I]
        preds = preds[I]
    preds_aa = preds[groups == "AA"]
    preds_sa = preds[groups == "SA"]
    hate_rate_aa = len(preds_aa[preds_aa == 1]) / len(preds_aa)
    hate_rate_sa = len(preds_sa[preds_sa == 1]) / len(preds_sa)
    print(f"[{word}] P(hate_speech | AA) = {hate_rate_aa}")
    print(f"[{word}] P(hate_speech | SA) = {hate_rate_sa}")


preds, tweets, groups = get_dataset_predictions("datasets/english_variant.csv")
check_parity(preds, tweets, groups)

btch_preds, btch_tweets, btch_groups = get_dataset_predictions("datasets/english_variant_btch.csv")
check_parity(btch_preds, btch_tweets, btch_groups)

ngga_preds, ngga_tweets, ngga_groups = get_dataset_predictions("datasets/english_variant_ngga.csv")
check_parity(ngga_preds, ngga_tweets, ngga_groups)

# explaining the model
vocab = list(model[0].vocabulary_.keys())
fixout = FixOutText(X, y, vocab, to_drop=["black", "white", "bitch"],
                    algo=model, max_features=-1)
t0 = time()
actual_sensitive, is_fair_flag, ans_data, accuracy, threshold = fixout.is_fair()
print("took", time() - t0, "seconds")