# Commit 12851916, authored by BERNIER Fabien
#
# [+] English variant detection model
#
# Parent commit: 797929d2
# (Captured from a collapsed diff view, so some source lines may be missing.)
import re
from time import time

import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem.porter import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

from core_text import FixOutText
# Load the labelled tweets from the CSV dataset.
df = pd.read_csv("datasets/english_variant.csv")

# Human-readable label names, looked up by integer class id.
class_names = np.array(["SA", "AA"])

# NLTK's English stop words plus a few extra tokens.
# NOTE: this rebinds the imported `stopwords` corpus reader to a plain list.
stopwords = stopwords.words("english") + ["#ff", "ff", "rt"]

stemmer = PorterStemmer()
def preprocess(text_string):
    """Normalize a raw tweet before tokenization.

    Applies three substitutions, in this order:

    1) every run of whitespace is collapsed to a single space;
    2) URLs are removed;
    3) @-mentions are removed.

    URLs and mentions are deleted outright (replaced by the empty string),
    not swapped for placeholder tokens. Because whitespace is collapsed
    first, removing a URL or mention in the middle of the text can leave
    two adjacent spaces behind.

    Args:
        text_string: The raw tweet text.

    Returns:
        The cleaned text.
    """
    space_pattern = r'\s+'
    # NOTE(review): the second half of this pattern was truncated in the
    # reviewed source; it is reconstructed from the common form of this
    # URL regex -- confirm against the original file.
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                       r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text
def tokenize(tweet):
    """Lowercase a tweet, keep only runs of ASCII letters, and stem them.

    Punctuation, digits and whitespace all act as separators.

    Args:
        tweet: The (preprocessed) tweet text.

    Returns:
        A list of stemmed tokens, in order of appearance.
    """
    # The previous pattern "[^a-zA-Z]*" can match the empty string; since
    # Python 3.7 re.split() also splits on empty matches, which broke every
    # word into single letters. Extracting the letter runs directly avoids
    # that and needs no join/strip/split round trip.
    words = re.findall(r"[a-zA-Z]+", tweet.lower())
    return [stemmer.stem(word) for word in words]
def basic_tokenize(tweet):
    """Lowercase a tweet and split it into tokens without stemming.

    A token is a maximal run of ASCII letters and the punctuation marks
    ``. , ! ?``; every other character acts as a separator.

    Args:
        tweet: The (preprocessed) tweet text.

    Returns:
        A list of lowercase tokens, in order of appearance.
    """
    # The previous pattern "[^a-zA-Z.,!?]*" can match the empty string;
    # since Python 3.7 re.split() also splits on empty matches, which broke
    # every word into single characters. Extract the token runs directly.
    return re.findall(r"[a-zA-Z.,!?]+", tweet.lower())
# Part-of-speech tag sequence of every tweet, one space-joined string each.
tweet_tags = []
for tweet in df.tweet:
    tokens = basic_tokenize(preprocess(tweet))
    tags = pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    tag_str = " ".join(tag_list)
    # The tag string was previously computed and then discarded, which left
    # tweet_tags empty.
    tweet_tags.append(tag_str)

# Features are the raw tweet texts; the label is 1 for group "AA", else 0.
X = df.tweet.to_numpy()
y = (df["group"] == "AA").to_numpy(dtype=np.uint8)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

# NOTE(review): this call was truncated in the reviewed source; only
# ngram_range survived. Restore any further arguments from the original.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
)
# lr = LogisticRegression(class_weight='balanced')
rf = RandomForestClassifier(n_estimators=100)

# training the model
model = make_pipeline(vectorizer, rf)
# The pipeline must be fitted before predict() can be called on it.
model.fit(X_train, y_train)

# evaluating the model
pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, pred))
print(class_names[model.predict(["piece of cake", "piece of shit"])])
# explaining the model
# vocab = list(model[0].vocabulary_.keys())
# fixout = FixOutText(X, y, vocab, to_drop=["black", "white", "bitch"], algo=model, max_features=50)
# t0 = time()
# actual_sensitive, is_fair_flag, ans_data, accuracy, threshold = fixout.is_fair()
# print("took", time()-t0, "seconds")
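# (No newline at end of file in the original commit.)
#
# --- Web page footer captured together with the source ---
# Supports Markdown
# 0% or .
# You are about to add 0 people to the discussion. Proceed with caution.
# Finish editing this message first!
# Please register or sign in to comment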