Loading features.joblib +3 B (14.7 KiB) File changed.No diff preview for this file type. View original file View changed file ml.py +43 −13 Original line number Diff line number Diff line Loading @@ -5,14 +5,20 @@ from sklearn.linear_model import Perceptron, SGDClassifier from sklearn.neural_network import MLPClassifier from sklearn.tree import DecisionTreeClassifier from joblib import dump, load import os from warnings import simplefilter simplefilter(action='ignore') # preprocessing import preprocess # global variables X = [] Y = [] import preprocess pla = None sgc = None nn = None tree = None def printMetrics(y_actual, y_pred): Loading @@ -23,19 +29,19 @@ def printMetrics(y_actual, y_pred): def preprocessing(): #this function does nothing global X, Y XYList = preprocess.loadXY() X = XYList[0] Y = XYList[1] def trainModels(): XYList = preprocess.loadXY() X = XYList[0] Y = XYList[1] preprocessing() X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0) global pla, sgc, nn, tree print('\n==================================================\n') ### train perceptron Loading Loading @@ -122,31 +128,55 @@ def trainModels(): print('\n==================================================\n') def testModels(): XYList = preprocess.loadXY() X = XYList[0] Y = XYList[1] def loadModels(): global pla, sgc, nn, tree if not os.path.isfile('pla.joblib') or not os.path.isfile('sgc.joblib') or not os.path.isfile('nn.joblib') or not os.path.isfile('tree.joblib'): print('Training models...') trainModels() else: print('Loading models from file...') pla = load('pla.joblib') sgc = load('sgc.joblib') nn = load('nn.joblib') tree = load('tree.joblib') def testModels(): if len(X) == 0: preprocessing() if tree == None: loadModels() pred_y = pla.predict(X) print('Perceptron (all data):') printMetrics(Y, pred_y) sgc = load('sgc.joblib') pred_y = sgc.predict(X) print('\nStochastic Gradient Descent (all data):') printMetrics(Y, pred_y) nn = load('nn.joblib') pred_y = nn.predict(X) print('\nNeural Network (all data):') printMetrics(Y, pred_y) tree = load('tree.joblib') pred_y = tree.predict(X) print('\nDecision Tree (all data):') printMetrics(Y, pred_y) #### not finished def makePrediction(message): if tree == None: loadModels() pred_y = tree.predict([preprocess.extract(message)]) print(pred_y) if pred_y[0] == 1: return 'spam' else: return 'ham' # declaring, training, fitting each algorithm Loading nn.joblib +3.26 KiB (3.07 MiB) File changed.No diff preview for this file type. View original file View changed file pla.joblib (8.91 KiB) File changed.No diff preview for this file type. View original file View changed file preprocess.py +45 −36 Original line number Diff line number Diff line Loading @@ -5,6 +5,27 @@ import os from collections import Counter from joblib import dump, load features = [] #read in training and testing data #kaggle and UCI contain the same data def read_data(): #read kaggle data data = pd.read_csv('UCI.csv', header=None).drop(2, axis=1) #read enron data data = pd.concat([data, pd.read_csv('enron.csv', header=None).drop(2, axis=1)]) #read spamassassin data data = pd.concat([data, pd.read_csv('spamassassin.csv', header=None).drop(2, axis=1)]) data = data.drop_duplicates() data = data.dropna(axis=0, how='any') data.reset_index(drop=True, inplace=True) data.columns = ['class', 'text'] return data def clean_text(text): #remove html tags Loading @@ -20,7 +41,7 @@ def clean_text(text): ### define features from training data ### create features from training data ### input - spam/ham training set ### returns a list of feature words def define_features(data, num_features=1000): Loading @@ -43,28 +64,39 @@ def define_features(data, num_features=1000): #create list of words from text words = clean.split() for word in words: #add each word occurrence for ham words, subtract for spam words #typically ham words will have a large positive value #typically spam words will have a large negative value #add each word occurrence for spam words, subtract for ham words if len(word) > 1 and word not in stop_words: if labels[i] == 'ham': if labels[i] == 'spam': word_count[word] += 1 else: word_count[word] -= 1 #select the most 50 most common ham and spam words as features features = word_count.most_common(num_features // 2) features.extend(word_count.most_common()[-(num_features //2):]) feature_list = word_count.most_common(num_features // 2) feature_list.extend(word_count.most_common()[-(num_features //2):]) #return list of keywords return [item[0] for item in features] global features features = [item[0] for item in feature_list] dump(features, 'features.joblib') ### creates a feature vector from a message ### input - list of feature words and an sms message ### returns a feature vector of 1 if feature is found in text and 0 if feature is not def extract(features, message): def extract(message): global features if len(features) == 0: if os.path.isfile('features.joblib'): print('Loading features from file...') features = load('features.joblib') else: print('Defining features...') data = read_data() define_features(data) vector = [0] * len(features) #clean text clean = clean_text(message) Loading @@ -89,7 +121,7 @@ def extract(features, message): ### creates a feature matrix from a data set ### input - list of feature words and data set to extract features from ### returns a numpy matrix containing a feature vector for each message, also returns an array of correct labels def prepare(features, data): def prepare(data): #create list of text and labels text = data['text'].tolist() labels = data['class'].tolist() Loading @@ -98,7 +130,7 @@ def prepare(features, data): matrix = [] for sample in text: #add feature vector to matrix for each message matrix.append(extract(features, sample)) matrix.append(extract(sample)) #label each sample - ham = -1, spam = +1 target = np.ones(len(labels), dtype=int) Loading @@ -109,39 +141,16 @@ def prepare(features, data): return np.array(matrix), target #read in training and testing data #kaggle and UCI contain the same data def read_data(): #read kaggle data data = pd.read_csv('UCI.csv', header=None).drop(2, axis=1) #read enron data data = pd.concat([data, pd.read_csv('enron.csv', header=None).drop(2, axis=1)]) #read spamassassin data data = pd.concat([data, pd.read_csv('spamassassin.csv', header=None).drop(2, axis=1)]) data = data.drop_duplicates() data = data.dropna(axis=0, how='any') data.reset_index(drop=True, inplace=True) data.columns = ['class', 'text'] return data # for API def loadXY(refresh_data=False): if not os.path.isfile('X.joblib') or not os.path.isfile('Y.joblib') or refresh_data: data = read_data() #extract features from training data features = define_features(data) dump(features, 'features.joblib') define_features(data) #create feature matrix for training and testing data X, Y = prepare(features, data) X, Y = prepare(data) dump(X, 'X.joblib') dump(Y, 'Y.joblib') return X, Y Loading Loading
features.joblib +3 B (14.7 KiB) File changed.No diff preview for this file type. View original file View changed file
ml.py +43 −13 Original line number Diff line number Diff line Loading @@ -5,14 +5,20 @@ from sklearn.linear_model import Perceptron, SGDClassifier from sklearn.neural_network import MLPClassifier from sklearn.tree import DecisionTreeClassifier from joblib import dump, load import os from warnings import simplefilter simplefilter(action='ignore') # preprocessing import preprocess # global variables X = [] Y = [] import preprocess pla = None sgc = None nn = None tree = None def printMetrics(y_actual, y_pred): Loading @@ -23,19 +29,19 @@ def printMetrics(y_actual, y_pred): def preprocessing(): #this function does nothing global X, Y XYList = preprocess.loadXY() X = XYList[0] Y = XYList[1] def trainModels(): XYList = preprocess.loadXY() X = XYList[0] Y = XYList[1] preprocessing() X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0) global pla, sgc, nn, tree print('\n==================================================\n') ### train perceptron Loading Loading @@ -122,31 +128,55 @@ def trainModels(): print('\n==================================================\n') def testModels(): XYList = preprocess.loadXY() X = XYList[0] Y = XYList[1] def loadModels(): global pla, sgc, nn, tree if not os.path.isfile('pla.joblib') or not os.path.isfile('sgc.joblib') or not os.path.isfile('nn.joblib') or not os.path.isfile('tree.joblib'): print('Training models...') trainModels() else: print('Loading models from file...') pla = load('pla.joblib') sgc = load('sgc.joblib') nn = load('nn.joblib') tree = load('tree.joblib') def testModels(): if len(X) == 0: preprocessing() if tree == None: loadModels() pred_y = pla.predict(X) print('Perceptron (all data):') printMetrics(Y, pred_y) sgc = load('sgc.joblib') pred_y = sgc.predict(X) print('\nStochastic Gradient Descent (all data):') printMetrics(Y, pred_y) nn = load('nn.joblib') pred_y = nn.predict(X) print('\nNeural Network (all data):') printMetrics(Y, pred_y) tree = load('tree.joblib') pred_y = tree.predict(X) print('\nDecision Tree (all data):') printMetrics(Y, pred_y) #### not finished def makePrediction(message): if tree == None: loadModels() pred_y = tree.predict([preprocess.extract(message)]) print(pred_y) if pred_y[0] == 1: return 'spam' else: return 'ham' # declaring, training, fitting each algorithm Loading
nn.joblib +3.26 KiB (3.07 MiB) File changed.No diff preview for this file type. View original file View changed file
pla.joblib (8.91 KiB) File changed.No diff preview for this file type. View original file View changed file
preprocess.py +45 −36 Original line number Diff line number Diff line Loading @@ -5,6 +5,27 @@ import os from collections import Counter from joblib import dump, load features = [] #read in training and testing data #kaggle and UCI contain the same data def read_data(): #read kaggle data data = pd.read_csv('UCI.csv', header=None).drop(2, axis=1) #read enron data data = pd.concat([data, pd.read_csv('enron.csv', header=None).drop(2, axis=1)]) #read spamassassin data data = pd.concat([data, pd.read_csv('spamassassin.csv', header=None).drop(2, axis=1)]) data = data.drop_duplicates() data = data.dropna(axis=0, how='any') data.reset_index(drop=True, inplace=True) data.columns = ['class', 'text'] return data def clean_text(text): #remove html tags Loading @@ -20,7 +41,7 @@ def clean_text(text): ### define features from training data ### create features from training data ### input - spam/ham training set ### returns a list of feature words def define_features(data, num_features=1000): Loading @@ -43,28 +64,39 @@ def define_features(data, num_features=1000): #create list of words from text words = clean.split() for word in words: #add each word occurrence for ham words, subtract for spam words #typically ham words will have a large positive value #typically spam words will have a large negative value #add each word occurrence for spam words, subtract for ham words if len(word) > 1 and word not in stop_words: if labels[i] == 'ham': if labels[i] == 'spam': word_count[word] += 1 else: word_count[word] -= 1 #select the most 50 most common ham and spam words as features features = word_count.most_common(num_features // 2) features.extend(word_count.most_common()[-(num_features //2):]) feature_list = word_count.most_common(num_features // 2) feature_list.extend(word_count.most_common()[-(num_features //2):]) #return list of keywords return [item[0] for item in features] global features features = [item[0] for item in feature_list] dump(features, 'features.joblib') ### creates a feature vector from a message ### input - list of feature words and an sms message ### returns a feature vector of 1 if feature is found in text and 0 if feature is not def extract(features, message): def extract(message): global features if len(features) == 0: if os.path.isfile('features.joblib'): print('Loading features from file...') features = load('features.joblib') else: print('Defining features...') data = read_data() define_features(data) vector = [0] * len(features) #clean text clean = clean_text(message) Loading @@ -89,7 +121,7 @@ def extract(features, message): ### creates a feature matrix from a data set ### input - list of feature words and data set to extract features from ### returns a numpy matrix containing a feature vector for each message, also returns an array of correct labels def prepare(features, data): def prepare(data): #create list of text and labels text = data['text'].tolist() labels = data['class'].tolist() Loading @@ -98,7 +130,7 @@ def prepare(features, data): matrix = [] for sample in text: #add feature vector to matrix for each message matrix.append(extract(features, sample)) matrix.append(extract(sample)) #label each sample - ham = -1, spam = +1 target = np.ones(len(labels), dtype=int) Loading @@ -109,39 +141,16 @@ def prepare(features, data): return np.array(matrix), target #read in training and testing data #kaggle and UCI contain the same data def read_data(): #read kaggle data data = pd.read_csv('UCI.csv', header=None).drop(2, axis=1) #read enron data data = pd.concat([data, pd.read_csv('enron.csv', header=None).drop(2, axis=1)]) #read spamassassin data data = pd.concat([data, pd.read_csv('spamassassin.csv', header=None).drop(2, axis=1)]) data = data.drop_duplicates() data = data.dropna(axis=0, how='any') data.reset_index(drop=True, inplace=True) data.columns = ['class', 'text'] return data # for API def loadXY(refresh_data=False): if not os.path.isfile('X.joblib') or not os.path.isfile('Y.joblib') or refresh_data: data = read_data() #extract features from training data features = define_features(data) dump(features, 'features.joblib') define_features(data) #create feature matrix for training and testing data X, Y = prepare(features, data) X, Y = prepare(data) dump(X, 'X.joblib') dump(Y, 'Y.joblib') return X, Y Loading