send json to server and make prediction (f9dc158e) · Commits · Chris / SpamHunter

features.joblib

+3 B (14.7 KiB)

File changed.

No diff preview for this file type.

ml.py

+43 −13

Original line number	Diff line number	Diff line
		@@ -5,14 +5,20 @@ from sklearn.linear_model import Perceptron, SGDClassifier
		from sklearn.neural_network import MLPClassifier
		from sklearn.tree import DecisionTreeClassifier
		from joblib import dump, load
		import os

		from warnings import simplefilter
		simplefilter(action='ignore')

		# preprocessing
		import preprocess

		# global variables
		X = []
		Y = []
		import preprocess
		pla = None
		sgc = None
		nn = None
		tree = None


		def printMetrics(y_actual, y_pred):
		@@ -23,19 +29,19 @@ def printMetrics(y_actual, y_pred):


		def preprocessing():
		# Loads the feature matrix and labels through preprocess.loadXY() and
		# stores them in the module-level X and Y.
		global X, Y
		XYList = preprocess.loadXY()
		X = XYList[0]
		Y = XYList[1]


		def trainModels():
		XYList = preprocess.loadXY()
		X = XYList[0]
		Y = XYList[1]
		preprocessing()

		X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

		global pla, sgc, nn, tree

		print('\n==================================================\n')

		### train perceptron
		@@ -122,31 +128,55 @@ def trainModels():
		print('\n==================================================\n')


		def testModels():
		XYList = preprocess.loadXY()
		X = XYList[0]
		Y = XYList[1]
		def loadModels():
		    """Make the four module-level classifiers available.

		    When every serialized model file exists in the working directory the
		    classifiers are loaded from disk into the globals ``pla``, ``sgc``,
		    ``nn`` and ``tree``. If any file is missing, ``trainModels()`` is
		    called instead (it declares the same globals; presumably it assigns
		    them - confirm against its full body).
		    """
		    global pla, sgc, nn, tree

		    model_files = ('pla.joblib', 'sgc.joblib', 'nn.joblib', 'tree.joblib')
		    pla_file, sgc_file, nn_file, tree_file = model_files

		    if all(os.path.isfile(name) for name in model_files):
		        print('Loading models from file...')
		        pla = load(pla_file)
		        sgc = load(sgc_file)
		        nn = load(nn_file)
		        tree = load(tree_file)
		    else:
		        print('Training models...')
		        trainModels()


		def testModels():
		    """Print evaluation metrics for every classifier over the full data set.

		    The data (module-level ``X`` and ``Y``) is loaded through
		    ``preprocessing()`` when ``X`` is still empty, and the classifiers are
		    loaded through ``loadModels()`` when ``tree`` has not been set yet.
		    Each classifier then predicts on ``X`` and its predictions are passed
		    to ``printMetrics`` together with ``Y``.
		    """
		    # len() rather than truthiness: X may be a numpy array once loaded.
		    if len(X) == 0:
		        preprocessing()

		    if tree is None:
		        loadModels()

		    # The classifiers are only read here, never assigned. Rebinding sgc,
		    # nn or tree inside this function would make them locals and turn the
		    # ``tree is None`` check above into an UnboundLocalError.
		    evaluations = (
		        ('Perceptron (all data):', pla),
		        ('\nStochastic Gradient Descent (all data):', sgc),
		        ('\nNeural Network (all data):', nn),
		        ('\nDecision Tree (all data):', tree),
		    )
		    for heading, model in evaluations:
		        pred_y = model.predict(X)
		        print(heading)
		        printMetrics(Y, pred_y)

		#### not finished
		def makePrediction(message):
		    """Classify one message with the module-level ``tree`` classifier.

		    The classifiers are loaded through ``loadModels()`` on first use. The
		    message is converted to a feature vector by ``preprocess.extract`` and
		    the raw prediction array is printed before the label is returned.

		    Returns 'spam' when the predicted label equals 1, otherwise 'ham'.
		    """
		    # Identity check: ``== None`` would be routed through the object's
		    # __eq__, which is not what is meant here.
		    if tree is None:
		        loadModels()

		    pred_y = tree.predict([preprocess.extract(message)])
		    print(pred_y)

		    return 'spam' if pred_y[0] == 1 else 'ham'


		# declaring, training, fitting each algorithm

nn.joblib

+3.26 KiB (3.07 MiB)

File changed.

No diff preview for this file type.

View original file

View changed file

pla.joblib

(8.91 KiB)

File changed.

No diff preview for this file type.

View original file

View changed file

preprocess.py

+45 −36

Original line number	Diff line number	Diff line
		@@ -5,6 +5,27 @@ import os
		from collections import Counter
		from joblib import dump, load

		features = []

		#read in training and testing data
		#kaggle and UCI contain the same data
		def read_data(sources=('UCI.csv', 'enron.csv', 'spamassassin.csv')):
		    """Read the labelled message corpora into one cleaned DataFrame.

		    Each file in ``sources`` is read as a header-less CSV and its third
		    column (index 2) is discarded. The frames are concatenated in the
		    given order, exact duplicate rows and rows with any missing value are
		    removed, and the index is renumbered from 0.

		    sources - iterable of CSV paths; defaults to the UCI, enron and
		              spamassassin files in the working directory.
		    returns - DataFrame with the columns 'class' and 'text'.
		    """
		    frames = [
		        pd.read_csv(path, header=None).drop(2, axis=1)
		        for path in sources
		    ]

		    # One concat over all frames instead of re-copying the accumulated
		    # frame once per additional file.
		    data = pd.concat(frames)
		    data = data.drop_duplicates()
		    data = data.dropna(axis=0, how='any')
		    data.reset_index(drop=True, inplace=True)
		    data.columns = ['class', 'text']

		    return data


		def clean_text(text):
		#remove html tags
		@@ -20,7 +41,7 @@ def clean_text(text):



		### define features from training data
		### create features from training data
		### input - spam/ham training set
		### stores the list of feature words in the global features variable and dumps it to features.joblib
		def define_features(data, num_features=1000):
		@@ -43,28 +64,39 @@ def define_features(data, num_features=1000):
		#create list of words from text
		words = clean.split()
		for word in words:
		#add each word occurrence for ham words, subtract for spam words
		#typically ham words will have a large positive value
		#typically spam words will have a large negative value
		#add each word occurrence for spam words, subtract for ham words
		if len(word) > 1 and word not in stop_words:
		if labels[i] == 'ham':
		if labels[i] == 'spam':
		word_count[word] += 1
		else:
		word_count[word] -= 1

		#select the (num_features // 2) most common spam words and the (num_features // 2) most common ham words as features
		features = word_count.most_common(num_features // 2)
		features.extend(word_count.most_common()[-(num_features //2):])
		feature_list = word_count.most_common(num_features // 2)
		feature_list.extend(word_count.most_common()[-(num_features //2):])

		#return list of keywords
		return [item[0] for item in features]
		global features
		features = [item[0] for item in feature_list]
		dump(features, 'features.joblib')



		### creates a feature vector from a message
		### input - an sms message (the feature words come from the global features list, which is loaded or defined on first use)
		### returns a feature vector of 1 if feature is found in text and 0 if feature is not
		def extract(features, message):
		def extract(message):
		global features

		if len(features) == 0:
		if os.path.isfile('features.joblib'):
		print('Loading features from file...')
		features = load('features.joblib')
		else:
		print('Defining features...')
		data = read_data()
		define_features(data)

		vector = [0] * len(features)
		#clean text
		clean = clean_text(message)
		@@ -89,7 +121,7 @@ def extract(features, message):
		### creates a feature matrix from a data set
		### input - data set to extract features from (the feature words are handled inside extract)
		### returns a numpy matrix containing a feature vector for each message, also returns an array of correct labels
		def prepare(features, data):
		def prepare(data):
		#create list of text and labels
		text = data['text'].tolist()
		labels = data['class'].tolist()
		@@ -98,7 +130,7 @@ def prepare(features, data):
		matrix = []
		for sample in text:
		#add feature vector to matrix for each message
		matrix.append(extract(features, sample))
		matrix.append(extract(sample))

		#label each sample - ham = -1, spam = +1
		target = np.ones(len(labels), dtype=int)
		@@ -109,39 +141,16 @@ def prepare(features, data):
		return np.array(matrix), target



		#read in training and testing data
		#kaggle and UCI contain the same data
		def read_data(sources=('UCI.csv', 'enron.csv', 'spamassassin.csv')):
		    """Read the labelled message corpora into one cleaned DataFrame.

		    Each file in ``sources`` is read as a header-less CSV and its third
		    column (index 2) is discarded. The frames are concatenated in the
		    given order, exact duplicate rows and rows with any missing value are
		    removed, and the index is renumbered from 0.

		    sources - iterable of CSV paths; defaults to the UCI, enron and
		              spamassassin files in the working directory.
		    returns - DataFrame with the columns 'class' and 'text'.
		    """
		    frames = [
		        pd.read_csv(path, header=None).drop(2, axis=1)
		        for path in sources
		    ]

		    # One concat over all frames instead of re-copying the accumulated
		    # frame once per additional file.
		    data = pd.concat(frames)
		    data = data.drop_duplicates()
		    data = data.dropna(axis=0, how='any')
		    data.reset_index(drop=True, inplace=True)
		    data.columns = ['class', 'text']

		    return data



		# for API
		def loadXY(refresh_data=False):
		if not os.path.isfile('X.joblib') or not os.path.isfile('Y.joblib') or refresh_data:
		data = read_data()

		#extract features from training data
		features = define_features(data)
		dump(features, 'features.joblib')
		define_features(data)

		#create feature matrix for training and testing data
		X, Y = prepare(features, data)
		X, Y = prepare(data)
		dump(X, 'X.joblib')
		dump(Y, 'Y.joblib')
		return X, Y