Commit f9dc158e authored by mjennewine's avatar mjennewine
Browse files

Send JSON to server and make a prediction

parent cd10ab5b
Loading
Loading
Loading
Loading
+3 B (14.7 KiB)

File changed.

No diff preview for this file type.

+43 −13
Original line number Diff line number Diff line
@@ -5,14 +5,20 @@ from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from joblib import dump, load
import os

from warnings import simplefilter
simplefilter(action='ignore')

# preprocessing
import preprocess

# Module-level state, populated lazily:
#   X, Y                 -- feature matrix / labels, filled by preprocessing()
#   pla, sgc, nn, tree   -- classifiers, filled by trainModels()/loadModels()
# (duplicate "import preprocess" removed; the module is already imported above)
X = []
Y = []
pla = None   # Perceptron
sgc = None   # SGDClassifier
nn = None    # MLPClassifier
tree = None  # DecisionTreeClassifier


def printMetrics(y_actual, y_pred):
@@ -23,19 +29,19 @@ def printMetrics(y_actual, y_pred):


def preprocessing():
    """Populate the module-level feature matrix X and label vector Y.

    Delegates to preprocess.loadXY(), which returns [X, Y].
    (The old "#this function does nothing" comment was wrong and is removed.)
    """
    global X, Y
    XYList = preprocess.loadXY()
    X = XYList[0]
    Y = XYList[1]


def trainModels():
    XYList = preprocess.loadXY()
    X = XYList[0]
    Y = XYList[1]
    preprocessing()
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    
    global pla, sgc, nn, tree
    
    print('\n==================================================\n')
    
    ### train perceptron
@@ -122,31 +128,55 @@ def trainModels():
    print('\n==================================================\n')


def testModels():
    XYList = preprocess.loadXY()
    X = XYList[0]
    Y = XYList[1]
def loadModels():
    """Populate the global classifiers.

    Loads all four saved models from disk when every dump exists;
    otherwise retrains everything via trainModels().
    """
    global pla, sgc, nn, tree

    model_files = ('pla.joblib', 'sgc.joblib', 'nn.joblib', 'tree.joblib')

    if all(os.path.isfile(f) for f in model_files):
        print('Loading models from file...')
        pla, sgc, nn, tree = (load(f) for f in model_files)
    else:
        print('Training models...')
        trainModels()


def testModels():
    """Evaluate all four models on the full data set and print metrics.

    Lazily loads the data (preprocessing) and the models (loadModels)
    when they have not been populated yet.
    """
    if len(X) == 0:
        preprocessing()

    # tree is assigned last in loadModels/trainModels, so it doubles as
    # a "models are ready" flag; use `is None` per PEP 8, not `== None`.
    if tree is None:
        loadModels()

    # The per-model load('*.joblib') calls that used to sit between the
    # predictions were redundant: loadModels() already populated the globals.
    pred_y = pla.predict(X)
    print('Perceptron (all data):')
    printMetrics(Y, pred_y)

    pred_y = sgc.predict(X)
    print('\nStochastic Gradient Descent (all data):')
    printMetrics(Y, pred_y)

    pred_y = nn.predict(X)
    print('\nNeural Network (all data):')
    printMetrics(Y, pred_y)

    pred_y = tree.predict(X)
    print('\nDecision Tree (all data):')
    printMetrics(Y, pred_y)

# TODO: not finished -- only the decision tree is consulted for now
def makePrediction(message):
    """Classify a single message string.

    Loads/trains the models on first call, extracts a feature vector for
    the message, and predicts with the decision tree.
    Returns 'spam' or 'ham'.
    """
    # `is None` per PEP 8 rather than `== None`
    if tree is None:
        loadModels()

    pred_y = tree.predict([preprocess.extract(message)])
    print(pred_y)

    # labels: spam = +1, ham = -1 (see preprocess.prepare)
    if pred_y[0] == 1:
        return 'spam'
    else:
        return 'ham'


# declaring, training, fitting each algorithm
+3.26 KiB (3.07 MiB)

File changed.

No diff preview for this file type.

(8.91 KiB)

File changed.

No diff preview for this file type.

+45 −36
Original line number Diff line number Diff line
@@ -5,6 +5,27 @@ import os
from collections import Counter
from joblib import dump, load

features = []

#read in training and testing data
#kaggle and UCI contain the same data
def read_data():
    """Read the UCI, Enron and SpamAssassin CSVs into one DataFrame.

    Each CSV is headerless with an extra third column (index 2) that is
    dropped. Duplicates and rows with missing values are removed.
    Returns a DataFrame with columns ['class', 'text'].
    """
    sources = ['UCI.csv', 'enron.csv', 'spamassassin.csv']

    # one concat over all frames instead of repeated pairwise concats,
    # which copy the accumulated frame on every step
    data = pd.concat(
        pd.read_csv(f, header=None).drop(2, axis=1) for f in sources
    )

    data = data.drop_duplicates()
    data = data.dropna(axis=0, how='any')
    data.reset_index(drop=True, inplace=True)
    data.columns = ['class', 'text']

    return data


def clean_text(text):
    #remove html tags
@@ -20,7 +41,7 @@ def clean_text(text):



### define features from training data
### create features from training data
### input - spam/ham training set
### returns a list of feature words
def define_features(data, num_features=1000):
@@ -43,28 +64,39 @@ def define_features(data, num_features=1000):
        #create list of words from text
        words = clean.split()
        for word in words:
            #add each word occurrence for ham words, subtract for spam words
            #typically ham words will have a large positive value
            #typically spam words will have a large negative value
            #add each word occurrence for spam words, subtract for ham words
            if len(word) > 1 and word not in stop_words:
                if labels[i] == 'ham':
                if labels[i] == 'spam':
                    word_count[word] += 1
                else:
                    word_count[word] -= 1
    
    #select the most 50 most common ham and spam words as features
    features = word_count.most_common(num_features // 2)
    features.extend(word_count.most_common()[-(num_features //2):])
    feature_list = word_count.most_common(num_features // 2)
    feature_list.extend(word_count.most_common()[-(num_features //2):])
    
    #return list of keywords
    return [item[0] for item in features]
    global features
    features = [item[0] for item in feature_list]
    dump(features, 'features.joblib')



### creates a feature vector from a message
### input - list of feature words and an sms message
### returns a feature vector of 1 if feature is found in text and 0 if feature is not
def extract(features, message):
def extract(message):
    global features
    
    if len(features) == 0:
        if os.path.isfile('features.joblib'):
            print('Loading features from file...')
            features = load('features.joblib')
        else:
            print('Defining features...')
            data = read_data()
            define_features(data)
    
    vector = [0] * len(features)
    #clean text
    clean = clean_text(message)
@@ -89,7 +121,7 @@ def extract(features, message):
### creates a feature matrix from a data set
### input - list of feature words and data set to extract features from
### returns a numpy matrix containing a feature vector for each message, also returns an array of correct labels
def prepare(features, data):
def prepare(data):
    #create list of text and labels
    text =  data['text'].tolist()
    labels = data['class'].tolist()
@@ -98,7 +130,7 @@ def prepare(features, data):
    matrix = []
    for sample in text:
        #add feature vector to matrix for each message
        matrix.append(extract(features, sample))
        matrix.append(extract(sample))
    
    #label each sample - ham = -1, spam = +1
    target = np.ones(len(labels), dtype=int)
@@ -109,39 +141,16 @@ def prepare(features, data):
    return np.array(matrix), target



#read in training and testing data
#kaggle and UCI contain the same data
# NOTE(review): this appears to duplicate the read_data() defined earlier;
# in Python the later definition wins, so one of the two should be removed
# -- confirm against the full file before deleting.
def read_data():
    #read kaggle data
    data = pd.read_csv('UCI.csv', header=None).drop(2, axis=1)
    
    #read enron data
    data = pd.concat([data, pd.read_csv('enron.csv', header=None).drop(2, axis=1)])
    
    #read spamassassin data
    data = pd.concat([data, pd.read_csv('spamassassin.csv', header=None).drop(2, axis=1)])
    
    data = data.drop_duplicates()
    data = data.dropna(axis=0, how='any')
    data.reset_index(drop=True, inplace=True)
    data.columns = ['class', 'text']

    return data



# for API
def loadXY(refresh_data=False):
    if not os.path.isfile('X.joblib') or not os.path.isfile('Y.joblib') or refresh_data:
        data = read_data()
        
        #extract features from training data
        features = define_features(data)
        dump(features, 'features.joblib')
        define_features(data)
        
        #create feature matrix for training and testing data
        X, Y = prepare(features, data)
        X, Y = prepare(data)
        dump(X, 'X.joblib')
        dump(Y, 'Y.joblib')        
        return X, Y
Loading