Commit 43b48795 authored by mjennewine's avatar mjennewine
Browse files

Added regex to clean input data

parent 56d3eea5
Loading
Loading
Loading
Loading
−50 B (14.8 KiB)

File changed.

No diff preview for this file type.

+32 −26
Original line number Diff line number Diff line
@@ -5,19 +5,21 @@ import os
from collections import Counter
from joblib import dump, load


def clean_text(text):
    """Normalize a raw message into lowercase, space-separated alphabetic words.

    Each step works on the output of the previous one:
    HTML tags, URLs and e-mail addresses are removed, every remaining run
    of non-alphabetic characters (digits, punctuation, apostrophes,
    whitespace) is collapsed into a single space, and the result is
    lowercased and stripped of leading/trailing whitespace.

    input - text: the raw message string
    returns the cleaned string (empty if nothing alphabetic remains)
    """
    # remove html tags
    clean = re.sub(r'<[^<]+?>', '', text)
    # remove urls
    clean = re.sub(r'http[s]?://\S+', '', clean)
    # remove email addresses (and one trailing whitespace character, if any)
    clean = re.sub(r'\S*@\S*\s?', '', clean)
    # replace any remaining non alphabetic characters with a single space;
    # this must run on `clean`, not `text`, or the removals above are lost
    clean = re.sub(r'[^a-zA-Z]+', ' ', clean)
    return clean.lower().strip()



### define features from training data
### input - spam/ham training set
### returns a list of feature words
@@ -58,6 +60,7 @@ def define_features(data, num_features=1000):
    return [item[0] for item in features]



### creates a feature vector from a message
### input - list of feature words and an sms message
### returns a feature vector with 1 where a feature word is found in the text and 0 where it is not
@@ -82,6 +85,7 @@ def extract(features, message):
    return vector



### creates a feature matrix from a data set
### input - list of feature words and data set to extract features from
### returns a numpy matrix containing one feature vector per message, plus an array of the correct labels
@@ -105,7 +109,7 @@ def prepare(features, data):
    return np.array(matrix), target


##############################################################################################################################################

#read in training and testing data
#kaggle and UCI contain the same data
def read_data():
@@ -123,6 +127,15 @@ def read_data():
    data.reset_index(drop=True, inplace=True)
    data.columns = ['class', 'text']

    return data



# for API
def loadXY(refresh_data=False):
    if not os.path.isfile('X.joblib') or not os.path.isfile('Y.joblib') or refresh_data:
        data = read_data()
        
        #extract features from training data
        features = define_features(data)
        dump(features, 'features.joblib')
@@ -131,15 +144,8 @@ def read_data():
        X, Y = prepare(features, data)
        dump(X, 'X.joblib')
        dump(Y, 'Y.joblib')        

# for API
def loadXY():
    if not os.path.isfile('X.joblib') or not os.path.isfile('Y.joblib'):
        read_data()
    
        return X, Y
    else :
        X = load('X.joblib')
        Y = load('Y.joblib')
    
        return X, Y

read_data()
 No newline at end of file

test.py

0 → 100644
+15 −0
Original line number Diff line number Diff line
""" temporary script for testing things """

from joblib import dump, load
import preprocess as pre

# Disabled experiment, kept as an inert string literal: it would clean every
# message of the data set in place with pre.clean_text.
"""
data = pre.read_data()

for i in range(len(data)):
    data['text'][i] = pre.clean_text(data['text'][i])
"""

# Build the feature matrix and labels, forcing a refresh instead of loading
# any previously dumped X.joblib / Y.joblib.
X, Y = pre.loadXY(refresh_data=True)

#features = load('features.joblib')
 No newline at end of file