Loading features.joblib −50 B (14.8 KiB) File changed.No diff preview for this file type. View original file View changed file preprocess.py +32 −26 Original line number Diff line number Diff line Loading @@ -5,19 +5,21 @@ import os from collections import Counter from joblib import dump, load def clean_text(text): #remove html tags """""" clean = re.sub('<[^<]+?>', '', text) #remove urls """""" clean = re.sub('http[s]?://\S+', '', clean) #remove email addresses """""" clean = re.sub('\S*@\S*\s?', '', clean) #remove any remaining non alphabetic characters clean = re.sub('[^a-zA-Z\']+', ' ', text) clean = clean.lower() clean = re.sub('[^a-zA-Z]+', ' ', clean) clean = clean.lower().strip() return clean ### define features from training data ### input - spam/ham training set ### returns a list of feature words Loading Loading @@ -58,6 +60,7 @@ def define_features(data, num_features=1000): return [item[0] for item in features] ### creates a feature vector from a message ### input - list of feature words and an sms message ### returns a feature vector of 1 if feature is found in text and 0 if feature is not Loading @@ -82,6 +85,7 @@ def extract(features, message): return vector ### creates a feature matrix from a data set ### input - list of feature words and data set to extract features from ### returns a numpy matrix containing a feature vector for each message, also returns an array of correct labels Loading @@ -105,7 +109,7 @@ def prepare(features, data): return np.array(matrix), target ############################################################################################################################################## #read in training and testing data #kaggle and UCI contain the same data def read_data(): Loading @@ -123,6 +127,15 @@ def read_data(): data.reset_index(drop=True, inplace=True) data.columns = ['class', 'text'] return data # for API def loadXY(refresh_data=False): if not os.path.isfile('X.joblib') or not os.path.isfile('Y.joblib') or refresh_data: data = read_data() #extract features from training data features = define_features(data) dump(features, 'features.joblib') Loading @@ -131,15 +144,8 @@ def read_data(): X, Y = prepare(features, data) dump(X, 'X.joblib') dump(Y, 'Y.joblib') # for API def loadXY(): if not os.path.isfile('X.joblib') or not os.path.isfile('Y.joblib'): read_data() return X, Y else : X = load('X.joblib') Y = load('Y.joblib') return X, Y read_data() No newline at end of file test.py 0 → 100644 +15 −0 Original line number Diff line number Diff line """ temporary script for testing things """ from joblib import dump, load import preprocess as pre """ data = pre.read_data() for i in range(len(data)): data['text'][i] = pre.clean_text(data['text'][i]) """ X, Y = pre.loadXY(True) #features = load('features.joblib') No newline at end of file Loading
features.joblib −50 B (14.8 KiB) File changed.No diff preview for this file type. View original file View changed file
preprocess.py +32 −26 Original line number Diff line number Diff line Loading @@ -5,19 +5,21 @@ import os from collections import Counter from joblib import dump, load def clean_text(text): #remove html tags """""" clean = re.sub('<[^<]+?>', '', text) #remove urls """""" clean = re.sub('http[s]?://\S+', '', clean) #remove email addresses """""" clean = re.sub('\S*@\S*\s?', '', clean) #remove any remaining non alphabetic characters clean = re.sub('[^a-zA-Z\']+', ' ', text) clean = clean.lower() clean = re.sub('[^a-zA-Z]+', ' ', clean) clean = clean.lower().strip() return clean ### define features from training data ### input - spam/ham training set ### returns a list of feature words Loading Loading @@ -58,6 +60,7 @@ def define_features(data, num_features=1000): return [item[0] for item in features] ### creates a feature vector from a message ### input - list of feature words and an sms message ### returns a feature vector of 1 if feature is found in text and 0 if feature is not Loading @@ -82,6 +85,7 @@ def extract(features, message): return vector ### creates a feature matrix from a data set ### input - list of feature words and data set to extract features from ### returns a numpy matrix containing a feature vector for each message, also returns an array of correct labels Loading @@ -105,7 +109,7 @@ def prepare(features, data): return np.array(matrix), target ############################################################################################################################################## #read in training and testing data #kaggle and UCI contain the same data def read_data(): Loading @@ -123,6 +127,15 @@ def read_data(): data.reset_index(drop=True, inplace=True) data.columns = ['class', 'text'] return data # for API def loadXY(refresh_data=False): if not os.path.isfile('X.joblib') or not os.path.isfile('Y.joblib') or refresh_data: data = read_data() #extract features from training data features = define_features(data) dump(features, 'features.joblib') Loading @@ -131,15 +144,8 @@ def read_data(): X, Y = prepare(features, data) dump(X, 'X.joblib') dump(Y, 'Y.joblib') # for API def loadXY(): if not os.path.isfile('X.joblib') or not os.path.isfile('Y.joblib'): read_data() return X, Y else : X = load('X.joblib') Y = load('Y.joblib') return X, Y read_data() No newline at end of file
test.py 0 → 100644 +15 −0 Original line number Diff line number Diff line """ temporary script for testing things """ from joblib import dump, load import preprocess as pre """ data = pre.read_data() for i in range(len(data)): data['text'][i] = pre.clean_text(data['text'][i]) """ X, Y = pre.loadXY(True) #features = load('features.joblib') No newline at end of file