Added regex to clean input data (43b48795) · Commits · Chris / SpamHunter

features.joblib

−50 B (14.8 KiB)

File changed.

No diff preview for this file type.

preprocess.py

+32 −26

Original line number	Diff line number	Diff line
		@@ -5,19 +5,21 @@ import os
		from collections import Counter
		from joblib import dump, load


		def clean_text(text):
		    """Normalize a raw message into lowercase, space-separated words.

		    HTML tags, URLs and email addresses are removed first; every remaining
		    run of non-alphabetic characters is then collapsed into a single space.

		    text -- the raw message string
		    returns the cleaned string, lowercased, without leading/trailing spaces
		    """
		    # remove html tags
		    clean = re.sub(r'<[^<]+?>', '', text)
		    # remove urls
		    clean = re.sub(r'http[s]?://\S+', '', clean)
		    # remove email addresses (the whole address, not just the two
		    # characters around the '@')
		    clean = re.sub(r'\S+@\S+\s?', '', clean)
		    # remove any remaining non alphabetic characters; this must keep working
		    # on `clean`, otherwise the removals above are thrown away
		    clean = re.sub(r'[^a-zA-Z]+', ' ', clean)
		    return clean.lower().strip()



		### define features from training data
		### input - spam/ham training set
		### returns a list of feature words
		@@ -58,6 +60,7 @@ def define_features(data, num_features=1000):
		return [item[0] for item in features]



		### creates a feature vector from a message
		### input - list of feature words and an sms message
		### returns a feature vector holding 1 for each feature word found in the message and 0 for each one that is not
		@@ -82,6 +85,7 @@ def extract(features, message):
		return vector



		### creates a feature matrix from a data set
		### input - list of feature words and data set to extract features from
		### returns a numpy matrix containing one feature vector per message, plus an array of the correct labels
		@@ -105,7 +109,7 @@ def prepare(features, data):
		return np.array(matrix), target


		##############################################################################################################################################

		#read in training and testing data
		#kaggle and UCI contain the same data
		def read_data():
		@@ -123,6 +127,15 @@ def read_data():
		data.reset_index(drop=True, inplace=True)
		data.columns = ['class', 'text']

		return data



		# for API
		def loadXY(refresh_data=False):
		if not os.path.isfile('X.joblib') or not os.path.isfile('Y.joblib') or refresh_data:
		data = read_data()

		#extract features from training data
		features = define_features(data)
		dump(features, 'features.joblib')
		@@ -131,15 +144,8 @@ def read_data():
		X, Y = prepare(features, data)
		dump(X, 'X.joblib')
		dump(Y, 'Y.joblib')

		# for API
		def loadXY():
		if not os.path.isfile('X.joblib') or not os.path.isfile('Y.joblib'):
		read_data()

		return X, Y
		else :
		X = load('X.joblib')
		Y = load('Y.joblib')

		return X, Y

		read_data()
		No newline at end of file

test.py

0 → 100644

+15 −0

Original line number	Diff line number	Diff line
		""" temporary script for testing things """

		from joblib import dump, load
		import preprocess as pre

		# Disabled experiment, kept as an inert string literal: it cleans the text
		# of every row in the data set with pre.clean_text.
		"""
		data = pre.read_data()

		for i in range(len(data)):
		data['text'][i] = pre.clean_text(data['text'][i])
		"""

		# Force the feature matrix and labels to be rebuilt instead of loading the
		# cached X.joblib / Y.joblib files.
		X, Y = pre.loadXY(refresh_data=True)

		#features = load('features.joblib')
		No newline at end of file