fixed datasets (84fcaffc) · Commits · Chris / SpamHunter

enron.csv

+33716 −33716

File changed.

Preview size limit exceeded, changes collapsed.

features.joblib

+106 B (14.8 KiB)

File changed.

No diff preview for this file type.

View original file

View changed file

preprocess.py

+18 −17

Original line number	Diff line number	Diff line
		@@ -97,28 +97,28 @@ def prepare(features, data):
		#read in training and testing data
		#kaggle and UCI contain the same data
		def read_data():
		#read kaggle data
		data = pd.read_csv('UCI.csv', header=None).drop(2, axis=1)
		data = data.drop_duplicates()

		#remove subject lines
		df = pd.read_csv('enron.csv', header=None).drop(2, axis=1)
		for i in range(len(df)):
		df[1][i] = df[1][i].replace('Subject: ', '', 1)

		df = df.drop_duplicates()
		data = pd.concat([data, df])
		#read enron data
		data = pd.concat([data, pd.read_csv('enron.csv', header=None).drop(2, axis=1)])

		#remove html tags
		df = pd.read_csv('spamassassin.csv', header=None).drop(2, axis=1)
		for i in range(len(df)):
		#df[1][i] = '' + xml.etree.ElementTree.fromstring(str(df[1][i])).itertext()
		df[1][i] = re.sub(r'<[^>]+>', '', df[1][i])
		#read spamassassin data
		data = pd.concat([data, pd.read_csv('spamassassin.csv', header=None).drop(2, axis=1)])

		df = df.drop_duplicates()
		data = pd.concat([data, df])
		data = data.drop_duplicates()
		data = data.dropna(axis=0, how='any')
		data.reset_index(drop=True, inplace=True)
		data.columns = ['class', 'text']
		#print(data.describe())

		"""
		#remove html tags, urls, and emails from data
		for i in range(len(data)):
		#df[1][i] = '' + xml.etree.ElementTree.fromstring(str(df[1][i])).itertext()
		data[1][i] = re.sub(r'<[^>]+>', '', data[1][i])
		"""

		#extract features from training data
		features = defineFeatures(data)
		dump(features, 'features.joblib')
		@@ -128,7 +128,6 @@ def read_data():
		dump(X, 'X.joblib')
		dump(Y, 'Y.joblib')


		# for API
		def loadXY():
		if not os.path.isfile('X.joblib') or not os.path.isfile('Y.joblib'):
		@@ -138,3 +137,5 @@ def loadXY():
		Y = load('Y.joblib')

		return X, Y

		read_data()
		No newline at end of file

spamassassin.csv

+0 −0

File changed.

Preview suppressed by a .gitattributes entry or the file's encoding is unsupported.

View original file

View changed file