Commit 5cfe72f4 authored by mjennewine's avatar mjennewine
Browse files

Added additional data files

parent b9335167
Loading
Loading
Loading
Loading
+14 −1
Original line number Diff line number Diff line
SpamHunter

Defeating spam, one message at a time.



Data Sources:

https://www.kaggle.com/uciml/sms-spam-collection-dataset

https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

http://www2.aueb.gr/users/ion/data/enron-spam/

https://spamassassin.apache.org/old/publiccorpus/

UCI.csv

0 → 100644
+5574 −0

File added.

Preview size limit exceeded, changes collapsed.

enron.csv

0 → 100644
+33716 −0

File added.

Preview size limit exceeded, changes collapsed.

+5573 −0

File changed and moved.

Preview size limit exceeded, changes collapsed.

preprocess.py

0 → 100644
+95 −0
Original line number Diff line number Diff line
import numpy as np
import pandas as pd
import re
from collections import Counter

### define features from training data
### input - spam/ham training set
### returns a list of feature words
def defineFeatures(data, num_features=100):
    """Select discriminative keywords from a labeled spam/ham data set.

    Each word occurrence counts +1 for a 'ham' message and -1 for anything
    else, so strongly positive totals indicate ham words and strongly
    negative totals indicate spam words.  The num_features//2 words from
    each end of that ranking form the feature vocabulary.

    data         -- DataFrame with columns 'v1' (label) and 'v2' (message text)
    num_features -- total number of feature words to return (half ham, half spam)

    Returns a list of feature words (ham-like words first, then spam-like).
    """
    #create list of text and labels
    text = data['v2'].tolist()
    labels = data['v1'].tolist()

    #read stop words into a set for O(1) membership tests
    #(context manager guarantees the file is closed even on error)
    with open('stop_words.txt') as fin:
        stop_words = {line.strip().lower() for line in fin}

    #signed word counts: +1 per occurrence in ham, -1 per occurrence in spam
    word_count = Counter()
    for message, label in zip(text, labels):
        #collapse runs of non-alphabetic characters (apostrophes kept) into '|'
        clean = re.sub(r"[^a-zA-Z']+", '|', message)
        delta = 1 if label == 'ham' else -1
        for word in clean.lower().split('|'):
            #skip empty/single-letter tokens and stop words
            if len(word) > 1 and word not in stop_words:
                word_count[word] += delta

    #select the num_features//2 most ham-like (largest counts) and the
    #num_features//2 most spam-like (smallest counts) words as features
    features = word_count.most_common(num_features // 2)
    features.extend(word_count.most_common()[-(num_features // 2):])

    #return list of keywords
    return [item[0] for item in features]


### creates a feature vector from a message
### input - list of feature words and an sms message
### returns a feature vector of 1 if feature is found in text and 0 if feature is not
def extract(features, message):
    """Build a binary feature vector for one message.

    features -- list of feature words
    message  -- raw message text

    Returns a list the same length as features, with 1 where the feature
    word occurs in the message and 0 where it does not.
    """
    #collapse runs of non-alphabetic characters (apostrophes kept) into '|'
    clean = re.sub(r"[^a-zA-Z']+", '|', message)
    #a set makes each feature lookup O(1) instead of scanning a list
    words = set(clean.lower().split('|'))

    #1 if the feature word appears in the message, else 0
    return [1 if feature in words else 0 for feature in features]


### creates a feature matrix from a data set
### input - list of feature words and data set to extract features from
### returns a numpy matrix containing a feature vector for each message, also returns an array of correct labels
def prepare(features, data):
    """Turn a labeled data set into a feature matrix and target vector.

    features -- list of feature words (as produced by defineFeatures)
    data     -- DataFrame with columns 'v1' (label) and 'v2' (message text)

    Returns (X, y): X is an ndarray with one binary feature vector per
    message; y is an int ndarray with -1 for 'ham' and +1 for anything else.
    """
    #create list of text and labels
    text = data['v2'].tolist()
    labels = data['v1'].tolist()

    #one feature vector per message
    matrix = [extract(features, sample) for sample in text]

    #label each sample - ham = -1, spam = +1
    target = np.array([-1 if label == 'ham' else 1 for label in labels],
                      dtype=int)

    return np.array(matrix), target


##############################################################################################################################################
#read in training and testing data
# NOTE(review): header=None labels columns with integers (0, 1, ...), but
# defineFeatures/prepare index data['v1'] / data['v2'] — confirm whether
# kaggle.csv carries the 'v1'/'v2' names as its first data row or whether
# this should be header=0 instead.
# .drop(2, axis=1) removes the third column (column label 2).
data = pd.read_csv('kaggle.csv', header=None).drop(2, axis=1)


#extract features from training data
#features = defineFeatures(data)

#create feature matrix for training and testing data
#X, Y = prepare(features, data)
 No newline at end of file
Loading