Commit 7ab3c465 authored by mjennewine's avatar mjennewine
Browse files

dump X, Y

parent b4fa03bf
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -2,8 +2,9 @@
X_train = []
Y_train = []
import preprocess

def preprocessing():
    """Fetch the training data through the ``preprocess`` module.

    Calls ``preprocess.loadXY()`` and splits its result into ``X_train``
    (first element) and ``Y_train`` (second element).
    """
    # The stale call through the misspelled name ``preproces`` was removed:
    # that name is never defined, so it raised NameError before the correct
    # call below could run.
    XYList = preprocess.loadXY()
    # NOTE(review): there is no ``global`` statement here, so these two
    # assignments bind function-local names and leave the module-level
    # X_train / Y_train lists untouched - confirm that this is intended.
    X_train = XYList[0]
    Y_train = XYList[1]

+41 −13
Original line number Diff line number Diff line
import numpy as np
import pandas as pd
import re
import os
from collections import Counter
import xml.etree.ElementTree
from joblib import dump, load
@@ -95,7 +96,7 @@ def prepare(features, data):
##############################################################################################################################################
# Read in the training and testing data.
# The Kaggle and UCI datasets contain the same data.

def read_data():
    data = pd.read_csv('UCI.csv', header=None).drop(2, axis=1)
    data = data.drop_duplicates()
    
@@ -113,6 +114,33 @@ for i in range(len(df)):
        #df[1][i] = '' + xml.etree.ElementTree.fromstring(str(df[1][i])).itertext()
        df[1][i] = re.sub(r'<[^>]+>', '', df[1][i])
    
    df = df.drop_duplicates()
    data = pd.concat([data, df])
    data.columns = ['class', 'text']
    #print(data.describe())
    
    #extract features from training data
    features = defineFeatures(data)
    dump(features, 'features.joblib')
    
    #create feature matrix for training and testing data
    X, Y = prepare(features, data)
    dump(X, 'X.joblib')
    dump(Y, 'Y.joblib')


# for API
def loadXY():
    """Return the persisted ``(X, Y)`` pair, generating it first if needed.

    When either ``X.joblib`` or ``Y.joblib`` is missing from the current
    working directory, ``read_data()`` is called before loading
    (presumably it writes both files - verify against its definition).

    Returns:
        A 2-tuple ``(X, Y)`` holding the objects loaded from ``X.joblib``
        and ``Y.joblib``, in that order.
    """
    cache_files = ('X.joblib', 'Y.joblib')

    # all() stops at the first missing file, matching an ``or`` of the
    # individual "not isfile" checks.
    if not all(os.path.isfile(path) for path in cache_files):
        read_data()

    # Files are loaded in declaration order: X first, then Y.
    return tuple(load(path) for path in cache_files)



df = df.drop_duplicates()
data = pd.concat([data, df])
data.columns = ['class', 'text']