Loading ml.py +3 −2 Original line number Diff line number Diff line Loading @@ -2,8 +2,9 @@ X_train = [] Y_train = [] import preprocess def preprocessing(): XYList = preproces.loadXY() XYList = preprocess.loadXY() X_train = XYList[0] Y_train = XYList[1] Loading preprocess.py +41 −13 Original line number Diff line number Diff line import numpy as np import pandas as pd import re import os from collections import Counter import xml.etree.ElementTree from joblib import dump, load Loading Loading @@ -95,7 +96,7 @@ def prepare(features, data): ############################################################################################################################################## #read in training and testing data #kaggle and UCI contain the same data def read_data(): data = pd.read_csv('UCI.csv', header=None).drop(2, axis=1) data = data.drop_duplicates() Loading @@ -113,6 +114,33 @@ for i in range(len(df)): #df[1][i] = '' + xml.etree.ElementTree.fromstring(str(df[1][i])).itertext() df[1][i] = re.sub(r'<[^>]+>', '', df[1][i]) df = df.drop_duplicates() data = pd.concat([data, df]) data.columns = ['class', 'text'] #print(data.describe()) #extract features from training data features = defineFeatures(data) dump(features, 'features.joblib') #create feature matrix for training and testing data X, Y = prepare(features, data) dump(X, 'X.joblib') dump(Y, 'Y.joblib') # for API def loadXY(): if not os.path.isfile('X.joblib') or not os.path.isfile('Y.joblib'): read_data() X = load('X.joblib') Y = load('Y.joblib') return X, Y df = df.drop_duplicates() data = pd.concat([data, df]) data.columns = ['class', 'text'] Loading Loading
ml.py +3 −2 Original line number Diff line number Diff line Loading @@ -2,8 +2,9 @@ X_train = [] Y_train = [] import preprocess def preprocessing(): XYList = preproces.loadXY() XYList = preprocess.loadXY() X_train = XYList[0] Y_train = XYList[1] Loading
preprocess.py +41 −13 Original line number Diff line number Diff line import numpy as np import pandas as pd import re import os from collections import Counter import xml.etree.ElementTree from joblib import dump, load Loading Loading @@ -95,7 +96,7 @@ def prepare(features, data): ############################################################################################################################################## #read in training and testing data #kaggle and UCI contain the same data def read_data(): data = pd.read_csv('UCI.csv', header=None).drop(2, axis=1) data = data.drop_duplicates() Loading @@ -113,6 +114,33 @@ for i in range(len(df)): #df[1][i] = '' + xml.etree.ElementTree.fromstring(str(df[1][i])).itertext() df[1][i] = re.sub(r'<[^>]+>', '', df[1][i]) df = df.drop_duplicates() data = pd.concat([data, df]) data.columns = ['class', 'text'] #print(data.describe()) #extract features from training data features = defineFeatures(data) dump(features, 'features.joblib') #create feature matrix for training and testing data X, Y = prepare(features, data) dump(X, 'X.joblib') dump(Y, 'Y.joblib') # for API def loadXY(): if not os.path.isfile('X.joblib') or not os.path.isfile('Y.joblib'): read_data() X = load('X.joblib') Y = load('Y.joblib') return X, Y df = df.drop_duplicates() data = pd.concat([data, df]) data.columns = ['class', 'text'] Loading