# Saturday, March 7, 2020
#
# hello, Random Forest

"""
The task here is to predict whether a bank currency note is authentic or not based on attributes such as variance (of wavelet transformed image).

The code is tuned from https://stackabuse.com/random-forest-algorithm-with-python-and-scikit-learn/

Get dataset.csv from https://drive.google.com/file/d/13nw-uRXPY8XIZQxKRNZ3yYlho-CYm_Qt/view
and save it as is_bank_currency_fake.csv next to this script.
Columns [0,4) are the features (X: x0~x3); column 4 is the class value (label).

This demo must be run under conda: set up your conda env and activate it (for me, ` conda activate zxxu_conda `):
 (zxxu_conda) root@BadSectorsUbun...

"""

from os import name as os_name

from os.path import dirname
from os.path import join as join_path

# Values of os.name to compare OSN against for Unix-like and Windows hosts.
OSN_HINT_UNIX = 'posix'
OSN_HINT_WINDOWS = 'nt'

# os.name of the interpreter that is running this script.
OSN = os_name

import numpy as np
import pandas as pd

#on error ` pip install -U scikit-learn ` under conda
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# The CSV is looked up in the same directory as this script.
PrjDir = dirname(__file__)
dataset_path = join_path(PrjDir, "is_bank_currency_fake.csv")
dataset = pd.read_csv(dataset_path)

# Every row is kept: the first four columns (x0..x3) become the feature
# matrix X, and the fifth column (index 4) becomes the label vector y.
X, y = dataset.iloc[:, :4].values, dataset.iloc[:, 4].values

# One row of X per record in the CSV.
num_recs = len(X)
print("number of records:%d" % num_recs)



# Hold out 20% of the records for evaluation. random_state is fixed, so the
# same rows end up in the train and test sets on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Class balance of the held-out labels: every non-zero label counts as a 1,
# and whatever is left counts as a 0.
num_1s_in_y_test = np.count_nonzero(y_test)
num_0s_in_y_test = len(y_test) - num_1s_in_y_test
print("number of 1s VS 0s in y_test:%d VS %d"
      % (num_1s_in_y_test, num_0s_in_y_test))

"""
If the NUM_HIJACK_OFF_1s_of_y_test snippet below is enabled, that many 1s in y_test are flipped to 0, so the number of 1s decreases while the number of 0s increases.

NUM_HIJACK_OFF_1s_of_y_test = 2
assert num_1s_in_y_test > NUM_HIJACK_OFF_1s_of_y_test
num_hijack_off_1s_of_y_test = 0
for i in range(0,num_1s_in_y_test):
    if num_hijack_off_1s_of_y_test >= NUM_HIJACK_OFF_1s_of_y_test:
        break
    if y_test[i]:
        y_test[i] = 0
        num_hijack_off_1s_of_y_test += 1
"""

# Learn the scaling from the training rows only, then apply that same
# scaling to both the training rows and the held-out rows.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)


# Authentic-vs-fake is a two-class problem, so fit a classifier rather than
# regressing on the 0/1 labels and rounding the continuous output afterwards.
# n_estimators is the number of trees in the forest; it is set explicitly
# because the default changed from 10 to 100 in sklearn 0.22.
classifier = RandomForestClassifier(n_estimators=20, random_state=0)
classifier.fit(X_train, y_train)

# Predicted class label for each of the held-out (20 percent) samples.
# At this point we do not yet know how good the predictions are.
# predict() already returns class labels, so any later rounding is a no-op.
y_pred = classifier.predict(X_test)


"""
>>> import numpy as np
>>> np.round([0.49])
array([0.])
>>> np.round([0.51])
array([1.])
"""

"""
https://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/

true positives (TP): These are cases in which we predicted yes (they have the disease), and they do have the disease.
true negatives (TN): We predicted no, and they don't have the disease.

True Positive Rate = When it's actually yes, how often does it predict yes = also known as "Sensitivity" or "Recall"
True Negative Rate = When it's actually no, how often does it predict no = also known as "Specificity"

FP_rate = 1 - TN_rate


Accuracy: Overall, how often is the classifier correct? (TP+TN)/total
Misclassification Rate = also known as "Error Rate": 1 - Accuracy

Precision: When it predicts yes, how often is it correct? TP/predicted_yes

#Prevalence: How often does the yes condition actually occur in our sample? actual yes/total

"""
# Turn the model output into hard labels: continuous scores are rounded to
# the nearest integer, and labels that are already integral pass through.
y_pred_rounded = y_pred.round()

# Pin the label order so the matrix is always 2x2 and unpacks as
# (tn, fp, fn, tp), even if one class never shows up in this split.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_rounded, labels=[0, 1]).ravel()

# Sanity checks: each row of the confusion matrix must add up to the number
# of test samples that truly belong to that class.
assert (fp + tn) == num_0s_in_y_test
assert (fn + tp) == num_1s_in_y_test

# True positive rate (recall) and true negative rate (specificity). A class
# with no test samples has no defined rate, so print NaN instead of dividing
# by zero.
tp_rate = float(tp) / num_1s_in_y_test if num_1s_in_y_test else float("nan")
tn_rate = float(tn) / num_0s_in_y_test if num_0s_in_y_test else float("nan")
print("true positives rate=%.2f, true negatives rate=%.2f"
    % (tp_rate, tn_rate))

"""
def of support:
The support is the number of occurrences of each class in y_true.
y_true is the ground truth (correct) target values.

if you don't understand the support column, use NUM_HIJACK_OFF_1s_of_y_test


F Score (the f1-score column): This is the harmonic mean of precision and the true positive rate (recall)
"""
# Text summary of the per-class metrics for the held-out labels versus the
# rounded predictions.
report_text = classification_report(y_test, y_pred_rounded)
print(report_text)