"""
Predict whether a bank currency note is authentic, based on attributes such as the variance of the wavelet-transformed image.
The code is adapted from https://stackabuse.com/random-forest-algorithm-with-python-and-scikit-learn/
Get dataset.csv from https://drive.google.com/file/d/13nw-uRXPY8XIZQxKRNZ3yYlho-CYm_Qt/view
and save it as is_bank_currency_fake.csv next to this script (that is the path the code reads). Columns [0,4) are the features (X: x0~x3); column 4 is the class value (label).
This demo must run under conda: set up your conda env and activate it (for me, ` conda activate zxxu_conda `):
(zxxu_conda) root@BadSectorsUbun...
"""
from os import name as os_name
from os.path import dirname
from os.path import join as join_path

import numpy as np
import pandas as pd
#on error ` pip install -U scikit-learn ` under conda
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

OSN_HINT_UNIX = 'posix'
OSN_HINT_WINDOWS= 'nt'
OSN = os_name
# Resolve the CSV relative to this script so the demo runs from any working directory.
PrjDir = dirname(__file__)
dataset_path = join_path(PrjDir, "is_bank_currency_fake.csv")
dataset = pd.read_csv(dataset_path)

# Every row is one record: columns [0, 4) are the features, column 4 is the label.
X = dataset.iloc[:, :4].values
y = dataset.iloc[:, 4].values
num_recs = len(X)
print("number of records:%d" % num_recs)

# Hold out 20 percent of the records for testing. random_state is pinned so the
# split is identical on every run; change or drop it to get a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Labels are 0/1, so counting the non-zero entries counts the 1s; the rest are 0s.
num_1s_in_y_test = np.count_nonzero(y_test)
num_0s_in_y_test = len(y_test) - num_1s_in_y_test
print("number of 1s VS 0s in y_test:%d VS %d" % (num_1s_in_y_test, num_0s_in_y_test))
"""
Optional experiment: if NUM_HIJACK_OFF_1s_of_y_test is used, that many 1s in y_test are flipped to 0,
so the number of 1s in y_test decreases while the number of 0s increases.
NUM_HIJACK_OFF_1s_of_y_test = 2
assert num_1s_in_y_test > NUM_HIJACK_OFF_1s_of_y_test
num_hijack_off_1s_of_y_test = 0
for i in range(0,num_1s_in_y_test):
    if num_hijack_off_1s_of_y_test >= NUM_HIJACK_OFF_1s_of_y_test:
        break
    if y_test[i]:
        y_test[i] = 0
        num_hijack_off_1s_of_y_test += 1
"""
# Standardize the features: the scaler is fitted on the training split only and
# the same transform is then applied to the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# The label is a binary class (0/1), so fit a classifier: it predicts class
# labels directly instead of a continuous value that has to be rounded.
# n_estimators is the number of trees in the forest; it is set explicitly
# because the sklearn default changed from 10 to 100 in version 0.22.
classifier = RandomForestClassifier(n_estimators=20, random_state=0)
classifier.fit(X_train, y_train)
# The held-out test samples are now predicted, but we do not yet know how good
# the predictions are; that is measured against y_test afterwards.
y_pred = classifier.predict(X_test)
"""
>>> import numpy as np
>>> np.round([0.49])
array([0.])
>>> np.round([0.51])
array([1.])
"""
"""
https://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/
true positives (TP): These are cases in which we predicted yes (they have the disease), and they do have the disease.
true negatives (TN): We predicted no, and they don't have the disease.
True Positive Rate = When it's actually yes, how often does it predict yes = also known as "Sensitivity" or "Recall"
True Negative Rate = When it's actually no, how often does it predict no = also known as "Specificity"
FP_rate = 1 - TN_rate
Accuracy: Overall, how often is the classifier correct? (TP+TN)/total
Misclassification Rate = also known as "Error Rate": 1 - Accuracy
Precision: When it predicts yes, how often is it correct? TP/predicted_yes
#Prevalence: How often does the yes condition actually occur in our sample? actual yes/total
"""
# Collapse the predictions to hard 0/1 labels before scoring them.
y_pred_rounded = np.round(y_pred)

# For binary labels the confusion matrix is 2x2: rows are the actual classes,
# columns the predicted classes, so it unpacks as (tn, fp) / (fn, tp).
cm = confusion_matrix(y_test, y_pred_rounded)
(tn, fp), (fn, tp) = cm

# Sanity check: each row of the matrix must add up to the class counts of y_test.
assert num_0s_in_y_test == (tn + fp)
assert num_1s_in_y_test == (tp + fn)

tp_rate = float(tp) / num_1s_in_y_test
tn_rate = float(tn) / num_0s_in_y_test
print("true positives rate=%.2f, true negatives rate=%.2f" % (tp_rate, tn_rate))

# Reading the report:
#   support  - the number of occurrences of each class in y_true, where y_true
#              is the ground truth (correct) target values. If the support
#              column is unclear, try the NUM_HIJACK_OFF_1s_of_y_test
#              experiment and watch the numbers change.
#   f1-score - combines precision and the true positive rate (recall) into one
#              number (their harmonic mean).
print(classification_report(y_test, y_pred_rounded))
# Blog-page footer captured together with the script (not code):
# No comments: / Post a Comment