from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from imblearn.datasets import fetch_datasets

from imblearn.under_sampling import (
    RandomUnderSampler,
    CondensedNearestNeighbour,
    TomekLinks,
    OneSidedSelection,
    EditedNearestNeighbours,
    RepeatedEditedNearestNeighbours,
    AllKNN,
    NeighbourhoodCleaningRule,
    NearMiss,
    InstanceHardnessThreshold
)


# Registry of the under-sampling methods to compare.
# Each key is a short label; the benchmark loop prints it and uses it as the
# key under which that method's score and resampled size are stored.
# Every sampler keeps sampling_strategy='auto'; the ones that accept
# n_jobs run with 4 workers, and the ones that accept random_state are
# seeded with 0 so that repeated runs use the same seed.
undersampler_dict = {

    # --- random selection -------------------------------------------------
    'random': RandomUnderSampler(
        replacement=False,
        random_state=0,
        sampling_strategy='auto',
    ),

    # --- condensing / cleaning methods ------------------------------------
    'cnn': CondensedNearestNeighbour(
        n_neighbors=1,
        n_jobs=4,
        random_state=0,
        sampling_strategy='auto',
    ),

    'tomek': TomekLinks(
        n_jobs=4,
        sampling_strategy='auto',
    ),

    'oss': OneSidedSelection(
        n_neighbors=1,
        n_jobs=4,
        random_state=0,
        sampling_strategy='auto',
    ),

    # --- edited-nearest-neighbour family (3 neighbours, kind_sel='all') ---
    'enn': EditedNearestNeighbours(
        kind_sel='all',
        n_neighbors=3,
        n_jobs=4,
        sampling_strategy='auto',
    ),

    'renn': RepeatedEditedNearestNeighbours(
        kind_sel='all',
        n_neighbors=3,
        max_iter=100,
        n_jobs=4,
        sampling_strategy='auto',
    ),

    'allknn': AllKNN(
        kind_sel='all',
        n_neighbors=3,
        n_jobs=4,
        sampling_strategy='auto',
    ),

    'ncr': NeighbourhoodCleaningRule(
        kind_sel='all',
        n_neighbors=3,
        threshold_cleaning=0.5,
        n_jobs=4,
        sampling_strategy='auto',
    ),

    # --- NearMiss, versions 1 and 2 ----------------------------------------
    'nm1': NearMiss(
        version=1,
        n_neighbors=3,
        n_jobs=4,
        sampling_strategy='auto',
    ),

    'nm2': NearMiss(
        version=2,
        n_neighbors=3,
        n_jobs=4,
        sampling_strategy='auto',
    ),

    # --- instance hardness, scored by a logistic regression with 3-fold CV -
    'iht': InstanceHardnessThreshold(
        estimator=LogisticRegression(random_state=0),
        cv=3,
        n_jobs=4,
        random_state=0,
        sampling_strategy='auto',
    ),
}


# Names of the benchmark datasets to evaluate; each one is used as a key
# into the collection returned by fetch_datasets().
datasets_ls = [
    'car_eval_34',
    'ecoli',
    'thyroid_sick',
    'arrhythmia',
    'ozone_level',
]


# Print the class balance of each dataset
## Task 1:
## WRITE CODE HERE:

# fetch_datasets() returns the whole collection, so call it once rather
# than once per loop iteration.
all_datasets = fetch_datasets()

for dataset in datasets_ls:
    data = all_datasets[dataset]
    print(dataset)
    # Counter gives the number of observations per target label
    print(Counter(data.target))
    print()

car_eval_34
Counter({-1: 1594, 1: 134})

ecoli
Counter({-1: 301, 1: 35})

thyroid_sick
Counter({-1: 3541, 1: 231})

arrhythmia
Counter({-1: 427, 1: 25})

ozone_level
Counter({-1: 2463, 1: 73})


# Function that trains a random forest and evaluates its performance

def run_randomForests(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training data and report ROC-AUC.

    A RandomForestClassifier (100 trees, max_depth=3, random_state=39,
    n_jobs=4) is fitted on ``X_train``/``y_train``. The ROC-AUC on the
    training set and on the test set are printed, each under its own
    'Train set' / 'Test set' heading.

    Parameters
    ----------
    X_train, X_test : feature matrices accepted by scikit-learn estimators.
    y_train, y_test : target labels for the matching feature matrix.

    Returns
    -------
    The ROC-AUC obtained on the test set (the same value that is printed).
    """
    ## Task 2:
    ## WRITE CODE HERE:
    rf = RandomForestClassifier(
        n_estimators=100, random_state=39, max_depth=3, n_jobs=4)
    rf.fit(X_train, y_train)

    print('Train set')
    # Class probabilities for the training data; column 1 is passed to
    # roc_auc_score as the score (presumably the probability of label 1,
    # given the -1/1 targets of these datasets -- confirm via rf.classes_).
    train_proba = rf.predict_proba(X_train)
    train_auc = roc_auc_score(y_train, train_proba[:, 1])
    print(
        'Random Forests roc-auc: {}'.format(train_auc))

    print('Test set')
    # Same for the held-out data. The score is computed once and reused
    # for both the printout and the return value.
    test_proba = rf.predict_proba(X_test)
    test_auc = roc_auc_score(y_test, test_proba[:, 1])
    print(
        'Random Forests roc-auc: {}'.format(test_auc))

    return test_auc


# Store the results:
#   results_dict[dataset][method] -> test-set ROC-AUC
#   shapes_dict[dataset][method]  -> number of training rows used
# The method key 'full_data' holds the baseline without any resampling.
results_dict = {}
shapes_dict = {}

## Task 3:
## WRITE CODE HERE:

# fetch_datasets() returns the whole collection, so call it once rather
# than once per dataset.
all_datasets = fetch_datasets()

for dataset in datasets_ls:

    results_dict[dataset] = {}
    shapes_dict[dataset] = {}

    print(dataset)

    # load the dataset
    data = all_datasets[dataset]

    # split into a training set and a test set
    # NOTE(review): the split is not stratified; with so few minority
    # samples the class ratio may differ between train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        data.data,
        data.target,
        test_size=0.3,
        random_state=0)

    # many of the under-sampling techniques rely on KNN, so put all
    # variables on the same scale (the scaler is fitted on the train set only)
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # baseline: train on the full, un-resampled training set
    roc = run_randomForests(X_train, X_test, y_train, y_test)

    # store the results
    results_dict[dataset]['full_data'] = roc
    shapes_dict[dataset]['full_data'] = len(X_train)

    print()

    for undersampler, sampler in undersampler_dict.items():

        print(undersampler)

        # resample the training set only; the test set is left untouched
        X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

        # evaluate performance
        roc = run_randomForests(X_resampled, X_test, y_resampled, y_test)

        # store the results
        results_dict[dataset][undersampler] = roc
        shapes_dict[dataset][undersampler] = len(X_resampled)
        print()

    print()

car_eval_34
Train set
Random Forests roc-auc: 0.9765820711450264
Test set
Random Forests roc-auc: 0.9682620675579141

random
Train set
Random Forests roc-auc: 0.9963001503063939
Test set
Random Forests roc-auc: 0.9897438514134096

cnn
Train set
Random Forests roc-auc: 0.9682002467830072
Test set
Random Forests roc-auc: 0.986376160832738

tomek
Train set
Random Forests roc-auc: 0.980435542098521
Test set
Random Forests roc-auc: 0.973288090621492

oss
Train set
Random Forests roc-auc: 0.9788177918796639
Test set
Random Forests roc-auc: 0.9683130931727727

enn
Train set
Random Forests roc-auc: 0.9864826470845354
Test set
Random Forests roc-auc: 0.9644351464435147

renn
Train set
Random Forests roc-auc: 0.9958987301837685
Test set
Random Forests roc-auc: 0.9692315542402286

allknn
Train set
Random Forests roc-auc: 0.9904645972813958
Test set
Random Forests roc-auc: 0.9684151444024901

ncr
Train set
Random Forests roc-auc: 0.9799498012559451
Test set
Random Forests roc-auc: 0.9674456577201754

nm1
Train set
Random Forests roc-auc: 0.9687825182101977
Test set
Random Forests roc-auc: 0.9951525665884273

nm2
Train set
Random Forests roc-auc: 0.9695918603306741
Test set
Random Forests roc-auc: 0.9342279824471884

iht
Train set
Random Forests roc-auc: 0.9832350560758469
Test set
Random Forests roc-auc: 0.9955607715072966


ecoli
Train set
Random Forests roc-auc: 0.9920868605079132
Test set
Random Forests roc-auc: 0.9504830917874396

random
Train set
Random Forests roc-auc: 0.9940828402366864
Test set
Random Forests roc-auc: 0.9528985507246377

cnn
Train set
Random Forests roc-auc: 0.9743589743589743
Test set
Random Forests roc-auc: 0.9577294685990339

tomek
Train set
Random Forests roc-auc: 0.9924213717317166
Test set
Random Forests roc-auc: 0.9589371980676329

oss
Train set
Random Forests roc-auc: 0.9932336182336182
Test set
Random Forests roc-auc: 0.9094202898550725

enn
Train set
Random Forests roc-auc: 0.9997839239412273
Test set
Random Forests roc-auc: 0.9541062801932367

renn
Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 0.9577294685990337

allknn
Train set
Random Forests roc-auc: 0.9997763864042933
Test set
Random Forests roc-auc: 0.9625603864734299

ncr
Train set
Random Forests roc-auc: 0.9953379953379953
Test set
Random Forests roc-auc: 0.967391304347826

nm1
Train set
Random Forests roc-auc: 0.9837278106508875
Test set
Random Forests roc-auc: 0.45048309178743956

nm2
Train set
Random Forests roc-auc: 0.9792899408284024
Test set
Random Forests roc-auc: 0.6690821256038646

iht
Train set
Random Forests roc-auc: 0.9837278106508875
Test set
Random Forests roc-auc: 0.4070048309178744


thyroid_sick
Train set
Random Forests roc-auc: 0.9740873567301565
Test set
Random Forests roc-auc: 0.9616175293900413

random
Train set
Random Forests roc-auc: 0.9869708213864058
Test set
Random Forests roc-auc: 0.9606327321967133

cnn
Train set
Random Forests roc-auc: 0.9495073891625616
Test set
Random Forests roc-auc: 0.9674770726903428

tomek
Train set
Random Forests roc-auc: 0.9748379929642659
Test set
Random Forests roc-auc: 0.9617652489690404

oss
Train set
Random Forests roc-auc: 0.9757888805409467
Test set
Random Forests roc-auc: 0.9623684372499539

enn
Train set
Random Forests roc-auc: 0.9775154362937362
Test set
Random Forests roc-auc: 0.959684864898135

renn
Train set
Random Forests roc-auc: 0.9819121667805879
Test set
Random Forests roc-auc: 0.9601280236351326

allknn
Train set
Random Forests roc-auc: 0.9796782506921156
Test set
Random Forests roc-auc: 0.958084569458977

ncr
Train set
Random Forests roc-auc: 0.9764497930757099
Test set
Random Forests roc-auc: 0.9583184587923924

nm1
Train set
Random Forests roc-auc: 0.989585090234441
Test set
Random Forests roc-auc: 0.780057856835108

nm2
Train set
Random Forests roc-auc: 0.9985242030696576
Test set
Random Forests roc-auc: 0.7325660121868653

iht
Train set
Random Forests roc-auc: 0.9846517119244391
Test set
Random Forests roc-auc: 0.44677786668307995


arrhythmia
Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 0.8994140625

random
Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 0.9306640625

cnn
Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 0.9140625

tomek
Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 0.9599609375

oss
Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 0.8935546875

enn
Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 0.9267578125

renn
Train set
Random Forests roc-auc: 0.9995198079231692
Test set
Random Forests roc-auc: 0.9072265625

allknn
Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 0.9638671875

ncr
Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 0.8994140625

nm1
Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 0.6015625

nm2
Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 0.79296875

iht
Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 0.9716796875


ozone_level
Train set
Random Forests roc-auc: 0.9730903165319881
Test set
Random Forests roc-auc: 0.826061776061776

random
Train set
Random Forests roc-auc: 0.9892751479289941
Test set
Random Forests roc-auc: 0.8254826254826255

cnn
Train set
Random Forests roc-auc: 0.9530956848030019
Test set
Random Forests roc-auc: 0.8268339768339769

tomek
Train set
Random Forests roc-auc: 0.9601231128684399
Test set
Random Forests roc-auc: 0.8096525096525096

oss
Train set
Random Forests roc-auc: 0.957703304558201
Test set
Random Forests roc-auc: 0.7922779922779922

enn
Train set
Random Forests roc-auc: 0.9664294554455445
Test set
Random Forests roc-auc: 0.834041184041184

renn
Train set
Random Forests roc-auc: 0.9762014675154284
Test set
Random Forests roc-auc: 0.8323037323037322

allknn
Train set
Random Forests roc-auc: 0.9764543269230769
Test set
Random Forests roc-auc: 0.8305662805662806

ncr
Train set
Random Forests roc-auc: 0.963874593716143
Test set
Random Forests roc-auc: 0.82001287001287

nm1
Train set
Random Forests roc-auc: 0.9774408284023668
Test set
Random Forests roc-auc: 0.5876447876447877

nm2
Train set
Random Forests roc-auc: 0.9985207100591715
Test set
Random Forests roc-auc: 0.8113899613899613

iht
Train set
Random Forests roc-auc: 0.9885355029585798
Test set
Random Forests roc-auc: 0.4407335907335907


## Task 4:
## WRITE CODE HERE:
for dataset in datasets_ls:
    scores = results_dict[dataset]
    # bar chart with one bar per method; y-axis limits are left at the
    # matplotlib defaults
    ax = pd.Series(scores).plot.bar()
    ax.set_title(dataset)
    ax.set_ylabel('roc-auc')
    # red horizontal line at the score obtained with the full training set
    ax.axhline(scores['full_data'], color='r')
    plt.show()


## Task 5:
## WRITE CODE HERE:

for dataset in datasets_ls:
    scores = results_dict[dataset]
    # bar chart with one bar per method
    ax = pd.Series(scores).plot.bar()
    ax.set_title(dataset)
    ax.set_ylabel('roc-auc')
    # restrict the y-axis to the 0.8 - 1 range
    ax.set_ylim(0.8, 1)
    # red horizontal line at the score obtained with the full training set
    ax.axhline(scores['full_data'], color='r')
    plt.show()


## Task 6:
## WRITE CODE HERE:

for dataset in datasets_ls:
    # number of training rows left after each method, as a bar chart
    sizes = pd.Series(shapes_dict[dataset])
    sizes.plot(kind='bar')
    plt.title(dataset)
    plt.ylabel('Number of observations')
    plt.show()

Imbalanced data

Resampling technique

Undersampling

Fixed under-sampling

Random Under-sampling

NearMiss

Instance Hardness

Clean under-sampling

Tomek links

Oversampling

So sánh các phương pháp Under-sampling