From ffc778cb241cc4433a79cf51fd0fa7c6b3b9d7ed Mon Sep 17 00:00:00 2001
From: ksiminski
Date: Wed, 26 Jun 2024 00:15:46 +0200
Subject: [PATCH] make 1: added example for train-test and train-validate-test
 cross validation data split

---
 README.md                      |   3 +-
 source/experiments/exp-001.cpp | 127 +++++++++++++++++++++++++--------
 2 files changed, 100 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index 2a6c36b..d15de3b 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,8 @@ This example presents modifiers of data:
 7. imputation of missing data with values from 3 nearest neighbours and then the imputed data set standardized
 8. a series of data modifiers: imputer with values from knn (k == 3) + standardiser, imputer with constant value, normaliser, standardiser, average imputer, imputer with constant value + normaliser, average imputer + standardiser
 9. imputer with constant value + normaliser + average imputer + standardiser
-
+10. split of a data set into _n_ chunks of train and test datasets for _n_-fold cross-validation
+11. split of a data set into _n_ chunks of train, validate, and test datasets for _n_-fold cross-validation
 
 ### make 2
 
 Run `make 2` for this example.

diff --git a/source/experiments/exp-001.cpp b/source/experiments/exp-001.cpp
index b0ea46d..6889c8c 100644
--- a/source/experiments/exp-001.cpp
+++ b/source/experiments/exp-001.cpp
@@ -1,13 +1,12 @@
 /** @file */
 
 #include <string>
-#include 
 #include <iostream>
-
-
 
 #include "../readers/reader-complete.h"
 #include "../readers/reader-incomplete.h"
+#include "../readers/train_test_model.h"
+#include "../readers/train_validation_test_model.h"
 #include "../common/data-modifier-marginaliser.h"
 #include "../common/data-modifier-standardiser.h"
 #include "../common/data-modifier-normaliser.h"
@@ -21,20 +20,22 @@
 #include "../common/data-modifier-imputer-values-from-knn.h"
 #include "../common/data-modifier-outlier-remove-sigma.h"
-
 
 #include "../experiments/exp-001.h"
-#include 
 
 
 void ksi::exp_001::execute()
 {
    try
    {
       std::string dataDir ("../data/exp-001");
-      
-      
+
       // complete data
       {
+         std::cout << std::endl;
+         std::cout << "=====================" << std::endl;
+         std::cout << "reading complete data" << std::endl;
+         std::cout << "=====================" << std::endl;
+
          std::string CompleteDataset ("/complete.data");
         ksi::reader_complete DataReader;
         auto dane = DataReader.read(dataDir + CompleteDataset);
@@ -59,12 +60,12 @@ void ksi::exp_001::execute()
            std::cout << data << std::endl;
            std::cout << std::endl;
         }
-        
+
         {
            auto data = dane;
            auto filtering_function = [] (double value) -> bool
            {
-              return value > 5; 
+              return value > 5;
            };
            std::size_t attribute {4};
            ksi::data_modifier_filter filter (attribute, filtering_function);
@@ -74,16 +75,21 @@ void ksi::exp_001::execute()
            std::cout << std::endl;
         }
      }
-     
+
      // outliers
      {
+        std::cout << std::endl;
+        std::cout << "========" << std::endl;
+        std::cout << "outliers" << std::endl;
+        std::cout << "========" << std::endl;
+
        std::string OutlierDataset ("/outliers.data");
        ksi::reader_complete DataReader;
        auto dane = DataReader.read(dataDir + OutlierDataset);
       std::cout << "original data" << std::endl;
       std::cout << dane << std::endl;
       std::cout << std::endl;
-      
+
        {
          const double n { 1 };
@@ -117,18 +123,23 @@ void ksi::exp_001::execute()
        }
      }
-     
-     
+
+
      // incomplete data
      {
+        std::cout << std::endl;
+        std::cout << "===============" << std::endl;
+        std::cout << "incomplete data" << std::endl;
+        std::cout << "===============" << std::endl;
+
        std::string IncompleteDataset ("/incomplete.data");
-       
+
        ksi::reader_incomplete DataReader;
        auto dane = DataReader.read(dataDir + IncompleteDataset);
       std::cout << "original incomplete data" << std::endl;
       std::cout << dane << std::endl;
       std::cout << std::endl;
-      
+
        {
          auto marg = dane;
          ksi::data_modifier_marginaliser marginaliser;
@@ -147,7 +158,7 @@ void ksi::exp_001::execute()
          std::cout << marg << std::endl;
          std::cout << std::endl;
        }
-      
+
        {
          auto marg = dane;
          ksi::data_modifier_imputer_average imputer;
@@ -156,7 +167,7 @@ void ksi::exp_001::execute()
          std::cout << marg << std::endl;
          std::cout << std::endl;
        }
-      
+
        {
          auto marg = dane;
          ksi::data_modifier_imputer_median imputer;
@@ -165,7 +176,7 @@ void ksi::exp_001::execute()
          std::cout << marg << std::endl;
          std::cout << std::endl;
        }
-      
+
        {
          auto marg = dane;
          int k = 3;
@@ -175,7 +186,7 @@ void ksi::exp_001::execute()
          std::cout << marg << std::endl;
          std::cout << std::endl;
        }
-      
+
        {
          auto marg = dane;
          int k = 3;
@@ -185,7 +196,7 @@ void ksi::exp_001::execute()
          std::cout << marg << std::endl;
          std::cout << std::endl;
        }
-      
+
        {
          auto marg = dane;
          int k = 3;
@@ -195,10 +206,15 @@ void ksi::exp_001::execute()
          std::cout << marg << std::endl;
          std::cout << std::endl;
        }
-      
-      
+
+
        // chains of modifiers:
        {
+         std::cout << std::endl;
+         std::cout << "===================" << std::endl;
+         std::cout << "chains of modifiers" << std::endl;
+         std::cout << "===================" << std::endl;
+
          auto marg = dane;
          int k = 3;
          ksi::data_modifier_imputer_values_from_knn imputer (k);
@@ -209,33 +225,86 @@ void ksi::exp_001::execute()
          std::cout << marg << std::endl;
          std::cout << std::endl;
          std::cout << imputer.print() << std::endl;
-         
        }
-      
        {
+         std::cout << std::endl;
+         std::cout << "============================" << std::endl;
+         std::cout << "imputation of missing values" << std::endl;
+         std::cout << "============================" << std::endl;
+
          auto marg = dane;
-         
          ksi::data_modifier_imputer dm1;
          ksi::data_modifier_normaliser dm2;
          ksi::data_modifier_standardiser dm3;
          ksi::data_modifier_imputer_average dm4;
-         
          std::cout << dm1.print() << std::endl;
          std::cout << dm2.print() << std::endl;
          std::cout << dm3.print() << std::endl;
          std::cout << dm4.print() << std::endl;
-         
          dm1.addModifier(dm2);
          std::cout << dm1.print() << std::endl;
          dm4.addModifier(dm3);
          std::cout << dm4.print() << std::endl;
          dm1.addModifier(dm4);
-         
          dm1.modify(marg);
-         
          std::cout << marg << std::endl;
          std::cout << std::endl;
          std::cout << dm1.print() << std::endl;
+      }
+      {
+         std::cout << std::endl;
+         std::cout << "==================================" << std::endl;
+         std::cout << "split into train and test datasets" << std::endl;
+         std::cout << "==================================" << std::endl;
+
+         {
+            std::cout << std::endl;
+            std::cout << "train and test cross validation" << std::endl;
+            std::cout << std::endl;
+
+            std::string CompleteDataset ("/complete.data");
+            ksi::reader_complete cr;
+            ksi::train_test_model tt(cr);
+
+            const int chunks { 5 };
+            tt.read_and_split_file(dataDir + CompleteDataset, chunks);
+
+            int counter { 1 };
+            for (const auto & [train, test] : tt)
+            {
+               std::cout << "chunk: " << counter++ << ", train dataset size: " << train.size() << ", test data set size: " << test.size() << std::endl;
+            }
+         }
+
+         {
+            std::cout << std::endl;
+            std::cout << "train, validate, and test cross validation" << std::endl;
+            std::cout << std::endl;
+
+            std::string CompleteDataset ("/complete.data");
+            ksi::reader_complete cr;
+            ksi::train_validation_test_model tvt(cr);
+
+            const int chunks { 5 };
+            tvt.read_and_split_file(dataDir + CompleteDataset, chunks);
+
+            int counter { 1 };
+            for (const auto & [train, validate, test] : tvt)
+            {
+               std::cout << "chunk: " << counter++ << ", train dataset size: " << train.size() << ", validation data set size: " << validate.size() << ", test data set size: " << test.size() << std::endl;
+            }
+         }
+
+
+
+
       }
    }
 }
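
Note (not part of the patch): the sketch below is a minimal standalone driver showing how the split models introduced above could be used outside ksi::exp_001::execute(). It is an assumption-laden sketch, not library documentation: it relies only on the calls visible in this diff (a model constructed from a reader, read_and_split_file(path, chunks), and range-for iteration over the resulting splits); the include paths, data file path, and chunk count are illustrative.

#include <iostream>
#include <string>

#include "../readers/reader-complete.h"
#include "../readers/train_test_model.h"
#include "../readers/train_validation_test_model.h"

int main ()
{
   // illustrative data file location and number of cross-validation folds
   const std::string data_file { "../data/exp-001/complete.data" };
   const int chunks { 5 };

   {
      // n-fold train/test split: each iteration yields one train/test pair
      ksi::reader_complete cr;
      ksi::train_test_model tt (cr);
      tt.read_and_split_file(data_file, chunks);

      for (const auto & [train, test] : tt)
         std::cout << train.size() << " " << test.size() << std::endl;
   }

   {
      // n-fold train/validate/test split: each iteration yields one triple
      ksi::reader_complete cr;
      ksi::train_validation_test_model tvt (cr);
      tvt.read_and_split_file(data_file, chunks);

      for (const auto & [train, validate, test] : tvt)
         std::cout << train.size() << " " << validate.size() << " " << test.size() << std::endl;
   }

   return 0;
}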