diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index c85219e3..2d3e3dca 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        benchmark: [nlp, file-enc, unix50, log-analysis, max-temp, uniq-ips, media-conv]
+        benchmark: [nlp, file-enc, unix50, log-analysis, max-temp, uniq-ips, media-conv, sklearn]
 
     steps:
     - name: Checkout code
diff --git a/sklearn/.gitignore b/sklearn/.gitignore
deleted file mode 100644
index 7cad6b99..00000000
--- a/sklearn/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-tmp
-result
-inputs
-.vscode
diff --git a/sklearn/README.md b/sklearn/README.md
index f21b1f2b..2507c0ad 100644
--- a/sklearn/README.md
+++ b/sklearn/README.md
@@ -1,5 +1,5 @@
 # sklearn benchmark
-This benchmark runs a series of scripts that trains a model from sklearn (Scikit-Learn). I got the series of scripts via decomposing the sklearn source code by hand. [Original](https://github.com/scikit-learn/scikit-learn/blob/289326704e13f7a5bf4c6c594c038051e968e1fd/sklearn/linear_model/_logistic.py)
+This benchmark runs a series of scripts that trains a model from sklearn (Scikit-Learn). I got the series of scripts via decomposing the sklearn source code by hand.
 
 ## Purpose
 I think this benchmark shows two things for a system like hS - viability in AI workflows and correctness. The first is quite self explanatory. If hS can run this benchmark, then it has proven that hS can handle the task of gluing together a nontrivial ML training workflow.
@@ -8,12 +8,6 @@ The second is correctness. There is a very clear ground truth (the model trained
 ## Usage
 Running fit.sh will generate temporary files in a ./tmp folder
 
-Before running, the user need to install the packages (possibly in a
-virtual environment) by `pip install -r requirements.txt`, and make
-sure the result direcotry exists (`mkdir -p result`). Then run
-`run.sh` with appropriate environment where python is aliased to the
-correct python3 installation (a.k.a. in a virtual environment).
-
 To parallelize, we want one-vs-rest classification, where we generate multiple models. Additionally, the forest cover dataset has much more samples than it has features. This makes the Newton-Cholesky solver ideal for this task.
diff --git a/sklearn/deps.sh b/sklearn/deps.sh
new file mode 100755
index 00000000..e3ddb647
--- /dev/null
+++ b/sklearn/deps.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+export PASH_SPEC_TOP=${PASH_SPEC_TOP:-$(git rev-parse --show-toplevel --show-superproject-working-tree)}
+
+benchmark_dir="sklearn"
+
+cd "$(realpath $(dirname "$0"))"
+mkdir -p "$PASH_SPEC_TOP/report/resources/sklearn"
+mkdir -p "$PASH_SPEC_TOP/report/output/sklearn"
+
+# Currently just dumped the entire dataset, but ideally we actually download it
+
+pip install -r requirements.txt
diff --git a/sklearn/generate.sh b/sklearn/generate.sh
deleted file mode 100755
index 4cee464a..00000000
--- a/sklearn/generate.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-
-# Script to generate lines that launch individual regressors. We need n regressors for n class classifications
-n_classes=104
-for i in `seq 1 $n_classes`
-do
-    echo "\$PYTHON \$SCRIPTS/parallel.py \$MODEL \$X \$y \$C_ \$WARM_COEF \$MAX_SQ_SUM \$multiclass \$penalty $i"
-done
diff --git a/sklearn/inputs.sh b/sklearn/input.sh
similarity index 89%
rename from sklearn/inputs.sh
rename to sklearn/input.sh
index 03fe59cf..b79d0a45 100755
--- a/sklearn/inputs.sh
+++ b/sklearn/input.sh
@@ -5,6 +5,4 @@
 mkdir -p tmp
 mkdir -p result
 mkdir -p inputs
-/usr/bin/env python3 -c "from sklearn.datasets import fetch_kddcup99; fetch_kddcup99(data_home=\"inputs\", percent10=False, download_if_missing=True)"
-
-
+/usr/bin/env python3 -c "from sklearn.datasets import fetch_kddcup99; fetch_kddcup99(data_home=\"inputs\", percent10=False, download_if_missing=True)"
\ No newline at end of file
diff --git a/sklearn/inputs/covertype/samples_py3 b/sklearn/inputs/covertype/samples_py3
new file mode 100644
index 00000000..d3264d77
Binary files /dev/null and b/sklearn/inputs/covertype/samples_py3 differ
diff --git a/sklearn/inputs/covertype/targets_py3 b/sklearn/inputs/covertype/targets_py3
new file mode 100644
index 00000000..cc386098
Binary files /dev/null and b/sklearn/inputs/covertype/targets_py3 differ
diff --git a/sklearn/run b/sklearn/run
new file mode 100755
index 00000000..0919650d
--- /dev/null
+++ b/sklearn/run
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+
+import argparse
+from pathlib import Path
+import os
+import time
+from subprocess import run, PIPE
+
+parser = argparse.ArgumentParser(description="Run benchmark")
+parser = argparse.ArgumentParser(description="Run benchmark")
+parser.add_argument('--window', default=5, type=int, help='window size to run hs with')
+parser.add_argument('--target', choices=['hs-only', 'sh-only', 'both'],
+                    help='to run with sh or hs')
+parser.add_argument('--log', choices=['enable', 'disable'], default="enable",
+                    help='whether to enable logging for hs')
+
+env = os.environ.copy()
+SCRIPT_NAME = "run.sh"
+
+
+def do_sh_run(test_base: Path, output_base: Path, env: dict):
+    before = time.time()
+    print(f'Running {test_base / SCRIPT_NAME}')
+    result = run(['/bin/sh', test_base / SCRIPT_NAME], stdout=PIPE, env=env)
+    duration = time.time() - before
+    with open(output_base / "sh_time", 'w') as f:
+        f.write(f'{duration}\n')
+    os.rename(env["OUTPUT_DIR"] / "trained_model.obj", env["OUTPUT_DIR"] / "sh_trained_model.obj")
+    return result.returncode, result.stdout
+
+
+def do_hs_run(test_base: Path, output_base: Path, hs_base: Path, window: int, env: dict, log: bool):
+    cmd = [hs_base / 'pash-spec.sh', '--window', str(window)]
+    if log:
+        cmd.extend(['-d', '2'])
+    cmd.append(test_base / SCRIPT_NAME)
+    before = time.time()
+    print(f'Running {cmd}')
+    with open(output_base / 'hs_log', 'w') as log:
+        result = run(cmd, stdout=PIPE, stderr=log, env=env)
+    duration = time.time() - before
+    with open(output_base / "hs_time", 'w') as f:
+        f.write(f'{duration}\n')
+    os.rename(env["OUTPUT_DIR"] / "trained_model.obj", env["OUTPUT_DIR"] / "hs_trained_model.obj")
+    return result.returncode, result.stdout
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    test_base = Path(__file__).parent.resolve()
+    hs_base = test_base.parent.parent.parent
+
+    #######################
+    # SPECIFY ENV VARS HERE
+
+    env['TMP'] = hs_base / 'report' / 'resources' / 'sklearn'
+    env['RESULT'] = hs_base / 'report' / 'output' / 'sklearn'
+    env['OUTPUT_DIR'] = hs_base / 'report' / 'output' / 'sklearn'
+
+    #######################
+
+    bench_base = test_base.parent
+    local_name = os.sep.join(test_base.parts[-1:])
+    print(local_name)
+    output_base = hs_base / "report" / "output" / 'sklearn' / local_name
+    run_hs = False
+    run_sh = False
+    if args.target in ["hs-only", "both"]:
+        run_hs = True
+    if args.target in ["sh-only", "both"]:
+        run_sh = True
+    if not run_hs and not run_sh:
+        raise("Not running anything, add --target argument")
+    output_base.mkdir(parents=True, exist_ok=True)
+
+
+    if run_sh:
+        output_sh = do_sh_run(test_base, output_base, env)
+    if run_hs:
+        output_hs = do_hs_run(test_base, output_base, hs_base, args.window, env, args.log == 'enable')
+    if run_sh and run_hs:
+        with open(output_base / 'error', 'w') as errf:
+            print(output_sh[:100])
+            if output_sh == output_hs:
+                errf.write('')
+            else:
+                errf.write('error\n')
+                errf.write(f'return code {output_sh[0]} vs {output_hs[0]}\n')
+                errf.write(f'==== output sh ====\n')
+                errf.write(output_sh[1].decode('UTF-8'))
+                errf.write(f'==== output hs ====\n')
+                errf.write(output_hs[1].decode('UTF-8'))
+
+
diff --git a/sklearn/run.sh b/sklearn/run.sh
index 261a7bf1..a3c57c8c 100755
--- a/sklearn/run.sh
+++ b/sklearn/run.sh
@@ -1,10 +1,10 @@
 #!/bin/bash
 
-set -e
-
-PYTHON=${PYTHON:-`which python3`}
+PYTHON="python3"
 OUT=${OUT:-$PWD/result}
 TMP=${TMP:-$PWD/tmp}
+#export tmp to env
+export TMP
 SCRIPTS=${SCRIPTS:-$PWD/scripts}
 
 # Ideally, we'll move on to piping rather than writing to a file
@@ -17,6 +17,18 @@
 MAX_SQ_SUM=$TMP/max_squared_sum.obj
 WARM_COEF=$TMP/warm_start_coef.obj
 C_=$TMP/C_.obj
 
+echo $PYTHON >&2
+echo "DIR: $DIR" >&2
+echo "SCRIPTS: $SCRIPTS" >&2
+echo "MODEL: $MODEL" >&2
+echo "X: $X" >&2
+echo "y: $y" >&2
+echo "CLASSES: $CLASSES" >&2
+echo "DUAL: $DUAL" >&2
+echo "MAX_SQ_SUM: $MAX_SQ_SUM" >&2
+echo "WARM_COEF: $WARM_COEF" >&2
+echo "C_: $C_" >&2
+
 # TODO: Try this out on a larger dataset
 # TODO: Benchmark each phase
@@ -31,8 +43,9 @@ $PYTHON $SCRIPTS/check_solver.py $MODEL
 penalty=$($PYTHON $SCRIPTS/penalty.py $MODEL)
 $PYTHON $SCRIPTS/val_data.py $MODEL $X $y
 $PYTHON $SCRIPTS/classes.py $MODEL $y # This should return a classes with just the unique classes in y
+echo "$PYTHON $SCRIPTS/check_multiclass.py $MODEL" >&2
 multiclass=$($PYTHON $SCRIPTS/check_multiclass.py $MODEL)
-
+echo "------" >&2
 # TODO: Benchmark each step of the pipeline
 # Make a modified pipeline where each step writes its output to a file
@@ -41,7 +54,12 @@ $PYTHON $SCRIPTS/rownorm.py $X
 n_classes=$($PYTHON $SCRIPTS/reshape_classes.py $MODEL $CLASSES)
 $PYTHON $SCRIPTS/warm_start.py $MODEL $multiclass $n_classes # pipes coefficients
 
-# KDD Cup 99 dataset has 23 classes
+# Covtype dataset has 7 classes
+echo "WARM_COEF: $WARM_COEF" >&2
+echo "MAX_SQ_SUM: $MAX_SQ_SUM" >&2
+
+echo "multiclass: $multiclass" >&2
+echo "penalty: $penalty" >&2
 $PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 1
 $PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 2
 $PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 3
@@ -49,22 +67,6 @@ $PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass
 $PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 5
 $PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 6
 $PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 7
-$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 8
-$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 9
-$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 10
-$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 11
-$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 12
-$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 13
-$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 14
-$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 15
-$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 16
-$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 17
-$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 18
-$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 19
-$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 20
-$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 21
-$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 22
-$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 23
-$PYTHON $SCRIPTS/zip_coef.py $MODEL $n_classes
-$PYTHON $SCRIPTS/adjust_coef.py $MODEL $X $multiclass $n_classes $OUT/trained_model.obj
+$PYTHON $SCRIPTS/zip_coef.py $MODEL
+$PYTHON $SCRIPTS/adjust_coef.py $MODEL $X $multiclass $n_classes $RESULT/trained_model.obj
diff --git a/sklearn/scripts/adjust_coef.py b/sklearn/scripts/adjust_coef.py
old mode 100644
new mode 100755
index dda50a4c..48a0200b
--- a/sklearn/scripts/adjust_coef.py
+++ b/sklearn/scripts/adjust_coef.py
@@ -1,9 +1,11 @@
-import os
 import sys
 import pickle
 import numpy as np
+import os
 
-with open(f'{os.environ.get("TMP","./tmp")}/fold_coef.obj', 'rb') as file:
+tmp = os.environ.get('TMP')
+filepath = os.path.join(tmp, 'fold_coef.obj')
+with open(filepath, 'rb') as file:
     fold_coefs_ = pickle.load(file)
 
 model_file, X_file, multi_class, n_classes, destination = sys.argv[1:6]
@@ -30,5 +32,6 @@ else:
     model.intercept_ = np.zeros(n_classes)
 
-with open(destination, 'wb') as file:
-    pickle.dump(model, file)
+filepath = os.path.join(tmp, 'trained_model.obj')
+with open(filepath, 'wb') as file:
+    pickle.dump(model, file)
\ No newline at end of file
diff --git a/sklearn/scripts/check_multiclass.py b/sklearn/scripts/check_multiclass.py
old mode 100644
new mode 100755
diff --git a/sklearn/scripts/check_solver.py b/sklearn/scripts/check_solver.py
old mode 100644
new mode 100755
diff --git a/sklearn/scripts/classes.py b/sklearn/scripts/classes.py
old mode 100644
new mode 100755
diff --git a/sklearn/scripts/fold_coef.py b/sklearn/scripts/fold_coef.py
old mode 100644
new mode 100755
index f2b5d450..563827ee
--- a/sklearn/scripts/fold_coef.py
+++ b/sklearn/scripts/fold_coef.py
@@ -62,6 +62,7 @@
     )
     for class_, warm_start_coef_ in zip(classes, warm_start_coef)
 )
-
-with open(f'{os.environ.get("TMP","./tmp")}/fold_coef.obj', 'w+b') as file:
-    pickle.dump(fold_coefs_, file)
+tmp = os.environ.get('TMP')
+filepath = os.path.join(tmp, 'fold_coef.obj')
+with open(filepath, 'w+b') as file:
+    pickle.dump(fold_coefs_, file)
\ No newline at end of file
diff --git a/sklearn/scripts/gen_model.py b/sklearn/scripts/gen_model.py
old mode 100644
new mode 100755
index f4b2c327..c9f10c8a
--- a/sklearn/scripts/gen_model.py
+++ b/sklearn/scripts/gen_model.py
@@ -6,5 +6,8 @@
 reg = LogisticRegression(max_iter=int(sys.argv[1]),
                          solver='newton-cholesky',
                          multi_class='ovr')
-with open(f'{os.environ.get("TMP","./tmp")}/model.obj', 'w+b') as file:
-    pickle.dump(reg, file)
+
+tmp = os.environ.get('TMP')
+filepath = os.path.join(tmp, 'model.obj')
+with open(filepath, 'w+b') as file:
+    pickle.dump(reg, file)
\ No newline at end of file
diff --git a/sklearn/scripts/gen_samples.py b/sklearn/scripts/gen_samples.py
old mode 100644
new mode 100755
index f2631f6f..31a66a6f
--- a/sklearn/scripts/gen_samples.py
+++ b/sklearn/scripts/gen_samples.py
@@ -1,22 +1,19 @@
 from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import LabelEncoder, MinMaxScaler
 from sklearn import datasets
 import pickle
-import pandas as pd
-import numpy as np
 import os
 
-X, y = datasets.fetch_kddcup99(data_home="inputs", percent10=False, return_X_y=True, as_frame=True, download_if_missing=True)
-X = pd.DataFrame(X).drop(columns=["protocol_type", "service", "flag"]).astype(float)
-X[X.columns] = MinMaxScaler().fit_transform(X[X.columns])
-X = X.to_numpy()
-y = LabelEncoder().fit_transform(y).astype(np.int32)
-data = train_test_split(X,
-                        y,
+raw_data = datasets.fetch_covtype(data_home="inputs", download_if_missing=False)
+
+data = train_test_split(raw_data.data,
+                        raw_data.target,
                         test_size=0.2,
                         random_state=0)
 
 filenames = ['X_train', 'X_test', 'y_train', 'y_test']
+tmp = os.environ.get('TMP')
+filepath = os.path.join(tmp, 'model.obj')
 for datum, name in zip(data, filenames):
-    with open(f'{os.environ.get("TMP","./tmp")}/{name}.obj', 'w+b') as file:
+    filepath = os.path.join(tmp, f'{name}.obj')
+    with open(filepath, 'w+b') as file:
         pickle.dump(datum, file)
diff --git a/sklearn/scripts/parallel.py b/sklearn/scripts/parallel.py
old mode 100644
new mode 100755
index a3e3a930..65fb50a4
--- a/sklearn/scripts/parallel.py
+++ b/sklearn/scripts/parallel.py
@@ -34,5 +34,7 @@
     sample_weight=None,
 )
 
-with open(f'{os.environ.get("TMP","./tmp")}/result_{class_}.obj', 'wb') as file:
-    pickle.dump(result, file)
+tmp = os.environ.get('TMP')
+filepath = os.path.join(tmp, f'result_{class_}.obj')
+with open(filepath, 'w+b') as file:
+    pickle.dump(result, file)
\ No newline at end of file
diff --git a/sklearn/scripts/penalty.py b/sklearn/scripts/penalty.py
old mode 100644
new mode 100755
index 7265b9e9..52ae7bbe
--- a/sklearn/scripts/penalty.py
+++ b/sklearn/scripts/penalty.py
@@ -1,8 +1,8 @@
 import warnings
 import sys
-import os
 import pickle
 import numpy as np
+import os
 
 with open(sys.argv[1], 'rb') as file:
     model = pickle.load(file)
@@ -34,6 +34,8 @@
 C_ = model.C
 penalty = model.penalty
 
-with open(f'{os.environ.get("TMP","./tmp")}/C_.obj', 'w+b') as file:
+tmp = os.environ.get('TMP')
+filepath = os.path.join(tmp, 'C_.obj')
+with open(filepath, 'w+b') as file:
     pickle.dump(C_, file)
 print(penalty)
diff --git a/sklearn/scripts/reshape_classes.py b/sklearn/scripts/reshape_classes.py
old mode 100644
new mode 100755
diff --git a/sklearn/scripts/rownorm.py b/sklearn/scripts/rownorm.py
old mode 100644
new mode 100755
index 32615c3f..5fd6f254
--- a/sklearn/scripts/rownorm.py
+++ b/sklearn/scripts/rownorm.py
@@ -1,12 +1,14 @@
 from sklearn.linear_model import _logistic
 import sys
-import os
 import pickle
+import os
 
 with open(sys.argv[1], 'rb') as file:
     X = pickle.load(file)
 
 max_squared_sum = _logistic.row_norms(X, squared=True).max()
 
-with open(f'{os.environ.get("TMP","./tmp")}/max_squared_sum.obj', 'w+b') as file:
+tmp = os.environ.get('TMP')
+filepath = os.path.join(tmp, 'max_squared_sum.obj')
+with open(filepath, 'w+b') as file:
     pickle.dump(max_squared_sum, file)
diff --git a/sklearn/scripts/val_data.py b/sklearn/scripts/val_data.py
old mode 100644
new mode 100755
diff --git a/sklearn/scripts/warm_start.py b/sklearn/scripts/warm_start.py
old mode 100644
new mode 100755
index adde91b3..14ae79ab
--- a/sklearn/scripts/warm_start.py
+++ b/sklearn/scripts/warm_start.py
@@ -1,7 +1,7 @@
 import sys
-import os
 import numpy as np
 import pickle
+import os
 
 with open(sys.argv[1], 'rb') as file:
     model = pickle.load(file)
@@ -16,13 +16,15 @@ warm_start_coef = np.append(
     warm_start_coef, model.intercept_[:, np.newaxis], axis=1
 )
-
+tmp = os.environ.get('TMP')
 if multi_class == "multinomial":
-    with open(f'{os.environ.get("TMP","./tmp")}/classes.obj', 'wb') as file:
+    filepath = os.path.join(tmp, 'classes.obj')
+    with open(filepath, 'wb') as file:
         pickle.dump([None], file)
     warm_start_coef = [warm_start_coef]
 
 if warm_start_coef is None:
     warm_start_coef = [None] * n_classes
 
-with open(f'{os.environ.get("TMP","./tmp")}/warm_start_coef.obj', 'w+b') as file:
+filepath = os.path.join(tmp, 'warm_start_coef.obj')
+with open(filepath, 'w+b') as file:
     pickle.dump(warm_start_coef, file)
diff --git a/sklearn/scripts/zip_coef.py b/sklearn/scripts/zip_coef.py
old mode 100644
new mode 100755
index bc5c1fa8..8aece5f8
--- a/sklearn/scripts/zip_coef.py
+++ b/sklearn/scripts/zip_coef.py
@@ -1,15 +1,18 @@
 import sys
-import os
 import numpy as np
 import pickle
+import os
 
 fold_coefs_ = []
 
+tmp = os.environ.get('TMP')
+
 with open(sys.argv[1], 'r+b') as file1:
     model = pickle.load(file1)
 
-    for i in range(1, int(sys.argv[2]) + 1):
-        with open(f'{os.environ.get("TMP","./tmp")}/result_{i}.obj', 'r+b') as file2:
+    for i in range(1, 8):
+        filepath = os.path.join(tmp, f'result_{i}.obj')
+        with open(filepath, 'r+b') as file2:
             fold_coefs_.append(pickle.load(file2))
 
 fold_coefs_, _, n_iter_ = zip(*fold_coefs_)
@@ -17,5 +20,6 @@
 model.n_iter = np.asarray(n_iter_, dtype=np.int32)[:, 0]
 
 pickle.dump(model, file1)
-with open(f'{os.environ.get("TMP","./tmp")}/fold_coef.obj', 'w+b') as file:
+filepath = os.path.join(tmp, 'fold_coef.obj')
+with open(filepath, 'w+b') as file:
     pickle.dump(fold_coefs_, file)
diff --git a/sklearn/verify.py b/sklearn/verify.py
index 55da0dc6..6419bb57 100755
--- a/sklearn/verify.py
+++ b/sklearn/verify.py
@@ -8,9 +8,9 @@
 max_iter = 100
 
-dataset = datasets.fetch_rcv1(data_home="inputs")
+dataset = datasets.fetch_covtype(data_home="inputs", download_if_missing=False)
 
-X_train, X_test, y_train, y_test = train_test_split(dataset.data, np.argmax(dataset.target.toarray()),
+X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target,
                                                     test_size=0.2,
                                                     random_state=0)
 
@@ -20,9 +20,9 @@ control_model.fit(X_train, y_train)
 control_score = control_model.score(X_test, y_test)
 
-with open('result/trained_model.obj', 'rb') as file:
+with open('tmp/trained_model.obj', 'rb') as file:
     experiment_model = pickle.load(file)
 
 experiment_score = experiment_model.score(X_test, y_test)
 
 assert experiment_score == control_score
-assert np.array_equal(control_model.coef_, experiment_model.coef_)
+assert np.array_equal(control_model.coef_, experiment_model.coef_)
\ No newline at end of file
diff --git a/sklearn/verify.sh b/sklearn/verify.sh
new file mode 100755
index 00000000..f6301025
--- /dev/null
+++ b/sklearn/verify.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# shell script to run verify.py
+
+# run the Python script
+python3 verify.py
+
+# check if the script ran successfully
+if [ $? -eq 0 ]; then
+    echo "verify.py ran successfully."
+else
+    echo "verify.py encountered an error."
+fi
\ No newline at end of file
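
For context on the one-vs-rest design that sklearn/README.md motivates: the seven parallel.py invocations in run.sh are independent per-class fits (the covertype dataset has 7 classes), which is what makes the workflow a candidate for parallel or speculative execution. Below is a minimal standalone sketch of that decomposition, not part of the patch itself; it assumes scikit-learn >= 1.2 (for the newton-cholesky solver) and that the covertype files are already cached under inputs/, as the committed samples_py3/targets_py3 suggest.

    # Illustrative sketch only: one-vs-rest logistic regression on covertype,
    # run sequentially here instead of through the benchmark's parallel.py steps.
    import numpy as np
    from sklearn.datasets import fetch_covtype
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    # Same dataset and split as gen_samples.py / verify.py.
    data = fetch_covtype(data_home="inputs", download_if_missing=False)
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.2, random_state=0)

    classes = np.unique(y_train)  # covertype has 7 classes
    models = []
    for c in classes:
        # One independent binary fit per class; these are the units that
        # run.sh exposes as separate parallel.py invocations.
        clf = LogisticRegression(solver="newton-cholesky", max_iter=100)
        clf.fit(X_train, y_train == c)
        models.append(clf)

    # Predict with the most confident per-class model.
    scores = np.column_stack([m.decision_function(X_test) for m in models])
    y_pred = classes[np.argmax(scores, axis=1)]
    print("OvR accuracy:", (y_pred == y_test).mean())

The benchmark performs the same seven fits via parallel.py and then reassembles the per-class coefficients into a single model with zip_coef.py and adjust_coef.py, which verify.py compares against a model trained monolithically.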