diff --git a/.gitignore b/.gitignore index 9539415..6dbcb5b 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,6 @@ Cargo.lock /docs/*.gz /docs/*.fdb_latexmk /docs/*.out + +# Ignore checkpoint folder +.ipynb_checkpoints/ \ No newline at end of file diff --git a/benches/matrix_benchmark.rs b/benches/matrix_benchmark.rs index a413045..e5dfe18 100644 --- a/benches/matrix_benchmark.rs +++ b/benches/matrix_benchmark.rs @@ -1,5 +1,5 @@ use criterion::{self, black_box, criterion_group, criterion_main, Criterion}; -use rustic_ml::matrix::Matrix; +use rustic_ml::data_utils::matrix::Matrix; fn benchmark_matrix_multiplication(c: &mut Criterion) { // Define matrix sizes and data diff --git a/datasets/european_cities.csv b/datasets/european_cities.csv new file mode 100644 index 0000000..7a614d6 --- /dev/null +++ b/datasets/european_cities.csv @@ -0,0 +1,25 @@ +Barcelona;Belgrade;Berlin;Brussels;Bucharest;Budapest;Copenhagen;Dublin;Hamburg;Istanbul;Kyiv;London;Madrid;Milan;Moscow;Munich;Paris;Prague;Rome;Saint Petersburg;Sofia;Stockholm;Vienna;Warsaw +0;1528.13;1497.61;1062.89;1968.42;1498.79;1757.54;1469.29;1471.78;2230.42;2391.06;1137.67;504.64;725.12;3006.93;1054.55;831.59;1353.90;856.69;2813.02;1745.55;2276.51;1347.43;1862.33 +1528.13;0;999.25;1372.59;447.34;316.41;1327.24;2145.39;1229.93;809.48;976.02;1688.97;2026.94;885.32;1710.99;773.33;1445.70;738.10;721.55;1797.75;329.46;1620.96;489.28;826.66 +1497.61;999.25;0;651.62;1293.40;689.06;354.03;1315.16;254.51;1735.01;1204.00;929.97;1867.69;840.72;1607.99;501.97;876.96;280.34;1181.67;1319.62;1318.67;810.38;523.61;516.06 +1062.89;1372.59;651.62;0;1769.69;1131.52;766.67;773.20;489.76;2178.85;1836.20;318.72;1314.30;696.61;2253.26;601.87;261.29;721.08;1171.34;1903.66;1697.83;1280.88;914.81;1159.85 +1968.42;447.34;1293.40;1769.69;0;639.77;1571.54;2534.72;1544.17;445.62;744.44;2088.42;2469.71;1331.46;1497.56;1186.37;1869.95;1076.82;1137.38;1740.39;296.68;1742.25;855.32;946.12 
+1498.79;316.41;689.06;1131.52;639.77;0;1011.31;1894.95;927.92;1064.76;894.29;1450.12;1975.38;788.56;1565.19;563.93;1247.61;443.26;811.11;1556.51;629.63;1316.59;216.98;545.29 +1757.54;1327.24;354.03;766.67;1571.54;1011.31;0;1238.38;287.97;2017.17;1326.33;955.13;2071.75;1157.89;1558.52;838.00;1025.90;633.05;1529.69;1143.40;1635.54;521.68;868.87;667.80 +1469.29;2145.39;1315.16;773.20;2534.72;1894.95;1238.38;0;1073.36;2950.11;2513.69;462.60;1449.96;1413.37;2792.41;1374.91;776.83;1465.61;1882.22;2314.19;2471.02;1626.56;1680.00;1823.72 +1471.78;1229.93;254.51;489.76;1544.17;927.92;287.97;1073.36;0;1983.75;1440.34;720.12;1785.33;900.01;1779.93;610.17;744.63;492.25;1307.51;1414.16;1554.82;809.65;742.79;750.49 +2230.42;809.48;1735.01;2178.85;445.62;1064.76;2017.17;2950.11;1983.75;0;1052.95;2496.39;2734.60;1669.43;1753.97;1582.16;2253.98;1507.55;1373.81;2099.29;502.61;2171.65;1273.88;1386.08 +2391.06;976.02;1204.00;1836.20;744.44;894.29;1326.33;2513.69;1440.34;1052.95;0;2131.20;2859.32;1672.69;756.61;1391.36;2022.76;1138.61;1673.74;1051.39;1020.76;1265.79;1052.76;690.12 +1137.67;1688.97;929.97;318.72;2088.42;1450.12;955.13;462.60;720.12;2496.39;2131.20;0;1263.37;957.91;2498.32;916.23;340.55;1034.57;1431.21;2093.69;2012.70;1431.07;1233.48;1445.85 +504.64;2026.94;1867.69;1314.30;2469.71;1975.38;2071.75;1449.96;1785.33;2734.60;2859.32;1263.37;0;1187.73;3437.70;1484.53;1053.40;1773.73;1360.80;3183.43;2250.10;2591.53;1807.09;2288.42 +725.12;885.32;840.72;696.61;1331.46;788.56;1157.89;1413.37;900.01;1669.43;1672.69;957.91;1187.73;0;2283.19;348.89;641.31;646.04;476.00;2122.15;1166.83;1650.12;623.36;1143.01 +3006.93;1710.99;1607.99;2253.26;1497.56;1565.19;1558.52;2792.41;1779.93;1753.97;756.61;2498.32;3437.70;2283.19;0;1957.15;2484.92;1664.04;2374.26;632.59;1777.35;1227.38;1669.22;1149.41 +1054.55;773.33;501.97;601.87;1186.37;563.93;838.00;1374.91;610.17;1582.16;1391.36;916.23;1484.53;348.89;1957.15;0;685.14;300.16;698.04;1773.83;1096.54;1311.80;354.42;809.02 
+831.59;1445.70;876.96;261.29;1869.95;1247.61;1025.90;776.83;744.63;2253.98;2022.76;340.55;1053.40;641.31;2484.92;685.14;0;885.38;1105.76;2157.99;1758.03;1541.83;1033.73;1365.91 +1353.90;738.10;280.34;721.08;1076.82;443.26;633.05;1465.61;492.25;1507.55;1138.61;1034.57;1773.73;646.04;1664.04;300.16;885.38;0;922.00;1476.73;1064.43;1052.85;250.71;514.69 +856.69;721.55;1181.67;1171.34;1137.38;811.11;1529.69;1882.22;1307.51;1373.81;1673.74;1431.21;1360.80;476.00;2374.26;698.04;1105.76;922.00;0;2339.22;894.06;1974.79;763.26;1316.24 +2813.02;1797.75;1319.62;1903.66;1740.39;1556.51;1143.40;2314.19;1414.16;2099.29;1051.39;2093.69;3183.43;2122.15;632.59;1773.83;2157.99;1476.73;2339.22;0;1969.82;688.33;1577.56;1023.41 +1745.55;329.46;1318.67;1697.83;296.68;629.63;1635.54;2471.02;1554.82;502.61;1020.76;2012.70;2250.10;1166.83;1777.35;1096.54;1758.03;1064.43;894.06;1969.82;0;1884.91;817.45;1076.99 +2276.51;1620.96;810.38;1280.88;1742.25;1316.59;521.68;1626.56;809.65;2171.65;1265.79;1431.07;2591.53;1650.12;1227.38;1311.80;1541.83;1052.85;1974.79;688.33;1884.91;0;1241.90;808.14 +1347.43;489.28;523.61;914.81;855.32;216.98;868.87;1680.00;742.79;1273.88;1052.76;1233.48;1807.09;623.36;1669.22;354.42;1033.73;250.71;763.26;1577.56;817.45;1241.90;0;557.43 +1862.33;826.66;516.06;1159.85;946.12;545.29;667.80;1823.72;750.49;1386.08;690.12;1445.85;2288.42;1143.01;1149.41;809.02;1365.91;514.69;1316.24;1023.41;1076.99;808.14;557.43;0 diff --git a/datasets/european_cities.txt b/datasets/european_cities.txt new file mode 100644 index 0000000..c19913e --- /dev/null +++ b/datasets/european_cities.txt @@ -0,0 +1,25 @@ +Barcelona Belgrade Berlin Brussels Bucharest Budapest Copenhagen Dublin Hamburg Istanbul Kyiv London Madrid Milan Moscow Munich Paris Prague Rome Saint Petersburg Sofia Stockholm Vienna Warsaw +0 1528.13 1497.61 1062.89 1968.42 1498.79 1757.54 1469.29 1471.78 2230.42 2391.06 1137.67 504.64 725.12 3006.93 1054.55 831.59 1353.90 856.69 2813.02 1745.55 2276.51 1347.43 1862.33 +1528.13 
0 999.25 1372.59 447.34 316.41 1327.24 2145.39 1229.93 809.48 976.02 1688.97 2026.94 885.32 1710.99 773.33 1445.70 738.10 721.55 1797.75 329.46 1620.96 489.28 826.66 +1497.61 999.25 0 651.62 1293.40 689.06 354.03 1315.16 254.51 1735.01 1204.00 929.97 1867.69 840.72 1607.99 501.97 876.96 280.34 1181.67 1319.62 1318.67 810.38 523.61 516.06 +1062.89 1372.59 651.62 0 1769.69 1131.52 766.67 773.20 489.76 2178.85 1836.20 318.72 1314.30 696.61 2253.26 601.87 261.29 721.08 1171.34 1903.66 1697.83 1280.88 914.81 1159.85 +1968.42 447.34 1293.40 1769.69 0 639.77 1571.54 2534.72 1544.17 445.62 744.44 2088.42 2469.71 1331.46 1497.56 1186.37 1869.95 1076.82 1137.38 1740.39 296.68 1742.25 855.32 946.12 +1498.79 316.41 689.06 1131.52 639.77 0 1011.31 1894.95 927.92 1064.76 894.29 1450.12 1975.38 788.56 1565.19 563.93 1247.61 443.26 811.11 1556.51 629.63 1316.59 216.98 545.29 +1757.54 1327.24 354.03 766.67 1571.54 1011.31 0 1238.38 287.97 2017.17 1326.33 955.13 2071.75 1157.89 1558.52 838.00 1025.90 633.05 1529.69 1143.40 1635.54 521.68 868.87 667.80 +1469.29 2145.39 1315.16 773.20 2534.72 1894.95 1238.38 0 1073.36 2950.11 2513.69 462.60 1449.96 1413.37 2792.41 1374.91 776.83 1465.61 1882.22 2314.19 2471.02 1626.56 1680.00 1823.72 +1471.78 1229.93 254.51 489.76 1544.17 927.92 287.97 1073.36 0 1983.75 1440.34 720.12 1785.33 900.01 1779.93 610.17 744.63 492.25 1307.51 1414.16 1554.82 809.65 742.79 750.49 +2230.42 809.48 1735.01 2178.85 445.62 1064.76 2017.17 2950.11 1983.75 0 1052.95 2496.39 2734.60 1669.43 1753.97 1582.16 2253.98 1507.55 1373.81 2099.29 502.61 2171.65 1273.88 1386.08 +2391.06 976.02 1204.00 1836.20 744.44 894.29 1326.33 2513.69 1440.34 1052.95 0 2131.20 2859.32 1672.69 756.61 1391.36 2022.76 1138.61 1673.74 1051.39 1020.76 1265.79 1052.76 690.12 +1137.67 1688.97 929.97 318.72 2088.42 1450.12 955.13 462.60 720.12 2496.39 2131.20 0 1263.37 957.91 2498.32 916.23 340.55 1034.57 1431.21 2093.69 2012.70 1431.07 1233.48 1445.85 +504.64 2026.94 1867.69 1314.30 2469.71 
1975.38 2071.75 1449.96 1785.33 2734.60 2859.32 1263.37 0 1187.73 3437.70 1484.53 1053.40 1773.73 1360.80 3183.43 2250.10 2591.53 1807.09 2288.42 +725.12 885.32 840.72 696.61 1331.46 788.56 1157.89 1413.37 900.01 1669.43 1672.69 957.91 1187.73 0 2283.19 348.89 641.31 646.04 476.00 2122.15 1166.83 1650.12 623.36 1143.01 +3006.93 1710.99 1607.99 2253.26 1497.56 1565.19 1558.52 2792.41 1779.93 1753.97 756.61 2498.32 3437.70 2283.19 0 1957.15 2484.92 1664.04 2374.26 632.59 1777.35 1227.38 1669.22 1149.41 +1054.55 773.33 501.97 601.87 1186.37 563.93 838.00 1374.91 610.17 1582.16 1391.36 916.23 1484.53 348.89 1957.15 0 685.14 300.16 698.04 1773.83 1096.54 1311.80 354.42 809.02 +831.59 1445.70 876.96 261.29 1869.95 1247.61 1025.90 776.83 744.63 2253.98 2022.76 340.55 1053.40 641.31 2484.92 685.14 0 885.38 1105.76 2157.99 1758.03 1541.83 1033.73 1365.91 +1353.90 738.10 280.34 721.08 1076.82 443.26 633.05 1465.61 492.25 1507.55 1138.61 1034.57 1773.73 646.04 1664.04 300.16 885.38 0 922.00 1476.73 1064.43 1052.85 250.71 514.69 +856.69 721.55 1181.67 1171.34 1137.38 811.11 1529.69 1882.22 1307.51 1373.81 1673.74 1431.21 1360.80 476.00 2374.26 698.04 1105.76 922.00 0 2339.22 894.06 1974.79 763.26 1316.24 +2813.02 1797.75 1319.62 1903.66 1740.39 1556.51 1143.40 2314.19 1414.16 2099.29 1051.39 2093.69 3183.43 2122.15 632.59 1773.83 2157.99 1476.73 2339.22 0 1969.82 688.33 1577.56 1023.41 +1745.55 329.46 1318.67 1697.83 296.68 629.63 1635.54 2471.02 1554.82 502.61 1020.76 2012.70 2250.10 1166.83 1777.35 1096.54 1758.03 1064.43 894.06 1969.82 0 1884.91 817.45 1076.99 +2276.51 1620.96 810.38 1280.88 1742.25 1316.59 521.68 1626.56 809.65 2171.65 1265.79 1431.07 2591.53 1650.12 1227.38 1311.80 1541.83 1052.85 1974.79 688.33 1884.91 0 1241.90 808.14 +1347.43 489.28 523.61 914.81 855.32 216.98 868.87 1680.00 742.79 1273.88 1052.76 1233.48 1807.09 623.36 1669.22 354.42 1033.73 250.71 763.26 1577.56 817.45 1241.90 0 557.43 +1862.33 826.66 516.06 1159.85 946.12 545.29 667.80 1823.72 
750.49 1386.08 690.12 1445.85 2288.42 1143.01 1149.41 809.02 1365.91 514.69 1316.24 1023.41 1076.99 808.14 557.43 0 \ No newline at end of file diff --git a/examples/DataframeLab.ipynb b/examples/DataframeLab.ipynb new file mode 100644 index 0000000..01187c7 --- /dev/null +++ b/examples/DataframeLab.ipynb @@ -0,0 +1,194 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e237e057-796b-4e8a-89f8-076882ea2f9c", + "metadata": {}, + "source": [ + "# Using `rustic_ml` in a Jupyter notebook" + ] + }, + { + "cell_type": "markdown", + "id": "733b683a-a558-4596-96e2-6faca1e4c29a", + "metadata": {}, + "source": [ + "First step is to include the create to the notebook. \n", + "To get started see `README.md` on how to setup the notebook environment. \n", + "When it is installed, run `jupyter lab` to start the notebook in the browser.\n", + "Create a new notebook with the Rust option, and set the depencency to: \n", + "```rust\n", + ":dep rustic_ml = \"0.x.x\"\n", + "extern crate rustic_ml;\n", + "```\n", + "\n", + "After this, you will be able to use the libaries functionality in the following files.\n", + "\n", + "Since this is a example within the library iteself, we import the libary using the path:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f541ad4f-9472-4cf6-a787-78a54389be91", + "metadata": { + "vscode": { + "languageId": "rust" + } + }, + "outputs": [], + "source": [ + ":dep rustic_ml = { path = \"../\" }\n", + "extern crate rustic_ml;" + ] + }, + { + "cell_type": "markdown", + "id": "67ec5c36-a929-4ee0-92bb-58cdcc7d5a5b", + "metadata": {}, + "source": [ + "Next, include the function that we are going to use from the library: " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3f2c42e6-7cf4-45b5-bdfb-1228b2bce10d", + "metadata": { + "vscode": { + "languageId": "rust" + } + }, + "outputs": [], + "source": [ + "use rustic_ml::data_utils::dataframe::Dataframe;" + ] + }, + { + "cell_type": "markdown", + "id": 
"76a9d147-f8ce-4bd5-9f12-dee698b0a942", + "metadata": {}, + "source": [ + "Reading a csv file:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5d58a4d-8e12-44b8-9974-6408333cefc4", + "metadata": {}, + "outputs": [], + "source": [ + "let path = String::from(\"../datasets/european_cities.csv\");\n", + "let dataframe = Dataframe::from_csv(path).unwrap();" + ] + }, + { + "cell_type": "markdown", + "id": "152858c3-707d-4563-8e69-5f2681797399", + "metadata": {}, + "source": [ + "Run the following codeblock to see the information about the dataframe:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "11070ca7-ffcc-41bd-851e-e67c7fb89d48", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Column Name Type None Some Total Length \n", + "-----------------------------------------------------------------\n", + "Barcelona Float 0 24 24 \n", + "Belgrade Float 0 24 24 \n", + "Berlin Float 0 24 24 \n", + "Brussels Float 0 24 24 \n", + "Bucharest Float 0 24 24 \n", + "Budapest Float 0 24 24 \n", + "Copenhagen Float 0 24 24 \n", + "Dublin Float 0 24 24 \n", + "Hamburg Float 0 24 24 \n", + "Istanbul Float 0 24 24 \n", + "Kyiv Float 0 24 24 \n", + "London Float 0 24 24 \n", + "Madrid Float 0 24 24 \n", + "Milan Float 0 24 24 \n", + "Moscow Float 0 24 24 \n", + "Munich Float 0 24 24 \n", + "Paris Float 0 24 24 \n", + "Prague Float 0 24 24 \n", + "Rome Float 0 24 24 \n", + "Saint Petersburg Float 0 24 24 \n", + "Sofia Float 0 24 24 \n", + "Stockholm Float 0 24 24 \n", + "Vienna Float 0 24 24 \n", + "Warsaw Float 0 24 24 \n" + ] + }, + { + "data": { + "text/plain": [ + "()" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataframe.info()" + ] + }, + { + "cell_type": "markdown", + "id": "546e0b63-3577-456f-8a1f-83d306f0e7af", + "metadata": {}, + "source": [ + "To see the memory usage of the dataframe, we call `memory_usage()`:" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 11, + "id": "63ed09c7-6779-4fa0-a4c4-fb77d421f43a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4608" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataframe.memory_usage()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Rust", + "language": "rust", + "name": "rust" + }, + "language_info": { + "codemirror_mode": "rust", + "file_extension": ".rs", + "mimetype": "text/rust", + "name": "Rust", + "pygment_lexer": "rust", + "version": "" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/dataframe_read_from_csv.rs b/examples/dataframe_read_from_csv.rs new file mode 100644 index 0000000..33a2012 --- /dev/null +++ b/examples/dataframe_read_from_csv.rs @@ -0,0 +1,19 @@ +use rustic_ml::data_utils::dataframe::Dataframe; + +fn main() { + let path = String::from("./datasets/european_cities.csv"); + let dataframe = Dataframe::from_csv(path).unwrap(); + + // Print the info + dataframe.info(); + + // We can also get the total amount of bytes used + let total_bytes_used = dataframe.memory_usage(); + println!( + "\nMemory usage for the dataframe: {} bytes", + total_bytes_used + ); + + // Lets print the first five records with the head method: + dataframe.head(); +} diff --git a/examples/nootebook_read_dataframe.ipynb b/examples/nootebook_read_dataframe.ipynb new file mode 100644 index 0000000..7e58ccc --- /dev/null +++ b/examples/nootebook_read_dataframe.ipynb @@ -0,0 +1,183 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e237e057-796b-4e8a-89f8-076882ea2f9c", + "metadata": {}, + "source": [ + "# Using `rustic_ml` in a Jupyter notebook" + ] + }, + { + "cell_type": "markdown", + "id": "733b683a-a558-4596-96e2-6faca1e4c29a", + "metadata": {}, + "source": [ + "First step is to include the create to the notebook. 
Since this is a example within the library iteself, we import the libary using the path:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f541ad4f-9472-4cf6-a787-78a54389be91", + "metadata": { + "vscode": { + "languageId": "rust" + } + }, + "outputs": [], + "source": [ + ":dep rustic_ml = { path = \"../\" }\n", + "extern crate rustic_ml;" + ] + }, + { + "cell_type": "markdown", + "id": "67ec5c36-a929-4ee0-92bb-58cdcc7d5a5b", + "metadata": {}, + "source": [ + "Next, include the function that we are going to use from the library: " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3f2c42e6-7cf4-45b5-bdfb-1228b2bce10d", + "metadata": { + "vscode": { + "languageId": "rust" + } + }, + "outputs": [], + "source": [ + "use rustic_ml::data_utils::dataframe::Dataframe;" + ] + }, + { + "cell_type": "markdown", + "id": "76a9d147-f8ce-4bd5-9f12-dee698b0a942", + "metadata": {}, + "source": [ + "Reading a csv file:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5d58a4d-8e12-44b8-9974-6408333cefc4", + "metadata": {}, + "outputs": [], + "source": [ + "let path = String::from(\"../datasets/european_cities.csv\");\n", + "let dataframe = Dataframe::from_csv(path).unwrap();" + ] + }, + { + "cell_type": "markdown", + "id": "152858c3-707d-4563-8e69-5f2681797399", + "metadata": {}, + "source": [ + "Run the following codeblock to see the information about the dataframe:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "11070ca7-ffcc-41bd-851e-e67c7fb89d48", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Column Name Type None Some Total Length \n", + "-----------------------------------------------------------------\n", + "Barcelona Float 0 24 24 \n", + "Belgrade Float 0 24 24 \n", + "Berlin Float 0 24 24 \n", + "Brussels Float 0 24 24 \n", + "Bucharest Float 0 24 24 \n", + "Budapest Float 0 24 24 \n", + "Copenhagen Float 0 24 24 \n", + "Dublin Float 0 
24 24 \n", + "Hamburg Float 0 24 24 \n", + "Istanbul Float 0 24 24 \n", + "Kyiv Float 0 24 24 \n", + "London Float 0 24 24 \n", + "Madrid Float 0 24 24 \n", + "Milan Float 0 24 24 \n", + "Moscow Float 0 24 24 \n", + "Munich Float 0 24 24 \n", + "Paris Float 0 24 24 \n", + "Prague Float 0 24 24 \n", + "Rome Float 0 24 24 \n", + "Saint Petersburg Float 0 24 24 \n", + "Sofia Float 0 24 24 \n", + "Stockholm Float 0 24 24 \n", + "Vienna Float 0 24 24 \n", + "Warsaw Float 0 24 24 \n" + ] + }, + { + "data": { + "text/plain": [ + "()" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataframe.info()" + ] + }, + { + "cell_type": "markdown", + "id": "546e0b63-3577-456f-8a1f-83d306f0e7af", + "metadata": {}, + "source": [ + "To see the memory usage of the dataframe, we call `memory_usage()`:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "63ed09c7-6779-4fa0-a4c4-fb77d421f43a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4608" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataframe.memory_usage()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Rust", + "language": "rust", + "name": "rust" + }, + "language_info": { + "codemirror_mode": "rust", + "file_extension": ".rs", + "mimetype": "text/rust", + "name": "Rust", + "pygment_lexer": "rust", + "version": "" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/data_utils/datacolumn.rs b/src/data_utils/datacolumn.rs new file mode 100644 index 0000000..6ac0260 --- /dev/null +++ b/src/data_utils/datacolumn.rs @@ -0,0 +1,102 @@ +use std::{ + any::{type_name, Any}, + slice::Iter, +}; + +// Define a trait for DataColumn +pub trait DataColumnTrait { + type Item; // Associated type to represent the type of data (T) + + fn new(data: Vec>, name: String) -> Self + where + Self: Sized; + fn as_any(&self) -> &dyn Any; + fn get(&self, 
index: usize) -> Option<&Self::Item>; + fn size(&self) -> usize; + fn none_count(&self) -> usize; + fn some_count(&self) -> usize; + fn set(&mut self, index: usize, item: Self::Item); + fn remove(&mut self, index: usize); + fn append(&mut self, item: Self::Item); + fn reset(&mut self); + fn extract(&self) -> Vec>; + fn reset_default(&mut self) + where + Self::Item: Default; + fn iter_column(&self) -> Iter>; +} + +#[allow(dead_code)] +pub struct DataColumn { + data: Vec>, + pub name: String, + pub data_type: &'static str, +} + +impl DataColumnTrait for DataColumn { + type Item = T; + + fn new(data: Vec>, name: String) -> Self { + Self { + data, + name, + data_type: type_name::(), + } + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn get(&self, index: usize) -> Option<&T> { + if index >= self.data.len() { + return None; + } + self.data[index].as_ref() + } + + fn size(&self) -> usize { + self.data.len() + } + + fn none_count(&self) -> usize { + self.data.len() - self.data.iter().flatten().count() + } + + fn some_count(&self) -> usize { + self.data.iter().flatten().count() + } + + fn set(&mut self, index: usize, item: T) { + if index < self.data.len() { + self.data[index] = Some(item); + } + } + + fn remove(&mut self, index: usize) { + if index < self.data.len() { + self.data[index] = None; + } + } + + fn append(&mut self, item: T) { + self.data.push(Some(item)); + } + + fn reset(&mut self) { + self.data = (0..self.data.len()).map(|_| None).collect(); + } + + fn reset_default(&mut self) { + self.data = (0..self.data.len()).map(|_| Some(T::default())).collect(); + } + + fn iter_column(&self) -> Iter> { + self.data.iter() + } + + fn extract(&self) -> Vec> { + let vec: Vec<_> = self.data.iter().cloned().collect(); + return vec; + } +} diff --git a/src/data_utils/dataframe.rs b/src/data_utils/dataframe.rs new file mode 100644 index 0000000..53a699c --- /dev/null +++ b/src/data_utils/dataframe.rs @@ -0,0 +1,1024 @@ +use super::datacolumn::DataColumnTrait; +use 
crate::data_utils::datacolumn::DataColumn; +use std::fs; + +/// A enumeration type that represents different types of columns that can be present in a dataset. +/// +/// The variants of this enum are `Integer`, `Float`, `Boolean`, and +/// `Text`, which correspond to the possible data types that a column can have. This enum is used in the +/// `Dataframe` struct to infer the type of data present in each column when reading data from a file. +#[derive(PartialEq, Eq, Debug)] +pub enum ColumnType { + Integer, + Float, + Boolean, + Text, +} + +/// `DataColumnEnum` enum is used to represent different types of `DataColumn` instances. +/// +/// Each variant of the enum corresponds to a specific type of data column +/// - `IntColumn` for columns containing integer data. +/// - `FloatColumn` for columns containing floating-point data. +/// - `BoolColumn` for columns containing boolean data. +/// - `TextColumn` for columns containing text data. +#[allow(dead_code)] +pub enum DataColumnEnum { + /// Data column with i32 values + IntColumn(DataColumn), + + /// Data column with f32 values + FloatColumn(DataColumn), + + /// Data column with boolean values + BoolColumn(DataColumn), + + /// Data column with string values + TextColumn(DataColumn), +} + +/// `Dataframe` that represents a collection of columns of different data types. +/// +/// Used for managing data in an efficient way. +#[allow(dead_code)] +pub struct Dataframe { + columns: Vec, + rows_count: u32, +} + +impl Dataframe { + /// Reads data from a CSV file using a semicolon as the delimiter, and creates a `Dataframe` + /// + /// # Examples + /// + /// ```rust + /// use rustic_ml::data_utils::dataframe::Dataframe; + /// + /// let path = String::from("./datasets/european_cities.csv"); + /// let dataframe = Dataframe::from_csv(path).unwrap(); + /// ``` + /// + /// # Arguments: + /// + /// - `path`: The `path` parameter is a `String` that represents the file path to a CSV file that + /// you want to read from. 
+ /// + /// # Errors: + /// - When file is not found, path was not correct + /// + /// # Returns: + /// + /// The `from_csv` function is returning a `Result` containing either an instance of the struct it + /// belongs to (represented by `Self`) or an empty tuple `()`. + pub fn from_csv(path: String) -> Result { + Self::from_file(path, ';') + } + + /// Get the `ColumnType` of a given list of data. + /// + /// Will check the whole column, and determine its data based on what it was able to cast to. + fn infer_column_type(column_data: &[String]) -> ColumnType { + let mut is_integer = true; + let mut is_float = true; + let mut is_boolean = true; + + for value in column_data { + if value.is_empty() { + continue; // Skip empty values + } + + if is_integer && value.parse::().is_err() { + is_integer = false; + } + + if is_float && value.parse::().is_err() { + is_float = false; + } + + if is_boolean && value.parse::().is_err() { + is_boolean = false; + } + + // If none of the above parsers succeeded, it must be text + if !is_integer && !is_float && !is_boolean { + return ColumnType::Text; + } + } + + // Decide the type based on what was true + if is_integer { + ColumnType::Integer + } else if is_float { + ColumnType::Float + } else if is_boolean { + ColumnType::Boolean + } else { + ColumnType::Text + } + } + + /// Infer the column type from a vector + fn infer_column_type_from_vec(column_data: &Vec) -> ColumnType + where + T: ToString, // Ensure T can be converted to string and parsed + { + let mut is_integer = true; + let mut is_float = true; + let mut is_boolean = true; + + for value in column_data.iter() { + let value_as_string = value.to_string().trim().to_string(); + + if value_as_string.is_empty() { + continue; // Skip empty values + } + + // Check if all values can be integers + if is_integer && value_as_string.parse::().is_err() { + is_integer = false; + } + + // Check if all values can be floats + if is_float && value_as_string.parse::().is_err() { + is_float = 
false; + } + + // Check if all values can be booleans + if is_boolean && value_as_string.parse::().is_err() { + is_boolean = false; + } + + // If none of the parsing succeeded, treat the column as text + if !is_integer && !is_float && !is_boolean { + return ColumnType::Text; + } + } + + // Return the most appropriate type based on successful parsing + if is_integer { + ColumnType::Integer + } else if is_float { + ColumnType::Float + } else if is_boolean { + ColumnType::Boolean + } else { + ColumnType::Text + } + } + + /// Reads data from a file using the given delimiter, and creates a `Dataframe` + /// + /// # Examples + /// + /// ```rust + /// use rustic_ml::data_utils::dataframe::Dataframe; + /// + /// let path = String::from("./datasets/european_cities.txt"); + /// let dataframe = Dataframe::from_file(path, ' ').unwrap(); + /// ``` + /// + /// # Arguments: + /// + /// - `path`: The `path` parameter is a `String` that represents the file path to a CSV file that + /// you want to read from. + /// - 'delimiter': The delimiter that septate records + /// + /// # Errors: + /// - When file is not found, path was not correct + /// + /// # Returns: + /// + /// The `from_csv` function is returning a `Result` containing either an instance of the struct it + /// belongs to (represented by `Self`) or an empty tuple `()`. 
+ pub fn from_file(path: String, delimiter: char) -> Result { + // Read the file + let contents = match fs::read_to_string(&path) { + Ok(val) => val, + Err(_) => { + println!("ERROR: could not read csv file: {}", path); + return Err(()); + } + }; + + // Collect to a vector of lines + let csv_lines: Vec<_> = contents.lines().collect(); + + // Count how many columns there are + let column_count: usize = csv_lines[0].split(delimiter).count(); + + // Column names for the dataset + let column_names: Vec<_> = csv_lines[0].split(delimiter).collect(); + + // Get the data types for each column and initialize each column + let columns_with_data: Vec<_> = csv_lines[1].split(delimiter).collect(); + + // Create the vector of column data + let mut dataframe_columns: Vec = Vec::with_capacity(column_count); + + for (index, _) in columns_with_data.iter().enumerate() { + // Gather all data in this column as a vector of items + let column_data: Vec<_> = csv_lines + .iter() + .skip(1) + .map(|line| { + let values = line.split(delimiter).collect::>(); + values[index].trim().to_string() // Trim the value + }) + .collect(); + + // Get the column type + let column_type = Self::infer_column_type(&column_data); + + match column_type { + ColumnType::Integer => { + // Collect all data for the given column + let data_vec: Vec> = csv_lines + .iter() + .skip(1) + .map(|line| { + let value = line.split(delimiter).collect::>()[index]; + match value.parse::() { + Ok(parsed_val) => Some(parsed_val), + Err(_) => None, // Handle non-integer values as None + } + }) + .collect(); + + let new_column = DataColumn::new(data_vec, column_names[index].to_owned()); + dataframe_columns.push(DataColumnEnum::IntColumn(new_column)); + } + ColumnType::Float => { + let data_vec: Vec> = csv_lines + .iter() + .skip(1) + .map(|line| { + let value = line.split(delimiter).collect::>()[index]; + match value.parse::() { + Ok(parsed_val) => Some(parsed_val), + Err(_) => None, // Handle non-float values as None + } + }) + 
.collect(); + + let new_column = DataColumn::new(data_vec, column_names[index].to_owned()); + dataframe_columns.push(DataColumnEnum::FloatColumn(new_column)); + } + ColumnType::Boolean => { + let data_vec: Vec> = csv_lines + .iter() + .skip(1) + .map(|line| { + let value = line.split(delimiter).collect::>()[index]; + match value.parse::() { + Ok(parsed_val) => Some(parsed_val), + Err(_) => None, // Handle non-boolean values as None + } + }) + .collect(); + + let new_column = DataColumn::new(data_vec, column_names[index].to_owned()); + dataframe_columns.push(DataColumnEnum::BoolColumn(new_column)); + } + ColumnType::Text => { + let data_vec: Vec> = csv_lines + .iter() + .skip(1) + .map(|line| { + let value = + line.split(delimiter).collect::>()[index].to_string(); + Some(value) + }) + .collect(); + + let new_column = DataColumn::new(data_vec, column_names[index].to_owned()); + dataframe_columns.push(DataColumnEnum::TextColumn(new_column)); + } + } + } + + Ok(Dataframe { + columns: dataframe_columns, + rows_count: contents.lines().count() as u32, + }) + } + + pub fn to_csv(&self, _path: String) -> Result<(), ()> { + unimplemented!() + } + + /// Get all the column names for the `Dataframe` + /// + /// # Examples + /// + /// ```rust + /// use rustic_ml::data_utils::dataframe::Dataframe; + /// + /// let path = String::from("./datasets/european_cities.csv"); + /// let dataframe = Dataframe::from_csv(path).unwrap(); + /// + /// let columns = dataframe.column_names(); + /// assert!(columns == vec!["Barcelona","Belgrade","Berlin","Brussels", "Bucharest","Budapest","Copenhagen","Dublin","Hamburg","Istanbul","Kyiv","London","Madrid","Milan","Moscow","Munich","Paris","Prague","Rome","Saint Petersburg","Sofia","Stockholm","Vienna","Warsaw"]) + /// + /// ``` + pub fn column_names(&self) -> Vec { + // Vector of all the names created with capacity + let mut names: Vec = Vec::with_capacity(self.columns.len()); + + // Loop through each column and take add the name to the vector + 
for column in &self.columns { + match column { + DataColumnEnum::IntColumn(data_column) => names.push(data_column.name.clone()), + DataColumnEnum::FloatColumn(data_column) => names.push(data_column.name.clone()), + DataColumnEnum::BoolColumn(data_column) => names.push(data_column.name.clone()), + DataColumnEnum::TextColumn(data_column) => names.push(data_column.name.clone()), + } + } + + // Return names + return names; + } + + pub fn print(&self) { + self.head(); + println!("............"); + self.tail(); + } + + pub fn print_full_table(&self) { + unimplemented!() + } + + /// Rename the column at given index to a new column name + /// + /// # Example + /// + /// ```rust + /// use rustic_ml::data_utils::dataframe::Dataframe; + /// + /// let path = String::from("./datasets/european_cities.csv"); + /// let mut dataframe = Dataframe::from_csv(path).unwrap(); + /// + /// assert!(dataframe.has_column("Barcelona")); + /// assert!(!dataframe.has_column("Oslo")); + /// + /// dataframe.rename_column(0, "Oslo"); + /// assert!(dataframe.has_column("Oslo")); + /// ``` + /// + /// # Errors + /// + /// This method does not throw any error. If there is not column at given index, it does nothing. + /// Assume that given column is renamed, if a valid index is given. + pub fn rename_column(&mut self, index: usize, column_name: &str) { + if index < self.columns.len() { + match &mut self.columns[index] { + DataColumnEnum::IntColumn(data_column) => data_column.name = column_name.to_owned(), + DataColumnEnum::FloatColumn(data_column) => { + data_column.name = column_name.to_owned() + } + DataColumnEnum::BoolColumn(data_column) => { + data_column.name = column_name.to_owned() + } + DataColumnEnum::TextColumn(data_column) => { + data_column.name = column_name.to_owned() + } + } + } + } + + /// Print the first 5 rows of the `Dataframe`. + /// + /// If the `Dataframe` has less then 5 rows, then it prints the whole `Dataframe`. 
+ /// Note that the current implementation does not take the terminal width into account: every cell is padded to 15 characters. + /// + /// # Examples + /// + /// ```rust + /// use rustic_ml::data_utils::dataframe::Dataframe; + /// + /// let path = String::from("./datasets/european_cities.csv"); + /// let dataframe = Dataframe::from_csv(path).unwrap(); + /// + /// dataframe.head(); + /// ``` + /// + /// # Errors + /// + /// Never reports an error. If the dataframe is empty, it prints the message `Dataframe is empty.` and returns. + pub fn head(&self) { + // Determine the number of rows to display (5 or fewer if not enough rows). + // The row count is taken from the first column only; it is 0 when there are no columns. + let row_count = self.columns.get(0).map_or(0, |col| match col { + DataColumnEnum::IntColumn(c) => c.size(), + DataColumnEnum::FloatColumn(c) => c.size(), + DataColumnEnum::BoolColumn(c) => c.size(), + DataColumnEnum::TextColumn(c) => c.size(), + }); + + let rows_to_display = usize::min(5, row_count); + + if row_count == 0 { + println!("Dataframe is empty."); + return; + } + + // Print column headers (names), each left-aligned in a 15-character cell + for column in &self.columns { + match column { + DataColumnEnum::IntColumn(c) => print!("{:<15}", c.name), + DataColumnEnum::FloatColumn(c) => print!("{:<15}", c.name), + DataColumnEnum::BoolColumn(c) => print!("{:<15}", c.name), + DataColumnEnum::TextColumn(c) => print!("{:<15}", c.name), + } + } + println!(); + + // Print separator: one 15-character cell per column, an underscore padded with dashes + for _ in &self.columns { + print!("{:-<15}", "_"); + } + println!(); + + // Print the rows; a cell for which `get` yields nothing is shown as the text "None" + for row_idx in 0..rows_to_display { + for column in &self.columns { + match column { + DataColumnEnum::IntColumn(c) => { + if let Some(value) = c.get(row_idx) { + print!("{:<15}", value); + } else { + print!("{:<15}", "None"); + } + } + DataColumnEnum::FloatColumn(c) => { + if let Some(value) = c.get(row_idx) { + print!("{:<15}", value); + } else { + print!("{:<15}", "None"); + } + } + DataColumnEnum::BoolColumn(c) => { + if let Some(value) = c.get(row_idx) { + print!("{:<15}", value); + } else { + print!("{:<15}", "None"); + } + } + DataColumnEnum::TextColumn(c) => { + 
if let Some(value) = c.get(row_idx) { + print!("{:<15}", value); + } else { + print!("{:<15}", "None"); + } + } + } + } + println!(); // Move to the next line after each row + } + } + + /// Print the last 5 rows of the `Dataframe`. + /// + /// If the `Dataframe` has fewer than 5 rows, then it is meant to print the whole `Dataframe`. + /// + /// # Panics + /// + /// Not implemented yet: the body is `unimplemented!()`, so every call panics. + pub fn tail(&self) { + unimplemented!() + } + + /// Prints information about the columns in the `Dataframe`. + /// + /// Prints a header line, a divider, and then one line per column with the following information: + /// - column name + /// - type (`Integer`, `Float`, `Boolean` or `Text`) + /// - number of `None` values + /// - number of `Some` values + /// - total number of entries in the column + pub fn info(&self) { + // Print table headers + println!( + "{:<20} {:<10} {:<10} {:<15} {:<15}", + "Column Name", "Type", "None", "Some", "Total Length" + ); + println!("{:-<65}", ""); // Divider line: 65 dashes + + // Iterate through each column and print the info + for column in &self.columns { + match column { + DataColumnEnum::IntColumn(col) => { + println!( + "{:<20} {:<10} {:<10} {:<15} {:<15}", + col.name, // Column name + "Integer", // Type + col.none_count(), // None values count + col.some_count(), // Some values count + col.size() // Total length + ); + } + DataColumnEnum::FloatColumn(col) => { + println!( + "{:<20} {:<10} {:<10} {:<15} {:<15}", + col.name, + "Float", + col.none_count(), + col.some_count(), + col.size() + ); + } + DataColumnEnum::BoolColumn(col) => { + println!( + "{:<20} {:<10} {:<10} {:<15} {:<15}", + col.name, + "Boolean", + col.none_count(), + col.some_count(), + col.size() + ); + } + DataColumnEnum::TextColumn(col) => { + println!( + "{:<20} {:<10} {:<10} {:<15} {:<15}", + col.name, + "Text", + col.none_count(), + col.some_count(), + col.size() + ); + } + } + } + } + + /// Calculate the total memory used for the `Dataframe` + /// + /// # Example + /// + /// ```rust + /// use rustic_ml::data_utils::dataframe::Dataframe; + /// + /// let path = String::from("./datasets/european_cities.csv"); + /// let dataframe 
= Dataframe::from_csv(path).unwrap(); + /// assert!(dataframe.memory_usage() == 4608); + /// ``` + /// + /// # Returns + /// + /// The total memory usage of all columns in the `Dataframe` in bytes. + /// Integer, float and boolean columns count `size()` times the size of one optional cell; + /// text columns add the heap capacity of every string on top of the cell size. + pub fn memory_usage(&self) -> usize { + let mut total_memory: usize = 0; + + for column in &self.columns { + let column_memory = match column { + DataColumnEnum::IntColumn(col) => { + let memory = col.size() * (size_of::>()); + memory + } + DataColumnEnum::FloatColumn(col) => { + let memory = col.size() * (size_of::>()); + memory + } + DataColumnEnum::BoolColumn(col) => { + let memory = col.size() * (size_of::>()); + memory + } + DataColumnEnum::TextColumn(col) => { + let memory: usize = col + .iter_column() + .map(|opt| { + match opt { + Some(s) => size_of::>() + s.capacity(), // the cell itself plus the string's heap allocation + None => size_of::>(), // just the size of the empty cell + } + }) + .sum(); + // NOTE(review): the next statement prints the name and size of every text column to stdout as a side effect; it looks like leftover debug output — confirm before relying on it. + println!("{:<15} {:<15}", col.name, memory); + memory + } + }; + + total_memory += column_memory; + } + + total_memory + } + + /// Check if the `Dataframe` has rows. + /// + /// # Example + /// + /// ```rust + /// use rustic_ml::data_utils::dataframe::Dataframe; + /// + /// let path = String::from("./datasets/european_cities.csv"); + /// let dataframe = Dataframe::from_csv(path).unwrap(); + /// + /// assert!(dataframe.has_rows()); + /// ``` + /// + /// # Returns + /// Returns true if at least one column holds at least one entry; entries that are `None` still count as rows. 
+ pub fn has_rows(&self) -> bool { + for column in &self.columns { + match column { + DataColumnEnum::IntColumn(data_column) => { + if data_column.size() > 0 { + return true; + } + } + DataColumnEnum::FloatColumn(data_column) => { + if data_column.size() > 0 { + return true; + } + } + DataColumnEnum::BoolColumn(data_column) => { + if data_column.size() > 0 { + return true; + } + } + DataColumnEnum::TextColumn(data_column) => { + if data_column.size() > 0 { + return true; + } + } + } + } + + // No column had any rows + false + } + + /// Check if the `Dataframe` any records. + /// + /// Record is a line with no `None` values. Use + /// + /// # Example + /// + /// ```rust + /// use rustic_ml::data_utils::dataframe::Dataframe; + /// + /// let path = String::from("./datasets/european_cities.csv"); + /// let dataframe = Dataframe::from_csv(path).unwrap(); + /// + /// assert!(dataframe.has_records()); + /// ``` + /// + /// # Returns + /// Returns true if there are rows, that could be None, rows. + pub fn has_records(&self) -> bool { + for column in &self.columns { + match column { + DataColumnEnum::IntColumn(data_column) => { + if data_column.iter_column().any(|x| x.is_some()) { + return true; + } + } + DataColumnEnum::FloatColumn(data_column) => { + if data_column.iter_column().any(|x| x.is_some()) { + return true; + } + } + DataColumnEnum::BoolColumn(data_column) => { + if data_column.iter_column().any(|x| x.is_some()) { + return true; + } + } + DataColumnEnum::TextColumn(data_column) => { + if data_column.iter_column().any(|x| x.is_some()) { + return true; + } + } + } + } + + // No column had any records + false + } + + /// Check if the `Dataframe` has columns defined. 
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use rustic_ml::data_utils::dataframe::Dataframe;
+    ///
+    /// let path = String::from("./datasets/european_cities.csv");
+    /// let dataframe = Dataframe::from_csv(path).unwrap();
+    /// assert!(dataframe.has_columns());
+    /// ```
+    ///
+    /// # Returns
+    ///
+    /// Returns true if there is at least one `DataColumn`
+    pub fn has_columns(&self) -> bool {
+        // `is_empty` states the intent directly instead of comparing the length with zero.
+        !self.columns.is_empty()
+    }
+
+    /// Check if a column with the given column name exists in the `Dataframe`
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use rustic_ml::data_utils::dataframe::Dataframe;
+    ///
+    /// let path = String::from("./datasets/european_cities.csv");
+    /// let dataframe = Dataframe::from_csv(path).unwrap();
+    ///
+    /// assert!(dataframe.has_column("Barcelona"));
+    /// assert!(!dataframe.has_column("Oslo"));
+    /// ```
+    ///
+    /// # Returns
+    ///
+    /// True if there is a column, of any type, that has the given column name
+    pub fn has_column(&self, column_name: &str) -> bool {
+        self.columns.iter().any(|column| {
+            // Every variant stores its name the same way; pick it out, then compare once.
+            let name = match column {
+                DataColumnEnum::IntColumn(col) => &col.name,
+                DataColumnEnum::FloatColumn(col) => &col.name,
+                DataColumnEnum::BoolColumn(col) => &col.name,
+                DataColumnEnum::TextColumn(col) => &col.name,
+            };
+            name == column_name
+        })
+    }
+
+    /// Drop the column with the given column name
+    ///
+    /// The method reports nothing back; therefore assume that the column was removed, or that it never existed.
+ /// + /// # Example + /// ```rust + /// use rustic_ml::data_utils::dataframe::Dataframe; + /// + /// let path = String::from("./datasets/european_cities.csv"); + /// let mut dataframe = Dataframe::from_csv(path).unwrap(); + /// + /// assert!(dataframe.has_column("Barcelona")); + /// + /// dataframe.drop_column("Barcelona"); + /// assert!(!dataframe.has_column("Barcelona")); + /// ``` + /// + pub fn drop_column(&mut self, column_name: &str) { + self.columns.retain(|col| match col { + DataColumnEnum::IntColumn(int_col) => int_col.name != column_name, + DataColumnEnum::FloatColumn(float_col) => float_col.name != column_name, + DataColumnEnum::BoolColumn(bool_col) => bool_col.name != column_name, + DataColumnEnum::TextColumn(text_col) => text_col.name != column_name, + }) + } + + /// Add a new column to the `Dataframe` + /// + /// The column type is chosen by `infer_column_type_from_vec`. Every value is converted to a string + /// and parsed again as the inferred type; a value that fails to parse is stored as `None`. + /// + /// NOTE(review): the length of `list` is not compared with the existing columns (the example below + /// adds a 4-element column to a table loaded from CSV) — confirm that columns of unequal length are intended. + /// + /// # Example + /// + /// ```rust + /// use rustic_ml::data_utils::dataframe::Dataframe; + /// + /// let path = String::from("./datasets/european_cities.csv"); + /// let mut dataframe = Dataframe::from_csv(path).unwrap(); + /// + /// dataframe.add_column(vec![1, 2, 3, 4], "custom_index_column"); + /// ``` + pub fn add_column(&mut self, list: Vec, column_name: &str) { + // Infer the column type based on the list values + match Self::infer_column_type_from_vec(&list) { + ColumnType::Integer => { + // Parse each value as an integer; values that fail to parse become None + let data: Vec> = list + .iter() + .map(|value| { + value.to_string().parse::().ok() // Parse i32, return None on failure + }) + .collect(); + // Add a new integer column to the dataframe + let new_column = DataColumn::new(data, column_name.to_owned()); + self.columns.push(DataColumnEnum::IntColumn(new_column)); + } + ColumnType::Float => { + // Parse each value as a float; values that fail to parse become None + let data: Vec> = list + .iter() + .map(|value| { + value.to_string().parse::().ok() // Parse f64, return None on failure + }) + .collect(); + // Add a new float column to the dataframe + let new_column = 
DataColumn::new(data, column_name.to_owned()); + self.columns.push(DataColumnEnum::FloatColumn(new_column)); + } + ColumnType::Boolean => { + // Parse each value as a bool; values that fail to parse become None + let data: Vec> = list + .iter() + .map(|value| { + value.to_string().parse::().ok() // Parse bool, return None on failure + }) + .collect(); + // Add a new boolean column to the dataframe + let new_column = DataColumn::new(data, column_name.to_owned()); + self.columns.push(DataColumnEnum::BoolColumn(new_column)); + } + ColumnType::Text => { + // Treat all values as strings; every entry is stored as Some(text) + let data: Vec> = list + .into_iter() + .map(|value| { + Some(value.to_string()) // Convert T to string and wrap in Some + }) + .collect(); + // Add a new text column to the dataframe + let new_column = DataColumn::new(data, column_name.to_owned()); + self.columns.push(DataColumnEnum::TextColumn(new_column)); + } + }; + } + + /// Not implemented yet: calling this panics via `unimplemented!()`. + pub fn add_record(&self) { + unimplemented!() + } + + /// Get the `ColumnType` for a given column. + /// + /// # Example + /// ```rust + /// use rustic_ml::data_utils::dataframe::Dataframe; + /// use rustic_ml::data_utils::dataframe::ColumnType; + /// + /// let path = String::from("./datasets/european_cities.csv"); + /// let dataframe = Dataframe::from_csv(path).unwrap(); + /// assert!(dataframe.has_column("Barcelona")); + /// assert!(!dataframe.has_column("Oslo")); + /// + /// assert!(dataframe.get_column_type("Barcelona") == Some(ColumnType::Float)); + /// assert!(dataframe.get_column_type("Oslo") == None); + /// ``` + /// + /// # Returns + /// + /// Returns `None` if no column has the given name, otherwise the `ColumnType` of the first column with the given name. 
+ /// + pub fn get_column_type(&self, column_name: &str) -> Option { + // Scan the columns in order; the first column whose name matches decides the reported type + for column in &self.columns { + match column { + DataColumnEnum::IntColumn(data_column) => { + if data_column.name == column_name { + return Some(ColumnType::Integer); + } + } + DataColumnEnum::FloatColumn(data_column) => { + if data_column.name == column_name { + return Some(ColumnType::Float); + } + } + DataColumnEnum::BoolColumn(data_column) => { + if data_column.name == column_name { + return Some(ColumnType::Boolean); + } + } + DataColumnEnum::TextColumn(data_column) => { + if data_column.name == column_name { + return Some(ColumnType::Text); + } + } + } + } + + // No column name match, return None + None + } + + /// Extract a single float feature as a vector of optional values. + /// + /// Creates a clone of the column. Values within the vector might be None. + /// Use the column name to identify the column that will be extracted. + /// + /// # Returns + /// + /// Returns `None` if no float column has the given name; columns of other types are skipped. + pub fn float_feature(&self, column_name: &str) -> Option>> { + // Return None right away if no column of any type has the given name + if !self.has_column(column_name) { + return None; + } + + // Iterate through the columns until the correct one is found + for column in &self.columns { + match column { + DataColumnEnum::FloatColumn(float_col) => { + if float_col.name == column_name { + return Some(float_col.extract()); + } + } + _ => continue, + } + } + + // A column with that name exists, but it is not a float column + None + } + + /// Extract two float features into a single vector of optional `(first, second)` pairs. + /// + /// Creates a clone of both columns. Values within the vector might be `None`. + /// A row in the vector is `None` if either of the two values in that row is `None`. + /// Use the column names to identify the columns that will be extracted. + /// + /// + /// # Returns + /// + /// Returns the vector of optional pairs created from the two features. + /// Returns `None` if the two feature vectors are not the same length or if either of them was not found among the float columns. 
+ pub fn float_features( + &self, + first_column_name: &str, + second_column_name: &str, + ) -> Option>> { + if !self.has_column(first_column_name) || !self.has_column(second_column_name) { + return None; + } + + let mut first_column: Option>> = None; + let mut second_column: Option>> = None; + + for column in &self.columns { + match column { + DataColumnEnum::FloatColumn(float_col) => { + if float_col.name == first_column_name { + first_column = Some(float_col.extract()); + } else if float_col.name == second_column_name { + second_column = Some(float_col.extract()); + } + } + _ => continue, + } + } + + // Return None unless both names were found among the float columns. NOTE(review): when both names are equal, the `else if` above never assigns `second_column`, so asking for the same column twice yields None — confirm whether that is intended. + if first_column.is_none() || second_column.is_none() { + return None; + } + + // Merge the two columns + let merged_column: Option>> = match (first_column, second_column) { + (Some(first_vec), Some(second_vec)) => { + // Ensure the lengths of both vectors are the same + if first_vec.len() == second_vec.len() { + // Combine the two vectors element-wise + Some( + first_vec + .into_iter() + .zip(second_vec.into_iter()) + .map(|(first_opt, second_opt)| { + match (first_opt, second_opt) { + (Some(first_val), Some(second_val)) => { + Some((first_val, second_val)) + } + _ => None, // If either value is None, the whole pair is None + } + }) + .collect(), + ) + } else { + None + } + } + _ => None, + }; + merged_column + } + + /// Get the value at the given column and the given row index as a string. 
+ /// + /// + /// # Returns + /// + /// The value as a `String` or `None` if: + /// - there was no column with that name + /// - the given row index was out of bounce + /// - the value at that entry was None + pub fn at_str(&self, column_name: &str, row_index: usize) -> Option { + if self.has_column(column_name) { + for column in &self.columns { + match column { + DataColumnEnum::IntColumn(data_column) => { + if data_column.name == column_name { + return data_column.get(row_index).map(|v| v.to_string()); + } + } + DataColumnEnum::FloatColumn(data_column) => { + if data_column.name == column_name { + return data_column.get(row_index).map(|v| v.to_string()); + } + } + DataColumnEnum::BoolColumn(data_column) => { + if data_column.name == column_name { + return data_column.get(row_index).map(|v| v.to_string()); + } + } + DataColumnEnum::TextColumn(data_column) => { + if data_column.name == column_name { + return data_column.get(row_index).map(|v| v.to_string()); + } + } + } + } + } + + // No match none is returned + None + } + + pub fn at_index_str(&self) -> Option<&str> { + unimplemented!() + } +} diff --git a/src/matrix.rs b/src/data_utils/matrix.rs similarity index 100% rename from src/matrix.rs rename to src/data_utils/matrix.rs diff --git a/src/data_utils/mod.rs b/src/data_utils/mod.rs new file mode 100644 index 0000000..4982c0d --- /dev/null +++ b/src/data_utils/mod.rs @@ -0,0 +1,8 @@ +//! This module provides the core functionality for working with data structures. +//! It includes the following modules: +//! - `datacolumn`: Handles operations related to data columns. +//! - `dataframe`: Implements a data frame structure for data manipulation. +//! - `matrix`: Provides functionality for matrix operations. 
+pub mod datacolumn; +pub mod dataframe; +pub mod matrix; diff --git a/src/lib.rs b/src/lib.rs index f0ad14f..c14e1f6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,5 @@ +#![doc = include_str!("../README.md")] + pub mod activation; -pub mod matrix; +pub mod data_utils; pub mod perceptron; diff --git a/tests/test_datacolumn.rs b/tests/test_datacolumn.rs new file mode 100644 index 0000000..6691444 --- /dev/null +++ b/tests/test_datacolumn.rs @@ -0,0 +1,89 @@ +#[cfg(test)] +mod tests { + use rustic_ml::data_utils::datacolumn::{DataColumn, DataColumnTrait}; + + #[test] + fn test_new() { + let column: DataColumn = + DataColumn::new(vec![Some(1), None, Some(3)], "column_name".to_string()); + assert_eq!(column.size(), 3); + assert_eq!(column.data_type, "i32"); + } + + #[test] + fn test_size() { + let column: DataColumn = + DataColumn::new(vec![Some(1), None, Some(3)], "column_name".to_string()); + assert_eq!(column.size(), 3); + } + + #[test] + fn test_none_count() { + let column: DataColumn = + DataColumn::new(vec![Some(1), None, Some(3)], "column_name".to_string()); + assert_eq!(column.none_count(), 1); + } + + #[test] + fn test_some_count() { + let column: DataColumn = + DataColumn::new(vec![Some(1), None, Some(3)], "column_name".to_string()); + assert_eq!(column.some_count(), 2); + } + + #[test] + fn test_get() { + let column: DataColumn = + DataColumn::new(vec![Some(1), None, Some(3)], "column_name".to_string()); + assert_eq!(column.get(0), Some(&1)); + assert_eq!(column.get(1), None); + assert_eq!(column.get(2), Some(&3)); + assert_eq!(column.get(3), None); + assert_eq!(column.get(1234), None); + } + + #[test] + fn test_set() { + let mut column: DataColumn = + DataColumn::new(vec![Some(1), None, Some(3)], "column_name".to_string()); + column.set(1, 5); // Setting index 1 + assert_eq!(column.get(1), Some(&5)); + } + + #[test] + fn test_remove() { + let mut column: DataColumn = + DataColumn::new(vec![Some(1), Some(2), Some(3)], "column_name".to_string()); + 
column.remove(1); // Removing the value at index 1 + assert_eq!(column.get(1), None); + } + + #[test] + fn test_append() { + let mut column: DataColumn = + DataColumn::new(vec![Some(1), None], "column_name".to_string()); + column.append(10); + assert_eq!(column.size(), 3); + assert_eq!(column.get(2), Some(&10)); + } + + #[test] + fn test_reset() { + let mut column: DataColumn = + DataColumn::new(vec![Some(1), Some(2), Some(3)], "column_name".to_string()); + column.reset(); + + // All should be None + assert!(column.iter_column().all(|x| x.is_none())); + } + + #[test] + fn test_reset_default() { + let mut column: DataColumn = + DataColumn::new(vec![Some(1), None, Some(3)], "column_name".to_string()); + column.reset_default(); + + // All should be the default, which is Some(0) for Option + assert!(column.iter_column().all(|x| x.is_some_and(|val| val == 0))); + } +} diff --git a/tests/test_dataframe.rs b/tests/test_dataframe.rs new file mode 100644 index 0000000..2645ba5 --- /dev/null +++ b/tests/test_dataframe.rs @@ -0,0 +1,142 @@ +#[cfg(test)] +mod tests { + use rustic_ml::data_utils::dataframe::{ColumnType, Dataframe}; + + #[test] + fn test_from_csv() { + let path = String::from("./datasets/european_cities.csv"); + let dataframe = Dataframe::from_csv(path); + assert!(dataframe.is_ok()) + } + + #[test] + fn test_from_file() { + let path = String::from("./datasets/european_cities.txt"); + let dataframe = Dataframe::from_file(path, ' '); + assert!(dataframe.is_ok()) + } + + #[test] + fn test_column_names() { + let path = String::from("./datasets/european_cities.csv"); + let dataframe = Dataframe::from_csv(path); + assert!(dataframe.is_ok()); + + let columns = dataframe.unwrap().column_names(); + assert!( + columns + == vec![ + "Barcelona", + "Belgrade", + "Berlin", + "Brussels", + "Bucharest", + "Budapest", + "Copenhagen", + "Dublin", + "Hamburg", + "Istanbul", + "Kyiv", + "London", + "Madrid", + "Milan", + "Moscow", + "Munich", + "Paris", + "Prague", + "Rome", + 
"Saint Petersburg", + "Sofia", + "Stockholm", + "Vienna", + "Warsaw" + ] + ) + } + + #[test] + fn test_rename_colum() { + let path = String::from("./datasets/european_cities.csv"); + let mut dataframe = Dataframe::from_csv(path).unwrap(); + + assert!(dataframe.has_column("Barcelona")); + assert!(!dataframe.has_column("Oslo")); + + dataframe.rename_column(0, "Oslo"); + assert!(dataframe.has_column("Oslo")); + } + + #[test] + fn test_memory_usage() { + let path = String::from("./datasets/european_cities.csv"); + let dataframe = Dataframe::from_csv(path).unwrap(); + assert!( + dataframe.memory_usage() == 4608, + "Memory usage was {}", + dataframe.memory_usage() + ); + } + + #[test] + fn test_has_rows() { + let path = String::from("./datasets/european_cities.csv"); + let dataframe = Dataframe::from_csv(path).unwrap(); + assert!(dataframe.has_rows()); + } + + #[test] + fn test_has_records() { + let path = String::from("./datasets/european_cities.csv"); + let dataframe = Dataframe::from_csv(path).unwrap(); + + assert!(dataframe.has_records()); + } + + #[test] + fn test_has_columns() { + let path = String::from("./datasets/european_cities.csv"); + let dataframe = Dataframe::from_csv(path).unwrap(); + assert!(dataframe.has_columns()); + } + + #[test] + fn test_has_column() { + let path = String::from("./datasets/european_cities.csv"); + let dataframe = Dataframe::from_csv(path).unwrap(); + + assert!(dataframe.has_column("Barcelona")); + assert!(!dataframe.has_column("Oslo")); + } + + #[test] + fn test_drop_column() { + let path = String::from("./datasets/european_cities.csv"); + let mut dataframe = Dataframe::from_csv(path).unwrap(); + + assert!(dataframe.has_column("Barcelona")); + + dataframe.drop_column("Barcelona"); + assert!(!dataframe.has_column("Barcelona")); + } + + #[test] + fn test_add_column() { + let path = String::from("./datasets/european_cities.csv"); + let mut dataframe = Dataframe::from_csv(path).unwrap(); + 
assert!(!dataframe.has_column("custom_index_column")); + + dataframe.add_column(vec![1, 2, 3, 4], "custom_index_column"); + assert!(dataframe.has_column("custom_index_column")); + } + + #[test] + fn test_get_column_type() { + let path = String::from("./datasets/european_cities.csv"); + let dataframe = Dataframe::from_csv(path).unwrap(); + assert!(dataframe.has_column("Barcelona")); + assert!(!dataframe.has_column("Oslo")); + + assert!(dataframe.get_column_type("Barcelona") == Some(ColumnType::Float)); + assert!(dataframe.get_column_type("Oslo") == None); + } +} diff --git a/tests/test_matrix.rs b/tests/test_matrix.rs index 92a105f..c63bb13 100644 --- a/tests/test_matrix.rs +++ b/tests/test_matrix.rs @@ -2,7 +2,7 @@ mod tests { use std::{ops::RangeInclusive, vec}; - use rustic_ml::matrix::{Matrix, MatrixError}; + use rustic_ml::data_utils::matrix::{Matrix, MatrixError}; #[test] fn test_new_matrix_constructor() {