Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dataframe implementation #36

Merged
merged 22 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
57fc8b7
add: first draft of dataframe api
KjetilIN May 30, 2024
8f0b1f1
Merge branch 'main' into 21-feat-dataframe-for-reading-files
KjetilIN Sep 23, 2024
597c74b
refactor: move dataframe to own folder + added more methods to implement
KjetilIN Sep 23, 2024
cc9dac7
refactor: move matrix into data utils folder
KjetilIN Sep 24, 2024
7f24c25
add: data column struct + unit tests
KjetilIN Sep 24, 2024
27c1a41
add: setup generic column for dataframe
KjetilIN Sep 25, 2024
386004b
add: simple demo of dataframe implementation
KjetilIN Sep 26, 2024
ae4bc54
add: correct parsing of csv file
KjetilIN Sep 26, 2024
76ee448
fix: from file method for dataframe
KjetilIN Sep 26, 2024
78da13f
add: method for checking memory usage for the dataframe
KjetilIN Sep 26, 2024
3ea8668
add: docs for the dataframe
KjetilIN Sep 26, 2024
4922b44
add: rust doc for the data utils module and the main module
KjetilIN Sep 26, 2024
e22ff6a
add: docs + methods for column manipulation
KjetilIN Sep 26, 2024
e9ce350
add: column methods for dataframe + methods for checking dataframe co…
KjetilIN Sep 27, 2024
279caf5
add: method for printing the head of the dataframe
KjetilIN Sep 27, 2024
f676a6a
add: method for adding a column
KjetilIN Sep 29, 2024
a361bcb
add: unit tests for the dataframe
KjetilIN Oct 3, 2024
c6640b7
add: methods for extracting feature into a vector
KjetilIN Oct 3, 2024
e21cc18
add: method for getting a value at that position
KjetilIN Oct 3, 2024
cae6fdf
add: note book example for reading the a dataframe
KjetilIN Oct 3, 2024
6819aef
docs: clarified how to use rust in a notebook
KjetilIN Oct 3, 2024
fa8a1ca
fix: cleanup with fmt + fix unused error
KjetilIN Oct 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,6 @@ Cargo.lock
/docs/*.gz
/docs/*.fdb_latexmk
/docs/*.out

# Ignore checkpoint folder
.ipynb_checkpoints/
2 changes: 1 addition & 1 deletion benches/matrix_benchmark.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use criterion::{self, black_box, criterion_group, criterion_main, Criterion};
use rustic_ml::matrix::Matrix;
use rustic_ml::data_utils::matrix::Matrix;

fn benchmark_matrix_multiplication(c: &mut Criterion) {
// Define matrix sizes and data
Expand Down
25 changes: 25 additions & 0 deletions datasets/european_cities.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
Barcelona;Belgrade;Berlin;Brussels;Bucharest;Budapest;Copenhagen;Dublin;Hamburg;Istanbul;Kyiv;London;Madrid;Milan;Moscow;Munich;Paris;Prague;Rome;Saint Petersburg;Sofia;Stockholm;Vienna;Warsaw
0;1528.13;1497.61;1062.89;1968.42;1498.79;1757.54;1469.29;1471.78;2230.42;2391.06;1137.67;504.64;725.12;3006.93;1054.55;831.59;1353.90;856.69;2813.02;1745.55;2276.51;1347.43;1862.33
1528.13;0;999.25;1372.59;447.34;316.41;1327.24;2145.39;1229.93;809.48;976.02;1688.97;2026.94;885.32;1710.99;773.33;1445.70;738.10;721.55;1797.75;329.46;1620.96;489.28;826.66
1497.61;999.25;0;651.62;1293.40;689.06;354.03;1315.16;254.51;1735.01;1204.00;929.97;1867.69;840.72;1607.99;501.97;876.96;280.34;1181.67;1319.62;1318.67;810.38;523.61;516.06
1062.89;1372.59;651.62;0;1769.69;1131.52;766.67;773.20;489.76;2178.85;1836.20;318.72;1314.30;696.61;2253.26;601.87;261.29;721.08;1171.34;1903.66;1697.83;1280.88;914.81;1159.85
1968.42;447.34;1293.40;1769.69;0;639.77;1571.54;2534.72;1544.17;445.62;744.44;2088.42;2469.71;1331.46;1497.56;1186.37;1869.95;1076.82;1137.38;1740.39;296.68;1742.25;855.32;946.12
1498.79;316.41;689.06;1131.52;639.77;0;1011.31;1894.95;927.92;1064.76;894.29;1450.12;1975.38;788.56;1565.19;563.93;1247.61;443.26;811.11;1556.51;629.63;1316.59;216.98;545.29
1757.54;1327.24;354.03;766.67;1571.54;1011.31;0;1238.38;287.97;2017.17;1326.33;955.13;2071.75;1157.89;1558.52;838.00;1025.90;633.05;1529.69;1143.40;1635.54;521.68;868.87;667.80
1469.29;2145.39;1315.16;773.20;2534.72;1894.95;1238.38;0;1073.36;2950.11;2513.69;462.60;1449.96;1413.37;2792.41;1374.91;776.83;1465.61;1882.22;2314.19;2471.02;1626.56;1680.00;1823.72
1471.78;1229.93;254.51;489.76;1544.17;927.92;287.97;1073.36;0;1983.75;1440.34;720.12;1785.33;900.01;1779.93;610.17;744.63;492.25;1307.51;1414.16;1554.82;809.65;742.79;750.49
2230.42;809.48;1735.01;2178.85;445.62;1064.76;2017.17;2950.11;1983.75;0;1052.95;2496.39;2734.60;1669.43;1753.97;1582.16;2253.98;1507.55;1373.81;2099.29;502.61;2171.65;1273.88;1386.08
2391.06;976.02;1204.00;1836.20;744.44;894.29;1326.33;2513.69;1440.34;1052.95;0;2131.20;2859.32;1672.69;756.61;1391.36;2022.76;1138.61;1673.74;1051.39;1020.76;1265.79;1052.76;690.12
1137.67;1688.97;929.97;318.72;2088.42;1450.12;955.13;462.60;720.12;2496.39;2131.20;0;1263.37;957.91;2498.32;916.23;340.55;1034.57;1431.21;2093.69;2012.70;1431.07;1233.48;1445.85
504.64;2026.94;1867.69;1314.30;2469.71;1975.38;2071.75;1449.96;1785.33;2734.60;2859.32;1263.37;0;1187.73;3437.70;1484.53;1053.40;1773.73;1360.80;3183.43;2250.10;2591.53;1807.09;2288.42
725.12;885.32;840.72;696.61;1331.46;788.56;1157.89;1413.37;900.01;1669.43;1672.69;957.91;1187.73;0;2283.19;348.89;641.31;646.04;476.00;2122.15;1166.83;1650.12;623.36;1143.01
3006.93;1710.99;1607.99;2253.26;1497.56;1565.19;1558.52;2792.41;1779.93;1753.97;756.61;2498.32;3437.70;2283.19;0;1957.15;2484.92;1664.04;2374.26;632.59;1777.35;1227.38;1669.22;1149.41
1054.55;773.33;501.97;601.87;1186.37;563.93;838.00;1374.91;610.17;1582.16;1391.36;916.23;1484.53;348.89;1957.15;0;685.14;300.16;698.04;1773.83;1096.54;1311.80;354.42;809.02
831.59;1445.70;876.96;261.29;1869.95;1247.61;1025.90;776.83;744.63;2253.98;2022.76;340.55;1053.40;641.31;2484.92;685.14;0;885.38;1105.76;2157.99;1758.03;1541.83;1033.73;1365.91
1353.90;738.10;280.34;721.08;1076.82;443.26;633.05;1465.61;492.25;1507.55;1138.61;1034.57;1773.73;646.04;1664.04;300.16;885.38;0;922.00;1476.73;1064.43;1052.85;250.71;514.69
856.69;721.55;1181.67;1171.34;1137.38;811.11;1529.69;1882.22;1307.51;1373.81;1673.74;1431.21;1360.80;476.00;2374.26;698.04;1105.76;922.00;0;2339.22;894.06;1974.79;763.26;1316.24
2813.02;1797.75;1319.62;1903.66;1740.39;1556.51;1143.40;2314.19;1414.16;2099.29;1051.39;2093.69;3183.43;2122.15;632.59;1773.83;2157.99;1476.73;2339.22;0;1969.82;688.33;1577.56;1023.41
1745.55;329.46;1318.67;1697.83;296.68;629.63;1635.54;2471.02;1554.82;502.61;1020.76;2012.70;2250.10;1166.83;1777.35;1096.54;1758.03;1064.43;894.06;1969.82;0;1884.91;817.45;1076.99
2276.51;1620.96;810.38;1280.88;1742.25;1316.59;521.68;1626.56;809.65;2171.65;1265.79;1431.07;2591.53;1650.12;1227.38;1311.80;1541.83;1052.85;1974.79;688.33;1884.91;0;1241.90;808.14
1347.43;489.28;523.61;914.81;855.32;216.98;868.87;1680.00;742.79;1273.88;1052.76;1233.48;1807.09;623.36;1669.22;354.42;1033.73;250.71;763.26;1577.56;817.45;1241.90;0;557.43
1862.33;826.66;516.06;1159.85;946.12;545.29;667.80;1823.72;750.49;1386.08;690.12;1445.85;2288.42;1143.01;1149.41;809.02;1365.91;514.69;1316.24;1023.41;1076.99;808.14;557.43;0
25 changes: 25 additions & 0 deletions datasets/european_cities.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
Barcelona Belgrade Berlin Brussels Bucharest Budapest Copenhagen Dublin Hamburg Istanbul Kyiv London Madrid Milan Moscow Munich Paris Prague Rome Saint Petersburg Sofia Stockholm Vienna Warsaw
0 1528.13 1497.61 1062.89 1968.42 1498.79 1757.54 1469.29 1471.78 2230.42 2391.06 1137.67 504.64 725.12 3006.93 1054.55 831.59 1353.90 856.69 2813.02 1745.55 2276.51 1347.43 1862.33
1528.13 0 999.25 1372.59 447.34 316.41 1327.24 2145.39 1229.93 809.48 976.02 1688.97 2026.94 885.32 1710.99 773.33 1445.70 738.10 721.55 1797.75 329.46 1620.96 489.28 826.66
1497.61 999.25 0 651.62 1293.40 689.06 354.03 1315.16 254.51 1735.01 1204.00 929.97 1867.69 840.72 1607.99 501.97 876.96 280.34 1181.67 1319.62 1318.67 810.38 523.61 516.06
1062.89 1372.59 651.62 0 1769.69 1131.52 766.67 773.20 489.76 2178.85 1836.20 318.72 1314.30 696.61 2253.26 601.87 261.29 721.08 1171.34 1903.66 1697.83 1280.88 914.81 1159.85
1968.42 447.34 1293.40 1769.69 0 639.77 1571.54 2534.72 1544.17 445.62 744.44 2088.42 2469.71 1331.46 1497.56 1186.37 1869.95 1076.82 1137.38 1740.39 296.68 1742.25 855.32 946.12
1498.79 316.41 689.06 1131.52 639.77 0 1011.31 1894.95 927.92 1064.76 894.29 1450.12 1975.38 788.56 1565.19 563.93 1247.61 443.26 811.11 1556.51 629.63 1316.59 216.98 545.29
1757.54 1327.24 354.03 766.67 1571.54 1011.31 0 1238.38 287.97 2017.17 1326.33 955.13 2071.75 1157.89 1558.52 838.00 1025.90 633.05 1529.69 1143.40 1635.54 521.68 868.87 667.80
1469.29 2145.39 1315.16 773.20 2534.72 1894.95 1238.38 0 1073.36 2950.11 2513.69 462.60 1449.96 1413.37 2792.41 1374.91 776.83 1465.61 1882.22 2314.19 2471.02 1626.56 1680.00 1823.72
1471.78 1229.93 254.51 489.76 1544.17 927.92 287.97 1073.36 0 1983.75 1440.34 720.12 1785.33 900.01 1779.93 610.17 744.63 492.25 1307.51 1414.16 1554.82 809.65 742.79 750.49
2230.42 809.48 1735.01 2178.85 445.62 1064.76 2017.17 2950.11 1983.75 0 1052.95 2496.39 2734.60 1669.43 1753.97 1582.16 2253.98 1507.55 1373.81 2099.29 502.61 2171.65 1273.88 1386.08
2391.06 976.02 1204.00 1836.20 744.44 894.29 1326.33 2513.69 1440.34 1052.95 0 2131.20 2859.32 1672.69 756.61 1391.36 2022.76 1138.61 1673.74 1051.39 1020.76 1265.79 1052.76 690.12
1137.67 1688.97 929.97 318.72 2088.42 1450.12 955.13 462.60 720.12 2496.39 2131.20 0 1263.37 957.91 2498.32 916.23 340.55 1034.57 1431.21 2093.69 2012.70 1431.07 1233.48 1445.85
504.64 2026.94 1867.69 1314.30 2469.71 1975.38 2071.75 1449.96 1785.33 2734.60 2859.32 1263.37 0 1187.73 3437.70 1484.53 1053.40 1773.73 1360.80 3183.43 2250.10 2591.53 1807.09 2288.42
725.12 885.32 840.72 696.61 1331.46 788.56 1157.89 1413.37 900.01 1669.43 1672.69 957.91 1187.73 0 2283.19 348.89 641.31 646.04 476.00 2122.15 1166.83 1650.12 623.36 1143.01
3006.93 1710.99 1607.99 2253.26 1497.56 1565.19 1558.52 2792.41 1779.93 1753.97 756.61 2498.32 3437.70 2283.19 0 1957.15 2484.92 1664.04 2374.26 632.59 1777.35 1227.38 1669.22 1149.41
1054.55 773.33 501.97 601.87 1186.37 563.93 838.00 1374.91 610.17 1582.16 1391.36 916.23 1484.53 348.89 1957.15 0 685.14 300.16 698.04 1773.83 1096.54 1311.80 354.42 809.02
831.59 1445.70 876.96 261.29 1869.95 1247.61 1025.90 776.83 744.63 2253.98 2022.76 340.55 1053.40 641.31 2484.92 685.14 0 885.38 1105.76 2157.99 1758.03 1541.83 1033.73 1365.91
1353.90 738.10 280.34 721.08 1076.82 443.26 633.05 1465.61 492.25 1507.55 1138.61 1034.57 1773.73 646.04 1664.04 300.16 885.38 0 922.00 1476.73 1064.43 1052.85 250.71 514.69
856.69 721.55 1181.67 1171.34 1137.38 811.11 1529.69 1882.22 1307.51 1373.81 1673.74 1431.21 1360.80 476.00 2374.26 698.04 1105.76 922.00 0 2339.22 894.06 1974.79 763.26 1316.24
2813.02 1797.75 1319.62 1903.66 1740.39 1556.51 1143.40 2314.19 1414.16 2099.29 1051.39 2093.69 3183.43 2122.15 632.59 1773.83 2157.99 1476.73 2339.22 0 1969.82 688.33 1577.56 1023.41
1745.55 329.46 1318.67 1697.83 296.68 629.63 1635.54 2471.02 1554.82 502.61 1020.76 2012.70 2250.10 1166.83 1777.35 1096.54 1758.03 1064.43 894.06 1969.82 0 1884.91 817.45 1076.99
2276.51 1620.96 810.38 1280.88 1742.25 1316.59 521.68 1626.56 809.65 2171.65 1265.79 1431.07 2591.53 1650.12 1227.38 1311.80 1541.83 1052.85 1974.79 688.33 1884.91 0 1241.90 808.14
1347.43 489.28 523.61 914.81 855.32 216.98 868.87 1680.00 742.79 1273.88 1052.76 1233.48 1807.09 623.36 1669.22 354.42 1033.73 250.71 763.26 1577.56 817.45 1241.90 0 557.43
1862.33 826.66 516.06 1159.85 946.12 545.29 667.80 1823.72 750.49 1386.08 690.12 1445.85 2288.42 1143.01 1149.41 809.02 1365.91 514.69 1316.24 1023.41 1076.99 808.14 557.43 0
194 changes: 194 additions & 0 deletions examples/DataframeLab.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "e237e057-796b-4e8a-89f8-076882ea2f9c",
"metadata": {},
"source": [
"# Using `rustic_ml` in a Jupyter notebook"
]
},
{
"cell_type": "markdown",
"id": "733b683a-a558-4596-96e2-6faca1e4c29a",
"metadata": {},
"source": [
"The first step is to include the crate in the notebook. \n",
"To get started see `README.md` on how to setup the notebook environment. \n",
"When it is installed, run `jupyter lab` to start the notebook in the browser.\n",
"Create a new notebook with the Rust option, and set the dependency to: \n",
"```rust\n",
":dep rustic_ml = \"0.x.x\"\n",
"extern crate rustic_ml;\n",
"```\n",
"\n",
"After this, you will be able to use the library's functionality in the following cells.\n",
"\n",
"Since this is an example within the library itself, we import the library using the path:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "f541ad4f-9472-4cf6-a787-78a54389be91",
"metadata": {
"vscode": {
"languageId": "rust"
}
},
"outputs": [],
"source": [
":dep rustic_ml = { path = \"../\" }\n",
"extern crate rustic_ml;"
]
},
{
"cell_type": "markdown",
"id": "67ec5c36-a929-4ee0-92bb-58cdcc7d5a5b",
"metadata": {},
"source": [
"Next, import the `Dataframe` type that we are going to use from the library: "
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3f2c42e6-7cf4-45b5-bdfb-1228b2bce10d",
"metadata": {
"vscode": {
"languageId": "rust"
}
},
"outputs": [],
"source": [
"use rustic_ml::data_utils::dataframe::Dataframe;"
]
},
{
"cell_type": "markdown",
"id": "76a9d147-f8ce-4bd5-9f12-dee698b0a942",
"metadata": {},
"source": [
"Reading a csv file:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5d58a4d-8e12-44b8-9974-6408333cefc4",
"metadata": {},
"outputs": [],
"source": [
"let path = String::from(\"../datasets/european_cities.csv\");\n",
"let dataframe = Dataframe::from_csv(path).unwrap();"
]
},
{
"cell_type": "markdown",
"id": "152858c3-707d-4563-8e69-5f2681797399",
"metadata": {},
"source": [
"Run the following code cell to see the information about the dataframe:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "11070ca7-ffcc-41bd-851e-e67c7fb89d48",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Column Name Type None Some Total Length \n",
"-----------------------------------------------------------------\n",
"Barcelona Float 0 24 24 \n",
"Belgrade Float 0 24 24 \n",
"Berlin Float 0 24 24 \n",
"Brussels Float 0 24 24 \n",
"Bucharest Float 0 24 24 \n",
"Budapest Float 0 24 24 \n",
"Copenhagen Float 0 24 24 \n",
"Dublin Float 0 24 24 \n",
"Hamburg Float 0 24 24 \n",
"Istanbul Float 0 24 24 \n",
"Kyiv Float 0 24 24 \n",
"London Float 0 24 24 \n",
"Madrid Float 0 24 24 \n",
"Milan Float 0 24 24 \n",
"Moscow Float 0 24 24 \n",
"Munich Float 0 24 24 \n",
"Paris Float 0 24 24 \n",
"Prague Float 0 24 24 \n",
"Rome Float 0 24 24 \n",
"Saint Petersburg Float 0 24 24 \n",
"Sofia Float 0 24 24 \n",
"Stockholm Float 0 24 24 \n",
"Vienna Float 0 24 24 \n",
"Warsaw Float 0 24 24 \n"
]
},
{
"data": {
"text/plain": [
"()"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataframe.info()"
]
},
{
"cell_type": "markdown",
"id": "546e0b63-3577-456f-8a1f-83d306f0e7af",
"metadata": {},
"source": [
"To see the memory usage of the dataframe, we call `memory_usage()`:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "63ed09c7-6779-4fa0-a4c4-fb77d421f43a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4608"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataframe.memory_usage()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Rust",
"language": "rust",
"name": "rust"
},
"language_info": {
"codemirror_mode": "rust",
"file_extension": ".rs",
"mimetype": "text/rust",
"name": "Rust",
"pygment_lexer": "rust",
"version": ""
}
},
"nbformat": 4,
"nbformat_minor": 5
}
19 changes: 19 additions & 0 deletions examples/dataframe_read_from_csv.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
use rustic_ml::data_utils::dataframe::Dataframe;

/// Example: load the European cities dataset from a CSV file and inspect it.
///
/// Prints the dataframe's info summary, its memory usage in bytes, and its head.
///
/// # Panics
///
/// Panics if `Dataframe::from_csv` fails for `./datasets/european_cities.csv`.
/// The path is relative to the current working directory, so the example is
/// expected to be run from the repository root.
fn main() {
    let path = String::from("./datasets/european_cities.csv");

    // Use `expect` instead of a bare `unwrap` so that a failure names the file
    // and the most likely cause (wrong working directory).
    let dataframe = Dataframe::from_csv(path).expect(
        "failed to load ./datasets/european_cities.csv (run the example from the repository root)",
    );

    // Print the info
    dataframe.info();

    // We can also get the total amount of bytes used
    let total_bytes_used = dataframe.memory_usage();
    println!(
        "\nMemory usage for the dataframe: {} bytes",
        total_bytes_used
    );

    // Let's print the first records with the head method:
    dataframe.head();
}
Loading
Loading