GGgary666
diff --git a/‎data/data_generation.ipynb
+172 b/‎data/data_generation.ipynb
+172
diff --git a/‎data/data_test.cpp
+63 b/‎data/data_test.cpp
+63
diff --git a/‎data/data_test.ipynb
+241 b/‎data/data_test.ipynb
+241
diff --git a/‎data/test.py
+20 b/‎data/test.py
+20
diff --git a/‎v0/a.out
918 KB b/‎v0/a.out
918 KB
diff --git a/‎v0/data_load_test.cpp
+51 b/‎v0/data_load_test.cpp
+51
diff --git a/‎v0/error.cuh
+18 b/‎v0/error.cuh
+18
@@ -0,0 +1,172 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.datasets import make_classification\n",
+    "import numpy as np\n",
+    "import time\n",
+    "import faiss\n",
+    "\n",
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "((100, 256), (100,))"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# X, y = make_classification(\n",
+    "#     n_samples=100000, n_features=4, n_classes=4, n_clusters_per_class=2, random_state=1024, n_informative=4, n_redundant = 0\n",
+    "# )\n",
+    "features = 256\n",
+    "clusters = 256\n",
+    "X, y = make_classification(\n",
+    "    n_samples=100, n_features=features, n_classes=2, random_state=1024, n_informative=features, n_redundant=0, n_clusters_per_class = 1\n",
+    ")\n",
+    "X.shape, y.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "xy = np.concatenate((X, y.reshape(-1, 1)), axis=1)\n",
+    "# np.savetxt(\"test_clusters_\" + str(clusters) + \".csv\", xy, delimiter=\",\")\n",
+    "np.savetxt(\"test_samples.csv\", xy, delimiter=\",\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 137,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "used 50 iterations (17.7545s) to cluster 100000 items into 256 clusters\n",
+      "KMeans fit time: 17754.658460617065 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "from fast_pytorch_kmeans import KMeans\n",
+    "\n",
+    "kmeans = KMeans(n_clusters=clusters, mode='euclidean', verbose=1, tol = -1, max_iter=50)\n",
+    "x = torch.from_numpy(X).to(\"cuda\")\n",
+    "\n",
+    "start_time = time.time()\n",
+    "labels = kmeans.fit_predict(x)\n",
+    "# Record the end time\n",
+    "end_time = time.time()\n",
+    "\n",
+    "# Compute and print the time taken by the fit call\n",
+    "fit_time = end_time - start_time\n",
+    "print(f\"KMeans fit time: {fit_time * 1000} ms\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 138,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Clustering 100000 points in 1024D to 256 clusters, redo 1 times, 50 iterations\n",
+      "  Preprocessing in 0.04 s\n",
+      "KMeans fit time: 3554.8036098480225 ms: objective=3.40156e+10 imbalance=1.259 nsplit=0       \n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "d = X.shape[1]\n",
+    "clus = faiss.Clustering(d, clusters)\n",
+    "clus.verbose = True\n",
+    "clus.niter = 50\n",
+    "# otherwise the kmeans implementation sub-samples the training set\n",
+    "clus.max_points_per_centroid = 10**9\n",
+    "cfg = faiss.GpuIndexFlatConfig()\n",
+    "cfg.useFloat16 = False\n",
+    "cfg.device = 0\n",
+    "index = faiss.GpuIndexFlatL2(faiss.StandardGpuResources(), d, cfg)\n",
+    "start_time = time.time()\n",
+    "clus.train(X, index)\n",
+    "# Record the end time\n",
+    "end_time = time.time()\n",
+    "# Compute and print the time taken by the fit call\n",
+    "fit_time = end_time - start_time\n",
+    "print(f\"KMeans fit time: {fit_time * 1000} ms\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 134,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ncentroids = clusters\n",
+    "# niter = 50\n",
+    "# verbose = True\n",
+    "# d = X.shape[1]\n",
+    "# kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose, nredo = 1, gpu=True)\n",
+    "# kmeans.max_points_per_centroid = 10**9\n",
+    "# start_time = time.time()\n",
+    "# kmeans.train(X)\n",
+    "# # Record the end time\n",
+    "# end_time = time.time()\n",
+    "# # Compute and print the time taken by the fit call\n",
+    "# fit_time = end_time - start_time\n",
+    "# print(f\"KMeans fit time: {fit_time * 1000} ms\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "dl",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -0,0 +1,63 @@
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include <unordered_map>
+
+// Forward declaration (defined after main): takes per-sample cluster ids and per-sample true labels, returns per-sample mapped labels.
+std::vector<int> mapClusterToLabel(const std::vector<int>& clusterLabels, const std::vector<int>& trueLabels);
+
+int main() {
+    // Demo input: the cluster id assigned to each sample and that sample's true label.
+    const std::vector<int> predicted{1, 0, 0, 2, 1, 0, 2, 2, 1, 2};
+    const std::vector<int> expected{0, 0, 1, 1, 0, 1, 2, 2, 0, 2};
+
+    // Replace every cluster id with the true label chosen for that cluster.
+    const std::vector<int> relabeled = mapClusterToLabel(predicted, expected);
+
+    // Print a header line, then the mapped labels separated by single spaces.
+    std::cout << "映射后的标签：" << std::endl;
+    for (std::size_t pos = 0; pos < relabeled.size(); ++pos) {
+        std::cout << relabeled[pos] << " ";
+    }
+    std::cout << std::endl;
+
+    return 0;
+}
+
+// Maps every sample's cluster id to the true label that occurs most often
+// among the samples of that cluster (majority vote).
+//
+// Parameters:
+//   clusterLabels - cluster id of each sample; any int value is accepted,
+//                   the set of clusters is derived from the data.
+//   trueLabels    - ground-truth label of each sample, index-aligned with
+//                   clusterLabels.
+//
+// Returns a vector of clusterLabels.size() entries holding the majority label
+// of each sample's cluster. Ties are resolved in favour of the smallest label
+// so the result is deterministic.
+//
+// If trueLabels is shorter than clusterLabels, only the samples that have a
+// true label take part in the vote; a cluster without any labelled sample
+// keeps its original cluster id in the output.
+std::vector<int> mapClusterToLabel(const std::vector<int>& clusterLabels, const std::vector<int>& trueLabels) {
+    // Never index trueLabels past its end.
+    const size_t labelled = std::min(clusterLabels.size(), trueLabels.size());
+
+    // votes[cluster][label] = number of samples of `cluster` whose true label is `label`.
+    // Built in one pass instead of re-scanning all samples once per cluster.
+    std::unordered_map<int, std::unordered_map<int, int>> votes;
+    for (size_t i = 0; i < labelled; ++i) {
+        ++votes[clusterLabels[i]][trueLabels[i]];
+    }
+
+    // Pick the winning label of every cluster.
+    std::unordered_map<int, int> winner;
+    for (const auto& clusterEntry : votes) {
+        int bestLabel = -1;
+        int bestCount = 0;
+        for (const auto& labelEntry : clusterEntry.second) {
+            const bool moreVotes = labelEntry.second > bestCount;
+            // Hash-map iteration order is unspecified, so break ties explicitly.
+            const bool tieButSmaller = labelEntry.second == bestCount && labelEntry.first < bestLabel;
+            if (moreVotes || tieButSmaller) {
+                bestLabel = labelEntry.first;
+                bestCount = labelEntry.second;
+            }
+        }
+        winner[clusterEntry.first] = bestLabel;
+    }
+
+    // Rewrite each sample's cluster id with its cluster's winning label.
+    std::vector<int> mappedLabels = clusterLabels;
+    for (size_t i = 0; i < mappedLabels.size(); ++i) {
+        const auto found = winner.find(mappedLabels[i]);
+        if (found != winner.end()) {
+            mappedLabels[i] = found->second;
+        }
+    }
+
+    return mappedLabels;
+}
@@ -0,0 +1,20 @@
+import numpy as np
+from sklearn.cluster import KMeans
+import matplotlib.pyplot as plt
+
+# 随机生成一些数据
+np.random.seed(0)  # 确保每次运行代码时生成的数据相同
+X = np.random.rand(100, 2)  # 生成一个100x2的矩阵，表示100个二维数据点
+
+# 创建KMeans实例，设置聚类数为3，并强制迭代50次
+kmeans = KMeans(n_clusters=4, n_init=1, max_iter=50, init='random', verbose=1, tol=-1)
+
+# 训练模型
+kmeans.fit(X)
+
+# 获取聚类中心
+centroids = kmeans.cluster_centers_
+
+# 获取每个数据点的标签
+labels = kmeans.labels_
+
@@ -0,0 +1,51 @@
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <sstream>
+#include <stdexcept>
+
+using namespace std;
+
+// Reads a comma-separated file of samples into caller-provided buffers.
+//
+// Each non-empty line is one sample: the first n_features fields are parsed
+// as floating point and stored row-major at data[n * n_features + column];
+// every further field is parsed as an int and stored in label[n] (so the last
+// extra field wins). A line with fewer fields leaves the remaining slots of
+// that row untouched.
+//
+// Parameters:
+//   data        - output buffer for the features, n_features floats per sample.
+//   label       - output buffer for the labels, one int per sample.
+//   n_features  - number of feature columns per line.
+//   n           - in/out sample counter: rows are written starting at index n
+//                 and n is incremented once per row read.
+//   max_samples - capacity of the buffers in samples; reading stops once n
+//                 reaches it. A negative value (the default) means "no limit",
+//                 in which case the caller must guarantee the buffers are
+//                 large enough for the whole file.
+//   path        - file to read; defaults to the originally hard-coded location.
+//
+// Prints a message and calls exit(1) if the file cannot be opened.
+// std::stod / std::stoi exceptions on malformed fields propagate to the caller.
+void readCoordinate(float *data, int *label, const int n_features, int &n,
+                    const int max_samples = -1,
+                    const std::string &path = "/home/gg/Desktop/kmeans/data/test.csv") {
+    // The stream closes itself when it goes out of scope.
+    std::ifstream ifs(path, std::ios::in);
+    if (ifs.fail()) {
+        std::cout << "No such file or directory: " << path << std::endl;
+        exit(1);
+    }
+    std::string line;
+    while (std::getline(ifs, line)) {
+        if (line.empty()) continue;
+        // Stop before writing past the end of the caller's buffers.
+        if (max_samples >= 0 && n >= max_samples) break;
+        std::stringstream sstream(line);
+        std::string field;
+        for (int column = 0; std::getline(sstream, field, ','); ++column) {
+            if (column < n_features) data[n * n_features + column] = static_cast<float>(std::stod(field));
+            else label[n] = std::stoi(field);
+        }
+        ++n;
+    }
+}
+
+int main() {
+    const int n_features = 4;  // feature columns expected in every CSV row
+    const int n_nums = 100;    // number of rows the buffers below can hold
+    float data[n_features * n_nums];  // feature buffer, n_features values per row
+    int label[n_nums];                // one label slot per row
+    int rows_read = 0;                // rows actually loaded by readCoordinate
+
+    // Load the CSV file. This call passes no capacity, so the file must not
+    // contain more than n_nums rows.
+    readCoordinate(data, label, n_features, rows_read);
+
+    // Echo every loaded row: its features followed by its label.
+    for (int row = 0; row < rows_read; ++row) {
+        const float *features = data + row * n_features;
+        for (int col = 0; col < n_features; ++col) {
+            std::cout << features[col] << ",";
+        }
+        std::cout << "Label: " << label[row] << std::endl;
+    }
+
+    return 0;
+}
@@ -0,0 +1,18 @@
+#pragma once
+#include <stdio.h>
+
+// CHECK(call): evaluates `call` once and stores its cudaError_t result. If the
+// result is not cudaSuccess, prints the source file, line, numeric error code
+// and the cudaGetErrorString() text with printf, then terminates via exit(1).
+// The do { ... } while (0) wrapper lets `CHECK(x);` be used as one statement.
+#define CHECK(call)                                   \
+do                                                    \
+{                                                     \
+    const cudaError_t error_code = call;              \
+    if (error_code == cudaSuccess)                    \
+        break; /* success: leave the wrapper */       \
+    printf("CUDA Error:\n"                            \
+           "    File:       %s\n"                     \
+           "    Line:       %d\n"                     \
+           "    Error code: %d\n"                     \
+           "    Error text: %s\n",                    \
+           __FILE__, __LINE__, error_code,            \
+           cudaGetErrorString(error_code));           \
+    exit(1);                                          \
+} while (0)