Skip to content

Commit 362357b

Browse files
author
gg
committed
first commit
1 parent 8d86d1c commit 362357b

16 files changed

+3365
-0
lines changed

data/data_generation.ipynb

+172
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"from sklearn.datasets import make_classification\n",
10+
"import numpy as np\n",
11+
"import time\n",
12+
"import faiss\n",
13+
"\n",
14+
"import torch"
15+
]
16+
},
17+
{
18+
"cell_type": "code",
19+
"execution_count": 2,
20+
"metadata": {},
21+
"outputs": [
22+
{
23+
"data": {
24+
"text/plain": [
25+
"((100, 256), (100,))"
26+
]
27+
},
28+
"execution_count": 2,
29+
"metadata": {},
30+
"output_type": "execute_result"
31+
}
32+
],
33+
"source": [
34+
"# X, y = make_classification(\n",
35+
"# n_samples=100000, n_features=4, n_classes=4, n_clusters_per_class=2, random_state=1024, n_informative=4, n_redundant = 0\n",
36+
"# )\n",
37+
"features = 256\n",
38+
"clusters = 256\n",
39+
"X, y = make_classification(\n",
40+
" n_samples=100, n_features=features, n_classes=2, random_state=1024, n_informative=features, n_redundant=0, n_clusters_per_class = 1\n",
41+
")\n",
42+
"X.shape, y.shape"
43+
]
44+
},
45+
{
46+
"cell_type": "code",
47+
"execution_count": 3,
48+
"metadata": {},
49+
"outputs": [],
50+
"source": [
51+
"xy = np.concatenate((X, y.reshape(-1, 1)), axis=1)\n",
52+
"# np.savetxt(\"test_clusters_\" + str(clusters) + \".csv\", xy, delimiter=\",\")\n",
53+
"np.savetxt(\"test_samples.csv\", xy, delimiter=\",\")"
54+
]
55+
},
56+
{
57+
"cell_type": "code",
58+
"execution_count": 137,
59+
"metadata": {},
60+
"outputs": [
61+
{
62+
"name": "stdout",
63+
"output_type": "stream",
64+
"text": [
65+
"used 50 iterations (17.7545s) to cluster 100000 items into 256 clusters\n",
66+
"KMeans fit time: 17754.658460617065 ms\n"
67+
]
68+
}
69+
],
70+
"source": [
71+
"from fast_pytorch_kmeans import KMeans\n",
72+
"\n",
73+
"kmeans = KMeans(n_clusters=clusters, mode='euclidean', verbose=1, tol = -1, max_iter=50)\n",
74+
"x = torch.from_numpy(X).to(\"cuda\")\n",
75+
"\n",
76+
"start_time = time.time()\n",
77+
"labels = kmeans.fit_predict(x)\n",
78+
"# 记录结束时间\n",
79+
"end_time = time.time()\n",
80+
"\n",
81+
"# 计算并输出fit方法所需的时间\n",
82+
"fit_time = end_time - start_time\n",
83+
"print(f\"KMeans fit time: {fit_time * 1000} ms\")"
84+
]
85+
},
86+
{
87+
"cell_type": "code",
88+
"execution_count": 138,
89+
"metadata": {},
90+
"outputs": [
91+
{
92+
"name": "stdout",
93+
"output_type": "stream",
94+
"text": [
95+
"Clustering 100000 points in 1024D to 256 clusters, redo 1 times, 50 iterations\n",
96+
" Preprocessing in 0.04 s\n",
97+
"KMeans fit time: 3554.8036098480225 ms: objective=3.40156e+10 imbalance=1.259 nsplit=0 \n",
98+
"\n"
99+
]
100+
}
101+
],
102+
"source": [
103+
"d = X.shape[1]\n",
104+
"clus = faiss.Clustering(d, clusters)\n",
105+
"clus.verbose = True\n",
106+
"clus.niter = 50\n",
107+
"# otherwise the kmeans implementation sub-samples the training set\n",
108+
"clus.max_points_per_centroid = 10**9\n",
109+
"cfg = faiss.GpuIndexFlatConfig()\n",
110+
"cfg.useFloat16 = False\n",
111+
"cfg.device = 0\n",
112+
"index = faiss.GpuIndexFlatL2(faiss.StandardGpuResources(), d, cfg)\n",
113+
"start_time = time.time()\n",
114+
"clus.train(X, index)\n",
115+
"# 记录结束时间\n",
116+
"end_time = time.time()\n",
117+
"#计算并输出fit方法所需的时间\n",
118+
"fit_time = end_time - start_time\n",
119+
"print(f\"KMeans fit time: {fit_time * 1000} ms\")"
120+
]
121+
},
122+
{
123+
"cell_type": "code",
124+
"execution_count": 134,
125+
"metadata": {},
126+
"outputs": [],
127+
"source": [
128+
"# ncentroids = clusters\n",
129+
"# niter = 50\n",
130+
"# verbose = True\n",
131+
"# d = X.shape[1]\n",
132+
"# kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose, nredo = 1, gpu=True)\n",
133+
"# kmeans.max_points_per_centroid = 10**9\n",
134+
"# start_time = time.time()\n",
135+
"# kmeans.train(X)\n",
136+
"# # 记录结束时间\n",
137+
"# end_time = time.time()\n",
138+
"# # 计算并输出fit方法所需的时间\n",
139+
"# fit_time = end_time - start_time\n",
140+
"# print(f\"KMeans fit time: {fit_time * 1000} ms\")"
141+
]
142+
},
143+
{
144+
"cell_type": "code",
145+
"execution_count": null,
146+
"metadata": {},
147+
"outputs": [],
148+
"source": []
149+
}
150+
],
151+
"metadata": {
152+
"kernelspec": {
153+
"display_name": "dl",
154+
"language": "python",
155+
"name": "python3"
156+
},
157+
"language_info": {
158+
"codemirror_mode": {
159+
"name": "ipython",
160+
"version": 3
161+
},
162+
"file_extension": ".py",
163+
"mimetype": "text/x-python",
164+
"name": "python",
165+
"nbconvert_exporter": "python",
166+
"pygments_lexer": "ipython3",
167+
"version": "3.10.13"
168+
}
169+
},
170+
"nbformat": 4,
171+
"nbformat_minor": 2
172+
}

data/data_test.cpp

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#include <iostream>
2+
#include <vector>
3+
#include <algorithm>
4+
#include <unordered_map>
5+
6+
// 函数声明
7+
std::vector<int> mapClusterToLabel(const std::vector<int>& clusterLabels, const std::vector<int>& trueLabels);
8+
9+
int main() {
10+
// 示例数据:聚类结果和真实标签
11+
std::vector<int> clusterLabels = {1, 0, 0, 2, 1, 0, 2, 2, 1, 2};
12+
std::vector<int> trueLabels = {0, 0, 1, 1, 0, 1, 2, 2, 0, 2};
13+
14+
// 调用函数进行映射
15+
std::vector<int> mappedLabels = mapClusterToLabel(clusterLabels, trueLabels);
16+
17+
// 打印映射后的标签
18+
std::cout << "映射后的标签:" << std::endl;
19+
for (int label : mappedLabels) {
20+
std::cout << label << " ";
21+
}
22+
std::cout << std::endl;
23+
24+
return 0;
25+
}
26+
27+
// Relabels every point with the majority ground-truth label of its cluster.
//
// Parameters:
//   clusterLabels - cluster id assigned to each point (any int values; the ids
//                   do not have to be 0..k-1).
//   trueLabels    - ground-truth label of each point, index-aligned with
//                   clusterLabels.
//
// Returns a vector with one entry per element of clusterLabels: the true label
// that occurs most often among the points sharing that element's cluster id.
// Ties are broken deterministically in favour of the smallest label value.
//
// If trueLabels is shorter than clusterLabels, only the index-aligned prefix
// votes; a cluster with no voting member is mapped to -1. Extra entries in
// trueLabels are ignored.
std::vector<int> mapClusterToLabel(const std::vector<int>& clusterLabels, const std::vector<int>& trueLabels) {
    // votes[clusterId][trueLabel] = number of points of that cluster carrying that label.
    std::unordered_map<int, std::unordered_map<int, int>> votes;
    const std::size_t paired = std::min(clusterLabels.size(), trueLabels.size());
    for (std::size_t i = 0; i < paired; ++i) {
        ++votes[clusterLabels[i]][trueLabels[i]];
    }

    // Pick the winning true label of each cluster that received votes.
    std::unordered_map<int, int> majority;
    majority.reserve(votes.size());
    for (const auto& clusterEntry : votes) {
        int bestLabel = -1;
        int bestCount = 0;
        for (const auto& labelEntry : clusterEntry.second) {
            const bool moreFrequent = labelEntry.second > bestCount;
            // Hash-map iteration order is unspecified, so ties need an explicit rule.
            const bool tieButSmaller = labelEntry.second == bestCount && labelEntry.first < bestLabel;
            if (moreFrequent || tieButSmaller) {
                bestLabel = labelEntry.first;
                bestCount = labelEntry.second;
            }
        }
        majority[clusterEntry.first] = bestLabel;
    }

    // Translate each point's cluster id into the label chosen for that cluster.
    std::vector<int> mappedLabels;
    mappedLabels.reserve(clusterLabels.size());
    for (const int cluster : clusterLabels) {
        const auto found = majority.find(cluster);
        mappedLabels.push_back(found != majority.end() ? found->second : -1);
    }

    return mappedLabels;
}

data/data_test.ipynb

+241
Large diffs are not rendered by default.

data/test.py

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import numpy as np
2+
from sklearn.cluster import KMeans
3+
import matplotlib.pyplot as plt
4+
5+
# 随机生成一些数据
6+
np.random.seed(0) # 确保每次运行代码时生成的数据相同
7+
X = np.random.rand(100, 2) # 生成一个100x2的矩阵,表示100个二维数据点
8+
9+
# Create the KMeans instance: 4 clusters, at most 50 iterations (tol=-1 is meant to keep it from stopping early)
10+
kmeans = KMeans(n_clusters=4, n_init=1, max_iter=50, init='random', verbose=1, tol=-1)
11+
12+
# 训练模型
13+
kmeans.fit(X)
14+
15+
# 获取聚类中心
16+
centroids = kmeans.cluster_centers_
17+
18+
# 获取每个数据点的标签
19+
labels = kmeans.labels_
20+

v0/a.out

918 KB
Binary file not shown.

v0/data_load_test.cpp

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
#include <iostream>
2+
#include <fstream>
3+
#include <string>
4+
#include <sstream>
5+
#include <stdexcept>
6+
7+
using namespace std;
8+
9+
// Loads a comma-separated sample file into caller-provided flat buffers.
//
// Every non-empty line is one sample: the first n_features fields are stored
// row-major in data[n * n_features + column]; any further field is written to
// label[n] (so the last field on the line ends up as the label).
//
// Parameters:
//   data        - output buffer for feature values; must have room for
//                 n_features floats per sample read.
//   label       - output buffer for one int label per sample read.
//   n_features  - number of leading fields per line treated as features.
//   n           - in/out: index of the first sample slot to fill on entry,
//                 number of filled slots on return (pass 0 for empty buffers).
//   path        - CSV file to read; defaults to the location used originally.
//   max_samples - when >= 0, reading stops once n reaches this value so the
//                 buffers cannot be overrun; the default (-1) means no limit
//                 and the caller is responsible for buffer sizes.
//
// Prints a message and terminates the process with exit(1) if the file cannot
// be opened. std::stod exceptions (std::invalid_argument, std::out_of_range)
// propagate for malformed fields.
void readCoordinate(float *data, int *label, const int n_features, int &n,
                    const std::string &path = "/home/gg/Desktop/kmeans/data/test.csv",
                    const int max_samples = -1) {
    std::ifstream ifs(path);
    if (!ifs) {
        std::cout << "No such file or directory: " << path << std::endl;
        exit(1);
    }

    std::string line;
    while (std::getline(ifs, line)) {
        // Tolerate CRLF line endings so a bare "\r" line counts as empty.
        if (!line.empty() && line.back() == '\r') line.pop_back();
        if (line.empty()) continue;
        if (max_samples >= 0 && n >= max_samples) break;

        std::stringstream sstream(line);
        std::string s_fea;
        int m = 0;
        while (std::getline(sstream, s_fea, ',')) {
            if (m < n_features) {
                data[n * n_features + m] = static_cast<float>(std::stod(s_fea));
            } else {
                // Labels may be written as floats (e.g. "1.0e+01"); std::stoi
                // would stop at the '.' and yield 1, so parse as double first.
                label[n] = static_cast<int>(std::stod(s_fea));
            }
            ++m;
        }
        ++n;
    }
    // ifs is closed by its destructor.
}
31+
32+
// Smoke test for readCoordinate: loads the CSV into fixed-size stack buffers
// and echoes every sample as "f0,f1,f2,f3,Label: <label>".
int main() {
    constexpr int kFeatureCount = 4;      // feature columns per sample
    constexpr int kSampleCapacity = 100;  // sample slots available in the buffers
    float samples[kFeatureCount * kSampleCapacity];  // row-major feature storage
    int labels[kSampleCapacity];                     // one label per sample slot
    int loaded = 0;                                  // samples actually read

    // The loader is called without any sample limit, so the input file must
    // not hold more than kSampleCapacity rows.
    readCoordinate(samples, labels, kFeatureCount, loaded);

    // Echo each loaded sample: its features, then its label.
    for (int row = 0; row < loaded; ++row) {
        const float *features = samples + row * kFeatureCount;
        for (int col = 0; col < kFeatureCount; ++col) {
            std::cout << features[col] << ",";
        }
        std::cout << "Label: " << labels[row] << std::endl;
    }

    return 0;
}

v0/error.cuh

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#pragma once
2+
#include <stdio.h>
3+
4+
// CHECK(call): evaluates `call` once, storing its result as a cudaError_t.
// If the result is not cudaSuccess, prints the file, line, numeric error code
// and cudaGetErrorString() text to stdout, then terminates with exit(1).
// The do { ... } while (0) wrapper makes the macro behave as one statement.
#define CHECK(call)                                                   \
    do                                                                \
    {                                                                 \
        const cudaError_t error_code = call;                          \
        if (error_code == cudaSuccess)                                \
            break; /* success: nothing to report */                   \
        printf("CUDA Error:\n");                                      \
        printf(" File: %s\n", __FILE__);                              \
        printf(" Line: %d\n", __LINE__);                              \
        printf(" Error code: %d\n", error_code);                      \
        printf(" Error text: %s\n", cudaGetErrorString(error_code));  \
        exit(1);                                                      \
    } while (0)

0 commit comments

Comments
 (0)