openvinotoolkit · sovrasov · Mar 7, 2025 · Mar 4, 2025 · Mar 5, 2025 · Mar 5, 2025
diff --git a/src/cpp/models/src/keypoint_detection.cpp b/src/cpp/models/src/keypoint_detection.cpp
@@ -53,6 +53,7 @@ void colArgMax(const cv::Mat& src,
 DetectedKeypoints decode_simcc(const cv::Mat& simcc_x,
                                const cv::Mat& simcc_y,
                                const cv::Point2f& extra_scale = cv::Point2f(1.f, 1.f),
+                               const cv::Point2i& extra_offset = cv::Point2i(0, 0),  // removed before scaling
                                bool apply_softmax = false,
                                float simcc_split_ratio = 2.0f) {
     cv::Mat x_locs, max_val_x;
@@ -64,8 +65,9 @@ DetectedKeypoints decode_simcc(const cv::Mat& simcc_x,
     std::vector<cv::Point2f> keypoints(x_locs.rows);
     cv::Mat scores = cv::Mat::zeros(x_locs.rows, 1, CV_32F);
     for (int i = 0; i < x_locs.rows; i++) {
-        keypoints[i] =
-            cv::Point2f(x_locs.at<int>(i) * extra_scale.x, y_locs.at<int>(i) * extra_scale.y) / simcc_split_ratio;
+        // Convert SimCC bins to network-input pixels first: extra_offset (padding) is in pixels, not bins.
+        keypoints[i] = cv::Point2f((x_locs.at<int>(i) / simcc_split_ratio - extra_offset.x) * extra_scale.x,
+                                   (y_locs.at<int>(i) / simcc_split_ratio - extra_offset.y) * extra_scale.y);
         scores.at<float>(i) = std::min(max_val_x.at<float>(i), max_val_y.at<float>(i));
 
         if (scores.at<float>(i) <= 0.f) {
@@ -220,8 +222,22 @@ std::unique_ptr<ResultBase> KeypointDetectionModel::postprocess(InferenceResult&
     float inverted_scale_x = static_cast<float>(image_data.inputImgWidth) / netInputWidth,
           inverted_scale_y = static_cast<float>(image_data.inputImgHeight) / netInputHeight;
 
+    int pad_left = 0, pad_top = 0;  // padding in network-input pixels; stays 0 unless letterboxed
+    if (RESIZE_KEEP_ASPECT == resizeMode || RESIZE_KEEP_ASPECT_LETTERBOX == resizeMode) {
+        inverted_scale_x = inverted_scale_y = std::max(inverted_scale_x, inverted_scale_y);  // one scale, both axes
+        if (RESIZE_KEEP_ASPECT_LETTERBOX == resizeMode) {  // NOTE(review): assumes centered padding - confirm
+            pad_left = (netInputWidth -
+                        static_cast<int>(std::round(static_cast<float>(image_data.inputImgWidth) / inverted_scale_x))) /
+                       2;  // half of the network width not covered by the resized image
+            pad_top = (netInputHeight -
+                       static_cast<int>(std::round(static_cast<float>(image_data.inputImgHeight) / inverted_scale_y))) /
+                      2;  // half of the network height not covered by the resized image
+        }
+    }
+
     result->poses.emplace_back(
-        decode_simcc(pred_x_mat, pred_y_mat, {inverted_scale_x, inverted_scale_y}, apply_softmax));
+        decode_simcc(pred_x_mat, pred_y_mat, {inverted_scale_x, inverted_scale_y}, {pad_left, pad_top}, apply_softmax));
+
     return std::unique_ptr<ResultBase>(result);
 }
 

diff --git a/src/python/model_api/models/keypoint_detection.py b/src/python/model_api/models/keypoint_detection.py
@@ -55,7 +55,19 @@ def postprocess(
         orig_h, orig_w = meta["original_shape"][:2]
         kp_scale_h = orig_h / self.h
         kp_scale_w = orig_w / self.w
-        batch_keypoints = batch_keypoints.squeeze() * np.array([kp_scale_w, kp_scale_h])
+
+        batch_keypoints = batch_keypoints.squeeze()
+
+        if self.resize_type in ["fit_to_window", "fit_to_window_letterbox"]:  # both modes use one scale per axis
+            inverted_scale = max(kp_scale_h, kp_scale_w)  # larger of the two original/network size ratios
+            kp_scale_h = kp_scale_w = inverted_scale
+            if self.resize_type == "fit_to_window_letterbox":  # NOTE(review): assumes centered padding - confirm
+                pad_left = (self.w - round(orig_w / inverted_scale)) // 2  # half of the uncovered network width
+                pad_top = (self.h - round(orig_h / inverted_scale)) // 2  # half of the uncovered network height
+                batch_keypoints -= np.array([pad_left, pad_top])  # remove padding before rescaling (in place)
+
+        batch_keypoints *= np.array([kp_scale_w, kp_scale_h])  # NOTE(review): in-place; needs float dtype - confirm
+
         return DetectedKeypoints(batch_keypoints, batch_scores.squeeze())
 
     @classmethod