|
#include <memory>

#include <openvino/openvino.hpp>

#include <utils/common.hpp>
#include <utils/nms.hpp>
#include <utils/slog.hpp>

#include "models/internal_model_data.h"
|
@@ -504,3 +505,171 @@ ModelYolo::Region::Region(size_t classes,
|
504 | 505 | num = anchors.size() / 2;
|
505 | 506 | }
|
506 | 507 | }
|
| 508 | + |
// Identifier stored in the model's "model_info" rt_info ("model_type") so a
// serialized model can be matched back to this wrapper class.
std::string YOLOv5::ModelType = "YOLOv5";
| 510 | + |
// Validates the model's single input/output and, if not done yet, embeds the
// resize/normalization preprocessing directly into the model graph.
// Throws std::runtime_error when the input rank, output precision, output
// rank, or label count does not match the expected YOLOv5 layout.
void YOLOv5::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
    // model->input() throws if the model does not have exactly one input.
    const ov::Output<ov::Node>& input = model->input();
    // Use the max shape so dynamic dimensions resolve to their upper bounds.
    const ov::Shape& in_shape = input.get_partial_shape().get_max_shape();
    if (in_shape.size() != 4) {
        throw std::runtime_error("YOLO: the rank of the input must be 4");
    }
    inputNames.push_back(input.get_any_name());
    const ov::Layout& inputLayout = getInputLayout(input);
    if (!embedded_processing) {
        // Bake resize (letterbox by default, see init_from_config), padding,
        // channel reversal and scaling into the model itself so callers can
        // feed raw images.
        // NOTE(review): the target shape is passed as {width, height} — confirm
        // embedProcessing expects this order rather than {height, width}.
        model = ImageModel::embedProcessing(model,
            inputNames[0],
            inputLayout,
            resizeMode,
            interpolationMode,
            ov::Shape{
                in_shape[ov::layout::width_idx(inputLayout)],
                in_shape[ov::layout::height_idx(inputLayout)]
            },
            pad_value,
            reverse_input_channels,
            {},
            scale_values);

        netInputWidth = in_shape[ov::layout::width_idx(inputLayout)];
        netInputHeight = in_shape[ov::layout::height_idx(inputLayout)];

        embedded_processing = true;
    }

    // model->output() throws if the model does not have exactly one output.
    const ov::Output<const ov::Node>& output = model->output();
    if (ov::element::Type_t::f32 != output.get_element_type()) {
        throw std::runtime_error("YOLO: the output must be of precision f32");
    }
    const ov::Shape& out_shape = output.get_partial_shape().get_max_shape();
    if (3 != out_shape.size()) {
        throw std::runtime_error("YOLO: the output must be of rank 3");
    }
    // Expected output layout is [1, 4 + num_classes, num_proposals]; the first
    // 4 rows are the box coordinates, hence labels.size() + 4 == out_shape[1].
    if (!labels.empty() && labels.size() + 4 != out_shape[1]) {
        throw std::runtime_error("YOLO: number of labels must be smaller than out_shape[1] by 4");
    }
}
| 552 | + |
| 553 | +void YOLOv5::updateModelInfo() { |
| 554 | + DetectionModelExt::updateModelInfo(); |
| 555 | + model->set_rt_info(YOLOv5::ModelType, "model_info", "model_type"); |
| 556 | + model->set_rt_info(agnostic_nms, "model_info", "agnostic_nms"); |
| 557 | + model->set_rt_info(iou_threshold, "model_info", "iou_threshold"); |
| 558 | +} |
| 559 | + |
| 560 | +void YOLOv5::init_from_config(const ov::AnyMap& top_priority, const ov::AnyMap& mid_priority) { |
| 561 | + pad_value = get_from_any_maps("pad_value", top_priority, mid_priority, 114); |
| 562 | + if (top_priority.find("resize_type") == top_priority.end() && mid_priority.find("resize_type") == mid_priority.end()) { |
| 563 | + interpolationMode = cv::INTER_LINEAR; |
| 564 | + resizeMode = RESIZE_KEEP_ASPECT_LETTERBOX; |
| 565 | + } |
| 566 | + reverse_input_channels = get_from_any_maps("reverse_input_channels", top_priority, mid_priority, true); |
| 567 | + scale_values = get_from_any_maps("scale_values", top_priority, mid_priority, std::vector<float>{255.0f}); |
| 568 | + confidence_threshold = get_from_any_maps("confidence_threshold", top_priority, mid_priority, 0.25f); |
| 569 | + agnostic_nms = get_from_any_maps("agnostic_nms", top_priority, mid_priority, agnostic_nms); |
| 570 | + iou_threshold = get_from_any_maps("iou_threshold", top_priority, mid_priority, 0.7f); |
| 571 | +} |
| 572 | + |
// Constructs a YOLOv5 wrapper around an in-memory ov::Model. The explicit
// configuration map takes priority over values embedded in the model's
// "model_info" rt_info.
// NOTE(review): get_rt_info<ov::AnyMap>("model_info") appears to require the
// key to exist — confirm models without "model_info" are rejected upstream.
YOLOv5::YOLOv5(std::shared_ptr<ov::Model>& model, const ov::AnyMap& configuration)
    : DetectionModelExt(model, configuration) {
    init_from_config(configuration, model->get_rt_info<ov::AnyMap>("model_info"));
}
| 577 | + |
// Constructs a YOLOv5 wrapper around an already-compiled model exposed via an
// InferenceAdapter; configuration comes solely from the adapter's model config
// (the secondary map is empty, so defaults fill any gaps).
YOLOv5::YOLOv5(std::shared_ptr<InferenceAdapter>& adapter)
    : DetectionModelExt(adapter) {
    init_from_config(adapter->getModelConfig(), ov::AnyMap{});
}
| 582 | + |
| 583 | +std::unique_ptr<ResultBase> YOLOv5::postprocess(InferenceResult& infResult) { |
| 584 | + if (1 != infResult.outputsData.size()) { |
| 585 | + throw std::runtime_error("YOLO: expect 1 output"); |
| 586 | + } |
| 587 | + const ov::Tensor& detectionsTensor = infResult.getFirstOutputTensor(); |
| 588 | + const ov::Shape& out_shape = detectionsTensor.get_shape(); |
| 589 | + if (3 != out_shape.size()) { |
| 590 | + throw std::runtime_error("YOLO: the output must be of rank 3"); |
| 591 | + } |
| 592 | + if (1 != out_shape[0]) { |
| 593 | + throw std::runtime_error("YOLO: the first dim of the output must be 1"); |
| 594 | + } |
| 595 | + size_t num_proposals = out_shape[2]; |
| 596 | + std::vector<Anchor> boxes; |
| 597 | + std::vector<float> confidences; |
| 598 | + std::vector<size_t> labelIDs; |
| 599 | + const float* const detections = detectionsTensor.data<float>(); |
| 600 | + for (size_t i = 0; i < num_proposals; ++i) { |
| 601 | + float confidence = 0.0f; |
| 602 | + size_t max_id = 0; |
| 603 | + constexpr size_t LABELS_START = 4; |
| 604 | + for (size_t j = LABELS_START; j < out_shape[1]; ++j) { |
| 605 | + if (detections[j * num_proposals + i] > confidence) { |
| 606 | + confidence = detections[j * num_proposals + i]; |
| 607 | + max_id = j; |
| 608 | + } |
| 609 | + } |
| 610 | + if (confidence > confidence_threshold) { |
| 611 | + boxes.push_back(Anchor{ |
| 612 | + detections[0 * num_proposals + i] - detections[2 * num_proposals + i] / 2.0f, |
| 613 | + detections[1 * num_proposals + i] - detections[3 * num_proposals + i] / 2.0f, |
| 614 | + detections[0 * num_proposals + i] + detections[2 * num_proposals + i] / 2.0f, |
| 615 | + detections[1 * num_proposals + i] + detections[3 * num_proposals + i] / 2.0f, |
| 616 | + }); |
| 617 | + confidences.push_back(confidence); |
| 618 | + labelIDs.push_back(max_id - LABELS_START); |
| 619 | + } |
| 620 | + } |
| 621 | + constexpr bool includeBoundaries = false; |
| 622 | + constexpr size_t keep_top_k = 30000; |
| 623 | + std::vector<size_t> keep; |
| 624 | + if (agnostic_nms) { |
| 625 | + keep = nms(boxes, confidences, iou_threshold, includeBoundaries, keep_top_k); |
| 626 | + } else { |
| 627 | + std::vector<AnchorLabeled> boxes_with_class; |
| 628 | + boxes_with_class.reserve(boxes.size()); |
| 629 | + for (size_t i = 0; i < boxes.size(); ++i) { |
| 630 | + boxes_with_class.emplace_back(boxes[i], int(labelIDs[i])); |
| 631 | + } |
| 632 | + keep = multiclass_nms(boxes_with_class, confidences, iou_threshold, includeBoundaries, keep_top_k); |
| 633 | + } |
| 634 | + DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData); |
| 635 | + auto base = std::unique_ptr<ResultBase>(result); |
| 636 | + const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>(); |
| 637 | + float floatInputImgWidth = float(internalData.inputImgWidth), |
| 638 | + floatInputImgHeight = float(internalData.inputImgHeight); |
| 639 | + float invertedScaleX = floatInputImgWidth / netInputWidth, |
| 640 | + invertedScaleY = floatInputImgHeight / netInputHeight; |
| 641 | + int padLeft = 0, padTop = 0; |
| 642 | + if (RESIZE_KEEP_ASPECT == resizeMode || RESIZE_KEEP_ASPECT_LETTERBOX == resizeMode) { |
| 643 | + invertedScaleX = invertedScaleY = std::max(invertedScaleX, invertedScaleY); |
| 644 | + if (RESIZE_KEEP_ASPECT_LETTERBOX == resizeMode) { |
| 645 | + padLeft = (netInputWidth - int(std::round(floatInputImgWidth / invertedScaleX))) / 2; |
| 646 | + padTop = (netInputHeight - int(std::round(floatInputImgHeight / invertedScaleY))) / 2; |
| 647 | + } |
| 648 | + } |
| 649 | + for (size_t idx : keep) { |
| 650 | + DetectedObject desc; |
| 651 | + desc.x = clamp( |
| 652 | + round((boxes[idx].left - padLeft) * invertedScaleX), |
| 653 | + 0.f, |
| 654 | + floatInputImgWidth); |
| 655 | + desc.y = clamp( |
| 656 | + round((boxes[idx].top - padTop) * invertedScaleY), |
| 657 | + 0.f, |
| 658 | + floatInputImgHeight); |
| 659 | + desc.width = clamp( |
| 660 | + round((boxes[idx].right - padLeft) * invertedScaleX), |
| 661 | + 0.f, |
| 662 | + floatInputImgWidth) - desc.x; |
| 663 | + desc.height = clamp( |
| 664 | + round((boxes[idx].bottom - padTop) * invertedScaleY), |
| 665 | + 0.f, |
| 666 | + floatInputImgHeight) - desc.y; |
| 667 | + desc.confidence = confidences[idx]; |
| 668 | + desc.labelID = static_cast<size_t>(labelIDs[idx]); |
| 669 | + desc.label = getLabelName(desc.labelID); |
| 670 | + result->objects.push_back(desc); |
| 671 | + } |
| 672 | + return base; |
| 673 | +} |
| 674 | + |
// Identifier stored in the model's "model_info" rt_info ("model_type") so a
// serialized model can be matched back to the YOLOv8 wrapper class.
std::string YOLOv8::ModelType = "YOLOv8";
0 commit comments