300 | 300 | "        return padding_im\n",
301 | 301 | "\n",
302 | 302 | "\n",
303 |     | - "def batch_text_box(dt_boxes, frame):\n",
    | 303 | + "def prep_for_rec(dt_boxes, frame):\n",
304 | 304 | "    \"\"\"\n",
305 |     | - "    Batch the detected bounding boxes for text recognition\n",
    | 305 | + "    Preprocessing of the detected bounding boxes for text recognition\n",
306 | 306 | "\n",
307 | 307 | "    Parameters:\n",
308 | 308 | "        dt_boxes: detected bounding boxes from text detection \n",
309 | 309 | "        frame: original input frame \n",
310 | 310 | "    \"\"\"\n",
311 |     | - "    \n",
312 | 311 | "    ori_im = frame.copy()\n",
313 | 312 | "    img_crop_list = [] \n",
314 | 313 | "    for bno in range(len(dt_boxes)):\n",
|
321 | 320 | "    width_list = []\n",
322 | 321 | "    for img in img_crop_list:\n",
323 | 322 | "        width_list.append(img.shape[1] / float(img.shape[0]))\n",
    | 323 | + "    \n",
324 | 324 | "    # Sorting can speed up the recognition process\n",
325 | 325 | "    indices = np.argsort(np.array(width_list))\n",
326 |     | - "    rec_res = [['', 0.0]] * img_num\n",
327 |     | - "    batch_num = 6\n",
328 |     | - "\n",
329 |     | - "    # For each detected text box batch, run inference for text recognition\n",
330 |     | - "    for beg_img_no in range(0, img_num, batch_num):\n",
331 |     | - "        end_img_no = min(img_num, beg_img_no + batch_num)\n",
332 |     | - "\n",
333 |     | - "        norm_img_batch = []\n",
334 |     | - "        max_wh_ratio = 0\n",
335 |     | - "        for ino in range(beg_img_no, end_img_no):\n",
336 |     | - "            h, w = img_crop_list[indices[ino]].shape[0:2]\n",
337 |     | - "            wh_ratio = w * 1.0 / h\n",
338 |     | - "            max_wh_ratio = max(max_wh_ratio, wh_ratio)\n",
339 |     | - "        for ino in range(beg_img_no, end_img_no):\n",
340 |     | - "            norm_img = resize_norm_img(img_crop_list[indices[ino]], max_wh_ratio)\n",
341 |     | - "            norm_img = norm_img[np.newaxis, :]\n",
342 |     | - "            norm_img_batch.append(norm_img)\n",
    | 326 | + "    return img_crop_list, img_num, indices\n",
    | 327 | + "\n",
    | 328 | + "\n",
    | 329 | + "def batch_text_box(img_crop_list, img_num, indices, beg_img_no, batch_num):\n",
    | 330 | + "    \"\"\"\n",
    | 331 | + "    Batch for text recognition\n",
    | 332 | + "\n",
    | 333 | + "    Parameters:\n",
    | 334 | + "        img_crop_list: processed detected bounding box images \n",
    | 335 | + "        img_num: number of bounding boxes from text detection\n",
    | 336 | + "        indices: sorting for bounding boxes to speed up text recognition\n",
    | 337 | + "        beg_img_no: the beginning number of bounding boxes for each batch of text recognition inference\n",
    | 338 | + "        batch_num: number of images for each batch\n",
    | 339 | + "    \"\"\"\n",
    | 340 | + "    norm_img_batch = []\n",
    | 341 | + "    max_wh_ratio = 0\n",
    | 342 | + "    end_img_no = min(img_num, beg_img_no + batch_num)\n",
    | 343 | + "    for ino in range(beg_img_no, end_img_no):\n",
    | 344 | + "        h, w = img_crop_list[indices[ino]].shape[0:2]\n",
    | 345 | + "        wh_ratio = w * 1.0 / h\n",
    | 346 | + "        max_wh_ratio = max(max_wh_ratio, wh_ratio)\n",
    | 347 | + "    for ino in range(beg_img_no, end_img_no):\n",
    | 348 | + "        norm_img = resize_norm_img(img_crop_list[indices[ino]], max_wh_ratio)\n",
    | 349 | + "        norm_img = norm_img[np.newaxis, :]\n",
    | 350 | + "        norm_img_batch.append(norm_img)\n",
343 | 351 | "\n",
344 | 352 | "    norm_img_batch = np.concatenate(norm_img_batch)\n",
345 | 353 | "    norm_img_batch = norm_img_batch.copy()\n",
346 |     | - "    return norm_img_batch, rec_res, indices, beg_img_no"
    | 354 | + "    return norm_img_batch"
347 | 355 | ]
348 | 356 | },
349 | 357 | {
|
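For reference, a minimal sketch of how the two refactored helpers compose (this mirrors the main-loop hunk further down; it assumes the notebook's rec_compiled_model, rec_output_layer and processing module are in scope):

    # Sketch only: prep once per frame, then recognize in fixed-size batches.
    batch_num = 6
    img_crop_list, img_num, indices = prep_for_rec(dt_boxes, frame)
    rec_res = [['', 0.0]] * img_num  # one (text, confidence) slot per detected box

    for beg_img_no in range(0, img_num, batch_num):
        # Normalize one batch of cropped boxes to a shared max width/height ratio
        norm_img_batch = batch_text_box(img_crop_list, img_num, indices,
                                        beg_img_no, batch_num)
        # Run recognition, then scatter results back into the original box order
        rec_results = rec_compiled_model([norm_img_batch])[rec_output_layer]
        postprocess_op = processing.build_post_process(processing.postprocess_params)
        for rno, res in enumerate(postprocess_op(rec_results)):
            rec_res[indices[beg_img_no + rno]] = res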
461 | 469 | "                frame = cv2.resize(src=frame, dsize=None, fx=scale, fy=scale,\n",
462 | 470 | "                                   interpolation=cv2.INTER_AREA)\n",
463 | 471 | "            # preprocess image for text detection\n",
464 |     | - "            test_image = image_preprocess(frame,640)\n",
    | 472 | + "            test_image = image_preprocess(frame, 640)\n",
465 | 473 | "            \n",
466 | 474 | "            # measure processing time for text detection\n",
467 | 475 | "            start_time = time.time()\n",
|
480 | 488 | "\n",
481 | 489 | "            # Preprocess detection results for recognition\n",
482 | 490 | "            dt_boxes = processing.sorted_boxes(dt_boxes) \n",
483 |     | - "            if dt_boxes:\n",
    | 491 | + "            batch_num = 6\n",
    | 492 | + "            img_crop_list, img_num, indices = prep_for_rec(dt_boxes, frame)\n",
    | 493 | + "            \n",
    | 494 | + "            # For storing recognition results, include two parts:\n",
    | 495 | + "            # txts are the recognized text results, scores are the recognition confidence level \n",
    | 496 | + "            rec_res = [['', 0.0]] * img_num\n",
    | 497 | + "            txts = [] \n",
    | 498 | + "            scores = []\n",
    | 499 | + "\n",
    | 500 | + "            for beg_img_no in range(0, img_num, batch_num):\n",
    | 501 | + "\n",
484 | 502 | "                # Recognition starts from here\n",
485 |     | - "                norm_img_batch, rec_res, indices, beg_img_no = batch_text_box(dt_boxes, frame)\n",
    | 503 | + "                norm_img_batch = batch_text_box(\n",
    | 504 | + "                    img_crop_list, img_num, indices, beg_img_no, batch_num)\n",
486 | 505 | "\n",
487 | 506 | "                # Run inference for text recognition \n",
488 | 507 | "                rec_results = rec_compiled_model([norm_img_batch])[rec_output_layer]\n",

491 | 510 | "                postprocess_op = processing.build_post_process(processing.postprocess_params)\n",
492 | 511 | "                rec_result = postprocess_op(rec_results)\n",
493 | 512 | "                for rno in range(len(rec_result)):\n",
494 |     | - "                    rec_res[indices[beg_img_no + rno]] = rec_result[rno]\n",
495 |     | - "\n",
496 |     | - "                # Text recognition results, rec_res, include two parts:\n",
497 |     | - "                # txts are the recognized text results, scores are the recognition confidence level \n",
    | 513 | + "                    rec_res[indices[beg_img_no + rno]] = rec_result[rno] \n",
498 | 514 | "                if rec_res:\n",
499 |     | - "                    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))\n",
500 |     | - "                    boxes = dt_boxes\n",
501 | 515 | "                    txts = [rec_res[i][0] for i in range(len(rec_res))] \n",
502 |     | - "                    scores = [rec_res[i][1] for i in range(len(rec_res))] \n",
503 |     | - "\n",
504 |     | - "                    # draw text recognition results beside the image\n",
505 |     | - "                    draw_img = processing.draw_ocr_box_txt(\n",
506 |     | - "                        image,\n",
507 |     | - "                        boxes,\n",
508 |     | - "                        txts,\n",
509 |     | - "                        scores,\n",
510 |     | - "                        drop_score=0.5)\n",
511 |     | - "                else:\n",
512 |     | - "                    draw_img = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)\n",
    | 516 | + "                    scores = [rec_res[i][1] for i in range(len(rec_res))]\n",
    | 517 | + "            \n",
    | 518 | + "            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))\n",
    | 519 | + "            boxes = dt_boxes\n",
    | 520 | + "            # draw text recognition results beside the image\n",
    | 521 | + "            draw_img = processing.draw_ocr_box_txt(\n",
    | 522 | + "                image,\n",
    | 523 | + "                boxes,\n",
    | 524 | + "                txts,\n",
    | 525 | + "                scores,\n",
    | 526 | + "                drop_score=0.5)\n",
513 | 527 | "\n",
514 | 528 | "            # Visualize PaddleOCR results\n",
515 | 529 | "            f_height, f_width = draw_img.shape[:2]\n",
516 | 530 | "            fps = 1000 / processing_time_det\n",
517 | 531 | "            cv2.putText(img=draw_img, text=f\"Inference time: {processing_time_det:.1f}ms ({fps:.1f} FPS)\", \n",
518 |     | - "                        org=(20, 40),fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=f_height / 1000,\n",
    | 532 | + "                        org=(20, 40),fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=f_width / 1000,\n",
519 | 533 | "                        color=(0, 0, 255), thickness=1, lineType=cv2.LINE_AA)\n",
520 | 534 | "            \n",
521 | 535 | "            # use this workaround if there is flickering\n",
|
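A note on the final putText change (f_height to f_width for fontScale): the inference-time banner runs horizontally, so tying its scale to frame width presumably keeps it proportional to the space it must fit into. A standalone, hypothetical illustration of the effect (the frame and text below are invented for the example):

    import cv2
    import numpy as np

    frame = np.zeros((480, 1280, 3), dtype=np.uint8)  # a wide but short frame
    f_height, f_width = frame.shape[:2]
    cv2.putText(img=frame, text="Inference time: 12.3ms (81.3 FPS)",
                org=(20, 40), fontFace=cv2.FONT_HERSHEY_COMPLEX,
                fontScale=f_width / 1000,  # new behavior: tracks width, not height
                color=(0, 0, 255), thickness=1, lineType=cv2.LINE_AA)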