In particular, I trained a YOLOv11 segmentation model to detect the position of Rubik's cubes.
First of all, the data has to be prepared in the YOLOv11 dataset format, and a data.yaml file has to be created:
train: ../train/images
val: ../valid/images
test: ../test/images
nc: 1
names: ['Cube']
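Each image also needs a matching label file in the YOLO segmentation format: one line per object, consisting of the class index followed by the normalized polygon coordinates of the mask outline. An illustrative labels/*.txt line for a cube (coordinates are made up) would look like:

0 0.412 0.305 0.587 0.298 0.603 0.486 0.421 0.497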
Then, install ultralytics and train the model:

!pip install ultralytics

from ultralytics import YOLO

# Start from a pretrained segmentation checkpoint;
# 'best.pt' is what training produces, not what it starts from.
model = YOLO('yolo11n-seg.pt')
model.train(data='./data/data.yaml', epochs=100, batch=64, device='cuda')
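By default, training writes checkpoints under runs/segment/train/weights/, so the best weights can then be loaded for inference (the test image name here is hypothetical):

model = YOLO('runs/segment/train/weights/best.pt')  # adjust if you changed project/name
results = model('frame.jpg')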
After running the segmentation model on a frame, I do some checks to see whether the detected object is actually a Rubik's cube:
import cv2
import numpy as np
from ultralytics import YOLO


def is_patch_cube(patch, epsilon=0.2):
    # A cube face seen roughly head-on should be close to square, so
    # reject patches whose aspect ratio deviates too much from 1.
    h, w = patch.shape[:2]
    ratio, inverse = h / w, w / h
    if ratio < 1 - epsilon or ratio > 1 + epsilon:
        return False
    if inverse < 1 - epsilon or inverse > 1 + epsilon:
        return False
    return True


def is_patch_mostly_colored(patch, threshold=0.85):
    # The patch is the masked object on a black background; require that
    # most of its values are non-zero, i.e. the mask fills the bounding box.
    h, w, c = patch.shape
    num_pixels = h * w * c
    num_colored_pixels = np.sum(patch > 0)
    return num_colored_pixels / num_pixels > threshold


def check_homogenous_color(patch, color, threshold):
    # Check that at least `threshold` of the patch falls inside the HSV
    # range of the predicted color. `color_ranges` maps color names to
    # (lower, upper) HSV bounds (see the sketch after this block).
    if color not in color_ranges:
        return False
    h, w = patch.shape[:2]
    patch = cv2.cvtColor(patch, cv2.COLOR_BGR2HSV)
    lower, upper = color_ranges[color]
    thres = cv2.inRange(patch, np.array(lower), np.array(upper))
    return (np.count_nonzero(thres) / (h * w)) > threshold
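check_homogenous_color relies on a color_ranges dict that I haven't shown. A minimal sketch, assuming OpenCV's HSV convention (H in 0-179, S/V in 0-255), with illustrative bounds that you would need to calibrate for your camera and lighting:

# Hypothetical HSV ranges per sticker color; values are illustrative only.
color_ranges = {
    'white':  ((0,   0, 160), (179,  60, 255)),
    'yellow': ((20, 100, 100), (35, 255, 255)),
    'green':  ((40, 100, 100), (85, 255, 255)),
    'blue':   ((90, 100, 100), (130, 255, 255)),
    'orange': ((8,  100, 100), (20, 255, 255)),
    'red':    ((0,  100, 100), (8, 255, 255)),  # red also wraps around H ~ 170-179
}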
def find_segments(seg_model: YOLO, image):
    return seg_model(image, verbose=False)


def get_face(results, n, homogenity_thres=0.6):
    for r in results:
        original_img = r.orig_img
        img_h, img_w, c = original_img.shape
        if r.masks is not None:
            for mask_tensor in r.masks.data:
                mask_np = (mask_tensor.cpu().numpy() * 255).astype(np.uint8)
                # YOLO masks can come back at model resolution; resize to the frame.
                if mask_np.shape[0] != img_h or mask_np.shape[1] != img_w:
                    mask_np = cv2.resize(mask_np, (img_w, img_h), interpolation=cv2.INTER_NEAREST)
                mask_np, box = simplify_mask(mask_np, eps=0.005)  # helper shown below
                obj = cv2.bitwise_and(original_img, original_img, mask=mask_np)
                # Crop the object with its bounding box, clamped to the frame.
                x1, y1, w, h = box
                x2, y2 = x1 + w, y1 + h
                x1, y1 = max(0, x1), max(0, y1)
                x2, y2 = min(img_w, x2), min(img_h, y2)
                cropped_object = obj[y1:y2, x1:x2]
                if not is_patch_cube(cropped_object):
                    continue
                if not is_patch_mostly_colored(cropped_object):
                    continue
                colors, homogenity = find_colors(cropped_object, n)
                # Require that enough of the n x n cells have a homogeneous color.
                if sum([sum(row) for row in homogenity]) < homogenity_thres * len(homogenity) * len(homogenity[0]):
                    continue
                return colors, cropped_object, mask_np, box
    return None, None, None, None
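The simplify_mask helper used above isn't shown. A minimal sketch, assuming it keeps the largest contour, simplifies its polygon with cv2.approxPolyDP, and returns the cleaned mask together with its bounding box (x, y, w, h):

def simplify_mask(mask, eps=0.005):
    # Hypothetical reconstruction: keep the largest contour, simplify it,
    # and redraw it as a filled mask; also return its bounding box.
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return mask, (0, 0, mask.shape[1], mask.shape[0])
    largest = max(contours, key=cv2.contourArea)
    approx = cv2.approxPolyDP(largest, eps * cv2.arcLength(largest, True), True)
    simplified = np.zeros_like(mask)
    cv2.fillPoly(simplified, [approx], 255)
    return simplified, cv2.boundingRect(approx)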
def find_colors(patch, n):
    # Split the face into an n x n grid and classify the color of each cell.
    h, w, c = patch.shape
    hh, ww = h // n, w // n
    colors = [['' for _ in range(n)] for __ in range(n)]
    homogenity = [[False for _ in range(n)] for __ in range(n)]
    for i in range(n):
        for j in range(n):
            pp = patch[i*hh:(i+1)*hh, j*ww:(j+1)*ww]
            colors[i][j] = find_best_matching_color_legacy(
                get_median_color(pp), tpe='bgr')  # whatever function you want to detect colors
            homogenity[i][j] = check_homogenous_color(pp, colors[i][j], threshold=0.5)
    return colors, homogenity
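find_best_matching_color_legacy and get_median_color are left up to you, as the comment says. For reference, a hypothetical get_median_color that takes the per-channel median over the non-black (unmasked) pixels could look like:

def get_median_color(patch):
    # Hypothetical helper: per-channel median over non-background pixels.
    pixels = patch.reshape(-1, 3)
    pixels = pixels[np.any(pixels > 0, axis=1)]  # ignore the black background
    if len(pixels) == 0:
        return np.zeros(3, dtype=np.uint8)
    return np.median(pixels, axis=0).astype(np.uint8)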
We can use this as follows (inside a class where self.current_frame is the latest camera frame and self.n is the cube dimension):
results = find_segments(model, self.current_frame)
face, obj, mask, box = get_face(results, n=self.n, homogenity_thres=0.6)
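For a quick sanity check you can draw the returned box on the frame (a hypothetical snippet; box is (x, y, w, h)):

if face is not None:
    x, y, w, h = box
    cv2.rectangle(self.current_frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
    print(face)  # n x n grid of detected sticker colors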
Thanks to @ChristophRackwitz for recommending the use of semantic segmentation models.