多行人计数与跟踪-技术圈

先来看看效果：在视频中我们可以看到，行人都被黄色的框框住了，框的左上角用红色字体显示当前行人的id，而整幅画面的左上角显示的是累计识别到的行人总数以及当前帧识别到的行人数量。

概述

SUMMER.TIME

看到这个效果，是否感觉有点熟悉，这不就是yolo+deepsort吗？其实不然，deepsort的算法较为复杂，我这里使用了我自己的想法，看起来效果还可以，以下是deepsort算法以及本文中的算法的思路：

deepsort:

目标检测模型：用来检测图片中行人的位置，常用yolo,ssd,faster-rcnn等算法。
卡尔曼滤波：得到由前面帧box产生的状态预测和协方差预测。
IOU计算：求跟踪器所有目标状态预测与本帧检测的box的IOU。
reID算法：用来提取外观信息的深度模型，最后输出128D向量。
匈牙利算法：得到IOU最大的唯一匹配（数据关联部分），再去掉匹配值小于iou_threshold的匹配对。

本文中的算法：

目标检测模型：用来检测图片中行人的位置，常用yolo,ssd,faster-rcnn等算法。
记录第一帧中所有行人的特征信息以及位置信息
IOU计算：计算相邻帧的所有目标的IOU(物理距离)
reID计算：计算相邻帧的所有目标的特征(特征距离)
距离匹配：融合物理距离以及特征距离，匹配小于某个阈值的id

编码

SUMMER.TIME

接下来，我们进行代码的编写，目标检测部分与前面推文中的一致，这里不再赘述，我们直接进入正题

# 导入依赖库

from utils.datasets import *from utils.utils import *import osimport torchfrom models.create_model import Create_Model
# Expose only GPU 0 to CUDA, then print whether CUDA is available.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
print(torch.cuda.is_available())

# 编写提取特征以及判断阈值的函数

# Feature (embedding) distance
def person_distance(person_encodings, person_unknow):
    """Return the Euclidean distance from each known embedding to a query.

    Args:
        person_encodings: Sequence (list or array) of known embeddings.
        person_unknow: Query embedding; it must broadcast against each
            known embedding.

    Returns:
        ``np.empty((0))`` when ``person_encodings`` is empty; otherwise the
        L2 distances, reduced over the last axis.
    """
    if len(person_encodings) == 0:
        return np.empty((0))
    # Coerce explicitly so plain Python lists work for both arguments
    # (``list - list`` would otherwise raise TypeError).
    known = np.asarray(person_encodings)
    query = np.asarray(person_unknow)
    return np.sqrt(np.sum(np.square(known - query), axis=-1))
# Threshold check
def com_person(person_list, person, tolerance=1):
    """Compare one embedding against every known embedding.

    Args:
        person_list: Known embeddings, forwarded to ``person_distance``.
        person: Embedding to compare.
        tolerance: Largest distance that still counts as a match.

    Returns:
        A ``(distances, matches)`` tuple: the raw distances from
        ``person_distance`` and a list holding, per known embedding, the
        result of ``distance <= tolerance``.
    """
    distances = person_distance(person_list, person)
    within_tolerance = distances <= tolerance
    matches = [flag for flag in within_tolerance]
    return distances, matches

# 编写距离组合矩阵

代码较长，无非就是组合上文中提到的物理距离（IOU）以及特征距离（reID）。不过这里当IOU距离（即 1-IOU）较大时，我们直接赋予一个很大的值，方便后续进行过滤匹配。

def _box_corners(boxes, box_format):
    """Return ``(mins, maxs)`` corner arrays for a list of boxes."""
    boxes = np.asarray(boxes, dtype=np.float32)
    if boxes.size == 0:
        # An empty list has no second axis; give it one so slicing works.
        boxes = boxes.reshape(0, 4)
    if box_format == 'xyxy':
        return boxes[..., :2], boxes[..., 2:4]
    if box_format == 'cxcywh':
        half = boxes[..., 2:4] / 2.
        return boxes[..., :2] - half, boxes[..., :2] + half
    raise ValueError("box_format must be 'xyxy' or 'cxcywh', got %r" % (box_format,))


# Combined distance matrix
def get_iou(boxes1, boxes2, arr_frame, box_format='xyxy'):
    """Build the combined (1 - IoU) + feature-distance matrix.

    Args:
        boxes1: N boxes of the current frame.
        boxes2: M boxes of the known identities.
        arr_frame: Feature distances with N * M elements (for example
            shape ``(N, M)`` or ``(N, M, 1)``).
        box_format: ``'xyxy'`` (default) reads each box as
            ``[x_min, y_min, x_max, y_max]``, which is what the tracking
            loop passes in. ``'cxcywh'`` reads ``[cx, cy, w, h]``.

    Returns:
        An ``(N, M)`` array. Each entry is ``1 - IoU`` plus the feature
        distance; pairs whose ``1 - IoU`` exceeds 0.7 get a large sentinel
        instead so that they are easy to filter out.

    Raises:
        ValueError: If ``box_format`` is not one of the supported values.
    """
    b1_mins, b1_maxs = _box_corners(boxes1, box_format)
    b2_mins, b2_maxs = _box_corners(boxes2, box_format)

    # Broadcast to (N, 1, 2) against (1, M, 2) to compare every pair.
    b1_mins, b1_maxs = b1_mins[:, None, :], b1_maxs[:, None, :]
    b2_mins, b2_maxs = b2_mins[None, :, :], b2_maxs[None, :, :]

    # Intersection area
    inter_wh = np.maximum(
        np.minimum(b1_maxs, b2_maxs) - np.maximum(b1_mins, b2_mins), 0.)
    inter_area = inter_wh[..., 0] * inter_wh[..., 1]

    # Union area
    b1_wh = np.maximum(b1_maxs - b1_mins, 0.)
    b2_wh = np.maximum(b2_maxs - b2_mins, 0.)
    b1_area = b1_wh[..., 0] * b1_wh[..., 1]
    b2_area = b2_wh[..., 0] * b2_wh[..., 1]
    union = b1_area + b2_area - inter_area
    # Degenerate (zero-area) pairs would divide by zero and yield NaN;
    # clamping makes them count as "no overlap" instead.
    union = np.maximum(union, np.finfo(np.float32).tiny)

    # A larger IoU means a closer match; use 1 - IoU so that, like the
    # feature distance, smaller is better.
    iou = 1 - inter_area / union
    # When 1 - IoU is too large, store a huge value so the pair is filtered.
    iou[iou > 0.7] = 10086 + 1e-5

    # Align the feature-distance matrix with the IoU matrix and add them.
    arr_frame = np.asarray(arr_frame).reshape(iou.shape)
    return iou + arr_frame

# 主函数编写

大量代码警告，这里是完成所有逻辑的程序，为了方便直接写到一起了，代码中也有注释，相信你可以读懂的，具体逻辑如下：

加载目标检测模型
加载行人重识别模型
初始化参数
通过while True 读取视频流中的每一帧图片
第一帧：数据标准化，进行目标检测获取行人目标，将获取到的行人目标进行特征提取。并将特征与坐标记录下来。
第二帧以后：数据标准化，进行目标检测获取行人目标，将获取到的行人目标进行特征提取，并将特征与坐标与上一帧的特征与坐标进行距离匹配（物理距离+特征距离）
第二帧以及以后帧获取到距离匹配结果后：获取与目标距离最为相似的3个备选目标，并判断每一个备选目标是否已经被标记，如果已被标记，则顺位到下一个备选目标。
如果行人匹配成功，则更新该行人的特征与坐标。
如果前面的步骤都匹配失败，则说明这是一个新的行人目标，直接记录坐标与特征。
将匹配结果显示到图像中。

def _embed_person(frame, box, pred_model):
    """Crop ``box`` from ``frame`` and return its re-ID model output, or None for an empty crop."""
    x_min, y_min, x_max, y_max = box
    roi = frame[y_min:y_max, x_min:x_max]
    if roi.size == 0:
        # cv2.resize raises on an empty image; skip degenerate detections.
        return None
    roi = cv2.resize(roi, (90, 215))
    batch = np.expand_dims(np.asarray(roi).astype(np.float64) / 255, 0)
    return pred_model.predict(batch)


def _pick_candidate(distances, taken, max_distance=1.4, top_k=3):
    """Return the closest free identity index among the ``top_k`` nearest, or None."""
    # argsort (unlike argpartition with kth=3) works for fewer than four
    # known identities and returns the candidates in ascending order.
    for candidate in np.argsort(distances)[:top_k]:
        if distances[candidate] >= max_distance:
            # Sorted ascending: every later candidate is even further away.
            break
        if int(candidate) not in taken:
            return int(candidate)
    return None


def _match_frame(im0, frame_xyxy, frame_emb, known_emb, known_xyxy):
    """Match this frame's persons to known identities, drawing and updating the matches."""
    # Feature distance of every current person to every known identity.
    feature_dist = np.array(
        [com_person(known_emb, emb, tolerance=0.7)[0] for emb in frame_emb])
    # Combined distance matrix (box-overlap distance + feature distance).
    combined = get_iou(frame_xyxy, known_xyxy, feature_dist)
    # Identities already assigned in this frame, so no ID is used twice.
    taken = set()
    for det_index, distances in enumerate(combined):
        # 1.4 = 0.7 feature distance + 0.7 overlap distance.
        person_index = _pick_candidate(distances, taken)
        if person_index is None:
            # No acceptable free identity: register a new person.
            known_emb.append(frame_emb[det_index])
            known_xyxy.append(frame_xyxy[det_index])
            continue
        # Matched: draw the box and ID, then refresh the stored state.
        x1, y1, x2, y2 = frame_xyxy[det_index]
        person_id = 'ID-%s' % person_index
        cv2.rectangle(im0, (x1, y1), (x2, y2), (0, 255, 255), 3)
        cv2.putText(im0, person_id, (x1, y1 - 10),
                    cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255), 2)
        known_emb[person_index] = frame_emb[det_index]
        known_xyxy[person_index] = frame_xyxy[det_index]
        taken.add(person_index)


def run():
    """Detect, re-identify and track persons in a video stream.

    Reads frames from ``Cam_num``, detects persons with the model loaded
    from ``model_path``, embeds each person crop with the re-ID model and
    matches the crops to known identities by combined distance. The
    annotated frames are shown in a window and written to ``output.mp4``.
    The loop ends at the end of the stream or when ``q`` is pressed.

    Relies on the module-level names ``model_path``, ``Cam_num``,
    ``image_size``, ``conf_thres``, ``iou_thres`` and ``half``.
    """
    # Load the object-detection model.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    device = torch_utils.select_device('cpu')
    model = torch.load(model_path, map_location=device)['model']
    model.to(device).eval()
    names = model.names if hasattr(model, 'names') else model.modules.names

    # Load the re-ID model.
    input_size = (215, 90, 3)
    model_, pred_model = Create_Model(inpt=input_size, num_classes=1812)
    # Raw string: '\e' is not a valid escape sequence. Same runtime value.
    model_.load_weights(r'weights\ep039-loss0.066.h5')

    video_capture = cv2.VideoCapture(Cam_num)
    # Output video mirrors the codec, frame rate and size of the input.
    video_FourCC = int(video_capture.get(cv2.CAP_PROP_FOURCC))
    video_fps = video_capture.get(cv2.CAP_PROP_FPS)
    video_size = (int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH)),
                  int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    out = cv2.VideoWriter('output.mp4', video_FourCC, video_fps, video_size)
    cv2.namedWindow('image', cv2.WINDOW_NORMAL)

    # Stored embedding and box for every identity seen so far.
    unknow_person_emb = []
    xyxy_all = []

    try:
        while True:
            ret, im0 = video_capture.read()
            if not ret or im0 is None:
                # End of stream or read failure.
                break
            # Keep an unannotated copy for cropping person regions.
            iimage = im0.copy()
            img = letterbox(im0, new_shape=image_size)[0]
            img = img[:, :, ::-1].transpose(2, 0, 1)
            img = np.ascontiguousarray(img)
            img = torch.from_numpy(img).to(device)
            img = img.half() if half else img.float()  # uint8 to fp16/32
            img /= 255.0  # 0 - 255 to 0.0 - 1.0
            if img.ndimension() == 3:
                img = img.unsqueeze(0)

            # Inference
            pred = model(img, augment=False)[0]
            if half:
                pred = pred.float()

            # Apply NMS
            pred = non_max_suppression(pred, conf_thres, iou_thres,
                                       fast=True, classes=None, agnostic=False)

            this_frame_xyxy = []
            this_frame_emb = []
            for det in pred:
                if det is None or not len(det):
                    continue
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()
                for *xyxy, conf, cls in det:
                    if names[int(cls)] != 'person':
                        continue
                    box = [int(xyxy[0]), int(xyxy[1]), int(xyxy[2]), int(xyxy[3])]
                    embedding = _embed_person(iimage, box, pred_model)
                    if embedding is None:
                        continue
                    this_frame_xyxy.append(box)
                    this_frame_emb.append(embedding)

            if this_frame_emb:
                if not unknow_person_emb:
                    # Nothing to match against yet (e.g. the first frame
                    # with persons): just record features and boxes.
                    unknow_person_emb.extend(this_frame_emb)
                    xyxy_all.extend(this_frame_xyxy)
                else:
                    _match_frame(im0, this_frame_xyxy, this_frame_emb,
                                 unknow_person_emb, xyxy_all)

            log = 'all person:%s this frame person:%s ' % (len(unknow_person_emb), len(this_frame_emb))
            cv2.putText(im0, log, (20, 20), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255), 2)
            cv2.imshow('image', im0)
            out.write(im0)
            if cv2.waitKey(1) == ord('q'):
                break
    finally:
        # Always release, so output.mp4 is finalized even after an error.
        video_capture.release()
        out.release()
        cv2.destroyAllWindows()

# Main函数编写

if __name__ == '__main__':
    # Run configuration, read as module-level names by run().
    # Raw strings: '\y' and '\h' are invalid escape sequences in a normal
    # string literal. The runtime values are unchanged.
    model_path = r'weights\yolov5m.pt'
    hand_model_path = r'weights\hand_pose.h5'
    Cam_num = r'test_video\Running - 294.mp4'
    image_size = 416
    conf_thres = 0.4
    iou_thres = 0.4
    device = 'cpu'
    half = False
    # Inference only: no gradients are needed.
    with torch.no_grad():
        run()

运行

SUMMER.TIME

运行程序，就可以得到文章开头的效果啦：

以上就是本推文的全部内容啦，行人跟踪与检测的内容就到此告一段落了。不过通过这个算法，我们还可以做一些更有意思的程序，如以图搜图、声纹识别、语音唤醒词识别等。喜欢的同学可以关注一波噢。