"""python 分别读取train和valid的图片和xml信息,创建用于训练和测试的json文件 """ defcreate_data_lists(voc07_path, voc12_path, output_folder): """ Create lists of images, the bounding boxes and labels of the objects in these images, and save these to file. :param voc07_path: path to the 'VOC2007' folder :param voc12_path: path to the 'VOC2012' folder :param output_folder: folder where the JSONs must be saved """
# Training data for path in [voc07_path, voc12_path]:
# Find IDs of images in training data #获取训练所用的train和val数据的图片id with open(os.path.join(path, 'ImageSets/Main/trainval.txt')) as f: ids = f.read().splitlines()
#根据图片id,解析图片的xml文件,获取标注信息 for id in ids: # Parse annotation's XML file objects = parse_annotation(os.path.join(path, 'Annotations', id + '.xml')) if len(objects['boxes']) == 0: #如果没有目标则跳过 continue n_objects += len(objects) #统计目标总数 train_objects.append(objects) #存储每张图片的标注信息到列表train_objects train_images.append(os.path.join(path, 'JPEGImages', id + '.jpg')) #存储每张图片的路径到列表train_images,用于读取图片
# Save to file #将训练数据的图片路径,标注信息,类别映射信息,分别保存为json文件 with open(os.path.join(output_folder, 'TRAIN_images.json'), 'w') as j: json.dump(train_images, j) with open(os.path.join(output_folder, 'TRAIN_objects.json'), 'w') as j: json.dump(train_objects, j) with open(os.path.join(output_folder, 'label_map.json'), 'w') as j: json.dump(label_map, j) # save label map too
print('\nThere are %d training images containing a total of %d objects. Files have been saved to %s.' % ( len(train_images), n_objects, os.path.abspath(output_folder)))
#与Train data一样,目的是将测试数据的图片路径,标注信息,类别映射信息,分别保存为json文件,参考上面的注释理解 # Test data test_images = list() test_objects = list() n_objects = 0
# Find IDs of images in the test data with open(os.path.join(voc07_path, 'ImageSets/Main/test.txt')) as f: ids = f.read().splitlines()
for id in ids: # Parse annotation's XML file objects = parse_annotation(os.path.join(voc07_path, 'Annotations', id + '.xml')) if len(objects) == 0: continue test_objects.append(objects) n_objects += len(objects) test_images.append(os.path.join(voc07_path, 'JPEGImages', id + '.jpg'))
assert len(test_objects) == len(test_images)
# Save to file with open(os.path.join(output_folder, 'TEST_images.json'), 'w') as j: json.dump(test_images, j) with open(os.path.join(output_folder, 'TEST_objects.json'), 'w') as j: json.dump(test_objects, j)
print('\nThere are %d test images containing a total of %d objects. Files have been saved to %s.' % ( len(test_images), n_objects, os.path.abspath(output_folder)))
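To see what the script produces, here is a minimal usage sketch. The VOC paths, the output folder, and the assumption that create_data_lists is importable from utils.py are placeholders for illustration, not taken from the code above:

```python
# Minimal usage sketch -- the paths below are placeholders, and create_data_lists
# is assumed to live in utils.py
import json
from utils import create_data_lists

create_data_lists(voc07_path='./VOCdevkit/VOC2007',
                  voc12_path='./VOCdevkit/VOC2012',
                  output_folder='./json_data')

# Peek at the generated annotation file
with open('./json_data/TRAIN_objects.json') as j:
    train_objects = json.load(j)
print(train_objects[0].keys())  # expected: dict_keys(['boxes', 'labels', 'difficulties'])
```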
This code lives in the datasets.py script. As you can see, PascalVOCDataset inherits from torch.utils.data.Dataset and overrides the four methods __init__, __getitem__, __len__ and collate_fn. This is the same work we routinely need to do when building a dataset of our own; read the code together with the comments below:
"""python PascalVOCDataset具体实现过程 """ import torch from torch.utils.data import Dataset import json import os from PIL import Image from utils import transform
classPascalVOCDataset(Dataset): """ A PyTorch Dataset class to be used in a PyTorch DataLoader to create batches. """
#初始化相关变量 #读取images和objects标注信息 def__init__(self, data_folder, split, keep_difficult=False): """ :param data_folder: folder where data files are stored :param split: split, one of 'TRAIN' or 'TEST' :param keep_difficult: keep or discard objects that are considered difficult to detect? """ self.split = split.upper() #保证输入为纯大写字母,便于匹配{'TRAIN', 'TEST'}
#我们知道,我们输入到网络中训练的数据通常是一个batch一起输入,而通过__getitem__我们只读取了一张图片及其objects信息 #如何将读取的一张张图片及其object信息整合成batch的形式呢? #collate_fn就是做这个事情, #对于一个batch的images,collate_fn通过torch.stack()将其整合成4维tensor,对应的objects信息分别用一个list存储 defcollate_fn(self, batch): """ Since each image may have a different number of objects, we need a collate function (to be passed to the DataLoader). This describes how to combine these tensors of different sizes. We use lists. Note: this need not be defined in this Class, can be standalone. :param batch: an iterable of N sets from __getitem__() :return: a tensor of images, lists of varying-size tensors of bounding boxes, labels, and difficulties """
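With the class above, batching is just a matter of handing collate_fn to the DataLoader. A minimal sketch, assuming __getitem__ returns an (image, boxes, labels, difficulties) tuple and that './json_data' is the placeholder folder produced by create_data_lists():

```python
from torch.utils.data import DataLoader

train_dataset = PascalVOCDataset(data_folder='./json_data',  # placeholder path
                                 split='TRAIN',
                                 keep_difficult=False)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True,
                          collate_fn=train_dataset.collate_fn)

images, boxes, labels, difficulties = next(iter(train_loader))
# images: a (8, 3, 224, 224) tensor stacked by collate_fn;
# boxes, labels, difficulties: lists of 8 tensors, one per image,
# since each image may contain a different number of objects
```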
"""python transform操作是训练模型中一项非常重要的工作,其中不仅包含数据增强以提升模型性能的相关操作,也包含如数据类型转换(PIL to Tensor)、归一化(Normalize)这些必要操作。 """ import json import os import torch import random import xml.etree.ElementTree as ET import torchvision.transforms.functional as FT
在TRAIN和TEST时都要进行的transform有: 1.统一图像大小到(224,224),resize 2.PIL to Tensor 3.归一化,FT.normalize()
注1: resize也是一种几何变化,要知道应用数据增强策略时,哪些属于几何变化,哪些属于像素变化 注2: PIL to Tensor操作,normalize操作必须执行 """
deftransform(image, boxes, labels, difficulties, split): """ Apply the transformations above. :param image: image, a PIL Image :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4) :param labels: labels of objects, a tensor of dimensions (n_objects) :param difficulties: difficulties of detection of these objects, a tensor of dimensions (n_objects) :param split: one of 'TRAIN' or 'TEST', since different sets of transformations are applied :return: transformed image, transformed bounding box coordinates, transformed labels, transformed difficulties """
#在训练和测试时使用的transform策略往往不完全相同,所以需要split变量指明是TRAIN还是TEST时的transform方法 assert split in {'TRAIN', 'TEST'}
# Mean and standard deviation of ImageNet data that our base VGG from torchvision was trained on # see: https://pytorch.org/docs/stable/torchvision/models.html #为了防止由于图片之间像素差异过大而导致的训练不稳定问题,图片在送入网络训练之间需要进行归一化 #对所有图片各通道求mean和std来获得 mean = [0.485, 0.456, 0.406] std = [0.229, 0.224, 0.225]
# Skip the following operations for evaluation/testing if split == 'TRAIN': # A series of photometric distortions in random order, each with 50% chance of occurrence, as in Caffe repo new_image = photometric_distort(new_image)
# Convert PIL image to Torch tensor new_image = FT.to_tensor(new_image)
# Expand image (zoom out) with a 50% chance - helpful for training detection of small objects # Fill surrounding space with the mean of ImageNet data that our base VGG was trained on if random.random() < 0.5: new_image, new_boxes = expand(new_image, boxes, filler=mean)
# Convert Torch tensor to PIL image new_image = FT.to_pil_image(new_image)
# Flip image with a 50% chance if random.random() < 0.5: new_image, new_boxes = flip(new_image, new_boxes)
# Resize image to (224, 224) - this also converts absolute boundary coordinates to their fractional form new_image, new_boxes = resize(new_image, new_boxes, dims=(224, 224))
# Convert PIL image to Torch tensor new_image = FT.to_tensor(new_image)
# Normalize by mean and standard deviation of ImageNet data that our base VGG was trained on new_image = FT.normalize(new_image, mean=mean, std=std)
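To make the input/output contract concrete, here is a minimal usage sketch of transform() at test time. The image path and box values below are made up for illustration; with split='TEST' none of the random augmentations fire, so only resize, PIL-to-Tensor and normalize are applied:

```python
from PIL import Image
import torch

# placeholder path and annotation values, for illustration only
image = Image.open('000001.jpg').convert('RGB')
boxes = torch.FloatTensor([[48., 240., 195., 371.]])  # absolute (xmin, ymin, xmax, ymax)
labels = torch.LongTensor([1])
difficulties = torch.ByteTensor([0])

new_image, new_boxes, new_labels, new_difficulties = transform(
    image, boxes, labels, difficulties, split='TEST')

print(new_image.shape)  # torch.Size([3, 224, 224]) -- normalized tensor
print(new_boxes)        # fractional coordinates in [0, 1], matching the resized image
```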