Creating a Training Dataset That Meets MMDetection's Requirements
- Preface
- 1. Installing MMDetection
- 2. Creating a COCO-format dataset
- Important note
- Summary
- The next post will cover how to train a model on the dataset prepared here
Preface
MMDetection is an open-source object detection project from OpenMMLab. This post first walks through installing MMDetection, then shows how to convert your annotation files (e.g. xml or txt) into the label format MMDetection expects.
1. Installing MMDetection
If you are installing mmdetection on your local machine, make sure Python and Anaconda are installed correctly; on a remote server, Python and Miniconda are usually preconfigured.
The steps below follow OpenMMLab's official installation guide:
```bash
# create the environment (the name is up to you)
conda create -n openmmlab python=3.8 pytorch==1.10.0 cudatoolkit=10.1 torchvision -c pytorch -y
conda activate openmmlab
pip install openmim
mim install mmcv-full
git clone https://github.com/open-mmlab/mmdetection.git
cd mmdetection
pip install -r requirements/build.txt
pip install -v -e .
```
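To confirm the installation worked, you can run a quick import check inside the openmmlab environment (a minimal sketch I added; not part of the official guide):

```python
# sanity check: the core packages import and report their versions
import torch
import mmcv
import mmdet

print('torch:', torch.__version__, '| CUDA available:', torch.cuda.is_available())
print('mmcv:', mmcv.__version__, '| mmdet:', mmdet.__version__)
```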
2. Creating a COCO-format dataset
We target the COCO format because most of MMDetection's dataset configs assume COCO, so converting to it makes training and testing straightforward. I assume here that the dataset has already been annotated and split (YOLOv5-style):
```
NeedTrainImageFold
├── images
│   ├── train   # training images
│   ├── val     # validation images
│   └── test    # test images
└── labels
    ├── train   # training labels
    ├── val     # validation labels
    └── test    # test labels
```

If your label files are xml or txt, they need to be converted to json. Here are the open-source converters: if your labels are in xml format, use voc2coco; if they are txt, use YOLO2COCO.
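For reference, each line of a YOLOv5 txt label file describes one object as `class_id cx cy w h`, with the box center and size normalized to [0, 1]; the numbers below are made up for illustration:

```
0 0.5123 0.4430 0.2150 0.3078
1 0.2501 0.7312 0.1088 0.0940
```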
Since I work with txt labels most often, I'll walk through using the YOLO2COCO repository.
First, clone the project: `git clone https://github.com/RapidAI/YOLO2COCO.git` (make sure git is installed, otherwise the clone will fail).
The project provides two conversion paths to COCO: YOLOV5 -> COCO and YOLOV5 YAML -> COCO.
I'll mainly demonstrate YOLOV5 YAML -> COCO here. First you need to create the YAML file, in the following format:
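The original post showed the YAML as a screenshot. Here is a minimal sketch matching the directory layout above and the keys the converter script reads (`path`, `train`, `val`, `test`, `nc`, `names`); the paths and class names are placeholders to adapt:

```yaml
path: ../NeedTrainImageFold   # dataset root, read via the 'path' key
train: images/train           # training images, relative to path
val: images/val               # validation images
test: images/test             # test images

nc: 2                         # number of classes (placeholder)
names: ['cat', 'dog']         # class names (placeholders)
```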
Then run in the terminal:
```bash
python yolov5_yaml_2_coco.py --yaml_path dataset/YOLOV5_yaml/sample.yaml  # path to your yaml file
```
Important note
If your dataset split is the same as mine above and includes a test set, the command above will not produce the test2017 folder or its annotation file.
I modified the author's yolov5_yaml_2_coco.py slightly. Copy all of the code below and use it to replace yolov5_yaml_2_coco.py:
```python
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import argparse
import glob
import json
import os
import shutil
import time
from pathlib import Path

import cv2
import yaml
from tqdm import tqdm


def read_txt(txt_path):
    with open(str(txt_path), 'r', encoding='utf-8') as f:
        data = list(map(lambda x: x.rstrip('\n'), f))
    return data


def mkdir(dir_path):
    Path(dir_path).mkdir(parents=True, exist_ok=True)


def verify_exists(file_path):
    file_path = Path(file_path).resolve()
    if not file_path.exists():
        raise FileNotFoundError(f'{file_path} does not exist!')


class YOLOV5CFG2COCO(object):
    def __init__(self, yaml_path):
        verify_exists(yaml_path)
        with open(yaml_path, 'r', encoding='UTF-8') as f:
            self.data_cfg = yaml.safe_load(f)

        self.root_dir = Path(yaml_path).parent.parent
        self.root_data_dir = Path(self.data_cfg.get('path'))

        self.train_path = self._get_data_dir('train')
        self.val_path = self._get_data_dir('val')
        self.test_path = self._get_data_dir('test')

        nc = self.data_cfg['nc']
        if 'names' in self.data_cfg:
            self.names = self.data_cfg.get('names')
        else:
            # assign class names if missing
            self.names = [f'class{i}' for i in range(self.data_cfg['nc'])]
        assert len(self.names) == nc, \
            f'{len(self.names)} names found for nc={nc} dataset in {yaml_path}'

        # build the COCO-format directory layout
        self.dst = self.root_dir / f'{Path(self.root_data_dir).stem}_COCO_format'
        self.coco_train = 'train2017'
        self.coco_val = 'val2017'
        self.coco_test = 'test2017'
        self.coco_annotation = 'annotations'
        self.coco_train_json = self.dst / self.coco_annotation / \
            f'instances_{self.coco_train}.json'
        self.coco_val_json = self.dst / self.coco_annotation / \
            f'instances_{self.coco_val}.json'
        self.coco_test_json = self.dst / self.coco_annotation / \
            f'instances_{self.coco_test}.json'

        mkdir(self.dst)
        mkdir(self.dst / self.coco_train)
        mkdir(self.dst / self.coco_val)
        mkdir(self.dst / self.coco_test)
        mkdir(self.dst / self.coco_annotation)

        # build the skeleton of the COCO json content
        self.type = 'instances'
        self.categories = []
        self._get_category()

        self.annotation_id = 1
        cur_year = time.strftime('%Y', time.localtime(time.time()))
        self.info = {
            'year': int(cur_year),
            'version': '1.0',
            'description': 'For object detection',
            'date_created': cur_year,
        }
        self.licenses = [{
            'id': 1,
            'name': 'Apache License v2.0',
            'url': 'https://github.com/RapidAI/YOLO2COCO/LICENSE',
        }]

    def _get_data_dir(self, mode):
        data_dir = self.data_cfg.get(mode)
        if data_dir:
            if isinstance(data_dir, str):
                full_path = [str(self.root_data_dir / data_dir)]
            elif isinstance(data_dir, list):
                full_path = [str(self.root_data_dir / one_dir)
                             for one_dir in data_dir]
            else:
                raise TypeError(f'{data_dir} is not str or list.')
        else:
            raise ValueError(f'{mode} dir is not in the yaml.')
        return full_path

    def _get_category(self):
        for i, category in enumerate(self.names, start=1):
            self.categories.append({
                'supercategory': category,
                'id': i,
                'name': category,
            })

    def generate(self):
        self.train_files = self.get_files(self.train_path)
        self.valid_files = self.get_files(self.val_path)
        self.test_files = self.get_files(self.test_path)

        train_dest_dir = Path(self.dst) / self.coco_train
        self.gen_dataset(self.train_files, train_dest_dir,
                         self.coco_train_json, mode='train')

        val_dest_dir = Path(self.dst) / self.coco_val
        self.gen_dataset(self.valid_files, val_dest_dir,
                         self.coco_val_json, mode='val')

        test_dest_dir = Path(self.dst) / self.coco_test
        self.gen_dataset(self.test_files, test_dest_dir,
                         self.coco_test_json, mode='test')

        print(f'The output directory is: {str(self.dst)}')

    def get_files(self, path):
        # accepted image suffixes
        IMG_FORMATS = ['bmp', 'dng', 'jpeg', 'jpg',
                       'mpo', 'png', 'tif', 'tiff', 'webp']
        f = []
        for p in path:
            p = Path(p)  # os-agnostic
            if p.is_dir():  # dir
                f += glob.glob(str(p / '**' / '*.*'), recursive=True)
                # f = list(p.rglob('*.*'))  # pathlib
            elif p.is_file():  # file
                with open(p) as t:
                    t = t.read().strip().splitlines()
                    parent = str(p.parent) + os.sep
                    # local to global path
                    f += [x.replace('./', parent)
                          if x.startswith('./') else x for x in t]
                    # f += [p.parent / x.lstrip(os.sep) for x in t]  # local to global path (pathlib)
            else:
                raise Exception(f'{p} does not exist')
        im_files = sorted(x.replace('/', os.sep)
                          for x in f if x.split('.')[-1].lower() in IMG_FORMATS)
        return im_files

    def gen_dataset(self, img_paths, target_img_path, target_json, mode):
        """https://cocodataset.org/#format-data"""
        images = []
        annotations = []

        sa, sb = os.sep + 'images' + os.sep, os.sep + \
            'labels' + os.sep  # /images/, /labels/ substrings

        for img_id, img_path in enumerate(tqdm(img_paths, desc=mode), 1):
            # derive the label path by swapping /images/ for /labels/
            label_path = sb.join(
                img_path.rsplit(sa, 1)).rsplit('.', 1)[0] + '.txt'

            img_path = Path(img_path)
            verify_exists(img_path)
            imgsrc = cv2.imread(str(img_path))
            height, width = imgsrc.shape[:2]

            dest_file_name = f'{img_id:012d}.jpg'
            save_img_path = target_img_path / dest_file_name

            if img_path.suffix.lower() == '.jpg':
                shutil.copyfile(img_path, save_img_path)
            else:
                cv2.imwrite(str(save_img_path), imgsrc)

            images.append({
                'date_captured': '2021',
                'file_name': dest_file_name,
                'id': img_id,
                'height': height,
                'width': width,
            })

            if Path(label_path).exists():
                new_anno = self.read_annotation(label_path, img_id,
                                                height, width)
                if len(new_anno) > 0:
                    annotations.extend(new_anno)
                else:
                    raise ValueError(f'{label_path} is empty')
            else:
                raise FileNotFoundError(f'{label_path} does not exist')

        json_data = {
            'info': self.info,
            'images': images,
            'licenses': self.licenses,
            'type': self.type,
            'annotations': annotations,
            'categories': self.categories,
        }
        with open(target_json, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, ensure_ascii=False)

    def read_annotation(self, txt_file, img_id, height, width):
        annotation = []
        all_info = read_txt(txt_file)
        for label_info in all_info:
            # iterate over the annotated objects in one image
            label_info = label_info.split(' ')
            if len(label_info) < 5:
                continue

            category_id, vertex_info = label_info[0], label_info[1:]
            segmentation, bbox, area = self._get_annotation(vertex_info,
                                                            height, width)
            annotation.append({
                'segmentation': segmentation,
                'area': area,
                'iscrowd': 0,
                'image_id': img_id,
                'bbox': bbox,
                'category_id': int(category_id) + 1,
                'id': self.annotation_id,
            })
            self.annotation_id += 1
        return annotation

    @staticmethod
    def _get_annotation(vertex_info, height, width):
        # convert normalized YOLO (cx, cy, w, h) to absolute pixels
        cx, cy, w, h = [float(i) for i in vertex_info]

        cx = cx * width
        cy = cy * height
        box_w = w * width
        box_h = h * height

        # left top
        x0 = max(cx - box_w / 2, 0)
        y0 = max(cy - box_h / 2, 0)

        # right bottom
        x1 = min(x0 + box_w, width)
        y1 = min(y0 + box_h, height)

        segmentation = [[x0, y0, x1, y0, x1, y1, x0, y1]]
        bbox = [x0, y0, box_w, box_h]
        area = box_w * box_h
        return segmentation, bbox, area


if __name__ == '__main__':
    parser = argparse.ArgumentParser('Datasets converter from YOLOV5 to COCO')
    parser.add_argument('--yaml_path', type=str,
                        default='dataset/YOLOV5_yaml/sample.yaml',
                        help='Dataset cfg file')
    args = parser.parse_args()

    converter = YOLOV5CFG2COCO(args.yaml_path)
    converter.generate()
```
Then run in the terminal:
```bash
python yolov5_yaml_2_coco.py --yaml_path dataset/YOLOV5_yaml/sample.yaml
```
Now the test set images and the test annotation json file are generated as well.
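Based on the directory names hard-coded in the script, the output layout should look like this (the root folder is named after your dataset directory with a `_COCO_format` suffix):

```
NeedTrainImageFold_COCO_format
├── annotations
│   ├── instances_train2017.json
│   ├── instances_val2017.json
│   └── instances_test2017.json
├── train2017   # renamed training images (000000000001.jpg, ...)
├── val2017     # validation images
└── test2017    # test images
```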
And that's it: the COCO-format dataset has been created. Quite simple after all.
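As an optional sanity check (my own suggestion, not part of the original workflow), you can load one of the generated annotation files with pycocotools to confirm it parses as valid COCO; adjust the path to your output directory:

```python
from pycocotools.coco import COCO

# load a generated annotation file (adjust the path to your output)
coco = COCO('NeedTrainImageFold_COCO_format/annotations/instances_train2017.json')
print(len(coco.getImgIds()), 'images,', len(coco.getAnnIds()), 'annotations')
print('categories:', [c['name'] for c in coco.loadCats(coco.getCatIds())])
```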
Summary
- Install mmdetection
- Prepare the annotated dataset
- Check your label file format and use the matching open-source converter
- If your dataset layout is the same as mine, see the note above and replace the author's script with the modified code
The next post will cover how to train a model on the dataset we just prepared.