From 144afe6b4992610231e4860b8aa6b09928ded9dd Mon Sep 17 00:00:00 2001 From: Justin Ruan Date: Mon, 1 Mar 2021 02:59:44 +0800 Subject: [PATCH] Add label_to_csv.py to convert txt or xml files to csv. Also, add the (#704) README.md to tell how to do the transformation --- convert/README.md | 85 ++++++++++++++++ convert/label_to_csv.py | 215 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 300 insertions(+) create mode 100644 convert/README.md create mode 100644 convert/label_to_csv.py diff --git a/convert/README.md b/convert/README.md new file mode 100644 index 00000000..1e9addac --- /dev/null +++ b/convert/README.md @@ -0,0 +1,85 @@ +# Convert the label files to CSV + +## Introduction +To train the images on [Google Cloud AutoML](https://cloud.google.com/automl), we should prepare the specific csv files follow [this format](https://cloud.google.com/vision/automl/object-detection/docs/csv-format). + +`label_to_csv.py` can convert the `txt` or `xml` label files to csv file. The labels files should strictly follow to below structure. + +## Structures +* Images + To train the object detection tasks, all the images should upload to the cloud storage and access it by its name. All the images should stay in the **same buckets** in cloud storage. Also, different classes should have their own folder as below. + ``` + (on the cloud storage) + | -- class1 + | | -- class1_01.jpg + | | -- class1_02.jpg + | | ... + | -- class2 + | | -- class2_01.jpg + | | -- class2_02.jpg + | | ... + | ... + ``` + Note, URI of the `class1_01.jpg` is `gs:///class1/class1_01.jpg` +* Labels + There are four types of training data - `TRAINING`, `VALIDATION`, `TEST` and `UNASSIGNED`. To assign different categories, we should create four directories. + Inside each folder, users should create the class folders with the same name in cloud storage (see below structure). + ``` + labels (on PC) + | -- TRAINING + | | -- class1 + | | | -- class1_01.txt (or .xml) + | | | ... 
+ | | -- class2 + | | | -- class2_01.txt (or .xml) + | | | ... + | | ... + | -- VALIDATION + | | -- class1 + | | | -- class1_02.txt (or .xml) + | | | ... + | | -- class2 + | | | -- class2_02.txt (or .xml) + | | | ... + | | ... + | -- TEST + | | (same as TRAINING and VALIDATION) + | -- UNASSIGNED + | | (same as TRAINING and VALIDATION) + ``` + +## Usage + +To see the argument of `label_to_csv.py`, +```commandline +python label_to_csv.py -h +``` + +```commandline +usage: label_to_csv.py [-h] -p PREFIX -l LOCATION -m MODE [-o OUTPUT] + [-c CLASSES] + +optional arguments: + -h, --help show this help message and exit + -p PREFIX, --prefix PREFIX + Bucket of the cloud storage path + -l LOCATION, --location LOCATION + Parent directory of the label files + -m MODE, --mode MODE 'xml' for converting from xml and 'txt' for converting + from txt + -o OUTPUT, --output OUTPUT + Output name of csv file + -c CLASSES, --classes CLASSES + Label classes path +``` + +For example, if mine bucket name is **test**, the location of the label directory is **/User/test/labels**, the mode I choose from is **txt**, the output name and the class path is same as default. +```commandline +python label_to_csv.py \ +-p test\ +-l /User/test/labels \ +-m txt +``` + +The output file is `res.csv` by default. Afterwards, upload the csv file to the cloud storage and you can start training! 
def txt2csv(location, training_dir, path_prefix, labels=None):
    """Convert YOLO-style ``.txt`` label files in *location* to AutoML CSV rows.

    Each ``.txt`` file holds one ``class x_center y_center width height``
    line per bounding box, with coordinates normalized to [0, 1].

    Args:
        location: Directory containing the ``.txt`` label files.
        training_dir: Data-split tag for the first CSV column
            (``TRAINING``, ``VALIDATION``, ``TEST`` or ``UNASSIGNED``).
        path_prefix: Cloud-storage prefix, e.g. ``gs://bucket/class1``.
        labels: Optional list mapping class index -> class name.  Defaults to
            the module-level ``class_labels`` loaded in ``__main__`` (keeps
            the original call sites working unchanged).

    Returns:
        A list of 11-element rows in the AutoML "short" box format:
        ``[set, uri, label, x_min, y_min, "", "", x_max, y_max, "", ""]``.
    """
    if labels is None:
        labels = class_labels  # module-level list populated in __main__

    rows = []
    for file in os.listdir(location):
        # Skip non-label files; "classes.txt" holds class names, not boxes.
        # (Was `(not ...) | (...)`: bitwise `|` replaced with boolean `or`.)
        if not file.endswith(".txt") or file == "classes.txt":
            continue

        # Each line: "<class_id> <x_center> <y_center> <width> <height>"
        df_txt = pd.read_csv(f"{location}/{file}", sep=" ", header=None)

        for _, row in df_txt.iterrows():
            # gs://prefix/{image_stem}.jpg - image shares the label file's stem
            cloud_path = f"{path_prefix}/{os.path.splitext(file)[0]}.jpg"

            # Convert center/size to corner coordinates, clamped to [0, 1].
            x_min = min(max(0.0, row[1] - row[3] / 2), 1.0)
            y_min = min(max(0.0, row[2] - row[4] / 2), 1.0)
            x_max = min(max(0.0, row[1] + row[3] / 2), 1.0)
            y_max = min(max(0.0, row[2] + row[4] / 2), 1.0)

            # AutoML short form: only two opposite corners are required;
            # the other two corner slots stay blank.
            rows.append([str(training_dir), cloud_path, labels[int(row[0])],
                         x_min, y_min, "", "", x_max, y_max, "", ""])

    return rows
files + for file in os.listdir(location): + # Check the file name ends with xml + if not file.endswith(".xml"): + continue + + # Get the file name + file_whole_name = f"{location}/{file}" + + # Open the xml name + tree = ET.parse(file_whole_name) + root = tree.getroot() + + # Get the width, height of images + # to normalize the bounding boxes + size = root.find("size") + width, height = float(size.find("width").text), float(size.find("height").text) + + # Find all the bounding objects + for label_object in root.findall("object"): + # Temp array for csv, initialized by the training types + temp_csv = [str(training_dir)] + + # gs://prefix/name/{image_name} + cloud_path = f"{path_prefix}/{os.path.splitext(file)[0]}.jpg" + temp_csv.append(cloud_path) + + # Class label + temp_csv.append(label_object.find("name").text) + + # Bounding box coordinate + bounding_box = label_object.find("bndbox") + + # Add the upper left coordinate + x_min = float(bounding_box.find("xmin").text) / width + y_min = float(bounding_box.find("ymin").text) / height + temp_csv.extend([x_min, y_min]) + + # Add the lower left coordinate (not necessary, left blank) + temp_csv.extend(["", ""]) + + # Add the lower right coordinate + x_max = float(bounding_box.find("xmax").text) / width + y_max = float(bounding_box.find("ymax").text) / height + temp_csv.extend([x_max, y_max]) + + # Add the upper right coordinate (not necessary, left blank) + temp_csv.extend(["", ""]) + + # Append to the res + temp_res.append(temp_csv) + + return temp_res + + +if __name__ == "__main__": + # Add the argument parse + arg_p = argparse.ArgumentParser() + arg_p.add_argument("-p", "--prefix", + required=True, + type=str, + help="Bucket of the cloud storage path") + arg_p.add_argument("-l", "--location", + type=str, + required=True, + help="Location of the label files") + arg_p.add_argument("-m", "--mode", + type=str, + required=True, + help="'xml' for converting from xml and 'txt' for converting from txt") + 
if __name__ == "__main__":
    # Command-line interface.
    arg_p = argparse.ArgumentParser()
    arg_p.add_argument("-p", "--prefix", required=True, type=str,
                       help="Bucket of the cloud storage path")
    arg_p.add_argument("-l", "--location", type=str, required=True,
                       help="Location of the label files")
    arg_p.add_argument("-m", "--mode", type=str, required=True,
                       help="'xml' for converting from xml and 'txt' for converting from txt")
    arg_p.add_argument("-o", "--output", type=str, default="res.csv",
                       help="Output name of csv file")
    arg_p.add_argument("-c", "--classes", type=str,
                       default=os.path.join("..", "data", "predefined_classes.txt"),
                       help="Label classes path")
    args = vars(arg_p.parse_args())

    # Fail fast on a bad mode instead of discovering it mid-conversion
    # (the original only noticed inside the per-class loop).
    if args["mode"] not in ("txt", "xml"):
        print("Wrong argument for convert mode.\n"
              "'xml' for converting from xml to csv\n"
              "'txt' for converting from txt to csv")
        exit(1)

    # Class-index -> class-name mapping (used by the txt mode).
    # Stays at module level so txt2csv can read it as a global.
    class_labels = []
    if os.path.exists(args["classes"]):
        with codecs.open(args["classes"], 'r', 'utf8') as f:
            for line in f:
                class_labels.append(line.strip())
    else:  # Exit if the classes file is missing
        print(f"File: {args['classes']} does not exist")
        exit(1)

    # gs://bucket prefix shared by every image URI.
    ori_prefix = f"gs://{args['prefix']}"

    # Accumulated CSV rows across all splits and classes.
    res = []
    # First level: TRAINING / VALIDATION / TEST / UNASSIGNED directories.
    for training_type_dir in os.listdir(args["location"]):
        dir_name = f"{args['location']}/{training_type_dir}"
        if not os.path.isdir(dir_name):
            continue

        # Second level: one directory per class.
        for class_type_dir in os.listdir(dir_name):
            class_dir = f"{dir_name}/{class_type_dir}"
            # BUGFIX: check the class directory itself, not its parent,
            # so stray files (e.g. .DS_Store) are skipped.
            if not os.path.isdir(class_dir):
                continue

            prefix = f"{ori_prefix}/{class_type_dir}"

            # Convert the chosen extension to csv rows.
            if args["mode"] == "txt":
                res.extend(txt2csv(class_dir, training_type_dir, prefix))
            else:  # "xml" (validated above)
                res.extend(xml2csv(class_dir, training_type_dir, prefix))

    # AutoML short-form columns (blanks are the unused corner slots).
    # The header is not written, so names only document the layout;
    # the original list repeated/mislabeled several of them.
    res_csv = pd.DataFrame(res,
                           columns=["set", "path", "label",
                                    "x_min", "y_min", "blank1", "blank2",
                                    "x_max", "y_max", "blank3", "blank4"])
    # BUGFIX: honor the -o/--output argument (was hard-coded to "res.csv").
    res_csv.to_csv(args["output"], index=False, header=False)