From 144afe6b4992610231e4860b8aa6b09928ded9dd Mon Sep 17 00:00:00 2001 From: Justin Ruan Date: Mon, 1 Mar 2021 02:59:44 +0800 Subject: [PATCH] Add label_to_csv.py to convert txt or xml files to csv. Also, add the (#704) README.md to tell how to do the transformation --- convert/README.md | 85 ++++++++++++++++ convert/label_to_csv.py | 215 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 300 insertions(+) create mode 100644 convert/README.md create mode 100644 convert/label_to_csv.py diff --git a/convert/README.md b/convert/README.md new file mode 100644 index 00000000..1e9addac --- /dev/null +++ b/convert/README.md @@ -0,0 +1,85 @@ +# Convert the label files to CSV + +## Introduction +To train the images on [Google Cloud AutoML](https://cloud.google.com/automl), we should prepare the specific csv files follow [this format](https://cloud.google.com/vision/automl/object-detection/docs/csv-format). + +`label_to_csv.py` can convert the `txt` or `xml` label files to csv file. The labels files should strictly follow to below structure. + +## Structures +* Images + To train the object detection tasks, all the images should upload to the cloud storage and access it by its name. All the images should stay in the **same buckets** in cloud storage. Also, different classes should have their own folder as below. + ``` + (on the cloud storage) + | -- class1 + | | -- class1_01.jpg + | | -- class1_02.jpg + | | ... + | -- class2 + | | -- class2_01.jpg + | | -- class2_02.jpg + | | ... + | ... + ``` + Note, URI of the `class1_01.jpg` is `gs:///class1/class1_01.jpg` +* Labels + There are four types of training data - `TRAINING`, `VALIDATION`, `TEST` and `UNASSIGNED`. To assign different categories, we should create four directories. + Inside each folder, users should create the class folders with the same name in cloud storage (see below structure). + ``` + labels (on PC) + | -- TRAINING + | | -- class1 + | | | -- class1_01.txt (or .xml) + | | | ... 
+ | | -- class2 + | | | -- class2_01.txt (or .xml) + | | | ... + | | ... + | -- VALIDATION + | | -- class1 + | | | -- class1_02.txt (or .xml) + | | | ... + | | -- class2 + | | | -- class2_02.txt (or .xml) + | | | ... + | | ... + | -- TEST + | | (same as TRAINING and VALIDATION) + | -- UNASSIGNED + | | (same as TRAINING and VALIDATION) + ``` + +## Usage + +To see the argument of `label_to_csv.py`, +```commandline +python label_to_csv.py -h +``` + +```commandline +usage: label_to_csv.py [-h] -p PREFIX -l LOCATION -m MODE [-o OUTPUT] + [-c CLASSES] + +optional arguments: + -h, --help show this help message and exit + -p PREFIX, --prefix PREFIX + Bucket of the cloud storage path + -l LOCATION, --location LOCATION + Parent directory of the label files + -m MODE, --mode MODE 'xml' for converting from xml and 'txt' for converting + from txt + -o OUTPUT, --output OUTPUT + Output name of csv file + -c CLASSES, --classes CLASSES + Label classes path +``` + +For example, if mine bucket name is **test**, the location of the label directory is **/User/test/labels**, the mode I choose from is **txt**, the output name and the class path is same as default. +```commandline +python label_to_csv.py \ +-p test\ +-l /User/test/labels \ +-m txt +``` + +The output file is `res.csv` by default. Afterwards, upload the csv file to the cloud storage and you can start training! 
def txt2csv(location, training_dir, path_prefix, labels=None):
    """Convert YOLO-style ``.txt`` label files in *location* to AutoML CSV rows.

    Each ``.txt`` file holds one ``class x_center y_center width height``
    line per bounding box, with coordinates normalized to [0, 1].

    Args:
        location: Directory containing the ``.txt`` label files.
        training_dir: Data-split tag for the first CSV column
            (``TRAINING``, ``VALIDATION``, ``TEST`` or ``UNASSIGNED``).
        path_prefix: Cloud-storage prefix, e.g. ``gs://bucket/class1``.
        labels: Optional list mapping class index -> class name.  Defaults to
            the module-level ``class_labels`` loaded in ``__main__`` (keeps
            the original call sites working unchanged).

    Returns:
        A list of 11-element rows in the AutoML "short" box format:
        ``[set, uri, label, x_min, y_min, "", "", x_max, y_max, "", ""]``.
    """
    if labels is None:
        labels = class_labels  # module-level list populated in __main__

    rows = []
    for file in os.listdir(location):
        # Skip non-label files; "classes.txt" holds class names, not boxes.
        # (Was `(not ...) | (...)`: bitwise `|` replaced with boolean `or`.)
        if not file.endswith(".txt") or file == "classes.txt":
            continue

        # Each line: "<class_id> <x_center> <y_center> <width> <height>"
        df_txt = pd.read_csv(f"{location}/{file}", sep=" ", header=None)

        for _, row in df_txt.iterrows():
            # gs://prefix/{image_stem}.jpg - image shares the label file's stem
            cloud_path = f"{path_prefix}/{os.path.splitext(file)[0]}.jpg"

            # Convert center/size to corner coordinates, clamped to [0, 1].
            x_min = min(max(0.0, row[1] - row[3] / 2), 1.0)
            y_min = min(max(0.0, row[2] - row[4] / 2), 1.0)
            x_max = min(max(0.0, row[1] + row[3] / 2), 1.0)
            y_max = min(max(0.0, row[2] + row[4] / 2), 1.0)

            # AutoML short form: only two opposite corners are required;
            # the other two corner slots stay blank.
            rows.append([str(training_dir), cloud_path, labels[int(row[0])],
                         x_min, y_min, "", "", x_max, y_max, "", ""])

    return rows
files + for file in os.listdir(location): + # Check the file name ends with xml + if not file.endswith(".xml"): + continue + + # Get the file name + file_whole_name = f"{location}/{file}" + + # Open the xml name + tree = ET.parse(file_whole_name) + root = tree.getroot() + + # Get the width, height of images + # to normalize the bounding boxes + size = root.find("size") + width, height = float(size.find("width").text), float(size.find("height").text) + + # Find all the bounding objects + for label_object in root.findall("object"): + # Temp array for csv, initialized by the training types + temp_csv = [str(training_dir)] + + # gs://prefix/name/{image_name} + cloud_path = f"{path_prefix}/{os.path.splitext(file)[0]}.jpg" + temp_csv.append(cloud_path) + + # Class label + temp_csv.append(label_object.find("name").text) + + # Bounding box coordinate + bounding_box = label_object.find("bndbox") + + # Add the upper left coordinate + x_min = float(bounding_box.find("xmin").text) / width + y_min = float(bounding_box.find("ymin").text) / height + temp_csv.extend([x_min, y_min]) + + # Add the lower left coordinate (not necessary, left blank) + temp_csv.extend(["", ""]) + + # Add the lower right coordinate + x_max = float(bounding_box.find("xmax").text) / width + y_max = float(bounding_box.find("ymax").text) / height + temp_csv.extend([x_max, y_max]) + + # Add the upper right coordinate (not necessary, left blank) + temp_csv.extend(["", ""]) + + # Append to the res + temp_res.append(temp_csv) + + return temp_res + + +if __name__ == "__main__": + # Add the argument parse + arg_p = argparse.ArgumentParser() + arg_p.add_argument("-p", "--prefix", + required=True, + type=str, + help="Bucket of the cloud storage path") + arg_p.add_argument("-l", "--location", + type=str, + required=True, + help="Location of the label files") + arg_p.add_argument("-m", "--mode", + type=str, + required=True, + help="'xml' for converting from xml and 'txt' for converting from txt") + 
if __name__ == "__main__":
    # Command-line interface.
    arg_p = argparse.ArgumentParser()
    arg_p.add_argument("-p", "--prefix", required=True, type=str,
                       help="Bucket of the cloud storage path")
    arg_p.add_argument("-l", "--location", type=str, required=True,
                       help="Location of the label files")
    arg_p.add_argument("-m", "--mode", type=str, required=True,
                       help="'xml' for converting from xml and 'txt' for converting from txt")
    arg_p.add_argument("-o", "--output", type=str, default="res.csv",
                       help="Output name of csv file")
    arg_p.add_argument("-c", "--classes", type=str,
                       default=os.path.join("..", "data", "predefined_classes.txt"),
                       help="Label classes path")
    args = vars(arg_p.parse_args())

    # Fail fast on a bad mode instead of discovering it mid-conversion
    # (the original only noticed inside the per-class loop).
    if args["mode"] not in ("txt", "xml"):
        print("Wrong argument for convert mode.\n"
              "'xml' for converting from xml to csv\n"
              "'txt' for converting from txt to csv")
        exit(1)

    # Class-index -> class-name mapping (used by the txt mode).
    # Stays at module level so txt2csv can read it as a global.
    class_labels = []
    if os.path.exists(args["classes"]):
        with codecs.open(args["classes"], 'r', 'utf8') as f:
            for line in f:
                class_labels.append(line.strip())
    else:  # Exit if the classes file is missing
        print(f"File: {args['classes']} does not exist")
        exit(1)

    # gs://bucket prefix shared by every image URI.
    ori_prefix = f"gs://{args['prefix']}"

    # Accumulated CSV rows across all splits and classes.
    res = []
    # First level: TRAINING / VALIDATION / TEST / UNASSIGNED directories.
    for training_type_dir in os.listdir(args["location"]):
        dir_name = f"{args['location']}/{training_type_dir}"
        if not os.path.isdir(dir_name):
            continue

        # Second level: one directory per class.
        for class_type_dir in os.listdir(dir_name):
            class_dir = f"{dir_name}/{class_type_dir}"
            # BUGFIX: check the class directory itself, not its parent,
            # so stray files (e.g. .DS_Store) are skipped.
            if not os.path.isdir(class_dir):
                continue

            prefix = f"{ori_prefix}/{class_type_dir}"

            # Convert the chosen extension to csv rows.
            if args["mode"] == "txt":
                res.extend(txt2csv(class_dir, training_type_dir, prefix))
            else:  # "xml" (validated above)
                res.extend(xml2csv(class_dir, training_type_dir, prefix))

    # AutoML short-form columns (blanks are the unused corner slots).
    # The header is not written, so names only document the layout;
    # the original list repeated/mislabeled several of them.
    res_csv = pd.DataFrame(res,
                           columns=["set", "path", "label",
                                    "x_min", "y_min", "blank1", "blank2",
                                    "x_max", "y_max", "blank3", "blank4"])
    # BUGFIX: honor the -o/--output argument (was hard-coded to "res.csv").
    res_csv.to_csv(args["output"], index=False, header=False)