Add label_to_csv.py to convert txt or xml files to csv. Also, add the (#704)
README.md to tell how to do the transformation
This commit is contained in:
parent
ebb2b83387
commit
144afe6b49
85
convert/README.md
Normal file
85
convert/README.md
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
# Convert the label files to CSV
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
To train the images on [Google Cloud AutoML](https://cloud.google.com/automl), we should prepare the specific csv files follow [this format](https://cloud.google.com/vision/automl/object-detection/docs/csv-format).
|
||||||
|
|
||||||
|
`label_to_csv.py` can convert the `txt` or `xml` label files to csv file. The labels files should strictly follow to below structure.
|
||||||
|
|
||||||
|
## Structures
|
||||||
|
* Images
|
||||||
|
To train the object detection tasks, all the images should upload to the cloud storage and access it by its name. All the images should stay in the **same buckets** in cloud storage. Also, different classes should have their own folder as below.
|
||||||
|
```
|
||||||
|
<bucket_name> (on the cloud storage)
|
||||||
|
| -- class1
|
||||||
|
| | -- class1_01.jpg
|
||||||
|
| | -- class1_02.jpg
|
||||||
|
| | ...
|
||||||
|
| -- class2
|
||||||
|
| | -- class2_01.jpg
|
||||||
|
| | -- class2_02.jpg
|
||||||
|
| | ...
|
||||||
|
| ...
|
||||||
|
```
|
||||||
|
Note, URI of the `class1_01.jpg` is `gs://<bucket_name>/class1/class1_01.jpg`
|
||||||
|
* Labels
|
||||||
|
There are four types of training data - `TRAINING`, `VALIDATION`, `TEST` and `UNASSIGNED`. To assign different categories, we should create four directories.
|
||||||
|
Inside each folder, users should create the class folders with the same name in cloud storage (see below structure).
|
||||||
|
```
|
||||||
|
labels (on PC)
|
||||||
|
| -- TRAINING
|
||||||
|
| | -- class1
|
||||||
|
| | | -- class1_01.txt (or .xml)
|
||||||
|
| | | ...
|
||||||
|
| | -- class2
|
||||||
|
| | | -- class2_01.txt (or .xml)
|
||||||
|
| | | ...
|
||||||
|
| | ...
|
||||||
|
| -- VALIDATION
|
||||||
|
| | -- class1
|
||||||
|
| | | -- class1_02.txt (or .xml)
|
||||||
|
| | | ...
|
||||||
|
| | -- class2
|
||||||
|
| | | -- class2_02.txt (or .xml)
|
||||||
|
| | | ...
|
||||||
|
| | ...
|
||||||
|
| -- TEST
|
||||||
|
| | (same as TRAINING and VALIDATION)
|
||||||
|
| -- UNASSIGNED
|
||||||
|
| | (same as TRAINING and VALIDATION)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
To see the argument of `label_to_csv.py`,
|
||||||
|
```commandline
|
||||||
|
python label_to_csv.py -h
|
||||||
|
```
|
||||||
|
|
||||||
|
```commandline
|
||||||
|
usage: label_to_csv.py [-h] -p PREFIX -l LOCATION -m MODE [-o OUTPUT]
|
||||||
|
[-c CLASSES]
|
||||||
|
|
||||||
|
optional arguments:
|
||||||
|
-h, --help show this help message and exit
|
||||||
|
-p PREFIX, --prefix PREFIX
|
||||||
|
Bucket of the cloud storage path
|
||||||
|
-l LOCATION, --location LOCATION
|
||||||
|
Parent directory of the label files
|
||||||
|
-m MODE, --mode MODE 'xml' for converting from xml and 'txt' for converting
|
||||||
|
from txt
|
||||||
|
-o OUTPUT, --output OUTPUT
|
||||||
|
Output name of csv file
|
||||||
|
-c CLASSES, --classes CLASSES
|
||||||
|
Label classes path
|
||||||
|
```
|
||||||
|
|
||||||
|
For example, if mine bucket name is **test**, the location of the label directory is **/User/test/labels**, the mode I choose from is **txt**, the output name and the class path is same as default.
|
||||||
|
```commandline
|
||||||
|
python label_to_csv.py \
|
||||||
|
-p test\
|
||||||
|
-l /User/test/labels \
|
||||||
|
-m txt
|
||||||
|
```
|
||||||
|
|
||||||
|
The output file is `res.csv` by default. Afterwards, upload the csv file to the cloud storage and you can start training!
|
||||||
|
|
||||||
215
convert/label_to_csv.py
Normal file
215
convert/label_to_csv.py
Normal file
@ -0,0 +1,215 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
Name: label_to_csv.py
|
||||||
|
Author: Justin Ruan
|
||||||
|
Contact: justin900429@gmail.com
|
||||||
|
Time: 2021.02.06
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
import codecs
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def txt2csv(location, training_dir, path_prefix):
|
||||||
|
# Return list
|
||||||
|
temp_res = []
|
||||||
|
|
||||||
|
# Run through all the files
|
||||||
|
for file in os.listdir(location):
|
||||||
|
# Check the file name ends with txt
|
||||||
|
# and not class.txt
|
||||||
|
if (not file.endswith(".txt")) | \
|
||||||
|
(file == "classes.txt"):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Get the file name
|
||||||
|
file_whole_name = f"{location}/{file}"
|
||||||
|
|
||||||
|
# Read in txt as csv
|
||||||
|
df_txt = pd.read_csv(file_whole_name, sep=" ", header=None)
|
||||||
|
|
||||||
|
# Create data for each labels
|
||||||
|
for index, row in df_txt.iterrows():
|
||||||
|
# Temp array for csv, initialized by the training types
|
||||||
|
temp_csv = [str(training_dir)]
|
||||||
|
|
||||||
|
# gs://prefix/name/{image_name}
|
||||||
|
cloud_path = f"{path_prefix}/{os.path.splitext(file)[0]}.jpg"
|
||||||
|
temp_csv.append(cloud_path)
|
||||||
|
|
||||||
|
# Class label
|
||||||
|
temp_csv.append(class_labels[int(row[0])])
|
||||||
|
|
||||||
|
# Add the upper left coordinate
|
||||||
|
x_min = min(max(0.0, row[1] - row[3] / 2), 1.0)
|
||||||
|
y_min = min(max(0.0, row[2] - row[4] / 2), 1.0)
|
||||||
|
temp_csv.extend([x_min, y_min])
|
||||||
|
|
||||||
|
# Add the lower left coordinate (not necessary, left blank)
|
||||||
|
temp_csv.extend(["", ""])
|
||||||
|
|
||||||
|
# Add the lower right coordinate
|
||||||
|
x_max = min(max(0.0, row[1] + row[3] / 2), 1.0)
|
||||||
|
y_max = min(max(0.0, row[2] + row[4] / 2), 1.0)
|
||||||
|
temp_csv.extend([x_max, y_max])
|
||||||
|
|
||||||
|
# Add the upper right coordinate (not necessary, left blank)
|
||||||
|
temp_csv.extend(["", ""])
|
||||||
|
|
||||||
|
# Append to the res
|
||||||
|
temp_res.append(temp_csv)
|
||||||
|
|
||||||
|
return temp_res
|
||||||
|
|
||||||
|
|
||||||
|
def xml2csv(location, training_dir, path_prefix):
|
||||||
|
# To parse the xml files
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
# Return list
|
||||||
|
temp_res = []
|
||||||
|
|
||||||
|
# Run through all the files
|
||||||
|
for file in os.listdir(location):
|
||||||
|
# Check the file name ends with xml
|
||||||
|
if not file.endswith(".xml"):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Get the file name
|
||||||
|
file_whole_name = f"{location}/{file}"
|
||||||
|
|
||||||
|
# Open the xml name
|
||||||
|
tree = ET.parse(file_whole_name)
|
||||||
|
root = tree.getroot()
|
||||||
|
|
||||||
|
# Get the width, height of images
|
||||||
|
# to normalize the bounding boxes
|
||||||
|
size = root.find("size")
|
||||||
|
width, height = float(size.find("width").text), float(size.find("height").text)
|
||||||
|
|
||||||
|
# Find all the bounding objects
|
||||||
|
for label_object in root.findall("object"):
|
||||||
|
# Temp array for csv, initialized by the training types
|
||||||
|
temp_csv = [str(training_dir)]
|
||||||
|
|
||||||
|
# gs://prefix/name/{image_name}
|
||||||
|
cloud_path = f"{path_prefix}/{os.path.splitext(file)[0]}.jpg"
|
||||||
|
temp_csv.append(cloud_path)
|
||||||
|
|
||||||
|
# Class label
|
||||||
|
temp_csv.append(label_object.find("name").text)
|
||||||
|
|
||||||
|
# Bounding box coordinate
|
||||||
|
bounding_box = label_object.find("bndbox")
|
||||||
|
|
||||||
|
# Add the upper left coordinate
|
||||||
|
x_min = float(bounding_box.find("xmin").text) / width
|
||||||
|
y_min = float(bounding_box.find("ymin").text) / height
|
||||||
|
temp_csv.extend([x_min, y_min])
|
||||||
|
|
||||||
|
# Add the lower left coordinate (not necessary, left blank)
|
||||||
|
temp_csv.extend(["", ""])
|
||||||
|
|
||||||
|
# Add the lower right coordinate
|
||||||
|
x_max = float(bounding_box.find("xmax").text) / width
|
||||||
|
y_max = float(bounding_box.find("ymax").text) / height
|
||||||
|
temp_csv.extend([x_max, y_max])
|
||||||
|
|
||||||
|
# Add the upper right coordinate (not necessary, left blank)
|
||||||
|
temp_csv.extend(["", ""])
|
||||||
|
|
||||||
|
# Append to the res
|
||||||
|
temp_res.append(temp_csv)
|
||||||
|
|
||||||
|
return temp_res
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# Add the argument parse
|
||||||
|
arg_p = argparse.ArgumentParser()
|
||||||
|
arg_p.add_argument("-p", "--prefix",
|
||||||
|
required=True,
|
||||||
|
type=str,
|
||||||
|
help="Bucket of the cloud storage path")
|
||||||
|
arg_p.add_argument("-l", "--location",
|
||||||
|
type=str,
|
||||||
|
required=True,
|
||||||
|
help="Location of the label files")
|
||||||
|
arg_p.add_argument("-m", "--mode",
|
||||||
|
type=str,
|
||||||
|
required=True,
|
||||||
|
help="'xml' for converting from xml and 'txt' for converting from txt")
|
||||||
|
arg_p.add_argument("-o", "--output",
|
||||||
|
type=str,
|
||||||
|
default="res.csv",
|
||||||
|
help="Output name of csv file")
|
||||||
|
arg_p.add_argument("-c", "--classes",
|
||||||
|
type=str,
|
||||||
|
default=os.path.join("..", "data", "predefined_classes.txt"),
|
||||||
|
help="Label classes path")
|
||||||
|
args = vars(arg_p.parse_args())
|
||||||
|
|
||||||
|
# Class labels
|
||||||
|
class_labels = []
|
||||||
|
|
||||||
|
# Load in the defined classes
|
||||||
|
if os.path.exists(args["classes"]) is True:
|
||||||
|
with codecs.open(args["classes"], 'r', 'utf8') as f:
|
||||||
|
for line in f:
|
||||||
|
line = line.strip()
|
||||||
|
class_labels.append(line)
|
||||||
|
else: # Exit if errors occurred
|
||||||
|
print(f"File: {args['classes']} not exists")
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
# Prefix of the cloud storage
|
||||||
|
ori_prefix = f"gs://{args['prefix']}"
|
||||||
|
|
||||||
|
# Array for final csv file
|
||||||
|
res = []
|
||||||
|
# Get all the file in dir
|
||||||
|
for training_type_dir in os.listdir(args["location"]):
|
||||||
|
# Get the dirname
|
||||||
|
dir_name = f"{args['location']}/{training_type_dir}"
|
||||||
|
|
||||||
|
# Check whether is dir
|
||||||
|
if not os.path.isdir(dir_name):
|
||||||
|
continue
|
||||||
|
# Process the files
|
||||||
|
|
||||||
|
for class_type_dir in os.listdir(dir_name):
|
||||||
|
|
||||||
|
# Check whether is dir
|
||||||
|
if not os.path.isdir(dir_name):
|
||||||
|
continue
|
||||||
|
|
||||||
|
prefix = f"{ori_prefix}/{class_type_dir}"
|
||||||
|
|
||||||
|
# Convert the chosen extension to csv
|
||||||
|
if args["mode"] == "txt":
|
||||||
|
res.extend(txt2csv(f"{dir_name}/{class_type_dir}",
|
||||||
|
training_type_dir,
|
||||||
|
prefix))
|
||||||
|
elif args["mode"] == "xml":
|
||||||
|
res.extend(xml2csv(f"{dir_name}/{class_type_dir}",
|
||||||
|
training_type_dir,
|
||||||
|
prefix))
|
||||||
|
else:
|
||||||
|
print("Wrong argument for convert mode.\n"
|
||||||
|
"'xml' for converting from xml to csv\n"
|
||||||
|
"'txt' for converting from txt to csv")
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
# Write to the result csv
|
||||||
|
res_csv = pd.DataFrame(res,
|
||||||
|
columns=["set", "path", "label",
|
||||||
|
"x_min", "y_min",
|
||||||
|
"x_max", "y_min",
|
||||||
|
"x_max", "y_max",
|
||||||
|
"x_min", "y_max"])
|
||||||
|
res_csv.to_csv("res.csv", index=False, header=False)
|
||||||
Loading…
x
Reference in New Issue
Block a user