load-images-in-hdf5(Python)
# Mount an Azure Blob Storage container onto DBFS using a SAS token.
# Add the Storage Account, Container, and reference the secret to pass the SAS Token
MOUNTPOINT = "/mnt/databricks/demo"
STORAGE_ACCOUNT = "emeashare"
CONTAINER = "demo"
# SECURITY NOTE(review): the SAS token is hard-coded in the notebook source even
# though the comment above says to "reference the secret". Prefer
# dbutils.secrets.get(scope=..., key=...) so the token never lands in exported
# notebooks or version control. Left in place here to avoid changing behavior.
SASTOKEN = "?sv=2019-02-02&ss=b&srt=co&sp=rl&se=2031-05-13T02:07:21Z&st=2020-03-11T19:07:21Z&spr=https&sig=3q2qpa3ZHf2TVXki9nAjmVnFvykm7u1Y%2FMa8vx25uUg%3D"

# Do not change these values
# SOURCE is the WASB URL of the container; URI is the Hadoop configuration key
# under which the SAS token for that container must be supplied.
SOURCE = "wasbs://{container}@{storage_acct}.blob.core.windows.net/".format(container=CONTAINER, storage_acct=STORAGE_ACCOUNT)
URI = "fs.azure.sas.{container}.{storage_acct}.blob.core.windows.net".format(container=CONTAINER, storage_acct=STORAGE_ACCOUNT)

try:
  dbutils.fs.mount(
    source=SOURCE,
    mount_point=MOUNTPOINT,
    extra_configs={URI: SASTOKEN})
except Exception as e:
  # dbutils.fs.mount raises if the mount point is already in use; that case
  # is benign for this notebook, so swallow only that specific message.
  if "Directory already mounted" in str(e):
    pass  # Ignore error if already mounted.
  else:
    raise  # Bare raise preserves the original traceback (was `raise e`).
print("Success.")
Success.
%fs
ls /mnt/databricks/demo/CarClassification
path                                                                     name                       size
dbfs:/mnt/databricks/demo/CarClassification/bmw10_release/               bmw10_release/             0
dbfs:/mnt/databricks/demo/CarClassification/cars_test/                   cars_test/                 0
dbfs:/mnt/databricks/demo/CarClassification/cars_train/                  cars_train/                0
dbfs:/mnt/databricks/demo/CarClassification/devkit/                      devkit/                    0
dbfs:/mnt/databricks/demo/CarClassification/output/                      output/                    0
dbfs:/mnt/databricks/demo/CarClassification/resnet152_weights_tf.h5     resnet152_weights_tf.h5    243179624
dbfs:/mnt/databricks/demo/CarClassification/tables/                      tables/                    0
dbfs:/mnt/databricks/demo/CarClassification/test_cars.h5                 test_cars.h5               1008807650
dbfs:/mnt/databricks/demo/CarClassification/train_cars.h5                train_cars.h5              1020543546
dbfs:/mnt/databricks/demo/CarClassification/weights.best.hdf5            weights.best.hdf5          472412592

Descriptions of the files are as follows:

  • cars_meta.mat

    Contains a cell array of class names, one for each class.

  • cars_train_annos.mat

    Contains the variable 'annotations', which is a struct array of length num_images and where each element has the fields

    bbox_x1: Min x-value of the bounding box, in pixels
    bbox_x2: Max x-value of the bounding box, in pixels
    bbox_y1: Min y-value of the bounding box, in pixels
    bbox_y2: Max y-value of the bounding box, in pixels
    class: Integral id of the class the image belongs to.
    fname: Filename of the image within the folder of images.
    
  • cars_test_annos.mat:

    Same format as cars_train_annos.mat, except the class is not provided.

Loading Python dependencies

  • opencv-python-headless
  • koalas
import scipy.io 
import numpy as np
import h5py
import cv2 # install opencv-python-headless

Defining file locations

# Dataset root as seen through the DBFS FUSE mount, and a scratch directory
# on the driver's local disk for the HDF5 files being written.
LOCAL_SOURCE = "/dbfs/mnt/databricks/demo/CarClassification"
OUTPUT_PATH = "/tmp"

This can take a couple of minutes! You may want to skip this step!
Write compressed image files to HDF5

The HDF5 files already exist under: /dbfs/mnt/databricks/demo/CarClassification

# Target geometry for every stored image (224x224 BGR, the usual ResNet input).
HEIGHT = 224
WIDTH = 224
CHANNELS = 3
SHAPE = (HEIGHT, WIDTH, CHANNELS)

# One config per split: output HDF5 filename, image sub-folder, annotation .mat file.
h5_train_proc = { "fname": "train_cars.h5", "imgDir": "cars_train", "matFile": "cars_train_annos.mat"}
h5_test_proc = { "fname": "test_cars.h5", "imgDir": "cars_test", "matFile": "cars_test_annos_withlabels.mat"}

for test_train in [h5_train_proc, h5_test_proc]:

  # 'annotations' is a 1xN MATLAB struct array whose elements hold, in order:
  # bbox_x1, bbox_x2, bbox_y1, bbox_y2, class, fname (see file descriptions above).
  cars_annos = scipy.io.loadmat( LOCAL_SOURCE + '/devkit/{}'.format(test_train.get('matFile')) )
  annotations = cars_annos['annotations'][0]
  NUM_IMAGES = len(annotations)

  # Accumulate scalar fields in Python lists and convert once at the end:
  # np.append inside the loop reallocates the whole array each time (O(n^2)).
  bbox_x1, bbox_x2, bbox_y1, bbox_y2, classes, fnames = [], [], [], [], [], []

  # Allocate the image buffer as uint8 up front: pixel values are 0-255 and the
  # HDF5 dataset is uint8 anyway, so a float64 buffer would waste 8x the memory.
  car_image = np.zeros((NUM_IMAGES, HEIGHT, WIDTH, CHANNELS), dtype=np.uint8)

  # Iterating through the annotations and loading images and labels
  for i, car in enumerate(annotations):
    bbox_x1.append(car[0][0].item())
    bbox_x2.append(car[1][0].item())
    bbox_y1.append(car[2][0].item())
    bbox_y2.append(car[3][0].item())
    classes.append(car[4][0].item())
    fname = car[5][0].item()
    fnames.append(fname)

    image_location = LOCAL_SOURCE + '/{}/{}'.format(test_train.get('imgDir'), fname)
    image_tmp = cv2.imread(image_location)
    if image_tmp is None:
      # cv2.imread signals failure by returning None (no exception); fail fast
      # with the offending path instead of crashing later inside cv2.resize.
      raise FileNotFoundError('Could not read image: {}'.format(image_location))
    car_image[i] = cv2.resize(image_tmp, (WIDTH, HEIGHT), interpolation=cv2.INTER_CUBIC)

  # Materialize the per-field arrays with the same names as before so any
  # later notebook cells that reference them keep working.
  car_bbox_x1 = np.array(bbox_x1)
  car_bbox_x2 = np.array(bbox_x2)
  car_bbox_y1 = np.array(bbox_y1)
  car_bbox_y2 = np.array(bbox_y2)
  car_class = np.array(classes)
  car_fname = np.array(fnames)

  # Writing into HDF5 files: images under 'dataset_x', class labels under
  # 'dataset_y'. The 196 class ids fit in uint8; gzip level 9 trades CPU for size.
  with h5py.File('{}/{}'.format(OUTPUT_PATH, test_train.get('fname')), 'w') as hf:
    Xset = hf.create_dataset(name='dataset_x',
      data=car_image,
      shape=(NUM_IMAGES, HEIGHT, WIDTH, CHANNELS),
      maxshape=(NUM_IMAGES, HEIGHT, WIDTH, CHANNELS),
      dtype=np.uint8,
      compression="gzip",
      compression_opts=9)
    yset = hf.create_dataset(name='dataset_y',
      data=car_class,
      shape=(NUM_IMAGES,),
      maxshape=(NUM_IMAGES,),
      dtype=np.uint8,
      compression="gzip",
      compression_opts=9)

© 2020 Databricks, Inc. All rights reserved.
Apache, Apache Spark, Spark and the Spark logo are trademarks of the Apache Software Foundation.

Privacy Policy | Terms of Use | Support