load-images-in-hdf5(Python)
# Mount an Azure Blob Storage container onto DBFS using a SAS token.
# Add the Storage Account, Container, and reference the secret to pass the SAS Token
MOUNTPOINT = "/mnt/databricks/demo"
STORAGE_ACCOUNT = "emeashare"
CONTAINER = "demo"
# SECURITY NOTE(review): the SAS token is hard-coded in the notebook source even
# though the comment above says to "reference the secret". Prefer
# dbutils.secrets.get(scope=..., key=...) so the token never lands in exported
# notebooks or version control. Left in place here to avoid changing behavior.
SASTOKEN = "?sv=2019-02-02&ss=b&srt=co&sp=rl&se=2031-05-13T02:07:21Z&st=2020-03-11T19:07:21Z&spr=https&sig=3q2qpa3ZHf2TVXki9nAjmVnFvykm7u1Y%2FMa8vx25uUg%3D"

# Do not change these values
# SOURCE is the WASB URL of the container; URI is the Hadoop configuration key
# under which the SAS token for that container must be supplied.
SOURCE = "wasbs://{container}@{storage_acct}.blob.core.windows.net/".format(container=CONTAINER, storage_acct=STORAGE_ACCOUNT)
URI = "fs.azure.sas.{container}.{storage_acct}.blob.core.windows.net".format(container=CONTAINER, storage_acct=STORAGE_ACCOUNT)

try:
  dbutils.fs.mount(
    source=SOURCE,
    mount_point=MOUNTPOINT,
    extra_configs={URI: SASTOKEN})
except Exception as e:
  # dbutils.fs.mount raises if the mount point is already in use; that case
  # is benign for this notebook, so swallow only that specific message.
  if "Directory already mounted" in str(e):
    pass  # Ignore error if already mounted.
  else:
    raise  # Bare raise preserves the original traceback (was `raise e`).
print("Success.")
Success.
%fs
ls /mnt/databricks/demo/CarClassification
path                                                                     name                       size
dbfs:/mnt/databricks/demo/CarClassification/bmw10_release/               bmw10_release/             0
dbfs:/mnt/databricks/demo/CarClassification/cars_test/                   cars_test/                 0
dbfs:/mnt/databricks/demo/CarClassification/cars_train/                  cars_train/                0
dbfs:/mnt/databricks/demo/CarClassification/devkit/                      devkit/                    0
dbfs:/mnt/databricks/demo/CarClassification/output/                      output/                    0
dbfs:/mnt/databricks/demo/CarClassification/resnet152_weights_tf.h5     resnet152_weights_tf.h5    243179624
dbfs:/mnt/databricks/demo/CarClassification/tables/                      tables/                    0
dbfs:/mnt/databricks/demo/CarClassification/test_cars.h5                 test_cars.h5               1008807650
dbfs:/mnt/databricks/demo/CarClassification/train_cars.h5                train_cars.h5              1020543546
dbfs:/mnt/databricks/demo/CarClassification/weights.best.hdf5            weights.best.hdf5          472412592

Descriptions of the files are as follows:

  • cars_meta.mat

    Contains a cell array of class names, one for each class.

  • cars_train_annos.mat

    Contains the variable 'annotations', which is a struct array of length num_images and where each element has the fields

    bbox_x1: Min x-value of the bounding box, in pixels
    bbox_x2: Max x-value of the bounding box, in pixels
    bbox_y1: Min y-value of the bounding box, in pixels
    bbox_y2: Max y-value of the bounding box, in pixels
    class: Integral id of the class the image belongs to.
    fname: Filename of the image within the folder of images.
    
  • cars_test_annos.mat:

    Same format as cars_train_annos.mat, except the class is not provided.

Loading Python dependencies

  • opencv-python-headless
  • koalas
import scipy.io 
import numpy as np
import h5py
import cv2 # install opencv-python-headless

Defining file locations

# Dataset root as seen through the DBFS FUSE mount, and a scratch directory
# on the driver's local disk for the HDF5 files being written.
LOCAL_SOURCE = "/dbfs/mnt/databricks/demo/CarClassification"
OUTPUT_PATH = "/tmp"

This can take a couple of minutes! You may want to skip this step!
Write compressed image files to HDF5

The HDF5 files already exist under: /dbfs/mnt/databricks/demo/CarClassification

# Target geometry for every stored image (224x224 BGR, the usual ResNet input).
HEIGHT = 224
WIDTH = 224
CHANNELS = 3
SHAPE = (HEIGHT, WIDTH, CHANNELS)

# One config per split: output HDF5 filename, image sub-folder, annotation .mat file.
h5_train_proc = { "fname": "train_cars.h5", "imgDir": "cars_train", "matFile": "cars_train_annos.mat"}
h5_test_proc = { "fname": "test_cars.h5", "imgDir": "cars_test", "matFile": "cars_test_annos_withlabels.mat"}

for test_train in [h5_train_proc, h5_test_proc]:

  # 'annotations' is a 1xN MATLAB struct array whose elements hold, in order:
  # bbox_x1, bbox_x2, bbox_y1, bbox_y2, class, fname (see file descriptions above).
  cars_annos = scipy.io.loadmat( LOCAL_SOURCE + '/devkit/{}'.format(test_train.get('matFile')) )
  annotations = cars_annos['annotations'][0]
  NUM_IMAGES = len(annotations)

  # Accumulate scalar fields in Python lists and convert once at the end:
  # np.append inside the loop reallocates the whole array each time (O(n^2)).
  bbox_x1, bbox_x2, bbox_y1, bbox_y2, classes, fnames = [], [], [], [], [], []

  # Allocate the image buffer as uint8 up front: pixel values are 0-255 and the
  # HDF5 dataset is uint8 anyway, so a float64 buffer would waste 8x the memory.
  car_image = np.zeros((NUM_IMAGES, HEIGHT, WIDTH, CHANNELS), dtype=np.uint8)

  # Iterating through the annotations and loading images and labels
  for i, car in enumerate(annotations):
    bbox_x1.append(car[0][0].item())
    bbox_x2.append(car[1][0].item())
    bbox_y1.append(car[2][0].item())
    bbox_y2.append(car[3][0].item())
    classes.append(car[4][0].item())
    fname = car[5][0].item()
    fnames.append(fname)

    image_location = LOCAL_SOURCE + '/{}/{}'.format(test_train.get('imgDir'), fname)
    image_tmp = cv2.imread(image_location)
    if image_tmp is None:
      # cv2.imread signals failure by returning None (no exception); fail fast
      # with the offending path instead of crashing later inside cv2.resize.
      raise FileNotFoundError('Could not read image: {}'.format(image_location))
    car_image[i] = cv2.resize(image_tmp, (WIDTH, HEIGHT), interpolation=cv2.INTER_CUBIC)

  # Materialize the per-field arrays with the same names as before so any
  # later notebook cells that reference them keep working.
  car_bbox_x1 = np.array(bbox_x1)
  car_bbox_x2 = np.array(bbox_x2)
  car_bbox_y1 = np.array(bbox_y1)
  car_bbox_y2 = np.array(bbox_y2)
  car_class = np.array(classes)
  car_fname = np.array(fnames)

  # Writing into HDF5 files: images under 'dataset_x', class labels under
  # 'dataset_y'. The 196 class ids fit in uint8; gzip level 9 trades CPU for size.
  with h5py.File('{}/{}'.format(OUTPUT_PATH, test_train.get('fname')), 'w') as hf:
    Xset = hf.create_dataset(name='dataset_x',
      data=car_image,
      shape=(NUM_IMAGES, HEIGHT, WIDTH, CHANNELS),
      maxshape=(NUM_IMAGES, HEIGHT, WIDTH, CHANNELS),
      dtype=np.uint8,
      compression="gzip",
      compression_opts=9)
    yset = hf.create_dataset(name='dataset_y',
      data=car_class,
      shape=(NUM_IMAGES,),
      maxshape=(NUM_IMAGES,),
      dtype=np.uint8,
      compression="gzip",
      compression_opts=9)

© 2020 Databricks, Inc. All rights reserved.
Apache, Apache Spark, Spark and the Spark logo are trademarks of the Apache Software Foundation.

Privacy Policy | Terms of Use | Support