Supplementary Code

Below are various modules that we implemented and used to clean, organize, and process the data.

'''
Resize every .jpg image in each subdirectory of the images directory so that
its smaller side is 224 pixels (aspect ratio preserved; files are overwritten
in place)

USAGE:

python resize_smaller_side.py
'''
from glob import glob
from PIL import Image
import os, os.path
import time
import math

if __name__ == '__main__':
    # Rescale every images/<subdir>/*.jpg in place so that its smaller side is
    # 224 pixels, keeping the aspect ratio, then report the elapsed time.
    start = time.time()
    for filename in glob('images/*/*.jpg'):
        try:
            # Close the source handle before the same path is overwritten.
            with Image.open(filename) as im:
                w, h = im.size
                if w < h:
                    new_size = (224, int(224 * h / w))
                else:
                    new_size = (int(224 * w / h), 224)
                resized_im = im.resize(new_size)
            resized_im.save(filename)
        except Exception as err:
            # Best-effort: report the failure and continue with the next file.
            # `Exception` (not a bare except) lets Ctrl-C stop the run.
            print("could not resize " + filename)
            print("reason: {}".format(err))
    end = time.time()
    print("time elapsed: {} mins {} seconds".format(math.floor((end-start)/60),(end-start) % 60))

'''
-Resize smaller side of all images in all directories in the image directory
-Take a random crop of images in the training set
-Take center crop of images in the test set
(result is all images are 224 x 224)
-take 10 random samples from test and train set to ensure sizes are correct
-show 4 random images for visual verification

USAGE:
python preprocessing.py
'''
from glob import glob
from PIL import Image
import os, os.path
import time
import math
import numpy as np
import pandas as pd
import random

if __name__ == '__main__':
    # Pipeline:
    #   1. resize every listed image so its smaller side is 224 px
    #   2. random 224x224 crop for training images, center crop for test images
    #   3. spot-check 10 random images per split and display the last two of each
    # Images are overwritten in place; an image that cannot be processed is
    # deleted from disk (its row stays in the CSV).
    start = time.time()
    train_df = pd.read_csv("raw_train_df.csv")
    test_df = pd.read_csv("raw_test_df.csv")

    for path in train_df['scaled_path']:
        try:
            # Close the source handle before the same path is overwritten.
            with Image.open(path) as im:
                w, h = im.size
                if w < h:
                    resized_im = im.resize((224, int(224 * h / w)))
                else:
                    resized_im = im.resize((int(224 * w / h), 224))
            w, h = resized_im.size
            top, left = random.randint(0, h - 224), random.randint(0, w - 224)
            cropped_img = resized_im.crop((left, top, left + 224, top + 224))
            cropped_img.save(path)
        except Exception as err:
            # `Exception` (not a bare except) lets Ctrl-C stop the run.
            print("could not resize/random crop" + path)
            print("reason: {}".format(err))
            print("removing " + path + " from dataset")
            # The file may be missing altogether; do not let cleanup crash the run.
            if os.path.exists(path):
                os.remove(path)

    for path in test_df['scaled_path']:
        try:
            with Image.open(path) as im:
                w, h = im.size
                if w < h:
                    resized_im = im.resize((224, int(224 * h / w)))
                else:
                    resized_im = im.resize((int(224 * w / h), 224))
            w, h = resized_im.size
            # Integer offsets give an exact 224x224 box without relying on how
            # fractional crop coordinates are rounded.
            left, top = (w - 224) // 2, (h - 224) // 2
            cropped_img = resized_im.crop((left, top, left + 224, top + 224))
            cropped_img.save(path)
        except Exception as err:
            print("could not resize/center crop" + path)
            print("reason: {}".format(err))
            print("removing " + path + " from dataset")
            if os.path.exists(path):
                os.remove(path)

    # TESTING: sample 10 images from each split, confirm each is 224 x 224 and
    # show the last two of each sample (4 images in total) for a visual check.
    train_samples = train_df.sample(n=10)
    # Sample the test frame here (the train frame was previously sampled twice,
    # so the test images were never checked).
    test_samples = test_df.sample(n=10)
    for label, samples in (("train_df", train_samples), ("test_df", test_samples)):
        for i, path in enumerate(samples['scaled_path']):
            with Image.open(path) as im:
                if im.size != (224, 224):
                    print("ERROR: sampled " + label + " image with incorrect size: " + path)
                if i >= 8:
                    im.show()

    end = time.time()
    print("time elapsed: {} mins {} seconds".format(math.floor((end-start)/60),(end-start) % 60))

'''
-Take a subset of the testing and training data
-Create directories to house the data as keras expects it

Structure:

-train
    -1
        -.jpg
        -.jpg
        . . .

USAGE:
python reorganize.py frac_of_data_as_decimal
'''
from glob import glob
from PIL import Image
import os, os.path, sys
import time
import math
import numpy as np
import pandas as pd
import random
import shutil


#

if __name__ == '__main__':
    # Copy a random fraction of the train and test images into
    # train/<class>/ and test/<class>/ directories.
    if len(sys.argv) < 2:
        print('usage: python reorganize.py frac_of_data_as_decimal')
        sys.exit(1)
    try:
        frac = float(sys.argv[1])
    except ValueError:
        print('frac must be a decimal')
        sys.exit(1)

    start = time.time()
    train_df = pd.read_csv("raw_train_df.csv")
    test_df = pd.read_csv("raw_test_df.csv")
    train_df_samples = train_df.sample(frac=frac)
    test_df_samples = test_df.sample(frac=frac)

    for split_dir, samples in (('train', train_df_samples), ('test', test_df_samples)):
        os.makedirs(split_dir, exist_ok=True)
        for index, row in samples.iterrows():
            # Read the path outside the try so the handler can always report it.
            path = row['scaled_path']
            try:
                name = (path.split('/'))[-1]
                class_dir = os.path.join(split_dir, str(int(row['class'])))
                # exist_ok avoids the check-then-create race of exists() + mkdir().
                os.makedirs(class_dir, exist_ok=True)
                shutil.copy(path, os.path.join(class_dir, name))
            except Exception as err:
                # Best-effort: report and continue with the next row.
                # `Exception` (not a bare except) lets Ctrl-C stop the run.
                print("could not copy " + str(path))
                print("reason: {}".format(err))

    end = time.time()
    print("time elapsed: {} mins {} seconds".format(math.floor((end-start)/60),(end-start) % 60))

import random
random.seed(112358)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import keras
from keras.models import Sequential
from keras.layers import Dense

import seaborn as sns

from keras import regularizers
import scipy.io
from scipy import stats
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

###
# 224 x 224 array of uniform noise; not referenced again in this listing.
random_image = np.random.rand(224, 224)

# Take row 4340 of the flattened training data and view it as a
# 224 x 224 x 3 image.
# NOTE(review): X_train_cropped_flattened is not defined in this listing;
# presumably it comes from the notebook session -- confirm.
flat_puppy = X_train_cropped_flattened[4340]
puppy_array = flat_puppy.reshape(224, 224, 3)

# Display the sample image.
puppy_preview = array_to_img(puppy_array)
plt.imshow(puppy_preview)

###
def cutout_image(image, size=20):
    """Return a copy of `image` with one randomly placed square patch zeroed.

    The patch position is drawn with `random.randint`, row first and then
    column, so that the whole patch lies inside the image. The input array
    is not modified.

    Args:
        image: array whose first two axes are rows and columns (any trailing
            axes, e.g. color channels, are zeroed along with the patch).
        size: side length of the patch in pixels (default 20). A patch
            larger than the image is clipped to the image size.

    Returns:
        A new array of the same shape as `image` with the patch set to 0.

    Raises:
        ValueError: if `size` is negative.
    """
    if size < 0:
        raise ValueError("size must be non-negative")
    height, width = image.shape[:2]
    # Clip the patch so it always fits, whatever the image dimensions are.
    patch_h = min(size, height)
    patch_w = min(size, width)
    starting_row = random.randint(0, height - patch_h)
    starting_col = random.randint(0, width - patch_w)
    image_copy = image.copy()
    # One slice assignment instead of a per-pixel Python loop.
    image_copy[starting_row:starting_row + patch_h,
               starting_col:starting_col + patch_w] = 0
    return image_copy

###
# Preview the cutout augmentation on the sample image.
new_image = cutout_image(puppy_array)
plt.imshow(array_to_img(new_image))

###
# Augmentation pipeline: cutout, random horizontal flips, channel shifts.
# The preprocessing hook must name the function that is actually defined
# (`cutout_image`); `cutout_function` does not exist and raised NameError.
train_datagen = ImageDataGenerator(
    preprocessing_function=cutout_image,
    horizontal_flip=True,
    channel_shift_range=20)

###
# Add a leading batch axis so the single image can be fed to flow().
new_puppy_array = puppy_array.reshape((1,) + puppy_array.shape)
# NOTE(review): the "data_aug" directory presumably has to exist before
# images are written to it -- confirm.
data_aug_iterator = train_datagen.flow(new_puppy_array, save_to_dir="data_aug")

###
# The iterator does not stop by itself here, so break out explicitly.
# As written, 21 batches are drawn before the loop exits.
i = 0
for y in data_aug_iterator:
    i += 1
    if i > 20:
        break

'''
Usage: Run "   %run -i 'metrics_script.py'  " in a cell in the ipynb.
Then you can call run_metrics on y_pred, y_pmf (the softmax probabilities),
y_true, model_name (a string describing the model type), and time_to_classify
in the ipynb directly.
See Baseline Models.ipynb for an example.

'''
import matplotlib.pyplot as plt
import random
import pandas as pd
import numpy as np
import itertools
import time

from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
from sklearn.metrics import confusion_matrix

def run_metrics(y_pred, y_pmf, y_true, model_name, time_to_classify=None):
    """Print loss, accuracy and optional runtime, then plot confusion matrices.

    Two confusion-matrix figures are drawn with `plot_confusion_matrix`:
    raw counts first, then the row-normalized version.

    NOTE: `super_classes` (the tick labels) is not defined in this script;
    it must already exist in the calling namespace, which is why the usage
    notes run the script with `%run -i`.

    Args:
        y_pred: predicted class labels, passed to `accuracy_score` and
            `confusion_matrix`.
        y_pmf: predicted class probabilities (the softmax output), passed to
            `log_loss`.
        y_true: true class labels.
        model_name: string used as the prefix of both plot titles.
        time_to_classify: optional runtime to print; skipped when None.

    Returns:
        Tuple `(loss, accuracy)`.
    """
    loss = log_loss(y_true, y_pmf)
    accuracy = accuracy_score(y_true, y_pred)
    cnf_matrix = confusion_matrix(y_true, y_pred)
    print("Categorical Loss: {}".format(loss))
    print("Accuracy Score: {}".format(accuracy))
    # Compare against None so a measured runtime of 0 is still reported.
    if time_to_classify is not None:
        print("Runtime: {}".format(time_to_classify))
    plot_confusion_matrix(cnf_matrix, super_classes, model_name)
    plot_confusion_matrix(cnf_matrix, super_classes, model_name, True)
    return loss, accuracy
    
    
    
def plot_confusion_matrix(cm, classes, title, normalize=False, cmap=plt.cm.Blues):
    """
    Plot a confusion matrix as an annotated heat map on a new figure.

    Normalization can be applied by setting `normalize=True`; each row is
    then divided by its sum. A row whose sum is zero is left as all zeros
    instead of producing NaN cells.

    Args:
        cm: 2-D confusion matrix (rows are drawn on the "True label" axis,
            columns on the "Predicted label" axis).
        classes: tick labels, used for both axes.
        title: title prefix; " Confusion Matrix (Normalized)" or
            " Confusion Matrix (Not Normalized)" is appended.
        normalize: if True, show row-normalized values with two decimals;
            otherwise show the values formatted as integers.
        cmap: matplotlib colormap for the heat map.
    """
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; the other cells keep
        # the 0.0 that `out` was initialised with.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        title += " Confusion Matrix (Normalized)"
    else:
        title += " Confusion Matrix (Not Normalized)"
    plt.figure(figsize=(8,5))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half the maximum get white text so it stays readable on
    # the darker colors.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    

Supplementary Code

Contents