Below are the modules we implemented and used to clean, organize, and process the data.
'''
Resize the smaller side of every image in every subdirectory of the images
directory to 224 pixels, preserving aspect ratio.
USAGE:
python resize_smaller_side.py
'''
from glob import glob
from PIL import Image
import os, os.path
import time
import math
if __name__ == '__main__':
    start = time.time()
    for filename in glob('images/*/*.jpg'):
        try:
            im = Image.open(filename)
            w, h = im.size
            # Scale so the smaller side becomes 224, preserving aspect ratio
            if w < h:
                resized_im = im.resize((224, int(224 * h / w)))
            else:
                resized_im = im.resize((int(224 * w / h), 224))
            resized_im.save(filename)
        except Exception:
            print("could not resize " + filename)
    end = time.time()
    print("time elapsed: {} mins {:.1f} seconds".format(math.floor((end - start) / 60), (end - start) % 60))
'''
-Resize the smaller side of all images in all directories in the image directory
-Take a random crop of each image in the training set
-Take a center crop of each image in the test set
 (the result is that all images are 224 x 224)
-Take 10 random samples from the train and test sets to ensure sizes are correct
-Show 4 random images for visual verification
USAGE:
python preprocessing.py
'''
from glob import glob
from PIL import Image
import os, os.path
import time
import math
import numpy as np
import pandas as pd
import random
if __name__ == '__main__':
    start = time.time()
    train_df = pd.read_csv("raw_train_df.csv")
    test_df = pd.read_csv("raw_test_df.csv")
    # Training images: resize the smaller side to 224, then take a random 224 x 224 crop
    for path in train_df['scaled_path']:
        try:
            im = Image.open(path)
            w, h = im.size
            if w < h:
                resized_im = im.resize((224, int(224 * h / w)))
            else:
                resized_im = im.resize((int(224 * w / h), 224))
            w, h = resized_im.size
            top, left = random.randint(0, h - 224), random.randint(0, w - 224)
            bottom, right = top + 224, left + 224
            cropped_img = resized_im.crop((left, top, right, bottom))
            cropped_img.save(path)
        except Exception:
            print("could not resize/random crop " + path)
            print("removing " + path + " from dataset")
            os.remove(path)
    # Test images: resize the smaller side to 224, then take the center 224 x 224 crop
    for path in test_df['scaled_path']:
        try:
            im = Image.open(path)
            w, h = im.size
            if w < h:
                resized_im = im.resize((224, int(224 * h / w)))
            else:
                resized_im = im.resize((int(224 * w / h), 224))
            w, h = resized_im.size
            cropped_img = resized_im.crop((w // 2 - 112, h // 2 - 112, w // 2 + 112, h // 2 + 112))
            cropped_img.save(path)
        except Exception:
            print("could not resize/center crop " + path)
            print("removing " + path + " from dataset")
            os.remove(path)
    """TESTING"""
    train_samples = train_df.sample(n=10)
    test_samples = test_df.sample(n=10)
    i = 0
    for path in train_samples['scaled_path']:
        train_w, train_h = Image.open(path).size
        if train_w != 224 or train_h != 224:
            print("ERROR: sampled train_df image with incorrect size: " + path)
        if i >= 8:
            Image.open(path).show()
        i += 1
    i = 0
    for path in test_samples['scaled_path']:
        test_w, test_h = Image.open(path).size
        if test_w != 224 or test_h != 224:
            print("ERROR: sampled test_df image with incorrect size: " + path)
        if i >= 8:
            Image.open(path).show()
        i += 1
    end = time.time()
    print("time elapsed: {} mins {:.1f} seconds".format(math.floor((end - start) / 60), (end - start) % 60))
'''
-Take a subset of the testing and training data
-Create directories to house the data as Keras expects it
Structure:
    -train
        -1
            -<image>.jpg
            -<image>.jpg
            . . .
USAGE:
python reorganize.py frac_of_data_as_decimal
'''
from glob import glob
from PIL import Image
import os, os.path, sys
import time
import math
import numpy as np
import pandas as pd
import random
import shutil
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('usage: python reorganize.py frac_of_data_as_decimal')
        sys.exit(1)
    try:
        frac = float(sys.argv[1])
    except ValueError:
        print('frac must be a decimal')
        sys.exit(1)
    start = time.time()
    train_df = pd.read_csv("raw_train_df.csv")
    test_df = pd.read_csv("raw_test_df.csv")
    train_df_samples = train_df.sample(frac=frac)
    test_df_samples = test_df.sample(frac=frac)
    if not os.path.exists('train'):
        os.mkdir('train')
    if not os.path.exists('test'):
        os.mkdir('test')
    # Copy each sampled image into <split>/<class>/<filename>, one subdirectory per class
    for index, row in train_df_samples.iterrows():
        path = row['scaled_path']
        try:
            name = path.split('/')[-1]
            super_breed = row['class']
            if not os.path.exists('train/' + str(int(super_breed))):
                os.mkdir('train/' + str(int(super_breed)))
            shutil.copy(path, 'train/' + str(int(super_breed)) + '/' + name)
        except Exception:
            print("could not copy " + path)
    for index, row in test_df_samples.iterrows():
        path = row['scaled_path']
        try:
            name = path.split('/')[-1]
            super_breed = row['class']
            if not os.path.exists('test/' + str(int(super_breed))):
                os.mkdir('test/' + str(int(super_breed)))
            shutil.copy(path, 'test/' + str(int(super_breed)) + '/' + name)
        except Exception:
            print("could not copy " + path)
    end = time.time()
    print("time elapsed: {} mins {:.1f} seconds".format(math.floor((end - start) / 60), (end - start) % 60))
import random
random.seed(112358)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras
from keras.models import Sequential
from keras.layers import Dense
import seaborn as sns
from keras import regularizers
import scipy.io
from scipy import stats
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
###
random_image = np.random.rand(224, 224)
# X_train_cropped_flattened comes from earlier notebook cells; index 4340 is one sample image
puppy_array = X_train_cropped_flattened[4340].reshape(224, 224, 3)
plt.imshow(array_to_img(puppy_array))
###
def cutout_image(image):
    """Zero out a random 20 x 20 patch of the image (cutout augmentation)."""
    image_copy = image.copy()
    starting_row = random.randint(0, 204)  # 224 - 20 = 204 keeps the patch inside the image
    starting_col = random.randint(0, 204)
    for i in range(20):
        for j in range(20):
            image_copy[starting_row + i, starting_col + j] = 0
    return image_copy
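The nested loops above can be collapsed into a single NumPy slice assignment. A sketch of an equivalent vectorized version (the name cutout_image_vectorized and the patch parameter are ours, for illustration; the input is assumed to be an H x W x C array):
import random

def cutout_image_vectorized(image, patch=20):
    """Equivalent to cutout_image: zero a random patch x patch square."""
    image_copy = image.copy()
    row = random.randint(0, image_copy.shape[0] - patch)
    col = random.randint(0, image_copy.shape[1] - patch)
    image_copy[row:row + patch, col:col + patch] = 0  # one slice instead of 400 element writes
    return image_copy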
###
new_image = cutout_image(puppy_array)
plt.imshow(array_to_img(new_image))
###
train_datagen = ImageDataGenerator(
    preprocessing_function=cutout_image,
    horizontal_flip=True,
    channel_shift_range=20)
###
new_puppy_array = puppy_array.reshape((1,) + puppy_array.shape)  # add a batch dimension: (1, 224, 224, 3)
data_aug_iterator = train_datagen.flow(new_puppy_array, save_to_dir="data_aug")
###
i = 0
for y in data_aug_iterator:
    i += 1
    if i > 20:  # stop after saving ~20 augmented images to data_aug/
        break
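To spot-check the augmentations visually, the files written to data_aug/ can be loaded and displayed; a minimal sketch (assumes the loop above has already populated data_aug/ with Keras's default .png output):
from glob import glob
import matplotlib.pyplot as plt
from keras.preprocessing.image import load_img

# Display up to 4 of the saved augmentations in a 2 x 2 grid
for k, fname in enumerate(sorted(glob('data_aug/*.png'))[:4]):
    plt.subplot(2, 2, k + 1)
    plt.imshow(load_img(fname))
    plt.axis('off')
plt.show()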
'''
Usage: Run "%run -i 'metrics_script.py'" in a cell of the notebook.
Then call run_metrics directly in the notebook with y_pred, y_pmf (the softmax
probabilities), y_true, model_name (a string describing the model type), and
time_to_classify.
See Baseline Models.ipynb for an example.
'''
import matplotlib.pyplot as plt
import random
import pandas as pd
import numpy as np
import itertools
import time
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
from sklearn.metrics import confusion_matrix
def run_metrics(y_pred, y_pmf, y_true, model_name, time_to_classify=None):
    loss = log_loss(y_true, y_pmf)
    accuracy = accuracy_score(y_true, y_pred)
    cnf_matrix = confusion_matrix(y_true, y_pred)
    print("Categorical Loss: {}".format(loss))
    print("Accuracy Score: {}".format(accuracy))
    if time_to_classify:
        print("Runtime: {}".format(time_to_classify))
    # super_classes must be defined in the notebook namespace (the script is run with %run -i)
    plot_confusion_matrix(cnf_matrix, super_classes, model_name)
    plot_confusion_matrix(cnf_matrix, super_classes, model_name, True)
    return loss, accuracy
def plot_confusion_matrix(cm, classes, title, normalize=False, cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        title += " Confusion Matrix (Normalized)"
    else:
        title += " Confusion Matrix (Not Normalized)"
    plt.figure(figsize=(8, 5))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
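A hedged usage sketch of run_metrics as it might appear in the notebook after %run -i 'metrics_script.py' (model, X_test, y_test, and super_classes are assumed to come from earlier notebook cells; the model name is illustrative):
import time
import numpy as np

start = time.time()
y_pmf = model.predict(X_test)       # softmax probabilities; model and X_test from the notebook
y_pred = np.argmax(y_pmf, axis=1)   # hard class predictions
time_to_classify = time.time() - start
loss, accuracy = run_metrics(y_pred, y_pmf, y_test, "Baseline Model", time_to_classify)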