Skip to content
Snippets Groups Projects
Commit 0fc3863e authored by Djordje Mihajlovic's avatar Djordje Mihajlovic
Browse files

initial comments + func to vary dataset size

parent ea27c2ad
No related branches found
No related tags found
No related merge requests found
......@@ -31,6 +31,10 @@ def load_dataset(dirname, knot, net, dtype, Nbeads, pers_len, label):
# Loading the dataset file
dataset = tf.data.experimental.CsvDataset(os.path.join(dirname,fname), type_list, header=header, field_delim=" ", select_cols=select_cols)
# Could we not use tf.data.experimental.make_csv_dataset
# and then set the batch size there? For example:
# tf.data.experimental.make_csv_dataset(os.path.join(dirname, fname), batch_size = 1000, header = header, select_columns = select_cols, field_delim= " ")
# Reshape the incoming data
dataset = dataset.batch(Nbeads)
......@@ -60,6 +64,9 @@ def load_dataset(dirname, knot, net, dtype, Nbeads, pers_len, label):
# Create labelled classification database
dataset = dataset.map(lambda x: (x, label))
dataset = dataset.take(100) # <-set amounts of dataset to analyze
# TODO: test with a deliberately low value to see if this increases speed -- will test manually first.
return dataset
......
......@@ -20,10 +20,13 @@ from models import build_model
print("Num GPUs Available: ", len(tf.config.list_physical_devices("GPU")), tf.config.list_physical_devices("GPU"))
physical_devices = tf.config.list_physical_devices("GPU")
if len(physical_devices) > 0:
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
# Ensures efficient GPU utilization
physical_devices = tf.config.list_physical_devices("GPU") # Lists all GPU's available
if len(physical_devices) > 0:
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True) # If number of GPU's is >0 run memory growth accordingly (doesn't just allocate all GPU memory at once)
# CPU usage distribution (tries to run 4 tasks concurrently)
tf.config.threading.set_inter_op_parallelism_threads(4)
tf.config.threading.set_intra_op_parallelism_threads(4)
......@@ -70,7 +73,7 @@ def main():
if net != "randFOR":
# HyperBand algorithm from keras tuner
# HyperBand algorithm from keras tuner (automated hyperparameter tuner in ML)
tuner1 = kt.Hyperband(
build_model(in_layer, len(knots), "relu", norm),
objective='val_accuracy',
......@@ -80,7 +83,7 @@ def main():
project_name=f"{dtype}_{prob}_Adj_{adj}_Norm_{norm}_Net_{net}_Nbeads_{Nbeads}_BatchSize_{bs}_LenDB_{len_db}_PersLen{pers_len}"
)
# Bayesian Optimisation algorithm from keras tuner
# Bayesian Optimisation algorithm from keras tuner (second tuner, again an automated hyperparameter tuner)
tuner2 = kt.BayesianOptimization(
build_model(in_layer, len(knots), "relu", norm),
objective='val_accuracy',
......@@ -121,10 +124,10 @@ def train(model, train_dataset, val_dataset, bs):
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)
# Early Stopping Callback
# Early Stopping Callback (prevents overfitting)
es = tf.keras.callbacks.EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=10, restore_best_weights=True, min_delta=0.001)
# Finding best NN model weights and saving them during training process
# Finding best NN model weights and saving them during training process (useful for model evaluation)
mc = tf.keras.callbacks.ModelCheckpoint(
filepath=checkpoint_filepath, save_weights_only=False, monitor="val_loss", mode="min", save_best_only=True
)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment