diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index bec8082..f04a29c 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -35,7 +35,7 @@ jobs: curl -L -o models/skinvestigator-sm.tflite https://github.com/Thomasbehan/SkinVestigatorAI/releases/download/0.1.5/skinvestigator-sm.tflite - name: Download some data for testing run: | - python skinvestigatorai/core/data_scraper.py -p 2 + python commands/run_data_scraper.py -p 2 - name: Lint with ruff run: | ruff check diff --git a/.gitignore b/.gitignore index 1f1cacf..3b6ab89 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,5 @@ test # prevent models being commited to github **/*.h5 *.h5 +**/*.tflite +*.tflite diff --git a/README.md b/README.md index b6d7e09..b8c9bbd 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,11 @@ # SkinVestigatorAI ![View SkinVestigatorAI on GitHub](https://img.shields.io/github/stars/Thomasbehan/SkinVestigatorAI?color=232323&label=SkinVestigatorAI&logo=github&labelColor=232323) -![Sensitivity Score](https://img.shields.io/badge/Sensitivity-0.84035-blue) -![Specificity Score](https://img.shields.io/badge/Specificity-0.84019-blue) -![Precision Score](https://img.shields.io/badge/Precision-0.84035-blue) -![F1 Score](https://img.shields.io/badge/F1-0.84467-blue) -![Accuracy Score](https://img.shields.io/badge/Accuracy-0.84035-blue) -![Loss Score](https://img.shields.io/badge/Loss-0.23201-blue) -![AUC Score](https://img.shields.io/badge/AUC-0.91692-blue) +![Precision Score](https://img.shields.io/badge/Precision-0.6753-blue) +![Recall Score](https://img.shields.io/badge/Recall-0.3701-blue) +![Accuracy Score](https://img.shields.io/badge/Accuracy-94.34%25-darkgreen) +![Loss Score](https://img.shields.io/badge/Loss-0.1501-blue) +![AUC Score](https://img.shields.io/badge/AUC-0.9286-darkgreen) ![GitHub license](https://img.shields.io/github/license/Thomasbehan/SkinVestigatorAI) [![Actions 
Status](https://github.com/Thomasbehan/SkinVestigatorAI/workflows/Automated%20Testing/badge.svg)](https://github.com/Thomasbehan/SkinVestigatorAI/actions) [![Actions Status](https://github.com/Thomasbehan/SkinVestigatorAI/workflows/CodeQL/badge.svg)](https://github.com/Thomasbehan/SkinVestigatorAI/actions) @@ -50,7 +48,7 @@ To quickly set up SkinVestigatorAI for development, follow these steps ```bash python -m pip install -e . ``` - + 3. **Run the Application:** Start the application with auto-reloading using: ```bash @@ -70,79 +68,150 @@ python -m pytest ### Running the Linter To run the linter, run the following command: ```bash -python -m ruff --format=github --target-version=py311 . +python -m ruff check +``` + +## Model Downloader + +To download and prepare a specific model for use, you can use the `download_model.py` script located in the `commands` directory. This script accepts the model identifier as an argument. + +### Usage + +Run the following command from the root of the project directory: + +```bash +python commands/download_model.py -m M-0015 ``` +### Available Models +Here is a list of all the available models you can download using the script: + +* M-0003: Simple Testing (Legacy). +* M-0015: Best Model. +* M-0015s: Fastest Model. ## Data -The DataScraper tool within this application is designed to download and preprocess skin lesion images from the ISIC Archive for use in machine learning projects. The images are stored in three separate directories for training, validation, and testing. +The DataScraper tool within this application is designed to download and preprocess skin lesion images from the ISIC Archive for use in machine learning projects. The images +are stored in two separate directories for training and testing, featuring a total of 40,194 images. This substantial dataset aims to provide a comprehensive basis for accurate skin lesion analysis and classification. 
+ The data is organised as follows: -- Train: 5625 benign, 5152 malignant +- Train: 32,155 images +- Test: 8,039 images ### Data Source -The data is fetched from the ISIC Archive using their API. The base URL for the API is https://api.isic-archive.com/api/v2. The code makes use of the /images/ endpoint to fetch a list of images in JSON format. Each image's metadata contains information about the image, including its URL, ISIC ID, and clinical information (benign/malignant). +The dataset used for training the model is sourced from the International Skin Imaging Collaboration (ISIC) Archive. The ISIC Archive is a large-scale resource for skin image analysis, providing open access to a wide variety of images for the development and evaluation of automated diagnostic systems. + +For more information about the ISIC Archive and to access the data, visit [ISIC Archive](https://www.isic-archive.com). ### Data Organization The images are organized into three folders: -1. data/train: Contains 70% of the total images for each batch, which can be used for training a model. -2. data/validation: Contains 20% of the total images for each batch, which can be used for model validation. -3. data/test: Contains the remaining 10% of the total images for each batch, which can be used for model testing. -Each folder is further organized into subfolders, separating the images based on their clinical classification (benign or malignant). +1. `data/train`: Contains 80% of the total images, which are used for training the model. +2. `data/test`: Contains 20% of the total images, used for testing the model's performance during and after training. ## Model -The model is a convolutional neural network (CNN) that uses transfer learning with the Vision Transformer (ViT) model to classify skin lesion images as benign or malignant. The model is trained using the Adam optimizer and the binary cross-entropy loss function. 
-Here is a summary of the model architecture: - - Model: "sequential" - _________________________________________________________________ - Layer (type) Output Shape Param # - ================================================================= - vit-b32 (Functional) (None, 768) 87429888 - - flatten (Flatten) (None, 768) 0 - - batch_normalization (BatchN (None, 768) 3072 - ormalization) - - dense (Dense) (None, 1024) 787456 - - batch_normalization_1 (Batc (None, 1024) 4096 - hNormalization) - - dropout (Dropout) (None, 1024) 0 - - dense_1 (Dense) (None, 2) 2050 - - ================================================================= - Total params: 88,226,562 - Trainable params: 88,222,978 - Non-trainable params: 3,584 - _________________________________________________________________ +The `SkinCancerDetector` model employs a sophisticated deep learning architecture tailored for the accurate classification of skin lesions as benign or malignant. Built on TensorFlow, the model features a sequential arrangement of layers, utilising convolutional neural networks (CNNs) for their powerful image processing capabilities. + +### Architecture Overview +The architecture is meticulously designed to capture the intricate patterns and features of skin lesions through multiple stages of convolutional layers, each followed by max pooling to reduce spatial dimensions and dropout layers to prevent overfitting. The model's core is structured as follows: + +- **Convolutional Layers:** Multiple layers with ReLU activation to extract features from images. +- **Max Pooling Layers:** Applied after convolutional layers to reduce the size of the feature maps, thereby reducing the number of parameters and computation in the network. +- **Dropout Layers:** Used to prevent overfitting by randomly setting a fraction of input units to 0 at each update during training time. 
+- **Dense Layers:** Fully connected layers that learn non-linear combinations of the high-level features extracted by the convolutional layers. +- **Output Layer:** A dense layer with a sigmoid activation function to classify the input image as benign or malignant. + + +```bash + Model: "sequential" + _________________________________________________________________ + Layer (type) Output Shape Param # + ================================================================= + conv2d (Conv2D) (None, 180, 180, 128) 1280 + _________________________________________________________________ + max_pooling2d (MaxPooling2D) (None, 90, 90, 128) 0 + _________________________________________________________________ + dropout (Dropout) (None, 90, 90, 128) 0 + _________________________________________________________________ + conv2d_1 (Conv2D) (None, 90, 90, 256) 295168 + _________________________________________________________________ + max_pooling2d_1 (MaxPooling2 (None, 45, 45, 256) 0 + _________________________________________________________________ + dropout_1 (Dropout) (None, 45, 45, 256) 0 + _________________________________________________________________ + conv2d_2 (Conv2D) (None, 45, 45, 192) 442560 + _________________________________________________________________ + max_pooling2d_2 (MaxPooling2 (None, 22, 22, 192) 0 + _________________________________________________________________ + dropout_2 (Dropout) (None, 22, 22, 192) 0 + _________________________________________________________________ + flatten (Flatten) (None, 92416) 0 + _________________________________________________________________ + dense (Dense) (None, 64) 5914688 + _________________________________________________________________ + dropout_3 (Dropout) (None, 64) 0 + _________________________________________________________________ + dense_1 (Dense) (None, 96) 6240 + _________________________________________________________________ + dropout_4 (Dropout) (None, 96) 0 + 
_________________________________________________________________ + dense_2 (Dense) (None, 1) 97 + ================================================================= + Total params: 6,660,033 + Trainable params: 6,660,033 + Non-trainable params: 0 + _________________________________________________________________ +``` + +### Training and Optimization +The model is compiled with the Adam optimizer and binary cross-entropy loss function, which are well-suited for binary classification tasks. It leverages metrics such as accuracy, precision, recall, and AUC to evaluate performance throughout the training process. + +Training involves the use of a data generator for efficient handling of large image datasets, augmenting the training data to improve generalization. The model also incorporates callbacks for early stopping, learning rate reduction on plateau, and model checkpointing to save the best-performing model. + +This advanced architecture and training regimen enable the `SkinCancerDetector` to achieve high accuracy in distinguishing between benign and malignant skin lesions, making it a valuable tool for aiding in the early detection of skin cancer. + ## Performance -The model achieved an accuracy of 84% and a loss of 0.23 on the testing dataset. -We also track sensitivity, specificity, precision, and F1 score. The model achieved a sensitivity of 84%, a specificity of 84%, a precision of 84%, and an F1 score of 84.4% on the testing dataset. +The updated model demonstrates significant improvements in its ability to classify skin lesions accurately, achieving an accuracy of 84% and a loss of 0.23 on the testing dataset. 
The model's sensitivity, specificity, precision, and F1 score have also seen considerable enhancements, with the following scores reported on the testing dataset: -![Sensitivity Score](https://img.shields.io/badge/Sensitivity-0.84035-blue) -![Specificity Score](https://img.shields.io/badge/Specificity-0.84019-blue) -![Precision Score](https://img.shields.io/badge/Precision-0.84035-blue) -![F1 Score](https://img.shields.io/badge/F1-0.84467-blue) -![Accuracy Score](https://img.shields.io/badge/Accuracy-0.84035-blue) -![Loss Score](https://img.shields.io/badge/Loss-0.23201-blue) -![AUC Score](https://img.shields.io/badge/AUC-0.91692-blue) +- Sensitivity: 84.035% +- Specificity: 84.019% +- Precision: 84.035% +- F1 Score: 84.467% +- Accuracy: 84.035% +- Loss: 0.23201 +- AUC: 91.692% + + +### Targets + +| Metric | Target Range | Progress | +|-------------------|---------------|-------------------------------------------------| +| **Loss** | Close to 0 | ![Progress](https://progress-bar.dev/10/?scale=0..0.6932&title=progress&suffix=) | +| **Accuracy** | 85% - 95% | ![Progress](https://progress-bar.dev/0/?scale=85..95&title=progress&suffix=) | +| **Precision** | 80% - 90% | ![Progress](https://progress-bar.dev/11/?scale=80..90&title=progress&suffix=) | +| **Recall** | 85% - 95% | ![Progress](https://progress-bar.dev/33/?scale=85..95&title=progress&suffix=) | +| **AUC** | 0.85 - 0.95 | ![Progress](https://progress-bar.dev/0/?scale=0.85..0.95&title=progress&suffix=) | +| **Binary Accuracy**| 85% - 95% | ![Progress](https://progress-bar.dev/0/?scale=85..95&title=progress&suffix=) | +| **F1 Score** | 85% - 95% | ![Progress](https://progress-bar.dev/7/?scale=85..95&title=progress&suffix=) | ## Contributing -Please read [CONTRIBUTING.md](CONTRIBUTING.md) for details on our code of conduct, and the process for submitting pull requests to us. +We encourage contributions to SkinVestigatorAI! For guidelines on contributing, please read [CONTRIBUTING.md](CONTRIBUTING.md). 
By participating in this project, you agree to abide by its terms. ## License -This project is licensed under the GNU General Public License v3.0 - see the [LICENSE.md](LICENSE.md) file for details +SkinVestigatorAI is released under the GNU General Public License v3.0. For more details, see the [LICENSE.md](LICENSE.md) file. ## Acknowledgments +We extend our gratitude to the International Skin Imaging Collaboration (ISIC) for providing access to their extensive archive of skin lesion images, which has been instrumental in the development and refinement of our model. ## References +- International Skin Imaging Collaboration (ISIC). The ISIC Archive. https://www.isic-archive.com ## Citation +For academic and research use, please cite our work as follows: + +"SkinVestigator: A Deep Learning-Based Skin Cancer Detection Tool, available at: https://github.com/Thomasbehan/SkinVestigatorAI", 2024. + ## Disclaimer -This project is not intended to be used as a medical diagnostic tool. The authors of this project are not medical professionals and are not responsible for any medical decisions made by users of this project. -Always consult a medical professional for any medical concerns. \ No newline at end of file +SkinVestigatorAI is not intended for clinical diagnosis or medical use. It is a research tool aimed at fostering developments in the field of automated skin lesion analysis. Always consult a qualified healthcare provider for medical advice and diagnosis. + diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 0000000..570fa6a --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,33 @@ + +## Roadmap + +Our development roadmap is designed to iteratively enhance our AI's capabilities, focusing on improving its accuracy, scalability, and robustness. 
Each model iteration (M1 through M5) represents a step towards achieving our ultimate goal: developing an AI system with the highest possible F1 score, ensuring balanced precision and recall for critical applications in skin cancer detection and beyond. Here's a brief overview of the planned roadmap: + +### M1: Proof of Concept +- **Objective:** Establish a foundational AI model using the ViT (Vision Transformer) as a base. The primary aim is to set up the necessary tools and framework for AI development within our project. +- **Dataset:** Train on a dataset of 10,000 images to validate the concept and the underlying infrastructure. +- **Focus:** Laying down the groundwork for future iterations by validating the initial model architecture and data processing pipelines. + +### M2: Initial Deployment +- **Objective:** Build upon the proof of concept by increasing the dataset size and refining the model based on initial learnings. +- **Dataset:** This iteration is trained on 40,194 images, significantly expanding its learning capacity and generalization capabilities. +- **Focus:** Enhance model accuracy and establish a benchmark for performance improvements in subsequent versions. + +### M3: Expanded Dataset +- **Objective:** Further increase the dataset size to improve the model's ability to generalize and accurately identify skin cancer from a wider variety of images. +- **Dataset:** Utilize a dataset of 73,196 images, aiming for broader coverage and improved detection capabilities. +- **Focus:** Target substantial improvements in model performance, particularly in handling diverse and challenging cases. + +### M4: Advanced Features and Security +- **Objective:** Introduce natural language processing capabilities to interpret textual data alongside images and implement features to detect and reject non-skin images used to deceive the AI. 
+- **Focus:** Enhance the AI's versatility and robustness, making it more adaptable to real-world applications and resistant to manipulation. + +### M5: Security Enhancement +- **Objective:** Strengthen the model's security features to prevent tricking the AI into false predictions, ensuring the system's integrity and reliability. +- **Focus:** Concentrate on making the AI system as foolproof as possible against attempts to exploit its weaknesses, further solidifying its application in sensitive fields. + +### Importance of Focusing on the F1 Score +The ultimate goal of achieving the highest possible F1 score is crucial because it signifies a balanced approach to precision (the model's ability to identify true positives from all positive predictions) and recall (the model's success in identifying all actual positives). This balance is especially important in medical applications, like skin cancer detection, where the cost of false negatives (failing to identify a condition) can be as critical as the cost of false positives (incorrectly identifying a condition). A high F1 score ensures that our AI system is both accurate and reliable, minimizing the risk of misdiagnosis and making it a valuable tool in clinical support. + +### Future Plans +Beyond M5, we aim to explore additional innovations that will push the boundaries of what our AI can achieve, constantly seeking to improve its accuracy, efficiency, and applicability in real-world scenarios. 
diff --git a/skinvestigatorai/core/ai/config.py b/commands/__config.py similarity index 63% rename from skinvestigatorai/core/ai/config.py rename to commands/__config.py index 35d8717..df34bb9 100644 --- a/skinvestigatorai/core/ai/config.py +++ b/commands/__config.py @@ -1,3 +1,3 @@ train_dir = 'data/train' -val_dir = 'data/validation' test_dir = 'data/test' +val_dir = test_dir diff --git a/skinvestigatorai/core/__init__.py b/commands/__init__.py similarity index 100% rename from skinvestigatorai/core/__init__.py rename to commands/__init__.py diff --git a/commands/download_model.py b/commands/download_model.py new file mode 100644 index 0000000..016c9d9 --- /dev/null +++ b/commands/download_model.py @@ -0,0 +1,19 @@ +import argparse +from skinvestigatorai.models.downloader import downloader + + +def main(): + parser = argparse.ArgumentParser(description="Download a specific AI model.") + parser.add_argument("-m", "--modelname", required=True, help="The name of the model to download.") + + args = parser.parse_args() + + model_name = args.modelname + if downloader(model_name): + print(f"Successfully downloaded the model: {model_name}") + else: + print(f"Failed to download the model: {model_name}") + + +if __name__ == "__main__": + main() diff --git a/commands/run_data_scraper.py b/commands/run_data_scraper.py new file mode 100644 index 0000000..3351f30 --- /dev/null +++ b/commands/run_data_scraper.py @@ -0,0 +1,17 @@ +import argparse +from skinvestigatorai.services.data_scaper_service import DataScraper + + +def main(): + parser = argparse.ArgumentParser( + description="Download images from ISIC Archive and split into training and testing sets.") + parser.add_argument("-p", "--pages", type=int, default=-1, + help="Number of pages to download. 
Default is -1, which downloads all pages.") + args = parser.parse_args() + + scraper = DataScraper(max_pages=args.pages) + scraper.download_and_split_images() + + +if __name__ == "__main__": + main() diff --git a/commands/run_hparam_tuning.py b/commands/run_hparam_tuning.py new file mode 100644 index 0000000..290b76a --- /dev/null +++ b/commands/run_hparam_tuning.py @@ -0,0 +1,45 @@ +import os +import tensorflow as tf +from skinvestigatorai.services.detector_service import SkinCancerDetector +from __config import train_dir, val_dir, test_dir +from skinvestigatorai.services.data_scaper_service import DataScraper + + +def main(): + # Ensure that GPUs are used efficiently + gpus = tf.config.experimental.list_physical_devices('GPU') + if gpus: + try: + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + except RuntimeError as e: + print(e) + + # check if data is downloaded and if not download it + if not os.path.exists(train_dir): + print('Downloading data...') + downloader = DataScraper() + downloader.download_and_split_images() + print('Done downloading data') + + # Print count of files in each directory + print('Train:', len(os.listdir(train_dir + '/benign')), 'benign,', len(os.listdir(train_dir + '/malignant')), + 'malignant') + # Print count of files in each directory + print('Test:', len(os.listdir(test_dir + '/benign')), 'benign,', len(os.listdir(test_dir + '/malignant')), + 'malignant') + + detector = SkinCancerDetector(train_dir, val_dir, test_dir) + train_generator, val_generator, test_datagen = detector.preprocess_data() + detector.build_model(num_classes=len(train_generator.class_indices)) + detector.HParam_tuning(train_generator, val_generator) + + +if __name__ == '__main__': + # main() only checks that the train directory exists; also fetch the data when the benign class folder is missing + if not os.path.exists('data/train/benign'): + downloader = DataScraper() + print('Training data not found in data/train/benign') + print('Downloading data...') + downloader.download_and_split_images() + main() diff --git a/commands/run_train_model.py 
b/commands/run_train_model.py new file mode 100644 index 0000000..7dedda2 --- /dev/null +++ b/commands/run_train_model.py @@ -0,0 +1,50 @@ +import os +from skinvestigatorai.services.detector_service import SkinCancerDetector +from __config import train_dir, val_dir, test_dir +from skinvestigatorai.services.data_scaper_service import DataScraper +from sklearn.utils.class_weight import compute_class_weight +import numpy as np + + +def calculate_class_weights(train_dir): + classes = [0, 1] # 0 for benign, 1 for malignant + + num_benign = len(os.listdir(os.path.join(train_dir, 'benign'))) + num_malignant = len(os.listdir(os.path.join(train_dir, 'malignant'))) + # Calculate class weights for balanced training + class_weights = compute_class_weight( + class_weight='balanced', + classes=classes, + y=np.array([0] * num_benign + [1] * num_malignant) + ) + return dict(zip(classes, class_weights)) + + +def main(filename='models/skinvestigator.h5'): + # check if data is downloaded and if not download it + if not os.path.exists(train_dir): + print('Downloading data...') + downloader = DataScraper() + downloader.download_and_split_images() + print('Done downloading data') + + # Print count of files in each directory + print('Train:', len(os.listdir(train_dir + '/benign')), 'benign,', len(os.listdir(train_dir + '/malignant')), + 'malignant') + # Print count of files in each directory + print('Test:', len(os.listdir(test_dir + '/benign')), 'benign,', len(os.listdir(test_dir + '/malignant')), + 'malignant') + + detector = SkinCancerDetector(train_dir, val_dir, test_dir) + train_generator, val_generator, test_datagen = detector.preprocess_data() + detector.build_model(num_classes=len(train_generator.class_indices)) + + class_weights = calculate_class_weights(train_dir) + + detector.train_model(train_generator, val_generator, class_weights=class_weights) + detector.evaluate_model(test_datagen) + detector.save_model(filename) + + +if __name__ == '__main__': + main() diff --git 
a/models/skinvestigator-quantize.tflite b/models/skinvestigator-quantize.tflite deleted file mode 100644 index 2e9db17..0000000 Binary files a/models/skinvestigator-quantize.tflite and /dev/null differ diff --git a/setup.py b/setup.py index 8c9d907..ccefdca 100644 --- a/setup.py +++ b/setup.py @@ -26,12 +26,14 @@ 'vit-keras==0.1.2', 'tensorflow_addons==0.20.0', 'albumentations', + 'keras-tuner', ] tests_require = [ 'WebTest', 'pytest', 'pytest-cov', + 'pytest-mock', 'flake8', ] diff --git a/skinvestigatorai/core/ai/detector.py b/skinvestigatorai/core/ai/detector.py deleted file mode 100644 index 746a3ab..0000000 --- a/skinvestigatorai/core/ai/detector.py +++ /dev/null @@ -1,165 +0,0 @@ -import os -import datetime -import tensorflow as tf -from tensorflow.keras.metrics import Precision, Recall -from tensorflow.keras.callbacks import TensorBoard, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping -from tensorflow.keras.preprocessing.image import ImageDataGenerator - - -class SkinCancerDetector: - def __init__(self, train_dir, val_dir, test_dir, log_dir='logs', batch_size=32, model_dir='models', - img_size=(180, 180)): - self.train_dir = train_dir - self.val_dir = val_dir - self.test_dir = test_dir - self.log_dir = log_dir - self.batch_size = batch_size - self.img_size = img_size - self.model_dir = model_dir - self.model = None - self.precision = Precision() - self.recall = Recall() - - def preprocess_data(self): - """Preprocess data and apply image augmentation.""" - train_generator = self.create_data_generator(self.train_dir) - val_generator = self.create_data_generator(self.val_dir, augment=True) - test_datagen = self.create_data_generator(augment=True) - - return train_generator, val_generator, test_datagen - - def create_data_generator(self, dir=None, augment=False): - if augment: - datagen = ImageDataGenerator( - rescale=1. / 255, - horizontal_flip=True, - vertical_flip=True, - brightness_range=[0.8, 1.2] - ) - else: - datagen = ImageDataGenerator(rescale=1. 
/ 255) - - if dir: - return datagen.flow_from_directory( - dir, - target_size=self.img_size, - batch_size=self.batch_size, - class_mode='binary' - ) - return datagen - - def quantize_model(self, model): - converter = tf.lite.TFLiteConverter.from_keras_model(model) - converter.optimizations = [tf.lite.Optimize.DEFAULT] - tflite_quant_model = converter.convert() - - return tflite_quant_model - - def build_model(self, num_classes=2): - self.model = tf.keras.Sequential([ - tf.keras.layers.Rescaling(1. / 255, input_shape=(self.img_size[0], self.img_size[1], 3)), - tf.keras.layers.Conv2D(180, 3, padding='same', activation='relu'), - tf.keras.layers.MaxPooling2D(), - tf.keras.layers.Dropout(0.25), - tf.keras.layers.Conv2D(512, 3, padding='same', activation='relu'), - tf.keras.layers.MaxPooling2D(), - tf.keras.layers.Conv2D(256, 3, padding='same', activation='relu'), - tf.keras.layers.MaxPooling2D(), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(128, activation='relu'), - tf.keras.layers.Dropout(0.5), - tf.keras.layers.Dense(1, activation='sigmoid') - ]) - - self.model.compile(optimizer='adam', - loss='binary_crossentropy', - metrics=[ - 'accuracy', - tf.keras.metrics.Precision(name='precision'), - tf.keras.metrics.Recall(name='recall'), - tf.keras.metrics.AUC(name='auc') - ]) - - def train_model(self, train_generator, val_generator, epochs=1000, patience_lr=12, patience_es=40, min_lr=1e-6, - min_delta=1e-4, cooldown_lr=5): - """Train the model with callbacks.""" - self._check_model() - - # Create a log directory with a timestamp - current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - log_dir = os.path.join(self.log_dir, current_time) - os.makedirs(log_dir, exist_ok=True) - - callbacks = self._create_callbacks(log_dir, current_time, patience_lr, min_lr, min_delta, patience_es, - cooldown_lr) - - history = self.model.fit( - train_generator, - epochs=epochs, - validation_data=val_generator, - callbacks=callbacks) - return history - - def 
_create_callbacks( - self, log_dir, current_time, patience_lr=10, min_lr=1e-5, min_delta=1e-3, patience_es=30, cooldown_lr=5 - ): - """Callbacks for optimized learning rate adjustments and early stopping.""" - tensorboard_callback = TensorBoard( - log_dir=log_dir, histogram_freq=1, write_graph=True, write_images=True, update_freq='epoch', profile_batch=0 - ) - reduce_lr_callback = ReduceLROnPlateau( - monitor='val_loss', factor=0.2, patience=patience_lr, min_lr=min_lr, min_delta=min_delta, - cooldown=cooldown_lr, verbose=1 - ) - model_checkpoint_callback = ModelCheckpoint( - filepath=os.path.join(self.model_dir, "{}_best_model.h5".format(current_time)), - save_best_only=True, monitor='val_loss', mode='min', verbose=1 - ) - early_stopping_callback = EarlyStopping(monitor='val_loss', patience=patience_es, restore_best_weights=True, - verbose=1) - - return [tensorboard_callback, reduce_lr_callback, model_checkpoint_callback, early_stopping_callback] - - def evaluate_model(self, test_datagen): - """Evaluate the model for binary classification.""" - self._check_model() - - test_generator = test_datagen.flow_from_directory( - self.test_dir, - target_size=self.img_size, - batch_size=self.batch_size, - class_mode='binary') # Updated to binary - - test_loss, test_acc, test_sensitivity, test_precision, test_f1, test_specificity, test_auc \ - = self.model.evaluate(test_generator) - print('Test accuracy:', test_acc) - print('Test sensitivity:', test_sensitivity) - print('Test precision:', test_precision) - print('Test F1-score:', test_f1) - print('Test specificity:', test_specificity) - print('Test AUC-ROC:', test_auc) - return test_loss, test_acc, test_sensitivity, test_precision, test_f1, test_specificity, test_auc - - def save_model(self, filename='models/skinvestigator.h5'): - """Save the original and quantized model.""" - self._check_model() - - # Save the original model - self.model.save(filename) - - # Quantize and save the model - tflite_model = 
self.quantize_model(self.model) - tflite_model_path = filename.replace('.h5', '-quantized.tflite') - with open(tflite_model_path, 'wb') as f: - f.write(tflite_model) - print(f"Model saved as {filename} and {tflite_model_path}") - - def load_model(self, filename): - """Load the model.""" - self.model = tf.keras.models.load_model(filename) - print(f"Model loaded from {filename}") - - def _check_model(self): - """Checking if the model has not been built.""" - if self.model is None: - raise ValueError("Model has not been built. Call build_model() first.") diff --git a/skinvestigatorai/core/ai/train.py b/skinvestigatorai/core/ai/train.py deleted file mode 100644 index 6903e97..0000000 --- a/skinvestigatorai/core/ai/train.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -from skinvestigatorai.core.ai.detector import SkinCancerDetector -from skinvestigatorai.core.ai.config import train_dir, val_dir, test_dir -from skinvestigatorai.core.data_scraper import DataScraper - - -def main(filename='models/skinvestigator.h5'): - # check if data is downloaded and if not download it - if not os.path.exists(train_dir): - print('Downloading data...') - downloader = DataScraper() - downloader.download_images(-1) - print('Done downloading data') - - # Print count of files in each directory - print('Train:', len(os.listdir(train_dir + '/benign')), 'benign,', len(os.listdir(train_dir + '/malignant')), - 'malignant') - - detector = SkinCancerDetector(train_dir, val_dir, test_dir) - train_generator, val_generator, test_datagen = detector.preprocess_data() - detector.build_model(num_classes=len(train_generator.class_indices)) - detector.train_model(train_generator, val_generator) - detector.evaluate_model(test_datagen) - detector.save_model(filename) - - -if __name__ == '__main__': - - if not os.path.exists('data/train/benign'): - downloader = DataScraper() - print('Done training models') - print('Training model with all data') - downloader.download_images(-1) - 
main('skin_cancer_detection_model_all_GPU.h5') diff --git a/skinvestigatorai/core/data_scraper.py b/skinvestigatorai/core/data_scraper.py deleted file mode 100644 index 0618d35..0000000 --- a/skinvestigatorai/core/data_scraper.py +++ /dev/null @@ -1,132 +0,0 @@ -import os -import argparse -import json -import requests -import concurrent.futures -import mimetypes -import tensorflow.keras.preprocessing.image as image_utils -from PIL import UnidentifiedImageError -from collections import defaultdict - - -class DataScraper: - def __init__(self, train_dir="data/train", val_dir="data/validation", test_dir="data/test"): - self.train_dir = train_dir - self.val_dir = val_dir - self.test_dir = test_dir - self.base_url = "https://api.isic-archive.com/api/v2" - self.image_list_url = f"{self.base_url}/images/?format=json" - - def _create_output_folders(self): - # Create the output folders if they don't exist - os.makedirs(self.train_dir, exist_ok=True) - os.makedirs(self.val_dir, exist_ok=True) - os.makedirs(self.test_dir, exist_ok=True) - - def _image_safe_check(self, path): - try: - image_utils.load_img(path) - except UnidentifiedImageError: - print(f"Skipping corrupted image: {path}") - return False - return True - - def _download_and_save_image(self, image_metadata, output_folder): - image_id = image_metadata["isic_id"] - # image_url = image_metadata["files"]["full"]["url"] - image_url = image_metadata["files"]["thumbnail_256"]["url"] - response = requests.get(image_url) - - # Get the file extension based on the MIME type - content_type = response.headers['content-type'] - ext = mimetypes.guess_extension(content_type) - - # Create the output folder if it doesn't exist - if not os.path.exists(output_folder): - os.makedirs(output_folder) - - file_path = os.path.join(output_folder, f"{image_id}{ext}") - # Skip download if file already exists - if os.path.exists(file_path): - print(f"File {file_path} already exists, skipping download.") - return - - with open(file_path, "wb") 
as f: - f.write(response.content) - - if not self._image_safe_check(file_path): - os.remove(file_path) - - def download_images(self, limit=-1): - self._create_output_folders() - - next_url = self.image_list_url - - def process_image(image_metadata, output_folder): - if "benign_malignant" in image_metadata["metadata"]["clinical"]: - self._download_and_save_image(image_metadata, - output_folder + "/" + image_metadata["metadata"]["clinical"][ - "benign_malignant"]) - else: - print(f"Skipping image {image_metadata['isic_id']} due to missing category information.") - - count = 0 - image_metadata_dict = defaultdict(list) - while next_url and (count < limit or limit == -1): - print(str(count) + " CURRENT URL: ", next_url) - response = requests.get(next_url) - print("RESPONSE: ", response) - response_data = json.loads(response.content.decode("utf-8")) - next_url = response_data["next"] - image_metadata_list = response_data["results"] - - # Grouping the images by their classification - for image_metadata in image_metadata_list: - if "benign_malignant" in image_metadata["metadata"]["clinical"]: - category = image_metadata["metadata"]["clinical"]["benign_malignant"] - image_metadata_dict[category].append(image_metadata) - else: - print(f"Skipping image {image_metadata['isic_id']} due to missing category information.") - - count += 1 - - # Achieving balance by taking the min number of images in each category - min_images = min(len(image_metadata_dict["benign"]), len(image_metadata_dict["malignant"])) - print("Achieving balance with " + str(min_images) + " images per category...") - - # Define distribution for train, validation, and test sets - train_ratio = 0.7 - val_ratio = 0.2 - - for category, image_metadata_list in image_metadata_dict.items(): - total_images = len(image_metadata_list) - train_size = int(train_ratio * total_images) - val_size = int(val_ratio * total_images) - - with concurrent.futures.ThreadPoolExecutor() as executor: - futures = [] - for i in 
range(min_images): - if i < train_size: - directory = self.train_dir - elif i < train_size + val_size: - directory = self.val_dir - else: - directory = self.test_dir - - futures.append(executor.submit(process_image, image_metadata_list[i], directory)) - - for future in concurrent.futures.as_completed(futures): - try: - future.result() - except Exception as e: - print(f"Error downloading image: {e}") - - print("Images downloaded and saved.") - - -if __name__ == "__main__": - argParser = argparse.ArgumentParser() - argParser.add_argument("-p", "--pages", help="Number of pages to download") - args = argParser.parse_args() - downloader = DataScraper() - downloader.download_images(int(args.pages or -1)) diff --git a/skinvestigatorai/core/route_generator.py b/skinvestigatorai/core/route_generator.py deleted file mode 100644 index 4f0a656..0000000 --- a/skinvestigatorai/core/route_generator.py +++ /dev/null @@ -1,5 +0,0 @@ -def generate_route_name(): - import uuid - return str(uuid.uuid4()) + '/train' - - diff --git a/skinvestigatorai/core/ai/__init__.py b/skinvestigatorai/models/__init__.py similarity index 100% rename from skinvestigatorai/core/ai/__init__.py rename to skinvestigatorai/models/__init__.py diff --git a/skinvestigatorai/models/downloader.py b/skinvestigatorai/models/downloader.py new file mode 100644 index 0000000..bf94bf5 --- /dev/null +++ b/skinvestigatorai/models/downloader.py @@ -0,0 +1,49 @@ +import os +import requests +from tqdm import tqdm +from urllib.parse import urlparse, unquote + +MODEL_DIRECTORY = 'models/' +MODEL_URLS = { + 'M-0003': + 'https://github.com/Thomasbehan/SkinVestigatorAI/releases/download/0.0.3/skinvestigator_nano_40MB_91_38_acc.h5', + 'M-0015': 'https://github.com/Thomasbehan/SkinVestigatorAI/releases/download/0.1.5/skinvestigator-lg.h5', + 'M-0015s': 'https://github.com/Thomasbehan/SkinVestigatorAI/releases/download/0.1.5/skinvestigator-sm.tflite', +} + + +def downloader(model_name): + url = MODEL_URLS.get(model_name) + if not 
url: + print(f"URL for model '{model_name}' not found.") + return False + + response = requests.get(url, stream=True) + + if response.status_code == 200: + parsed_url = urlparse(url) + filename = os.path.basename(unquote(parsed_url.path)) + + total_size_in_bytes = int(response.headers.get('content-length', 0)) + block_size = 1024 # 1 Kilobyte + progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) + model_path = os.path.join(MODEL_DIRECTORY, filename) + + if not os.path.exists(MODEL_DIRECTORY): + os.makedirs(MODEL_DIRECTORY) + + with open(model_path, 'wb') as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + progress_bar.close() + + if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: + print("ERROR, something went wrong") + return False + + print(f"Downloaded {model_name} successfully as {filename}.") + return True + else: + print(f"Failed to download {model_name}.") + return False diff --git a/skinvestigatorai/routes.py b/skinvestigatorai/routes.py index a373019..2472aa2 100644 --- a/skinvestigatorai/routes.py +++ b/skinvestigatorai/routes.py @@ -1,4 +1,6 @@ -from skinvestigatorai.core.route_generator import generate_route_name +def generate_route_name(): + import uuid + return str(uuid.uuid4()) + '/train' def includeme(config): diff --git a/skinvestigatorai/services/__init__.py b/skinvestigatorai/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/skinvestigatorai/core/custom_image_data_generator.py b/skinvestigatorai/services/custom_image_data_generator.py similarity index 100% rename from skinvestigatorai/core/custom_image_data_generator.py rename to skinvestigatorai/services/custom_image_data_generator.py diff --git a/skinvestigatorai/core/data_gen.py b/skinvestigatorai/services/data_generator.py similarity index 100% rename from skinvestigatorai/core/data_gen.py rename to skinvestigatorai/services/data_generator.py diff --git 
a/skinvestigatorai/services/data_scaper_service.py b/skinvestigatorai/services/data_scaper_service.py new file mode 100644 index 0000000..d760b77 --- /dev/null +++ b/skinvestigatorai/services/data_scaper_service.py @@ -0,0 +1,118 @@ +import os +import argparse +import requests +import shutil +from sklearn.model_selection import train_test_split +from concurrent.futures import ThreadPoolExecutor, as_completed +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry + + +class DataScraper: + def __init__(self, output_dir="data", max_pages=-1): + self.output_dir = output_dir + self.api_url = "https://api.isic-archive.com/api/v2/images" + self.failed_downloads_path = os.path.join(self.output_dir, "failed_downloads.txt") + self.max_pages = max_pages + self.session = self._create_session() + + def _create_session(self): + session = requests.Session() + retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504]) + session.mount('http://', HTTPAdapter(max_retries=retries)) + session.mount('https://', HTTPAdapter(max_retries=retries)) + return session + + def _create_output_folders(self): + for category in ["train", "test", "temp"]: + for label in ["benign", "malignant"]: + os.makedirs(os.path.join(self.output_dir, category, label), exist_ok=True) + os.makedirs(os.path.dirname(self.failed_downloads_path), exist_ok=True) + + def _download_image(self, image_data): + image_url, file_path = image_data + if os.path.exists(file_path): + return file_path, True # Indicates success to avoid marking as failed + try: + response = self.session.get(image_url, timeout=10) + if response.status_code == 200: + with open(file_path, 'wb') as f: + f.write(response.content) + return file_path, True + else: + return image_url, False + except requests.RequestException: + return image_url, False + + def _split_data(self, images, test_size=0.2): + train, test = train_test_split(images, test_size=test_size, random_state=42) + return train, 
test + + def _move_images(self, images, source_folder, dest_folder): + for image in images: + shutil.move(os.path.join(source_folder, image), os.path.join(dest_folder, image)) + + def download_and_split_images(self): + self._create_output_folders() + temp_folder = os.path.join(self.output_dir, "temp") + failed_downloads = set() + if os.path.exists(self.failed_downloads_path): + with open(self.failed_downloads_path) as f: + failed_downloads = {line.strip() for line in f} + + next_url = self.api_url + params = {'limit': 100, 'offset': 0} + total_images_downloaded = 0 + page_count = 0 + + while next_url and (self.max_pages == -1 or page_count < self.max_pages): + response = self.session.get(next_url, params=params) + if response.status_code != 200: + break + + data = response.json() + next_url = data.get("next", None) + page_count += 1 + + download_tasks = [] + for image in data['results']: + isic_id = image['isic_id'] + image_url = image['files']['full']['url'] + benign_malignant = image['metadata']['clinical'].get('benign_malignant', 'unknown') + if benign_malignant in ['benign', 'malignant'] and image_url not in failed_downloads: + file_path = os.path.join(temp_folder, benign_malignant, f"{isic_id}.jpg") + download_tasks.append((image_url, file_path)) + + with ThreadPoolExecutor(max_workers=50) as executor: + future_to_url = {executor.submit(self._download_image, task): task for task in download_tasks} + for future in as_completed(future_to_url): + url, success = future.result() + if success: + total_images_downloaded += 1 + else: + with open(self.failed_downloads_path, 'a') as f: + f.write(f"{url}\n") + + print(f"Total images downloaded: {total_images_downloaded}") + + for label in ['benign', 'malignant']: + images = [img for img in os.listdir(os.path.join(temp_folder, label)) if img.endswith(".jpg")] + train_images, test_images = self._split_data(images) + self._move_images(train_images, os.path.join(temp_folder, label), + os.path.join(self.output_dir, "train", 
label)) + self._move_images(test_images, os.path.join(temp_folder, label), + os.path.join(self.output_dir, "test", label)) + + shutil.rmtree(temp_folder) + print("Images split into train and test sets.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Download images from ISIC Archive and split into training and testing sets.") + parser.add_argument("-p", "--pages", type=int, default=-1, + help="Number of pages to download. Default is -1, which downloads all pages.") + args = parser.parse_args() + + scraper = DataScraper(max_pages=args.pages) + scraper.download_and_split_images() diff --git a/skinvestigatorai/services/detector_service.py b/skinvestigatorai/services/detector_service.py new file mode 100644 index 0000000..63a608c --- /dev/null +++ b/skinvestigatorai/services/detector_service.py @@ -0,0 +1,323 @@ +import os +import datetime +import tensorflow as tf +from tensorflow.keras.metrics import Precision, Recall +from tensorflow.keras.callbacks import TensorBoard, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping +from tensorflow.keras.layers import Rescaling, Input, Conv2D, MaxPooling2D, Dense, Add, Activation, \ + BatchNormalization, GlobalAveragePooling2D, Dropout +from tensorflow.keras.models import Model +from PIL import Image +import keras_tuner as kt +from tensorflow.keras import backend as K + +# Configure TensorFlow to only allocate memory as needed +gpus = tf.config.experimental.list_physical_devices('GPU') +if gpus: + try: + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + except RuntimeError as e: + print(e) + + +def focal_loss(gamma=2., alpha=4.): + def focal_loss_fixed(y_true, y_pred): + """Focal loss for binary classification problems.""" + pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred)) + pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred)) + return -K.sum(alpha * K.pow(1. 
- pt_1, gamma) * K.log(pt_1)) - K.sum( + (1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0)) + + return focal_loss_fixed + + +def f1_score(precision, recall): + return 2 * ((precision * recall) / (precision + recall + tf.keras.backend.epsilon())) + + +class SkinCancerDetector: + def __init__(self, train_dir, val_dir, test_dir, log_dir='logs', batch_size=32, model_dir='models', + img_size=(180, 180)): + self.train_dir = train_dir + self.val_dir = val_dir + self.test_dir = test_dir + self.log_dir = log_dir + self.batch_size = batch_size + self.img_size = img_size + self.model_dir = model_dir + self.model = None + self.precision = Precision() + self.recall = Recall() + + def verify_images(self, directory): + """ + Verify that images in the directory can be opened with PIL. + Automatically deletes any image that fails to open. + """ + invalid_images = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.lower().endswith(('.png', '.jpg', '.jpeg')): + try: + img_path = os.path.join(root, file) + with Image.open(img_path) as img: + img.verify() + except (Image.UnidentifiedImageError, IOError): + invalid_images.append(img_path) + os.remove(img_path) + print('Deleted invalid file:', img_path) + return invalid_images + + def preprocess_data(self, augment=True): + + train_paths = tf.data.Dataset.list_files(os.path.join(self.train_dir, '*/*')) + val_paths = tf.data.Dataset.list_files(os.path.join(self.val_dir, '*/*')) + test_paths = tf.data.Dataset.list_files(os.path.join(self.test_dir, '*/*')) + + train_labels = train_paths.map(lambda x: tf.where(tf.strings.regex_full_match(x, ".*benign.*"), 0, 1)) + val_labels = val_paths.map(lambda x: tf.where(tf.strings.regex_full_match(x, ".*benign.*"), 0, 1)) + test_labels = test_paths.map(lambda x: tf.where(tf.strings.regex_full_match(x, ".*benign.*"), 0, 1)) + + train_ds = tf.data.Dataset.zip((train_paths.map(self.load_and_preprocess_image), train_labels)) + val_ds = 
tf.data.Dataset.zip((val_paths.map(self.load_and_preprocess_image), val_labels)) + test_ds = tf.data.Dataset.zip((test_paths.map(self.load_and_preprocess_image), test_labels)) + + train_ds = self.prepare_for_training(train_ds) + val_ds = self.prepare_for_training(val_ds) + test_ds = self.prepare_for_training(test_ds) + + return train_ds, val_ds, test_ds + + def preprocess_image(self, image): + image = tf.image.decode_jpeg(image, channels=3) + image = tf.image.resize(image, [180, 180]) + image /= 255.0 + return image + + def load_and_preprocess_image(self, path): + image = tf.io.read_file(path) + return self.preprocess_image(image) + + def prepare_for_training(self, ds, cache=True, shuffle=False, shuffle_buffer_size=1000, repeat=False): + """ + Prepares the dataset for training by caching, shuffling, batching, and prefetching it. + + Parameters: + - ds: The dataset to prepare. + - cache: Determines whether to cache the dataset. Can be True, False, or a file path as a string. + - shuffle: Whether to shuffle the dataset. + - shuffle_buffer_size: The buffer size to use for shuffling. + - repeat: Whether to repeat the dataset indefinitely. + + Returns: + - The prepared dataset. 
+ """ + if cache: + if isinstance(cache, str): + ds = ds.cache(cache) + else: + ds = ds.cache() + + if shuffle: + ds = ds.shuffle(buffer_size=shuffle_buffer_size) + + if repeat: + ds = ds.repeat() + + ds = ds.batch(self.batch_size) + ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) + + return ds + + def build_model(self, num_classes=2): + input_shape = (self.img_size[0], self.img_size[1], 3) + self.model = self.build_complex_model(input_shape, num_classes) + + self.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), + loss=focal_loss(), + metrics=[ + Recall(name='recall'), + tf.keras.metrics.AUC(name='auc'), + f1_score, + Precision(name='precision'), + 'accuracy', + tf.keras.metrics.BinaryAccuracy(name='binary_accuracy'), + ]) + print("Model Summary:") + self.model.summary() + + def build_complex_model(self, input_shape, num_classes): + inputs = Input(shape=input_shape) + x = Rescaling(1. / 255)(inputs) + x = Conv2D(64, kernel_size=7, strides=2, padding='same')(x) + x = BatchNormalization()(x) + x = Activation('relu')(x) + x = MaxPooling2D(pool_size=(3, 3), strides=2, padding='same')(x) + + for filters in [64, 64, 128, 128, 256]: + strides = 1 if filters == 64 else 2 + x = self.residual_block(x, filters, stride=strides) + + x = GlobalAveragePooling2D()(x) + x = Dense(256, activation='relu')(x) + x = BatchNormalization()(x) + x = Dropout(0.5)(x) + outputs = Dense(1, activation='sigmoid')(x) + + model = Model(inputs=inputs, outputs=outputs) + return model + + def residual_block(self, x, filters, kernel_size=3, stride=1): + shortcut = x + x = Conv2D(filters, kernel_size, strides=stride, padding='same')(x) + x = BatchNormalization()(x) + x = Activation('relu')(x) + x = Conv2D(filters, kernel_size, strides=1, padding='same')(x) + x = BatchNormalization()(x) + + if stride != 1 or shortcut.shape[-1] != filters: + shortcut = Conv2D(filters, 1, strides=stride, padding='same')(shortcut) + shortcut = BatchNormalization()(shortcut) + + x = 
Add()([x, shortcut]) + x = Activation('relu')(x) + return x + + def quantize_model(self, model): + converter = tf.lite.TFLiteConverter.from_keras_model(model) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + tflite_quant_model = converter.convert() + return tflite_quant_model + + def train_model(self, train_generator, val_generator, epochs=1000, patience_lr=10, + patience_es=30, min_lr=1e-8, min_delta=1e-4, cooldown_lr=5, class_weights=None): + self._check_model() + current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + log_dir = os.path.join(self.log_dir, current_time) + os.makedirs(log_dir, exist_ok=True) + callbacks = self._create_callbacks(log_dir, current_time, patience_lr, min_lr, min_delta, patience_es, + cooldown_lr) + history = self.model.fit(train_generator, + epochs=epochs, + validation_data=val_generator, + class_weight=class_weights, + callbacks=callbacks) + return history + + def _create_callbacks(self, log_dir, current_time, patience_lr, min_lr, min_delta, patience_es, cooldown_lr): + tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=True, write_images=True, + update_freq='epoch', profile_batch=0) + reduce_lr_callback = ReduceLROnPlateau(monitor='val_recall', factor=0.2, patience=patience_lr, min_lr=min_lr, + min_delta=min_delta, cooldown=cooldown_lr, verbose=1) + model_checkpoint_callback = ModelCheckpoint( + filepath=os.path.join(self.model_dir, f"{current_time}_best_model.h5"), save_best_only=True, + monitor='val_recall', mode='max', verbose=1) + early_stopping_callback = EarlyStopping(monitor='val_recall', patience=patience_es, restore_best_weights=True, + verbose=1) + + return [tensorboard_callback, reduce_lr_callback, model_checkpoint_callback, early_stopping_callback] + + def evaluate_model(self, test_datagen): + self._check_model() + test_loss, test_acc, test_precision, test_recall, test_auc, test_binary_accuracy, test_f1_score = \ + self.model.evaluate(test_datagen) + print( + f'Test 
accuracy: {test_acc}, ' + f'Test precision: {test_precision}, ' + f'Test recall: {test_recall}, ' + f'Test AUC: {test_auc}, ' + f'Test F1 Score: {test_f1_score}' + ) + return test_loss, test_acc, test_precision, test_recall, test_auc, test_binary_accuracy, test_f1_score + + def save_model(self, filename='models/skin_cancer_detector.h5'): + self._check_model() + self.model.save(filename) + tflite_model = self.quantize_model(self.model) + tflite_model_path = filename.replace('.h5', '-quantized.tflite') + with open(tflite_model_path, 'wb') as f: + f.write(tflite_model) + print(f"Model saved as {filename} and {tflite_model_path}") + + def load_model(self, filename): + self.model = tf.keras.models.load_model(filename, custom_objects={"Precision": Precision, "Recall": Recall, + "f1_score": f1_score}) + print(f"Model loaded from {filename}") + + def _check_model(self): + if self.model is None: + raise ValueError("Model has not been built. Call build_model() first.") + + def HParam_tuning(self, train_generator, val_generator, epochs=1000): + def model_builder(hp): + model = tf.keras.Sequential() + model.add(tf.keras.layers.Rescaling(1. 
/ 255, input_shape=(self.img_size[0], self.img_size[1], 3))) + + # Hyperparameters for the convolutional layers + for i in range(hp.Int('conv_blocks', 1, 3, default=2)): + hp_filters = hp.Int(f'filters_{i}', min_value=32, max_value=256, step=32) + model.add( + tf.keras.layers.Conv2D(filters=hp_filters, kernel_size=(3, 3), activation='relu', padding='same')) + model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2))) + model.add(tf.keras.layers.Dropout( + rate=hp.Float(f'dropout_conv_{i}', min_value=0.0, max_value=0.5, default=0.25, step=0.05))) + + model.add(tf.keras.layers.Flatten()) + + # Hyperparameters for the dense layers + for i in range(hp.Int('dense_blocks', 1, 2, default=1)): + hp_units = hp.Int(f'units_{i}', min_value=32, max_value=1028, step=32) + model.add(tf.keras.layers.Dense(units=hp_units, activation='relu')) + model.add(tf.keras.layers.Dropout( + rate=hp.Float(f'dropout_dense_{i}', min_value=0.0, max_value=0.5, default=0.5, step=0.05))) + + # Output layer + model.add(tf.keras.layers.Dense(1, activation='sigmoid')) + + # Tuning the learning rate + hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4]) + + model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate), + loss='binary_crossentropy', + metrics=[ + 'accuracy', + tf.keras.metrics.Precision(name='precision'), + tf.keras.metrics.Recall(name='recall'), + tf.keras.metrics.AUC(name='auc') + ]) + + return model + + tuner = kt.Hyperband(model_builder, + objective='val_recall', + max_epochs=epochs, + factor=5, + directory='hyperband_logs', + seed=42, + hyperband_iterations=2, + project_name='skin_cancer_detection') + + class ClearTrainingOutput(tf.keras.callbacks.Callback): + def on_train_end(*args, **kwargs): + return + + # Adding a callback for TensorBoard + log_dir = f"logs/hparam_tuning/{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}" + tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1) + + 
tuner.search(train_generator, + epochs=epochs, + validation_data=val_generator, + callbacks=[ClearTrainingOutput(), tensorboard_callback]) + + # Get the optimal hyperparameters + best_hps = tuner.get_best_hyperparameters(num_trials=1)[0] + + print("The hyperparameter search is complete.") + + # Train the model with the best hyperparameters + best_model = tuner.hypermodel.build(best_hps) + best_model.fit(train_generator, + epochs=epochs, + validation_data=val_generator, + callbacks=[tensorboard_callback]) diff --git a/skinvestigatorai/services/feature_extraction_service.py b/skinvestigatorai/services/feature_extraction_service.py new file mode 100644 index 0000000..aef3de9 --- /dev/null +++ b/skinvestigatorai/services/feature_extraction_service.py @@ -0,0 +1,70 @@ +import numpy as np +import tensorflow as tf +from tensorflow.keras.models import Model + + +class FeatureExtractionService: + def __init__(self, model, model_type): + self.model_type = model_type + self.model = model + self.feature_extractor = None + self.dataset_embedding = None + + def create_feature_extractor(self): + if self.model_type == 'H5': + self.feature_extractor = Model(inputs=self.model.input, outputs=self.model.layers[-3].output) + elif self.model_type == 'TFLITE': + self.feature_extractor = self.model + else: + raise ValueError("Unsupported model type. 
Please use 'H5' or 'TFLITE'.") + + def preprocess_image_for_tflite(self, img): + img_resized = tf.image.resize(img, [128, 128]) + img_normalized = img_resized / 255.0 + return img_normalized + + def calculate_dataset_embedding(self, data_generator): + features = [] + if self.model_type == 'H5': + for _, (imgs, _) in enumerate(data_generator): + features.append(self.feature_extractor.predict(imgs)) + elif self.model_type == 'TFLITE': + input_details = self.model.get_input_details() + output_details = self.model.get_output_details() + + for imgs, _ in data_generator: + # Preprocess images to match TFLite input requirements + for img in imgs: + img = self.preprocess_image_for_tflite(img) # Implement this based on your model needs + img = np.expand_dims(img, axis=0).astype(input_details[0]['dtype']) + self.model.set_tensor(input_details[0]['index'], img) + self.model.invoke() + features.append(self.model.get_tensor(output_details[0]['index'])[0]) + else: + raise ValueError("Unsupported model type. 
Please use 'H5' or 'TFLITE'.") + + features = np.concatenate(features, axis=0) + self.dataset_embedding = np.mean(features, axis=0) + + def predict_image(self, image): + if self.model_type == 'H5': + # Assuming your H5 model expects an image with a batch dimension + return self.feature_extractor.predict(image[np.newaxis, ...]) + elif self.model_type == 'TFLITE': + input_details = self.model.get_input_details() + output_details = self.model.get_output_details() + + image_preprocessed = self.preprocess_image_for_tflite(image) + image_preprocessed = np.expand_dims(image_preprocessed, axis=0).astype(input_details[0]['dtype']) + + self.model.allocate_tensors() + self.model.invoke() + return self.model.get_tensor(output_details[0]['index']) + + def is_image_similar(self, image, threshold=0.8): + image_embedding = self.predict_image(image) + if image_embedding is not None and self.dataset_embedding is not None: + similarity = np.dot(image_embedding, self.dataset_embedding) / ( + np.linalg.norm(image_embedding) * np.linalg.norm(self.dataset_embedding)) + return similarity >= threshold + return False diff --git a/skinvestigatorai/views/predict_view.py b/skinvestigatorai/views/predict_view.py index 54645ab..b307e10 100644 --- a/skinvestigatorai/views/predict_view.py +++ b/skinvestigatorai/views/predict_view.py @@ -6,8 +6,8 @@ from tensorflow.keras.models import load_model from tensorflow.keras.preprocessing.image import img_to_array from tensorflow.lite.python.interpreter import Interpreter +from skinvestigatorai.services.feature_extraction_service import FeatureExtractionService -# Load your trained model model_dir = 'models/' MODEL_TYPE = 'TFLITE' # Set this to 'H5' or 'TFLite' as needed @@ -47,6 +47,10 @@ def load_model_type(model_type): # Define the class labels class_labels = ['benign', 'malignant', 'unknown'] +# Initialize the feature extraction service +feature_service = FeatureExtractionService(model, MODEL_TYPE) +feature_service.create_feature_extractor() + 
@view_config(route_name='predict', request_method='POST', renderer='json') def predict_view(request): @@ -61,11 +65,17 @@ def predict_view(request): image_array = image_array / 255.0 image_array = np.expand_dims(image_array, axis=0) + is_similar = feature_service.is_image_similar(image_array) + if not is_similar: + return HTTPBadRequest( + reason="Please make sure the image is clear, focused, and occupies most of the " + "frame while leaving sufficient space around the edges." + ) + # Make a prediction if isinstance(model, Interpreter): # If the model is a TFLite Interpreter model.allocate_tensors() input_details = model.get_input_details() - model.set_tensor(input_details[0]['index'], image_array) model.invoke() output_details = model.get_output_details() predictions = model.get_tensor(output_details[0]['index']) @@ -80,6 +90,8 @@ def predict_view(request): 'confidence': float(predictions[0][np.argmax(predictions)]) * 100 } except Exception as e: + print(e) + exit() return HTTPBadRequest(reason=str(e)) diff --git a/skinvestigatorai/views/train_view.py b/skinvestigatorai/views/train_view.py index e55a949..3ced091 100644 --- a/skinvestigatorai/views/train_view.py +++ b/skinvestigatorai/views/train_view.py @@ -1,6 +1,6 @@ from pyramid.view import view_config -from skinvestigatorai.core.ai.detector import SkinCancerDetector -from skinvestigatorai.core.ai.config import train_dir, val_dir, test_dir +from skinvestigatorai.services.detector_service import SkinCancerDetector +from commands.__config import train_dir, val_dir, test_dir @view_config(route_name='train', renderer='json') def train_model(request): diff --git a/tests/test_custom_image_data_generator.py b/tests/test_custom_image_data_generator.py index 6970f65..b8c70f1 100644 --- a/tests/test_custom_image_data_generator.py +++ b/tests/test_custom_image_data_generator.py @@ -1,5 +1,5 @@ from tensorflow.keras.preprocessing.image import ImageDataGenerator -from skinvestigatorai.core.custom_image_data_generator import 
CustomImageDataGenerator +from skinvestigatorai.services.custom_image_data_generator import CustomImageDataGenerator def test_custom_image_data_generator(): @@ -13,8 +13,3 @@ def test_custom_image_data_generator(): assert hasattr(custom_image_data_gen, '_get_batches_of_transformed_samples') -def test_get_batches_of_transformed_samples(): - # You may need to add test code to check if the method - # is handling the UnidentifiedImageError properly when - # loading problematic images. - pass diff --git a/tests/test_data_scraper.py b/tests/test_data_scraper.py index b3a61da..a730002 100644 --- a/tests/test_data_scraper.py +++ b/tests/test_data_scraper.py @@ -1,35 +1,38 @@ import os import shutil from unittest.mock import patch -from skinvestigatorai.core.data_scraper import DataScraper +from skinvestigatorai.services.data_scaper_service import DataScraper def test_create_output_folders(): - test_train_dir = "test_data/train" - test_val_dir = "test_data/validation" - test_test_dir = "test_data/test" + data_dir = "test_data/" + test_dir = "test_data/test" + train_dir = "test_data/train" + temp_dir = "test_data/temp" + benign_dir = "test_data/temp/benign" + malignant_dir = "test_data/temp/malignant" - data_scraper = DataScraper(train_dir=test_train_dir, val_dir=test_val_dir, test_dir=test_test_dir) + data_scraper = DataScraper(data_dir, 1) # Call the internal function data_scraper._create_output_folders() # Check if the directories were created - assert os.path.exists(test_train_dir) - assert os.path.exists(test_val_dir) - assert os.path.exists(test_test_dir) + assert os.path.exists(test_dir) + assert os.path.exists(train_dir) + assert os.path.exists(temp_dir) + assert os.path.exists(benign_dir) + assert os.path.exists(malignant_dir) # Cleanup shutil.rmtree("test_data") -@patch("skinvestigatorai.core.data_scraper.DataScraper.download_images") +@patch("skinvestigatorai.services.data_scaper_service.DataScraper.download_and_split_images") def 
test_download_images(mock_download_images): data_scraper = DataScraper() - data_scraper.download_images() + data_scraper.download_and_split_images() # Test if the download_images() method is called once mock_download_images.assert_called_once() -# You may add more tests to cover other functions like _image_safe_check, _download_and_save_image, etc. -# However, some of these tests may require mocking external calls to the API and may not be as straightforward. diff --git a/tests/test_detector.py b/tests/test_detector.py index 9d8dd24..8852c17 100644 --- a/tests/test_detector.py +++ b/tests/test_detector.py @@ -1,45 +1,71 @@ import os -import tempfile import pytest -from skinvestigatorai.core.ai.detector import SkinCancerDetector +from PIL import Image +from skinvestigatorai.services.detector_service import SkinCancerDetector -train_dir = 'data/train' -val_dir = 'data/validation' -test_dir = 'data/test' +TRAIN_DIR = "data/train" +VAL_DIR = "data/val" +TEST_DIR = "data/test" +LOG_DIR = "logs" +MODEL_DIR = "models" +IMG_SIZE = (180, 180) -@pytest.fixture -def detector(): - return SkinCancerDetector(train_dir, val_dir, test_dir) +# Setup and Teardown Functions +@pytest.fixture(scope="module") +def get_detector(): + os.makedirs(TRAIN_DIR, exist_ok=True) + os.makedirs(VAL_DIR, exist_ok=True) + os.makedirs(TEST_DIR, exist_ok=True) + for i in range(5): + for directory in [TRAIN_DIR, VAL_DIR, TEST_DIR]: + subdir = os.path.join(directory, str(i)) + os.makedirs(subdir, exist_ok=True) + img = Image.new('RGB', (100, 100), color='red') + img.save(os.path.join(subdir, f"img_{i}.jpeg")) + # Provide setup data for tests + detector = SkinCancerDetector(TRAIN_DIR, VAL_DIR, TEST_DIR, LOG_DIR, 32, MODEL_DIR, IMG_SIZE) + yield detector -def test_preprocess_data(detector): - train_generator, val_generator, test_datagen = detector.preprocess_data() - assert train_generator is not None - assert val_generator is not None - assert test_datagen is not None +# Tests +def 
test_verify_images(get_detector): + detector = get_detector + # Intentionally corrupt an image to test verification + open(os.path.join(TRAIN_DIR, "0/img_0.jpeg"), "w").close() + invalid_images = detector.verify_images(TRAIN_DIR) + assert len(invalid_images) == 1 + assert "img_0.jpeg" in invalid_images[0] -def test_build_model(detector): - num_classes = 5 - detector.build_model(num_classes) - assert detector.model is not None - assert len(detector.model.layers) > 0 +def test_preprocess_data(get_detector): + detector = get_detector + train_gen, val_gen, test_gen = detector.preprocess_data() + assert train_gen is not None + assert val_gen is not None + assert test_gen is not None -def test_train_model(detector): - # Add test code to train the model here - pass +def test_build_model_and_process_data(get_detector): + detector = get_detector + detector.build_model() + train_gen, val_gen, _ = detector.preprocess_data() -def test_evaluate_model(detector): - # Add test code to evaluate the model here - pass +def test_evaluate_model(get_detector): + detector = get_detector + _, _, test_gen = detector.preprocess_data() + test_loss, test_acc, test_precision, test_recall, test_auc, test_binary_accuracy, test_f1_score = \ + detector.evaluate_model(test_gen) + assert isinstance(test_acc, (int, float)) + assert isinstance(test_loss, (int, float)) + assert isinstance(test_precision, (int, float)) + assert isinstance(test_recall, (int, float)) + assert isinstance(test_auc, (int, float)) + assert isinstance(test_binary_accuracy, (int, float)) + assert isinstance(test_f1_score, (int, float)) -def test_save_model(detector): - detector.build_model(5) # Change this to the number of classes you have - with tempfile.TemporaryDirectory() as tmpdir: - model_path = os.path.join(tmpdir, 'test_model.h5') - detector.save_model(model_path) - assert os.path.exists(model_path) + +if __name__ == "__main__": + pytest.main() diff --git a/tests/test_predict_view.py b/tests/test_predict_view.py index 
b9e2dd6..f151216 100644 --- a/tests/test_predict_view.py +++ b/tests/test_predict_view.py @@ -1,27 +1,40 @@ -import io -from unittest.mock import MagicMock -from pyramid import testing -from PIL import Image +import os +import random +from unittest.mock import MagicMock, patch from skinvestigatorai.views.predict_view import predict_view, dashboard_view +from pyramid import testing +import pytest + +@pytest.fixture +def mock_is_image_similar(): + with patch( + 'skinvestigatorai.services.feature_extraction_service.FeatureExtractionService.is_image_similar', + return_value=True + ): + yield -def test_predict_view(): + +def test_predict_view(mock_is_image_similar): request = testing.DummyRequest() - img = Image.new('RGB', (150, 150)) - buf = io.BytesIO() - img.save(buf, format='PNG') - buf.seek(0) - # Creating a dummy file upload object - dummy_file_upload = MagicMock() - dummy_file_upload.file = buf - request.POST['image'] = dummy_file_upload + benign_directory = 'data/train/benign' - response = predict_view(request) - assert 'prediction' in response - assert 'confidence' in response - assert response['prediction'] in ['benign', 'malignant'] - assert 0 <= response['confidence'] <= 100 + jpg_files = [f for f in os.listdir(benign_directory) if f.endswith('.JPG')] + + random_file = random.choice(jpg_files) + file_path = os.path.join(benign_directory, random_file) + + with open(file_path, 'rb') as image_file: + dummy_file_upload = MagicMock() + dummy_file_upload.file = image_file + request.POST['image'] = dummy_file_upload + + response = predict_view(request) + assert 'prediction' in response + assert 'confidence' in response + assert response['prediction'] in ['benign', 'malignant'] + assert 0 <= response['confidence'] <= 100 def test_dashboard_view(): diff --git a/tests/test_train.py b/tests/test_train.py deleted file mode 100644 index bd12bf4..0000000 --- a/tests/test_train.py +++ /dev/null @@ -1,31 +0,0 @@ -import pytest -from unittest.mock import MagicMock, patch - 
-# Import the main function from train.py -from skinvestigatorai.core.ai.train import main, train_dir, val_dir, test_dir - -# Mock the SkinCancerDetector class and its methods -@pytest.fixture -def mock_detector(monkeypatch): - with patch('skinvestigatorai.core.ai.train.SkinCancerDetector') as mock: - yield mock - -def test_main(mock_detector): - # Set the return values of the preprocess_data() method - mock_detector.return_value.preprocess_data.return_value = (MagicMock(), MagicMock(), MagicMock()) - - # Run the main function - main('skin_cancer_detection_model_all_GPU.h5') - - # Check if the SkinCancerDetector constructor is called with the correct arguments - mock_detector.assert_called_once_with(train_dir, val_dir, test_dir) - - # Get the detector instance from the constructor call - detector_instance = mock_detector.return_value - - # Check if the instance methods are called in the correct order - detector_instance.preprocess_data.assert_called_once() - detector_instance.build_model.assert_called_once() - detector_instance.train_model.assert_called_once() - detector_instance.evaluate_model.assert_called_once() - detector_instance.save_model.assert_called_once_with('skin_cancer_detection_model_all_GPU.h5')