treizh committed
Commit fc262e7
1 Parent(s): ac0c4c7

Upload folder using huggingface_hub

src/com_augmentations.py ADDED
@@ -0,0 +1,255 @@
+ from pprint import pprint
+
+ import numpy as np
+
+ import cv2
+
+ import albumentations as A
+ from albumentations.pytorch import ToTensorV2
+ from albumentations import ImageOnlyTransform
+
+ import torch
+ from torch.utils.data import Dataset
+
+ import com_image as ci
+ import com_plot as cp
+
+
+ class FixPatchBrightness(ImageOnlyTransform):
+     def __init__(
+         self,
+         brightness_target=115,
+         brightness_thresholds=(115, 130),
+         always_apply: bool = False,
+         p: float = 0.5,
+     ):
+         super().__init__(always_apply, p)
+         self.brightness_target = brightness_target
+         self.brightness_thresholds = brightness_thresholds
+
+     def apply(self, img, brightness_target=None, brightness_thresholds=None, **params):
+         brightness_target = (
+             self.brightness_target if brightness_target is None else brightness_target
+         )
+         brightness_thresholds = (
+             self.brightness_thresholds
+             if brightness_thresholds is None
+             else brightness_thresholds
+         )
+
+         # Perceived brightness averaged over the patch (HSP-style channel weights)
+         r, g, b = cv2.split(img)
+         avg_bright = np.sqrt(
+             0.241 * np.power(r.astype(float), 2)
+             + 0.691 * np.power(g.astype(float), 2)
+             + 0.068 * np.power(b.astype(float), 2)
+         ).mean()
+
+         tmin, tmax = min(*brightness_thresholds), max(*brightness_thresholds)
+
+         if avg_bright < tmin or avg_bright > tmax:
+             if avg_bright > brightness_target:
+                 # Too bright: darken with a gamma-correction lookup table
+                 gamma = brightness_target / avg_bright
+                 if gamma != 1:
+                     inv_gamma = 1.0 / gamma
+                     table = np.array(
+                         [((i / 255.0) ** inv_gamma) * 255 for i in np.arange(0, 256)]
+                     ).astype("uint8")
+                     return cv2.LUT(src=img, lut=table)
+                 else:
+                     return img
+             else:
+                 # Too dark: linear gain/offset scaled toward the target
+                 return cv2.convertScaleAbs(
+                     src=img,
+                     alpha=(brightness_target + avg_bright) / (2 * avg_bright),
+                     beta=(brightness_target - avg_bright) / 2,
+                 )
+         else:
+             return img
+
+
+ def build_albumentations(
+     image_size: int,
+     gamma=(60, 180),
+     brightness_limit=0.15,
+     contrast_limit=0.25,
+     crop=None,
+     center_crop: int = -1,
+     mean=(0.485, 0.456, 0.406),
+     std=(0.229, 0.224, 0.225),
+     brightness_target=None,
+     brightness_thresholds=None,
+     affine_transforms={"H": 0.3, "V": 0.3, "R": 0.3, "T": 0.3},
+ ):
+     albs_ = {"resize": [A.Resize(height=image_size, width=image_size, p=1)]}
+
+     if brightness_target is not None and brightness_thresholds is not None:
+         albs_ = albs_ | {
+             "fix_brightness": [
+                 FixPatchBrightness(
+                     brightness_target=brightness_target,
+                     brightness_thresholds=brightness_thresholds,
+                     p=1,
+                 )
+             ]
+         }
+
+     if crop is not None:
+         if isinstance(crop, int):
+             albs_ = albs_ | {
+                 "crop_and_pad": [
+                     A.RandomCrop(height=crop, width=crop, p=0.5),
+                     A.PadIfNeeded(min_height=image_size, min_width=image_size, p=1),
+                 ]
+             }
+         elif isinstance(crop, dict):
+             crop_val = crop["value"]
+             crop_p = crop["p"]
+             albs_ = albs_ | {
+                 "crop_and_pad": [
+                     A.PadIfNeeded(min_height=crop_val, min_width=crop_val, p=1),
+                     A.RandomCrop(height=crop_val, width=crop_val, p=crop_p),
+                     A.PadIfNeeded(min_height=image_size, min_width=image_size, p=1),
+                 ]
+             }
+
+     if center_crop > -1:
+         albs_ = albs_ | {
+             "center_crop": [
+                 A.PadIfNeeded(min_height=center_crop, min_width=center_crop, p=1),
+                 A.CenterCrop(height=center_crop, width=center_crop, p=1),
+             ]
+         }
+
+     affine = []
+     for k, v in affine_transforms.items():
+         if k == "H":
+             affine.append(A.HorizontalFlip(p=v))
+         elif k == "V":
+             affine.append(A.VerticalFlip(p=v))
+         elif k == "R":
+             affine.append(A.RandomRotate90(p=v))
+         elif k == "T":
+             affine.append(A.Transpose(p=v))
+     albs_ = albs_ | {"affine": affine}
+
+     color = []
+     if brightness_limit is not None and contrast_limit is not None:
+         color.append(
+             A.RandomBrightnessContrast(
+                 brightness_limit=brightness_limit,
+                 contrast_limit=contrast_limit,
+                 p=0.5,
+             )
+         )
+     if gamma is not None:
+         color.append(A.RandomGamma(gamma_limit=gamma, p=0.5))
+
+     albs_ = albs_ | {"color": color}
+
+     return albs_ | {
+         "to_tensor": [A.Normalize(mean=mean, std=std, p=1), ToTensorV2()],
+         "un_normalize": [
+             A.Normalize(
+                 mean=[-m / s for m, s in zip(mean, std)],
+                 std=[1.0 / s for s in std],
+                 always_apply=True,
+                 max_pixel_value=1.0,
+             ),
+         ],
+     }
+
+
+ def get_augmentations(
+     image_size: int = 224,
+     gamma=(60, 180),
+     brightness_limit=0.15,
+     contrast_limit=0.25,
+     crop=180,
+     center_crop: int = -1,
+     kinds: list = ["resize", "to_tensor"],
+     mean=(0.485, 0.456, 0.406),
+     std=(0.229, 0.224, 0.225),
+     brightness_target=None,
+     brightness_thresholds=None,
+     affine_transforms={"H": 0.3, "V": 0.3, "R": 0.3, "T": 0.3},
+ ):
+     kinds = list(kinds)  # copy so neither the caller's list nor the default is mutated
+     if "train" in kinds:
+         kinds.insert(kinds.index("train"), "affine")
+         kinds.insert(kinds.index("train"), "color")
+         kinds.remove("train")
+     td_ = build_albumentations(
+         image_size=image_size,
+         gamma=gamma,
+         brightness_limit=brightness_limit,
+         contrast_limit=contrast_limit,
+         crop=crop,
+         center_crop=center_crop,
+         mean=mean,
+         std=std,
+         brightness_target=brightness_target,
+         brightness_thresholds=brightness_thresholds,
+         affine_transforms=affine_transforms,
+     )
+     augs = []
+     for k in kinds:
+         if k:
+             augs += td_[k]
+     return A.Compose(augs)
+
+
+ class MlcPatches(Dataset):
+     def __init__(self, dataframe, transform, path_to_images) -> None:
+         super().__init__()
+         self.dataframe = dataframe
+         self.transform = transform
+         self.path_to_images = path_to_images
+
+     def __len__(self):
+         return self.dataframe.shape[0]
+
+     def __getitem__(self, index):
+         img = self.transform(image=self.get_image(index=index))["image"]
+         return {"image": img, "labels": torch.tensor([1])}
+
+     def get_image(self, index):
+         return ci.load_image(
+             file_name=self.dataframe.file_name.to_list()[index],
+             path_to_images=self.path_to_images,
+         )
+
+
+ def test_augmentations(
+     df,
+     image_size,
+     path_to_images,
+     columns: list = [],
+     kinds: list = ["resize", "to_tensor"],
+     rows: int = 2,
+     cols: int = 4,
+     **aug_params,
+ ):
+     sample = df.sample(n=1)
+     src_dataset = MlcPatches(
+         dataframe=sample,
+         transform=get_augmentations(
+             image_size=image_size, kinds=["resize", "to_tensor"], **aug_params
+         ),
+         path_to_images=path_to_images,
+     )
+
+     test_dataset = MlcPatches(
+         dataframe=sample,
+         transform=get_augmentations(image_size=image_size, kinds=kinds, **aug_params),
+         path_to_images=path_to_images,
+     )
+     pprint(sample[[c for c in ["file_name"] + columns if c in sample]])
+     cp.tensor_image_to_grid(
+         images=[(src_dataset[0]["image"], "source")]
+         + [(test_dataset[0]["image"], "augmented") for i in range(rows * cols)],
+         transform=get_augmentations(
+             image_size=image_size, kinds=["un_normalize"], **aug_params
+         ),
+         row_count=rows,
+         col_count=cols,
+         figsize=(cols * 4, rows * 4),
+     )
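
A minimal usage sketch for the module above (the dataframe and folder names are illustrative, not part of the upload). Note that "train" in kinds expands to the "affine" and "color" groups, and "fix_brightness" is only present when both brightness_target and brightness_thresholds are given:

    train_tf = get_augmentations(
        image_size=224,
        kinds=["resize", "fix_brightness", "crop_and_pad", "train", "to_tensor"],
        brightness_target=115,
        brightness_thresholds=(115, 130),
    )
    ds = MlcPatches(dataframe=df, transform=train_tf, path_to_images=images_dir)
    sample = ds[0]  # {"image": CxHxW float tensor, "labels": tensor([1])}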
src/com_const.py ADDED
@@ -0,0 +1,17 @@
+ from pathlib import Path
+
+ path_to_here = Path(__file__).resolve().parent
+ path_to_root = path_to_here.parent
+
+ path_to_data = path_to_root.joinpath("data")
+
+ path_to_images = path_to_root.joinpath("images")
+ path_to_plates = path_to_images.joinpath("plates")
+ path_to_leaf_discs = path_to_images.joinpath("leaf_discs")
+ path_to_leaf_patches = path_to_images.joinpath("leaf_patches")
+
+ path_to_checkpoints = path_to_root.joinpath("checkpoints")
+ path_to_chk_detector = path_to_checkpoints.joinpath("leaf_disc_detector")
+ path_to_chk_oiv = path_to_checkpoints.joinpath("oiv_scorer")
+
+ path_to_src = path_to_root.joinpath("src")
src/com_func.py ADDED
@@ -0,0 +1,20 @@
+ import pandas as pd
+
+
+ def ensure_folder(forced_path, return_string: bool = True):
+     path = forced_path.parent
+     if path.is_dir() is False:
+         path.mkdir(parents=True, exist_ok=True)
+     return str(forced_path) if return_string is True else forced_path
+
+
+ def read_dataframe(path, sep=";") -> pd.DataFrame:
+     try:
+         return pd.read_csv(filepath_or_buffer=str(path), sep=sep)
+     except Exception:
+         return None
+
+
+ def write_dataframe(df: pd.DataFrame, path, sep=";") -> pd.DataFrame:
+     df.to_csv(path_or_buf=ensure_folder(path, return_string=True), sep=sep, index=False)
+     return df
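
For reference, a round-trip with the helpers above (the path is illustrative): write_dataframe creates the parent folder on demand, and read_dataframe returns None instead of raising when the file is missing or unparsable:

    from pathlib import Path
    out = Path("data/tmp/table.csv")
    write_dataframe(df, out)
    df_back = read_dataframe(out)  # None if out does not exist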
src/com_image.py ADDED
@@ -0,0 +1,162 @@
+ from pathlib import Path
+ from typing import Any, Union
+
+ import numpy as np
+
+ import cv2
+
+ from PIL import Image, ImageEnhance
+
+
+ def load_image(file_name, path_to_images=None, rgb: bool = True):
+     path = (
+         file_name
+         if isinstance(file_name, Path) is True
+         else path_to_images.joinpath(file_name)
+     )
+
+     img = None  # stays defined even if reading fails
+     try:
+         img = cv2.imread(str(path))
+         if rgb is True:
+             img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+     except Exception:
+         print(file_name)
+     return img
+
+
+ def to_pil(image):
+     return Image.fromarray(image)
+
+
+ def to_cv2(image):
+     return np.array(image)
+
+
+ def enhance_pil_image(
+     image, color=1, brightness=1, contrast=1, sharpness=1
+ ) -> Image.Image:
+     image = ImageEnhance.Sharpness(
+         image=ImageEnhance.Brightness(
+             image=ImageEnhance.Contrast(
+                 image=ImageEnhance.Color(
+                     image=(
+                         image
+                         if isinstance(image, Image.Image) is True
+                         else to_pil(image=image)
+                     )
+                 ).enhance(color)
+             ).enhance(contrast)
+         ).enhance(brightness)
+     ).enhance(sharpness)
+     return image
+
+
+ def ensure_odd(
+     i: int,
+     min_val: Union[None, int] = None,
+     max_val: Union[None, int] = None,
+ ) -> int:
+     """Transforms an even number into an odd number by adding one,
+     then clamps the result to the optional bounds.
+     Arguments:
+         i {int} -- number
+     Returns:
+         int -- odd number
+     """
+     if (i > 0) and (i % 2 == 0):
+         i += 1
+     if min_val is not None:
+         i = max(i, min_val)
+     if max_val is not None:
+         i = min(i, max_val)
+     return i
+
+
+ def get_morphology_kernel(size: int, shape: int):
+     """Builds morphology kernel
+     :param size: kernel size, must be odd number
+     :param shape: select shape of kernel
+     :return: Morphology kernel
+     """
+     size = ensure_odd(size)
+     return cv2.getStructuringElement(shape, (size, size))
+
+
+ def close(
+     image: Any,
+     kernel_size: int = 3,
+     kernel_shape: int = cv2.MORPH_ELLIPSE,
+     rois: tuple = (),
+     proc_times: int = 1,
+ ):
+     """Morphology - Close wrapper
+     Arguments:
+         image {numpy array} -- Source image
+         kernel_size {int} -- kernel size
+         kernel_shape {int} -- cv2 constant
+         rois -- Regions of Interest
+         proc_times {int} -- iterations
+     Returns:
+         numpy array -- closed image
+     """
+     morph_kernel = get_morphology_kernel(kernel_size, kernel_shape)
+     if rois:
+         result = image.copy()
+         for roi in rois:
+             r = roi.as_rect()
+             result[r.top : r.bottom, r.left : r.right] = cv2.morphologyEx(
+                 result[r.top : r.bottom, r.left : r.right],
+                 cv2.MORPH_CLOSE,
+                 morph_kernel,
+                 iterations=proc_times,
+             )
+     else:
+         result = cv2.morphologyEx(
+             image, cv2.MORPH_CLOSE, morph_kernel, iterations=proc_times
+         )
+     return result
+
+
+ def get_concat_h_multi_resize(im_list, resample=Image.Resampling.BICUBIC):
+     min_height = min(im.height for im in im_list)
+     im_list_resize = [
+         im.resize(
+             (int(im.width * min_height / im.height), min_height), resample=resample
+         )
+         for im in im_list
+     ]
+     total_width = sum(im.width for im in im_list_resize)
+     dst = Image.new("RGB", (total_width, min_height))
+     pos_x = 0
+     for im in im_list_resize:
+         dst.paste(im, (pos_x, 0))
+         pos_x += im.width
+     return dst
+
+
+ def get_concat_v_multi_resize(im_list, resample=Image.Resampling.BICUBIC):
+     min_width = min(im.width for im in im_list)
+     im_list_resize = [
+         im.resize((min_width, int(im.height * min_width / im.width)), resample=resample)
+         for im in im_list
+     ]
+     total_height = sum(im.height for im in im_list_resize)
+     dst = Image.new("RGB", (min_width, total_height))
+     pos_y = 0
+     for im in im_list_resize:
+         dst.paste(im, (0, pos_y))
+         pos_y += im.height
+     return dst
+
+
+ def get_concat_tile_resize(im_list_2d, resample=Image.Resampling.BICUBIC):
+     im_list_v = [
+         get_concat_h_multi_resize(im_list_h, resample=resample)
+         for im_list_h in im_list_2d
+     ]
+     return get_concat_v_multi_resize(im_list_v, resample=resample)
+
+
+ def get_tiles(img_list, row_count, resample=Image.Resampling.BICUBIC):
+     if isinstance(img_list, np.ndarray) is False:
+         img_list = np.asarray(img_list, dtype="object")
+     return get_concat_tile_resize(np.split(img_list, row_count), resample)
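
A small sketch of the tiling helpers above (file names are illustrative; four images split evenly into two rows, since np.split requires an even division):

    imgs = [to_pil(load_image(f, path_to_images=images_dir)) for f in file_names[:4]]
    sheet = get_tiles(imgs, row_count=2)  # per-row resize, then horizontal/vertical concat
    sheet.save("contact_sheet.png")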
src/com_plot.py ADDED
@@ -0,0 +1,54 @@
+ import matplotlib.pyplot as plt
+
+
+ def _update_axis(
+     axis, image, title=None, fontsize=18, remove_axis=True, title_loc="center"
+ ):
+     axis.imshow(image, origin="upper")
+     if title is not None:
+         axis.set_title(title, fontsize=fontsize, loc=title_loc)
+     if remove_axis is True:
+         axis.set_axis_off()
+
+
+ def tensor_image_to_grid(
+     images: list,
+     transform,
+     row_count,
+     col_count=None,
+     figsize=(20, 20),
+     fontsize=None,
+ ):
+     def split_image_title(image):
+         if isinstance(image, tuple):
+             return image[0], image[1]
+         else:
+             return image, None
+
+     def torch_to_image(t):
+         return transform(image=t.permute(1, 2, 0).numpy())["image"]
+
+     col_count = row_count if col_count is None else col_count
+     if len(images) == 1:
+         img, title = split_image_title(images[0])
+         plt.imshow(torch_to_image(img))
+         if title is not None:
+             plt.title(title)
+         plt.tight_layout()
+         plt.axis("off")
+     else:
+         _, axii = plt.subplots(row_count, col_count, figsize=figsize)
+         for ax, image in zip(axii.reshape(-1), images):
+             try:
+                 img, title = split_image_title(image)
+                 _update_axis(
+                     axis=ax,
+                     image=torch_to_image(img),
+                     remove_axis=True,
+                     title=title,
+                     fontsize=figsize[0] if fontsize is None else fontsize,
+                 )
+             except Exception:
+                 pass
+
+     plt.tight_layout()
+     plt.show()
src/leaf_patch_annotation.ipynb ADDED
@@ -0,0 +1,552 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## 202311 Dataset Annotation"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Imports"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "%load_ext autoreload\n",
+     "%autoreload 2"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from pathlib import Path\n",
+     "import warnings\n",
+     "from datetime import datetime as dt\n",
+     "import inspect\n",
+     "\n",
+     "import pandas as pd\n",
+     "import numpy as np\n",
+     "\n",
+     "import altair as alt\n",
+     "import plotly.express as px\n",
+     "\n",
+     "from PIL import Image, ImageEnhance\n",
+     "\n",
+     "from siuba import _ as s\n",
+     "from siuba import filter as sfilter\n",
+     "from siuba import mutate\n",
+     "\n",
+     "import panel as pn\n",
+     "\n",
+     "import com_const as cc\n",
+     "import com_func as cf"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Setup"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "warnings.simplefilter(action=\"ignore\", category=UserWarning)\n",
+     "warnings.simplefilter(action=\"ignore\", category=FutureWarning)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "pd.set_option(\"display.max_colwidth\", 500)\n",
+     "pd.set_option(\"display.max_columns\", 500)\n",
+     "pd.set_option(\"display.width\", 1000)\n",
+     "pd.set_option(\"display.max_rows\", 16)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "alt.data_transformers.disable_max_rows();"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "pn.extension(\n",
+     "    \"plotly\", \"terminal\", \"tabulator\", \"vega\", notifications=True, console_output=\"disable\"\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "template = pn.template.BootstrapTemplate(title=\"OIV Annotation Tool\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Globals"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "current_row = None"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "image_quality_options = {\n",
+     "    \"?\": np.nan,\n",
+     "    \"good\": \"good_images\",\n",
+     "    \"crop\": \"crop_images\",\n",
+     "    \"missing\": \"missing_images\",\n",
+     "    \"dark\": \"dark_images\",\n",
+     "    \"blur\": \"blur_images\",\n",
+     "    \"color\": \"color_images\",\n",
+     "    \"water\": \"water_images\",\n",
+     "}"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Load Data"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "df = cf.read_dataframe(path=cc.path_to_data.joinpath(\"oiv_annotation.csv\")).sort_values(\n",
+     "    [\"experiment\", \"inoc\", \"dpi\", \"plaque\", \"row\", \"col\"]\n",
+     ")\n",
+     "if \"seen_at\" not in df:\n",
+     "    df = df >> mutate(seen_at=np.nan)\n",
+     "df.seen_at = pd.to_datetime(df.seen_at)\n",
+     "df = df.set_index(\"file_name\")\n",
+     "df"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Functions"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def update_image(image_name: str, color, brightness, contrast, sharpness):\n",
+     "    image_path = cc.path_to_leaf_patches.joinpath(image_name)\n",
+     "    if image_path.is_file() is False:\n",
+     "        # magenta placeholder when the patch file is missing\n",
+     "        fig = px.imshow(\n",
+     "            np.array(\n",
+     "                [\n",
+     "                    [[255, 0, 255], [255, 0, 255], [255, 0, 255]],\n",
+     "                    [[255, 0, 255], [255, 0, 255], [255, 0, 255]],\n",
+     "                    [[255, 0, 255], [255, 0, 255], [255, 0, 255]],\n",
+     "                ],\n",
+     "                dtype=np.uint8,\n",
+     "            )\n",
+     "        )\n",
+     "    else:\n",
+     "        image = Image.open(image_path)\n",
+     "        image = ImageEnhance.Color(image=image).enhance(color)\n",
+     "        image = ImageEnhance.Contrast(image=image).enhance(contrast)\n",
+     "        image = ImageEnhance.Brightness(image=image).enhance(brightness)\n",
+     "        image = ImageEnhance.Sharpness(image=image).enhance(sharpness)\n",
+     "        fig = px.imshow(image)\n",
+     "    fig.update_layout(coloraxis_showscale=False)\n",
+     "    fig.update_xaxes(showticklabels=False)\n",
+     "    fig.update_yaxes(showticklabels=False)\n",
+     "    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))\n",
+     "    return fig"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def plot_classes(df_: pd.DataFrame, var: str):\n",
+     "    d = pd.DataFrame(\n",
+     "        data={\n",
+     "            var: df_[var]\n",
+     "            .fillna(\"?\")\n",
+     "            .astype(str)\n",
+     "            .str.replace(\".0\", \"\")\n",
+     "            .str.replace(\"images\", \"\")\n",
+     "        }\n",
+     "    )\n",
+     "    bars = (\n",
+     "        alt.Chart(d)\n",
+     "        .mark_bar()\n",
+     "        .encode(\n",
+     "            y=alt.Y(var, title=None),\n",
+     "            x=alt.X(\"count()\", axis=None),\n",
+     "            color=alt.Color(var, legend=None),\n",
+     "        )\n",
+     "    )\n",
+     "    text = bars.mark_text(align=\"center\", dy=0, dx=12).encode(\n",
+     "        y=alt.Y(var, title=None),\n",
+     "        x=alt.X(\"count()\", axis=None),\n",
+     "        color=alt.Color(var, legend=None),\n",
+     "        text=\"count()\",\n",
+     "    )\n",
+     "\n",
+     "    return (bars + text).configure_view(stroke=None).configure_axis(grid=False)"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Widgets"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "img_current = pn.pane.Plotly(height=750, align=(\"center\", \"center\"))\n",
+     "mkd_current = pn.pane.Markdown(sizing_mode=\"scale_width\", align=\"center\")\n",
+     "\n",
+     "sl_contrast = pn.widgets.EditableFloatSlider(\n",
+     "    name=\"Contrast\", start=0.0, end=7.5, value=1.5, step=0.1, sizing_mode=\"scale_width\"\n",
+     ")\n",
+     "sl_color = pn.widgets.EditableFloatSlider(\n",
+     "    name=\"Color\", start=0.0, end=5.0, value=1.0, step=0.1, sizing_mode=\"scale_width\"\n",
+     ")\n",
+     "sl_brightness = pn.widgets.EditableFloatSlider(\n",
+     "    name=\"Brightness\",\n",
+     "    start=0.0,\n",
+     "    end=5.0,\n",
+     "    value=1.0,\n",
+     "    step=0.1,\n",
+     "    sizing_mode=\"scale_width\",\n",
+     ")\n",
+     "sl_sharpness = pn.widgets.EditableFloatSlider(\n",
+     "    name=\"Sharpness\", start=0.0, end=2.0, value=1.5, step=0.1, sizing_mode=\"scale_width\"\n",
+     ")\n",
+     "\n",
+     "c_image_processing = pn.Card(\n",
+     "    pn.Column(sl_brightness, sl_color, sl_contrast, sl_sharpness),\n",
+     "    title=\"Image Processing Options\",\n",
+     "    sizing_mode=\"scale_width\",\n",
+     ")\n",
+     "\n",
+     "pg_progress = pn.widgets.Tqdm(name=\"Progress\", align=\"center\", max=len(df))\n",
+     "\n",
+     "rgb_oiv = pn.widgets.RadioButtonGroup(\n",
+     "    name=\"OIV\",\n",
+     "    options=[\"?\", 1, 3, 5, 7, 9],\n",
+     "    button_style=\"outline\",\n",
+     "    button_type=\"success\",\n",
+     ")\n",
+     "\n",
+     "rgb_source = pn.widgets.RadioButtonGroup(\n",
+     "    name=\"Image quality\",\n",
+     "    options=list(image_quality_options.keys()),\n",
+     "    button_style=\"outline\",\n",
+     "    button_type=\"success\",\n",
+     "    value=\"?\",\n",
+     ")\n",
+     "\n",
+     "sel_def_img_quality = pn.widgets.Select(\n",
+     "    name=\"Default Image Quality\", options=list(image_quality_options.keys())\n",
+     ")\n",
+     "\n",
+     "mc_filter_quality = pn.widgets.MultiChoice(\n",
+     "    name=\"Allow qualities\",\n",
+     "    options=list(image_quality_options.values()),\n",
+     "    value=list(image_quality_options.values()),\n",
+     ")\n",
+     "\n",
+     "rgb_target = pn.widgets.RadioButtonGroup(\n",
+     "    name=\"Annotation target\",\n",
+     "    options=[\"All\", \"OIV\", \"Image quality\"],\n",
+     "    button_style=\"outline\",\n",
+     "    button_type=\"success\",\n",
+     "    value=\"All\",\n",
+     ")\n",
+     "\n",
+     "c_anno_options = pn.Card(\n",
+     "    pn.Column(\n",
+     "        pn.Row(pn.pane.Markdown(\"**Annotate**\"), rgb_target),\n",
+     "        sel_def_img_quality,\n",
+     "        mc_filter_quality,\n",
+     "    ),\n",
+     "    title=\"Annotation Options\",\n",
+     "    sizing_mode=\"scale_width\",\n",
+     ")\n",
+     "\n",
+     "pn_hist_oiv = pn.pane.Vega()\n",
+     "pn_hist_source = pn.pane.Vega()\n",
+     "\n",
+     "c_hists = pn.Card(\n",
+     "    pn.Column(\n",
+     "        pn.pane.Markdown(\"### OIV\"),\n",
+     "        pn_hist_oiv,\n",
+     "        pn.pane.Markdown(\"### Image Quality\"),\n",
+     "        pn_hist_source,\n",
+     "    ),\n",
+     "    title=\"Annotation Overview\",\n",
+     "    sizing_mode=\"scale_width\",\n",
+     ")\n",
+     "\n",
+     "sw_ui_state = pn.widgets.Switch(name=\"active\", value=False)\n",
+     "alt_ui_state = pn.pane.Alert(\"Annotations will be stored\", alert_type=\"primary\")\n",
+     "\n",
+     "pn_ui_state = pn.Row(sw_ui_state, alt_ui_state)\n",
+     "\n",
+     "\n",
+     "bt_next = pn.widgets.Button(name=\"Next\", button_type=\"primary\")\n",
+     "bt_previous = pn.widgets.Button(name=\"Previous\", button_type=\"primary\")\n",
+     "\n",
+     "ui_annotation = pn.GridSpec(sizing_mode=\"scale_width\", align=\"center\", max_height=120)\n",
+     "\n",
+     "ui_annotation[1, 0] = bt_previous\n",
+     "ui_annotation[0, 1:5] = rgb_source\n",
+     "ui_annotation[1, 1:5] = rgb_oiv\n",
+     "ui_annotation[1, 5] = bt_next"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Callbacks"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "@pn.depends(\n",
+     "    sl_color.param.value,\n",
+     "    sl_contrast.param.value,\n",
+     "    sl_brightness.param.value,\n",
+     "    sl_sharpness.param.value,\n",
+     "    watch=True,\n",
+     ")\n",
+     "def on_preprocess_changed(color, contrast, brightness, sharpness):\n",
+     "    img_current.object = update_image(\n",
+     "        image_name=current_row.file_name,\n",
+     "        color=color,\n",
+     "        brightness=brightness,\n",
+     "        contrast=contrast,\n",
+     "        sharpness=sharpness,\n",
+     "    )\n",
+     "\n",
+     "\n",
+     "def update_ui_state(ui_state: bool):\n",
+     "    if ui_state is True:\n",
+     "        alt_ui_state.object = \"Annotations will be stored\"\n",
+     "        alt_ui_state.alert_type = \"primary\"\n",
+     "    else:\n",
+     "        alt_ui_state.object = \"Annotations will be discarded\"\n",
+     "        alt_ui_state.alert_type = \"danger\"\n",
+     "\n",
+     "\n",
+     "@pn.depends(sw_ui_state, watch=True)\n",
+     "def on_ui_state_changed(new_state: bool):\n",
+     "    update_ui_state(new_state)\n",
+     "\n",
+     "\n",
+     "def select_next(event):\n",
+     "    global current_row\n",
+     "    global df\n",
+     "    now = dt.now()\n",
+     "    if current_row is not None and (event is None or event.obj.name == \"Next\"):\n",
+     "        if rgb_target.value in [\"All\", \"OIV\"] and rgb_oiv.value != \"?\":\n",
+     "            df.at[current_row.file_name, \"oiv\"] = int(rgb_oiv.value)\n",
+     "            df.at[current_row.file_name, \"oiv_annotated_at\"] = now\n",
+     "\n",
+     "        if rgb_target.value in [\"All\", \"Image quality\"] and rgb_source.value != \"?\":\n",
+     "            df.at[current_row.file_name, \"source_annotated_at\"] = now\n",
+     "            df.at[current_row.file_name, \"source\"] = image_quality_options[\n",
+     "                rgb_source.value\n",
+     "            ]\n",
+     "        cf.write_dataframe(\n",
+     "            df=df.reset_index(),\n",
+     "            path=cc.path_to_data.joinpath(\n",
+     "                \"oiv_annotation.csv\" if sw_ui_state.value is True else \"oiv_annotation_test.csv\"\n",
+     "            ),\n",
+     "        )\n",
+     "        df.at[current_row.file_name, \"seen_at\"] = now\n",
+     "\n",
+     "    df_cr = df >> sfilter(s.source.isin(mc_filter_quality.value))\n",
+     "\n",
+     "    if rgb_target.value == \"All\":\n",
+     "        df_cr = df_cr >> sfilter(s.oiv.isna() | s.source.isna())\n",
+     "    elif rgb_target.value == \"OIV\":\n",
+     "        df_cr = df_cr >> sfilter(s.oiv.isna())\n",
+     "    if rgb_target.value == \"Image quality\":\n",
+     "        df_cr = df_cr >> sfilter(s.source.isna())\n",
+     "    remaining = len(df_cr)\n",
+     "    if event is None or event.obj.name == \"Next\":\n",
+     "        df_cr = df_cr.reset_index()\n",
+     "        current_row = df_cr.sample(n=1).iloc[0] if len(df_cr) > 0 else None\n",
+     "    elif event.obj.name == \"Previous\":\n",
+     "        current_row = (\n",
+     "            (df.reset_index() >> sfilter(~s.seen_at.isna()))\n",
+     "            .sort_values(\"seen_at\", ascending=False)\n",
+     "            .reset_index(drop=True)\n",
+     "            .iloc[0]\n",
+     "        )\n",
+     "        df.at[current_row.file_name, \"seen_at\"] = None\n",
+     "\n",
+     "    if current_row is not None:\n",
+     "        rgb_source.value = (\n",
+     "            sel_def_img_quality.value\n",
+     "            if pd.isnull(current_row.source)\n",
+     "            else {v: k for k, v in image_quality_options.items()}[current_row.source]\n",
+     "        )\n",
+     "        rgb_oiv.value = (\n",
+     "            current_row.oiv if current_row.oiv in [1, 3, 5, 7, 9] else \"?\"\n",
+     "        )\n",
+     "\n",
+     "    pg_progress.value = len(df) - remaining\n",
+     "    file_name = current_row.file_name if current_row is not None else \"\"\n",
+     "    img_current.object = update_image(\n",
+     "        image_name=file_name,\n",
+     "        color=sl_color.value,\n",
+     "        brightness=sl_brightness.value,\n",
+     "        contrast=sl_contrast.value,\n",
+     "        sharpness=sl_sharpness.value,\n",
+     "    )\n",
+     "    mkd_current.object = f\"## {file_name}\"\n",
+     "    df_unf = df >> sfilter(s.source.isin(mc_filter_quality.value))\n",
+     "    pn_hist_source.object = plot_classes(df_unf, \"source\")\n",
+     "    pn_hist_oiv.object = plot_classes(df_unf, \"oiv\")\n",
+     "\n",
+     "\n",
+     "@pn.depends(rgb_target, watch=True)\n",
+     "def on_target_changed(target):\n",
+     "    rgb_oiv.disabled = target == \"Image quality\"\n",
+     "    rgb_source.disabled = target == \"OIV\"\n",
+     "\n",
+     "\n",
+     "# @pn.depends(rgb_oiv, watch=True)\n",
+     "# def on_oiv_changed(_):\n",
+     "#     select_next(None)\n",
+     "\n",
+     "\n",
+     "bt_next.on_click(select_next)\n",
+     "bt_previous.on_click(select_next)\n",
+     "\n",
+     "update_ui_state(sw_ui_state.value)\n",
+     "select_next(None)"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## UI"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "template.sidebar.append(pn_ui_state)\n",
+     "template.sidebar.append(c_image_processing)\n",
+     "template.sidebar.append(c_anno_options)\n",
+     "\n",
+     "template.main.append(\n",
+     "    pn.Row(\n",
+     "        pn.Column(\n",
+     "            # mkd_current,\n",
+     "            img_current,\n",
+     "            ui_annotation,\n",
+     "        ),\n",
+     "        pn.Column(c_hists, pg_progress),\n",
+     "    )\n",
+     ")\n",
+     "\n",
+     "template.servable()"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "# Please launch with the command \"panel serve leaf_patch_annotation.ipynb --show --dev\" from the \"src\" folder"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "env",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.9.2"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
src/leaf_patch_extractor.ipynb ADDED
@@ -0,0 +1,470 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "# Extract Leaf Patches From Plates"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Imports"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "%load_ext autoreload\n",
+     "%autoreload 2"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from datetime import datetime as dt\n",
+     "import warnings\n",
+     "import random\n",
+     "\n",
+     "from tqdm import tqdm\n",
+     "\n",
+     "import cv2\n",
+     "\n",
+     "import pandas as pd\n",
+     "\n",
+     "from siuba import _ as s\n",
+     "from siuba import filter as sfilter\n",
+     "from siuba import mutate, select, if_else\n",
+     "\n",
+     "import panel as pn\n",
+     "\n",
+     "import torch\n",
+     "\n",
+     "from pytorch_lightning.callbacks import (\n",
+     "    RichProgressBar,\n",
+     "    ModelCheckpoint,\n",
+     "    LearningRateMonitor,\n",
+     ")\n",
+     "from pytorch_lightning import Trainer\n",
+     "from pytorch_lightning.callbacks.early_stopping import EarlyStopping\n",
+     "from pytorch_lightning.loggers import TensorBoardLogger\n",
+     "\n",
+     "\n",
+     "import com_const as cc\n",
+     "import com_image as ci\n",
+     "import com_func as cf\n",
+     "import leaf_patch_extractor_model as lpem"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Setup"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "warnings.simplefilter(action=\"ignore\", category=UserWarning)\n",
+     "warnings.simplefilter(action=\"ignore\", category=FutureWarning)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "pd.set_option(\"display.max_colwidth\", 500)\n",
+     "pd.set_option(\"display.max_columns\", 500)\n",
+     "pd.set_option(\"display.width\", 1000)\n",
+     "pd.set_option(\"display.max_rows\", 16)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "pn.extension(notifications=True, console_output=\"disable\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Train Disc Detector"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "### Load Datasets"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "train, val, test = [\n",
+     "    cf.read_dataframe(cc.path_to_data.joinpath(f\"ldd_{d}.csv\"))\n",
+     "    for d in [\"train\", \"val\", \"test\"]\n",
+     "]\n",
+     "\n",
+     "print(len(train), len(test), len(val))"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "### Test Augmentations"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# aug_ = lpem.get_augmentations(image_size=10, kinds=[\"resize\", \"train\"])\n",
+     "\n",
+     "# test_aug_dataset = lpem.LeafDiskDetectorDataset(csv=train, transform=aug_)\n",
+     "\n",
+     "# file_name = train.sample(n=1).plate_name.to_list()[0]\n",
+     "\n",
+     "# print(aug_[0].width, aug_[0].height)\n",
+     "\n",
+     "# lpem.make_patches_grid(\n",
+     "#     images=[\n",
+     "#         test_aug_dataset.draw_image_with_boxes(plate_name=file_name) for _ in range(12)\n",
+     "#     ],\n",
+     "#     row_count=3,\n",
+     "#     col_count=4,\n",
+     "#     figsize=(12, 6),\n",
+     "# )"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "### Train"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# model = lpem.LeafDiskDetector(\n",
+     "#     batch_size=15,\n",
+     "#     learning_rate=7.0e-05,\n",
+     "#     image_factor=10,\n",
+     "#     max_epochs=1000,\n",
+     "#     train_data=train,\n",
+     "#     val_data=val,\n",
+     "#     test_data=test,\n",
+     "#     augmentations_kinds=[\"resize\", \"train\", \"to_tensor\"],\n",
+     "#     augmentations_params={\"gamma\": (60, 180)},\n",
+     "#     num_workers=2,\n",
+     "#     accumulate_grad_batches=5,\n",
+     "#     scheduler=\"steplr\",\n",
+     "#     scheduler_params={\"step_size\": 10, \"gamma\": 0.80},\n",
+     "# )\n",
+     "\n",
+     "# model.eval()\n",
+     "# len(model(torch.rand(2, 3, 128, 128)))\n",
+     "\n",
+     "# model.hr_desc()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# trainer = Trainer(\n",
+     "#     default_root_dir=str(cc.path_to_chk_detector),\n",
+     "#     logger=TensorBoardLogger(\n",
+     "#         save_dir=str(cc.path_to_chk_detector),\n",
+     "#         version=model.model_name + \"_\" + dt.now().strftime(\"%Y%m%d_%H%M%S\"),\n",
+     "#         name=\"lightning_logs\",\n",
+     "#     ),\n",
+     "#     accelerator=\"gpu\",\n",
+     "#     max_epochs=model.max_epochs,\n",
+     "#     log_every_n_steps=5,\n",
+     "#     callbacks=[\n",
+     "#         RichProgressBar(),\n",
+     "#         EarlyStopping(monitor=\"val_loss\", mode=\"min\", patience=10, min_delta=0.0005),\n",
+     "#         ModelCheckpoint(\n",
+     "#             save_top_k=1,\n",
+     "#             monitor=\"val_loss\",\n",
+     "#             auto_insert_metric_name=True,\n",
+     "#             filename=model.model_name\n",
+     "#             + \"-{val_loss:.3f}-{epoch}-{train_loss:.3f}-{step}\",\n",
+     "#         ),\n",
+     "#         LearningRateMonitor(logging_interval=\"epoch\"),\n",
+     "#     ],\n",
+     "#     accumulate_grad_batches=model.accumulate_grad_batches,\n",
+     "# )\n",
+     "\n",
+     "# trainer.fit(model)"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Extract Patches"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "### Load Model"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "ld_model: lpem.LeafDiskDetector = lpem.LeafDiskDetector.load_from_checkpoint(\n",
+     "    cc.path_to_chk_detector.joinpath(\"leaf_disc_detector.ckpt\")\n",
+     ")\n",
+     "ld_model.hr_desc()"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "### Predict All Bounding Boxes"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "bb_predictions_path = cc.path_to_data.joinpath(\"train_ld_bounding_boxes.csv\")\n",
+     "\n",
+     "bb_predictions = (\n",
+     "    cf.read_dataframe(bb_predictions_path)\n",
+     "    if bb_predictions_path.is_file() is True\n",
+     "    else pd.DataFrame()\n",
+     ")\n",
+     "\n",
+     "bb_predictions"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "plates = list(cc.path_to_plates.rglob(\"*.JPG\"))\n",
+     "len(plates)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "errors = []\n",
+     "handled_plates = (\n",
+     "    bb_predictions.file_name.unique() if \"file_name\" in bb_predictions else []\n",
+     ")\n",
+     "\n",
+     "for plate in tqdm(plates):\n",
+     "    if plate.name in handled_plates:\n",
+     "        continue\n",
+     "    try:\n",
+     "        current_data = ld_model.index_plate(plate) >> mutate(\n",
+     "            disc_name=s.file_name.str.replace(\" \", \"\").replace(\".JPG\", \"\")\n",
+     "            + \"_\"\n",
+     "            + s.row.astype(str)\n",
+     "            + \"_\"\n",
+     "            + s.col.astype(str)\n",
+     "            + \".png\"\n",
+     "        )\n",
+     "        bb_predictions = pd.concat([bb_predictions, current_data])\n",
+     "    except Exception:\n",
+     "        errors.append(plate)\n",
+     "\n",
+     "print(errors)\n",
+     "cf.write_dataframe(\n",
+     "    bb_predictions.sort_values([\"file_name\", \"col\", \"row\"]).reset_index(drop=True)\n",
+     "    >> mutate(disc_name=s.disc_name.str.replace(\".JPG\", \"\")),\n",
+     "    bb_predictions_path,\n",
+     ")\n",
+     "\n",
+     "bb_predictions = cf.read_dataframe(bb_predictions_path)\n",
+     "bb_predictions"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "selected_image = random.choice(plates)\n",
+     "bboxes = bb_predictions >> sfilter(s.file_name == selected_image.name)\n",
+     "pn.Column(\n",
+     "    pn.pane.Markdown(f\"### {selected_image.name}\"),\n",
+     "    pn.pane.DataFrame(bboxes),\n",
+     "    pn.pane.Image(\n",
+     "        ci.to_pil(\n",
+     "            lpem.print_boxes(\n",
+     "                image_name=selected_image,\n",
+     "                boxes=bboxes,\n",
+     "                draw_first_line=True,\n",
+     "                return_plot=False,\n",
+     "            )\n",
+     "        ),\n",
+     "        sizing_mode=\"scale_width\",\n",
+     "    ),\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "### Extract Needed Patches"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "#### Model Training"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "df_model_training = pd.concat(\n",
+     "    [\n",
+     "        cf.read_dataframe(cc.path_to_data.joinpath(f\"oiv_{d}.csv\"))\n",
+     "        for d in [\"train\", \"val\", \"test\"]\n",
+     "    ]\n",
+     ").sort_values([\"file_name\"]).reset_index(drop=True)\n",
+     "df_model_training"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "err = {}\n",
+     "\n",
+     "for file_name in tqdm(df_model_training.file_name):\n",
+     "    row = (bb_predictions >> sfilter(s.disc_name == file_name)).reset_index(drop=True)\n",
+     "    lpem.handle_bbox(\n",
+     "        row.iloc[0],\n",
+     "        add_process_image=True,\n",
+     "        paths=dict(\n",
+     "            segmented_leaf_disc=cc.path_to_leaf_discs,\n",
+     "            leaf_disc_patch=cc.path_to_leaf_patches,\n",
+     "            plates=cc.path_to_plates,\n",
+     "        ),\n",
+     "        errors=err,\n",
+     "    )\n",
+     "err"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "#### Genotype differentiation"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "df_gd = cf.read_dataframe(\n",
+     "    cc.path_to_data.joinpath(\"genotype_differenciation_dataset.csv\")\n",
+     ")\n",
+     "df_gd"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "err = {}\n",
+     "\n",
+     "for file_name in tqdm(df_gd.file_name):\n",
+     "    row = (bb_predictions >> sfilter(s.disc_name == file_name)).reset_index(drop=True)\n",
+     "    lpem.handle_bbox(\n",
+     "        row.iloc[0],\n",
+     "        add_process_image=True,\n",
+     "        paths=dict(\n",
+     "            segmented_leaf_disc=cc.path_to_leaf_discs,\n",
+     "            leaf_disc_patch=cc.path_to_leaf_patches,\n",
+     "            plates=cc.path_to_plates,\n",
+     "        ),\n",
+     "        errors=err,\n",
+     "    )\n",
+     "err"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "env",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.9.2"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
src/leaf_patch_extractor_model.py ADDED
@@ -0,0 +1,1292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import math
3
+
4
+ from rich.console import Console
5
+ from rich.table import Table
6
+ from rich.pretty import Pretty
7
+
8
+ import numpy as np
9
+
10
+ import pandas as pd
11
+
12
+ import cv2
13
+
14
+ from sklearn.cluster import MeanShift
15
+
16
+ from skimage.transform import hough_circle, hough_circle_peaks
17
+
18
+
19
+ import torch
20
+ from torch.utils.data import Dataset, DataLoader
21
+ from torchvision import transforms
22
+ from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
23
+
24
+ from torchvision.models.detection import (
25
+ fasterrcnn_resnet50_fpn_v2,
26
+ FasterRCNN_ResNet50_FPN_V2_Weights,
27
+ )
28
+
29
+ import pytorch_lightning as pl
30
+ from pytorch_lightning.callbacks import RichProgressBar
31
+ from pytorch_lightning import Trainer
32
+
33
+ import albumentations as A
34
+ from albumentations.pytorch.transforms import ToTensorV2
35
+
36
+ import matplotlib.pyplot as plt
37
+
38
+ import com_const as cc
39
+ import com_image as ci
40
+
41
+ g_device = (
42
+ "mps"
43
+ if torch.backends.mps.is_built() is True
44
+ else "cuda" if torch.backends.cuda.is_built() else "cpu"
45
+ )
46
+
47
+
48
+ def load_tray_image(image_name):
49
+ return ci.load_image(
50
+ file_name=image_name, path_to_images=cc.path_to_plates, rgb=True
51
+ )
52
+
53
+
54
+ def build_albumentations(
55
+ image_size: int = 10,
56
+ gamma=(60, 180),
57
+ mean=(0.485, 0.456, 0.406),
58
+ std=(0.229, 0.224, 0.225),
59
+ ):
60
+ return {
61
+ "resize": [
62
+ A.Resize(height=image_size * 32 * 2, width=image_size * 32 * 3, p=1)
63
+ ],
64
+ "train": [
65
+ A.HorizontalFlip(p=0.3),
66
+ A.RandomBrightnessContrast(
67
+ brightness_limit=0.25, contrast_limit=0.25, p=0.5
68
+ ),
69
+ A.RandomGamma(gamma_limit=gamma, p=0.5),
70
+ ],
71
+ "to_tensor": [A.Normalize(mean=mean, std=std, p=1), ToTensorV2()],
72
+ "un_normalize": [
73
+ A.Normalize(
74
+ mean=[-m / s for m, s in zip(mean, std)],
75
+ std=[1.0 / s for s in std],
76
+ always_apply=True,
77
+ max_pixel_value=1.0,
78
+ ),
79
+ ],
80
+ }
81
+
82
+
83
+ def get_augmentations(
84
+ image_size: int = 10,
85
+ gamma=(60, 180),
86
+ kinds: list = ["resize", "to_tensor"],
87
+ mean=(0.485, 0.456, 0.406),
88
+ std=(0.229, 0.224, 0.225),
89
+ inferrence: bool = False,
90
+ ):
91
+ td_ = build_albumentations(
92
+ image_size=image_size,
93
+ gamma=gamma,
94
+ mean=mean,
95
+ std=std,
96
+ )
97
+ augs = []
98
+ for k in kinds:
99
+ augs += td_[k]
100
+ if inferrence is True:
101
+ return A.Compose(augs)
102
+ else:
103
+ return A.Compose(
104
+ augs,
105
+ bbox_params={"format": "pascal_voc", "label_fields": ["labels"]},
106
+ )
107
+
108
+
109
+ def safe_row_col(row, col):
110
+ """Ensures that row is a string and col is an integer
111
+ Args:
112
+ row (int or str): row output must be string
113
+ col (int or str): col output must be int
114
+ """
115
+ if row is not None and col is not None:
116
+ if isinstance(col, str):
117
+ row, col = col, row
118
+ return row, col
119
+
120
+
121
+ def _update_axis(axis, image, title=None, fontsize=10, remove_axis=True):
122
+ axis.imshow(image, origin="upper")
123
+ if title is not None:
124
+ axis.set_title(title, fontsize=fontsize)
125
+
126
+
127
+ def make_patches_grid(images, row_count, col_count=None, figsize=(20, 20)):
128
+ col_count = row_count if col_count is None else col_count
129
+ _, axii = plt.subplots(row_count, col_count, figsize=figsize)
130
+ for ax, image in zip(axii.reshape(-1), images):
131
+ if isinstance(image, tuple):
132
+ title = image[1]
133
+ image = image[0]
134
+ else:
135
+ title = None
136
+ try:
137
+ _update_axis(axis=ax, image=image, remove_axis=True, title=title)
138
+ except:
139
+ pass
140
+ ax.set_axis_off()
141
+
142
+ plt.tight_layout()
143
+ plt.show()
144
+
145
+
146
+ def print_boxes(
147
+ image_name,
148
+ boxes,
149
+ highlight=(None, None),
150
+ draw_first_line: bool = False,
151
+ return_plot: bool = True,
152
+ ):
153
+ r, c = safe_row_col(*highlight)
154
+ image = load_tray_image(image_name=image_name)
155
+
156
+ fnt = cv2.FONT_HERSHEY_SIMPLEX
157
+ fnt_scale = 3
158
+ fnt_thickness = 8
159
+
160
+ column_colors = {
161
+ 1: (255, 0, 0),
162
+ 2: (0, 0, 255),
163
+ 3: (255, 255, 0),
164
+ 4: (0, 255, 255),
165
+ }
166
+
167
+ for box in boxes[["x1", "y1", "x2", "y2", "cx", "cy", "row", "col"]].values:
168
+ color = (
169
+ (255, 0, 255)
170
+ if c == box[7] and r == box[6]
171
+ else column_colors.get(box[7], (255, 255, 244))
172
+ )
173
+ thickness = 20 if c == box[7] and r == box[6] else 10
174
+ image = cv2.rectangle(
175
+ image,
176
+ (int(box[0]), int(box[1])),
177
+ (int(box[2]), int(box[3])),
178
+ color,
179
+ thickness,
180
+ )
181
+ label = str(box[6]).upper() + str(int(box[7]))
182
+ (w, h), _ = cv2.getTextSize(label, fnt, fnt_scale, fnt_thickness)
183
+ x, y = (int(box[0]), int(box[1]) - fnt_thickness)
184
+ image = cv2.rectangle(
185
+ image,
186
+ (x - fnt_thickness, y - h - fnt_thickness),
187
+ (x + fnt_thickness + w, y + fnt_thickness),
188
+ color,
189
+ -1,
190
+ )
191
+ image = cv2.putText(
192
+ image,
193
+ label,
194
+ (x + fnt_thickness, y),
195
+ fnt,
196
+ fnt_scale,
197
+ (0, 0, 0),
198
+ fnt_thickness,
199
+ )
200
+
201
+ if draw_first_line is True:
202
+ line = get_first_vert_line(image_name=image_name)
203
+ if line is not None:
204
+ x1, y1, x2, y2 = line
205
+ cv2.line(
206
+ image,
207
+ [
208
+ int(i)
209
+ for i in (np.array([x2, y2]) - np.array([x1, y1])) * 10
210
+ + np.array([x1, y1])
211
+ ],
212
+ [
213
+ int(i)
214
+ for i in (np.array([x1, y1]) - np.array([x2, y2])) * 10
215
+ + np.array([x2, y2])
216
+ ],
217
+ (255, 0, 255),
218
+ 20,
219
+ lineType=8,
220
+ )
221
+
222
+ if return_plot is True:
223
+ plt.figure(figsize=(10, 10))
224
+ plt.imshow(image)
225
+ plt.tight_layout()
226
+ plt.axis("off")
227
+ plt.show()
228
+ else:
229
+ return image
230
+
231
+
232
+ def crop_to_vert(image):
233
+ return image[0 : image.shape[1] // 2, 0 : image.shape[0] // 3]
234
+
235
+
236
+ def get_first_vert_line(image_name, min_angle=80, max_angle=100):
+     r, *_ = cv2.split(load_tray_image(image_name))
+
+     red_crop = cv2.normalize(
+         crop_to_vert(r),
+         None,
+         alpha=0,
+         beta=200,
+         norm_type=cv2.NORM_MINMAX,
+     )
+
+     lines = cv2.HoughLinesP(
+         image=ci.close(
+             cv2.Canny(red_crop, 50, 200, None, 3),
+             kernel_size=5,
+             proc_times=5,
+         ),
+         rho=1,
+         theta=np.pi / 180,
+         threshold=50,
+         minLineLength=red_crop.shape[0] // 5,
+         maxLineGap=20,
+     )
+     if lines is None:
+         return None
+
+     min_angle, max_angle = min(min_angle, max_angle), max(min_angle, max_angle)
+     min_x = red_crop.shape[1]  # start past the right edge so any valid line can win
+     sel_line = None
+     for line in lines:
+         x1, y1, x2, y2 = line[0]
+         line_angle = math.atan2(y2 - y1, x2 - x1) * 180 / math.pi * -1
+         if min_angle <= abs(line_angle) <= max_angle and min(x1, x2) < min_x:
+             min_x = min(x1, x2)
+             sel_line = (x1, y1, x2, y2)
+
+     return sel_line
+
+
+ def draw_first_line(image_name, dot_size=10, crop_canvas: bool = False):
+     canvas = load_tray_image(image_name)
+     if crop_canvas is True:
+         canvas = crop_to_vert(canvas)
+     line = get_first_vert_line(image_name=image_name)
+     if line is None:
+         # No line detected: return the canvas untouched
+         return canvas
+     x1, y1, x2, y2 = line
+     cv2.circle(canvas, (x1, y1), dot_size, (255, 0, 0))
+     cv2.circle(canvas, (x2, y2), dot_size, (0, 255, 0))
+     cv2.line(canvas, (x1, y1), (x2, y2), (0, 0, 255), 10)
+     return canvas
+
+
+ def get_bbox(image_name, bboxes, row, col):
+     if isinstance(bboxes, pd.Series):
+         return bboxes
+     else:
+         row, col = safe_row_col(row, col)
+         return bboxes[
+             (
+                 bboxes.file_name
+                 == (image_name.name if isinstance(image_name, Path) else image_name)
+             )
+             & (bboxes.row == row)
+             & (bboxes.col == col)
+         ].iloc[0]
+
+
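+ # The Hough search below assumes the detector's box fits the disc tightly: candidate
+ # radii are sampled within 10 px of half the box's longest side.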
+ def get_hough_leaf_disc_circle(
+     image_name,
+     bboxes,
+     row=-1,
+     col=-1,
+     padding: int = 10,
+     allow_move: bool = False,
+ ):
+     padded_leaf_disk = get_leaf_disk_wbb(
+         image_name=image_name,
+         bboxes=bboxes,
+         row=row,
+         col=col,
+         padding=padding,
+     )
+     *_, b = cv2.split(padded_leaf_disk)
+
+     min_t, max_t = 100, 200
+     rb = cv2.Canny(
+         cv2.normalize(
+             b,
+             None,
+             alpha=0,
+             beta=200,
+             norm_type=cv2.NORM_MINMAX,
+         ),
+         min_t,
+         max_t,
+         None,
+         3,
+     )
+
+     bbox = get_bbox(image_name=image_name, bboxes=bboxes, row=row, col=col)
+     hough_radii = np.arange(bbox.max_size // 2 - 10, bbox.max_size // 2 + 10, 10)
+     hough_res = hough_circle(rb, hough_radii)
+
+     # Select the single most prominent circle
+     _, cx, cy, radii = hough_circle_peaks(
+         hough_res,
+         hough_radii,
+         min_xdistance=10,
+         min_ydistance=10,
+         total_num_peaks=1,
+     )
+
+     cx = cx[0]
+     cy = cy[0]
+     r = radii[0]
+
+     if allow_move is True:
+         # Shift the circle back inside the patch when it overflows an edge
+         h, w, _ = padded_leaf_disk.shape
+         if cx - r < 0:
+             cx += abs(r - cx)
+         if cx + r > w:
+             cx -= abs(r - cx)
+         if cy - r < 0:
+             cy += abs(cy - r)
+         if cy + r > h:
+             cy -= abs(cy - r)
+
+     return dict(cx=cx, cy=cy, r=r)
+
+
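+ # Note on the sqrt(2) factor used below: the largest axis-aligned square inscribed in
+ # a circle of radius r has side r * sqrt(2), hence a half-side of r / sqrt(2), so the
+ # extracted patch is the biggest square that fits entirely inside the leaf disc.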
+ def get_hough_leaf_disk_patch(
+     image_name,
+     bboxes,
+     patch_size=-1,
+     row=-1,
+     col=-1,
+     padding: int = 10,
+     radius_crop=0,
+     disc=None,
+     allow_move: bool = False,
+     image_folder=None,
+ ):
+     if patch_size > 0:
+         try:
+             bbox = get_bbox(image_name, bboxes, row, col)
+             cx = int(bbox.cx)
+             cy = int(bbox.cy)
+         except Exception:
+             return None
+         patch_size = patch_size // 2
+
+         return A.crop(
+             load_tray_image(image_name, image_folder=image_folder),
+             cx - patch_size,
+             cy - patch_size,
+             cx + patch_size,
+             cy + patch_size,
+         )
+     else:
+         if disc is None:
+             disc = get_hough_leaf_disc_circle(
+                 image_name=image_name,
+                 bboxes=bboxes,
+                 row=row,
+                 col=col,
+                 padding=padding,
+                 allow_move=allow_move,
+             )
+
+         r = int((disc["r"] - radius_crop) / math.sqrt(2))
+         cx = int(disc["cx"])
+         cy = int(disc["cy"])
+
+         left = cx - r
+         top = cy - r
+         right = cx + r
+         bottom = cy + r
+
+         return get_leaf_disk_wbb(
+             image_name=image_name,
+             bboxes=bboxes,
+             row=row,
+             col=col,
+             padding=padding,
+         )[top:bottom, left:right]
+
+
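+ # get_hough_segment_disk blanks everything outside the detected circle by AND-ing the
+ # patch with a filled-circle mask, then crops to the circle's bounding square.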
+ def get_hough_segment_disk(
+     image_name,
+     bboxes,
+     row=-1,
+     col=-1,
+     padding: int = 10,
+     radius_crop=0,
+     disc=None,
+     allow_move: bool = False,
+ ):
+     if disc is None:
+         disc = get_hough_leaf_disc_circle(
+             image_name=image_name,
+             bboxes=bboxes,
+             row=row,
+             col=col,
+             padding=padding,
+             allow_move=allow_move,
+         )
+
+     padded_leaf_disk = get_leaf_disk_wbb(
+         image_name=image_name,
+         bboxes=bboxes,
+         row=row,
+         col=col,
+         padding=padding,
+     )
+     r = int(disc["r"] - radius_crop)
+     cx = int(disc["cx"])
+     cy = int(disc["cy"])
+     left = cx - r
+     top = cy - r
+     right = cx + r
+     bottom = cy + r
+
+     return cv2.bitwise_and(
+         padded_leaf_disk,
+         padded_leaf_disk,
+         mask=cv2.circle(np.zeros_like(padded_leaf_disk[:, :, 0]), (cx, cy), r, 255, -1),
+     )[top:bottom, left:right]
+
+
+ def draw_hough_bb_to_patch_process(
+     image_name,
+     bboxes,
+     row=-1,
+     col=-1,
+     padding: int = 10,
+     radius_crop=0,
+     disc=None,
+     allow_move: bool = False,
+ ):
+     if disc is None:
+         disc = get_hough_leaf_disc_circle(
+             image_name=image_name,
+             bboxes=bboxes,
+             row=row,
+             col=col,
+             padding=padding,
+             allow_move=allow_move,
+         )
+
+     padded_leaf_disk = get_leaf_disk_wbb(
+         image_name=image_name,
+         bboxes=bboxes,
+         row=row,
+         col=col,
+         padding=padding,
+     )
+     r = int(disc["r"] - radius_crop)
+     rc = int((disc["r"] - radius_crop) / math.sqrt(2))
+     cx = int(disc["cx"])
+     cy = int(disc["cy"])
+     left = cx - r
+     top = cy - r
+     right = cx + r
+     bottom = cy + r
+
+     # Draw, in order: inscribed square, circle bounding box, center dot, detected circle
+     image = cv2.rectangle(
+         padded_leaf_disk,
+         (cx - rc, cy - rc),
+         (cx + rc, cy + rc),
+         (0, 255, 0),
+         5,
+     )
+     image = cv2.rectangle(image, (left, top), (right, bottom), (255, 0, 155), 5)
+     image = cv2.circle(image, (cx, cy), 10, (255, 0, 155), -1)
+     return cv2.circle(image, (cx, cy), r, (255, 0, 155), 5)
+
+
+ def get_leaf_disk_wbb(
+     image_name, bboxes, row=-1, col=-1, padding: int = 0, image_path: Path = None
+ ):
+     # `padding` expands the detector box on all sides, clamped to the image bounds,
+     # to match the callers above that pass it explicitly.
+     try:
+         bbox = get_bbox(image_name, bboxes, row, col)
+         image = load_tray_image(image_name if image_path is None else image_path)
+         top = max(int(bbox.y1) - padding, 0)
+         bottom = min(int(bbox.y2) + padding, image.shape[0])
+         left = max(int(bbox.x1) - padding, 0)
+         right = min(int(bbox.x2) + padding, image.shape[1])
+         return image[top:bottom, left:right]
+     except Exception:
+         return None
+
+
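+ # The "fast" variants below skip the Hough transform and derive the circle directly
+ # from the detector's bounding box: its center, with half of the longest side (scaled
+ # by percent_radius) as the radius.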
+ def get_fast_leaf_disc_circle(
+     image_name, bboxes, row=-1, col=-1, percent_radius: float = 1.0
+ ):
+     bbox = get_bbox(image_name=image_name, bboxes=bboxes, row=row, col=col)
+     return int(bbox.cx), int(bbox.cy), int((bbox.max_size / 2) * percent_radius)
+
+
+ def get_fast_segment_disk(
+     image_name,
+     bboxes,
+     row=-1,
+     col=-1,
+     percent_radius: float = 1.0,
+     image_path: Path = None,
+ ):
+     cx, cy, r = get_fast_leaf_disc_circle(
+         image_name=image_name,
+         bboxes=bboxes,
+         row=row,
+         col=col,
+         percent_radius=percent_radius,
+     )
+     src_image = load_tray_image(image_name if image_path is None else image_path)
+     left = cx - r
+     top = cy - r
+     right = cx + r
+     bottom = cy + r
+
+     return cv2.bitwise_and(
+         src_image,
+         src_image,
+         mask=cv2.circle(np.zeros_like(src_image[:, :, 0]), (cx, cy), r, 255, -1),
+     )[top:bottom, left:right]
+
+
+ def get_fast_leaf_disk_patch(
+     image_name,
+     bboxes,
+     row=-1,
+     col=-1,
+     percent_radius: float = 1.0,
+     image_path: Path = None,
+ ):
+     cx, cy, r = get_fast_leaf_disc_circle(
+         image_name=image_name,
+         bboxes=bboxes,
+         row=row,
+         col=col,
+         percent_radius=percent_radius,
+     )
+     r = int(r / math.sqrt(2))
+     left = cx - r
+     top = cy - r
+     right = cx + r
+     bottom = cy + r
+
+     return load_tray_image(image_name if image_path is None else image_path)[
+         top:bottom, left:right
+     ]
+
+
+ def draw_fast_bb_to_patch_process(
+     image_name,
+     bboxes,
+     row=-1,
+     col=-1,
+     percent_radius: float = 1.0,
+     image_path: Path = None,
+     add_center: bool = True,
+ ):
+     cx, cy, r = get_fast_leaf_disc_circle(
+         image_name=image_name,
+         bboxes=bboxes,
+         row=row,
+         col=col,
+         percent_radius=percent_radius,
+     )
+     bbox = get_bbox(image_name=image_name, bboxes=bboxes, row=row, col=col)
+     image = load_tray_image(image_name if image_path is None else image_path)
+     rc = int(r / math.sqrt(2))
+
+     cv2.circle(image, (cx, cy), r, color=(255, 0, 155), thickness=5)
+     if add_center is True:
+         cv2.circle(image, (cx, cy), 10, color=(255, 0, 155), thickness=-1)
+     cv2.rectangle(image, (cx - rc, cy - rc), (cx + rc, cy + rc), (0, 255, 0), 5)
+
+     return image[int(bbox.y1) : int(bbox.y2), int(bbox.x1) : int(bbox.x2)]
+
+
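+ # Minimal usage sketch (hypothetical dataframe with plate_name and x1..y2 columns):
+ #   ds = LeafDiskDetectorDataset(csv=df_train, transform=get_augmentations(...))
+ #   img, target = ds[0]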
+ class LeafDiskDetectorDataset(Dataset):
+     def __init__(
+         self,
+         csv,
+         transform=None,
+         yxyx: bool = False,
+         return_id: bool = False,
+         bboxes: bool = True,
+     ):
+         self.boxes = csv.copy()
+         self.images = list(self.boxes.plate_name.unique())
+         self.transforms = transform
+         if transform is not None:
+             self.width, self.height = transform[0].width, transform[0].height
+         else:
+             self.width, self.height = 0, 0
+         self.yxyx = yxyx
+         self.return_id = return_id
+         self.bboxes = bboxes
+
+     def __len__(self):
+         return len(self.images)
+
+     def load_boxes(self, idx):
+         if "x1" in self.boxes.columns:
+             boxes = self.boxes[self.boxes.plate_name == self.images[idx]].dropna()
+             size = boxes.shape[0]
+             return (
+                 (size, boxes[["x1", "y1", "x2", "y2"]].values) if size > 0 else (0, [])
+             )
+         return 0, []
+
+     def load_tray_image(self, idx):
+         return load_tray_image(self.images[idx])
+
+     def get_by_sample_name(self, plate_name):
+         return self[self.images.index(plate_name)]
+
+     def get_image_by_name(self, plate_name):
+         return load_tray_image(plate_name)
+
+     def draw_image_with_boxes(self, plate_name):
+         image, target, *_ = self[self.images.index(plate_name)]
+         boxes = target[self.get_boxes_key()]
+         # Boxes are stored as yxyx when self.yxyx is True, xyxy otherwise
+         box_indexes = [1, 0, 3, 2] if self.yxyx is True else [0, 1, 2, 3]
+         for box in boxes:
+             image = cv2.rectangle(
+                 image,
+                 (int(box[box_indexes[0]]), int(box[box_indexes[1]])),
+                 (int(box[box_indexes[2]]), int(box[box_indexes[3]])),
+                 (255, 0, 0),
+                 2,
+             )
+         return image
+
+     def get_boxes_key(self):
+         return "bboxes" if self.bboxes is True else "boxes"
+
+     def __getitem__(self, index):
+         # load_boxes returns a list of [xmin, ymin, xmax, ymax]
+         num_box, boxes = self.load_boxes(index)
+         img = self.load_tray_image(index)  # return an image
+
+         if num_box > 0:
+             boxes = torch.as_tensor(boxes, dtype=torch.float32)
+         else:
+             # negative example, ref: https://github.com/pytorch/vision/issues/2144
+             boxes = torch.zeros((0, 4), dtype=torch.float32)
+
+         image_id = torch.tensor([index])
+         labels = torch.ones((num_box,), dtype=torch.int64)
+         target = {
+             self.get_boxes_key(): boxes,
+             "labels": labels,
+             "image_id": image_id,
+             "area": torch.as_tensor(
+                 (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]),
+                 dtype=torch.float32,
+             ),
+             "iscrowd": torch.zeros((num_box,), dtype=torch.int64),
+             "img_size": torch.tensor([self.height, self.width]),
+             "img_scale": torch.tensor([1.0]),
+         }
+
+         if self.transforms is not None:
+             sample = {
+                 "image": img,
+                 "bboxes": target[self.get_boxes_key()],
+                 "labels": labels,
+             }
+             sample = self.transforms(**sample)
+             img = sample["image"]
+             if num_box > 0:
+                 # Convert to ndarray to allow slicing
+                 boxes = np.array(sample["bboxes"])
+                 # Convert to yxyx if requested
+                 if self.yxyx is True:
+                     boxes[:, [0, 1, 2, 3]] = boxes[:, [1, 0, 3, 2]]
+                 # Convert back to tensor
+                 target[self.get_boxes_key()] = torch.as_tensor(
+                     boxes, dtype=torch.float32
+                 )
+             else:
+                 target[self.get_boxes_key()] = torch.zeros((0, 4), dtype=torch.float32)
+         else:
+             img = transforms.ToTensor()(img)
+         if self.return_id is True:
+             return img, target, image_id
+         return img, target
+
+
+ def collate_fn(batch):
+     # Detection targets are variable-length, so they are kept as a tuple of dicts;
+     # only the images are stacked into a single float tensor.
+     images, targets = tuple(zip(*batch))
+     images = torch.stack(images).float()
+     return images, targets
+
+
+ def find_best_lr(model, default_root_dir=cc.path_to_chk_detector):
+     # Run the learning-rate finder; the result overrides hparams.learning_rate.
+     # Note: `auto_lr_find` and `Trainer.tune` are the PyTorch Lightning 1.x API.
+     trainer = Trainer(
+         default_root_dir=default_root_dir,
+         auto_lr_find=True,
+         accelerator="gpu",
+         callbacks=[RichProgressBar()],
+     )
+
+     # call tune to find the lr
+     trainer.tune(model)
+
+     return model.learning_rate
+
+
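+ # Hypothetical usage: the tuner updates model.learning_rate in place, e.g.
+ #   best_lr = find_best_lr(model)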
+ class LeafDiskDetector(pl.LightningModule):
+     def __init__(
+         self,
+         batch_size: int,
+         learning_rate: float,
+         max_epochs: int,
+         image_factor: int,
+         train_data: pd.DataFrame,
+         val_data: pd.DataFrame,
+         test_data: pd.DataFrame,
+         augmentations_kinds: list = ["resize", "train", "to_tensor"],
+         augmentations_params: dict = {"gamma": (60, 180)},
+         num_workers: int = 0,
+         accumulate_grad_batches: int = 3,
+         selected_device: str = g_device,
+         optimizer: str = "adam",
+         scheduler: str = None,
+         scheduler_params: dict = {},
+     ):
+         super().__init__()
+
+         self.model_name = "ldd"
+
+         # Hyperparameters
+         self.batch_size = batch_size
+         self.selected_device = selected_device
+         self.learning_rate = learning_rate
+         self.num_workers = num_workers
+         self.max_epochs = max_epochs
+         self.accumulate_grad_batches = accumulate_grad_batches
+
+         # Dataframes
+         self.train_data = train_data
+         self.val_data = val_data
+         self.test_data = test_data
+
+         # Optimizer
+         self.optimizer = optimizer
+         self.scheduler = scheduler
+         self.scheduler_params = scheduler_params
+
+         # Albumentations
+         self.image_factor = image_factor
+         self.augmentations_kinds = augmentations_kinds
+         self.augmentations_params = augmentations_params
+
+         self.train_augmentations = get_augmentations(
+             image_size=self.image_factor,
+             kinds=self.augmentations_kinds,
+             **self.augmentations_params,
+         )
+
+         self.val_augmentations = get_augmentations(
+             image_size=self.image_factor,
+             kinds=["resize", "to_tensor"],
+             **self.augmentations_params,
+         )
+
+         # Model
+         self.encoder = fasterrcnn_resnet50_fpn_v2(
+             weights=FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT
+         )
+         num_classes = 2  # 1 class (leaf disc) + background
+         # get number of input features for the classifier
+         in_features = self.encoder.roi_heads.box_predictor.cls_score.in_features
+         # replace the pre-trained head with a new one
+         self.encoder.roi_heads.box_predictor = FastRCNNPredictor(
+             in_features, num_classes
+         )
+
+         self.save_hyperparameters()
+
+     def hr_desc(self):
+         table = Table(title=f"{self.model_name} params & values")
+         table.add_column("Param", justify="right", style="bold", no_wrap=True)
+         table.add_column("Value")
+
+         def add_pairs(table_, attributes: list) -> None:
+             for a in attributes:
+                 try:
+                     table_.add_row(a, Pretty(getattr(self, a)))
+                 except Exception:
+                     # Silently skip attributes that are not set on this instance
+                     pass
+
+         add_pairs(
+             table,
+             ["model_name", "batch_size", "num_workers", "accumulate_grad_batches"],
+         )
+         table.add_row("image_width", Pretty(self.train_augmentations[0].width))
+         table.add_row("image_height", Pretty(self.train_augmentations[0].height))
+         add_pairs(
+             table,
+             ["image_factor", "augmentations_kinds", "augmentations_params"],
+         )
+
+         add_pairs(
+             table,
+             ["learning_rate", "optimizer", "scheduler", "scheduler_params"],
+         )
+
+         for name, df in zip(
+             ["train", "val", "test"],
+             [self.train_data, self.val_data, self.test_data],
+         ):
+             table.add_row(
+                 name,
+                 Pretty(
+                     f"shape: {str(df.shape)}, images: {len(df.plate_name.unique())}"
+                 ),
+             )
+
+         Console().print(table)
+
+     def configure_optimizers(self):
+         # Optimizer
+         if self.optimizer == "adam":
+             optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
+         elif self.optimizer == "sgd":
+             optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate)
+         else:
+             optimizer = None
+
+         # Scheduler
+         if self.scheduler == "cycliclr":
+             scheduler = torch.optim.lr_scheduler.CyclicLR(
+                 optimizer,
+                 base_lr=self.learning_rate,
+                 max_lr=0.01,
+                 step_size_up=100,
+                 # The cycle mode is read from scheduler_params, triangular by default
+                 mode=self.scheduler_params.get("mode", "triangular"),
+             )
+         elif self.scheduler == "steplr":
+             self.scheduler_params["optimizer"] = optimizer
+             scheduler = torch.optim.lr_scheduler.StepLR(**self.scheduler_params)
+             self.scheduler_params.pop("optimizer")
+         elif self.scheduler == "plateau":
+             scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+                 optimizer,
+                 mode="min",
+                 factor=0.2,
+                 patience=10,
+                 min_lr=1e-6,
+             )
+             scheduler = {"scheduler": scheduler, "monitor": "val_loss"}
+         else:
+             scheduler = None
+         if scheduler is None:
+             return optimizer
+         return [optimizer], [scheduler]
+
+     def train_dataloader(self):
+         return DataLoader(
+             LeafDiskDetectorDataset(
+                 csv=self.train_data,
+                 transform=self.train_augmentations,
+                 bboxes=False,
+             ),
+             batch_size=self.batch_size,
+             shuffle=True,
+             num_workers=self.num_workers,
+             collate_fn=collate_fn,
+             pin_memory=True,
+         )
+
+     def val_dataloader(self):
+         return DataLoader(
+             LeafDiskDetectorDataset(
+                 csv=self.val_data,
+                 transform=self.val_augmentations,
+                 bboxes=False,
+             ),
+             batch_size=self.batch_size,
+             num_workers=self.num_workers,
+             collate_fn=collate_fn,
+             pin_memory=True,
+         )
+
+     def test_dataloader(self):
+         return DataLoader(
+             LeafDiskDetectorDataset(
+                 csv=self.test_data,
+                 transform=self.val_augmentations,
+                 bboxes=False,
+             ),
+             batch_size=self.batch_size,
+             num_workers=self.num_workers,
+             collate_fn=collate_fn,
+             pin_memory=True,
+         )
+
+     def forward(self, x):
+         return self.encoder(x)
+
+     def step_(self, batch, batch_index):
+         x, y = batch
+         # torchvision detection models only return the loss dict in train mode,
+         # so train() is forced here even for validation and test batches
+         self.train()
+         loss_dict = self.encoder(x, y)
+         return sum(loss for loss in loss_dict.values())
+
+     def training_step(self, batch, batch_idx):
+         loss = self.step_(batch=batch, batch_index=batch_idx)
+         self.log(
+             "train_loss", loss, on_step=True, prog_bar=True, batch_size=self.batch_size
+         )
+         return loss
+
+     def validation_step(self, batch, batch_idx):
+         loss = self.step_(batch=batch, batch_index=batch_idx)
+         self.log(
+             "val_loss",
+             loss,
+             on_epoch=True,
+             on_step=False,
+             prog_bar=True,
+             batch_size=self.batch_size,
+         )
+         return loss
+
+     def test_step(self, batch, batch_idx):
+         loss = self.step_(batch=batch, batch_index=batch_idx)
+         self.log("test_loss", loss)
+         return loss
+
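+     # prepare_bboxes runs the detector on one tray image, keeps predictions above
+     # score_threshold, rescales them to the original resolution, and drops boxes
+     # whose aspect ratio or relative area makes them unlikely to be leaf discs.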
+     def prepare_bboxes(
+         self,
+         image_name,
+         score_threshold=0.90,
+         ar_threshold=1.5,
+         size_threshold=0.30,
+     ):
+         augs = get_augmentations(
+             image_size=self.image_factor,
+             kinds=["resize", "to_tensor"],
+             inferrence=True,
+             **self.augmentations_params,
+         )
+         image = load_tray_image(image_name=image_name)
+
+         self.to(g_device)
+         self.eval()
+         predictions = self(augs(image=image)["image"].to(g_device).unsqueeze(0))
+
+         boxes = predictions[0]["boxes"].detach().to("cpu").numpy()
+         scores = predictions[0]["scores"].detach().to("cpu").numpy()
+
+         filtered_predictions = [
+             [box[i] for i in range(4)]
+             for box, score in zip(boxes, scores)
+             if score > score_threshold
+         ]
+
+         # The image is already at its original size, so this pass is effectively an
+         # identity that validates the boxes; the real rescaling from the model's
+         # resolution happens in the mutate below
+         restore_size = A.Compose(
+             [A.Resize(width=image.shape[1], height=image.shape[0])],
+             bbox_params={"format": "pascal_voc", "label_fields": ["labels"]},
+         )
+
+         sample = {
+             "image": image,
+             "bboxes": filtered_predictions,
+             "labels": [1 for _ in range(len(filtered_predictions))],
+         }
+         sample = restore_size(**sample)
+
+         resized_predictions = sample["bboxes"]
+
+         # Local import: siuba's filter/mutate would shadow builtins at module level
+         from siuba import _, filter, mutate
+
+         boxes = (
+             pd.DataFrame(data=resized_predictions, columns=["x1", "y1", "x2", "y2"])
+             >> mutate(
+                 x1=_.x1 * image.shape[1] / augs[0].width,
+                 y1=_.y1 * image.shape[0] / augs[0].height,
+                 x2=_.x2 * image.shape[1] / augs[0].width,
+                 y2=_.y2 * image.shape[0] / augs[0].height,
+             )
+             >> mutate(width=_.x2 - _.x1, height=_.y2 - _.y1)
+             >> mutate(cx=(_.x1 + _.x2) / 2, cy=(_.y1 + _.y2) / 2)
+             >> mutate(area=_.width * _.height)
+             >> mutate(ar=_.width / _.height)
+         )
+         boxes.insert(
+             0,
+             "file_name",
+             image_name.name if isinstance(image_name, Path) else image_name,
+         )
+         boxes["max_size"] = boxes[["width", "height"]].max(axis=1)
+
+         # Discard boxes that are too elongated in either direction
+         ar_boxes = (
+             boxes
+             >> filter(_.width / _.height < ar_threshold)
+             >> filter(_.height / _.width < ar_threshold)
+         )
+
+         return ar_boxes[ar_boxes.area > ar_boxes.area.max() * size_threshold]
+
+     @staticmethod
+     def init_cols(bboxes):
+         bboxes = bboxes.copy()
+
+         # Handle columns
+         X = np.reshape(bboxes.cx.to_list(), (-1, 1))
+         ms = MeanShift(bandwidth=100, bin_seeding=True)
+         ms.fit(X)
+         cols = ms.predict(X)
+         bboxes["col"] = cols
+
+         bboxes = bboxes.sort_values("cx")
+         bboxes["mean_cx"] = (
+             bboxes.groupby("col").transform("mean", numeric_only=True).cx
+         )
+         bboxes = bboxes.sort_values("mean_cx")
+         for i, val in enumerate(bboxes.mean_cx.unique()):
+             bboxes.loc[bboxes["mean_cx"] == val, "col"] = i
+
+         # Handle rows
+         bboxes = bboxes.sort_values("cy")
+         X = np.reshape(bboxes.cy.to_list(), (-1, 1))
+         ms = MeanShift(bandwidth=100, bin_seeding=True)
+         ms.fit(X)
+         rows = ms.predict(X)
+         bboxes["row"] = rows
+
+         bboxes = bboxes.sort_values("cy")
+         bboxes["mean_cy"] = (
+             bboxes.groupby("row").transform("mean", numeric_only=True).cy
+         )
+         bboxes = bboxes.sort_values("mean_cy")
+         for i, val in zip(["a", "b", "c"], bboxes.mean_cy.unique()):
+             bboxes.loc[bboxes["mean_cy"] == val, "row"] = i
+
+         bboxes = bboxes.sort_values("cx")
+
+         return bboxes
+
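+     # finalize_indexing corrects column indices when whole columns are missing: a gap
+     # wider than ~1.1x the disc size (measured from the first vertical line, or
+     # between consecutive columns) shifts the remaining labels one slot to the right.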
+     @staticmethod
+     def finalize_indexing(bboxes, image_name):
+         bboxes = bboxes.copy()
+         bboxes = bboxes.sort_values("cx")
+         labels_unique = bboxes.col.unique()
+         labels = bboxes.col.to_numpy()
+         if len(labels_unique) < 4:
+             inc_labels = [[i, 0] for i in range(len(labels_unique))]
+             max_width = bboxes.max_size.max()
+
+             # Handle the left-most label.
+             # Half of the max width is removed to take the tray's margins into account.
+             left_most_line = get_first_vert_line(image_name=image_name)
+             if left_most_line is not None:
+                 left_most_point = bboxes.x1.min() - min(
+                     left_most_line[0], left_most_line[2]
+                 )
+             else:
+                 left_most_point = bboxes.x1.min() - (max_width / 2)
+             i = 1
+             while left_most_point > i * 1.1 * max_width:
+                 inc_labels[0][1] += 1
+                 i += 1
+
+             # Handle the next labels
+             prev_min_min = bboxes[bboxes.col == 0].x2.max()
+
+             for label in labels_unique[1:]:
+                 current_label_contours = bboxes[bboxes.col == label]
+                 max_width = current_label_contours.max_size.max()
+                 min_left = current_label_contours.x1.min()
+                 i = 1
+                 while min_left - prev_min_min > i * 1.1 * max_width:
+                     inc_labels[label][1] += 1
+                     i += 1
+                 prev_min_min = min_left + max_width
+
+             for pos, inc in reversed(inc_labels):
+                 labels[labels >= pos] += inc
+
+             bboxes["col"] = labels
+
+         bboxes["col"] += 1
+
+         return bboxes.sort_values(["row", "col"])
+
+     def index_plate(
+         self,
+         image_name,
+         score_threshold=0.90,
+         ar_threshold=1.5,
+         size_threshold=0.50,
+     ):
+         bboxes = self.prepare_bboxes(
+             image_name=image_name,
+             score_threshold=score_threshold,
+             ar_threshold=ar_threshold,
+             size_threshold=size_threshold,
+         )
+         if bboxes.shape[0] == 0:
+             return bboxes
+
+         bboxes = self.init_cols(bboxes=bboxes)
+         bboxes = self.finalize_indexing(bboxes=bboxes, image_name=image_name)
+
+         return bboxes
+
+
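+ # End-to-end sketch (hypothetical checkpoint and file names, for illustration only):
+ #   model = LeafDiskDetector.load_from_checkpoint("ldd.ckpt")
+ #   boxes = model.index_plate(image_name="tray_001.jpg")
+ #   print_boxes(image_name="tray_001.jpg", boxes=boxes)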
+ def test_augmentations(
+     df,
+     image_size,
+     kinds: list = ["resize", "train"],
+     row_count=2,
+     col_count=4,
+     **aug_params,
+ ):
+     src_dataset = LeafDiskDetectorDataset(
+         csv=df,
+         transform=get_augmentations(
+             image_size=image_size, kinds=["resize"], **aug_params
+         ),
+     )
+
+     test_dataset = LeafDiskDetectorDataset(
+         csv=df,
+         transform=get_augmentations(image_size=image_size, kinds=kinds, **aug_params),
+     )
+
+     image_name = df.sample(n=1).iloc[0].plate_name
+
+     images = [(src_dataset.draw_image_with_boxes(plate_name=image_name), "Source")] + [
+         (test_dataset.draw_image_with_boxes(plate_name=image_name), "Augmented")
+         for _ in range(row_count * col_count - 1)
+     ]
+
+     make_patches_grid(
+         images=images,
+         row_count=row_count,
+         col_count=col_count,
+         figsize=(col_count * 4, row_count * 3),
+     )
+
+
+ def get_file_path_from_row(row, path_to_patches: Path):
+     return path_to_patches.joinpath(row.file_name)
+
+
+ def get_fast_images(
+     row, path_to_patches, percent_radius: float = 1.0, add_process_image: bool = False
+ ):
+     # Every crop is optional: when one fails, its key is simply left out of the dict
+     d = {}
+     try:
+         d["leaf_disc_box"] = get_leaf_disk_wbb(
+             row.file_name, row, image_path=get_file_path_from_row(row, path_to_patches)
+         )
+     except Exception:
+         pass
+     try:
+         d["segmented_leaf_disc"] = get_fast_segment_disk(
+             image_name=row.file_name,
+             bboxes=row,
+             percent_radius=percent_radius,
+             image_path=get_file_path_from_row(row, path_to_patches),
+         )
+     except Exception:
+         pass
+     try:
+         d["leaf_disc_patch"] = get_fast_leaf_disk_patch(
+             image_name=row.file_name,
+             bboxes=row,
+             percent_radius=percent_radius,
+             image_path=get_file_path_from_row(row, path_to_patches),
+         )
+     except Exception:
+         pass
+     if add_process_image is True:
+         try:
+             d["process_image"] = draw_fast_bb_to_patch_process(
+                 image_name=row.file_name,
+                 bboxes=row,
+                 percent_radius=percent_radius,
+                 image_path=get_file_path_from_row(row, path_to_patches),
+             )
+         except Exception:
+             pass
+
+     return d
+
+
+ def save_images(row: pd.Series, images_data: dict, errors: dict, paths: dict):
+     fn = f"{Path(row.file_name).stem}_{row.row}_{int(row.col)}.png"
+     for k, image in images_data.items():
+         if k not in paths:
+             continue
+         path_to_image = paths[k].joinpath(fn)
+         if image is not None:
+             if path_to_image.is_file() is False:
+                 cv2.imwrite(str(path_to_image), cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
+         elif errors is not None:
+             errors[k].append(row.file_name)
+
+
+ def handle_bbox(
+     row: pd.Series,
+     paths: dict,
+     errors: dict = None,
+     percent_radius: float = 1.0,
+     add_process_image: bool = False,
+ ):
+     save_images(
+         row=row,
+         images_data=get_fast_images(
+             row=row,
+             percent_radius=percent_radius,
+             add_process_image=add_process_image,
+             path_to_patches=paths["plates"],
+         ),
+         errors=errors,
+         paths=paths,
+     )
src/leaf_patch_gen_diff.ipynb ADDED
@@ -0,0 +1,650 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Genotype Differenciation"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "## Imports"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "%load_ext autoreload\n",
24
+ "%autoreload 2"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": null,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "import warnings\n",
34
+ "\n",
35
+ "import numpy as np\n",
36
+ "import pandas as pd\n",
37
+ "\n",
38
+ "import scipy.stats as stats\n",
39
+ "import statsmodels.api as sm\n",
40
+ "from statsmodels.formula.api import ols\n",
41
+ "from statsmodels.regression.linear_model import RegressionResultsWrapper\n",
42
+ "from statsmodels.stats.multicomp import pairwise_tukeyhsd\n",
43
+ "\n",
44
+ "from matplotlib.figure import Figure\n",
45
+ "import seaborn as sns\n",
46
+ "import panel as pn\n",
47
+ "\n",
48
+ "import com_const as cc\n",
49
+ "import com_func as cf\n",
50
+ "import com_image as ci"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "markdown",
55
+ "metadata": {},
56
+ "source": [
57
+ "## Setup"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": null,
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "warnings.simplefilter(action=\"ignore\", category=UserWarning)\n",
67
+ "warnings.simplefilter(action=\"ignore\", category=FutureWarning)"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": null,
73
+ "metadata": {},
74
+ "outputs": [],
75
+ "source": [
76
+ "pd.set_option(\"display.max_colwidth\", 500)\n",
77
+ "pd.set_option(\"display.max_columns\", 500)\n",
78
+ "pd.set_option(\"display.width\", 1000)\n",
79
+ "pd.set_option(\"display.max_rows\", 20)"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": null,
85
+ "metadata": {},
86
+ "outputs": [],
87
+ "source": [
88
+ "sns.set_style(\"whitegrid\")"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "metadata": {},
95
+ "outputs": [],
96
+ "source": [
97
+ "pn.extension(\"ipywidgets\", \"plotly\", design=\"material\")"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "markdown",
102
+ "metadata": {},
103
+ "source": [
104
+ "## Constants"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": null,
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": [
113
+ "stars = [-np.log(0.05), -np.log(0.01), -np.log(0.001), -np.log(0.0001)]"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "markdown",
118
+ "metadata": {},
119
+ "source": [
120
+ "## Functions"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "metadata": {},
127
+ "outputs": [],
128
+ "source": [
129
+ "def plot_single_progression(\n",
130
+ " ax,\n",
131
+ " df,\n",
132
+ " target,\n",
133
+ " title: str,\n",
134
+ " hue=\"gen\",\n",
135
+ " style=\"gen\",\n",
136
+ " show_legend: bool = False,\n",
137
+ "):\n",
138
+ " lp = sns.lineplot(\n",
139
+ " df.sort_values(hue),\n",
140
+ " x=\"dpi\",\n",
141
+ " y=target,\n",
142
+ " hue=hue,\n",
143
+ " markers=True,\n",
144
+ " style=style,\n",
145
+ " dashes=False,\n",
146
+ " palette=\"tab10\",\n",
147
+ " markersize=12,\n",
148
+ " ax=ax,\n",
149
+ " )\n",
150
+ " lp.set_yticklabels([\"\", \"3\", \"\", \"5\", \"\", \"7\", \"\", \"9\"])\n",
151
+ " ax.set_title(title)\n",
152
+ " if show_legend is True:\n",
153
+ " sns.move_legend(ax, \"upper left\", bbox_to_anchor=(1, 1))\n",
154
+ " else:\n",
155
+ " ax.get_legend().set_visible(False)"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": null,
161
+ "metadata": {},
162
+ "outputs": [],
163
+ "source": [
164
+ "def get_model(\n",
165
+ " df: pd.DataFrame, target: str, formula: str, dpi: int = None\n",
166
+ ") -> RegressionResultsWrapper:\n",
167
+ " df_ = df[df.dpi == dpi] if dpi is not None else df\n",
168
+ " return ols(f\"{target} {formula}\", data=df_).fit()"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": null,
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": [
177
+ "def anova_table(aov, add_columns: bool = True):\n",
178
+ " \"\"\"\n",
179
+ " The function below was created specifically for the one-way ANOVA table\n",
180
+ " results returned for Type II sum of squares\n",
181
+ " \"\"\"\n",
182
+ " if add_columns is True:\n",
183
+ " aov[\"mean_sq\"] = aov[:][\"sum_sq\"] / aov[:][\"df\"]\n",
184
+ "\n",
185
+ " aov[\"eta_sq\"] = aov[:-1][\"sum_sq\"] / sum(aov[\"sum_sq\"])\n",
186
+ "\n",
187
+ " aov[\"omega_sq\"] = (\n",
188
+ " aov[:-1][\"sum_sq\"] - (aov[:-1][\"df\"] * aov[\"mean_sq\"][-1])\n",
189
+ " ) / (sum(aov[\"sum_sq\"]) + aov[\"mean_sq\"][-1])\n",
190
+ "\n",
191
+ " cols = [\"sum_sq\", \"df\", \"mean_sq\", \"F\", \"PR(>F)\", \"eta_sq\", \"omega_sq\"]\n",
192
+ " aov = aov[cols]\n",
193
+ " return aov"
194
+ ]
195
+ },
196
+ {
197
+ "cell_type": "code",
198
+ "execution_count": null,
199
+ "metadata": {},
200
+ "outputs": [],
201
+ "source": [
202
+ "def plot_assumptions(models: list, titles: list, figsize=(12, 4)):\n",
203
+ " fig = Figure(figsize=figsize)\n",
204
+ " fig.suptitle(\"Probability plot of model residual's\", fontsize=\"x-large\")\n",
205
+ " axii = fig.subplots(1, len(models))\n",
206
+ " for ax, model, title in zip(axii, models, titles):\n",
207
+ " _ = stats.probplot(model.resid, plot=ax, rvalue=True)\n",
208
+ " ax.set_title(title)\n",
209
+ "\n",
210
+ " return fig"
211
+ ]
212
+ },
213
+ {
214
+ "cell_type": "code",
215
+ "execution_count": null,
216
+ "metadata": {},
217
+ "outputs": [],
218
+ "source": [
219
+ "def hghlight_rejection(s):\n",
220
+ " df = pd.DataFrame(columns=s.columns, index=s.index)\n",
221
+ " df.loc[s[\"reject_pred\"].ne(s[\"reject_obs\"]), [\"group1\", \"group2\"]] = (\n",
222
+ " \"background: red\"\n",
223
+ " )\n",
224
+ " df.loc[s[\"reject_pred\"].eq(s[\"reject_obs\"]), [\"group1\", \"group2\"]] = (\n",
225
+ " \"background: green\"\n",
226
+ " )\n",
227
+ " df.loc[s.reject_pred, [\"reject_pred\"]] = \"background: green\"\n",
228
+ " df.loc[~s.reject_pred, [\"reject_pred\"]] = \"background: red\"\n",
229
+ " df.loc[s.reject_obs, [\"reject_obs\"]] = \"background: green\"\n",
230
+ " df.loc[~s.reject_obs, [\"reject_obs\"]] = \"background: red\"\n",
231
+ " return df"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": null,
237
+ "metadata": {},
238
+ "outputs": [],
239
+ "source": [
240
+ "def get_tuckey_df(endog, groups, df_genotypes) -> pd.DataFrame:\n",
241
+ " tukey = pairwise_tukeyhsd(endog=endog, groups=groups)\n",
242
+ " df_tuc = pd.DataFrame(tukey._results_table)\n",
243
+ " df_tuc.columns = [str(c) for c in df_tuc.iloc[0]]\n",
244
+ " ret = (\n",
245
+ " df_tuc.drop(df_tuc.index[0])\n",
246
+ " .assign(group1=lambda s: s.group1.astype(str))\n",
247
+ " .assign(group2=lambda s: s.group2.astype(str))\n",
248
+ " .assign(reject=lambda s: s.reject.astype(str) == \"True\")\n",
249
+ " )\n",
250
+ " ret[\"p-adj\"] = tukey.pvalues\n",
251
+ " if df_genotypes is None:\n",
252
+ " return ret\n",
253
+ " else:\n",
254
+ " return (\n",
255
+ " ret.merge(right=df_genotypes, how=\"left\", left_on=\"group1\", right_on=\"gen\")\n",
256
+ " .drop([\"gen\"], axis=1)\n",
257
+ " .rename(columns={\"rpvloci\": \"group1_rpvloci\"})\n",
258
+ " .merge(right=df_genotypes, how=\"left\", left_on=\"group2\", right_on=\"gen\")\n",
259
+ " .drop([\"gen\"], axis=1)\n",
260
+ " .rename(columns={\"rpvloci\": \"group2_rpvloci\"})\n",
261
+ " )\n",
262
+ "\n",
263
+ "\n",
264
+ "def get_tuckey_compare(df, df_genotypes=None, groups: str = \"gen\"):\n",
265
+ " merge_on = (\n",
266
+ " [\"group1\", \"group2\"]\n",
267
+ " if df_genotypes is None\n",
268
+ " else [\"group1\", \"group2\", \"group1_rpvloci\", \"group2_rpvloci\"]\n",
269
+ " )\n",
270
+ " df_poiv = get_tuckey_df(df.p_oiv, df[groups], df_genotypes=df_genotypes)\n",
271
+ " df_oiv = get_tuckey_df(df.oiv, df[groups], df_genotypes=df_genotypes)\n",
272
+ " df = pd.merge(left=df_poiv, right=df_oiv, on=merge_on, suffixes=[\"_pred\", \"_obs\"])\n",
273
+ " return df"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": null,
279
+ "metadata": {},
280
+ "outputs": [],
281
+ "source": [
282
+ "def df_tukey_cmp_plot(df, groups):\n",
283
+ " df_tukey = (\n",
284
+ " get_tuckey_compare(df=df, groups=groups, df_genotypes=None)\n",
285
+ " .assign(pair_groups=lambda s: s.group1 + \"\\n\" + s.group2)\n",
286
+ " .sort_values(\"p-adj_obs\")\n",
287
+ " )\n",
288
+ "\n",
289
+ " df_tukey_reject = df_tukey[df_tukey.reject_obs & df_tukey.reject_pred]\n",
290
+ " df_tukey_accept = df_tukey[~df_tukey.reject_obs & ~df_tukey.reject_pred]\n",
291
+ " df_tukey_diverge = df_tukey[df_tukey.reject_obs != df_tukey.reject_pred]\n",
292
+ "\n",
293
+ " fig = Figure(figsize=(20, 6))\n",
294
+ " ax_reject, ax_diverge, ax_accept = fig.subplots(\n",
295
+ " 1,\n",
296
+ " 3,\n",
297
+ " gridspec_kw={\n",
298
+ " \"width_ratios\": [\n",
299
+ " len(df_tukey_reject),\n",
300
+ " len(df_tukey_diverge),\n",
301
+ " len(df_tukey_accept),\n",
302
+ " ]\n",
303
+ " },\n",
304
+ " sharey=True,\n",
305
+ " )\n",
306
+ "\n",
307
+ " for ax in [ax_reject, ax_accept, ax_diverge]:\n",
308
+ " ax.set_yticks(ticks=stars, labels=[\"*\", \"**\", \"***\", \"****\"])\n",
309
+ " ax.grid(False)\n",
310
+ "\n",
311
+ " ax_reject.set_title(\"Rejected\")\n",
312
+ " ax_diverge.set_title(\"Conflict\")\n",
313
+ " ax_accept.set_title(\"Accepted\")\n",
314
+ "\n",
315
+ " for ax, df in zip(\n",
316
+ " [ax_reject, ax_accept, ax_diverge],\n",
317
+ " [df_tukey_reject, df_tukey_accept, df_tukey_diverge],\n",
318
+ " ):\n",
319
+ " for star in stars:\n",
320
+ " ax.axhline(y=star, linestyle=\"-\", color=\"black\", alpha=0.5)\n",
321
+ " ax.bar(\n",
322
+ " x=df[\"pair_groups\"],\n",
323
+ " height=-np.log(df[\"p-adj_pred\"]),\n",
324
+ " width=-0.4,\n",
325
+ " align=\"edge\",\n",
326
+ " color=\"green\",\n",
327
+ " label=\"predictions\",\n",
328
+ " )\n",
329
+ " ax.bar(\n",
330
+ " x=df[\"pair_groups\"],\n",
331
+ " height=-np.log(df[\"p-adj_obs\"]),\n",
332
+ " width=0.4,\n",
333
+ " align=\"edge\",\n",
334
+ " color=\"blue\",\n",
335
+ " label=\"scorings\",\n",
336
+ " )\n",
337
+ " ax.margins(0.01)\n",
338
+ "\n",
339
+ " ax_accept.legend(loc=\"upper left\", bbox_to_anchor=[0, 1], ncols=1, fancybox=True)\n",
340
+ " ax_reject.set_ylabel(\"-log(p value)\")\n",
341
+ " ax_reject.tick_params(axis=\"y\", which=\"major\", labelsize=16)\n",
342
+ "\n",
343
+ " fig.subplots_adjust(wspace=0.05, hspace=0.05)\n",
344
+ "\n",
345
+ " return fig"
346
+ ]
347
+ },
348
+ {
349
+ "cell_type": "code",
350
+ "execution_count": null,
351
+ "metadata": {},
352
+ "outputs": [],
353
+ "source": [
354
+ "def plot_patches(df, diff_only: bool = True):\n",
355
+ " if diff_only is True:\n",
356
+ " df = df[(df.oiv != df.p_oiv)]\n",
357
+ " df = df.assign(diff=lambda s: s.oiv != s.p_oiv).sort_values(\n",
358
+ " [\"diff\", \"oiv\", \"p_oiv\"]\n",
359
+ " )\n",
360
+ " return pn.GridBox(\n",
361
+ " *[\n",
362
+ " pn.Column(\n",
363
+ " pn.pane.Markdown(f\"### {row.file_name}|{row.oiv}->p{row.p_oiv}\"),\n",
364
+ " pn.pane.Image(\n",
365
+ " object=ci.enhance_pil_image(\n",
366
+ " image=ci.load_image(\n",
367
+ " file_name=row.file_name,\n",
368
+ " path_to_images=cc.path_to_leaf_patches,\n",
369
+ " ),\n",
370
+ " brightness=1.5,\n",
371
+ " )\n",
372
+ " ),\n",
373
+ " )\n",
374
+ " for _, row in df.iterrows()\n",
375
+ " ],\n",
376
+ " ncols=len(df),\n",
377
+ " )"
378
+ ]
379
+ },
380
+ {
381
+ "cell_type": "markdown",
382
+ "metadata": {},
383
+ "source": [
384
+ "## Load Data"
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "code",
389
+ "execution_count": null,
390
+ "metadata": {},
391
+ "outputs": [],
392
+ "source": [
393
+ "df = cf.read_dataframe(\n",
394
+ " path=cc.path_to_data.joinpath(\"genotype_differenciation_dataset.csv\")\n",
395
+ ").assign(exp=lambda s: s.experiment + s.inoc.astype(str))\n",
396
+ "df"
397
+ ]
398
+ },
399
+ {
400
+ "cell_type": "code",
401
+ "execution_count": null,
402
+ "metadata": {},
403
+ "outputs": [],
404
+ "source": [
405
+ "df_dpi_6 = df[df.dpi == 6]\n",
406
+ "df_dpi_6"
407
+ ]
408
+ },
409
+ {
410
+ "cell_type": "markdown",
411
+ "metadata": {},
412
+ "source": [
413
+ "## Visualizations"
414
+ ]
415
+ },
416
+ {
417
+ "cell_type": "code",
418
+ "execution_count": null,
419
+ "metadata": {},
420
+ "outputs": [],
421
+ "source": [
422
+ "fig = Figure(figsize=(12, 4))\n",
423
+ "ax_oiv, ax_p_oiv = fig.subplots(nrows=1, ncols=2)\n",
424
+ "\n",
425
+ "full_oiv = \"OIV 452-1\"\n",
426
+ "df_oiv = df.copy()\n",
427
+ "df_oiv[full_oiv] = df_oiv.oiv\n",
428
+ "df_p_oiv = df.copy()\n",
429
+ "df_p_oiv[full_oiv] = df_p_oiv.p_oiv\n",
430
+ "\n",
431
+ "var = \"gen\"\n",
432
+ "\n",
433
+ "plot_single_progression(\n",
434
+ " ax=ax_oiv, df=df_oiv, target=full_oiv, title=\"Human scored OIV 452-1\"\n",
435
+ ")\n",
436
+ "\n",
437
+ "plot_single_progression(\n",
438
+ " ax=ax_p_oiv,\n",
439
+ " df=df_p_oiv,\n",
440
+ " target=full_oiv,\n",
441
+ " title=\"Model predicted OIV 452-1\",\n",
442
+ " show_legend=True,\n",
443
+ ")\n",
444
+ "\n",
445
+ "fig"
446
+ ]
447
+ },
448
+ {
449
+ "cell_type": "code",
450
+ "execution_count": null,
451
+ "metadata": {},
452
+ "outputs": [],
453
+ "source": [
454
+ "fig = Figure(figsize=(16, 6))\n",
455
+ "sns.histplot(\n",
456
+ " df_dpi_6.sort_values(\"gen\"),\n",
457
+ " x=\"gen\",\n",
458
+ " hue=\"gen\",\n",
459
+ " shrink=0.8,\n",
460
+ " ax=fig.subplots(1, 1),\n",
461
+ ")\n",
462
+ "\n",
463
+ "fig"
464
+ ]
465
+ },
466
+ {
467
+ "cell_type": "markdown",
468
+ "metadata": {},
469
+ "source": [
470
+ "## ANOVA"
471
+ ]
472
+ },
473
+ {
474
+ "cell_type": "code",
475
+ "execution_count": null,
476
+ "metadata": {},
477
+ "outputs": [],
478
+ "source": [
479
+ "rpv_formula = f\"~ C(gen) + C(exp) + C(exp):C(gen)\""
480
+ ]
481
+ },
482
+ {
483
+ "cell_type": "code",
484
+ "execution_count": null,
485
+ "metadata": {},
486
+ "outputs": [],
487
+ "source": [
488
+ "(\n",
489
+ " pd.concat(\n",
490
+ " [\n",
491
+ " sm.stats.anova_lm(\n",
492
+ " get_model(df=df, target=\"oiv\", dpi=i, formula=rpv_formula)\n",
493
+ " ).assign(dpi=i)\n",
494
+ " for i in sorted(list(df.dpi.unique()))\n",
495
+ " ]\n",
496
+ " )\n",
497
+ " .reset_index()\n",
498
+ " .set_index(\"dpi\")\n",
499
+ " .drop(\n",
500
+ " [\"df\", \"sum_sq\", \"mean_sq\"],\n",
501
+ " axis=1,\n",
502
+ " )\n",
503
+ " .query(\"index != 'Residual'\")\n",
504
+ " .query(\"index != 'C(exp)'\")\n",
505
+ " .rename(columns={\"index\": \"source of variation\"})\n",
506
+ " .replace(\"C(gen)\", \"genotype (between)\")\n",
507
+ " .replace(\"C(exp):C(gen)\", \"interaction genotype/experiment\")\n",
508
+ " .reset_index()\n",
509
+ ")"
510
+ ]
511
+ },
512
+ {
513
+ "cell_type": "code",
514
+ "execution_count": null,
515
+ "metadata": {},
516
+ "outputs": [],
517
+ "source": [
518
+ "df_dpi_6.groupby(\"gen\").agg(\n",
519
+ " {\"oiv\": [\"mean\", \"std\"], \"p_oiv\": [\"mean\", \"std\"]}\n",
520
+ ").reset_index()"
521
+ ]
522
+ },
523
+ {
524
+ "cell_type": "code",
525
+ "execution_count": null,
526
+ "metadata": {},
527
+ "outputs": [],
528
+ "source": [
529
+ "pn.GridBox(\n",
530
+ " pn.Column(\n",
531
+ " pn.pane.Markdown(\"### Annotated\"),\n",
532
+ " anova_table(\n",
533
+ " sm.stats.anova_lm(\n",
534
+ " get_model(df=df_dpi_6, target=\"oiv\", dpi=6, formula=rpv_formula),\n",
535
+ " typ=2,\n",
536
+ " )\n",
537
+ " ),\n",
538
+ " ),\n",
539
+ " pn.Column(\n",
540
+ " pn.pane.Markdown(\"### Predicted\"),\n",
541
+ " anova_table(\n",
542
+ " sm.stats.anova_lm(\n",
543
+ " get_model(df=df_dpi_6, target=\"p_oiv\", dpi=6, formula=rpv_formula),\n",
544
+ " typ=2,\n",
545
+ " )\n",
546
+ " ),\n",
547
+ " ),\n",
548
+ " ncols=2,\n",
549
+ ")"
550
+ ]
551
+ },
552
+ {
553
+ "cell_type": "code",
554
+ "execution_count": null,
555
+ "metadata": {},
556
+ "outputs": [],
557
+ "source": [
558
+ "plot_assumptions(\n",
559
+ " models=[\n",
560
+ " get_model(df=df_dpi_6, target=\"oiv\", dpi=6, formula=rpv_formula),\n",
561
+ " get_model(df=df_dpi_6, target=\"p_oiv\", dpi=6, formula=rpv_formula),\n",
562
+ " ],\n",
563
+ " titles=[\"Score OIV 452-1\", \"Predicted OIV 452-1\"],\n",
564
+ " figsize=(10, 5),\n",
565
+ ")"
566
+ ]
567
+ },
568
+ {
569
+ "cell_type": "markdown",
570
+ "metadata": {},
571
+ "source": [
572
+ "# Tukey HSD"
573
+ ]
574
+ },
575
+ {
576
+ "cell_type": "code",
577
+ "execution_count": null,
578
+ "metadata": {},
579
+ "outputs": [],
580
+ "source": [
581
+ "dft = get_tuckey_compare(df=df_dpi_6, groups=\"gen\", df_genotypes=None)\n",
582
+ "dft.style.apply(hghlight_rejection, axis=None)"
583
+ ]
584
+ },
585
+ {
586
+ "cell_type": "code",
587
+ "execution_count": null,
588
+ "metadata": {},
589
+ "outputs": [],
590
+ "source": [
591
+ "df_tukey_cmp_plot(df=df_dpi_6, groups=\"gen\")"
592
+ ]
593
+ },
594
+ {
595
+ "cell_type": "code",
596
+ "execution_count": null,
597
+ "metadata": {},
598
+ "outputs": [],
599
+ "source": [
600
+ "df_cmp_means = (\n",
601
+ " (df_dpi_6[df_dpi_6.gen.isin([\"1441s\", \"1466s\"])])\n",
602
+ " .groupby(\"gen\")\n",
603
+ " .agg({\"oiv\": [\"mean\", \"std\"], \"p_oiv\": [\"mean\", \"std\"]})\n",
604
+ " .reset_index()\n",
605
+ ")\n",
606
+ "df_cmp_means[\"difference\"] = df_cmp_means.oiv[\"mean\"] - df_cmp_means.p_oiv[\"mean\"]\n",
607
+ "df_cmp_means"
608
+ ]
609
+ },
610
+ {
611
+ "cell_type": "code",
612
+ "execution_count": null,
613
+ "metadata": {},
614
+ "outputs": [],
615
+ "source": [
616
+ "plot_patches(df_dpi_6[df_dpi_6.gen.isin([\"1441s\"])], diff_only=True)"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": null,
622
+ "metadata": {},
623
+ "outputs": [],
624
+ "source": [
625
+ "plot_patches(df_dpi_6[df_dpi_6.gen.isin([\"1466s\"])], diff_only=True)"
626
+ ]
627
+ }
628
+ ],
629
+ "metadata": {
630
+ "kernelspec": {
631
+ "display_name": "env",
632
+ "language": "python",
633
+ "name": "python3"
634
+ },
635
+ "language_info": {
636
+ "codemirror_mode": {
637
+ "name": "ipython",
638
+ "version": 3
639
+ },
640
+ "file_extension": ".py",
641
+ "mimetype": "text/x-python",
642
+ "name": "python",
643
+ "nbconvert_exporter": "python",
644
+ "pygments_lexer": "ipython3",
645
+ "version": "3.9.2"
646
+ }
647
+ },
648
+ "nbformat": 4,
649
+ "nbformat_minor": 2
650
+ }
src/leaf_patch_oiv_predictor.ipynb ADDED
@@ -0,0 +1,397 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "### Step by Step OIV 452-1 predictor Training"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "## Imports"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "%load_ext autoreload\n",
24
+ "%autoreload 2"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": null,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "import warnings\n",
34
+ "from pathlib import Path\n",
35
+ "import shutil\n",
36
+ "\n",
37
+ "from tqdm import tqdm\n",
38
+ "\n",
39
+ "import pandas as pd\n",
40
+ "\n",
41
+ "from sklearn.metrics import (\n",
42
+ " confusion_matrix,\n",
43
+ " mean_squared_error,\n",
44
+ " ConfusionMatrixDisplay,\n",
45
+ " classification_report,\n",
46
+ ")\n",
47
+ "\n",
48
+ "import matplotlib.pyplot as plt\n",
49
+ "import altair as alt\n",
50
+ "\n",
51
+ "import panel as pn\n",
52
+ "\n",
53
+ "import com_const as cc\n",
54
+ "import com_func as cf\n",
55
+ "import com_augmentations as ca\n",
56
+ "import leaf_patch_oiv_predictor_model as lpopm"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "markdown",
61
+ "metadata": {},
62
+ "source": [
63
+ "## Setup"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": null,
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "# Remove warnings\n",
73
+ "warnings.simplefilter(action=\"ignore\", category=UserWarning)\n",
74
+ "warnings.simplefilter(action=\"ignore\", category=FutureWarning)"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": null,
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "pd.options.display.float_format = \"{:4,.4f}\".format\n",
84
+ "\n",
85
+ "pd.set_option(\"display.max_colwidth\", 500)\n",
86
+ "pd.set_option(\"display.max_columns\", 500)\n",
87
+ "pd.set_option(\"display.width\", 1000)\n",
88
+ "pd.set_option(\"display.max_rows\", 16)"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "metadata": {},
95
+ "outputs": [],
96
+ "source": [
97
+ "alt.data_transformers.disable_max_rows()"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": null,
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": [
106
+ "pn.extension(\"plotly\", \"vega\", notifications=True, console_output=\"disable\")"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "markdown",
111
+ "metadata": {},
112
+ "source": [
113
+ "## Dataset"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "markdown",
118
+ "metadata": {},
119
+ "source": [
120
+ "### Load"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "metadata": {},
127
+ "outputs": [],
128
+ "source": [
129
+ "train, val, test = [\n",
130
+ " cf.read_dataframe(cc.path_to_data.joinpath(f\"oiv_{d}.csv\"))\n",
131
+ " for d in [\"train\", \"val\", \"test\"]\n",
132
+ "]\n",
133
+ "alt.hconcat(\n",
134
+ " *[\n",
135
+ " alt.Chart(df.assign(oiv=lambda x: x.oiv.astype(str)))\n",
136
+ " .mark_bar()\n",
137
+ " .encode(x=\"oiv\", y=\"count()\", color=\"source\", tooltip=\"count()\")\n",
138
+ " .properties(width=200, height=300, title=title)\n",
139
+ " for (df, title) in [\n",
140
+ " (train, \"train\"),\n",
141
+ " (val, \"val\"),\n",
142
+ " (test, \"test\"),\n",
143
+ " ]\n",
144
+ " ]\n",
145
+ ")"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": null,
151
+ "metadata": {},
152
+ "outputs": [],
153
+ "source": [
154
+ "# src_patches = (\n",
155
+ "# Path(cc.path_to_root)\n",
156
+ "# .joinpath(\"..\")\n",
157
+ "# .joinpath(\"leafdisks_powderymildew\")\n",
158
+ "# .joinpath(\"data_in\")\n",
159
+ "# .joinpath(\"202311_dataset\")\n",
160
+ "# .joinpath(\"patches\")\n",
161
+ "# )\n",
162
+ "# src_patches.is_dir()\n",
163
+ "\n",
164
+ "# for d in [train, val, test]:\n",
165
+ "# for fn in tqdm(d.file_name):\n",
166
+ "# shutil.copy(src=src_patches.joinpath(fn), dst=cc.path_to_leaf_patches.joinpath(fn))"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "markdown",
171
+ "metadata": {},
172
+ "source": [
173
+ "### Augmentation"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": null,
179
+ "metadata": {},
180
+ "outputs": [],
181
+ "source": [
182
+ "augmentations_kinds = [\"fix_brightness\", \"resize\", \"affine\", \"color\", \"to_tensor\"]\n",
183
+ "augmentations_params = dict(\n",
184
+ " gamma=(60, 120),\n",
185
+ " brightness_limit=0.15,\n",
186
+ " contrast_limit=0.25,\n",
187
+ " brightness_target=115,\n",
188
+ " brightness_thresholds=(115, 130),\n",
189
+ ")\n",
190
+ "\n",
191
+ "ca.test_augmentations(\n",
192
+ " df=train,\n",
193
+ " image_size=224,\n",
194
+ " path_to_images=cc.path_to_leaf_patches,\n",
195
+ " kinds=augmentations_kinds,\n",
196
+ " columns=[\"oiv\"],\n",
197
+ " **augmentations_params\n",
198
+ ")"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "markdown",
203
+ "metadata": {},
204
+ "source": [
205
+ "## Model"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "markdown",
210
+ "metadata": {},
211
+ "source": [
212
+ "### Find Batch Size"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": null,
218
+ "metadata": {},
219
+ "outputs": [],
220
+ "source": [
221
+ "batch_size = 615"
222
+ ]
223
+ },
224
+ {
225
+ "cell_type": "markdown",
226
+ "metadata": {},
227
+ "source": [
228
+ "We trained the models on an NVIDIA A100 80GB PCIe that allowed us a batch size of 769 that we reduced to 615 t avoid monopolizing the GPU. Uncomment the the following block to calculate optimal batch size"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": null,
234
+ "metadata": {},
235
+ "outputs": [],
236
+ "source": [
237
+ "# batch_size = lpopm.get_bs(\n",
238
+ "# batch_size=300,\n",
239
+ "# train=train,\n",
240
+ "# val=val,\n",
241
+ "# test=test,\n",
242
+ "# augmentations_kinds=augmentations_kinds,\n",
243
+ "# augmentations_params=augmentations_params,\n",
244
+ "# shrink_factor=0.8,\n",
245
+ "# )"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": null,
251
+ "metadata": {},
252
+ "outputs": [],
253
+ "source": [
254
+ "\n",
255
+ "batch_size"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "markdown",
260
+ "metadata": {},
261
+ "source": [
262
+ "### Find Learning Rate"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "execution_count": null,
268
+ "metadata": {},
269
+ "outputs": [],
270
+ "source": [
271
+ "learning_rate = 0.000363"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "markdown",
276
+ "metadata": {},
277
+ "source": [
278
+ "We found that we our selected batch size the best learning rate was 0.000363. The function hereafter will calculate on optimal learning rate for your setup."
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": null,
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": [
287
+ "# learning_rate = lpopm.get_lr(\n",
288
+ "# train=train,\n",
289
+ "# val=val,\n",
290
+ "# test=test,\n",
291
+ "# augmentations_params=augmentations_params,\n",
292
+ "# augmentations_kinds=augmentations_kinds,\n",
293
+ "# batch_size=batch_size,\n",
294
+ "# lr_times=10,\n",
295
+ "# )\n"
296
+ ]
297
+ },
298
+ {
299
+ "cell_type": "code",
300
+ "execution_count": null,
301
+ "metadata": {},
302
+ "outputs": [],
303
+ "source": [
304
+ "learning_rate"
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "markdown",
309
+ "metadata": {},
310
+ "source": [
311
+ "### Train"
312
+ ]
313
+ },
314
+ {
315
+ "cell_type": "code",
316
+ "execution_count": null,
317
+ "metadata": {},
318
+ "outputs": [],
319
+ "source": [
320
+ "# lpopm.train_model(\n",
321
+ "# path_to_images=cc.path_to_leaf_patches,\n",
322
+ "# train=train,\n",
323
+ "# val=val,\n",
324
+ "# test=test,\n",
325
+ "# monitor_loss=\"mse\",\n",
326
+ "# augmentations_kinds=augmentations_kinds,\n",
327
+ "# augmentations_params=augmentations_params,\n",
328
+ "# batch_size=batch_size,\n",
329
+ "# learning_rate=learning_rate,\n",
330
+ "# )"
331
+ ]
332
+ },
333
+ {
334
+ "cell_type": "markdown",
335
+ "metadata": {},
336
+ "source": [
337
+ "### Validate"
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "execution_count": null,
343
+ "metadata": {},
344
+ "outputs": [],
345
+ "source": [
346
+ "model = lpopm.OivDetPatchesNet.load_from_checkpoint(\n",
347
+ " cc.path_to_chk_oiv.joinpath(\"oiv_scorer.ckpt\")\n",
348
+ ")\n",
349
+ "model.path_to_images = cc.path_to_leaf_patches\n",
350
+ "model.hr_desc()"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": null,
356
+ "metadata": {},
357
+ "outputs": [],
358
+ "source": [
359
+ "test_data = model.test_data.assign(oiv=lambda x :x.fixed_oiv)\n",
360
+ "test_data[\"p_oiv\"] = model.predict(test_data)\n",
361
+ "\n",
362
+ "print(f\"MSE: {mean_squared_error(test_data.oiv.astype(int), test_data.p_oiv.astype(int)):.3f}\")\n",
363
+ "ConfusionMatrixDisplay.from_predictions(\n",
364
+ " test_data.oiv.astype(int), test_data.p_oiv.astype(int)\n",
365
+ ");"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "code",
370
+ "execution_count": null,
371
+ "metadata": {},
372
+ "outputs": [],
373
+ "source": []
374
+ }
375
+ ],
376
+ "metadata": {
377
+ "kernelspec": {
378
+ "display_name": "env",
379
+ "language": "python",
380
+ "name": "python3"
381
+ },
382
+ "language_info": {
383
+ "codemirror_mode": {
384
+ "name": "ipython",
385
+ "version": 3
386
+ },
387
+ "file_extension": ".py",
388
+ "mimetype": "text/x-python",
389
+ "name": "python",
390
+ "nbconvert_exporter": "python",
391
+ "pygments_lexer": "ipython3",
392
+ "version": "3.9.2"
393
+ }
394
+ },
395
+ "nbformat": 4,
396
+ "nbformat_minor": 2
397
+ }
src/leaf_patch_oiv_predictor_model.py ADDED
@@ -0,0 +1,1266 @@
1
+ from copy import deepcopy
2
+ from functools import partial
3
+ from pathlib import Path
4
+ from datetime import datetime as dt
5
+ import json
6
+
7
+ from rich.console import Console
8
+ from rich.table import Table
9
+ from rich.pretty import Pretty
10
+
11
+ from tqdm import tqdm
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ from siuba import _ as s
16
+ from siuba import filter as sfilter
17
+ from siuba import mutate
18
+
19
+ from sklearn.metrics import (
20
+ classification_report,
21
+ mean_absolute_error,
22
+ mean_squared_error,
23
+ )
24
+
25
+ import torch
26
+ from torch.utils.data import DataLoader, Dataset
27
+ from torch import nn
28
+
29
+ import torchmetrics
30
+
31
+ import albumentations as A
32
+ from albumentations.pytorch import ToTensorV2
33
+
34
+ import pytorch_lightning as pl
35
+
36
+ from pytorch_lightning.callbacks import (
37
+ RichProgressBar,
38
+ DeviceStatsMonitor,
39
+ ModelCheckpoint,
40
+ LearningRateMonitor,
41
+ )
42
+ from pytorch_lightning import Trainer
43
+ from pytorch_lightning.callbacks.early_stopping import EarlyStopping
44
+ from pytorch_lightning.loggers import TensorBoardLogger
45
+ from pytorch_lightning.tuner.tuning import Tuner
46
+
47
+ from coral_pytorch.losses import corn_loss
48
+ from coral_pytorch.dataset import proba_to_label, corn_label_from_logits
49
+
50
+ from transformers import logging
51
+ from transformers import (
52
+ ViTForImageClassification,
53
+ SegformerForImageClassification,
54
+ BeitForImageClassification,
55
+ SwinForImageClassification,
56
+ ConvNextForImageClassification,
57
+ DeiTForImageClassificationWithTeacher,
58
+ ResNetForImageClassification,
59
+ )
60
+
61
+ import com_const as cc
62
+ import com_image as ci
63
+ import com_augmentations as ca
64
+ import com_func as cf
65
+
66
+
67
+ logging.set_verbosity_error()
68
+
69
+ torch.set_float32_matmul_precision("high")
70
+
71
+ oiv_models_overview_path = cc.path_to_data.joinpath("oiv_models_overview.csv")
72
+
73
+ g_device = (
74
+ "mps"
75
+ if torch.backends.mps.is_built() is True
76
+ else "cuda" if torch.backends.cuda.is_built() else "cpu"
77
+ )
78
+
79
+ checkpoints_dict = {
80
+ "hf_vit_g16": {
81
+ "path": "google/vit-base-patch16-224-in21k",
82
+ "name": "Google ViT 16",
83
+ "link": "https://huggingface.co/google/vit-base-patch16-224-in21k",
84
+ "class": ViTForImageClassification,
85
+ },
86
+ "hf_bb_16": {
87
+ "path": "microsoft/beit-base-patch16-224-pt22k-ft22k",
88
+ "name": "BEiT (base-sized model, fine-tuned on ImageNet-22k)",
89
+ "link": "https://huggingface.co/microsoft/beit-base-patch16-224-pt22k-ft22k",
90
+ "class": BeitForImageClassification,
91
+ },
92
+ "hf_seg": {
93
+ "path": "nvidia/mit-b0",
94
+ "name": "Segformer",
95
+ "link": "https://huggingface.co/nvidia/mit-b0",
96
+ "class": SegformerForImageClassification,
97
+ },
98
+ "hf_bl_16": {
99
+ "path": "microsoft/beit-large-patch16-224-pt22k-ft22k",
100
+ "name": "BEiT (large-sized model, fine-tuned on ImageNet-22k)",
101
+ "link": "https://huggingface.co/microsoft/beit-large-patch16-224-pt22k-ft22k",
102
+ "class": BeitForImageClassification,
103
+ },
104
+ "hf_vit_g32": {
105
+ "path": "google/vit-large-patch32-384",
106
+ "name": "Vision Transformer (large-sized model)",
107
+ "link": "https://huggingface.co/google/vit-large-patch32516-384",
108
+ "class": ViTForImageClassification,
109
+ },
110
+ "hf_swt_t": {
111
+ "path": "microsoft/swin-tiny-patch4-window7-224",
112
+ "name": "Swin Transformer (tiny-sized model)",
113
+ "link": "https://huggingface.co/microsoft/swin-tiny-patch4-window7-224",
114
+ "class": SwinForImageClassification,
115
+ },
116
+ "hf_cnx_t": {
117
+ "path": "facebook/convnext-tiny-224",
118
+ "name": "ConvNeXT (tiny-sized model)",
119
+ "link": "https://huggingface.co/facebook/convnext-tiny-224",
120
+ "class": ConvNextForImageClassification,
121
+ },
122
+ "hf_det_b": {
123
+ "path": "facebook/deit-base-distilled-patch16-224",
124
+ "name": "Distilled Data-efficient Image Transformer (base-sized model)",
125
+ "link": "https://huggingface.co/facebook/deit-base-distilled-patch16-224",
126
+ "class": DeiTForImageClassificationWithTeacher,
127
+ },
128
+ "hf_swt_l": {
129
+ "path": "microsoft/swin-large-patch4-window12-384-in22k",
130
+ "name": "Swin Transformer (large-sized model)",
131
+ "link": "https://huggingface.co/microsoft/swin-large-patch4-window12-384-in22k",
132
+ "class": SwinForImageClassification,
133
+ },
134
+ "hf_deit_s": {
135
+ "path": "facebook/deit-small-patch16-224",
136
+ "name": "Data-efficient Image Transformer (small-sized model)",
137
+ "link": "https://huggingface.co/facebook/deit-small-patch16-224",
138
+ "class": ViTForImageClassification,
139
+ },
140
+ "hf_seg_b3": {
141
+ "path": "nvidia/mit-b3",
142
+ "name": "SegFormer (b3-sized) encoder pre-trained-only",
143
+ "link": "https://huggingface.co/nvidia/mit-b3",
144
+ "class": SegformerForImageClassification,
145
+ },
146
+ "hf_vit_gl": {
147
+ "path": "google/vit-large-patch16-224",
148
+ "name": "Vision Transformer (large-sized model)",
149
+ "link": "https://huggingface.co/google/vit-large-patch16-224",
150
+ "class": ViTForImageClassification,
151
+ },
152
+ "hf_resnet": {
153
+ "path": "microsoft/resnet-50",
154
+ "name": "ResNet-50 v1.5",
155
+ "link": "https://huggingface.co/microsoft/resnet-50",
156
+ "class": ResNetForImageClassification,
157
+ },
158
+ }
159
+
160
+
161
+ def prepare_dataframe(
162
+ df, excluded_sources, predicted_var, invert_scale: bool = False
163
+ ) -> pd.DataFrame:
164
+ df = df >> sfilter(~s[predicted_var].isna()) >> sfilter(s[predicted_var] > 0)
165
+ if isinstance(excluded_sources, list) and excluded_sources:
166
+ df = df.filter_data(excluded_sources)
167
+ elif isinstance(excluded_sources, dict):
168
+ for k, v in excluded_sources.items():
169
+ if k == "==":
170
+ df = df >> sfilter(s[v[0]] == s[v[1]])
171
+ elif k == "!=":
172
+ df = df >> sfilter(s[v[0]] != s[v[1]])
173
+ df = df.reset_index(drop=True)
174
+ if isinstance(predicted_var, str):
175
+ df[predicted_var] = (df[predicted_var] - 1) // 2
176
+ df[predicted_var] = df[predicted_var].astype(int)
177
+ if invert_scale is True:
178
+ df[predicted_var] = df[predicted_var].max() - df[predicted_var]
179
+ elif isinstance(predicted_var, list):
180
+ for pv in predicted_var:
181
+ df[pv] = (df[pv] - 1) // 2
182
+ df[pv] = df[pv].astype(int)
183
+ if invert_scale is True:
184
+ df[pv] = df[pv].max() - df[pv]
185
+
186
+ return df
187
+
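+ # A minimal sketch (illustration only, not part of the pipeline) of how
+ # prepare_dataframe below remaps the OIV scale, assuming the usual odd-valued
+ # OIV scores {1, 3, 5, 7, 9}: (score - 1) // 2 yields contiguous class indices
+ # {0..4}; with invert_scale=True the remapped values are then reversed.
+ # >>> import pandas as pd
+ # >>> s = (pd.Series([1, 3, 5, 7, 9]) - 1) // 2
+ # >>> s.to_list()
+ # [0, 1, 2, 3, 4]
+ # >>> (s.max() - s).to_list()
+ # [4, 3, 2, 1, 0]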
188
+
189
+ class OivDetPatches(Dataset):
190
+ def __init__(
191
+ self,
192
+ dataframe,
193
+ transform,
194
+ predicted_var: str = None,
195
+ path_to_images=cc.path_to_leaf_patches,
196
+ ) -> None:
197
+ super().__init__()
198
+ if isinstance(dataframe, pd.DataFrame):
199
+ self.dataframe = dataframe.reset_index(drop=True)
200
+ elif isinstance(dataframe, list):
201
+ self.dataframe = pd.DataFrame(data={"file_name": dataframe})
202
+ # predicted_var is used by __getitem__ for both input types
203
+ self.predicted_var = predicted_var
204
+ self.transform = transform
205
+ self.path_to_images = path_to_images
206
+
207
+ def __len__(self):
208
+ return self.dataframe.shape[0]
209
+
210
+ def __getitem__(self, index):
211
+ img = self.transform(image=self.get_image(index=index))["image"]
212
+ if self.dataframe.shape[1] == 1 or self.predicted_var is None:
213
+ return {"image": img}
214
+ else:
215
+ return {
216
+ "image": img,
217
+ "label": torch.tensor(
218
+ self.dataframe.loc[index, self.predicted_var], dtype=torch.long
219
+ ),
220
+ }
221
+
222
+ def get_resizer(self, to_tensor: bool = False):
223
+ for a in self.transform:
224
+ if isinstance(a, A.Resize):
225
+ if to_tensor is True:
226
+ return A.Compose([a, ToTensorV2()])
227
+ return A.Compose([a])
228
+ else:
229
+ return None
230
+
231
+ def get_image(self, index):
232
+ return ci.load_image(
233
+ file_name=self.dataframe.file_name.to_list()[index],
234
+ path_to_images=self.path_to_images,
235
+ )
236
+
237
+ def get_resized_image(self, index, to_tensor: bool = False):
238
+ t = self.get_resizer(to_tensor=to_tensor)
239
+ if t is not None:
240
+ return t(
241
+ image=ci.load_image(
242
+ file_name=self.dataframe.file_name.to_list()[index],
243
+ path_to_images=self.path_to_images,
244
+ )
245
+ )["image"]
246
+ else:
247
+ return self.get_image(index=index)
248
+
249
+ def get_data(self, index):
250
+ return self.dataframe.iloc[index]
251
+
252
+
253
+ def get_encoder_data(enc_key) -> dict:
254
+ return checkpoints_dict["hf_swt_t" if enc_key == "pretrained" else enc_key]
255
+
256
+
257
+ class OivDetPatchesNet(pl.LightningModule):
258
+ def __init__(
259
+ self,
260
+ batch_size: int,
261
+ learning_rate: float,
262
+ max_epochs: int,
263
+ num_workers,
264
+ accumulate_grad_batches,
265
+ train: pd.DataFrame,
266
+ val: pd.DataFrame,
267
+ test: pd.DataFrame,
268
+ predicted_var: str = "oiv",
269
+ backbone: str = "hf_swt_t",
270
+ data_source: str = "improved_patches_v3",
271
+ augmentations_kinds: list = ["resize", "train", "to_tensor"],
272
+ augmentations_params: dict = {"gamma": (60, 180), "crop": None},
273
+ optimizer: str = "adam",
274
+ scheduler: str = None,
275
+ scheduler_params: dict = {},
276
+ conv_feature_sizes=None,
277
+ linear_features_sizes=[],
278
+ exclude_if_source: list = [],
279
+ weight_loss: bool = False,
280
+ ordinal_regression_model=None,
281
+ monitor_loss: str = "mse",
282
+ skip_linear: bool = False,
283
+ use_sigmoid: bool = False,
284
+ val_monitor_target: str = "val_monitor",
285
+ val_monitor_mode: str = "min",
286
+ salt_name: str = "oiv",
287
+ binary_data: dict = None,
288
+ path_to_images: str = cc.path_to_leaf_patches,
289
+ invert_scale: bool = False,
290
+ ) -> None:
291
+ super().__init__()
292
+
293
+ self.backbone = backbone
294
+ self.conv_feature_sizes = conv_feature_sizes
295
+ self.linear_features_sizes = linear_features_sizes
296
+
297
+ self.predicted_var = predicted_var
298
+ self.invert_scale = invert_scale
299
+ self.model_name = (
300
+ f"{salt_name}_{predicted_var}_{self.backbone_name}_{monitor_loss}"
301
+ )
302
+ if isinstance(ordinal_regression_model, str):
303
+ self.model_name = self.model_name + "_" + ordinal_regression_model
304
+ else:
305
+ self.model_name = self.model_name + "_" + "classic"
306
+ self.short_model_name = f"oiv_{predicted_var}"
307
+
308
+ # dataframes
309
+ self.exclude_if_source = exclude_if_source
310
+ self.train_data = prepare_dataframe(
311
+ df=train,
312
+ excluded_sources=self.exclude_if_source,
313
+ predicted_var=self.predicted_var,
314
+ invert_scale=invert_scale,
315
+ )
316
+ self.val_data = prepare_dataframe(
317
+ df=val,
318
+ excluded_sources=self.exclude_if_source,
319
+ predicted_var=self.predicted_var,
320
+ invert_scale=invert_scale,
321
+ )
322
+ self.test_data = prepare_dataframe(
323
+ df=test,
324
+ excluded_sources=self.exclude_if_source,
325
+ predicted_var=self.predicted_var,
326
+ invert_scale=invert_scale,
327
+ )
328
+ self.data_source = data_source
329
+ self.path_to_images = path_to_images
330
+ self.labels_cardinal = len(self.train_data[self.predicted_var].unique())
331
+ self.binary_data = binary_data
332
+
333
+ # Encoder
334
+ enc_data = get_encoder_data(self.backbone)
335
+ self.encoder = enc_data["class"].from_pretrained(
336
+ enc_data["path"],
337
+ num_labels=self.labels_cardinal,
338
+ problem_type="single_label_classification",
339
+ ignore_mismatched_sizes=True,
340
+ )
341
+
342
+ self.image_size = 224
343
+ self.ordinal_regression_model = ordinal_regression_model
344
+ self.flatten = nn.Flatten()
345
+ self.skip_linear = skip_linear
346
+ self.use_sigmoid = use_sigmoid
347
+ if self.ordinal_regression_model == "corn":
348
+ self.linear_out = nn.Linear(
349
+ in_features=self._get_conv_output_size(self.image_size),
350
+ out_features=self.labels_cardinal - 1,
351
+ )
352
+ else:
353
+ self.linear_out = nn.Linear(
354
+ in_features=self._get_conv_output_size(self.image_size),
355
+ out_features=self.labels_cardinal,
356
+ )
357
+
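+ # Sketch of the CORN head wiring above (assuming labels_cardinal == 5): the
+ # model emits labels_cardinal - 1 = 4 conditional logits per sample, which
+ # coral_pytorch.dataset.corn_label_from_logits decodes back to a class index
+ # by thresholding the cumulative product of the sigmoids at 0.5:
+ # >>> import torch
+ # >>> from coral_pytorch.dataset import corn_label_from_logits
+ # >>> logits = torch.tensor([[2.0, 1.0, -1.0, -3.0]])
+ # >>> corn_label_from_logits(logits)
+ # tensor([2])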
358
+ # Hyperparameters
359
+ self.batch_size = batch_size
360
+ self.selected_device = g_device
361
+ self.learning_rate = learning_rate
362
+ self.start_lr = self.learning_rate
363
+ self.num_workers = num_workers
364
+ self.max_epochs = max_epochs
365
+ self.accumulate_grad_batches = accumulate_grad_batches
366
+ self.weight_loss = weight_loss
367
+ if self.ordinal_regression_model is None:
368
+ if weight_loss is True:
369
+ vc = self.train_data[self.predicted_var].value_counts()
371
+ self.criterion = nn.CrossEntropyLoss(
372
+ weight=torch.FloatTensor(
373
+ [vc[i] / len(self.train_data) for i in range(self.labels_cardinal)]
373
+ )
374
+ )
375
+ else:
376
+ self.criterion = nn.CrossEntropyLoss()
377
+ elif self.ordinal_regression_model == "mse":
378
+ self.criterion = nn.MSELoss()
379
+ elif self.ordinal_regression_model == "mae":
380
+ self.criterion = nn.L1Loss()
381
+ elif self.ordinal_regression_model == "corn":
382
+ self.criterion = corn_loss
383
+
384
+ # Set up attributes for computing the MAE
385
+ self.monitor_loss = monitor_loss
386
+ self.val_monitor_target = val_monitor_target
387
+ self.val_monitor_mode = val_monitor_mode
388
+ if self.monitor_loss == "mse":
389
+ self.train_monitor = torchmetrics.MeanSquaredError()
390
+ self.val_monitor = torchmetrics.MeanSquaredError()
391
+ self.test_monitor = torchmetrics.MeanSquaredError()
392
+ elif self.monitor_loss == "mae":
393
+ self.train_monitor = torchmetrics.MeanAbsoluteError()
394
+ self.val_monitor = torchmetrics.MeanAbsoluteError()
395
+ self.test_monitor = torchmetrics.MeanAbsoluteError()
396
+
397
+ # Optimizer
398
+ self.optimizer = optimizer
399
+ self.scheduler = scheduler
400
+ self.scheduler_params = scheduler_params
401
+
402
+ # albumentations
403
+ self.augmentations_kinds = augmentations_kinds
404
+ self.augmentations_params = augmentations_params
405
+ self.augmentations_params["mean"] = (0.5, 0.5, 0.5)
406
+ self.augmentations_params["std"] = (0.5, 0.5, 0.5)
407
+
408
+ self.train_augmentations = ca.get_augmentations(
409
+ image_size=self.image_size,
410
+ kinds=self.augmentations_kinds,
411
+ **self.augmentations_params,
412
+ )
413
+
414
+ self.val_augmentations = ca.get_augmentations(
415
+ image_size=self.image_size,
416
+ kinds=["resize", "to_tensor"],
417
+ **self.augmentations_params,
418
+ )
419
+
420
+ self._thresholds = None
421
+ self._thresholds_source = None
422
+
423
+ self.save_hyperparameters()
424
+
425
+ def forward(self, x, binary_data=None, *args, **kwargs):
426
+ x = self.encoder(x)
427
+ if hasattr(x, "logits"):
428
+ x = x.logits
429
+ x = self.flatten(x)
430
+ if binary_data is not None:
431
+ x = torch.cat((x, binary_data), dim=1)  # concatenate extra binary features along the feature dim
432
+ if self.linear_out is not None:
433
+ x = self.linear_out(x)
434
+ if self.use_sigmoid:
435
+ x = nn.functional.sigmoid(x)
436
+ return x
437
+
438
+ def hr_desc(self):
439
+ table = Table(title=f"{self.model_name} params & values")
440
+ table.add_column("Param", justify="right", style="bold", no_wrap=True)
441
+ table.add_column("Value")
442
+
443
+ def add_pairs(table_, attributes: list) -> None:
444
+ for a in attributes:
445
+ try:
446
+ table_.add_row(a, Pretty(getattr(self, a)))
447
+ except Exception:
448
+ pass
449
+
450
+ add_pairs(
451
+ table,
452
+ [
453
+ "backbone",
454
+ "predicted_var",
455
+ "invert_scale",
456
+ "skip_linear",
457
+ "use_sigmoid",
458
+ "loss_function",
459
+ "monitor_loss",
460
+ "val_monitor_target",
461
+ "val_monitor_mode",
462
+ "ordinal_regression_model",
463
+ "checkpoint_mode",
464
+ ],
465
+ )
466
+ for k, v in get_encoder_data(self.backbone).items():
467
+ if isinstance(v, str):
468
+ table.add_row(k, Pretty(v))
469
+
470
+ add_pairs(
471
+ table,
472
+ ["batch_size", "image_size", "augmentations_kinds", "augmentations_params"],
473
+ )
474
+
475
+ try:
476
+ if self.backbone == "custom":
477
+ table.add_row(
478
+ "Conv Encoder",
479
+ "\n".join(
480
+ [layer_data.hr_desc() for layer_data in self.conv_feature_sizes]
481
+ ),
482
+ )
483
+ table.add_row(
484
+ "Conv output size", str(self._get_conv_output_size(self.image_size))
485
+ )
486
+ table.add_row("Linear Encoder", self.encoder.lin_encoder.hr_desc())
487
+ except Exception:
488
+ pass
489
+
490
+ eis = str(self.exclude_if_source)
491
+ if ">" in eis:
492
+ eis = (
493
+ eis.split(">")[1]
494
+ .replace("(", "")
495
+ .replace(")", "")
496
+ .replace("_,", "")
497
+ .replace("_.", "")
498
+ )
499
+ table.add_row("exclude_if_source", Pretty(eis))
500
+ table.add_row(
501
+ "path_to_images",
502
+ str(self.path_to_images.relative_to(cc.path_to_root.absolute())),
503
+ )
504
+
505
+ table.add_row(
506
+ "include_if_source",
507
+ str(self.train_data.source.sort_values().unique()),
508
+ )
509
+
510
+ add_pairs(
511
+ table,
512
+ [
513
+ "weight_loss",
514
+ "learning_rate",
515
+ "start_lr",
516
+ "optimizer",
517
+ "scheduler",
518
+ "scheduler_params",
519
+ "val_split",
520
+ ],
521
+ )
522
+
523
+ for name, df in zip(
524
+ ["train", "val", "test"],
525
+ [self.train_data, self.val_data, self.test_data],
526
+ ):
527
+ table.add_row(name, str(df.shape))
528
+
529
+ add_pairs(table_=table, attributes=["data_source"])
530
+
531
+ Console().print(table)
532
+
533
+ def do_test_augmentations(self):
534
+ ca.test_augmentations(
535
+ self.train_data,
536
+ self.image_size,
537
+ kinds=self.augmentations_kinds,
538
+ **self.augmentations_params,
539
+ )
540
+
541
+ def predict_sample(self, sample, device=g_device):
542
+ self.to(device)
543
+ if self.ordinal_regression_model == "coral":
544
+ prediction = proba_to_label(
545
+ torch.sigmoid(self(sample["image"].unsqueeze(0).to(device)))
546
+ )
547
+ elif self.ordinal_regression_model == "corn":
548
+ prediction = corn_label_from_logits(
549
+ self(sample["image"].unsqueeze(0).to(device))
550
+ )
551
+ else:
552
+ prediction = torch.argmax(
553
+ self(sample["image"].unsqueeze(0).to(device)),
554
+ dim=1,
555
+ )
556
+ return prediction.detach().to("cpu").flatten()
557
+
558
+ def predict_image(self, file_path, device=g_device):
559
+ return self.predict_sample(
560
+ self.val_augmentations(
561
+ image=ci.load_image(
562
+ file_path
563
+ if isinstance(file_path, Path)
564
+ else self.path_to_images.joinpath(file_path)
565
+ )
566
+ ),
567
+ device=device,
568
+ )
569
+
570
+ def embed_sample(self, sample, device=g_device):
571
+ self.to(device)
572
+ if self.ordinal_regression_model == "coral":
573
+ raise NotImplementedError
574
+ elif self.ordinal_regression_model == "corn":
575
+ embeddings = torch.sigmoid(self(sample["image"].unsqueeze(0).to(device)))
576
+ else:
577
+ embeddings = torch.sigmoid(self(sample["image"].unsqueeze(0).to(device)))
578
+ return embeddings.detach().to("cpu").flatten().numpy()
579
+
580
+ def embed_image(self, file_path, device=g_device):
581
+ return self.embed_sample(
582
+ self.val_augmentations(
583
+ image=ci.load_image(
584
+ file_path
585
+ if isinstance(file_path, Path)
586
+ else self.path_to_images.joinpath(file_path)
587
+ )
588
+ ),
589
+ device=device,
590
+ )
591
+
592
+ def predict(self, dataset: str = "val", show_progress: bool = True):
593
+ predictions = []
594
+
595
+ self.eval()
596
+ self.to(g_device)
597
+ dataset = self.get_dataset(dataset=dataset)
598
+
599
+ if show_progress is True:
600
+ for sample in tqdm(dataset, desc="Predicting"):
601
+ predictions.append(self.predict_sample(sample=sample))
602
+ else:
603
+ for sample in dataset:
604
+ predictions.append(self.predict_sample(sample=sample))
605
+
606
+ return torch.stack(predictions).detach().cpu().numpy()
607
+
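+ # Usage sketch (hypothetical `model` instance): predict() resolves its
+ # argument through get_dataset below, so it accepts a split name
+ # ("train"/"val"/"test"), a DataFrame with a file_name column, a list of
+ # file names, or an OivDetPatches instance, and returns class indices:
+ # >>> preds = model.predict(dataset="test", show_progress=False)
+ # >>> preds.shape  # (n_samples, 1) numpy array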
608
+ def embed_data(self, dataset, device=g_device, predicted_var=None):
609
+ self.eval()
610
+ self.to(device)
611
+ dataset = self.get_dataset(dataset=dataset, predicted_var=predicted_var)
612
+ ret = pd.DataFrame()
613
+
614
+ for i in tqdm(range(len(dataset))):
615
+ sample = dataset[i]
616
+ embedding = self.embed_sample(sample=sample, device=device)
617
+ ret = pd.concat(
618
+ [
619
+ ret,
620
+ pd.DataFrame(
621
+ data={
622
+ "file_name": dataset.dataframe.file_name.to_list()[i],
623
+ "oiv": sample["label"],
624
+ }
625
+ | {f"Dim {i}": [enc] for i, enc in enumerate(emmbedding)}
626
+ ),
627
+ ]
628
+ )
629
+
630
+ return ret
631
+
632
+ def _get_conv_output_size(self, image_shape):
633
+ batch_size = 3
634
+ tensor_ = self.encoder(
635
+ torch.rand(batch_size, 3, image_shape, image_shape)
636
+ )
637
+ return tensor_.logits.size(1) if hasattr(tensor_, "logits") else tensor_.size(1)
638
+
639
+ def get_dataset(self, dataset: str = "val", predicted_var=None):
640
+ if isinstance(dataset, str):
641
+ dataset = (
642
+ self.val_dataloader().dataset
643
+ if dataset == "val"
644
+ else (
645
+ self.test_dataloader().dataset
646
+ if dataset == "test"
647
+ else self.train_dataloader().dataset
648
+ )
649
+ )
650
+ elif isinstance(dataset, pd.DataFrame):
651
+ return OivDetPatches(
652
+ dataframe=dataset,
653
+ transform=self.val_augmentations,
654
+ path_to_images=self.path_to_images,
655
+ predicted_var=predicted_var,
656
+ )
657
+ elif isinstance(dataset, OivDetPatches):
658
+ return dataset
659
+ elif isinstance(dataset, list):
660
+ return OivDetPatches(dataframe=dataset, transform=self.val_augmentations)
661
+ return dataset
662
+
663
+ def configure_optimizers(self):
664
+ # Optimizer
665
+ if self.optimizer == "adam":
666
+ optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
667
+ elif self.optimizer == "sgd":
668
+ optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate)
669
+ else:
670
+ optimizer = None
671
+
672
+ # Scheduler
673
+ if self.scheduler == "cycliclr":
674
+ scheduler = torch.optim.lr_scheduler.CyclicLR(
675
+ optimizer,
676
+ base_lr=self.learning_rate,
677
+ max_lr=0.01,
678
+ step_size_up=100,
679
+ mode=self.scheduler_params.get("mode", "triangular"),
680
+ )
681
+ elif self.scheduler == "steplr":
682
+ self.scheduler_params["optimizer"] = optimizer
683
+ scheduler = torch.optim.lr_scheduler.StepLR(**self.scheduler_params)
684
+ self.scheduler_params.pop("optimizer")
685
+ elif self.scheduler == "plateau":
686
+ scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
687
+ optimizer,
688
+ mode="min",
689
+ factor=0.2,
690
+ patience=10,
691
+ min_lr=1e-6,
692
+ )
693
+ scheduler = {"scheduler": scheduler, "monitor": "val_loss"}
694
+ else:
695
+ scheduler = None
696
+ if scheduler is None:
697
+ return optimizer
698
+ else:
699
+ return [optimizer], [scheduler]
700
+
701
+ def train_dataloader(self) -> DataLoader:
702
+ return DataLoader(
703
+ OivDetPatches(
704
+ dataframe=self.train_data,
705
+ transform=self.train_augmentations,
706
+ predicted_var=self.predicted_var,
707
+ path_to_images=self.path_to_images,
708
+ ),
709
+ batch_size=self.batch_size,
710
+ shuffle=True,
711
+ num_workers=self.num_workers,
712
+ pin_memory=True,
713
+ )
714
+
715
+ def val_dataloader(self):
716
+ return DataLoader(
717
+ OivDetPatches(
718
+ dataframe=self.val_data,
719
+ transform=self.val_augmentations,
720
+ predicted_var=self.predicted_var,
721
+ path_to_images=self.path_to_images,
722
+ ),
723
+ batch_size=self.batch_size,
724
+ num_workers=self.num_workers,
725
+ pin_memory=True,
726
+ )
727
+
728
+ def test_dataloader(self):
729
+ return DataLoader(
730
+ OivDetPatches(
731
+ dataframe=self.test_data,
732
+ transform=self.val_augmentations,
733
+ predicted_var=self.predicted_var,
734
+ path_to_images=self.path_to_images,
735
+ ),
736
+ batch_size=self.batch_size,
737
+ num_workers=self.num_workers,
738
+ pin_memory=True,
739
+ )
740
+
741
+ def compute_loss(self, preds, targets):
742
+ return self.criterion(preds, targets)
743
+
744
+ def _shared_step(self, batch):
745
+ x, target = batch["image"], batch["label"]
746
+ logits = self(x)
747
+ if self.ordinal_regression_model == "corn":
748
+ loss = self.criterion(logits, target, num_classes=self.labels_cardinal)
749
+ predicted_labels = corn_label_from_logits(logits)
750
+ else:
751
+ loss = self.compute_loss(logits, target)
752
+ predicted_labels = torch.argmax(logits, dim=1)
753
+ return loss, target, predicted_labels
754
+
755
+ def training_step(self, batch, batch_idx):
756
+ loss, true_labels, predicted_labels = self._shared_step(batch)
757
+ self.log("train_loss", loss)
758
+ self.train_monitor(predicted_labels, true_labels)
759
+ self.log("train_monitor", self.train_monitor, on_epoch=True, on_step=False)
760
+ return loss # this is passed to the optimizer for training
761
+
762
+ def validation_step(self, batch, batch_idx):
763
+ loss, true_labels, predicted_labels = self._shared_step(batch)
764
+ self.log("val_loss", loss)
765
+ self.val_monitor(predicted_labels, true_labels)
766
+ self.log(
767
+ "val_monitor",
768
+ self.val_monitor,
769
+ on_epoch=True,
770
+ on_step=False,
771
+ prog_bar=self.val_monitor_target == "val_monitor",
772
+ )
773
+ return loss
774
+
775
+ def test_step(self, batch, batch_idx):
776
+ loss, true_labels, predicted_labels = self._shared_step(batch)
777
+ self.test_monitor(predicted_labels, true_labels)
778
+ self.log("test_monitor", self.test_monitor, on_epoch=True, on_step=False)
779
+ return loss
780
+
781
+ def check_forward(self, index: int = 0):
782
+ self.eval()
783
+ return self.training_step(
784
+ next(
785
+ iter(
786
+ DataLoader(
787
+ OivDetPatches(
788
+ dataframe=self.val_data,
789
+ transform=self.val_augmentations,
790
+ predicted_var=self.predicted_var,
791
+ path_to_images=self.path_to_images,
792
+ ),
793
+ batch_size=2,
794
+ num_workers=index,
795
+ pin_memory=True,
796
+ )
797
+ )
798
+ ),
799
+ index,
800
+ )
801
+
802
+ def get_trainer(
803
+ self,
804
+ checkpoints_path: Path = cc.path_to_chk_oiv,
805
+ log_every_n_steps: int = 5,
806
+ patience: int = 10,
807
+ patience_min_delta: float = 0.0005,
808
+ ):
809
+ callbacks = [
810
+ RichProgressBar(),
811
+ EarlyStopping(
812
+ monitor=self.val_monitor_target,
813
+ mode=self.val_monitor_mode,
814
+ patience=patience,
815
+ min_delta=patience_min_delta,
816
+ ),
817
+ DeviceStatsMonitor(),
818
+ ModelCheckpoint(
819
+ save_top_k=1,
820
+ monitor=self.val_monitor_target,
821
+ mode=self.val_monitor_mode,
822
+ auto_insert_metric_name=True,
823
+ filename=self.short_model_name
824
+ + "-{val_monitor:.3f}-{epoch}-{val_loss:.3f}-{train_loss:.3f}-{step}",
825
+ ),
826
+ LearningRateMonitor(logging_interval="epoch"),
827
+ ]
828
+ return Trainer(
829
+ default_root_dir=str(checkpoints_path),
830
+ logger=TensorBoardLogger(
831
+ save_dir=str(checkpoints_path),
832
+ version=self.model_name + "_" + dt.now().strftime("%Y%m%d_%H%M%S"),
833
+ name="lightning_logs",
834
+ ),
835
+ accelerator="cpu" if self.selected_device == "cpu" else "gpu",
836
+ max_epochs=self.max_epochs,
837
+ log_every_n_steps=log_every_n_steps,
838
+ callbacks=callbacks,
839
+ accumulate_grad_batches=self.accumulate_grad_batches,
840
+ )
841
+
842
+ def tune_trainer(
843
+ self,
844
+ trainer: Trainer,
845
+ tune_options: list = ["find_lr", "find_bs"],
846
+ find_bs_mode: str = "binsearch",
847
+ ):
848
+ tuner = Tuner(trainer=trainer)
849
+ if "find_lr" in tune_options:
850
+ tuner.lr_find(self)
851
+ if "find_bs" in tune_options:
852
+ tuner.scale_batch_size(model=self, mode=find_bs_mode)
853
+
854
+ @staticmethod
855
+ def short_bin_label(label):
856
+ if label == "sporulation":
857
+ return "sp"
858
+ if label == "necrosis_dots":
859
+ return "nd"
860
+ if label == "necrosis_stains":
861
+ return "nf"
862
+ if label == "necrosis_senescence":
863
+ return "ns"
864
+ if label == "necrosis":
865
+ return "n"
866
+ if label == "stains":
867
+ return "s"
868
+
869
+ @staticmethod
870
+ def name_from_backbone(backbone):
871
+ if isinstance(backbone, str):
872
+ return backbone
873
+ elif isinstance(backbone, dict):
874
+ ret = "bin"
875
+ if "labels" in backbone:
876
+ labels = backbone["labels"]
877
+ if isinstance(labels, str):
878
+ labels = (
879
+ labels.replace("[", "")
880
+ .replace("]", "")
881
+ .replace("'", "")
882
+ .replace('"', "")
883
+ .replace(" ", "")
884
+ .split(",")
885
+ )
886
+ for label in labels:
887
+ ret += "." + OivDetPatchesNet.short_bin_label(label)
888
+ if "max_epochs" in backbone:
889
+ ret += f"_me{backbone['max_epochs']}"
890
+ if "exclude_if_source" in backbone:
891
+ ret += f"_xis{len(backbone['exclude_if_source'])}"
892
+ if "_" not in ret:
893
+ ret += "_max_f1wa"
894
+ return ret
895
+ else:
896
+ raise Exception(f"Unknown backbone type {type(backbone)}")
897
+
898
+ @property
899
+ def labels(self):
900
+ return [1, 3, 5, 7, 9] if self.labels_cardinal == 5 else [1, 5, 9]
901
+
902
+ @property
903
+ def grad_cam_layer(self):
904
+ if isinstance(self.backbone, str):
905
+ return self.encoder.swin.layernorm
906
+ elif isinstance(self.backbone, dict):
907
+ return self.encoder.encoder.swin.layernorm
908
+ else:
909
+ raise Exception(f"Unknown backbone type {type(self.backbone)}")
910
+
911
+ @property
912
+ def relative_path_to_images(self):
913
+ return self.path_to_images.relative_to(cc.path_to_root.absolute())
914
+
915
+ @property
916
+ def backbone_name(self):
917
+ return OivDetPatchesNet.name_from_backbone(self.backbone)
918
+
919
+
920
+ def get_model(path_to_model) -> OivDetPatchesNet:
921
+ return OivDetPatchesNet.load_from_checkpoint(str(path_to_model))
922
+
923
+
924
+ def expand_dict(d: dict) -> pd.DataFrame:
925
+ return pd.DataFrame(data={k: [v] for k, v in d.items()})
926
+
927
+
928
+ def get_model_data(chk, test_data="val"):
929
+ name_data = {"target_var": chk.stem.split("-")[0][4:]} | {
930
+ k: v
931
+ for k, v in {
932
+ kv.split("=")[0]: kv.split("=")[1] for kv in chk.stem.split("-")[1:]
933
+ }.items()
934
+ }
935
+
936
+ model_: OivDetPatchesNet = get_model(str(chk))
937
+
938
+ model_data = {}
939
+ for key in [
940
+ "batch_size",
941
+ "num_workers",
942
+ "max_epochs",
943
+ "accumulate_grad_batches",
944
+ "image_size",
945
+ "augmentations_kinds",
946
+ "augmentations_params",
947
+ "labels",
948
+ "invert_scale",
949
+ "exclude_if_source",
950
+ "learning_rate",
951
+ "start_lr",
952
+ "optimizer",
953
+ "scheduler",
954
+ "data_source",
955
+ "relative_path_to_images",
956
+ "loss_function",
957
+ "checkpoint_monitor",
958
+ "checkpoint_mode",
959
+ "ordinal_regression_model",
960
+ "monitor_loss",
961
+ "use_sigmoid",
962
+ "skip_linear",
963
+ ]:
964
+ try:
965
+ model_data[key] = str(getattr(model_, key))
966
+ except Exception:
967
+ pass
968
+
969
+ eis = str(model_.exclude_if_source)
970
+ if ">" in eis:
971
+ eis = (
972
+ eis.split(">")[1]
973
+ .replace("(", "")
974
+ .replace(")", "")
975
+ .replace("_,", "")
976
+ .replace("_.", "")
977
+ )
978
+ model_data["exclude_if_source"] = eis
979
+
980
+ for k, v in model_.scheduler_params.items():
981
+ model_data[f"sched_{k}"] = v
982
+
983
+ model_data["backbone"] = model_.backbone_name
984
+
985
+ y_hat = model_.predict(dataset=test_data, show_progress=False)
986
+ y = model_.get_dataset(dataset=test_data).dataframe[model_.predicted_var]
987
+ cr = classification_report(
988
+ y_true=y, y_pred=y_hat, output_dict=True, target_names=[1, 3, 5, 7, 9]
989
+ )
990
+
991
+ return expand_dict(
992
+ d={
993
+ "file_name": chk.stem,
994
+ "date": chk.parent.parent.name.split("_")[-2],
995
+ "time": chk.parent.parent.name.split("_")[-1],
996
+ }
997
+ | name_data
998
+ | model_data
999
+ | {
1000
+ "accuracy": cr["accuracy"],
1001
+ "macro avg": cr["macro avg"]["f1-score"],
1002
+ "weighted avg": cr["weighted avg"]["f1-score"],
1003
+ "mse": mean_squared_error(y, y_hat),
1004
+ "rmse": mean_squared_error(y, y_hat, squared=False),
1005
+ "mae": mean_absolute_error(y, y_hat),
1006
+ }
1007
+ | {
1008
+ f"f1-{k}": v["f1-score"]
1009
+ for k, v in cr.items()
1010
+ if isinstance(k, int) is True
1011
+ }
1012
+ | {
1013
+ name: str(df.shape[0])
1014
+ for name, df in zip(
1015
+ ["train_count", "val_count", "test_count"],
1016
+ [model_.train_data, model_.val_data, model_.test_data],
1017
+ )
1018
+ }
1019
+ | {"file_path": str(chk)}
1020
+ )
1021
+
1022
+
1023
+ def update_models_overview(
1024
+ test_data: pd.DataFrame = "val",
1025
+ overview_path=oiv_models_overview_path,
1026
+ checkpoints_path=cc.path_to_chk_oiv,
1027
+ force_reset: bool = False,
1028
+ add_new: bool = True,
1029
+ ):
1030
+ if overview_path.is_file() is True and force_reset is False:
1031
+ models_overview = cf.read_dataframe(overview_path)
1032
+ else:
1033
+ models_overview = pd.DataFrame().assign(file_name=None)
1034
+
1035
+ if add_new is False:
1036
+ return models_overview
1037
+
1038
+ checkpoints = sorted(
1039
+ [
1040
+ chk
1041
+ for chk in checkpoints_path.rglob("*.ckpt")
1042
+ if chk.name.startswith(".") is False
1043
+ ]
1044
+ )
1045
+
1046
+ for chk in tqdm(checkpoints, desc="Building models summaries"):
1047
+ if chk.stem in models_overview.file_name.unique():
1048
+ continue
1049
+ try:
1050
+ new_data = get_model_data(chk, test_data)
1051
+ except Exception as e:
1052
+ print("______________________________")
1053
+ print(chk)
1054
+ print(str(e))
1055
+ # break
1056
+ else:
1057
+ models_overview = pd.concat([models_overview, new_data]).reset_index(
1058
+ drop=True
1059
+ )
1060
+ ret = models_overview.replace(
1061
+ ["False", "True"], [False, True]
1062
+ ).sort_values(["date", "time"]) >> sfilter(
1063
+ s.file_path.isin([str(c) for c in checkpoints])
1064
+ )
1065
+ ret["timestamp"] = pd.to_datetime(
1066
+ pd.to_datetime(ret.date, format="%Y%m%d").astype(str)
1068
+ + " "
1069
+ + pd.to_datetime(ret.time, format="%H%M%S").astype(str)
1069
+ )
1070
+
1071
+ return cf.write_dataframe(df=ret, path=overview_path)
1072
+
1073
+
1074
+ def create_model(
1075
+ train,
1076
+ val,
1077
+ test,
1078
+ augmentations_kinds,
1079
+ augmentations_params,
1080
+ backbone: str = "hf_swt_t",
1081
+ orm: str = "corn",
1082
+ predicted_var="oiv",
1083
+ learning_rate: float = 0.00055,
1084
+ batch_size=400,
1085
+ monitor_loss="mse",
1086
+ scheduler="steplr",
1087
+ scheduler_params={"step_size": 6, "gamma": 0.85},
1088
+ exclude_if_source=[],
1089
+ path_to_images=cc.path_to_leaf_patches,
1090
+ data_source="raw_dataset",
1091
+ conv_feature_sizes=None,
1092
+ linear_features_sizes=[],
1093
+ invert_scale: bool = False,
1094
+ ):
1095
+ return OivDetPatchesNet(
1096
+ backbone=backbone,
1097
+ train=train,
1098
+ val=val,
1099
+ test=test,
1100
+ batch_size=batch_size,
1101
+ learning_rate=learning_rate,
1102
+ num_workers=10,
1103
+ max_epochs=200,
1104
+ predicted_var=predicted_var,
1105
+ accumulate_grad_batches=1,
1106
+ scheduler=scheduler,
1107
+ scheduler_params=scheduler_params,
1108
+ augmentations_kinds=augmentations_kinds,
1109
+ exclude_if_source=exclude_if_source,
1110
+ augmentations_params=augmentations_params,
1111
+ ordinal_regression_model=orm,
1112
+ monitor_loss=monitor_loss,
1113
+ path_to_images=path_to_images,
1114
+ data_source=data_source,
1115
+ conv_feature_sizes=conv_feature_sizes,
1116
+ linear_features_sizes=linear_features_sizes,
1117
+ invert_scale=invert_scale,
1118
+ )
1119
+
1120
+
1121
+ def train_model(
1122
+ train,
1123
+ val,
1124
+ test,
1125
+ augmentations_kinds,
1126
+ augmentations_params,
1127
+ backbone: str = "hf_swt_t",
1128
+ orm: str = "corn",
1129
+ predicted_var="oiv",
1130
+ learning_rate: float = 0.00055,
1131
+ batch_size=400,
1132
+ monitor_loss="mae",
1133
+ scheduler="steplr",
1134
+ scheduler_params={"step_size": 6, "gamma": 0.85},
1135
+ patience=15,
1136
+ exclude_if_source=[],
1137
+ path_to_images=cc.path_to_leaf_patches,
1138
+ data_source="raw_dataset",
1139
+ conv_feature_sizes=None,
1140
+ linear_features_sizes=[],
1141
+ invert_scale: bool = False,
1142
+ checkpoints_path: Path = cc.path_to_chk_oiv,
1143
+ ):
1144
+ model = create_model(
1145
+ backbone=backbone,
1146
+ train=train,
1147
+ val=val,
1148
+ test=test,
1149
+ augmentations_kinds=augmentations_kinds,
1150
+ augmentations_params=augmentations_params,
1151
+ orm=orm,
1152
+ learning_rate=learning_rate,
1153
+ batch_size=batch_size,
1154
+ monitor_loss=monitor_loss,
1155
+ scheduler=scheduler,
1156
+ scheduler_params=scheduler_params,
1157
+ exclude_if_source=exclude_if_source,
1158
+ predicted_var=predicted_var,
1159
+ path_to_images=path_to_images,
1160
+ data_source=data_source,
1161
+ conv_feature_sizes=conv_feature_sizes,
1162
+ linear_features_sizes=linear_features_sizes,
1163
+ invert_scale=invert_scale,
1164
+ )
1165
+ model.hr_desc()
1166
+ trainer = model.get_trainer(
1167
+ patience=patience, log_every_n_steps=1, checkpoints_path=checkpoints_path
1168
+ )
1169
+ trainer.fit(model)
1170
+
1171
+
1172
+ def get_bs(
1173
+ train,
1174
+ val,
1175
+ test,
1176
+ augmentations_kinds,
1177
+ augmentations_params,
1178
+ backbone: str = "hf_swt_t",
1179
+ predicted_var="oiv",
1180
+ orm: str = "corn",
1181
+ batch_size=400,
1182
+ find_bs_mode: str = "binsearch",
1183
+ shrink_factor: float = 1.0,
1184
+ conv_feature_sizes=None,
1185
+ linear_features_sizes=[],
1186
+ ):
1187
+ model_ = create_model(
1188
+ backbone=backbone,
1189
+ train=train,
1190
+ val=val,
1191
+ test=test,
1192
+ augmentations_kinds=augmentations_kinds,
1193
+ augmentations_params=augmentations_params,
1194
+ orm=orm,
1195
+ batch_size=batch_size,
1196
+ conv_feature_sizes=conv_feature_sizes,
1197
+ linear_features_sizes=linear_features_sizes,
1198
+ predicted_var=predicted_var,
1199
+ )
1200
+ trainer = model_.get_trainer(checkpoints_path=cc.path_to_chk_oiv)
1201
+ model_.tune_trainer(
1202
+ trainer=trainer, tune_options=["find_bs"], find_bs_mode=find_bs_mode
1203
+ )
1204
+ return int(model_.batch_size * shrink_factor)
1205
+
1206
+
1207
+ def _inner_get_lr(
1208
+ train,
1209
+ val,
1210
+ test,
1211
+ augmentations_kinds,
1212
+ augmentations_params,
1213
+ backbone: str = "hf_swt_t",
1214
+ predicted_var="oiv",
1215
+ orm: str = "corn",
1216
+ batch_size=400,
1217
+ conv_feature_sizes=None,
1218
+ linear_features_sizes=[],
1219
+ ):
1220
+ model_ = create_model(
1221
+ backbone=backbone,
1222
+ train=train,
1223
+ val=val,
1224
+ test=test,
1225
+ augmentations_kinds=augmentations_kinds,
1226
+ augmentations_params=augmentations_params,
1227
+ orm=orm,
1228
+ batch_size=batch_size,
1229
+ conv_feature_sizes=conv_feature_sizes,
1230
+ linear_features_sizes=linear_features_sizes,
1231
+ predicted_var=predicted_var,
1232
+ )
1233
+ trainer = model_.get_trainer(checkpoints_path=cc.path_to_chk_oiv)
1234
+ model_.tune_trainer(trainer=trainer, tune_options=["find_lr"])
1235
+ return model_.learning_rate
1236
+
1237
+
1238
+ def get_lr(
1239
+ batch_size: int,
1240
+ train,
1241
+ val,
1242
+ test,
1243
+ augmentations_kinds,
1244
+ augmentations_params,
1245
+ backbone: str = "hf_swt_t",
1246
+ lr_times: int = 5,
1247
+ conv_feature_sizes=None,
1248
+ linear_features_sizes=[],
1249
+ predicted_var="oiv",
1250
+ ):
1251
+ lrs = [
1252
+ _inner_get_lr(
1253
+ backbone=backbone,
1254
+ train=train,
1255
+ val=val,
1256
+ test=test,
1257
+ augmentations_kinds=augmentations_kinds,
1258
+ augmentations_params=augmentations_params,
1259
+ batch_size=batch_size,
1260
+ conv_feature_sizes=conv_feature_sizes,
1261
+ linear_features_sizes=linear_features_sizes,
1262
+ predicted_var=predicted_var,
1263
+ )
1264
+ for _ in range(lr_times)
1265
+ ]
1266
+ return sum(lrs) / len(lrs)
src/repo_manager.ipynb ADDED
File without changes