Autoresearch implementation for testing

This commit is contained in:
Andrej Mickov
2026-03-24 10:37:51 +01:00
parent 4a435bf13d
commit a0086da16b
12 changed files with 1874 additions and 330 deletions

5
.gitignore vendored
View File

@@ -1,5 +1,8 @@
.env
.venv/
__pycache__/
ships-aerial-images/
runs*/
*.pt
*.pt
results.tsv
run.log

View File

@@ -1,305 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "7f263647",
"metadata": {},
"source": [
"# Laboratory Exercise 5"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "448199f3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: ultralytics in ./.venv/lib/python3.12/site-packages (8.3.159)\n",
"Requirement already satisfied: numpy>=1.23.0 in ./.venv/lib/python3.12/site-packages (from ultralytics) (2.3.1)\n",
"Requirement already satisfied: matplotlib>=3.3.0 in ./.venv/lib/python3.12/site-packages (from ultralytics) (3.10.3)\n",
"Requirement already satisfied: opencv-python>=4.6.0 in ./.venv/lib/python3.12/site-packages (from ultralytics) (4.11.0.86)\n",
"Requirement already satisfied: pillow>=7.1.2 in ./.venv/lib/python3.12/site-packages (from ultralytics) (11.2.1)\n",
"Requirement already satisfied: pyyaml>=5.3.1 in ./.venv/lib/python3.12/site-packages (from ultralytics) (6.0.2)\n",
"Requirement already satisfied: requests>=2.23.0 in ./.venv/lib/python3.12/site-packages (from ultralytics) (2.32.4)\n",
"Requirement already satisfied: scipy>=1.4.1 in ./.venv/lib/python3.12/site-packages (from ultralytics) (1.16.0)\n",
"Requirement already satisfied: torch>=1.8.0 in ./.venv/lib/python3.12/site-packages (from ultralytics) (2.7.1)\n",
"Requirement already satisfied: torchvision>=0.9.0 in ./.venv/lib/python3.12/site-packages (from ultralytics) (0.22.1)\n",
"Requirement already satisfied: tqdm>=4.64.0 in ./.venv/lib/python3.12/site-packages (from ultralytics) (4.67.1)\n",
"Requirement already satisfied: psutil in ./.venv/lib/python3.12/site-packages (from ultralytics) (7.0.0)\n",
"Requirement already satisfied: py-cpuinfo in ./.venv/lib/python3.12/site-packages (from ultralytics) (9.0.0)\n",
"Requirement already satisfied: pandas>=1.1.4 in ./.venv/lib/python3.12/site-packages (from ultralytics) (2.3.0)\n",
"Requirement already satisfied: ultralytics-thop>=2.0.0 in ./.venv/lib/python3.12/site-packages (from ultralytics) (2.0.14)\n",
"Requirement already satisfied: contourpy>=1.0.1 in ./.venv/lib/python3.12/site-packages (from matplotlib>=3.3.0->ultralytics) (1.3.2)\n",
"Requirement already satisfied: cycler>=0.10 in ./.venv/lib/python3.12/site-packages (from matplotlib>=3.3.0->ultralytics) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in ./.venv/lib/python3.12/site-packages (from matplotlib>=3.3.0->ultralytics) (4.58.4)\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in ./.venv/lib/python3.12/site-packages (from matplotlib>=3.3.0->ultralytics) (1.4.8)\n",
"Requirement already satisfied: packaging>=20.0 in ./.venv/lib/python3.12/site-packages (from matplotlib>=3.3.0->ultralytics) (25.0)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in ./.venv/lib/python3.12/site-packages (from matplotlib>=3.3.0->ultralytics) (3.2.3)\n",
"Requirement already satisfied: python-dateutil>=2.7 in ./.venv/lib/python3.12/site-packages (from matplotlib>=3.3.0->ultralytics) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in ./.venv/lib/python3.12/site-packages (from pandas>=1.1.4->ultralytics) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in ./.venv/lib/python3.12/site-packages (from pandas>=1.1.4->ultralytics) (2025.2)\n",
"Requirement already satisfied: charset_normalizer<4,>=2 in ./.venv/lib/python3.12/site-packages (from requests>=2.23.0->ultralytics) (3.4.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in ./.venv/lib/python3.12/site-packages (from requests>=2.23.0->ultralytics) (3.10)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in ./.venv/lib/python3.12/site-packages (from requests>=2.23.0->ultralytics) (2.5.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in ./.venv/lib/python3.12/site-packages (from requests>=2.23.0->ultralytics) (2025.6.15)\n",
"Requirement already satisfied: filelock in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (3.18.0)\n",
"Requirement already satisfied: typing-extensions>=4.10.0 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (4.14.0)\n",
"Requirement already satisfied: setuptools in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (80.9.0)\n",
"Requirement already satisfied: sympy>=1.13.3 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (1.14.0)\n",
"Requirement already satisfied: networkx in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (3.5)\n",
"Requirement already satisfied: jinja2 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (3.1.6)\n",
"Requirement already satisfied: fsspec in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (2025.5.1)\n",
"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.6.77 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (12.6.77)\n",
"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.6.77 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (12.6.77)\n",
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.6.80 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (12.6.80)\n",
"Requirement already satisfied: nvidia-cudnn-cu12==9.5.1.17 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (9.5.1.17)\n",
"Requirement already satisfied: nvidia-cublas-cu12==12.6.4.1 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (12.6.4.1)\n",
"Requirement already satisfied: nvidia-cufft-cu12==11.3.0.4 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (11.3.0.4)\n",
"Requirement already satisfied: nvidia-curand-cu12==10.3.7.77 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (10.3.7.77)\n",
"Requirement already satisfied: nvidia-cusolver-cu12==11.7.1.2 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (11.7.1.2)\n",
"Requirement already satisfied: nvidia-cusparse-cu12==12.5.4.2 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (12.5.4.2)\n",
"Requirement already satisfied: nvidia-cusparselt-cu12==0.6.3 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (0.6.3)\n",
"Requirement already satisfied: nvidia-nccl-cu12==2.26.2 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (2.26.2)\n",
"Requirement already satisfied: nvidia-nvtx-cu12==12.6.77 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (12.6.77)\n",
"Requirement already satisfied: nvidia-nvjitlink-cu12==12.6.85 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (12.6.85)\n",
"Requirement already satisfied: nvidia-cufile-cu12==1.11.1.6 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (1.11.1.6)\n",
"Requirement already satisfied: triton==3.3.1 in ./.venv/lib/python3.12/site-packages (from torch>=1.8.0->ultralytics) (3.3.1)\n",
"Requirement already satisfied: six>=1.5 in ./.venv/lib/python3.12/site-packages (from python-dateutil>=2.7->matplotlib>=3.3.0->ultralytics) (1.17.0)\n",
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in ./.venv/lib/python3.12/site-packages (from sympy>=1.13.3->torch>=1.8.0->ultralytics) (1.3.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in ./.venv/lib/python3.12/site-packages (from jinja2->torch>=1.8.0->ultralytics) (3.0.2)\n"
]
}
],
"source": [
"!pip install ultralytics"
]
},
{
"cell_type": "markdown",
"id": "41b20cd9",
"metadata": {},
"source": [
"### Dataset\n",
"- **Source** : Kaggle\n",
"- **Format** : Images + annotations\n",
"- **Classes**: 1 (`ship`)\n",
"- **Resolution per image**: Typically 640x640\n",
"- **Dataset size**: 26900 pictures\n"
]
},
{
"cell_type": "markdown",
"id": "3f72c4c8",
"metadata": {},
"source": [
"#### Importing dataset and pre-trained model"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "dab37f87",
"metadata": {},
"outputs": [],
"source": [
"from ultralytics import YOLO\n",
"\n",
"model = YOLO(\"yolo11l.pt\")\n",
"\n",
"data_path = 'ships-aerial-images/data.yaml'"
]
},
{
"cell_type": "markdown",
"id": "9a15a1f5",
"metadata": {},
"source": [
"### Final training parameters after a couple of iterations"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "33e8f858",
"metadata": {},
"outputs": [],
"source": [
"train_params = {\n",
" 'epochs': 40,\n",
" 'batch': 32,\n",
" 'imgsz': 640,\n",
" 'lr0': 5e-4,\n",
" 'lrf': 0.1,\n",
" 'warmup_epochs': 5,\n",
" 'warmup_bias_lr': 1e-6,\n",
" 'momentum': 0.937,\n",
" 'weight_decay': 0.0001,\n",
" 'optimizer': 'AdamW',\n",
" 'device': '0,1',\n",
" 'project': 'runs/train',\n",
" 'name': 'vessel_deteciton_v11l',\n",
" 'exist_ok': True,\n",
" 'save_period': 2,\n",
" 'workers': 8,\n",
" 'patience': 20, \n",
" 'cos_lr': True, \n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "940aca02",
"metadata": {},
"outputs": [],
"source": [
"model.train(data=data_path, **train_params)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "55e7e6a4",
"metadata": {},
"outputs": [],
"source": [
"model = YOLO(\"runs/train/vessel_deteciton_v11l/weights/best.pt\")"
]
},
{
"cell_type": "markdown",
"id": "05a6fd1f",
"metadata": {},
"source": [
"### Validation"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "cf1f9cdb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ultralytics 8.3.159 🚀 Python-3.12.3 torch-2.7.1+cu126 CUDA:0 (NVIDIA GeForce RTX 3090, 24135MiB)\n",
" CUDA:1 (NVIDIA GeForce RTX 3090, 24135MiB)\n",
"\u001b[34m\u001b[1mval: \u001b[0mFast image access ✅ (ping: 0.0±0.0 ms, read: 106.4±87.7 MB/s, size: 11.8 KB)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[34m\u001b[1mval: \u001b[0mScanning /home/mlmonster/Projects/ferdzo/vesselDetection/ships-aerial-images/valid/labels.cache... 2165 images, 68 backgrounds, 0 corrupt: 100%|██████████| 2165/2165 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING ⚠️ Box and segment counts should be equal, but got len(segments) = 172, len(boxes) = 3720. To resolve this only boxes will be used and all segments will be removed. To avoid this please supply either a detect or segment dataset, not a detect-segment mixed dataset.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
" Class Images Instances Box(P R mAP50 mAP50-95): 100%|██████████| 136/136 [00:15<00:00, 8.74it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" all 2165 3720 0.603 0.543 0.551 0.341\n",
"Speed: 0.2ms preprocess, 6.2ms inference, 0.0ms loss, 0.2ms postprocess per image\n",
"Results saved to \u001b[1m/home/mlmonster/Projects/ferdzo/vesselDetection/runs/detect/val4\u001b[0m\n"
]
}
],
"source": [
"validation = model.val(conf=0.01,iou=0.7, max_det=300, imgsz=640, device='0,1')"
]
},
{
"cell_type": "markdown",
"id": "124cb886",
"metadata": {},
"source": [
"### Testing the model on custom images"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "43560cd1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"image 1/1 /home/mlmonster/Projects/ferdzo/vesselDetection/5af55.jpg: 640x640 2 ships, 20.7ms\n",
"Speed: 21.6ms preprocess, 20.7ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 640)\n",
"Predictions: tensor([[727.0521, 301.3238, 749.1639, 321.9431],\n",
" [631.3250, 203.8833, 668.7556, 220.2926]], device='cuda:0')\n",
"Confidence: tensor([0.7513, 0.3399], device='cuda:0')\n",
"Class IDs: tensor([0., 0.], device='cuda:0')\n",
"Number of detections: 2\n"
]
}
],
"source": [
"test_image = \"/home/mlmonster/Projects/ferdzo/vesselDetection/5af55.jpg\"\n",
"results = model(test_image)\n",
"\n",
"for result in results:\n",
" print(f\"Predictions: {result.boxes.xyxy}\")\n",
" print(f\"Confidence: {result.boxes.conf}\")\n",
" print(f\"Class IDs: {result.boxes.cls}\")\n",
" print(f\"Number of detections: {len(result.boxes)}\") \n",
" result.save()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c42e373",
"metadata": {},
"outputs": [],
"source": [
"model.export(format='onnx')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

1
.python-version Normal file
View File

@@ -0,0 +1 @@
3.13

View File

@@ -1,4 +1,42 @@
# vesselDetection
### Ship detection using Machine Learning methods for the course Digital Processing of Image(Дигитално Процесирање на Слика)
This is a simple ship detection model, made using pre-trained YOLO with COCO weights.
Ship detection using YOLO for the course Digital Processing of Image (Дигитално Процесирање на Слика).
This repo now includes a lightweight `autoresearch`-style workflow adapted from `karpathy/autoresearch`: the idea is to let an AI agent iterate on `train.py`, run short fixed-budget experiments, and keep only changes that improve validation quality.
## Files that matter
- `prepare.py` - fixed utilities for dataset checks, runtime overrides, and metric extraction
- `train.py` - the single training file the agent edits
- `program.md` - instructions for the research agent
## Metric
The primary objective is `metrics/mAP50-95(B)` from Ultralytics validation results. Higher is better.
## Setup
Install dependencies with `uv`, make sure the dataset YAML exists at `ships-aerial-images/data.yaml`, then run:
```bash
uv sync
```
## Training
Run the baseline or any experiment with:
```bash
uv run train.py
```
By default, the training script uses a fixed 5-minute budget through the Ultralytics `time` argument and prints a compact summary at the end so an agent can compare runs automatically.
## Autoresearch loop
1. Create a fresh branch such as `autoresearch/mar24`
2. Read `program.md`
3. Run a baseline with `uv run train.py > run.log 2>&1`
4. Iterate only on `train.py`
5. Log outcomes to `results.tsv`
6. Keep only commits that improve `metrics/mAP50-95(B)`

165
prepare.py Normal file
View File

@@ -0,0 +1,165 @@
from __future__ import annotations

import csv
import math
import os
from pathlib import Path
DEFAULT_DATA_PATH = Path("ships-aerial-images/data.yaml")
DEFAULT_PROJECT_DIR = Path("runs/autoresearch")
DEFAULT_TIME_HOURS = 5 / 60
PRIMARY_METRIC_KEY = "metrics/mAP50-95(B)"
def ensure_dataset_exists(data_path: Path) -> None:
if not data_path.exists():
raise FileNotFoundError(
f"Dataset config not found at '{data_path}'. Set YOLO_DATA or add the dataset before training."
)
def env_bool(name: str, default: bool) -> bool:
value = os.getenv(name)
if value is None:
return default
return value.strip().lower() in {"1", "true", "yes", "on"}
def build_train_kwargs(defaults: dict[str, object]) -> dict[str, object]:
kwargs = dict(defaults)
kwargs["project"] = os.getenv("YOLO_PROJECT", str(kwargs["project"]))
kwargs["name"] = os.getenv("YOLO_RUN_NAME", str(kwargs["name"]))
kwargs["exist_ok"] = env_bool("YOLO_EXIST_OK", bool(kwargs.get("exist_ok", True)))
time_override = os.getenv("YOLO_TIME_HOURS")
if time_override:
kwargs["time"] = float(time_override)
device_override = os.getenv("YOLO_DEVICE")
if device_override:
kwargs["device"] = device_override
return kwargs
def resolve_save_dir(
project_dir: Path, run_name: str, expected_save_dir: Path | None = None
) -> Path:
candidates: list[Path] = []
if expected_save_dir is not None:
candidates.append(expected_save_dir)
candidates.append(project_dir / run_name)
for candidate in candidates:
if (candidate / "results.csv").exists():
return candidate
matches = sorted(
(path for path in project_dir.glob(f"{run_name}*") if path.is_dir()),
key=lambda path: path.stat().st_mtime,
reverse=True,
)
for match in matches:
if (match / "results.csv").exists():
return match
return expected_save_dir or (project_dir / run_name)
def _to_float(value: str | None) -> float | None:
if value in {None, "", "nan", "None"}:
return None
try:
return float(value)
except ValueError:
return None
def _first_float(
row: dict[str, str], keys: list[str]
) -> tuple[str | None, float | None]:
for key in keys:
if key in row:
value = _to_float(row.get(key))
if value is not None:
return key, value
return None, None
def extract_experiment_summary(
save_dir: Path,
elapsed_seconds: float,
peak_vram_mb: float,
data_path: Path,
model_name: str,
) -> dict[str, object]:
results_csv = save_dir / "results.csv"
if not results_csv.exists():
raise FileNotFoundError(f"Expected training metrics at '{results_csv}'.")
with results_csv.open("r", encoding="utf-8", newline="") as handle:
rows = list(csv.DictReader(handle))
if not rows:
raise RuntimeError(f"Training metrics file '{results_csv}' is empty.")
last_row = rows[-1]
fitness_key, fitness = _first_float(
last_row, [PRIMARY_METRIC_KEY, "metrics/mAP50(B)", "metrics/precision(B)"]
)
_, precision = _first_float(last_row, ["metrics/precision(B)"])
_, recall = _first_float(last_row, ["metrics/recall(B)"])
_, map50 = _first_float(last_row, ["metrics/mAP50(B)"])
_, map50_95 = _first_float(last_row, [PRIMARY_METRIC_KEY])
_, epoch = _first_float(last_row, ["epoch"])
best_weights = save_dir / "weights/best.pt"
last_weights = save_dir / "weights/last.pt"
return {
"fitness_key": fitness_key or PRIMARY_METRIC_KEY,
"fitness": fitness,
"precision": precision,
"recall": recall,
"map50": map50,
"map50_95": map50_95,
"epoch": epoch,
"training_seconds": elapsed_seconds,
"total_seconds": elapsed_seconds,
"peak_vram_mb": peak_vram_mb,
"data_path": str(data_path),
"model_name": model_name,
"save_dir": str(save_dir),
"results_csv": str(results_csv),
"best_weights": str(best_weights),
"best_weights_exists": best_weights.exists(),
"last_weights": str(last_weights),
"last_weights_exists": last_weights.exists(),
}
def _format_metric(value: float | None, digits: int = 6) -> str:
if value is None:
return "n/a"
return f"{value:.{digits}f}"
def print_experiment_summary(summary: dict[str, object]) -> None:
print("---")
print(f"fitness_key: {summary['fitness_key']}")
print(f"fitness: {_format_metric(summary['fitness'])}")
print(f"training_seconds: {_format_metric(summary['training_seconds'], digits=1)}")
print(f"total_seconds: {_format_metric(summary['total_seconds'], digits=1)}")
print(f"peak_vram_mb: {_format_metric(summary['peak_vram_mb'], digits=1)}")
print(f"precision: {_format_metric(summary['precision'])}")
print(f"recall: {_format_metric(summary['recall'])}")
print(f"map50: {_format_metric(summary['map50'])}")
print(f"map50_95: {_format_metric(summary['map50_95'])}")
print(f"epoch: {_format_metric(summary['epoch'], digits=0)}")
print(f"data_path: {summary['data_path']}")
print(f"model: {summary['model_name']}")
print(f"save_dir: {summary['save_dir']}")
print(f"results_csv: {summary['results_csv']}")
print(f"best_weights: {summary['best_weights']}")
print(f"best_weights_ok: {str(summary['best_weights_exists']).lower()}")
print(f"last_weights: {summary['last_weights']}")
print(f"last_weights_ok: {str(summary['last_weights_exists']).lower()}")

115
program.md Normal file
View File

@@ -0,0 +1,115 @@
# autoresearch
This is an experiment to have the LLM do its own research.
## Setup
To set up a new experiment, work with the user to:
1. **Agree on a run tag**: propose a tag based on today's date (e.g. `mar24`). The branch `autoresearch/<tag>` must not already exist — this is a fresh run.
2. **Create the branch**: `git checkout -b autoresearch/<tag>` from current master.
3. **Read the in-scope files**: The repo is small. Read these files for full context:
- `README.md` — repository context.
- `prepare.py` — fixed runtime utilities, summary extraction, and dataset checks. Do not modify.
- `train.py` — the file you modify. Model choice, optimizer, hyperparameters, image size, and training loop entrypoint all live here.
4. **Verify data exists**: Check that `ships-aerial-images/data.yaml` exists, or that `YOLO_DATA` points to a valid dataset YAML. If not, tell the human to add the dataset first.
5. **Initialize results.tsv**: Create `results.tsv` with just the header row. The baseline will be recorded after the first run.
6. **Confirm and go**: Confirm setup looks good.
Once you get confirmation, kick off the experimentation.
## Experimentation
Each experiment runs through `uv run train.py`.
The training script uses a **fixed 5-minute time budget** through Ultralytics' `time` argument, so experiments are approximately comparable and always short enough to iterate quickly.
**What you CAN do:**
- Modify `train.py` — this is the only file you edit. Everything there is fair game: model size, model weights, image size, batch size, optimizer, learning rate schedule, augmentation knobs, worker count, freeze settings, and similar training parameters.
**What you CANNOT do:**
- Modify `prepare.py`. It is read-only.
- Install new packages or add dependencies. You can only use what's already in `pyproject.toml`.
- Modify the evaluation harness outside the normal Ultralytics validation outputs produced by the training run.
**The goal is simple: get the highest `metrics/mAP50-95(B)`.** Higher is better. Since the time budget is fixed, the core job is to find the best-performing experiment under that fixed budget.
**VRAM** is a soft constraint. Some increase is acceptable for meaningful gains, but avoid ideas that blow up memory or make experiments fragile.
**Simplicity criterion**: All else being equal, simpler is better. A tiny gain that adds ugly complexity is usually not worth it. Removing complexity while keeping equal or better quality is a win.
**The first run**: Your very first run should always be the baseline, so run the training script as is before changing anything.
## Output format
Once the script finishes it prints a summary like this:
```
---
fitness_key: metrics/mAP50-95(B)
fitness: 0.612345
training_seconds: 300.1
total_seconds: 300.1
peak_vram_mb: 8240.5
precision: 0.801234
recall: 0.745678
map50: 0.822222
map50_95: 0.612345
epoch: 18
```
You can extract the key metric from the log file with:
```
grep "^fitness:\|^peak_vram_mb:" run.log
```
## Logging results
When an experiment is done, log it to `results.tsv` (tab-separated, NOT comma-separated — commas break descriptions).
The TSV has a header row and 5 columns:
```
commit metric memory_gb status description
```
1. git commit hash (short, 7 chars)
2. `metrics/mAP50-95(B)` achieved (e.g. 0.612345) — use `0.000000` for crashes
3. peak memory in GB, round to `.1f` (divide `peak_vram_mb` by 1024) — use `0.0` for crashes
4. status: `keep`, `discard`, or `crash`
5. short text description of what the experiment tried
Example:
```
commit metric memory_gb status description
a1b2c3d 0.612345 8.1 keep baseline yolo11l 640 adamw
b2c3d4e 0.618901 9.4 keep increase image size to 768
c3d4e5f 0.605100 7.9 discard reduce batch and switch optimizer
d4e5f6g 0.000000 0.0 crash batch too large caused OOM
```
## The experiment loop
The experiment runs on a dedicated branch (e.g. `autoresearch/mar24`).
LOOP FOREVER:
1. Look at the git state: the current branch and commit.
2. Tune `train.py` with one experimental idea.
3. git commit
4. Run the experiment: `uv run train.py > run.log 2>&1`
5. Read out the results: `grep "^fitness:\|^peak_vram_mb:" run.log`
6. If the grep output is empty, the run crashed. Read the traceback from `run.log`, attempt a fix if it is easy, otherwise mark it as a crash and move on.
7. Record the result in `results.tsv` (do not commit `results.tsv`; leave it untracked).
8. If the metric improved, keep the commit.
9. If the metric is equal or worse, reset back to where you started.
The idea is that you are a completely autonomous researcher trying things out. If they work, keep. If they don't, discard. Advance the branch only with improvements.
**Timeout**: Each experiment should take about 5 minutes total, plus a small amount of overhead. If a run exceeds 10 minutes, kill it and treat it as a failure.
**Crashes**: If a run crashes (OOM, bad hyperparameters, a typo, etc.), use judgment. If it is something dumb and easy to fix, fix it and re-run. If the idea is fundamentally broken, log it as `crash` and move on.
**NEVER STOP**: Once the experiment loop has begun, do not pause to ask whether you should continue. Keep going until the human interrupts you.

12
pyproject.toml Normal file
View File

@@ -0,0 +1,12 @@
[project]
name = "vesseldetection"
version = "0.1.0"
description = "Autoresearch-style YOLO vessel detection experiments"
readme = "README.md"
requires-python = ">=3.10,<3.14"
dependencies = [
"ultralytics>=8.3.0",
]
[tool.uv]
package = false

Binary file not shown.

Before

Width:  |  Height:  |  Size: 336 KiB

BIN
slika.jpg

Binary file not shown.

Before

Width:  |  Height:  |  Size: 292 KiB

BIN
test.jpg

Binary file not shown.

Before

Width:  |  Height:  |  Size: 254 KiB

105
train.py
View File

@@ -1,28 +1,89 @@
from __future__ import annotations
import os
import time
from pathlib import Path
import torch
from ultralytics import YOLO
model = YOLO("yolo11l.pt")
from prepare import (
DEFAULT_DATA_PATH,
DEFAULT_PROJECT_DIR,
DEFAULT_TIME_HOURS,
build_train_kwargs,
ensure_dataset_exists,
extract_experiment_summary,
print_experiment_summary,
resolve_save_dir,
)
data_path = 'ships-aerial-images/data.yaml'
# The agent is expected to iterate on this file only.
MODEL_WEIGHTS = os.getenv("YOLO_MODEL", "yolo11l.pt")
DATA_PATH = Path(os.getenv("YOLO_DATA", str(DEFAULT_DATA_PATH)))
RUN_NAME = "vessel_detection_yolo11l"
train_params = {
'epochs': 40,
'batch': 32,
'imgsz': 640,
'lr0': 5e-4,
'lrf': 0.1,
'warmup_epochs': 5,
'warmup_bias_lr': 1e-6,
'momentum': 0.937,
'weight_decay': 0.0001,
'optimizer': 'AdamW',
'device': '0,1',
'project': 'runs/train',
'name': 'vessel_deteciton_v11l',
'exist_ok': True,
'save_period': 2,
'workers': 8,
'patience': 20,
'cos_lr': True,
TRAIN_PARAMS = {
"epochs": 40,
"time": DEFAULT_TIME_HOURS,
"batch": 32,
"imgsz": 640,
"lr0": 5e-4,
"lrf": 0.1,
"warmup_epochs": 5,
"warmup_bias_lr": 1e-6,
"momentum": 0.937,
"weight_decay": 1e-4,
"optimizer": "AdamW",
"device": "0",
"project": str(DEFAULT_PROJECT_DIR),
"name": RUN_NAME,
"exist_ok": True,
"save_period": 2,
"workers": 8,
"patience": 20,
"cos_lr": True,
"seed": 42,
"deterministic": True,
"plots": False,
}
model.train(data=data_path, **train_params)
def main() -> None:
ensure_dataset_exists(DATA_PATH)
train_kwargs = build_train_kwargs(TRAIN_PARAMS)
save_dir = Path(str(train_kwargs["project"])) / str(train_kwargs["name"])
if torch.cuda.is_available():
torch.cuda.reset_peak_memory_stats()
model = YOLO(MODEL_WEIGHTS)
start_time = time.time()
train_result = model.train(data=str(DATA_PATH), **train_kwargs)
elapsed_seconds = time.time() - start_time
peak_vram_mb = (
torch.cuda.max_memory_allocated() / 1024 / 1024
if torch.cuda.is_available()
else 0.0
)
result_save_dir = getattr(train_result, "save_dir", None)
save_dir = resolve_save_dir(
project_dir=Path(str(train_kwargs["project"])),
run_name=str(train_kwargs["name"]),
expected_save_dir=Path(result_save_dir) if result_save_dir else save_dir,
)
summary = extract_experiment_summary(
save_dir=save_dir,
elapsed_seconds=elapsed_seconds,
peak_vram_mb=peak_vram_mb,
data_path=DATA_PATH,
model_name=MODEL_WEIGHTS,
)
print_experiment_summary(summary)
if __name__ == "__main__":
main()

1454
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff