#!/usr/bin/env python3
"""
RW_Site_Scraper-For_Buisness.py
===============================

Scrapes Red Wing Safety "For Business" safety boots catalogue:
- https://www.redwingsafety.com/safety-boots/page-1/maxnum-0?catalog=international

Test mode:
- Set ONLY_STYLE at the top of this file to a style number (e.g. "400")
  to scrape just that single product.

Outputs (next to this script):
- RW_Site_Scrape.md
- RW_Site_Scraper_checkpoint.json   (resume state)
- RW_Site_Scraper_errors.txt        (links that failed repeatedly)

Stability features:
- `safe_get()` uses short timeouts + window.stop() so Selenium doesn't hang forever
- HTTP fallback (no Selenium) for the few product pages that still time out
- Optional salvage pass at the end to retry hard failures (HTTP-first)

Cross-platform (Windows + Linux Cinnamon):
- Headless Firefox (default)
- Geckodriver resolution "like Parts_Auto" (explicit Service path; no Selenium Manager):
    1) GECKODRIVER_PATH env var (file or directory)
    2) geckodriver(.exe) on PATH
    3) auto-download geckodriver (GitHub releases) into a user cache dir
    4) (optional) if double-click/no terminal and Tk is available, prompt to pick geckodriver

Feature columns are 1/0 (not Yes/No).
Includes Brand (string) + brand family flags (Red Wing / Irish Setter / Worx).

Dependencies:
- Python 3.9+
- Firefox installed
- Selenium installed:
    Linux Mint/Ubuntu: sudo apt install -y python3-selenium
    Windows: python -m pip install selenium

Notes for Linux Mint PEP 668:
- Prefer `python3-selenium` from apt (as above).
- This script does NOT require webdriver-manager.
"""

from __future__ import annotations

import json
import os
import platform
import re
import shutil
import stat
import sys
import tarfile
import time
import traceback
import random
import requests
import html as html_lib
from html.parser import HTMLParser
import zipfile
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
from urllib.request import Request, urlopen

# NOTE: Style/Name extraction is now done via BeautifulSoup using the exact
# logic requested (shoeguide/printSpacing -> h3 + strong). This intentionally
# avoids any other name/title fallbacks.
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


# -----------------------------
# Config
# -----------------------------
# Catalog pages to scan for product links.
# Per request: ONLY use the international catalog.
CATALOG_URLS = [
    "https://www.redwingsafety.com/safety-boots/page-1/maxnum-0?catalog=international",
]
CATALOG_URL_PRIMARY = CATALOG_URLS[0]

# Optional: scrape only a single style number (set "" to scrape all discovered styles)
ONLY_STYLE: str = ""
# All outputs and state files are written next to this script.
BASE_DIR = Path(__file__).resolve().parent
# Re-discover catalog links on every run so newly added products are picked up.
REFRESH_PRODUCT_LINKS_EACH_RUN = os.environ.get("RW_REFRESH_LINKS", "1").strip().lower() in {"1", "true", "yes", "y", "on"}

# Output table, resume state, and repeated-failure log locations.
OUT_MD = BASE_DIR / "RW_Site_Scrape.md"
CHECKPOINT = BASE_DIR / "RW_Site_Scraper_checkpoint.json"
ERRORS_TXT = BASE_DIR / "RW_Site_Scraper_errors.txt"

# A product link is considered "hard failed" after this many failed attempts.
MAX_FAILS_PER_LINK = 3


# Auto-repair: if old checkpoint rows have clearly-bad Name/Brand (e.g. "RED WING FOR BUSINESS"),
# automatically un-mark those links as done so they get re-scraped on the next run.
AUTO_REPAIR_BAD_ROWS = True
BAD_NAME_SENTINEL = "RED WING FOR BUSINESS"

# Prefer names as shown on the International catalog listing page.
# Disabled: listing tiles can include UI strings like "Save to list".
PREFER_INTERNATIONAL_LISTING_NAMES = False


# Column order of the output Markdown table; feature columns hold 1/0 flags.
MD_HEADERS = [
    "Style #", "Name", "URL", "Image", "Brand",
    "Male", "Female",
    "Red Wing", "Irish Setter", "Worx",
    "Safety Toe", "Steel Toe", "Non-Metallic Toe", "Aluminum Toe", "Metatarsal Guard", "Soft Toe",
    "Waterproof", "Insulation", "Slip Resistant", "Electrical Hazard", "Puncture Resistant",
    "Static Dissipative", "Ankle Protection", "BOA® Lacing System",
    "Defined Heel", "All Leather Upper", "Resoleable",
    "Oxford/Athletic", "Chukka", "Hiker",
    '5"', '6"', '7"', '8"', '9"', '10"', '11"', '12"',
    "Built in USA", "Made in USA",
]

# Checkpoint rows shorter than this are treated as malformed and re-queued.
EXPECTED_COLS = len(MD_HEADERS)


# Re-queue rows with missing URL/Image cells (see repair_missing_media).
AUTO_REPAIR_MISSING_MEDIA = True

# Navigation tuning (Red Wing pages sometimes never fully 'finish' loading)
PAGE_LOAD_TIMEOUT = 45
SCRIPT_TIMEOUT = 30
NAV_SETTLE_SECONDS = 1.25

# Small polite delay between product navigations to reduce transient server hiccups
REQUEST_DELAY_RANGE = (0.15, 0.55)

# After the main pass, retry hard-failed links once with a fresh driver + more conservative settings
ENABLE_SALVAGE_PASS = True
SALVAGE_PAGE_LOAD_TIMEOUT = 90
SALVAGE_MAX_TRIES_PER_LINK = 2

# Bump this to invalidate old checkpoints (load_checkpoint archives mismatches).
CHECKPOINT_VERSION = 9
# Optional: set this environment variable to explicitly point at geckodriver
#   Linux:   export GECKODRIVER_PATH="/home/you/bin/geckodriver"
#   Windows: setx GECKODRIVER_PATH "C:\Users\you\bin\geckodriver.exe"
GECKODRIVER_PATH = os.environ.get("GECKODRIVER_PATH", "").strip() or None


# -----------------------------
# Small helpers
# -----------------------------
def b01(v: bool) -> str:
    """Render a truthy/falsy value as the table flag string "1" or "0"."""
    return str(int(bool(v)))


def md_escape_cell(s: str) -> str:
    """Make *s* safe for a Markdown table cell: escape pipes, collapse whitespace."""
    escaped = (s or "").replace("|", r"\|")
    return re.sub(r"\s+", " ", escaped).strip()


def atomic_write(path: Path, text: str) -> None:
    """Write *text* to *path* atomically: dump to a sibling .tmp file, then rename over."""
    scratch = path.with_suffix(path.suffix + ".tmp")
    scratch.write_text(text, encoding="utf-8")
    scratch.replace(path)


def _fresh_checkpoint() -> Dict:
    """Return a brand-new checkpoint dict containing every key the scraper uses."""
    return {
        "version": CHECKPOINT_VERSION,
        "catalog_urls": CATALOG_URLS,
        "product_links": [],
        "done_links": [],
        "rows": [],
        "preferred_names": {},
        "media_repair_done": False,
        "fail_counts": {},
        "hard_failed_links": [],
        "started_at": time.time(),
        "geckodriver_path": None,
    }


def load_checkpoint() -> Dict:
    """Load the resume-state checkpoint, migrating or replacing stale files.

    Returns a dict guaranteed to contain every key of ``_fresh_checkpoint()``:
    - Missing file        -> fresh state.
    - Version mismatch    -> old file archived as ``*.json.vN.bak``, fresh state.
    - Corrupt/unparseable -> file archived as ``*.json.corrupt``, fresh state.
    - Otherwise           -> parsed state with any missing keys backfilled.

    BUGFIX: the three default-state paths previously used divergent hand-copied
    dicts — the version-mismatch branch dropped ``preferred_names``,
    ``media_repair_done`` and ``hard_failed_links``, and the backfill loop
    missed two keys. All paths now share one source of truth.
    """
    if not CHECKPOINT.exists():
        return _fresh_checkpoint()
    try:
        data = json.loads(CHECKPOINT.read_text(encoding="utf-8"))
        old_ver = int(data.get("version", 0))
        if old_ver != CHECKPOINT_VERSION:
            # Keep the outdated file around for debugging, then start over.
            try:
                CHECKPOINT.replace(CHECKPOINT.with_suffix(f".json.v{old_ver}.bak"))
            except Exception:
                pass
            return _fresh_checkpoint()
        # Backfill any keys introduced after this checkpoint was written.
        for key, default in _fresh_checkpoint().items():
            data.setdefault(key, default)
        return data
    except Exception:
        # Unreadable/corrupt JSON: move it aside and start fresh.
        try:
            CHECKPOINT.replace(CHECKPOINT.with_suffix(".json.corrupt"))
        except Exception:
            pass
        return _fresh_checkpoint()


def save_checkpoint(data: Dict) -> None:
    """Persist checkpoint *data* as pretty-printed, key-sorted JSON (atomic replace)."""
    payload = json.dumps(data, indent=2, sort_keys=True)
    atomic_write(CHECKPOINT, payload)


def write_errors_line(line: str) -> None:
    """Append one trimmed line to the persistent error log."""
    with ERRORS_TXT.open("a", encoding="utf-8") as fh:
        fh.write(f"{line.rstrip()}\n")


def _can_use_tk() -> bool:
    try:
        import tkinter  # noqa: F401
        return True
    except Exception:
        return False


def _tk_pick_file(title: str) -> Optional[str]:
    """Ask the user to pick the geckodriver binary via a Tk file dialog.

    Returns the selected path, or None when: running in a terminal, Tk is
    unavailable, the dialog is cancelled, or any Tk error occurs.
    """
    # Only use this when not running in a terminal (double-click scenario).
    if sys.stdout.isatty() or not _can_use_tk():
        return None
    try:
        import tkinter as tk
        from tkinter import filedialog, messagebox

        root = tk.Tk()
        root.withdraw()
        root.attributes("-topmost", True)

        messagebox.showinfo(
            "RW Site Scraper",
            "Could not find geckodriver automatically.\n\n"
            "Please select the geckodriver executable.\n"
            "Windows: geckodriver.exe\nLinux: geckodriver"
        )
        choice = filedialog.askopenfilename(title=title)
        root.destroy()
        choice = (choice or "").strip()
        return choice or None
    except Exception:
        return None


class ProgressReporter:
    """TTY progress bar, or Tk window if launched by double-click (no TTY)."""

    def __init__(self, total: int, started_at: float):
        # Clamp total to >= 1 so percentage/ETA math never divides by zero.
        self.total = max(int(total), 1)
        self.started_at = started_at
        self.use_tty = sys.stdout.isatty()
        self.gui = False
        self._root = None
        self._label = None
        self._pbar = None

        # No terminal attached (double-click launch): try a small Tk window.
        if (not self.use_tty) and _can_use_tk():
            try:
                import tkinter as tk
                from tkinter import ttk

                self._root = tk.Tk()
                self._root.title("RW Site Scraper")
                self._root.geometry("620x150")
                self._root.resizable(False, False)

                self._label = tk.Label(self._root, text="Starting...", anchor="w")
                self._label.pack(fill="x", padx=12, pady=(12, 6))

                self._pbar = ttk.Progressbar(self._root, maximum=self.total, length=580)
                self._pbar.pack(padx=12, pady=(0, 10))

                self.gui = True
                self._root.update_idletasks()
                self._root.update()
            except Exception:
                # Tk failed to start (e.g. no display): fall back to stdout mode.
                self.gui = False

    def update(self, current: int, note: str = "") -> None:
        """Render progress ``current``/``total`` with an optional status *note*."""
        # Keep current within [0, total] so the bar never over/underflows.
        current = max(0, min(int(current), self.total))

        # GUI mode: refresh label + bar and pump the Tk event loop manually
        # (there is no mainloop running in this script).
        if self.gui and self._root:
            msg = f"{current}/{self.total}  {note}".strip()
            if self._label:
                self._label.config(text=msg)
            if self._pbar:
                self._pbar["value"] = current
            self._root.update_idletasks()
            self._root.update()
            return

        # Text mode: a single line rewritten in place via carriage return.
        width = 32
        frac = current / self.total
        filled = int(round(frac * width))
        bar = "#" * filled + "-" * (width - filled)

        # Naive ETA from average throughput since start (0 until rate is known).
        elapsed = max(time.time() - self.started_at, 0.0001)
        rate = current / elapsed
        eta = (self.total - current) / rate if rate > 1e-9 else 0.0

        msg = f"[{bar}] {current}/{self.total} ({frac*100:5.1f}%) ETA {int(eta)}s"
        if note:
            msg += f"  {note}"

        # Cap line length so very long notes don't wrap and break the rewrite.
        sys.stdout.write("\r" + msg[:240])
        sys.stdout.flush()
        if current == self.total:
            # Finish the line so subsequent prints start on a fresh row.
            sys.stdout.write("\n")
            sys.stdout.flush()

    def close(self) -> None:
        """Tear down the Tk window, if one was created."""
        if self.gui and self._root:
            try:
                self._root.destroy()
            except Exception:
                pass


# -----------------------------
# Geckodriver resolution / install
# -----------------------------
def _cache_dir() -> Path:
    if os.name == "nt":
        base = os.environ.get("LOCALAPPDATA") or os.environ.get("APPDATA") or str(Path.home())
        d = Path(base) / "RW_Site_Scraper"
    else:
        base = os.environ.get("XDG_CACHE_HOME") or str(Path.home() / ".cache")
        d = Path(base) / "rw_site_scraper"
    d.mkdir(parents=True, exist_ok=True)
    return d


def _resolve_from_env_or_path() -> Optional[str]:
    """Locate an existing geckodriver binary without downloading anything.

    Search order: GECKODRIVER_PATH env var (file or directory), the script's
    own directory, the scraper cache dir, common user bin dirs (non-Windows),
    then PATH. Returns the path string, or None when nothing is found.
    """
    names = ("geckodriver.exe", "geckodriver")

    # 1) explicit env var (file or directory)
    if GECKODRIVER_PATH:
        env_path = Path(GECKODRIVER_PATH)
        if env_path.is_file():
            return str(env_path)
        if env_path.is_dir():
            for name in names:
                cand = env_path / name
                if cand.is_file():
                    return str(cand)

    # 2) alongside this script, then 3) the scraper cache dir
    for folder in (BASE_DIR, _cache_dir()):
        for name in names:
            cand = folder / name
            if cand.is_file():
                return str(cand)

    # 4) common user bin dirs (linux)
    if os.name != "nt":
        for folder in (Path.home() / "bin", Path.home() / ".local" / "bin"):
            cand = folder / "geckodriver"
            if cand.is_file():
                return str(cand)

    # 5) anything on PATH
    return shutil.which("geckodriver") or shutil.which("geckodriver.exe") or None


def _platform_asset_key() -> Tuple[str, str]:
    """
    Returns (asset_contains, archive_type) matching geckodriver releases.
    """
    sysplat = sys.platform.lower()
    mach = platform.machine().lower()

    if sysplat.startswith("win"):
        is_64 = "64" in platform.architecture()[0]
        return ("win64" if is_64 else "win32", "zip")

    if sysplat.startswith("linux"):
        if "aarch64" in mach or "arm64" in mach:
            return ("linux-aarch64", "tar.gz")
        return ("linux64", "tar.gz")

    if sysplat == "darwin":
        if "arm64" in mach or "aarch64" in mach:
            return ("macos-aarch64", "tar.gz")
        return ("macos", "tar.gz")

    return ("linux64", "tar.gz")


def _download_latest_geckodriver(dest_dir: Path) -> Path:
    """
    Downloads and extracts latest geckodriver into dest_dir.
    Returns path to extracted driver.
    """
    asset_key, archive_type = _platform_asset_key()

    # Ask the GitHub API for the latest release metadata.
    api = "https://api.github.com/repos/mozilla/geckodriver/releases/latest"
    api_req = Request(api, headers={"User-Agent": "RW_Site_Scraper/1.0"})
    with urlopen(api_req, timeout=30) as resp:
        release = json.loads(resp.read().decode("utf-8", errors="replace"))

    # Pick the first asset matching our platform key and archive extension.
    dl_url = None
    dl_name = None
    for asset in release.get("assets", []):
        asset_name = asset.get("name", "")
        if asset_key in asset_name and asset_name.endswith(archive_type):
            dl_url = asset.get("browser_download_url")
            dl_name = asset_name
            break

    if not dl_url:
        raise RuntimeError(f"Could not find a geckodriver asset for {asset_key} ({archive_type}).")

    # Stream the archive to disk.
    archive_path = dest_dir / dl_name
    dl_req = Request(dl_url, headers={"User-Agent": "RW_Site_Scraper/1.0"})
    with urlopen(dl_req, timeout=60) as resp, open(archive_path, "wb") as out:
        shutil.copyfileobj(resp, out)

    driver_name = "geckodriver.exe" if os.name == "nt" else "geckodriver"
    extracted_path = dest_dir / driver_name

    # Pull just the driver binary out of the archive (zip on Windows, tar.gz elsewhere).
    if archive_type == "zip":
        with zipfile.ZipFile(archive_path, "r") as zf:
            for member in zf.namelist():
                if member.endswith(driver_name):
                    zf.extract(member, path=dest_dir)
                    (dest_dir / member).replace(extracted_path)
                    break
    else:
        with tarfile.open(archive_path, "r:gz") as tf:
            for member in tf.getmembers():
                if member.name == driver_name or member.name.endswith("/" + driver_name):
                    tf.extract(member, path=dest_dir)
                    (dest_dir / member.name).replace(extracted_path)
                    break

    # Best-effort cleanup of the downloaded archive.
    try:
        archive_path.unlink(missing_ok=True)
    except Exception:
        pass

    # Make sure the binary is executable on POSIX systems.
    if os.name != "nt":
        try:
            mode = os.stat(extracted_path).st_mode
            os.chmod(extracted_path, mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
        except Exception:
            pass

    if not extracted_path.exists():
        raise RuntimeError("Download succeeded but geckodriver was not extracted.")

    return extracted_path


def ensure_geckodriver(ck: Dict) -> str:
    """Return a usable geckodriver path, resolving in this order:

    1) env var / script dir / cache dir / PATH (no download),
    2) the path remembered in the checkpoint from a previous run,
    3) a fresh download into the cache dir (path saved back to checkpoint),
    4) an interactive Tk file-picker (double-click scenario only).

    Raises RuntimeError with remediation hints when every option fails.
    """
    found = _resolve_from_env_or_path()
    if found and Path(found).exists():
        return found

    remembered = (ck.get("geckodriver_path") or "").strip()
    if remembered and Path(remembered).exists():
        return remembered

    try:
        downloaded = _download_latest_geckodriver(_cache_dir())
        ck["geckodriver_path"] = str(downloaded)
        save_checkpoint(ck)
        return str(downloaded)
    except Exception as e:
        # Last resort: let the user point us at the driver (no-TTY launches only).
        picked = _tk_pick_file("Select geckodriver / geckodriver.exe")
        if picked and Path(picked).exists():
            ck["geckodriver_path"] = picked
            save_checkpoint(ck)
            return picked

        raise RuntimeError(
            "Unable to locate or install geckodriver.\n\n"
            "Fix options:\n"
            "  - Put geckodriver on PATH\n"
            "  - OR set GECKODRIVER_PATH to the full driver path\n"
            "  - OR install via package manager (Linux often: sudo apt install firefox-geckodriver)\n\n"
            f"Underlying error: {e}"
        )


def create_driver(
    ck: Dict,
    headless: bool = True,
    *,
    page_load_strategy: str = "eager",
    block_images: bool = True,
    page_load_timeout: int = PAGE_LOAD_TIMEOUT,
    user_agent: Optional[str] = None,
) -> webdriver.Firefox:
    """Start Firefox using an explicit geckodriver path (no Selenium Manager).

    Args:
        ck: Checkpoint dict (may be updated with the resolved geckodriver path).
        headless: Run Firefox without a visible window.
        page_load_strategy: "eager" returns after DOMContentLoaded. "normal" waits for full load.
        block_images: If True, blocks images to reduce load stalls/timeouts.
        page_load_timeout: Seconds for Selenium navigation timeout.
        user_agent: Optional user-agent string override.
    """
    gecko_path = ensure_geckodriver(ck)

    opts = Options()
    if headless:
        opts.add_argument("-headless")
    if page_load_strategy:
        opts.set_capability("pageLoadStrategy", page_load_strategy)

    # Safe stability prefs
    opts.set_preference("dom.webnotifications.enabled", False)
    opts.set_preference("media.volume_scale", "0.0")
    opts.set_preference("browser.privatebrowsing.autostart", True)
    opts.set_preference("network.http.http3.enable", False)

    if block_images:
        # 2 = block images
        opts.set_preference("permissions.default.image", 2)

    if user_agent:
        opts.set_preference("general.useragent.override", user_agent)

    browser = webdriver.Firefox(
        service=FirefoxService(executable_path=gecko_path),
        options=opts,
    )
    browser.set_page_load_timeout(page_load_timeout)
    browser.set_script_timeout(SCRIPT_TIMEOUT)
    return browser


# -----------------------------
# Navigation + HTTP fallback helpers
# -----------------------------

def _polite_delay() -> None:
    """Pause a short random interval so product requests stay polite to the server."""
    low, high = REQUEST_DELAY_RANGE
    try:
        time.sleep(random.uniform(low, high))
    except Exception:
        # If the RNG/sleep call misbehaves, still honor the minimum delay.
        time.sleep(low)


def safe_get(
    driver,
    url: str,
    timeout: int = PAGE_LOAD_TIMEOUT,
    settle: float = NAV_SETTLE_SECONDS,
    max_tries: int = 2,
) -> None:
    """Navigate without getting stuck on pages that never fully finish loading.

    - Uses page_load_timeout
    - On timeout, calls window.stop() and continues if the DOM exists
    - Retries with light cleanup between attempts

    Args:
        driver: Selenium WebDriver instance.
        url: Destination URL.
        timeout: Per-attempt page-load timeout (seconds).
        settle: Seconds to let the DOM settle after navigation.
        max_tries: Total navigation attempts before re-raising the last error.

    Raises:
        TimeoutException / WebDriverException: last navigation error if all
        attempts fail.
    """
    last_exc: Optional[BaseException] = None

    # BUGFIX: was `range(1, max_tries + 2)`, which performed max_tries + 1
    # attempts and silently exceeded the caller's requested budget.
    for attempt in range(1, max_tries + 1):
        try:
            driver.set_page_load_timeout(timeout)
            _polite_delay()
            # Python-side HTTP timeout to geckodriver (prevents indefinite hangs in driver.get)
            try:
                ce = getattr(driver, "command_executor", None)
                if ce is not None:
                    # Selenium 4: RemoteConnection holds a ClientConfig with a request timeout
                    if hasattr(ce, "_client_config") and hasattr(ce._client_config, "timeout"):
                        ce._client_config.timeout = int(timeout) + 10
                    # urllib3 PoolManager may also have a timeout attribute
                    if hasattr(ce, "_conn") and hasattr(ce._conn, "timeout"):
                        ce._conn.timeout = int(timeout) + 10
            except Exception:
                pass
            driver.get(url)

            # Give the DOM a moment to settle
            time.sleep(settle)

            # If we have any DOM, we can proceed.
            try:
                driver.find_element(By.TAG_NAME, "body")
            except Exception:
                pass
            return

        except TimeoutException as e:
            last_exc = e
            # Stop the never-finishing load; whatever DOM exists may be enough.
            try:
                driver.execute_script("window.stop();")
            except Exception:
                pass
            time.sleep(settle)
            try:
                driver.find_element(By.TAG_NAME, "body")
                return
            except Exception:
                pass

        except WebDriverException as e:
            last_exc = e

        # Cleanup between retries: blank page plus a small linear backoff.
        try:
            driver.get("about:blank")
        except Exception:
            pass
        time.sleep(0.25 * attempt)

    if last_exc:
        raise last_exc



def _url_variants(url: str) -> List[str]:
    """Return a small set of URL variants (www/non-www) to dodge occasional redirects."""
    outs: List[str] = []
    for u in [url]:
        if u not in outs:
            outs.append(u)
    if "//www." in url:
        u2 = url.replace("//www.", "//", 1)
        if u2 not in outs:
            outs.append(u2)
    else:
        u2 = url.replace("//", "//www.", 1)
        if u2 not in outs:
            outs.append(u2)
    return outs


def _fetch_html(url: str, timeout: int = 30) -> str:
    """Fetch *url* over plain HTTP(S) — no Selenium — and return decoded HTML text."""
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    with urlopen(Request(url, headers=browser_headers), timeout=timeout) as resp:
        payload = resp.read()
    # best-effort decode
    return payload.decode("utf-8", errors="replace")


class _VisibleTextExtractor(HTMLParser):
    """Very small HTML->visible text extractor (no external deps)."""

    _BLOCK_TAGS = {
        "p", "div", "br", "li", "tr", "td", "th",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "section", "article", "header", "footer",
    }

    def __init__(self):
        super().__init__()
        self.parts: List[str] = []

    def handle_starttag(self, tag, attrs):
        if tag.lower() in self._BLOCK_TAGS:
            self.parts.append("\n")

    def handle_endtag(self, tag):
        if tag.lower() in self._BLOCK_TAGS:
            self.parts.append("\n")

    def handle_data(self, data):
        if data and data.strip():
            self.parts.append(data)


def _html_to_text(html: str) -> str:
    """Convert raw HTML into roughly line-broken visible text (best effort)."""
    extractor = _VisibleTextExtractor()
    try:
        extractor.feed(html)
    except Exception:
        # Malformed HTML: keep whatever was parsed before the failure.
        pass
    text = html_lib.unescape("".join(extractor.parts))
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r"[\t\r]+", " ", text)
    return text.strip()


def _extract_first_h1(html: str) -> str:
    m = re.search(r"<h1\b[^>]*>(.*?)</h1>", html, flags=re.IGNORECASE | re.DOTALL)
    if not m:
        return ""
    inner = m.group(1)
    inner = re.sub(r"<[^>]+>", " ", inner)
    inner = html_lib.unescape(inner)
    inner = re.sub(r"\s+", " ", inner).strip()
    return inner




def _extract_first_heading(html: str) -> str:
    """Extract a likely product title from HTML.

    Many Red Wing product pages place the actual product name in <h3>, while <h1>
    can be a site/banner header (e.g., 'RED WING FOR BUSINESS'). Preference
    order: the 'Name' row of the productInfo table, then the first meaningful
    <h3>, then the first <h1>.
    """
    banned = {"RED WING FOR BUSINESS"}
    section_headers = {
        "FEATURES", "SPECIFICATIONS", "TECHNOLOGY", "DETAILS",
        "SIZE & FIT", "SIZING", "CARE", "REVIEWS", "RELATED PRODUCTS",
    }

    def _strip_markup(inner: str) -> str:
        # Drop tags, decode entities, normalize whitespace.
        inner = re.sub(r"<[^>]+>", " ", inner)
        inner = html_lib.unescape(inner)
        return re.sub(r"\s+", " ", inner).strip()

    def _plausible(candidate: str) -> bool:
        candidate = (candidate or "").strip()
        if not candidate:
            return False
        upper = candidate.upper()
        if upper in banned or upper in section_headers:
            return False
        if len(upper) <= 3:
            return False
        # Must contain at least one letter.
        return bool(re.search(r"[A-Z]", upper))

    # Prefer the 'Name' field from the productInfo table when present
    name_row = re.search(
        r"<td[^>]*class=['\"][^'\"]*prTitle[^'\"]*['\"][^>]*>\s*Name\s*</td>\s*"
        r"<td[^>]*class=['\"][^'\"]*prValue[^'\"]*['\"][^>]*>(.*?)</td>",
        html,
        flags=re.IGNORECASE | re.DOTALL,
    )
    if name_row:
        text = _strip_markup(name_row.group(1))
        if _plausible(text):
            return text

    # Fall back to the first meaningful <h3>
    for h3 in re.finditer(r"<h3\b[^>]*>(.*?)</h3>", html, flags=re.IGNORECASE | re.DOTALL):
        text = _strip_markup(h3.group(1))
        if _plausible(text):
            return text

    # Last resort: the page <h1> (may be the site banner)
    return _extract_first_h1(html)

# -----------------------------
# Scraping logic
# -----------------------------

def extract_style_and_name_from_html(html: str, url: str) -> Tuple[str, str, str]:
    """Extract (style_number, name, style_text) using the ONLY allowed method.

    IMPORTANT: Per user instruction, this function intentionally uses *only* the
    shoeguide/printSpacing -> h3 + strong logic (no fallbacks). When the page
    lacks that structure, the resulting AttributeError is deliberate — the
    caller treats it as a scrape failure.
    """
    soup = BeautifulSoup(html, "html.parser")
    container = soup.find("div", class_="shoeguide") or soup.find(id="printSpacing")

    heading_text = container.find("h3").get_text(" ", strip=True)
    hit = re.search(r"#\s*(\d+)", heading_text)
    style_number = hit.group(1) if hit else heading_text.strip()

    name = container.find("strong").get_text(" ", strip=True)
    return style_number, name, heading_text
def dismiss_popups(driver) -> None:
    """Best-effort click-through of cookie/consent/close overlays (bounded to ~2s)."""
    candidates = [
        "//button[contains(.,'Agree') or contains(.,'Accept') or contains(.,'Proceed')]",
        "//a[contains(.,'Agree') or contains(.,'Accept') or contains(.,'Proceed')]",
        "//button[contains(.,'Close') or contains(.,'×') or contains(@aria-label,'Close')]",
    ]
    deadline = time.time() + 2.0
    while time.time() < deadline:
        for xp in candidates:
            try:
                el = driver.find_element(By.XPATH, xp)
                if el.is_displayed() and el.is_enabled():
                    el.click()
                    break
            except Exception:
                continue
        else:
            # Nothing clickable was found this round; we're done.
            return
        # Something was dismissed; pause briefly and look for more overlays.
        time.sleep(0.2)

def scroll_to_load_all(driver, max_rounds: int = 80) -> None:
    """Scroll the catalog page until the product-link count stops growing.

    Stops once the unique link count is unchanged for 3 consecutive rounds,
    or after *max_rounds* scrolls, whichever comes first.
    """
    unchanged_rounds = 0
    previous_count = -1
    for _ in range(max_rounds):
        dismiss_popups(driver)

        anchors = driver.find_elements(By.XPATH, "//a[contains(@href,'/safety-boot/')]")
        hrefs = {a.get_attribute("href") for a in anchors if a.get_attribute("href")}
        if len(hrefs) == previous_count:
            unchanged_rounds += 1
        else:
            unchanged_rounds = 0
            previous_count = len(hrefs)
        if unchanged_rounds >= 3:
            break
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.0)



def _clean_listing_name(s: str) -> str:
    s = (s or "").strip()
    s = re.sub(r"\s+", " ", s)
    if not s:
        return ""
    u = s.upper()
    if u in {
        "RED WING FOR BUSINESS",
        "FEATURES",
        "SPECIFICATIONS",
        "TECHNOLOGY",
        "DETAILS",
        "VIEW DETAILS",
        "QUICK VIEW",
        "ADD TO CART",
    }:
        return ""
    if re.fullmatch(r"\d{2,}", s):
        return ""
    if "$" in s:
        return ""
    if len(s) < 3:
        return ""
    return s


def _listing_name_from_anchor(a) -> str:
    """Best-effort: read the product name as displayed on a catalog listing tile."""
    # Gather raw text from aria-label, title, and visible text, tolerating errors.
    raw_sources = []
    for read in (
        lambda: a.get_attribute("aria-label") or "",
        lambda: a.get_attribute("title") or "",
        lambda: a.text or "",
    ):
        try:
            raw_sources.append(read())
        except Exception:
            pass

    # Clean each line of each source and drop UI junk.
    cleaned = []
    for src in raw_sources:
        if not src:
            continue
        for line in str(src).splitlines():
            line = _clean_listing_name(line)
            if line:
                cleaned.append(line)

    # De-dup while preserving order
    seen = set()
    unique = []
    for line in cleaned:
        if line not in seen:
            seen.add(line)
            unique.append(line)

    if not unique:
        return ""

    def score(s: str) -> int:
        # Favor longer, letter-bearing, gendered product names; penalize UI verbs.
        sc = len(s)
        if re.search(r"[A-Za-z]", s):
            sc += 15
        if re.search(r"\bWOMEN'?S\b", s.upper()):
            sc += 10
        if re.search(r"\bMEN'?S\b", s.upper()):
            sc += 8
        if any(t in s.lower() for t in ("view", "quick", "cart", "compare", "wishlist")):
            sc -= 30
        return sc

    # max() returns the earliest highest-scoring candidate, matching a stable
    # reverse sort followed by taking the first element.
    return max(unique, key=score)


def apply_preferred_names(rows: List[List[str]], preferred: Dict[str, str]) -> int:
    """Overwrite Name column using preferred mapping (by style). Returns number updated."""
    if not rows or not preferred:
        return 0
    changed = 0
    for row in rows:
        if not isinstance(row, list) or len(row) < 2:
            continue
        style = (row[0] or "").strip()
        if not style:
            continue
        wanted = _clean_listing_name(preferred.get(style, ""))
        if wanted and row[1] != wanted:
            row[1] = wanted
            changed += 1
    return changed

def collect_product_links(driver) -> Tuple[List[str], Dict[str, str]]:
    """Collect unique product links from all configured catalog sections.

    Dedupes by style number when possible (preferred), otherwise by URL.
    Returns (links, preferred_names); preferred_names maps style -> listing
    name and is only populated when PREFER_INTERNATIONAL_LISTING_NAMES is on.
    """
    links: List[str] = []
    seen_hrefs: Set[str] = set()
    seen_styles: Set[str] = set()
    preferred_names: Dict[str, str] = {}

    wait = WebDriverWait(driver, 25)

    # Walk every configured catalog page (currently only the international one).
    for url in CATALOG_URLS:
        _polite_delay()
        safe_get(driver, url)
        wait.until(EC.presence_of_element_located((By.XPATH, "//a[contains(@href,'/safety-boot/')]")))
        dismiss_popups(driver)
        scroll_to_load_all(driver)

        for anchor in driver.find_elements(By.XPATH, "//a[contains(@href,'/safety-boot/')]"):
            href = anchor.get_attribute("href")
            if not href:
                continue
            if "/safety-boot/" not in href or href in seen_hrefs:
                continue

            style = extract_style_from_url(href)
            if PREFER_INTERNATIONAL_LISTING_NAMES and style and "catalog=international" in url:
                try:
                    nm = _clean_listing_name(_listing_name_from_anchor(anchor))
                    if nm and style not in preferred_names:
                        preferred_names[style] = nm
                except Exception:
                    pass
            if style:
                if style in seen_styles:
                    # Different catalog section may link the same style; keep the first.
                    seen_hrefs.add(href)
                    continue
                seen_styles.add(style)

            seen_hrefs.add(href)
            links.append(href)

    return links, preferred_names


def extract_style_from_url(link: str) -> str:
    """Return the numeric style code embedded in a product URL, or ''.

    Product URLs look like .../safety-boot/<style>-slug or .../safety-boot/<style>.
    A single digit match covers both shapes (the original second pattern already
    subsumed the first, which additionally required a trailing '-' or '/'), and
    a None/empty link now yields '' instead of raising TypeError.
    """
    m = re.search(r"/safety-boot/(\d+)", link or "")
    return m.group(1) if m else ""

# -----------------------------
# Checkpoint auto-repair helpers
# -----------------------------
def _looks_like_junk(s: str) -> bool:
    s = (s or "").strip().lower()
    if not s:
        return True
    # Common junk we saw when overlays/ads got captured as text
    junk_tokens = ("window.open", "javascript:", "facebook.com", "http://", "https://")
    if any(t in s for t in junk_tokens):
        return True
    # Overlong "brands" are almost always bad (ads/cookie banners)
    if len(s) > 60:
        return True
    return False


def _is_bad_row(row: List[str]) -> bool:
    try:
        style = (row[0] or "").strip()
        name = (row[1] or "").strip()
    except Exception:
        return True

    if not style:
        return True

    if name.upper() == BAD_NAME_SENTINEL:
        return True

    return False



def repair_missing_media(ck: Dict) -> int:
    """Re-queue links whose checkpoint rows lack URL/Image data.

    A row needs repair when it is shorter than the expected schema or when
    either its URL or Image cell is blank.  Guarded by ck['media_repair_done']
    so it runs at most once per checkpoint (unless that flag is cleared).

    Returns:
        Number of links removed from done_links (i.e. re-queued).
    """
    if ck.get("media_repair_done"):
        return 0

    rows = ck.get("rows", []) or []
    done_links = set(ck.get("done_links", []) or [])

    # Collect the styles whose rows are incomplete.
    needs_fix: Set[str] = set()
    for row in rows:
        if not isinstance(row, list) or not row:
            continue
        style = (row[0] or "").strip()
        if not style:
            continue
        if len(row) < EXPECTED_COLS:
            needs_fix.add(style)
            continue
        url_cell = (row[2] or "").strip() if len(row) > 2 else ""
        img_cell = (row[3] or "").strip() if len(row) > 3 else ""
        if not (url_cell and img_cell):
            needs_fix.add(style)

    if not needs_fix:
        return 0

    requeued = 0
    for href in list(done_links):
        style = extract_style_from_url(href)
        if style and style in needs_fix:
            done_links.discard(href)
            requeued += 1

    ck["done_links"] = sorted(done_links)
    return requeued

def repair_bad_checkpoint_rows(ck: Dict) -> int:
    """Un-mark 'done' links whose saved rows are obviously corrupted.

    Earlier runs sometimes captured site headers/ads into the Name/Brand
    columns; because resume logic skips already-done links, those rows would
    stay broken forever.  Re-queuing the affected styles lets them be
    re-scraped with a fresh retry budget.

    Returns:
        Number of links removed from done_links (0 on any error -- this
        repair is strictly best-effort).
    """
    try:
        rows = list(ck.get("rows", []) or [])
        done_links = set(ck.get("done_links", []) or [])
        if not rows or not done_links:
            return 0

        bad_styles = {row[0] for row in rows if row and _is_bad_row(row)}
        if not bad_styles:
            return 0

        requeued = 0
        for link in list(done_links):
            style = extract_style_from_url(link)
            if style and style in bad_styles:
                done_links.remove(link)
                requeued += 1

        if requeued:
            ck["done_links"] = sorted(done_links)

            # Give the re-queued links their full retry budget again.
            fail_counts = dict(ck.get("fail_counts", {}) or {})
            for key in list(fail_counts):
                style = extract_style_from_url(key)
                if style and style in bad_styles:
                    fail_counts.pop(key, None)
            ck["fail_counts"] = fail_counts

            # Keep the salvage pass from also retrying styles we just re-queued.
            ck["hard_failed_links"] = [
                u for u in (ck.get("hard_failed_links", []) or [])
                if extract_style_from_url(u) not in bad_styles
            ]

            save_checkpoint(ck)

        return requeued
    except Exception:
        # Any malformed checkpoint data simply means "nothing repaired".
        return 0




def parse_field_line(text_block: str, field_name: str) -> str:
    """Parse a simple field/value from extracted page text.

    Red Wing pages often render fields in tables so the extracted text looks like:

        Name
        DynaForce®

    Supports both 'Name: DynaForce' on one line and 'Name' alone with the
    value on one of the next few non-empty lines.

    Bug fix: the field name must now be followed by end-of-line, ':' or
    whitespace.  Previously a bare prefix match meant field 'Name' also
    matched lines like 'Nameplate: Foo' and returned a garbage tail.
    """
    lines = [ln.strip() for ln in (text_block or "").splitlines()]
    for i, line in enumerate(lines):
        if not line or not line.startswith(field_name):
            continue
        rest = line[len(field_name):]
        # Reject prefix collisions (e.g. field 'Name' vs line 'Nameplate ...').
        if rest and rest[0] not in " :\t":
            continue
        tail = rest.strip(" :\t")
        if tail:
            return tail
        # Field on its own line — take the next non-empty line (within 5 lines).
        for j in range(i + 1, min(i + 6, len(lines))):
            nxt = (lines[j] or "").strip()
            if nxt:
                return nxt
        return ""
    return ""


def _brand_title(raw: str) -> str:
    """Title-case a brand string, leaving fully upper-case words (acronyms) intact."""
    return " ".join(w if w.isupper() else w.capitalize() for w in raw.split())


def extract_brand(header_text: str, body_text: str) -> str:
    """Pull the brand name that precedes 'style #<num>' in the header, falling
    back to the first body line containing that marker.

    Returns '' when no 'style #' marker with a leading brand is found.
    (Improvement: the duplicated word-casing loop is factored into
    ``_brand_title``, and a None ``body_text`` no longer raises.)
    """
    ht = (header_text or "").strip()
    m = re.search(r"^(.*?)\s+style\s*#\s*\d+", ht, flags=re.IGNORECASE)
    if m:
        raw = m.group(1).strip()
        if raw:
            return _brand_title(raw)

    # Fall back to the first body line carrying a 'style #' marker; only that
    # line is considered (hence the break).
    for line in (body_text or "").splitlines():
        if re.search(r"\bstyle\s*#\s*\d+\b", line, flags=re.IGNORECASE):
            m2 = re.search(r"^(.*?)\s+style\s*#\s*\d+\b", line.strip(), flags=re.IGNORECASE)
            if m2:
                raw = m2.group(1).strip()
                if raw:
                    return _brand_title(raw)
            break
    return ""


def extract_about_name(body_text: str, style: str) -> str:
    """Find the product 'Name' field, preferring the 'ABOUT THE <style>' section."""
    if style:
        pos = body_text.upper().find(f"ABOUT THE {style}")
        if pos != -1:
            # Restrict the search to the section chunk so an unrelated
            # 'Name' field elsewhere on the page doesn't win.
            section = body_text[pos: pos + 1600]
            found = parse_field_line(section, "Name")
            if found:
                return found.strip()
    found = parse_field_line(body_text, "Name")
    return found.strip() if found else ""


def parse_heights(text: str) -> Dict[str, bool]:
    """Map each height 5\"..12\" to whether 'N-INCH' or 'N INCH' occurs in text."""
    upper = text.upper()
    hits: Set[int] = set()
    for pattern in (r"\b(\d{1,2})\s*-\s*INCH\b", r"\b(\d{1,2})\s+INCH\b"):
        hits.update(int(n) for n in re.findall(pattern, upper))
    return {f'{h}"': h in hits for h in range(5, 13)}


def classify_brand_family(brand_str: str) -> Tuple[bool, bool, bool]:
    """Return (is_red_wing, is_irish_setter, is_worx) flags for a brand string."""
    brand = (brand_str or "").lower()
    is_red_wing = brand.startswith("red wing")
    is_irish_setter = brand.startswith("irish setter")
    # 'Worx ...' counts, and so do strings like 'XYZ Worx by Red Wing'.
    is_worx = brand.startswith("worx") or ("worx" in brand and "by red wing" in brand)
    return is_red_wing, is_irish_setter, is_worx




# -----------------------------
# Image capture / saving helpers
# -----------------------------
def _safe_filename_from_name(name: str, max_len: int = 120) -> str:
    s = (name or "").strip()
    s = re.sub(r"\s+", " ", s)
    s = s.replace("®", "")
    s = re.sub(r"[^\w\-\.\s]+", "", s, flags=re.UNICODE)
    s = s.strip().replace(" ", "_")
    s = re.sub(r"_+", "_", s)
    if not s:
        s = "boot"
    if len(s) > max_len:
        s = s[:max_len].rstrip("_")
    return s


def _images_dir() -> Path:
    """Return the ../Images directory (sibling of the CWD), creating it if needed."""
    images = (Path.cwd().parent / "Images").resolve()
    images.mkdir(parents=True, exist_ok=True)
    return images


def _rel_image_path(fname: str) -> str:
    return str(Path("..") / "Images" / fname)


def _pick_best_product_img_element(driver):
    """Heuristically pick the main product <img> element on the current page.

    Tries the known '#productImage' container first; otherwise scores every
    visible image by on-screen area with multiplicative boosts for product-y
    alt/class/src hints, and returns the highest-scoring element.
    Returns None when no plausible candidate is found.
    """
    # First, try the known product image container
    try:
        container = driver.find_element(By.ID, "productImage")
        img = container.find_element(By.CSS_SELECTOR, "img")
        if img and img.is_displayed():
            return img
    except Exception:
        pass

    # Prefer images in main content; pick the largest visible <img>
    try:
        imgs = driver.find_elements(By.CSS_SELECTOR, "main img")
    except Exception:
        imgs = []
    if not imgs:
        # No <main> images found — fall back to every <img> on the page.
        try:
            imgs = driver.find_elements(By.TAG_NAME, "img")
        except Exception:
            imgs = []

    best = None
    best_score = 0.0
    for el in imgs:
        try:
            if not el.is_displayed():
                continue
            sz = el.size or {}
            w = float(sz.get("width") or 0)
            h = float(sz.get("height") or 0)
            # Skip thumbnails/icons.
            if w < 120 or h < 120:
                continue
            alt = (el.get_attribute("alt") or "").lower()
            cls = (el.get_attribute("class") or "").lower()
            src = (el.get_attribute("src") or "").lower()
            # Base score: visible area; boosted when attributes hint at a product shot.
            score = w * h
            if any(k in alt for k in ("boot", "shoe", "chukka", "hiker", "moc")):
                score *= 1.2
            if any(k in cls for k in ("product", "primary", "hero", "image")):
                score *= 1.15
            if any(k in src for k in ("/dw/image", "scene7", "/images", "static")):
                score *= 1.1
            if score > best_score:
                best_score = score
                best = el
        except Exception:
            # Stale/detached elements are simply skipped.
            continue
    return best



def _extract_product_image_url(driver) -> str:
    """Return the product image URL from the current page, or ''.

    Prefers the 'zoomimg' attribute over 'src' at every step — presumably
    zoomimg points at the higher-resolution asset (TODO confirm against the
    site markup).
    """
    # 1) Container by id
    try:
        container = driver.find_element(By.ID, "productImage")
        try:
            img = container.find_element(By.CSS_SELECTOR, "img")
        except Exception:
            img = None
        if img:
            z = (img.get_attribute("zoomimg") or "").strip()
            s = (img.get_attribute("src") or "").strip()
            if z or s:
                return z or s
    except Exception:
        pass

    # 2) Orbit slider / active slide — selectors ordered most- to least-specific;
    # the first one that yields any URL wins.
    for sel in [
        'li[data-orbit-slide="product-1"] img',
        'li.active img',
        'ul#productImage li.active img',
        'main li.active img',
        'main img',
    ]:
        try:
            img = driver.find_element(By.CSS_SELECTOR, sel)
            z = (img.get_attribute("zoomimg") or "").strip()
            s = (img.get_attribute("src") or "").strip()
            if z or s:
                return z or s
        except Exception:
            continue

    return ""

def capture_product_image_temp(driver, style: str) -> str:
    """Save the main product image as a temp file (__<style>.png).

    Preferred: download the product image URL (zoomimg/src) for best quality.
    Fallback: element screenshot if download fails.
    Returns relative path like ../Images/__595.png (or empty string).
    """
    style = (style or "").strip()
    if not style:
        return ""

    # Double-underscore prefix marks the file as temporary until
    # finalize_image_filename() renames it.
    fname = f"__{style}.png"
    out_path = _images_dir() / fname

    # URL download first (avoids overlays, best resolution)
    try:
        img_url = _extract_product_image_url(driver)
        if img_url:
            # Un-escape ampersands that survive from the HTML attribute.
            img_url = img_url.replace("&amp;", "&")
            headers = {"User-Agent": "Mozilla/5.0"}
            r = requests.get(img_url, headers=headers, timeout=30)
            if r.ok and r.content:
                out_path.write_bytes(r.content)
                return _rel_image_path(fname)
    except Exception:
        pass

    # Fallback: screenshot the best image element
    img_el = None
    try:
        img_el = _pick_best_product_img_element(driver)
    except Exception:
        img_el = None

    if not img_el:
        return ""

    # Center the element in the viewport and give layout a moment to settle.
    try:
        driver.execute_script("arguments[0].scrollIntoView({block:'center', inline:'center'});", img_el)
        time.sleep(0.2)
    except Exception:
        pass

    # Close overlays so they don't end up in the screenshot.
    try:
        dismiss_popups(driver)
    except Exception:
        pass

    try:
        img_el.screenshot(str(out_path))
        return _rel_image_path(fname)
    except Exception:
        # Last resort: full-window screenshot (may include page chrome).
        try:
            driver.save_screenshot(str(out_path))
            return _rel_image_path(fname)
        except Exception:
            return ""


def finalize_image_filename(style: str, boot_name: str, rel_temp_path: str) -> str:
    """Rename ../Images/__<style>.png -> ../Images/<style>.png.

    ``boot_name`` is currently unused but kept for interface compatibility.
    Returns the (possibly unchanged) relative path; the original path is
    returned whenever the rename cannot be performed.
    """
    if not rel_temp_path:
        return ""

    temp_file = (Path.cwd() / rel_temp_path).resolve()
    if not temp_file.exists():
        return rel_temp_path

    # Only keep characters safe for a filename stem.
    safe_style = re.sub(r"[^\dA-Za-z_-]", "", (style or "").strip())
    if not safe_style:
        return rel_temp_path

    final_name = f"{safe_style}.png"
    destination = _images_dir() / final_name
    try:
        temp_file.replace(destination)
    except Exception:
        return rel_temp_path
    return _rel_image_path(final_name)

def infer_gender(header: str, source_url: str) -> Tuple[bool, bool]:
    """Infer (male, female) flags from the URL and header only.

    Body text is deliberately ignored because it frequently mentions both
    genders.  Priority: URL slug/query first (most reliable), then the header
    text; otherwise unknown -> (False, False).
    """
    url = (source_url or "").lower()

    # Women checked first: a 'womens-' URL would otherwise also match 'mens-'.
    women_in_url = (
        re.search(r"/womens(?:[-/]|$)", url) is not None
        or "womens-" in url
        or "gender=women" in url
        or "gender=female" in url
    )
    if women_in_url:
        return (False, True)

    men_in_url = (
        re.search(r"/mens(?:[-/]|$)", url) is not None
        or "mens-" in url
        or "gender=men" in url
        or "gender=male" in url
    )
    if men_in_url:
        return (True, False)

    head = (header or "").upper()
    return (
        bool(re.search(r"\bMEN'?S\b", head)),
        bool(re.search(r"\bWOMEN'?S\b", head)),
    )

def _build_row_from_text(
    style: str,
    name: str,
    header: str,
    body_text: str,
    source_url: str = "",
    image_rel_path: str = "",
) -> List[str]:
    """Derive the full output row (identity strings + 1/0 feature flags) from page text.

    ``header`` is the style/name header text; ``body_text`` is the whole page
    text.  Every boolean feature is a keyword heuristic over that text.  The
    returned list's column order must stay in sync with MD_HEADERS.
    """
    body_text = body_text or ""
    header = header or ""

    # Pre-computed case variants used by the keyword checks below.
    body_upper = body_text.upper()
    body_lower = body_text.lower()

    brand_str = extract_brand(header, body_text)

    # Guard against junk captured from overlays/ads (seen as "Window.open(...)" etc.)
    if _looks_like_junk(brand_str):
        brand_str = ""
    is_rw, is_is, is_worx = classify_brand_family(brand_str)

    if not brand_str:
        # Fall back to keyword sniffing.  Order matters: "Irish Setter" and
        # "Worx" pages also mention "Red Wing", so Red Wing is checked last.
        hu = header.upper()
        bu = body_text.upper()
        if "IRISH SETTER" in hu or "IRISH SETTER" in bu:
            brand_str = "Irish Setter"
        elif "WORX" in hu or "WORX" in bu:
            brand_str = "Worx"
        elif "RED WING" in hu or "RED WING" in bu:
            brand_str = "Red Wing"
        is_rw, is_is, is_worx = classify_brand_family(brand_str)

    header_upper = header.upper()  # NOTE(review): currently unused below
    male, female = infer_gender(header, source_url)

    # IMPORTANT: Name must come ONLY from extract_style_and_name_from_html().
    # No fallbacks, no alternative heuristics.
    # Toe type flags; "soft toe" is simply the absence of any safety toe.
    steel_toe = "STEEL TOE" in body_upper
    aluminum_toe = "ALUMINUM TOE" in body_upper or "ALLOY TOE" in body_upper
    non_metal_toe = ("NON-METALLIC TOE" in body_upper) or ("NON METALLIC TOE" in body_upper) or ("COMPOSITE TOE" in body_upper)
    safety_toe = ("SAFETY TOE" in body_upper) or steel_toe or aluminum_toe or non_metal_toe
    soft_toe = not safety_toe

    met_guard = ("METATARSAL GUARD" in body_upper) or ("MET GUARD" in body_upper)
    waterproof = "WATERPROOF" in body_upper

    # Insulation: prefer the explicit "Insulation" spec field.  A gram weight
    # ("400g") or the word itself counts; "non-insulated"/"uninsulated" doesn't.
    insulation_line = parse_field_line(body_text, "Insulation")
    if insulation_line:
        il = insulation_line.strip().lower()
        if re.search(r"\b(non[-\s]?insulated|uninsulated)\b", il):
            insulated = False
        else:
            m = re.search(r"(\d+)\s*g\b", il)
            if m:
                try:
                    insulated = int(m.group(1)) > 0
                except Exception:
                    insulated = True
            else:
                insulated = ("insulat" in il) or ("thinsulate" in il)
    else:
        # No spec field: scan the whole body with the same rules.
        if re.search(r"\b(non[-\s]?insulated|uninsulated)\b", body_lower):
            insulated = False
        else:
            insulated = ("insulat" in body_lower) or ("thinsulate" in body_lower)

    # The SR/EH/SD abbreviations are matched case-sensitively against the raw
    # text so lower-case words can't produce false hits.
    slip_resistant = ("SLIP RESISTANT" in body_upper) or (re.search(r"\bSR\b", body_text) is not None)
    electrical_hazard = ("ELECTRICAL HAZARD" in body_upper) or (re.search(r"\bEH\b", body_text) is not None)
    puncture_resistant = ("PUNCTURE RESISTANT" in body_upper) or ("PUNCTURE" in body_upper)
    static_dissipative = ("STATIC DISSIPATIVE" in body_upper) or (re.search(r"\bSD\b", body_text) is not None)
    ankle_protection = ("ANKLE PROTECTION" in body_upper) or ("ankle" in body_lower and "protect" in body_lower)
    boa = ("BOA" in body_upper)

    defined_heel_line = parse_field_line(body_text, "Defined Heel")
    defined_heel = defined_heel_line.lower().startswith("yes") if defined_heel_line else ("DEFINED HEEL" in body_upper)

    # "All leather upper" = a Leather Type field listing no synthetic materials.
    leather_line = parse_field_line(body_text, "Leather Type")
    all_leather_upper = False
    if leather_line:
        ll = leather_line.lower()
        if not any(x in ll for x in ["mesh", "nylon", "fabric", "textile", "poly", "synthetic"]):
            all_leather_upper = True

    # "Resolvable" is presumably a site typo for "Resoleable" — both are checked.
    resoleable_line = parse_field_line(body_text, "Resoleable") or parse_field_line(body_text, "Resolvable")
    resoleable = False
    if resoleable_line:
        resoleable = resoleable_line.strip().lower().startswith("yes")
    elif re.search(r"\bresoleable\b", body_lower) or re.search(r"\bresolvable\b", body_lower):
        # NOTE(review): very loose — "yes"/"true" anywhere in the body sets the
        # flag once the word appears at all; verify against real pages.
        resoleable = "yes" in body_lower or "true" in body_lower

    # Silhouette flags come from the product name only.
    name_lower = (name or "").lower()
    oxford_athletic = ("oxford" in name_lower) or ("athletic" in name_lower) or ("shoe" in name_lower)
    chukka = "chukka" in name_lower
    hiker = "hiker" in name_lower

    heights = parse_heights(header + "\n" + body_text)

    origin_line = parse_field_line(body_text, "Country of Origin")
    origin_lower = origin_line.lower()
    made_in_usa = ("made in usa" in origin_lower) or ("made in the usa" in origin_lower)
    built_in_usa = ("built in usa" in origin_lower) or ("assembled in the usa" in origin_lower) or made_in_usa

    # Column order below must match MD_HEADERS exactly.
    return [
        md_escape_cell(style),
        md_escape_cell(name),
        md_escape_cell(source_url),
        md_escape_cell(image_rel_path),
        md_escape_cell(brand_str),

        b01(male),
        b01(female),

        b01(is_rw),
        b01(is_is),
        b01(is_worx),

        b01(safety_toe),
        b01(steel_toe),
        b01(non_metal_toe),
        b01(aluminum_toe),
        b01(met_guard),
        b01(soft_toe),

        b01(waterproof),
        b01(insulated),
        b01(slip_resistant),
        b01(electrical_hazard),
        b01(puncture_resistant),
        b01(static_dissipative),
        b01(ankle_protection),
        b01(boa),

        b01(defined_heel),
        b01(all_leather_upper),
        b01(resoleable),

        b01(oxford_athletic),
        b01(chukka),
        b01(hiker),

        b01(heights['5"']),
        b01(heights['6"']),
        b01(heights['7"']),
        b01(heights['8"']),
        b01(heights['9"']),
        b01(heights['10"']),
        b01(heights['11"']),
        b01(heights['12"']),

        b01(built_in_usa),
        b01(made_in_usa),
    ]


def _scrape_product_via_http(link: str, image_temp_rel: str = "") -> List[str]:
    """Scrape one product row without Selenium, trying each URL variant in turn.

    Raises the last exception when every variant fails, or a RuntimeError if
    there were no variants at all.
    """
    failure: Optional[BaseException] = None
    for variant in _url_variants(link):
        try:
            page_html = _fetch_html(variant, timeout=30)
            style, name, style_text = extract_style_and_name_from_html(page_html, variant)
            body = _html_to_text(page_html)
            return _build_row_from_text(
                style, name, style_text, body,
                source_url=link, image_rel_path=image_temp_rel,
            )
        except Exception as exc:
            failure = exc

    if failure:
        raise failure
    raise RuntimeError("HTTP fallback failed")


def scrape_product(driver, link: str, *, prefer_http: bool = False) -> List[str]:
    """Scrape a single product page into one output row.

    When ``prefer_http`` is True (used by the salvage pass) the plain-HTTP
    path is attempted first; otherwise Selenium is tried first with HTTP as
    the fallback.  Propagates whatever the HTTP fallback raises when both fail.
    """
    image_temp_rel = ""  # always defined (used for HTTP fallback and row build)
    style = extract_style_from_url(link)
    style_from_url = style  # NOTE(review): kept but never read afterwards

    _polite_delay()

    if prefer_http:
        try:
            return _scrape_product_via_http(link, image_temp_rel=image_temp_rel)
        except Exception:
            # Fall through to the Selenium path below.
            pass

    # Selenium first
    try:
        safe_get(driver, link)
        dismiss_popups(driver)

        # IMPORTANT: Style/Name must come ONLY from this HTML parsing logic.
        html = driver.page_source or ""
        style, name, style_text = extract_style_and_name_from_html(html, link)

        # Capture product image early (temp file: __<style>.png)
        try:
            if style:
                image_temp_rel = capture_product_image_temp(driver, style)
        except Exception:
            image_temp_rel = ""
        # Ensure the body element exists before reading its text.
        WebDriverWait(driver, 25).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        body_text = driver.find_element(By.TAG_NAME, "body").text or ""
        header = style_text

        row = _build_row_from_text(style, name, header, body_text, source_url=link, image_rel_path=image_temp_rel)
        return row

    except Exception:
        # Fallback to HTTP text extraction (much more reliable for the pages Selenium hangs on)
        return _scrape_product_via_http(link, image_temp_rel=image_temp_rel)
def write_markdown(rows: List[List[str]]) -> None:
    """Render all rows as a Markdown table (sorted by style) and write atomically."""
    headers = MD_HEADERS
    ncols = len(headers)

    def sort_key(row: List[str]):
        # Numeric styles sort first (by value); non-numeric fall back to text.
        try:
            return (0, int(re.sub(r"\D", "", row[0])))
        except Exception:
            return (1, row[0])

    lines = [
        "|" + "|".join(headers) + "|",
        "|" + "|".join(["---"] * ncols) + "|",
    ]
    for row in sorted(rows, key=sort_key):
        # Normalize row width in case the checkpoint holds an older/newer schema.
        cells = (row + [""] * max(0, ncols - len(row)))[:ncols]
        lines.append("|" + "|".join(cells) + "|")

    atomic_write(OUT_MD, "\n".join(lines) + "\n")


def _read_errors_urls() -> List[str]:
    """Parse unique failed URLs from the errors file, preserving first-seen order.

    Lines look like: "FAILED 3x: <url>".
    """
    if not ERRORS_TXT.exists():
        return []

    urls: List[str] = []
    seen: Set[str] = set()
    content = ERRORS_TXT.read_text(encoding="utf-8", errors="replace")
    for raw_line in content.splitlines():
        m = re.search(r"https?://\S+", raw_line.strip())
        if m and m.group(0) not in seen:
            seen.add(m.group(0))
            urls.append(m.group(0))
    return urls


def _rewrite_errors_file(urls: List[str]) -> None:
    """Rewrite the errors file with the remaining URLs; delete it when empty."""
    if not urls:
        try:
            ERRORS_TXT.unlink(missing_ok=True)
        except Exception:
            pass
        return
    body = "\n".join(f"FAILED {MAX_FAILS_PER_LINK}x: {u}" for u in urls) + "\n"
    atomic_write(ERRORS_TXT, body)


def main() -> None:
    """Top-level driver: resume from checkpoint, discover links, scrape each
    product, write Markdown after every row, then optionally salvage hard
    failures.

    All resume state lives in the JSON checkpoint dict ``ck`` (done links,
    per-link fail counts, scraped rows, hard-failed links, product link list);
    checkpoint and output are re-saved after every successful row so the run
    can be interrupted at any point.
    """
    ck = load_checkpoint()

    # Mutable working copies of the checkpoint's resume state.
    done_links: Set[str] = set(ck.get("done_links", []))
    fail_counts: Dict[str, int] = dict(ck.get("fail_counts", {}))
    rows: List[List[str]] = list(ck.get("rows", []))
    started_at = float(ck.get("started_at", time.time()))

    hard_failed_links: List[str] = list(ck.get("hard_failed_links", []))
    if not hard_failed_links:
        # Older checkpoints didn't store this list; rebuild it from the errors file.
        hard_failed_links = _read_errors_urls()


    # If older checkpoint rows captured overlays/ads, they may be "stuck" because resume logic
    # skips already-done links. Auto-repair by re-queuing those styles.
    if AUTO_REPAIR_BAD_ROWS:
        repaired = repair_bad_checkpoint_rows(ck)
        if repaired:
            # The repair mutated and saved ck; reload our working copies from it.
            done_links = set(ck.get("done_links", []))
            fail_counts = dict(ck.get("fail_counts", {}))
            rows = list(ck.get("rows", []))
            hard_failed_links = list(ck.get("hard_failed_links", []))


    reporter: Optional[ProgressReporter] = None
    driver = None

    try:
        driver = create_driver(ck, headless=True, page_load_strategy="eager", block_images=True)

        product_links = ck.get("product_links") or []
        preferred_names: Dict[str, str] = ck.get("preferred_names") or {}

        if REFRESH_PRODUCT_LINKS_EACH_RUN or not product_links:
            # Temporary single-step reporter while we (re)discover the link list.
            reporter = ProgressReporter(total=1, started_at=started_at)
            reporter.update(0, note="Refreshing product links...")
            fresh_links, fresh_preferred_names = collect_product_links(driver)
            if fresh_links:
                product_links = fresh_links
            else:
                sys.stderr.write(
                    "[warn] Link refresh discovered 0 links; falling back to checkpoint link list.\n"
                )
            if fresh_preferred_names:
                preferred_names = fresh_preferred_names

            # Apply preferred International listing names to any already-scraped rows (resume-safe)
            try:
                if PREFER_INTERNATIONAL_LISTING_NAMES and preferred_names:
                    nupd = apply_preferred_names(rows, preferred_names)
                    if nupd:
                        ck["rows"] = rows
                        ck["preferred_names"] = preferred_names
                        save_checkpoint(ck)
                        print(f"Applied preferred International names to {nupd} existing rows")
            except Exception:
                pass

            ck["product_links"] = product_links
            if PREFER_INTERNATIONAL_LISTING_NAMES:
                ck["preferred_names"] = preferred_names
            save_checkpoint(ck)

        # If resuming and we prefer International listing names but the mapping is missing, collect it once.
        if PREFER_INTERNATIONAL_LISTING_NAMES and not preferred_names:
            try:
                _, preferred_names = collect_product_links(driver)
                ck["preferred_names"] = preferred_names
                save_checkpoint(ck)
            except Exception:
                preferred_names = preferred_names or {}

            # Apply preferred names to any already-scraped rows (resume-safe)
            try:
                if preferred_names:
                    nupd = apply_preferred_names(rows, preferred_names)
                    if nupd:
                        ck["rows"] = rows
                        ck["preferred_names"] = preferred_names
                        save_checkpoint(ck)
                        print(f"Applied preferred International names to {nupd} existing rows")
            except Exception:
                pass

        if AUTO_REPAIR_MISSING_MEDIA:
            rm = repair_missing_media(ck)
            # Mark done so we don't requeue forever
            if not ck.get("media_repair_done"):
                ck["media_repair_done"] = True
                save_checkpoint(ck)
            if rm:
                print(f"Re-queued {rm} links due to missing URL/Image")
                save_checkpoint(ck)

        # Optional single-style test mode
        if ONLY_STYLE.strip():
            want = ONLY_STYLE.strip()
            product_links = [u for u in (product_links or []) if extract_style_from_url(u) == want]
            ck["product_links"] = product_links
            save_checkpoint(ck)

        total = len(product_links)
        if reporter:
            reporter.close()
        reporter = ProgressReporter(total=total, started_at=started_at)

        processed = len(done_links)
        reporter.update(processed, note="starting")

        # map existing rows by style for dedupe/overwrite
        style_to_idx = {r[0]: i for i, r in enumerate(rows) if r and r[0]}

        for link in product_links:
            if link in done_links:
                processed += 1
                reporter.update(processed, note="(resumed)")
                continue

            ok = False
            # Retry loop: breaks on success or when the link hard-fails.
            while True:
                try:
                    row = scrape_product(driver, link)

                    # A row without style or name is treated as a failed scrape.
                    if not row[0]:
                        raise RuntimeError(f"Style number parsed empty for link: {link}")
                    if not row[1]:
                        raise RuntimeError(f"Name parsed empty for style {row[0]} ({link})")

                    # Prefer International listing display name when available
                    if PREFER_INTERNATIONAL_LISTING_NAMES and preferred_names:
                        pref = _clean_listing_name(preferred_names.get(row[0], ""))
                        if pref:
                            row[1] = md_escape_cell(pref)

                    # Ensure URL column is populated
                    if len(row) > 2 and not row[2]:
                        row[2] = md_escape_cell(link)

                    # Finalize image filename based on final Name
                    try:
                        if len(row) > 3 and row[3]:
                            row[3] = md_escape_cell(finalize_image_filename(row[0], row[1], row[3]))
                    except Exception:
                        pass

                    # dedupe by style
                    if row[0] in style_to_idx:
                        rows[style_to_idx[row[0]]] = row
                    else:
                        style_to_idx[row[0]] = len(rows)
                        rows.append(row)

                    done_links.add(link)

                    # Persist after every successful row so interrupts lose nothing.
                    ck["done_links"] = sorted(done_links)
                    ck["rows"] = rows
                    ck["fail_counts"] = fail_counts
                    ck["hard_failed_links"] = hard_failed_links
                    save_checkpoint(ck)

                    write_markdown(rows)

                    ok = True
                    break

                except Exception as e:
                    fail_counts[link] = int(fail_counts.get(link, 0)) + 1

                    ck["done_links"] = sorted(done_links)
                    ck["rows"] = rows
                    ck["fail_counts"] = fail_counts
                    ck["hard_failed_links"] = hard_failed_links
                    save_checkpoint(ck)

                    if fail_counts[link] >= MAX_FAILS_PER_LINK:
                        # Give up on this link: record it and mark it "done" so
                        # resume won't retry it (the salvage pass still may).
                        if link not in hard_failed_links:
                            hard_failed_links.append(link)
                            write_errors_line(f"FAILED {fail_counts[link]}x: {link}")
                        done_links.add(link)
                        ck["done_links"] = sorted(done_links)
                        ck["hard_failed_links"] = hard_failed_links
                        save_checkpoint(ck)
                        break

                    sys.stderr.write(
                        f"\nError scraping link (attempt {fail_counts[link]}/{MAX_FAILS_PER_LINK}): {link}\n"
                    )
                    sys.stderr.write("".join(traceback.format_exception(type(e), e, e.__traceback__)) + "\n")

                    # NOTE(review): the driver is recreated on *every* retry,
                    # not only when it's actually wedged — simple but heavy-handed.
                    try:
                        if driver:
                            driver.quit()
                    except Exception:
                        pass
                    driver = create_driver(ck, headless=True, page_load_strategy="eager", block_images=True)

            processed += 1
            reporter.update(processed, note=("ok" if ok else "skipped"))

        # Main pass done
        write_markdown(rows)
        reporter.update(total, note="done")

        # Salvage pass: retry hard failures once more (HTTP-first) with a conservative driver.
        if ENABLE_SALVAGE_PASS and hard_failed_links:
            sys.stderr.write(f"\nStarting salvage pass for {len(hard_failed_links)} failed links...\n")

            remaining: List[str] = []
            try:
                if driver:
                    try:
                        driver.quit()
                    except Exception:
                        pass
                # Conservative driver: full page loads, images enabled, long timeout.
                driver = create_driver(
                    ck,
                    headless=True,
                    page_load_strategy="normal",
                    block_images=False,
                    page_load_timeout=SALVAGE_PAGE_LOAD_TIMEOUT,
                )

                for i, link in enumerate(list(hard_failed_links), start=1):
                    salvaged = False
                    for _ in range(SALVAGE_MAX_TRIES_PER_LINK):
                        try:
                            row = scrape_product(driver, link, prefer_http=True)
                            if row and row[0] and row[1]:
                                if row[0] in style_to_idx:
                                    rows[style_to_idx[row[0]]] = row
                                else:
                                    style_to_idx[row[0]] = len(rows)
                                    rows.append(row)
                                salvaged = True
                                break
                        except Exception:
                            # Reset to a blank page between attempts.
                            try:
                                driver.get("about:blank")
                            except Exception:
                                pass
                            time.sleep(0.5)

                    if salvaged:
                        sys.stderr.write(f"Salvaged {i}/{len(hard_failed_links)}: {link}\n")
                    else:
                        remaining.append(link)

                hard_failed_links = remaining

            except Exception as e:
                sys.stderr.write("\nSalvage pass encountered an error:\n")
                sys.stderr.write("".join(traceback.format_exception(type(e), e, e.__traceback__)) + "\n")

            # Persist whatever the salvage pass managed to recover.
            ck["rows"] = rows
            ck["hard_failed_links"] = hard_failed_links
            save_checkpoint(ck)
            write_markdown(rows)
            _rewrite_errors_file(hard_failed_links)

        sys.stderr.write(f"\nDONE. Wrote: {OUT_MD}\n")
        if ERRORS_TXT.exists():
            sys.stderr.write(f"Some links failed repeatedly; see: {ERRORS_TXT}\n")

    finally:
        # Best-effort cleanup of the progress display and the browser.
        try:
            if reporter:
                reporter.close()
        except Exception:
            pass
        try:
            if driver:
                driver.quit()
        except Exception:
            pass

if __name__ == "__main__":
    # Script entry point (no CLI args; configuration is via module constants).
    main()
