#!/usr/bin/env python3
"""
RW_Site_Scraper-Orders_Page.py
==============================

Scrapes Red Wing footwear catalog:
- https://order.redwingshoes.com/footwear-rwbr

Test mode:
- Set ONLY_STYLE at the top of this file to a style number (e.g. "400")
  to scrape just that single product.

Outputs (next to this script):
- RW_Orders_Site_Scrape.md
- RW_Site_Scraper_v2_checkpoint.json   (resume state)
- RW_Site_Scraper_v2_errors.txt        (links that failed repeatedly)

Stability features:
- `safe_get()` uses short timeouts + window.stop() so Selenium doesn't hang forever
- HTTP fallback (no Selenium) for the few product pages that still time out
- Optional salvage pass at the end to retry hard failures (HTTP-first)

Cross-platform (Windows + Linux Cinnamon):
- Headless Firefox (default)
- Geckodriver resolution "like Parts_Auto" (explicit Service path; no Selenium Manager):
    1) GECKODRIVER_PATH env var (file or directory)
    2) geckodriver(.exe) on PATH
    3) auto-download geckodriver (GitHub releases) into a user cache dir
    4) (optional) if double-click/no terminal and Tk is available, prompt to pick geckodriver

Feature columns are 1/0 (not Yes/No).
Includes Brand (string) + brand family flags (Red Wing / Irish Setter / Worx).

Dependencies:
- Python 3.9+
- Firefox installed
- Selenium installed:
    Linux Mint/Ubuntu: sudo apt install -y python3-selenium
    Windows: python -m pip install selenium

Notes for Linux Mint PEP 668:
- Prefer `python3-selenium` from apt (as above).
- This script does NOT require webdriver-manager.
"""

from __future__ import annotations

import json
import os
import platform
import re
import shutil
import stat
import sys
import tarfile
import time
import traceback
import random
import requests
import html as html_lib
from html.parser import HTMLParser
import zipfile
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import parse_qs, urljoin, urlparse
from urllib.request import Request, urlopen

# NOTE: Style/Name extraction is now done via BeautifulSoup using the exact
# logic requested (shoeguide/printSpacing -> h3 + strong). This intentionally
# avoids any other name/title fallbacks.
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


# -----------------------------
# Config
# -----------------------------
# Catalog pages to scan for product links.
# Catalog pages to scan for product links.
CATALOG_URLS = [
    "https://order.redwingshoes.com/footwear-rwbr",
]
# Convenience aliases for the primary catalog location.
CATALOG_URL_PRIMARY = CATALOG_URLS[0]
SITE_HOST = "order.redwingshoes.com"
FOOTWEAR_RWB_SLUG = "/footwear-rwbr"

# Login credentials (env vars take precedence)
# SECURITY NOTE(review): real credentials are hard-coded as fallbacks below.
# Rotate this password and remove the literals; rely on the env vars only.
RW_SITE_USERNAME = os.environ.get("RW_SITE_USERNAME", "rwss614@redwingshoes.com")
RW_SITE_PASSWORD = os.environ.get("RW_SITE_PASSWORD", "WelcomeBack99!")
# Default is headless. Set RW_HEADLESS=0 to run with a visible browser.
RUN_HEADLESS = os.environ.get("RW_HEADLESS", "1").strip().lower() in {"1", "true", "yes", "y", "on"}
# Re-discover catalog links on every run so newly added products are picked up.
REFRESH_PRODUCT_LINKS_EACH_RUN = os.environ.get("RW_REFRESH_LINKS", "1").strip().lower() in {"1", "true", "yes", "y", "on"}

# Optional: scrape only a single style number (set "" to scrape all discovered styles)
ONLY_STYLE: str = ""
# Directory containing this script; all outputs are written beside it.
BASE_DIR = Path(__file__).resolve().parent

OUT_MD = BASE_DIR / "RW_Orders_Site_Scrape.md"                 # final markdown table
CHECKPOINT = BASE_DIR / "RW_Site_Scraper_v2_checkpoint.json"   # resume state
ERRORS_TXT = BASE_DIR / "RW_Site_Scraper_v2_errors.txt"        # repeatedly-failing links

# Give up on a product link after this many failed scrape attempts.
MAX_FAILS_PER_LINK = 3


# Auto-repair: if old checkpoint rows have clearly-bad Name/Brand (e.g. "RED WING FOR BUSINESS"),
# automatically un-mark those links as done so they get re-scraped on the next run.
AUTO_REPAIR_BAD_ROWS = True
# Banner text that indicates the site header was scraped instead of a product name.
BAD_NAME_SENTINEL = "RED WING FOR BUSINESS"

# Prefer names as shown on the International catalog listing page.
# Disabled: listing tiles can include UI strings like "Save to list".
PREFER_INTERNATIONAL_LISTING_NAMES = False


# Column order for the output markdown table; feature columns hold 1/0 flags.
MD_HEADERS = [
    "Style #", "Name", "URL", "Image", "Brand",
    "Male", "Female",
    "Red Wing", "Irish Setter", "Worx",
    "Safety Toe", "Steel Toe", "Non-Metallic Toe", "Aluminum Toe", "Metatarsal Guard", "Soft Toe",
    "Waterproof", "Insulation", "Slip Resistant", "Electrical Hazard", "Puncture Resistant",
    "Static Dissipative", "Ankle Protection", "BOA® Lacing System",
    "Defined Heel", "All Leather Upper", "Resoleable",
    "Oxford/Athletic", "Chukka", "Hiker",
    '5"', '6"', '7"', '8"', '9"', '10"', '11"', '12"',
    "Built in USA", "Made in USA",
]

EXPECTED_COLS = len(MD_HEADERS)


# Re-scrape rows whose image/media fields came back empty.
AUTO_REPAIR_MISSING_MEDIA = True

# Navigation tuning (Red Wing pages sometimes never fully 'finish' loading)
PAGE_LOAD_TIMEOUT = 45       # seconds Selenium waits for a navigation
SCRIPT_TIMEOUT = 30          # seconds allowed for execute_script calls
NAV_SETTLE_SECONDS = 1.25    # post-navigation settle pause

# Small polite delay between product navigations to reduce transient server hiccups
REQUEST_DELAY_RANGE = (0.15, 0.55)

# Force private-session behavior so prior cookies/tracking data do not affect scraping.
PRIVATE_BROWSER_MODE = True

# After the main pass, retry hard-failed links once with a fresh driver + more conservative settings
ENABLE_SALVAGE_PASS = True
SALVAGE_PAGE_LOAD_TIMEOUT = 90
SALVAGE_MAX_TRIES_PER_LINK = 2

# Bumping this version causes existing checkpoints to be backed up and rebuilt.
CHECKPOINT_VERSION = 11
# Optional: set this environment variable to explicitly point at geckodriver
#   Linux:   export GECKODRIVER_PATH="/home/you/bin/geckodriver"
#   Windows: setx GECKODRIVER_PATH "C:\Users\you\bin\geckodriver.exe"
GECKODRIVER_PATH = os.environ.get("GECKODRIVER_PATH", "").strip() or None


# -----------------------------
# Small helpers
# -----------------------------
def b01(v: bool) -> str:
    """Render a truthy/falsy value as the markdown flag string "1"/"0"."""
    return str(int(bool(v)))


def md_escape_cell(s: str) -> str:
    """Make *s* safe for a markdown table cell: escape pipes, collapse whitespace."""
    escaped = (s or "").replace("|", r"\|")
    return re.sub(r"\s+", " ", escaped).strip()


def atomic_write(path: Path, text: str) -> None:
    """Write *text* to *path* atomically (temp file then rename into place)."""
    staging = path.with_suffix(path.suffix + ".tmp")
    staging.write_text(text, encoding="utf-8")
    # replace() is atomic on POSIX and overwrites on Windows.
    staging.replace(path)


def _fresh_checkpoint() -> Dict:
    """Return a brand-new checkpoint dict containing every known key.

    Single source of truth for the checkpoint schema so the three reset
    paths (missing file, version mismatch, corrupt JSON) cannot drift.
    """
    return {
        "version": CHECKPOINT_VERSION,
        "catalog_urls": CATALOG_URLS,
        "product_links": [],
        "done_links": [],
        "rows": [],
        "preferred_names": {},
        "media_repair_done": False,
        "fail_counts": {},
        "hard_failed_links": [],
        "started_at": time.time(),
        "geckodriver_path": None,
    }


def load_checkpoint() -> Dict:
    """Load (or rebuild) the resume-state checkpoint.

    Returns a dict guaranteed to contain every key of the current schema.

    Behavior:
    - Missing file: return a fresh checkpoint.
    - Version mismatch: back up the old file as ``.json.v<N>.bak`` and start fresh.
      (Bug fix: the old reset dict here omitted ``preferred_names``,
      ``media_repair_done`` and ``hard_failed_links``, which could KeyError later.)
    - Same version: backfill any keys missing from older same-version files.
    - Unreadable/corrupt JSON: rename to ``.json.corrupt`` and start fresh.
    """
    if not CHECKPOINT.exists():
        return _fresh_checkpoint()
    try:
        data = json.loads(CHECKPOINT.read_text(encoding="utf-8"))
        old_ver = int(data.get("version", 0))
        if old_ver != CHECKPOINT_VERSION:
            try:
                # Keep the stale checkpoint around for manual inspection.
                CHECKPOINT.replace(CHECKPOINT.with_suffix(f".json.v{old_ver}.bak"))
            except Exception:
                pass
            return _fresh_checkpoint()
        # Backfill keys that older same-version checkpoints may lack
        # (previously preferred_names/media_repair_done were not backfilled).
        for key, default in _fresh_checkpoint().items():
            data.setdefault(key, default)
        return data
    except Exception:
        try:
            # Preserve the unreadable file instead of silently deleting it.
            CHECKPOINT.replace(CHECKPOINT.with_suffix(".json.corrupt"))
        except Exception:
            pass
        return _fresh_checkpoint()


def save_checkpoint(data: Dict) -> None:
    """Persist checkpoint *data* atomically as pretty-printed, key-sorted JSON."""
    payload = json.dumps(data, indent=2, sort_keys=True)
    atomic_write(CHECKPOINT, payload)


def write_errors_line(line: str) -> None:
    """Append one line to the error log, normalizing the trailing newline."""
    with ERRORS_TXT.open("a", encoding="utf-8") as fh:
        fh.write(f"{line.rstrip()}\n")


def _can_use_tk() -> bool:
    try:
        import tkinter  # noqa: F401
        return True
    except Exception:
        return False


def _tk_pick_file(title: str) -> Optional[str]:
    """Ask the user to pick the geckodriver binary via a Tk file dialog.

    Only used in the double-click (no terminal) scenario; returns the chosen
    path, or None when skipped, cancelled, or Tk is unavailable/fails.
    """
    # With a terminal attached the user can follow printed instructions instead.
    if sys.stdout.isatty() or not _can_use_tk():
        return None
    try:
        import tkinter as tk
        from tkinter import filedialog, messagebox

        root = tk.Tk()
        root.withdraw()
        root.attributes("-topmost", True)

        messagebox.showinfo(
            "RW Site Scraper",
            "Could not find geckodriver automatically.\n\n"
            "Please select the geckodriver executable.\n"
            "Windows: geckodriver.exe\nLinux: geckodriver"
        )
        choice = filedialog.askopenfilename(title=title)
        root.destroy()
        choice = (choice or "").strip()
        return choice or None
    except Exception:
        # No display / user closed the dialog / Tk misbehaved: give up quietly.
        return None


class ProgressReporter:
    """TTY progress bar, or Tk window if launched by double-click (no TTY).

    The reporting channel is chosen once at construction time:
    stdout-is-a-TTY -> in-place text bar; otherwise, if Tk can be imported
    and a window can be created, a small GUI; otherwise plain text output.
    """

    def __init__(self, total: int, started_at: float):
        # Clamp total to >= 1 so the progress fraction never divides by zero.
        self.total = max(int(total), 1)
        self.started_at = started_at
        self.use_tty = sys.stdout.isatty()
        self.gui = False
        self._root = None
        self._label = None
        self._pbar = None

        if (not self.use_tty) and _can_use_tk():
            try:
                import tkinter as tk
                from tkinter import ttk

                self._root = tk.Tk()
                self._root.title("RW Site Scraper")
                self._root.geometry("620x150")
                self._root.resizable(False, False)

                self._label = tk.Label(self._root, text="Starting...", anchor="w")
                self._label.pack(fill="x", padx=12, pady=(12, 6))

                self._pbar = ttk.Progressbar(self._root, maximum=self.total, length=580)
                self._pbar.pack(padx=12, pady=(0, 10))

                self.gui = True
                self._root.update_idletasks()
                self._root.update()
            except Exception:
                # Any Tk failure (e.g. no display) silently falls back to text mode.
                self.gui = False

    def update(self, current: int, note: str = "") -> None:
        """Render progress for *current* completed items, with an optional note."""
        # Clamp into [0, total] so callers cannot overflow the bar.
        current = max(0, min(int(current), self.total))

        if self.gui and self._root:
            msg = f"{current}/{self.total}  {note}".strip()
            if self._label:
                self._label.config(text=msg)
            if self._pbar:
                self._pbar["value"] = current
            # Pump the Tk event loop so the window repaints without mainloop().
            self._root.update_idletasks()
            self._root.update()
            return

        width = 32
        frac = current / self.total
        filled = int(round(frac * width))
        bar = "#" * filled + "-" * (width - filled)

        # ETA from the average rate since start; the floor guards divide-by-zero.
        elapsed = max(time.time() - self.started_at, 0.0001)
        rate = current / elapsed
        eta = (self.total - current) / rate if rate > 1e-9 else 0.0

        msg = f"[{bar}] {current}/{self.total} ({frac*100:5.1f}%) ETA {int(eta)}s"
        if note:
            msg += f"  {note}"

        # \r rewrites the same terminal line; cap length to avoid wrapping.
        sys.stdout.write("\r" + msg[:240])
        sys.stdout.flush()
        if current == self.total:
            # Finish with a newline so subsequent prints start on a fresh line.
            sys.stdout.write("\n")
            sys.stdout.flush()

    def close(self) -> None:
        """Tear down the Tk window, if one was created."""
        if self.gui and self._root:
            try:
                self._root.destroy()
            except Exception:
                pass


# -----------------------------
# Geckodriver resolution / install
# -----------------------------
def _cache_dir() -> Path:
    if os.name == "nt":
        base = os.environ.get("LOCALAPPDATA") or os.environ.get("APPDATA") or str(Path.home())
        d = Path(base) / "RW_Site_Scraper"
    else:
        base = os.environ.get("XDG_CACHE_HOME") or str(Path.home() / ".cache")
        d = Path(base) / "rw_site_scraper"
    d.mkdir(parents=True, exist_ok=True)
    return d


def _resolve_from_env_or_path() -> Optional[str]:
    """Locate an already-installed geckodriver binary without downloading.

    Search order: GECKODRIVER_PATH env var (file or directory), next to this
    script, the scraper cache dir, common user bin dirs (non-Windows), then
    PATH. Returns the path as a string, or None when nothing is found.
    """
    driver_names = ("geckodriver.exe", "geckodriver")

    # 1) Explicit env var: accept the file itself or a directory containing it.
    if GECKODRIVER_PATH:
        env_path = Path(GECKODRIVER_PATH)
        if env_path.is_file():
            return str(env_path)
        if env_path.is_dir():
            for name in driver_names:
                candidate = env_path / name
                if candidate.is_file():
                    return str(candidate)

    # 2) Alongside this script.
    for name in driver_names:
        candidate = BASE_DIR / name
        if candidate.is_file():
            return str(candidate)

    # 3) The scraper's own cache dir.
    for name in driver_names:
        candidate = _cache_dir() / name
        if candidate.is_file():
            return str(candidate)

    # 4) Common per-user bin dirs (non-Windows only).
    if os.name != "nt":
        for directory in (Path.home() / "bin", Path.home() / ".local" / "bin"):
            candidate = directory / "geckodriver"
            if candidate.is_file():
                return str(candidate)

    # 5) Anything on PATH.
    return shutil.which("geckodriver") or shutil.which("geckodriver.exe") or None


def _platform_asset_key() -> Tuple[str, str]:
    """
    Returns (asset_contains, archive_type) matching geckodriver releases.
    """
    sysplat = sys.platform.lower()
    mach = platform.machine().lower()

    if sysplat.startswith("win"):
        is_64 = "64" in platform.architecture()[0]
        return ("win64" if is_64 else "win32", "zip")

    if sysplat.startswith("linux"):
        if "aarch64" in mach or "arm64" in mach:
            return ("linux-aarch64", "tar.gz")
        return ("linux64", "tar.gz")

    if sysplat == "darwin":
        if "arm64" in mach or "aarch64" in mach:
            return ("macos-aarch64", "tar.gz")
        return ("macos", "tar.gz")

    return ("linux64", "tar.gz")


def _download_latest_geckodriver(dest_dir: Path) -> Path:
    """Download and extract the latest geckodriver release into *dest_dir*.

    Queries the GitHub releases API, downloads the asset matching this
    platform, extracts the driver binary, marks it executable on POSIX,
    and returns the path to the extracted driver.

    Raises:
        RuntimeError: when no release asset matches this platform, or when
            the archive downloaded but no driver binary was extracted.
    """
    asset_key, archive_type = _platform_asset_key()

    api = "https://api.github.com/repos/mozilla/geckodriver/releases/latest"
    req = Request(api, headers={"User-Agent": "RW_Site_Scraper/1.0"})
    with urlopen(req, timeout=30) as r:
        data = json.loads(r.read().decode("utf-8", errors="replace"))

    # Bug fix: require the asset name to END with "<key>.<ext>". The previous
    # substring test ("macos" in name) also matched "macos-aarch64" assets, so
    # an Intel Mac could download the ARM build depending on asset ordering.
    wanted_suffix = f"{asset_key}.{archive_type}"
    dl_url = None
    dl_name = None
    for a in data.get("assets", []):
        name = a.get("name", "")
        if name.endswith(wanted_suffix):
            dl_url = a.get("browser_download_url")
            dl_name = name
            break

    if not dl_url:
        raise RuntimeError(f"Could not find a geckodriver asset for {asset_key} ({archive_type}).")

    # Stream the archive to disk.
    archive_path = dest_dir / dl_name
    req2 = Request(dl_url, headers={"User-Agent": "RW_Site_Scraper/1.0"})
    with urlopen(req2, timeout=60) as r, open(archive_path, "wb") as f:
        shutil.copyfileobj(r, f)

    driver_name = "geckodriver.exe" if os.name == "nt" else "geckodriver"
    extracted_path = dest_dir / driver_name

    # Pull just the driver binary out of the archive and normalize its location.
    if archive_type == "zip":
        with zipfile.ZipFile(archive_path, "r") as z:
            for member in z.namelist():
                if member.endswith(driver_name):
                    z.extract(member, path=dest_dir)
                    p = dest_dir / member
                    p.replace(extracted_path)
                    break
    else:
        with tarfile.open(archive_path, "r:gz") as t:
            for member in t.getmembers():
                if member.name.endswith("/" + driver_name) or member.name == driver_name:
                    t.extract(member, path=dest_dir)
                    p = dest_dir / member.name
                    p.replace(extracted_path)
                    break

    # Best-effort cleanup of the downloaded archive.
    try:
        archive_path.unlink(missing_ok=True)
    except Exception:
        pass

    # Mark the driver executable on POSIX systems.
    if os.name != "nt":
        try:
            st = os.stat(extracted_path)
            os.chmod(extracted_path, st.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
        except Exception:
            pass

    if not extracted_path.exists():
        raise RuntimeError("Download succeeded but geckodriver was not extracted.")

    return extracted_path


def ensure_geckodriver(ck: Dict) -> str:
    """Return a usable geckodriver path, resolving or installing as needed.

    Resolution order: env/PATH/script-dir discovery, then the path remembered
    in the checkpoint, then an auto-download into the cache dir, and finally
    an interactive Tk file picker (double-click runs only).

    Raises:
        RuntimeError: with remediation hints when every strategy fails.
    """
    # 1) Discover an existing binary without touching the network.
    found = _resolve_from_env_or_path()
    if found and Path(found).exists():
        return found

    # 2) A path saved by a previous run.
    remembered = (ck.get("geckodriver_path") or "").strip()
    if remembered and Path(remembered).exists():
        return remembered

    # 3) Auto-download the latest release into the cache dir.
    try:
        driver_path = _download_latest_geckodriver(_cache_dir())
        ck["geckodriver_path"] = str(driver_path)
        save_checkpoint(ck)
        return str(driver_path)
    except Exception as e:
        # 4) Last resort: let the user point us at the binary (no-TTY runs only).
        picked = _tk_pick_file("Select geckodriver / geckodriver.exe")
        if picked and Path(picked).exists():
            ck["geckodriver_path"] = picked
            save_checkpoint(ck)
            return picked

        raise RuntimeError(
            "Unable to locate or install geckodriver.\n\n"
            "Fix options:\n"
            "  - Put geckodriver on PATH\n"
            "  - OR set GECKODRIVER_PATH to the full driver path\n"
            "  - OR install via package manager (Linux often: sudo apt install firefox-geckodriver)\n\n"
            f"Underlying error: {e}"
        )


def create_driver(
    ck: Dict,
    headless: bool = True,
    *,
    page_load_strategy: str = "eager",
    block_images: bool = True,
    page_load_timeout: int = PAGE_LOAD_TIMEOUT,
    user_agent: Optional[str] = None,
) -> webdriver.Firefox:
    """Start Firefox through an explicit geckodriver Service (no Selenium Manager).

    Args:
        ck: Checkpoint dict (used to remember the resolved geckodriver path).
        headless: Run without a visible window when True.
        page_load_strategy: "eager" returns after DOMContentLoaded; "normal"
            waits for the full load event.
        block_images: Block image loading to reduce stalls and timeouts.
        page_load_timeout: Seconds for Selenium's navigation timeout.
        user_agent: Optional UA string override.
    """
    driver_binary = ensure_geckodriver(ck)

    opts = Options()
    if headless:
        opts.add_argument("-headless")
    if PRIVATE_BROWSER_MODE:
        # Explicit private mode in addition to the private-browsing pref below.
        opts.add_argument("-private")
    if page_load_strategy:
        opts.set_capability("pageLoadStrategy", page_load_strategy)

    # Stability/hygiene preferences: no notifications, no caches, no history,
    # no saved logins, tracking protection on, HTTP/3 off.
    stability_prefs = {
        "dom.webnotifications.enabled": False,
        "media.volume_scale": "0.0",
        "browser.privatebrowsing.autostart": True,
        "network.http.http3.enable": False,
        "browser.cache.disk.enable": False,
        "browser.cache.memory.enable": False,
        "browser.cache.offline.enable": False,
        "network.http.use-cache": False,
        "places.history.enabled": False,
        "signon.rememberSignons": False,
        "privacy.trackingprotection.enabled": True,
        "privacy.trackingprotection.pbmode.enabled": True,
    }
    for pref, value in stability_prefs.items():
        opts.set_preference(pref, value)

    if block_images:
        opts.set_preference("permissions.default.image", 2)  # 2 == block images
    if user_agent:
        opts.set_preference("general.useragent.override", user_agent)

    browser = webdriver.Firefox(
        service=FirefoxService(executable_path=driver_binary),
        options=opts,
    )
    browser.set_page_load_timeout(page_load_timeout)
    browser.set_script_timeout(SCRIPT_TIMEOUT)
    return browser


def reset_browser_state(driver) -> None:
    """Best-effort wipe of cookies plus local/session storage.

    Every step is independent and failures are swallowed: a partially-reset
    private session is still usable for scraping.
    """
    def _attempt(action) -> None:
        # Ignore any WebDriver hiccup; this is purely hygiene.
        try:
            action()
        except Exception:
            pass

    _attempt(driver.delete_all_cookies)

    def _clear_web_storage() -> None:
        driver.get("about:blank")
        driver.execute_script(
            """
            try { localStorage.clear(); } catch (e) {}
            try { sessionStorage.clear(); } catch (e) {}
            """
        )

    _attempt(_clear_web_storage)


# -----------------------------
# Navigation + HTTP fallback helpers
# -----------------------------

def _polite_delay() -> None:
    """Sleep a small randomized interval between requests to go easy on the server."""
    minimum, maximum = REQUEST_DELAY_RANGE
    try:
        time.sleep(random.uniform(minimum, maximum))
    except Exception:
        # If the RNG/sleep call misbehaves, fall back to the minimum delay.
        time.sleep(minimum)


def safe_get(
    driver,
    url: str,
    timeout: int = PAGE_LOAD_TIMEOUT,
    settle: float = NAV_SETTLE_SECONDS,
    max_tries: int = 2,
) -> None:
    """Navigate without getting stuck on pages that never fully finish loading.

    - Uses page_load_timeout
    - On timeout, calls window.stop() and continues if the DOM exists
    - Retries a couple times with light cleanup

    Args:
        driver: Live Selenium WebDriver instance.
        url: Absolute URL to navigate to.
        timeout: Per-attempt page-load timeout (seconds).
        settle: Pause after navigation so the DOM can settle (seconds).
        max_tries: Baseline retry count; the loop actually makes
            ``max_tries + 1`` attempts (see the ``range`` below).

    Raises:
        The last TimeoutException/WebDriverException when every attempt failed.
    """
    last_exc: Optional[BaseException] = None

    # NOTE: range(1, max_tries + 2) yields max_tries + 1 attempts in total.
    for attempt in range(1, max_tries + 2):
        try:
            driver.set_page_load_timeout(timeout)
            _polite_delay()
            # Python-side HTTP timeout to geckodriver (prevents indefinite hangs in driver.get)
            try:
                ce = getattr(driver, "command_executor", None)
                if ce is not None:
                    # Selenium 4: RemoteConnection holds a ClientConfig with a request timeout
                    if hasattr(ce, "_client_config") and hasattr(ce._client_config, "timeout"):
                        ce._client_config.timeout = int(timeout) + 10
                    # urllib3 PoolManager may also have a timeout attribute
                    if hasattr(ce, "_conn") and hasattr(ce._conn, "timeout"):
                        ce._conn.timeout = int(timeout) + 10
            except Exception:
                # Pokes private Selenium internals; harmless if they moved.
                pass
            driver.get(url)

            # Give the DOM a moment to settle
            time.sleep(settle)

            # If we have any DOM, we can proceed.
            try:
                driver.find_element(By.TAG_NAME, "body")
            except Exception:
                pass
            return

        except TimeoutException as e:
            last_exc = e
            # Kill the never-ending load; a partial DOM is usually enough.
            try:
                driver.execute_script("window.stop();")
            except Exception:
                pass
            time.sleep(settle)
            try:
                # Accept the page if a <body> made it into the DOM.
                driver.find_element(By.TAG_NAME, "body")
                return
            except Exception:
                pass

        except WebDriverException as e:
            last_exc = e

        # Cleanup between retries
        try:
            driver.get("about:blank")
        except Exception:
            pass
        # Linear backoff between attempts.
        time.sleep(0.25 * attempt)

    if last_exc:
        raise last_exc



def _url_variants(url: str) -> List[str]:
    """Return a small set of URL variants (www/non-www) to dodge occasional redirects."""
    outs: List[str] = []
    for u in [url]:
        if u not in outs:
            outs.append(u)
    if "//www." in url:
        u2 = url.replace("//www.", "//", 1)
        if u2 not in outs:
            outs.append(u2)
    else:
        u2 = url.replace("//", "//www.", 1)
        if u2 not in outs:
            outs.append(u2)
    return outs


def _fetch_html(url: str, timeout: int = 30) -> str:
    """Fetch *url* over plain HTTP (no Selenium) and decode the body as UTF-8."""
    request = Request(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
        },
    )
    with urlopen(request, timeout=timeout) as response:
        body = response.read()
    # Best-effort decode; replacement characters are fine for text scanning.
    return body.decode("utf-8", errors="replace")


class _VisibleTextExtractor(HTMLParser):
    """Very small HTML->visible text extractor (no external deps)."""

    _BLOCK_TAGS = {
        "p", "div", "br", "li", "tr", "td", "th",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "section", "article", "header", "footer",
    }

    def __init__(self):
        super().__init__()
        self.parts: List[str] = []

    def handle_starttag(self, tag, attrs):
        if tag.lower() in self._BLOCK_TAGS:
            self.parts.append("\n")

    def handle_endtag(self, tag):
        if tag.lower() in self._BLOCK_TAGS:
            self.parts.append("\n")

    def handle_data(self, data):
        if data and data.strip():
            self.parts.append(data)


def _html_to_text(html: str) -> str:
    """Reduce an HTML document to (roughly) its visible text content."""
    extractor = _VisibleTextExtractor()
    try:
        extractor.feed(html)
    except Exception:
        # Keep whatever was parsed before the failure.
        pass
    text = html_lib.unescape("".join(extractor.parts))
    # Collapse runs of blank lines and flatten tabs/carriage returns.
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r"[\t\r]+", " ", text)
    return text.strip()


def _extract_first_h1(html: str) -> str:
    m = re.search(r"<h1\b[^>]*>(.*?)</h1>", html, flags=re.IGNORECASE | re.DOTALL)
    if not m:
        return ""
    inner = m.group(1)
    inner = re.sub(r"<[^>]+>", " ", inner)
    inner = html_lib.unescape(inner)
    inner = re.sub(r"\s+", " ", inner).strip()
    return inner




def _extract_first_heading(html: str) -> str:
    """Extract a likely product title from HTML.

    Many Red Wing product pages place the actual product name in <h3>, while <h1>
    can be a site/banner header (e.g., 'RED WING FOR BUSINESS'). We therefore
    try meaningful <h3> first, then fall back to <h1>.
    """
    BAD = {"RED WING FOR BUSINESS"}
    STOP = {
        "FEATURES", "SPECIFICATIONS", "TECHNOLOGY", "DETAILS",
        "SIZE & FIT", "SIZING", "CARE", "REVIEWS", "RELATED PRODUCTS",
    }

    def _clean(inner: str) -> str:
        inner = re.sub(r"<[^>]+>", " ", inner)
        inner = html_lib.unescape(inner)
        inner = re.sub(r"\s+", " ", inner).strip()
        return inner

    def _ok(s: str) -> bool:
        s = (s or "").strip()
        if not s:
            return False
        up = s.upper()
        if up in BAD or up in STOP:
            return False
        if len(up) <= 3:
            return False
        if not re.search(r"[A-Z]", up):
            return False
        return True

    # Prefer the 'Name' field from the productInfo table when present
    mname = re.search(
        r"<td[^>]*class=['\"][^'\"]*prTitle[^'\"]*['\"][^>]*>\s*Name\s*</td>\s*"
        r"<td[^>]*class=['\"][^'\"]*prValue[^'\"]*['\"][^>]*>(.*?)</td>",
        html,
        flags=re.IGNORECASE | re.DOTALL,
    )
    if mname:
        s = _clean(mname.group(1))
        if _ok(s):
            return s


    # Try first meaningful <h3>
    for mh3 in re.finditer(r"<h3\b[^>]*>(.*?)</h3>", html, flags=re.IGNORECASE | re.DOTALL):
        s = _clean(mh3.group(1))
        if _ok(s):
            return s

    return _extract_first_h1(html)

# -----------------------------
# Scraping logic
# -----------------------------

def _is_product_link(href: str) -> bool:
    href = (href or "").strip()
    if not href:
        return False
    low = href.lower()
    if low.startswith("javascript:") or low.startswith("mailto:") or low.startswith("tel:"):
        return False
    if "/safety-boot/" in low:
        return True
    try:
        parsed = urlparse(href)
        path = (parsed.path or "").lower().rstrip("/")
        query = parse_qs(parsed.query or "")
    except Exception:
        path = low
        query = {}

    if "/footwear-" in path:
        # Keep only actual product routes, but allow query-driven product URLs.
        if re.fullmatch(r".*/footwear-[^/]+", path):
            for key in ("style", "styleNumber", "sku", "item", "pid", "productId"):
                if query.get(key) or query.get(key.lower()):
                    return True
            return False
        return True

    # Some site variants expose product URLs under generic paths.
    if "/product/" in path and re.search(r"\d{3,6}", path):
        return True

    # Red Wing wholesale product detail pages can be root-level numeric html paths: /02478.html
    if re.search(r"/\d{3,6}\.html$", path):
        return True

    return False


def _find_product_anchors(driver):
    """Locate product anchors on the catalog page.

    The Red Wing wholesale catalog renders product tiles with specific CSS
    classes.  Each tile (<li>) contains two links pointing at the same
    product detail page: one wrapping the image and one wrapping the product
    name.  Both have an href that ends in ".html" with the numeric style
    number.  On the current site the anchor for the name has the classes
    ``c-product-tile__pdp-link js-product-name``, while the image link has
    no special class but is still within the tile.  This helper first
    attempts to extract anchors using those specific selectors and falls
    back to scanning all anchors when necessary.

    Returns a list of WebElement objects (anchors) with unique hrefs.
    """
    anchors: List = []
    seen: Set[str] = set()
    # First try targeted selectors for the product name link.
    try:
        elements = driver.find_elements(
            By.CSS_SELECTOR,
            "li.js-product-grid-item a.c-product-tile__pdp-link.js-product-name[href]",
        )
        for a in elements:
            try:
                href = (a.get_attribute("href") or "").strip()
            except Exception:
                href = ""
            if not href or href in seen:
                continue
            if _is_product_link(href):
                anchors.append(a)
                seen.add(href)
    except Exception:
        pass
    # Then collect image links from the same tiles.
    try:
        elements2 = driver.find_elements(
            By.CSS_SELECTOR, "li.js-product-grid-item a[href$='.html']"
        )
        for a in elements2:
            try:
                href = (a.get_attribute("href") or "").strip()
            except Exception:
                href = ""
            if not href or href in seen:
                continue
            if _is_product_link(href):
                anchors.append(a)
                seen.add(href)
    except Exception:
        pass
    # Fallback: generic anchor scanning if targeted selectors found nothing.
    if not anchors:
        try:
            for a in driver.find_elements(By.XPATH, "//a[@href]"):
                try:
                    href = (a.get_attribute("href") or "").strip()
                except Exception:
                    href = ""
                if not href or href in seen:
                    continue
                if _is_product_link(href):
                    anchors.append(a)
                    seen.add(href)
        except Exception:
            pass
    return anchors


def _extract_product_links_from_page_source(driver) -> List[str]:
    """Fallback link discovery when clickable anchors are sparse or JS-rendered."""
    try:
        base_url = driver.current_url or CATALOG_URL_PRIMARY
    except Exception:
        base_url = CATALOG_URL_PRIMARY

    try:
        page_html = driver.page_source or ""
    except Exception:
        page_html = ""
    if not page_html:
        return []

    found: Set[str] = set()

    # 1) Parse anchors straight out of the markup, most specific selector first.
    try:
        soup = BeautifulSoup(page_html, "html.parser")
        tile_selectors = (
            # Preferred selector discovered on /footwear-rwbr product grid.
            "li.js-product-grid-item a.c-product-tile__pdp-link.js-product-name[href]",
            # Fallback for image-wrapped links inside the same tile.
            "li.js-product-grid-item a[href$='.html']",
        )
        for selector in tile_selectors:
            for tag in soup.select(selector):
                href = (tag.get("href") or "").strip()
                if href:
                    found.add(href)
        # Final generic fallback: every anchor on the page.
        for tag in soup.find_all("a", href=True):
            href = (tag.get("href") or "").strip()
            if href:
                found.add(href)
    except Exception:
        pass

    # 2) Regex sweep for URLs buried in JSON/script blobs.
    unescaped = html_lib.unescape(page_html)
    for pattern in (
        r"https?://[^\s\"'<>]+",
        r"/(?:footwear-rwbr|safety-boot|product)/[^\s\"'<>]+",
        r"/\d{3,6}\.html(?:[?#][^\s\"'<>]*)?",
    ):
        for match in re.finditer(pattern, unescaped, flags=re.IGNORECASE):
            candidate = (match.group(0) or "").strip()
            if candidate:
                found.add(candidate)

    # Resolve relative URLs, keep only product links, de-duplicate.
    results: List[str] = []
    emitted: Set[str] = set()
    for raw in found:
        absolute = urljoin(base_url, raw)
        if _is_product_link(absolute) and absolute not in emitted:
            emitted.add(absolute)
            results.append(absolute)
    return results


def ensure_logged_in(driver, target_url: str) -> None:
    """Attempt a best-effort login for order.redwingshoes.com if prompted.

    Flow:
      1. Load ``target_url`` (with polite delay and popup dismissal).
      2. If product anchors are already visible, assume we are logged in.
      3. Otherwise locate visible username/password inputs via several
         selector variants, fill them from RW_SITE_USERNAME /
         RW_SITE_PASSWORD, and submit (button click, or ENTER as fallback).
      4. Poll up to ~25s for product anchors to appear or the password
         field to disappear, then return regardless of outcome.

    Every lookup is wrapped in try/except, so a missing login form never
    raises; callers should verify the resulting page state themselves.

    Args:
        driver: Selenium WebDriver instance.
        target_url: Page to load (and, when prompted, log in for).
    """
    _polite_delay()
    safe_get(driver, target_url)
    dismiss_popups(driver)

    # Username/e-mail input variants seen across site revisions.
    login_user_selectors = [
        (By.CSS_SELECTOR, "input[type='email']"),
        (By.CSS_SELECTOR, "input[name='email']"),
        (By.CSS_SELECTOR, "input[name='username']"),
        (By.CSS_SELECTOR, "input[id*='email' i]"),
        (By.CSS_SELECTOR, "input[id*='user' i]"),
        (By.CSS_SELECTOR, "input[autocomplete='username']"),
    ]
    # Password input variants.
    login_pass_selectors = [
        (By.CSS_SELECTOR, "input[type='password']"),
        (By.CSS_SELECTOR, "input[name='password']"),
        (By.CSS_SELECTOR, "input[id*='password' i]"),
        (By.CSS_SELECTOR, "input[autocomplete='current-password']"),
    ]
    # Submit control variants (button text matched case-insensitively).
    submit_selectors = [
        (By.CSS_SELECTOR, "button[type='submit']"),
        (By.XPATH, "//button[contains(translate(.,'LOGINSGNIN','loginsgnin'),'login') or contains(translate(.,'LOGINSGNIN','loginsgnin'),'sign in')]"),
        (By.XPATH, "//input[@type='submit']"),
    ]

    # If product links are already visible, skip login attempts.
    try:
        if _find_product_anchors(driver):
            return
    except Exception:
        pass

    # Find the first *displayed* username and password fields.
    user_el = None
    pass_el = None
    for by, sel in login_user_selectors:
        try:
            cand = driver.find_element(by, sel)
            if cand.is_displayed():
                user_el = cand
                break
        except Exception:
            pass
    for by, sel in login_pass_selectors:
        try:
            cand = driver.find_element(by, sel)
            if cand.is_displayed():
                pass_el = cand
                break
        except Exception:
            pass

    # No visible login form — nothing to do.
    if not user_el or not pass_el:
        return

    # Fill in credentials; clear() may fail on some inputs, so it is best-effort.
    try:
        user_el.clear()
    except Exception:
        pass
    user_el.send_keys(RW_SITE_USERNAME)
    try:
        pass_el.clear()
    except Exception:
        pass
    pass_el.send_keys(RW_SITE_PASSWORD)

    # Try each submit control; fall back to pressing ENTER in the password field.
    submitted = False
    for by, sel in submit_selectors:
        try:
            btn = driver.find_element(by, sel)
            if btn.is_displayed() and btn.is_enabled():
                btn.click()
                submitted = True
                break
        except Exception:
            pass
    if not submitted:
        pass_el.send_keys(Keys.ENTER)

    # Wait for either product links to appear or password field to disappear.
    # The for/else below returns when no *displayed* password field remains;
    # a lookup exception (form removed from DOM) is treated as success too.
    end = time.time() + 25
    while time.time() < end:
        dismiss_popups(driver)
        try:
            if _find_product_anchors(driver):
                return
        except Exception:
            pass
        try:
            for by, sel in login_pass_selectors:
                pe = driver.find_element(by, sel)
                if pe.is_displayed():
                    break
            else:
                return
        except Exception:
            return
        time.sleep(0.4)


def open_footwear_section(driver) -> None:
    """Open the Footwear section after login when required by the site UI."""
    locator_attempts = [
        (By.ID, "footwear-label"),
        (By.CSS_SELECTOR, "#footwear-label"),
        (By.XPATH, "//button[@id='footwear-label' or @aria-controls='footwear']"),
        (By.XPATH, "//a[@id='footwear-label']"),
        (By.XPATH, "//*[self::button or self::a or @role='tab'][contains(normalize-space(.), 'Footwear')]"),
    ]
    for by, locator in locator_attempts:
        try:
            target = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((by, locator))
            )
            # Bring the control into view before clicking; fall back to a JS
            # click when the native click is intercepted.
            driver.execute_script(
                "arguments[0].scrollIntoView({block:'center', inline:'center'});",
                target,
            )
            time.sleep(0.15)
            try:
                target.click()
            except Exception:
                driver.execute_script("arguments[0].click();", target)
            time.sleep(0.8)
            return
        except Exception:
            continue


def open_red_wing_footwear_page(driver) -> None:
    """Use Footwear > Red Wing navigation described in docs, then ensure /footwear-rwbr."""
    # Prefer clicking the Red Wing submenu entry if visible.
    for by, locator in (
        (By.XPATH, "//a[contains(@href,'/footwear-rwbr')]"),
        (By.XPATH, "//button[contains(@data-target,'footwear-rwbr')]"),
        (By.XPATH, "//*[self::a or self::button][contains(normalize-space(.), 'Red Wing') and not(contains(normalize-space(.), 'Heritage'))]"),
    ):
        try:
            menu_item = WebDriverWait(driver, 6).until(EC.element_to_be_clickable((by, locator)))
            driver.execute_script("arguments[0].scrollIntoView({block:'center', inline:'center'});", menu_item)
            time.sleep(0.15)
            try:
                menu_item.click()
            except Exception:
                driver.execute_script("arguments[0].click();", menu_item)
            break
        except Exception:
            continue

    # Always navigate directly to the Red Wing footwear catalog. The current
    # site sometimes renders a quick-order dashboard first even after clicking
    # menu items, so the slug is loaded explicitly here.
    try:
        safe_get(driver, CATALOG_URL_PRIMARY)
    except Exception:
        pass
    try:
        WebDriverWait(driver, 12).until(
            lambda d: FOOTWEAR_RWB_SLUG in ((urlparse(d.current_url).path or "").lower())
        )
    except Exception:
        # One more direct load attempt if the slug never showed up in the URL.
        try:
            safe_get(driver, CATALOG_URL_PRIMARY)
        except Exception:
            pass


def discover_footwear_catalog_urls(driver) -> List[str]:
    """Discover footwear brand catalog paths from RW site navigation/elements."""
    try:
        base_url = driver.current_url or CATALOG_URL_PRIMARY
    except Exception:
        base_url = CATALOG_URL_PRIMARY

    raw_candidates: List[str] = []

    # Visible navigation links.
    for css in (
        "a[href*='/footwear-']",
        "[id*='footwear' i] a[href]",
        "nav a[href*='footwear']",
    ):
        try:
            for el in driver.find_elements(By.CSS_SELECTOR, css):
                href = (el.get_attribute("href") or "").strip()
                if href:
                    raw_candidates.append(href)
        except Exception:
            pass

    # Some menus stash the destination in data attributes instead of href.
    try:
        for el in driver.find_elements(By.CSS_SELECTOR, "[data-target], [data-url], [data-href]"):
            for attr in ("data-target", "data-url", "data-href"):
                value = (el.get_attribute(attr) or "").strip()
                if "/footwear-" in value:
                    raw_candidates.append(value)
    except Exception:
        pass

    # Hidden/off-canvas menu URLs embedded in the page source.
    try:
        page_html = driver.page_source or ""
    except Exception:
        page_html = ""
    if page_html:
        unescaped = html_lib.unescape(page_html)
        for pattern in (
            r"/footwear-[a-z0-9\-]+",
            r"https?://[^\s\"'<>]*/footwear-[a-z0-9\-]+",
        ):
            for m in re.finditer(pattern, unescaped, flags=re.IGNORECASE):
                raw_candidates.append(m.group(0))

    # Normalize, keep only on-site /footwear-<slug> paths, de-duplicate.
    urls: List[str] = []
    seen: Set[str] = set()
    for raw in raw_candidates:
        absolute = urljoin(base_url, raw)
        try:
            parts = urlparse(absolute)
            host = (parts.netloc or "").lower()
            path = (parts.path or "").lower()
        except Exception:
            continue
        if SITE_HOST not in host:
            continue
        if not re.search(r"/footwear-[a-z0-9\-]+$", path):
            continue
        normalized = f"{parts.scheme}://{parts.netloc}{path}"
        if normalized not in seen:
            seen.add(normalized)
            urls.append(normalized)

    # Always include configured default as fallback.
    if CATALOG_URL_PRIMARY not in seen:
        urls.insert(0, CATALOG_URL_PRIMARY)

    return urls


def wait_for_footwear_catalog_ready(driver, timeout: int = 30) -> None:
    """Wait for shared footwear-page elements instead of requiring product links immediately."""

    def _catalog_loaded(d) -> bool:
        """True when product tiles exist or legacy text heuristics match.

        The current grid renders ``li.js-product-grid-item`` tiles; older
        layouts are recognized by body text such as "Sort By", "Results"
        or "Filters", which may only appear before the user scrolls.
        """
        try:
            if d.find_elements(By.CSS_SELECTOR, "li.js-product-grid-item"):
                return True
        except Exception:
            pass
        try:
            body_text = (d.find_element(By.TAG_NAME, "body").text or "").lower()
        except Exception:
            return False
        in_footwear_context = (
            "footwear" in body_text
            or "/footwear-" in (d.current_url or "").lower()
        )
        if not in_footwear_context:
            return False
        return any(
            marker in body_text
            for marker in ("sort by", "result", "filters", "more results", "search")
        )

    WebDriverWait(driver, timeout).until(_catalog_loaded)


def extract_style_and_name_from_html(html: str, url: str) -> Tuple[str, str, str]:
    """Extract (style_number, name, style_text) from legacy and RW order-site pages.

    Fallback order:
      1. Legacy 'shoeguide' layout: <h3> carries "... #<style>", <strong>
         carries the name.
      2. JSON-LD script blocks: 'name' for the name; sku/mpn/productID
         for the style number.
      3. Generic headings / <title> / og:title for the name (skipping the
         site-wide "Red Wing for Business" header text).
      4. Synthesized values so none of the three fields is returned empty
         when any one of them could be derived.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Seed the style number from the URL; page content may override it below.
    style_number = extract_style_from_url(url)
    name = ""
    style_text = ""

    # 1) Legacy shoeguide layout.
    guide_div = soup.find("div", class_="shoeguide") or soup.find(id="printSpacing")
    if guide_div:
        h3_tag = guide_div.find("h3")
        strong_tag = guide_div.find("strong")
        if h3_tag:
            style_text = h3_tag.get_text(" ", strip=True)
            match = re.search(r"#\s*(\d+)", style_text)
            if match:
                style_number = match.group(1)
        if strong_tag:
            name = strong_tag.get_text(" ", strip=True)

    # 2) JSON-LD structured data (only consulted if name still missing).
    if not name:
        for script in soup.select("script[type='application/ld+json']"):
            try:
                payload = json.loads(script.get_text(strip=True))
            except Exception:
                continue
            # Payload may be a single object or a list of objects.
            objs = payload if isinstance(payload, list) else [payload]
            for obj in objs:
                if not isinstance(obj, dict):
                    continue
                if not name:
                    nm = (obj.get("name") or "").strip()
                    if nm:
                        name = nm
                if not style_number:
                    # First 3-6 digit run in any of the identifier fields wins.
                    for key in ("sku", "mpn", "productID"):
                        raw = str(obj.get(key) or "").strip()
                        m = re.search(r"\b(\d{3,6})\b", raw)
                        if m:
                            style_number = m.group(1)
                            break
                if name and style_number:
                    break
            if name and style_number:
                break

    # 3) Generic heading/title fallback for the name.
    if not name:
        for sel in ("h1", "h2", "title", "meta[property='og:title']"):
            try:
                el = soup.select_one(sel)
            except Exception:
                el = None
            if not el:
                continue
            if sel.startswith("meta"):
                txt = (el.get("content") or "").strip()
            else:
                txt = el.get_text(" ", strip=True)
            # Skip the site-wide header so it is never mistaken for a product name.
            if txt and "red wing for business" not in txt.lower():
                name = txt
                break

    # 4) Synthesize any still-missing fields from the ones we have.
    if not style_text:
        style_text = f"Style #{style_number}" if style_number else (name or "")
    if not style_number:
        style_number = style_text.strip()
    if not name:
        name = style_text
    return style_number, name, style_text
def dismiss_popups(driver) -> None:
    """Best-effort: click through consent/close overlays for up to ~2 seconds."""
    xpaths = [
        "//button[contains(.,'Agree') or contains(.,'Accept') or contains(.,'Proceed')]",
        "//a[contains(.,'Agree') or contains(.,'Accept') or contains(.,'Proceed')]",
        "//button[contains(.,'Close') or contains(.,'×') or contains(@aria-label,'Close')]",
    ]
    deadline = time.time() + 2.0
    while time.time() < deadline:
        dismissed = False
        for xp in xpaths:
            try:
                overlay = driver.find_element(By.XPATH, xp)
                if overlay.is_displayed() and overlay.is_enabled():
                    overlay.click()
                    dismissed = True
                    break
            except Exception:
                pass
        # Stop as soon as a full pass finds nothing clickable.
        if not dismissed:
            break
        time.sleep(0.2)


def scroll_to_load_all(driver, max_rounds: int = 80) -> None:
    """Scroll/click until the lazy-loaded product grid stops growing.

    Each round: dismiss popups, click a visible "More Results" control when
    present, count unique product-anchor hrefs, then scroll to the bottom to
    trigger lazy loading.  Terminates after 3 consecutive rounds with no new
    anchors (unless "More Results" was just clicked this round) or after
    ``max_rounds`` rounds, whichever comes first.

    Args:
        driver: Selenium WebDriver on a catalog page.
        max_rounds: Hard cap on scroll/click rounds.
    """
    stable_rounds = 0
    last_count = -1
    for _ in range(max_rounds):
        # Close any transient popups.
        dismiss_popups(driver)
        clicked_more = False
        # Attempt to click the "More Results" button if it exists.  The site renders
        # this as either a <button> or <a> element.  We search case-insensitively
        # and normalize whitespace to improve robustness.
        for xp in [
            "//button[contains(translate(normalize-space(.),'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'more results')]",
            "//a[contains(translate(normalize-space(.),'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'more results')]",
        ]:
            try:
                btn = driver.find_element(By.XPATH, xp)
                # Only click if the button is visible and not disabled
                if not btn.is_displayed():
                    continue
                # Some browsers don't expose is_enabled for <a>, so guard on disabled attribute
                disabled_attr = btn.get_attribute("disabled")
                if hasattr(btn, "is_enabled") and not btn.is_enabled():
                    continue
                if disabled_attr not in (None, "false", ""):  # skip if explicitly disabled
                    continue
                # Scroll the button into view to ensure clickable
                try:
                    driver.execute_script(
                        "arguments[0].scrollIntoView({block:'center', inline:'center'});",
                        btn,
                    )
                except Exception:
                    pass
                time.sleep(0.15)
                try:
                    btn.click()
                except Exception:
                    # Fallback to JS click
                    try:
                        driver.execute_script("arguments[0].click();", btn)
                    except Exception:
                        continue
                clicked_more = True
                # Allow new products to load before evaluating counts
                time.sleep(1.2)
                break
            except Exception:
                continue
        # Count current unique product anchors
        links = _find_product_anchors(driver)
        count = len({a.get_attribute("href") for a in links if a.get_attribute("href")})
        if count == last_count:
            stable_rounds += 1
        else:
            stable_rounds = 0
            last_count = count
        # If we've clicked "More Results" this round, give the page additional
        # scrolling time and avoid early termination.  Otherwise break after
        # consecutive stable rounds.
        if not clicked_more and stable_rounds >= 3:
            break
        # Scroll to the bottom to trigger lazy-loaded results and images.
        try:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        except Exception:
            pass
        time.sleep(1.0)



def _clean_listing_name(s: str) -> str:
    s = (s or "").strip()
    s = re.sub(r"\s+", " ", s)
    if not s:
        return ""
    u = s.upper()
    if u in {
        "RED WING FOR BUSINESS",
        "FEATURES",
        "SPECIFICATIONS",
        "TECHNOLOGY",
        "DETAILS",
        "VIEW DETAILS",
        "QUICK VIEW",
        "ADD TO CART",
    }:
        return ""
    if re.fullmatch(r"\d{2,}", s):
        return ""
    if "$" in s:
        return ""
    if len(s) < 3:
        return ""
    return s


def _listing_name_from_anchor(a) -> str:
    """Best-effort: read the product name as displayed on a catalog listing tile."""
    raw_sources = []
    for read in (
        lambda: a.get_attribute("aria-label") or "",
        lambda: a.get_attribute("title") or "",
        lambda: a.text or "",
    ):
        try:
            raw_sources.append(read())
        except Exception:
            pass

    cleaned = []
    for source in raw_sources:
        if not source:
            continue
        for line in str(source).splitlines():
            line = _clean_listing_name(line)
            if line:
                cleaned.append(line)

    # De-dup while preserving order.
    unique = list(dict.fromkeys(cleaned))
    if not unique:
        return ""

    def score(s: str) -> int:
        # Longer, alphabetic, gendered names score up; action labels score down.
        sc = len(s)
        if re.search(r"[A-Za-z]", s):
            sc += 15
        if re.search(r"\bWOMEN'?S\b", s.upper()):
            sc += 10
        if re.search(r"\bMEN'?S\b", s.upper()):
            sc += 8
        if any(t in s.lower() for t in ("view", "quick", "cart", "compare", "wishlist")):
            sc -= 30
        return sc

    # First candidate with the highest score wins (same as a stable
    # reverse sort followed by taking the head).
    return max(unique, key=score)


def apply_preferred_names(rows: List[List[str]], preferred: Dict[str, str]) -> int:
    """Overwrite Name column using preferred mapping (by style). Returns number updated."""
    if not rows or not preferred:
        return 0
    changed = 0
    for row in rows:
        # Only well-formed rows with at least [style, name] are eligible.
        if not isinstance(row, list) or len(row) < 2:
            continue
        style_key = (row[0] or "").strip()
        if not style_key:
            continue
        replacement = _clean_listing_name(preferred.get(style_key, ""))
        if replacement and row[1] != replacement:
            row[1] = replacement
            changed += 1
    return changed

def collect_product_links(driver) -> Tuple[List[str], Dict[str, str]]:
    """Collect unique product links from all configured catalog sections.

    Dedupes by style number when possible (preferred), otherwise by URL.

    Returns:
        (links, preferred_names) where ``links`` is the ordered list of
        unique product URLs.  NOTE(review): ``preferred_names`` is never
        populated in the current code path — the international-listing
        branch below is a no-op placeholder — so callers always receive an
        empty mapping.
    """
    all_links: List[str] = []
    seen_href: Set[str] = set()
    seen_style: Set[str] = set()
    preferred_names: Dict[str, str] = {}

    # Establish a session and discover every footwear catalog section.
    ensure_logged_in(driver, CATALOG_URL_PRIMARY)
    open_footwear_section(driver)
    catalog_urls = discover_footwear_catalog_urls(driver) or list(CATALOG_URLS)

    sys.stderr.write(f"[collect] footwear catalogs discovered: {len(catalog_urls)}\n")
    for u in catalog_urls:
        sys.stderr.write(f"[collect] catalog: {u}\n")

    for url in catalog_urls:
        _polite_delay()
        # Re-check login per section; some sections can bounce to the login page.
        ensure_logged_in(driver, url)
        safe_get(driver, url)
        wait_for_footwear_catalog_ready(driver, timeout=35)
        dismiss_popups(driver)
        # Exhaust lazy loading so the DOM holds the full grid.
        scroll_to_load_all(driver)

        # Primary: live DOM anchors
        page_links: List[str] = []
        for a in _find_product_anchors(driver):
            href = a.get_attribute("href")
            if not href:
                continue
            if not _is_product_link(href):
                continue
            page_links.append(href)

        # Fallback: parse page source for JS-embedded links.
        for href in _extract_product_links_from_page_source(driver):
            if href not in page_links:
                page_links.append(href)

        sys.stderr.write(f"[collect] {url} -> discovered {len(page_links)} candidate product links\n")

        for href in page_links:
            if href in seen_href:
                continue

            style = extract_style_from_url(href)
            if PREFER_INTERNATIONAL_LISTING_NAMES and style and "catalog=international" in url:
                # Preferred-name capture requires DOM anchor context; skip in URL-only pass.
                pass
            if style:
                if style in seen_style:
                    # Different catalog section may link the same style; keep the first.
                    seen_href.add(href)
                    continue
                seen_style.add(style)

            seen_href.add(href)
            all_links.append(href)

    return all_links, preferred_names


def extract_style_from_url(link: str) -> str:
    """Pull a numeric style number out of any known RW product URL shape."""
    if not link:
        return ""
    # Legacy shoeguide pattern: /safety-boot/<digits>...
    for pattern in (r"/safety-boot/(\d+)[-/]", r"/safety-boot/(\d+)"):
        m = re.search(pattern, link)
        if m:
            return m.group(1)

    # New order-site patterns
    m = re.search(r"/footwear-[^/]+/(?:[^/?#]*/)?(\d{3,6})(?:[-/?#]|$)", link, flags=re.IGNORECASE)
    if m:
        return m.group(1)
    # Red Wing wholesale often uses root numeric html product urls, e.g. /02478.html
    m = re.search(r"/(\d{3,6})\.html(?:[?#]|$)", link, flags=re.IGNORECASE)
    if m:
        return m.group(1)
    m = re.search(r"/(\d{3,6})(?:[-/?#]|$)", link)
    if m and "footwear-rwbr" in link.lower():
        return m.group(1)

    # Last resort: well-known query-string keys carrying the style.
    try:
        query = parse_qs(urlparse(link).query)
        for key in ("style", "styleNumber", "sku", "item", "pid", "productId"):
            for value in (query.get(key) or query.get(key.lower()) or []):
                hit = re.search(r"\b(\d{3,6})\b", str(value))
                if hit:
                    return hit.group(1)
    except Exception:
        pass
    return ""

# -----------------------------
# Checkpoint auto-repair helpers
# -----------------------------
def _looks_like_junk(s: str) -> bool:
    s = (s or "").strip().lower()
    if not s:
        return True
    # Common junk we saw when overlays/ads got captured as text
    junk_tokens = ("window.open", "javascript:", "facebook.com", "http://", "https://")
    if any(t in s for t in junk_tokens):
        return True
    # Overlong "brands" are almost always bad (ads/cookie banners)
    if len(s) > 60:
        return True
    return False


def _is_bad_row(row: List[str]) -> bool:
    """True when a checkpoint row is unusable: malformed, style-less, or sentinel-named."""
    try:
        style = (row[0] or "").strip()
        name = (row[1] or "").strip()
    except Exception:
        # Too short / wrong shape — treat as bad.
        return True
    if not style:
        return True
    return name.upper() == BAD_NAME_SENTINEL



def repair_missing_media(ck: Dict) -> int:
    """If checkpoint rows are missing URL/Image columns or have empty URL/Image, requeue those links.
    Runs at most once per checkpoint unless you delete/clear ck['media_repair_done'].
    """
    if ck.get("media_repair_done"):
        return 0
    rows = ck.get("rows", []) or []
    done_links = set(ck.get("done_links", []) or [])

    # Collect the styles whose rows are short or have a blank URL/Image cell.
    needs_media: Set[str] = set()
    for row in rows:
        if not isinstance(row, list) or not row:
            continue
        style = (row[0] or "").strip()
        if not style:
            continue
        if len(row) < EXPECTED_COLS:
            needs_media.add(style)
            continue
        url_ok = bool((row[2] or "").strip()) if len(row) > 2 else False
        img_ok = bool((row[3] or "").strip()) if len(row) > 3 else False
        if not (url_ok and img_ok):
            needs_media.add(style)

    if not needs_media:
        return 0

    # Un-mark matching links as done so the main loop revisits them.
    requeued = 0
    for href in list(done_links):
        if extract_style_from_url(href) in needs_media:
            done_links.discard(href)
            requeued += 1

    ck["done_links"] = sorted(done_links)
    return requeued

def repair_bad_checkpoint_rows(ck: Dict) -> int:
    """If checkpoint contains obviously bad rows, un-mark those links as 'done'.

    This fixes the situation where a previous run captured the site header/ads
    into the Name/Brand columns and those rows are now 'stuck' because resume
    logic skips already-done links.

    Returns:
        Number of links that were re-queued (removed from done_links).
    """
    try:
        rows = list(ck.get("rows", []) or [])
        done_links = set(ck.get("done_links", []) or [])
        if not rows or not done_links:
            return 0

        bad_styles = {row[0] for row in rows if row and _is_bad_row(row)}
        if not bad_styles:
            return 0

        requeued = 0
        for link in list(done_links):
            style = extract_style_from_url(link)
            if style and style in bad_styles:
                done_links.remove(link)
                requeued += 1

        if requeued:
            ck["done_links"] = sorted(done_links)

            # Give re-queued links their full retry budget back.
            fail_counts = dict(ck.get("fail_counts", {}) or {})
            for key in list(fail_counts):
                style = extract_style_from_url(key)
                if style and style in bad_styles:
                    fail_counts.pop(key, None)
            ck["fail_counts"] = fail_counts

            # Keep the salvage pass from treating these as hard failures.
            ck["hard_failed_links"] = [
                u for u in (ck.get("hard_failed_links", []) or [])
                if extract_style_from_url(u) not in bad_styles
            ]

            save_checkpoint(ck)

        return requeued
    except Exception:
        # Repair is strictly best-effort; never block a resume.
        return 0




def parse_field_line(text_block: str, field_name: str) -> str:
    """Parse a simple field/value from extracted page text.

    Red Wing pages often render fields in tables, so the extracted text can
    look like::

        Name
        DynaForce®

    Both 'Name: DynaForce' on one line and 'Name' alone with the value on a
    following line (within the next five lines) are supported.
    """
    lines = [ln.strip() for ln in (text_block or "").splitlines()]
    for idx, line in enumerate(lines):
        if not line or not line.startswith(field_name):
            continue
        inline_value = line[len(field_name):].strip(" :\t")
        if inline_value:
            return inline_value
        # Label on its own line — take the first non-empty of the next 5 lines.
        for follow in lines[idx + 1: idx + 6]:
            follow = (follow or "").strip()
            if follow:
                return follow
        return ""
    return ""


def extract_brand(header_text: str, body_text: str) -> str:
    """Extract the brand string that precedes 'Style #<number>'.

    Looks at the page header line first; if that yields nothing, scans the
    body text for the first line containing a 'Style #<digits>' marker.
    Fully-uppercase words are kept verbatim; all other words are capitalized
    (so 'RED WING' survives but 'irish setter' becomes 'Irish Setter').

    Args:
        header_text: Product page header (e.g. "Red Wing Style #400").
        body_text: Full extracted page text used as a fallback.

    Returns:
        Normalized brand string, or "" when no 'Style #' marker is found.
    """
    def _normalize(raw: str) -> str:
        # Shared word-casing rule (previously duplicated in both branches).
        return " ".join(w if w.isupper() else w.capitalize() for w in raw.split())

    ht = (header_text or "").strip()
    m = re.search(r"^(.*?)\s+style\s*#\s*\d+", ht, flags=re.IGNORECASE)
    if m:
        raw = m.group(1).strip()
        if raw:
            return _normalize(raw)

    # Fallback: only the FIRST body line with a style marker is considered.
    for line in body_text.splitlines():
        if re.search(r"\bstyle\s*#\s*\d+\b", line, flags=re.IGNORECASE):
            m2 = re.search(r"^(.*?)\s+style\s*#\s*\d+\b", line.strip(), flags=re.IGNORECASE)
            if m2:
                raw = m2.group(1).strip()
                if raw:
                    return _normalize(raw)
            break
    return ""


def extract_about_name(body_text: str, style: str) -> str:
    """Read the product Name, preferring the 'ABOUT THE <style>' section when present."""
    if style:
        marker = body_text.upper().find(f"ABOUT THE {style}")
        if marker != -1:
            # Restrict parsing to the section right after the marker.
            section = body_text[marker: marker + 1600]
            value = parse_field_line(section, "Name")
            if value:
                return value.strip()
    # Fallback: search the whole page text.
    value = parse_field_line(body_text, "Name")
    return value.strip() if value else ""


def parse_heights(text: str) -> Dict[str, bool]:
    """Detect boot shaft heights mentioned as '<N>-INCH' or '<N> INCH'.

    Args:
        text: Free-form product text (matched case-insensitively).

    Returns:
        Mapping of height label ('5"' through '12"') to whether that height
        was mentioned anywhere in the text.
    """
    upper = text.upper()  # uppercase once instead of once per pattern
    # Single pattern equivalent to the old pair: '\d-INCH' (optional spaces
    # around the hyphen) and '\d INCH' (whitespace separated).
    found = {int(n) for n in re.findall(r"\b(\d{1,2})(?:\s*-\s*|\s+)INCH\b", upper)}
    return {f'{i}"': (i in found) for i in range(5, 13)}


def classify_brand_family(brand_str: str) -> Tuple[bool, bool, bool]:
    """Return (is_red_wing, is_irish_setter, is_worx) flags for a brand string."""
    normalized = (brand_str or "").lower()
    red_wing = normalized.startswith("red wing")
    irish_setter = normalized.startswith("irish setter")
    worx = normalized.startswith("worx")
    # 'Worx by Red Wing' occasionally appears with a prefix; count it as Worx.
    if not worx and "worx" in normalized and "by red wing" in normalized:
        worx = True
    return red_wing, irish_setter, worx




# -----------------------------
# Image capture / saving helpers
# -----------------------------
def _safe_filename_from_name(name: str, max_len: int = 120) -> str:
    s = (name or "").strip()
    s = re.sub(r"\s+", " ", s)
    s = s.replace("®", "")
    s = re.sub(r"[^\w\-\.\s]+", "", s, flags=re.UNICODE)
    s = s.strip().replace(" ", "_")
    s = re.sub(r"_+", "_", s)
    if not s:
        s = "boot"
    if len(s) > max_len:
        s = s[:max_len].rstrip("_")
    return s


def _images_dir() -> Path:
    d = (Path.cwd().parent / "Images").resolve()
    d.mkdir(parents=True, exist_ok=True)
    return d


def _rel_image_path(fname: str) -> str:
    return str(Path("..") / "Images" / fname)


def _pick_best_product_img_element(driver):
    """Locate the <img> element most likely to be the main product photo.

    Order of preference:
      1. The active slick-carousel slide image (current wholesale PDP).
      2. Any <img> inside the legacy #productImage container.
      3. The largest visible image on the page (main content first), scored
         by area with small boosts for product-ish alt text, class names,
         and CDN-style src URLs.
    Returns the element, or None when nothing qualifies.
    """
    carousel_selectors = (
        "div.slick-slide.slick-current.slick-active img[itemprop='image']",
        "div.slick-slide.slick-current.slick-active img",
        "div.c-image-carousel__slider-item.js-carousel-item img[itemprop='image']",
    )
    for selector in carousel_selectors:
        try:
            candidate = driver.find_element(By.CSS_SELECTOR, selector)
            if candidate and candidate.is_displayed():
                return candidate
        except Exception:
            pass

    # Legacy product image container looked up by id.
    try:
        holder = driver.find_element(By.ID, "productImage")
        candidate = holder.find_element(By.CSS_SELECTOR, "img")
        if candidate and candidate.is_displayed():
            return candidate
    except Exception:
        pass

    # Fall back to scanning images: main content first, then the whole page.
    try:
        candidates = driver.find_elements(By.CSS_SELECTOR, "main img")
    except Exception:
        candidates = []
    if not candidates:
        try:
            candidates = driver.find_elements(By.TAG_NAME, "img")
        except Exception:
            candidates = []

    winner = None
    winner_score = 0.0
    for el in candidates:
        try:
            if not el.is_displayed():
                continue
            size = el.size or {}
            width = float(size.get("width") or 0)
            height = float(size.get("height") or 0)
            # Ignore thumbnails and icons.
            if width < 120 or height < 120:
                continue
            alt_text = (el.get_attribute("alt") or "").lower()
            class_attr = (el.get_attribute("class") or "").lower()
            src_attr = (el.get_attribute("src") or "").lower()
            score = width * height
            if any(word in alt_text for word in ("boot", "shoe", "chukka", "hiker", "moc")):
                score *= 1.2
            if any(word in class_attr for word in ("product", "primary", "hero", "image")):
                score *= 1.15
            if any(word in src_attr for word in ("/dw/image", "scene7", "/images", "static")):
                score *= 1.1
            if score > winner_score:
                winner_score = score
                winner = el
        except Exception:
            continue
    return winner



def _extract_product_image_url(driver) -> str:
    """Best-effort extraction of the main product image URL.

    Prefers the high-resolution ``zoomimg`` attribute over ``src``.  Tries,
    in order: the active slick-carousel slide, the legacy #productImage
    container, then a few orbit-slider / generic selectors.  Returns ""
    when nothing is found.
    """

    def zoom_or_src(img) -> str:
        # zoomimg (when present) points at a higher-resolution asset.
        zoom = (img.get_attribute("zoomimg") or "").strip()
        src = (img.get_attribute("src") or "").strip()
        return zoom or src

    # 0) Active slick-carousel slide on the current wholesale PDP.
    for selector in (
        "div.slick-slide.slick-current.slick-active img[itemprop='image']",
        "div.slick-slide.slick-current.slick-active img",
        "div.c-image-carousel__slider-item.js-carousel-item img[itemprop='image']",
        "div.c-image-carousel__slider-item.js-carousel-item img",
    ):
        try:
            url = zoom_or_src(driver.find_element(By.CSS_SELECTOR, selector))
            if url:
                return url
        except Exception:
            continue

    # 1) Legacy container looked up by id.
    try:
        holder = driver.find_element(By.ID, "productImage")
        try:
            img = holder.find_element(By.CSS_SELECTOR, "img")
        except Exception:
            img = None
        if img:
            url = zoom_or_src(img)
            if url:
                return url
    except Exception:
        pass

    # 2) Orbit slider / active slide / any image in main content.
    for selector in (
        'li[data-orbit-slide="product-1"] img',
        'li.active img',
        'ul#productImage li.active img',
        'main li.active img',
        'main img',
    ):
        try:
            url = zoom_or_src(driver.find_element(By.CSS_SELECTOR, selector))
            if url:
                return url
        except Exception:
            continue

    return ""

def capture_product_image_temp(driver, style: str) -> str:
    """Save the main product image as a temp file (__<style>.png).

    Preferred: download the product image URL (zoomimg/src) for best quality.
    Fallback: element screenshot if download fails.
    Returns relative path like ../Images/__595.png (or empty string).
    """
    style = (style or "").strip()
    if not style:
        return ""

    # Double-underscore prefix marks the file as provisional; it is renamed
    # later by finalize_image_filename() once the product name is final.
    fname = f"__{style}.png"
    out_path = _images_dir() / fname

    # URL download first (avoids overlays, best resolution)
    try:
        img_url = _extract_product_image_url(driver)
        if img_url:
            # The URL may come back HTML-escaped from the DOM.
            img_url = img_url.replace("&amp;", "&")
            headers = {"User-Agent": "Mozilla/5.0"}
            r = requests.get(img_url, headers=headers, timeout=30)
            if r.ok and r.content:
                # NOTE(review): bytes are written under a .png name even if the
                # server actually returns JPEG; downstream only uses the path.
                out_path.write_bytes(r.content)
                return _rel_image_path(fname)
    except Exception:
        pass

    # Fallback: screenshot the best image element
    img_el = None
    try:
        img_el = _pick_best_product_img_element(driver)
    except Exception:
        img_el = None

    if not img_el:
        return ""

    # Center the element in the viewport so the screenshot is not clipped.
    try:
        driver.execute_script("arguments[0].scrollIntoView({block:'center', inline:'center'});", img_el)
        time.sleep(0.2)
    except Exception:
        pass

    # Clear cookie banners / modals that could overlap the element.
    try:
        dismiss_popups(driver)
    except Exception:
        pass

    try:
        img_el.screenshot(str(out_path))
        return _rel_image_path(fname)
    except Exception:
        # Last resort: full-page screenshot is better than no image at all.
        try:
            driver.save_screenshot(str(out_path))
            return _rel_image_path(fname)
        except Exception:
            return ""


def finalize_image_filename(style: str, boot_name: str, rel_temp_path: str) -> str:
    """Rename ../Images/__<style>.png -> ../Images/<style>.png.

    *boot_name* is accepted for interface compatibility but does not affect
    the final filename.  Returns the new relative path, or the original
    *rel_temp_path* unchanged when the rename cannot be performed (missing
    temp file, empty style after sanitizing, or a filesystem error).
    """
    if not rel_temp_path:
        return ""
    temp_file = (Path.cwd() / rel_temp_path).resolve()
    if not temp_file.exists():
        return rel_temp_path

    style_token = re.sub(r"[^\dA-Za-z_-]", "", (style or "").strip())
    if not style_token:
        return rel_temp_path

    final_name = f"{style_token}.png"
    try:
        temp_file.replace(_images_dir() / final_name)
    except Exception:
        return rel_temp_path
    return _rel_image_path(final_name)

def infer_gender(header: str, source_url: str) -> Tuple[bool, bool]:
    """Return (male, female) flags inferred from the URL and header only.

    Body text is deliberately ignored because it frequently mentions both
    genders.  URL slugs/query parameters win over header wording; when
    neither source is conclusive, both flags come back False (unknown).
    """
    header_uc = (header or "").upper()
    url_lc = (source_url or "").lower()

    # URL is most reliable (product pages use /mens-... or /womens-...).
    # The women's check must run first: "womens-" also contains "mens-".
    if (
        re.search(r"/womens(?:[-/]|$)", url_lc)
        or "womens-" in url_lc
        or "gender=women" in url_lc
        or "gender=female" in url_lc
    ):
        return (False, True)
    if (
        re.search(r"/mens(?:[-/]|$)", url_lc)
        or "mens-" in url_lc
        or "gender=men" in url_lc
        or "gender=male" in url_lc
    ):
        return (True, False)

    return (
        bool(re.search(r"\bMEN'?S\b", header_uc)),
        bool(re.search(r"\bWOMEN'?S\b", header_uc)),
    )

class _LabelAttrCollector(HTMLParser):
    """Stdlib HTML parser that collects non-empty ``alt``/``title``/``aria-label``
    attribute values, normalised to uppercase for case-insensitive matching."""

    _LABEL_ATTRS = ("alt", "title", "aria-label")

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.tokens: Set[str] = set()

    def handle_starttag(self, tag, attrs) -> None:
        # ``attrs`` is a list of (name, value) pairs; value may be None for
        # boolean attributes, so guard with isinstance.
        for attr, val in attrs:
            if attr in self._LABEL_ATTRS and isinstance(val, str) and val.strip():
                self.tokens.add(val.strip().upper())


def _collect_label_tokens(html: str) -> Set[str]:
    """Return the set of uppercased alt/title/aria-label values found in *html*.

    Uses the standard-library ``HTMLParser`` (already imported at the top of
    this script) instead of BeautifulSoup: bs4 is not among this script's
    documented dependencies, and a missing import would previously be
    swallowed by the blanket ``except Exception`` so the icon-label hints
    were silently never collected.
    """
    collector = _LabelAttrCollector()
    try:
        collector.feed(html or "")
        collector.close()
    except Exception:
        # Malformed HTML: keep whatever was collected before the failure.
        pass
    return collector.tokens


def _build_row_from_text(
    style: str,
    name: str,
    header: str,
    body_text: str,
    *,
    source_url: str = "",
    image_rel_path: str = "",
    html: str = "",
) -> List[str]:
    """
    Build a row for the markdown table from the scraped pieces of a product page.

    In addition to the previously-supported text parsing, this version accepts the
    raw HTML of the page (via the ``html`` parameter) so that additional
    features can be inferred from non-visible attributes such as ``alt``,
    ``title``, or ``aria-label`` on icons.  Passing ``html`` is optional; if
    omitted, the function falls back to the older behaviour.

    The returned list matches MD_HEADERS order: style, name, URL, image path,
    brand string, then 1/0 feature flags (gender, brand family, toe types,
    protections, construction, silhouette, heights, USA origin).
    """

    body_text = body_text or ""
    header = header or ""

    body_upper = body_text.upper()
    body_lower = body_text.lower()

    brand_str = extract_brand(header, body_text)

    # Guard against junk captured from overlays/ads (seen as "Window.open(...)" etc.)
    if _looks_like_junk(brand_str):
        brand_str = ""
    is_rw, is_is, is_worx = classify_brand_family(brand_str)

    # Fall back to keyword sniffing when the structured brand extraction failed.
    if not brand_str:
        hu = header.upper()
        bu = body_text.upper()
        if "IRISH SETTER" in hu or "IRISH SETTER" in bu:
            brand_str = "Irish Setter"
        elif "WORX" in hu or "WORX" in bu:
            brand_str = "Worx"
        elif "RED WING" in hu or "RED WING" in bu:
            brand_str = "Red Wing"
        is_rw, is_is, is_worx = classify_brand_family(brand_str)

    header_upper = header.upper()
    male, female = infer_gender(header, source_url)

    # Fallback gender detection: if neither flag is set by URL or header,
    # examine the first few hundred characters of the body text.  Many Red Wing
    # product pages include a tagline like "Men's 9-Inch Waterproof…" or
    # "Women's 6-Inch Safety Toe Boot" near the top of the body.  Checking
    # only the first portion of the text reduces false positives from
    # unrelated content further down the page.
    if not male and not female:
        first_chunk = body_upper[:600]
        male_found = bool(re.search(r"\bMEN'?S\b", first_chunk))
        female_found = bool(re.search(r"\bWOMEN'?S\b", first_chunk))
        if male_found and not female_found:
            male = True
        elif female_found and not male_found:
            female = True

    # IMPORTANT: Name must come ONLY from extract_style_and_name_from_html().
    # No fallbacks, no alternative heuristics.
    steel_toe = "STEEL TOE" in body_upper
    aluminum_toe = "ALUMINUM TOE" in body_upper or "ALLOY TOE" in body_upper
    non_metal_toe = ("NON-METALLIC TOE" in body_upper) or ("NON METALLIC TOE" in body_upper) or ("COMPOSITE TOE" in body_upper)
    safety_toe = ("SAFETY TOE" in body_upper) or steel_toe or aluminum_toe or non_metal_toe
    soft_toe = not safety_toe

    met_guard = ("METATARSAL GUARD" in body_upper) or ("MET GUARD" in body_upper)
    waterproof = "WATERPROOF" in body_upper

    # Insulation: a structured "Insulation:" field wins over free-text hints.
    # "400g" style gram weights count as insulated when > 0.
    insulation_line = parse_field_line(body_text, "Insulation")
    if insulation_line:
        il = insulation_line.strip().lower()
        if re.search(r"\b(non[-\s]?insulated|uninsulated)\b", il):
            insulated = False
        else:
            m = re.search(r"(\d+)\s*g\b", il)
            if m:
                try:
                    insulated = int(m.group(1)) > 0
                except Exception:
                    insulated = True
            else:
                insulated = ("insulat" in il) or ("thinsulate" in il)
    else:
        if re.search(r"\b(non[-\s]?insulated|uninsulated)\b", body_lower):
            insulated = False
        else:
            insulated = ("insulat" in body_lower) or ("thinsulate" in body_lower)

    # Abbreviation checks (SR/EH/SD) run against the raw (case-sensitive) text
    # on purpose: matching "sr" in prose would be far too noisy.
    slip_resistant = ("SLIP RESISTANT" in body_upper) or (re.search(r"\bSR\b", body_text) is not None)
    electrical_hazard = ("ELECTRICAL HAZARD" in body_upper) or (re.search(r"\bEH\b", body_text) is not None)
    puncture_resistant = ("PUNCTURE RESISTANT" in body_upper) or ("PUNCTURE" in body_upper)
    static_dissipative = ("STATIC DISSIPATIVE" in body_upper) or (re.search(r"\bSD\b", body_text) is not None)
    ankle_protection = ("ANKLE PROTECTION" in body_upper) or ("ankle" in body_lower and "protect" in body_lower)
    boa = ("BOA" in body_upper)

    defined_heel_line = parse_field_line(body_text, "Defined Heel")
    defined_heel = defined_heel_line.lower().startswith("yes") if defined_heel_line else ("DEFINED HEEL" in body_upper)

    # "All leather upper" is inferred by the absence of synthetic materials in
    # the structured "Leather Type" field.
    leather_line = parse_field_line(body_text, "Leather Type")
    all_leather_upper = False
    if leather_line:
        ll = leather_line.lower()
        if not any(x in ll for x in ["mesh", "nylon", "fabric", "textile", "poly", "synthetic"]):
            all_leather_upper = True

    # "Resolvable" is a known site typo for "Resoleable"; accept both.
    resoleable_line = parse_field_line(body_text, "Resoleable") or parse_field_line(body_text, "Resolvable")
    resoleable = False
    if resoleable_line:
        resoleable = resoleable_line.strip().lower().startswith("yes")
    elif re.search(r"\bresoleable\b", body_lower) or re.search(r"\bresolvable\b", body_lower):
        resoleable = "yes" in body_lower or "true" in body_lower

    # Silhouette flags come from the (already-extracted) product name.
    name_lower = (name or "").lower()
    oxford_athletic = ("oxford" in name_lower) or ("athletic" in name_lower) or ("shoe" in name_lower)
    chukka = "chukka" in name_lower
    hiker = "hiker" in name_lower

    heights = parse_heights(header + "\n" + body_text)

    origin_line = parse_field_line(body_text, "Country of Origin")
    origin_lower = origin_line.lower()
    made_in_usa = ("made in usa" in origin_lower) or ("made in the usa" in origin_lower)
    built_in_usa = ("built in usa" in origin_lower) or ("assembled in the usa" in origin_lower) or made_in_usa

    # Use raw HTML (if provided) to look for additional signals via alt/title/aria-label.
    # Some attributes like "Defined Heel" or "Made in USA" are exposed only through
    # icon alt text rather than the visible page text.
    alt_tokens: Set[str] = set()
    if html:
        alt_tokens = _collect_label_tokens(html)

    # If not already marked, infer 'Defined Heel' from alt tokens.
    if not defined_heel and alt_tokens:
        for tok in alt_tokens:
            # Exact match on the phrase or any token containing both words.
            if tok == "DEFINED HEEL" or ("DEFINED" in tok and "HEEL" in tok):
                defined_heel = True
                break

    # Infer Made/Built in USA from alt tokens if not already set.
    if alt_tokens:
        for tok in alt_tokens:
            t = tok.replace("\xa0", " ")  # normalise non-breaking spaces
            if not made_in_usa and ("MADE IN USA" in t or "MADE IN THE USA" in t):
                made_in_usa = True
                built_in_usa = True
                continue
            # Some icons may specify assembled/built rather than made.
            if not built_in_usa and ("BUILT IN USA" in t or "ASSEMBLED IN USA" in t or "ASSEMBLED IN THE USA" in t):
                built_in_usa = True
                # Do not set made_in_usa here as assembled can use imported materials.
        # no break: examine all tokens

    return [
        md_escape_cell(style),
        md_escape_cell(name),
        md_escape_cell(source_url),
        md_escape_cell(image_rel_path),
        md_escape_cell(brand_str),

        b01(male),
        b01(female),

        b01(is_rw),
        b01(is_is),
        b01(is_worx),

        b01(safety_toe),
        b01(steel_toe),
        b01(non_metal_toe),
        b01(aluminum_toe),
        b01(met_guard),
        b01(soft_toe),

        b01(waterproof),
        b01(insulated),
        b01(slip_resistant),
        b01(electrical_hazard),
        b01(puncture_resistant),
        b01(static_dissipative),
        b01(ankle_protection),
        b01(boa),

        b01(defined_heel),
        b01(all_leather_upper),
        b01(resoleable),

        b01(oxford_athletic),
        b01(chukka),
        b01(hiker),

        b01(heights['5"']),
        b01(heights['6"']),
        b01(heights['7"']),
        b01(heights['8"']),
        b01(heights['9"']),
        b01(heights['10"']),
        b01(heights['11"']),
        b01(heights['12"']),

        b01(built_in_usa),
        b01(made_in_usa),
    ]


def _scrape_product_via_http(link: str, image_temp_rel: str = "") -> List[str]:
    """Selenium-free fallback: fetch the product page over plain HTTP and
    build a markdown row from its HTML/text.

    Refuses the authenticated order site outright (the browser's auth
    cookies are not available here).  Tries each URL variant in turn and
    re-raises the last fetch/parse error when all of them fail.
    """
    if SITE_HOST in (urlparse(link).netloc or ""):
        raise RuntimeError("HTTP fallback disabled for authenticated order site")

    last_error: Optional[BaseException] = None
    for variant in _url_variants(link):
        try:
            page_html = _fetch_html(variant, timeout=30)
            style, name, style_text = extract_style_and_name_from_html(page_html, variant)
            return _build_row_from_text(
                style,
                name,
                style_text,
                _html_to_text(page_html),
                source_url=link,
                image_rel_path=image_temp_rel,
                html=page_html,
            )
        except Exception as exc:
            last_error = exc

    if last_error:
        raise last_error
    raise RuntimeError("HTTP fallback failed")


def scrape_product(driver, link: str, *, prefer_http: bool = False) -> List[str]:
    """Scrape one product page into a markdown table row.

    Default path is Selenium (required for the authenticated order site);
    any Selenium failure falls back to the plain-HTTP scraper.  With
    ``prefer_http=True`` (used by the salvage pass) the HTTP path is tried
    first, and Selenium only runs if it fails.
    """
    image_temp_rel = ""  # always defined (used for HTTP fallback and row build)
    style = extract_style_from_url(link)
    style_from_url = style  # kept for debugging; not otherwise used below

    # Throttle between requests so we don't hammer the site.
    _polite_delay()

    if prefer_http:
        try:
            return _scrape_product_via_http(link, image_temp_rel=image_temp_rel)
        except Exception:
            pass

    # Selenium first
    try:
        if SITE_HOST in (urlparse(link).netloc or ""):
            ensure_logged_in(driver, CATALOG_URL_PRIMARY)
        safe_get(driver, link)
        dismiss_popups(driver)

        # IMPORTANT: Style/Name must come ONLY from this HTML parsing logic.
        html = driver.page_source or ""
        style, name, style_text = extract_style_and_name_from_html(html, link)

        # Capture product image early (temp file: __<style>.png)
        try:
            if style:
                image_temp_rel = capture_product_image_temp(driver, style)
        except Exception:
            image_temp_rel = ""
        # Make sure the DOM body exists before reading its visible text.
        WebDriverWait(driver, 25).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        body_text = driver.find_element(By.TAG_NAME, "body").text or ""
        header = style_text

        row = _build_row_from_text(
            style,
            name,
            header,
            body_text,
            source_url=link,
            image_rel_path=image_temp_rel,
            html=html,
        )
        return row

    except Exception:
        # Fallback to HTTP text extraction (much more reliable for the pages Selenium hangs on)
        return _scrape_product_via_http(link, image_temp_rel=image_temp_rel)
def write_markdown(rows: List[List[str]]) -> None:
    """Render all rows as a markdown table (sorted by style) into OUT_MD.

    Styles with digits sort numerically first; digit-free styles sort after
    them, alphabetically.  Rows are padded or truncated to the current
    header schema so checkpoints written under an older schema still render.
    The file is written atomically.
    """
    headers = MD_HEADERS
    n_cols = len(headers)

    def sort_key(row: List[str]):
        digits = re.sub(r"\D", "", row[0])
        try:
            return (0, int(digits))
        except Exception:
            return (1, row[0])

    lines = [
        "|" + "|".join(headers) + "|",
        "|" + "|".join(["---"] * n_cols) + "|",
    ]
    for row in sorted(rows, key=sort_key):
        if len(row) < n_cols:
            row = row + [""] * (n_cols - len(row))
        elif len(row) > n_cols:
            row = row[:n_cols]
        lines.append("|" + "|".join(row) + "|")

    atomic_write(OUT_MD, "\n".join(lines) + "\n")


def _read_errors_urls() -> List[str]:
    """Parse unique URLs out of the errors file.

    Lines look like "FAILED 3x: <url>"; the first http(s) URL on each
    non-empty line is kept, preserving first-seen order without duplicates.
    Returns [] when the file does not exist.
    """
    if not ERRORS_TXT.exists():
        return []
    collected: List[str] = []
    content = ERRORS_TXT.read_text(encoding="utf-8", errors="replace")
    for raw_line in content.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        match = re.search(r"https?://\S+", stripped)
        if match and match.group(0) not in collected:
            collected.append(match.group(0))
    return collected


def _rewrite_errors_file(urls: List[str]) -> None:
    """Replace the errors file with the given URLs; delete it when none remain."""
    if not urls:
        # Best-effort removal; a leftover file is harmless.
        try:
            ERRORS_TXT.unlink(missing_ok=True)
        except Exception:
            pass
        return
    body = "\n".join(f"FAILED {MAX_FAILS_PER_LINK}x: {u}" for u in urls) + "\n"
    atomic_write(ERRORS_TXT, body)


def main() -> None:
    """Orchestrate the whole scrape.

    Flow: load checkpoint -> optionally repair stale rows -> discover or
    reuse product links -> scrape each link (with per-link retries and
    driver restarts) -> write the markdown table -> optional salvage pass
    over links that failed repeatedly.  Checkpoint state is saved after
    every success/failure so the run can be interrupted and resumed.
    """
    ck = load_checkpoint()

    # Resume state pulled from the checkpoint; every field has a safe default.
    done_links: Set[str] = set(ck.get("done_links", []))
    fail_counts: Dict[str, int] = dict(ck.get("fail_counts", {}))
    rows: List[List[str]] = list(ck.get("rows", []))
    started_at = float(ck.get("started_at", time.time()))

    hard_failed_links: List[str] = list(ck.get("hard_failed_links", []))
    if not hard_failed_links:
        # Older checkpoints kept failures only in the errors file; recover them.
        hard_failed_links = _read_errors_urls()


    # If older checkpoint rows captured overlays/ads, they may be "stuck" because resume logic
    # skips already-done links. Auto-repair by re-queuing those styles.
    if AUTO_REPAIR_BAD_ROWS:
        repaired = repair_bad_checkpoint_rows(ck)
        if repaired:
            # The repair mutates the checkpoint in place; re-read local copies.
            done_links = set(ck.get("done_links", []))
            fail_counts = dict(ck.get("fail_counts", {}))
            rows = list(ck.get("rows", []))
            hard_failed_links = list(ck.get("hard_failed_links", []))


    reporter: Optional[ProgressReporter] = None
    driver = None

    try:
        driver = create_driver(ck, headless=RUN_HEADLESS, page_load_strategy="eager", block_images=True)
        reset_browser_state(driver)

        product_links = ck.get("product_links") or []
        preferred_names: Dict[str, str] = ck.get("preferred_names") or {}

        # Discover product links fresh (or when the checkpoint has none).
        if REFRESH_PRODUCT_LINKS_EACH_RUN or not product_links:
            reporter = ProgressReporter(total=1, started_at=started_at)
            reporter.update(0, note="Refreshing product links...")
            fresh_links, fresh_preferred_names = collect_product_links(driver)
            if fresh_links:
                product_links = fresh_links
            else:
                # Keep the checkpointed list rather than wiping progress.
                sys.stderr.write(
                    "[warn] Link refresh discovered 0 links; falling back to checkpoint link list.\n"
                )
            if fresh_preferred_names:
                preferred_names = fresh_preferred_names

            # Apply preferred International listing names to any already-scraped rows (resume-safe)
            try:
                if PREFER_INTERNATIONAL_LISTING_NAMES and preferred_names:
                    nupd = apply_preferred_names(rows, preferred_names)
                    if nupd:
                        ck["rows"] = rows
                        ck["preferred_names"] = preferred_names
                        save_checkpoint(ck)
                        print(f"Applied preferred International names to {nupd} existing rows")
            except Exception:
                pass

            ck["product_links"] = product_links
            if PREFER_INTERNATIONAL_LISTING_NAMES:
                ck["preferred_names"] = preferred_names
            save_checkpoint(ck)

        # If resuming and we prefer International listing names but the mapping is missing, collect it once.
        if PREFER_INTERNATIONAL_LISTING_NAMES and not preferred_names:
            try:
                _, preferred_names = collect_product_links(driver)
                ck["preferred_names"] = preferred_names
                save_checkpoint(ck)
            except Exception:
                preferred_names = preferred_names or {}

            # Apply preferred names to any already-scraped rows (resume-safe)
            try:
                if preferred_names:
                    nupd = apply_preferred_names(rows, preferred_names)
                    if nupd:
                        ck["rows"] = rows
                        ck["preferred_names"] = preferred_names
                        save_checkpoint(ck)
                        print(f"Applied preferred International names to {nupd} existing rows")
            except Exception:
                pass

        # Re-queue rows that are missing their URL or image columns.
        if AUTO_REPAIR_MISSING_MEDIA:
            rm = repair_missing_media(ck)
            # Mark done so we don't requeue forever
            if not ck.get("media_repair_done"):
                ck["media_repair_done"] = True
                save_checkpoint(ck)
            if rm:
                print(f"Re-queued {rm} links due to missing URL/Image")
                save_checkpoint(ck)

        # Optional single-style test mode
        if ONLY_STYLE.strip():
            want = ONLY_STYLE.strip()
            product_links = [u for u in (product_links or []) if extract_style_from_url(u) == want]
            ck["product_links"] = product_links
            save_checkpoint(ck)

        total = len(product_links)
        if total == 0:
            raise RuntimeError(
                "No product links discovered from catalog page. "
                "Login/navigation likely succeeded but product link extraction found nothing."
            )
        # Replace the placeholder reporter with one sized for the real run.
        if reporter:
            reporter.close()
        reporter = ProgressReporter(total=total, started_at=started_at)

        processed = len(done_links)
        reporter.update(processed, note="starting")

        # map existing rows by style for dedupe/overwrite
        style_to_idx = {r[0]: i for i, r in enumerate(rows) if r and r[0]}

        for link in product_links:
            if link in done_links:
                processed += 1
                reporter.update(processed, note="(resumed)")
                continue

            ok = False
            # Retry loop: keep attempting this link until it succeeds or hits
            # MAX_FAILS_PER_LINK (then it is recorded as a hard failure).
            while True:
                try:
                    row = scrape_product(driver, link)

                    if not row[0]:
                        raise RuntimeError(f"Style number parsed empty for link: {link}")
                    if not row[1]:
                        raise RuntimeError(f"Name parsed empty for style {row[0]} ({link})")

                    # Prefer International listing display name when available
                    if PREFER_INTERNATIONAL_LISTING_NAMES and preferred_names:
                        pref = _clean_listing_name(preferred_names.get(row[0], ""))
                        if pref:
                            row[1] = md_escape_cell(pref)

                    # Ensure URL column is populated
                    if len(row) > 2 and not row[2]:
                        row[2] = md_escape_cell(link)

                    # Finalize image filename based on final Name
                    try:
                        if len(row) > 3 and row[3]:
                            row[3] = md_escape_cell(finalize_image_filename(row[0], row[1], row[3]))
                    except Exception:
                        pass

                    # dedupe by style
                    if row[0] in style_to_idx:
                        rows[style_to_idx[row[0]]] = row
                    else:
                        style_to_idx[row[0]] = len(rows)
                        rows.append(row)

                    done_links.add(link)

                    # Persist after every successful link so a crash loses nothing.
                    ck["done_links"] = sorted(done_links)
                    ck["rows"] = rows
                    ck["fail_counts"] = fail_counts
                    ck["hard_failed_links"] = hard_failed_links
                    save_checkpoint(ck)

                    write_markdown(rows)

                    ok = True
                    break

                except Exception as e:
                    fail_counts[link] = int(fail_counts.get(link, 0)) + 1

                    # Persist the failure count before deciding what to do next.
                    ck["done_links"] = sorted(done_links)
                    ck["rows"] = rows
                    ck["fail_counts"] = fail_counts
                    ck["hard_failed_links"] = hard_failed_links
                    save_checkpoint(ck)

                    if fail_counts[link] >= MAX_FAILS_PER_LINK:
                        # Give up on this link: record it and mark it done so
                        # the main loop never revisits it on resume.
                        if link not in hard_failed_links:
                            hard_failed_links.append(link)
                            write_errors_line(f"FAILED {fail_counts[link]}x: {link}")
                        done_links.add(link)
                        ck["done_links"] = sorted(done_links)
                        ck["hard_failed_links"] = hard_failed_links
                        save_checkpoint(ck)
                        break

                    sys.stderr.write(
                        f"\nError scraping link (attempt {fail_counts[link]}/{MAX_FAILS_PER_LINK}): {link}\n"
                    )
                    sys.stderr.write("".join(traceback.format_exception(type(e), e, e.__traceback__)) + "\n")

                    # reset driver only if it's likely wedged
                    try:
                        if driver:
                            driver.quit()
                    except Exception:
                        pass
                    driver = create_driver(ck, headless=RUN_HEADLESS, page_load_strategy="eager", block_images=True)
                    reset_browser_state(driver)

            processed += 1
            reporter.update(processed, note=("ok" if ok else "skipped"))

        # Main pass done
        write_markdown(rows)
        reporter.update(total, note="done")

        # Salvage pass: retry hard failures once more (HTTP-first) with a conservative driver.
        if ENABLE_SALVAGE_PASS and hard_failed_links:
            sys.stderr.write(f"\nStarting salvage pass for {len(hard_failed_links)} failed links...\n")

            remaining: List[str] = []
            try:
                # Fresh driver with slower, more tolerant settings for the
                # pages that defeated the eager/images-blocked configuration.
                if driver:
                    try:
                        driver.quit()
                    except Exception:
                        pass
                driver = create_driver(
                    ck,
                    headless=RUN_HEADLESS,
                    page_load_strategy="normal",
                    block_images=False,
                    page_load_timeout=SALVAGE_PAGE_LOAD_TIMEOUT,
                )
                reset_browser_state(driver)

                for i, link in enumerate(list(hard_failed_links), start=1):
                    salvaged = False
                    for _ in range(SALVAGE_MAX_TRIES_PER_LINK):
                        try:
                            row = scrape_product(driver, link, prefer_http=True)
                            if row and row[0] and row[1]:
                                if row[0] in style_to_idx:
                                    rows[style_to_idx[row[0]]] = row
                                else:
                                    style_to_idx[row[0]] = len(rows)
                                    rows.append(row)
                                salvaged = True
                                break
                        except Exception:
                            # Blank the browser between tries to unwedge it.
                            try:
                                driver.get("about:blank")
                            except Exception:
                                pass
                            time.sleep(0.5)

                    if salvaged:
                        sys.stderr.write(f"Salvaged {i}/{len(hard_failed_links)}: {link}\n")
                    else:
                        remaining.append(link)

                hard_failed_links = remaining

            except Exception as e:
                sys.stderr.write("\nSalvage pass encountered an error:\n")
                sys.stderr.write("".join(traceback.format_exception(type(e), e, e.__traceback__)) + "\n")

            # Persist salvage results and rewrite the errors file to only the
            # links that are still failing.
            ck["rows"] = rows
            ck["hard_failed_links"] = hard_failed_links
            save_checkpoint(ck)
            write_markdown(rows)
            _rewrite_errors_file(hard_failed_links)

        sys.stderr.write(f"\nDONE. Wrote: {OUT_MD}\n")
        if ERRORS_TXT.exists():
            sys.stderr.write(f"Some links failed repeatedly; see: {ERRORS_TXT}\n")

    finally:
        # Best-effort cleanup: close the progress reporter and the browser.
        try:
            if reporter:
                reporter.close()
        except Exception:
            pass
        try:
            if driver:
                driver.quit()
        except Exception:
            pass


# Script entry point: run the full scrape when executed directly.
if __name__ == "__main__":
    main()
