#!/usr/bin/env python3
# -----------------------------------------------------------------------------
# Naming rules update
#   - Folder: Name_Account#(_Parent Account if applicable)
#   - Markdown filename: Program name you click to open (e.g. "$150 Subsidy Safety Toe.md")
#   - Parent Account read from "Company Information" on the Account Summary page.
# -----------------------------------------------------------------------------
"""Voucher_List_Scraper.py

USAGE
  pip install selenium
  # Ensure Firefox + geckodriver are installed.
  # If geckodriver is not on PATH, set:
  #   export GECKODRIVER_PATH=/full/path/to/geckodriver

  python3 Voucher_List_Scraper.py
  python3 Voucher_List_Scraper.py --headless

WHAT THIS DOES
  - Opens the Accounts list page and logs in (credentials hard-coded below).
  - Clicks id=accSearchBox, types "aa", waits 3 seconds, then presses Enter.
  - Waits a moment for the results to refresh.
  - Clicks the FIRST account row in the results table id='device-grid-list'.
  - Creates a folder in the same directory as this .py.
  - Writes a Markdown file inside that folder named after the clicked account text.
  - On the account page, finds id=ProgramListView; if the FIRST program row is active
    (checkbox checked), it records that program name and opens it.
  - Records "Program Summary" section (best effort) and captures all rows from the
    main DataTables product list (not just the first 10 displayed).

NOTES
  - The portal uses DataTables; to avoid stale element errors, this script snapshots
    tables with JavaScript (reads text + hrefs from the DOM) instead of reading
    WebElement.text repeatedly.
"""

from __future__ import annotations

import argparse
import hashlib
import json
import os
import re
import shutil
import time
try:
    import fcntl  # Unix
except ImportError:  # Windows
    fcntl = None
    import msvcrt
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Tuple
from urllib.parse import urljoin

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


# =========================
# HARD-CODED LOGIN (per your request)
# =========================
# SECURITY NOTE(review): credentials are committed in plain text. Move these to
# environment variables or a secrets store before sharing/committing this file.
LOGIN_URL = "https://portal.redwingforbusiness.com/RWS_AccountsListPage?tab=account"
USERNAME = "walpine614@rwfb.com"
PASSWORD = "SammiandCoco2!"
# =========================

# Default two-letter term typed into the account search box.
SEARCH_TERM = "aa"
# DOM element ids on the portal pages.
ACC_SEARCHBOX_ID = "accSearchBox"
ACCOUNTS_TABLE_ID = "device-grid-list"
PROGRAM_TABLE_ID = "ProgramListView"
# Name of the output folder created next to this script.
OUTPUT_ROOT_DIRNAME = "ALL"

# If True, only scrape programs whose checkbox is checked in the Program list.
ONLY_ACTIVE_PROGRAMS = True
# Lowercased account names that get special slow/retry handling.
# NOTE(review): how this set is consumed is not visible in this chunk — confirm
# against the rest of the file.
SPECIAL_SLOW_RETRY_ACCOUNTS = {
    "town of indian head",
    "charles county utilities",
    "battle creek construction",
    "charles county public works",
    "town of la plata",
    "charles county emergency services",
    "maryland transportation authority",
    "smithsonian - african american history museum",
    "bolling afb commissary",
}
@dataclass(frozen=True)
class LinkCell:
    """An anchor scraped from a table row: its visible text plus its href.

    frozen=True makes instances immutable and hashable.
    """

    # Visible anchor text (e.g. the account or program name shown in the UI).
    text: str
    # Raw href attribute as read from the DOM (may be relative to the portal).
    href: str


_WINDOWS_RESERVED = {
    "CON", "PRN", "AUX", "NUL",
    "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
    "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
}


def _safe_fs_component(s: str, max_len: int = 160) -> str:
    """Return a filesystem-safe single path component.

    - Preserves spaces and '$' (so you can get names like '$150 Subsidy Safety Toe.md').
    - Removes characters that are invalid on Windows and path separators on all OSes.
    """
    s = (s or "").replace("\r", " ").replace("\n", " ").replace("\t", " ").strip()
    s = re.sub(r"\s+", " ", s)

    # Windows-forbidden + path separators
    s = re.sub(r'[<>:"/\\|?*]', "_", s)

    # Remove ASCII control characters
    s = re.sub(r"[\x00-\x1f]", "", s)

    # Windows doesn't allow trailing dots/spaces
    s = s.rstrip(" .")

    if not s:
        s = "item"

    if s.upper() in _WINDOWS_RESERVED:
        s = f"{s}_"

    return s[:max_len]



def _is_elf_executable(path: str) -> bool:
    """Return True if `path` looks like a real ELF binary (not a shell/snap wrapper)."""
    try:
        p = Path(path)
        if not p.exists() or not p.is_file():
            return False
        # executable bit helps, but not required for ELF check
        with p.open("rb") as f:
            return f.read(4) == b"\x7fELF"
    except Exception:
        return False


def _is_windows_executable(path: str) -> bool:
    try:
        p = Path(path)
        return p.exists() and p.is_file() and p.suffix.lower() == ".exe"
    except Exception:
        return False


def _is_usable_executable(path: str) -> bool:
    """Dispatch to the platform-appropriate executable check (.exe vs ELF)."""
    checker = _is_windows_executable if os.name == "nt" else _is_elf_executable
    return checker(path)


def _locate_firefox_binary() -> Optional[str]:
    """Find a real Firefox executable for GeckoDriver.

    On many Pi / Ubuntu setups, `firefox` on PATH is a snap wrapper script, which
    GeckoDriver rejects. We prefer `firefox-bin` (real ELF) when available.

    Search order:
      1. FIREFOX_BINARY env var (with a sibling `firefox-bin` fallback).
      2. Windows: standard install paths, then PATH.
      3. Linux: snap-shipped binaries, then PATH (again preferring a sibling
         `firefox-bin` next to any wrapper named `firefox`).

    Returns:
        Path to a usable Firefox executable, or None if nothing usable found.
    """
    # 1) Explicit override
    env_bin = os.environ.get("FIREFOX_BINARY")
    if env_bin:
        env_bin = str(Path(env_bin).expanduser())
        if _is_usable_executable(env_bin):
            return env_bin
        # Common case: user points at 'firefox' launcher inside snap dir; try sibling firefox-bin.
        sib = str(Path(env_bin).with_name("firefox-bin"))
        if _is_elf_executable(sib):
            return sib

    if os.name == "nt":
        # Typical 64-bit and 32-bit install locations first, then PATH.
        win_candidates = [
            r"C:\Program Files\Mozilla Firefox\firefox.exe",
            r"C:\Program Files (x86)\Mozilla Firefox\firefox.exe",
            shutil.which("firefox.exe"),
            shutil.which("firefox"),
        ]
        for c in win_candidates:
            if c and _is_windows_executable(c):
                return c
        return None

    # 2) Snap Firefox (common on Ubuntu)
    snap_candidates = [
        "/snap/firefox/current/usr/lib/firefox/firefox-bin",
        "/snap/firefox/current/usr/lib/firefox/firefox",
    ]
    for c in snap_candidates:
        if _is_elf_executable(c):
            return c

    # 3) PATH candidates
    path_candidates = [
        shutil.which("firefox-bin"),
        shutil.which("firefox"),
        shutil.which("firefox-esr"),
    ]
    for c in path_candidates:
        if c and _is_elf_executable(c):
            return c
        # If 'firefox' is a wrapper, try a sibling 'firefox-bin' next to it.
        if c and Path(c).name == "firefox":
            sib = str(Path(c).with_name("firefox-bin"))
            if _is_elf_executable(sib):
                return sib

    return None


def _locate_geckodriver(firefox_bin: Optional[str]) -> Optional[str]:
    """Locate geckodriver with sane preferences for Pi/snap setups.

    Args:
        firefox_bin: Path of the already-chosen Firefox binary (if any); used
            to look for a geckodriver sitting next to it.

    Returns:
        Path to a usable geckodriver, or None (caller falls back to Selenium
        defaults).
    """
    # 1) Explicit override
    env_gecko = os.environ.get("GECKODRIVER_PATH")
    if env_gecko:
        env_gecko = str(Path(env_gecko).expanduser())
        if _is_usable_executable(env_gecko):
            return env_gecko

    if os.name == "nt":
        # Prefer a geckodriver.exe sitting next to the Firefox install.
        if firefox_bin:
            sib = str(Path(firefox_bin).with_name("geckodriver.exe"))
            if _is_windows_executable(sib):
                return sib
        path_candidates = [
            shutil.which("geckodriver.exe"),
            shutil.which("geckodriver"),
        ]
        for c in path_candidates:
            if c and _is_windows_executable(c):
                return c
        # Last resort: script directory and common manual-install locations.
        common = [
            str(Path(__file__).resolve().parent / "geckodriver.exe"),
            r"C:\tools\geckodriver.exe",
            r"C:\WebDriver\bin\geckodriver.exe",
        ]
        for c in common:
            if _is_windows_executable(c):
                return c
        return None

    # 2) If Firefox is from snap, prefer the geckodriver shipped in the same snap
    snap_gecko = "/snap/firefox/current/usr/lib/firefox/geckodriver"
    if _is_elf_executable(snap_gecko):
        return snap_gecko

    # 3) If firefox_bin is known, try sibling geckodriver
    if firefox_bin:
        sib = str(Path(firefox_bin).with_name("geckodriver"))
        if _is_elf_executable(sib):
            return sib

    # 4) PATH (may include /snap/bin or /usr/local/bin)
    gecko = shutil.which("geckodriver")
    if gecko and _is_elf_executable(gecko):
        return gecko

    # 5) If PATH geckodriver isn't ELF (wrapper), try common locations
    common = [
        "/usr/bin/geckodriver",
        "/snap/bin/geckodriver",
        "/usr/local/bin/geckodriver",
    ]
    for c in common:
        if _is_elf_executable(c):
            return c

    return None


def _geckodriver_service(firefox_bin: Optional[str]) -> Service:
    """Build a Selenium Service for geckodriver, logging to geckodriver.log.

    Newer Selenium accepts `log_output`; older versions accept `log_path` —
    the TypeError fallback covers both APIs.
    """
    located = _locate_geckodriver(firefox_bin)
    log_file = str(Path(__file__).resolve().parent / "geckodriver.log")

    if located is None:
        print("[driver] WARNING: Could not locate geckodriver; letting Selenium try defaults.")
        try:
            return Service(log_output=log_file)
        except TypeError:
            return Service(log_path=log_file)

    print(f"[driver] Using geckodriver: {located}")
    try:
        return Service(executable_path=located, log_output=log_file)
    except TypeError:
        return Service(executable_path=located, log_path=log_file)


def _configure_selenium_runtime_env() -> None:
    """Use a project-local Selenium cache to avoid unwritable default locations."""
    try:
        base = Path(__file__).resolve().parent
        cache_dir = base / ".selenium-cache"
        cache_dir.mkdir(parents=True, exist_ok=True)
        os.environ.setdefault("SE_CACHE_PATH", str(cache_dir))
        os.environ.setdefault("XDG_CACHE_HOME", str(base / ".cache"))
    except Exception:
        pass


def build_driver(headless: bool) -> webdriver.Firefox:
    """Create a Firefox WebDriver configured for this scraper.

    Locates a real Firefox binary and matching geckodriver (snap-aware),
    silences notification/audio noise, and sizes the window. On Windows,
    prints a setup checklist before re-raising any startup failure.

    Args:
        headless: When True, run Firefox without a visible window.
    """
    _configure_selenium_runtime_env()
    opts = Options()
    if headless:
        opts.add_argument("--headless")

    # Point to a REAL Firefox binary (not /usr/bin/firefox snap wrapper)
    firefox_bin = _locate_firefox_binary()
    if firefox_bin:
        print(f"[driver] Using firefox binary: {firefox_bin}")
        opts.binary_location = firefox_bin
    else:
        print("[driver] WARNING: Could not locate a real Firefox binary; GeckoDriver may fail on snap-wrapper setups.")

    # Reduce popups/noise
    opts.set_preference("dom.webnotifications.enabled", False)
    opts.set_preference("media.volume_scale", "0.0")

    try:
        driver = webdriver.Firefox(service=_geckodriver_service(firefox_bin), options=opts)
    except Exception:
        # Emit quick diagnostics on Windows, then propagate the original error.
        if os.name == "nt":
            print("[driver] Windows setup check:")
            print(f"[driver]   Firefox detected: {'yes' if firefox_bin else 'no'}")
            print(f"[driver]   Firefox path: {firefox_bin or '(not found)'}")
            print("[driver]   geckodriver detected: "
                  f"{'yes' if _locate_geckodriver(firefox_bin) else 'no'}")
            print("[driver]   Tip: install geckodriver.exe and put it on PATH, or set GECKODRIVER_PATH.")
        raise
    # Window sizing is cosmetic; ignore failures (e.g. odd headless setups).
    try:
        driver.set_window_size(1280, 900)
    except Exception:
        pass
    return driver

def wait_present(driver, by, value, timeout: int = 30):
    """Block until the element located by (by, value) is in the DOM; return it."""
    locator = (by, value)
    return WebDriverWait(driver, timeout).until(EC.presence_of_element_located(locator))


def wait_clickable(driver, by, value, timeout: int = 30):
    """Block until the element located by (by, value) is clickable; return it."""
    locator = (by, value)
    return WebDriverWait(driver, timeout).until(EC.element_to_be_clickable(locator))


def datatables_wait_idle(driver, timeout: int = 30) -> None:
    """Poll until no 'div.dataTables_processing' overlay is visible (best effort).

    Treats any lookup error as "idle" and returns immediately; never raises.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            overlays = driver.find_elements(By.CSS_SELECTOR, "div.dataTables_processing")
            busy = [o for o in overlays if o.is_displayed()]
        except Exception:
            return
        if not busy:
            return
        time.sleep(0.2)


def js_table_snapshot(driver, table_id: str) -> Dict[str, object]:
    """Snapshot a table: headers, per-row cell texts, and first link per row.

    Reads the DOM in one JS pass so DataTables redraws cannot cause stale
    WebElement references.
    """
    snapshot_js = r"""
        const table = document.getElementById(arguments[0]);
        if (!table) return null;
        const headers = Array.from(table.querySelectorAll('thead th')).map(th => (th.innerText || '').trim());
        const rows = Array.from(table.querySelectorAll('tbody tr')).map(tr => {
            const cells = Array.from(tr.querySelectorAll('td')).map(td => (td.innerText || '').trim());
            const a = tr.querySelector('a[href]');
            const link = a ? {text: (a.innerText||'').trim(), href: a.getAttribute('href')||''} : null;
            return {cells, link};
        });
        return {headers, rows};
    """
    return driver.execute_script(snapshot_js, table_id)


def login(driver) -> None:
    """Open LOGIN_URL and sign in with the module-level credentials.

    Fills username then password, clicks the Login button (or presses Enter
    as a fallback), then waits for the account search box to confirm load.
    """
    driver.get(LOGIN_URL)

    # Salesforce usually uses id=username and id=password.
    username_field = wait_present(driver, By.ID, "username", 30)
    password_field = wait_present(driver, By.ID, "password", 30)

    try:
        driver.execute_script("arguments[0].scrollIntoView({block:'center'});", username_field)
    except Exception:
        pass

    # Clear + type each field in order; clear() is best-effort on this portal.
    for field, secret in ((username_field, USERNAME), (password_field, PASSWORD)):
        try:
            field.clear()
        except Exception:
            pass
        field.send_keys(secret)

    # Login button is typically id=Login
    try:
        driver.find_element(By.ID, "Login").click()
    except Exception:
        password_field.send_keys(Keys.ENTER)

    # Wait for account search box
    wait_present(driver, By.ID, ACC_SEARCHBOX_ID, 60)


def filter_accounts_term(
    driver,
    term: str,
    *,
    pre_enter_sleep: float = 3.0,
    post_enter_sleep: float = 0.5,
    final_sleep: float = 1.0,
) -> None:
    """Type `term` into accSearchBox, wait, then submit it several ways.

    Each submit gesture (Enter, Return, global Enter via ActionChains, Tab to
    blur) is best-effort and independently swallowed — whichever one the
    portal listens to will trigger the refresh.
    """
    term = (term or "").strip().lower()
    print(f"Filtering accounts: typing '{term}' into accSearchBox...")

    search_box = wait_clickable(driver, By.ID, ACC_SEARCHBOX_ID, 60)
    search_box.click()
    try:
        search_box.clear()
    except Exception:
        # Some inputs don't support clear reliably; use Ctrl+A then Backspace
        try:
            search_box.send_keys(Keys.CONTROL, "a")
            search_box.send_keys(Keys.BACKSPACE)
        except Exception:
            pass

    search_box.send_keys(term)
    time.sleep(pre_enter_sleep)

    def _attempt(gesture) -> None:
        # Each gesture is optional — never let one failure abort the rest.
        try:
            gesture()
        except Exception:
            pass

    _attempt(search_box.click)
    _attempt(lambda: search_box.send_keys(Keys.ENTER))
    _attempt(lambda: search_box.send_keys(Keys.RETURN))
    _attempt(lambda: ActionChains(driver).send_keys(Keys.ENTER).perform())
    _attempt(lambda: search_box.send_keys(Keys.TAB))

    time.sleep(post_enter_sleep)
    datatables_wait_idle(driver, 20)
    time.sleep(final_sleep)


def filter_accounts_simple(driver) -> None:
    """Back-compat wrapper: filter the accounts list using SEARCH_TERM."""
    filter_accounts_term(driver, SEARCH_TERM)


def get_first_account_from_results(driver) -> Tuple[LinkCell, Dict[str, str]]:
    """Return the first account link + header->cell map from the results table.

    Raises RuntimeError when the snapshot is empty or the first row has no
    clickable link.
    """
    # Wait for the table to exist and have at least one link
    wait_present(driver, By.ID, ACCOUNTS_TABLE_ID, 60)

    def _first_link_ready(d) -> bool:
        try:
            table = d.find_element(By.ID, ACCOUNTS_TABLE_ID)
            return bool(table.find_elements(By.CSS_SELECTOR, "tbody tr td a[href]"))
        except Exception:
            return False

    WebDriverWait(driver, 60).until(_first_link_ready)

    snap = js_table_snapshot(driver, ACCOUNTS_TABLE_ID)
    if not snap or not snap.get("rows"):
        raise RuntimeError("Accounts results table snapshot is empty.")

    headers: List[str] = list(snap.get("headers") or [])
    first_row = snap["rows"][0]
    link = first_row.get("link")
    if not link or not link.get("href"):
        raise RuntimeError("Could not find a clickable account link in the first row.")

    cells: List[str] = first_row.get("cells") or []
    row_map: Dict[str, str] = {
        (header or f"col_{idx}"): cells[idx]
        for idx, header in enumerate(headers)
        if idx < len(cells)
    }

    link_cell = LinkCell(text=link.get("text", "").strip(), href=link.get("href", "").strip())
    return link_cell, row_map


def parse_company_information_from_text(page_text: str) -> Dict[str, str]:
    """Parse the 'Company Information' section of Account Summary page text.

    Returns a dict with only the non-empty keys among:
      - parent_account (first value under 'Parent Account', if present)
      - company_name
      - account_number
    Returns {} when no 'Company Information' header is found.
    """
    lines = [ln.strip() for ln in (page_text or "").splitlines()]
    upper = [ln.upper() for ln in lines]

    # Locate the first line AFTER the section header.
    start = next((i + 1 for i, u in enumerate(upper) if "COMPANY INFORMATION" in u), -1)
    if start == -1:
        return {}

    stop_headers = {
        "COMPANY ADDRESS",
        "TAX INFORMATION",
        "CONTACT INFORMATION",
        "CREDIT & BILLING INFORMATION",
        "PAYMENT TERMS & INVOICE PREFERENCES",
        "ACCOUNTS",
        "PROGRAMS",
        "ATTACHMENTS",
    }

    # Collect the section body: stop at the next known header, cap at 80 lines.
    section: List[str] = []
    for j in range(start, len(lines)):
        if upper[j] in stop_headers:
            break
        section.append(lines[j])
        if len(section) >= 80:
            break

    known_labels = {"PARENT ACCOUNT", "COMPANY NAME", "ACCOUNT NUMBER", "ACCOUNT #"}

    def _value_after(label: str) -> str:
        wanted = label.upper()
        for idx, raw in enumerate(section):
            line_u = raw.upper()
            if line_u == wanted:
                # Label on its own line: value is the next non-empty,
                # non-label line.
                for follower in section[idx + 1:]:
                    candidate = follower.strip()
                    if not candidate or candidate.upper() in known_labels:
                        continue
                    return candidate
            if line_u.startswith(wanted + " "):
                # Label and value share one line.
                return raw[len(label):].strip()
        return ""

    parsed: Dict[str, str] = {
        "parent_account": _value_after("PARENT ACCOUNT"),
        "company_name": _value_after("COMPANY NAME"),
        "account_number": _value_after("ACCOUNT NUMBER") or _value_after("ACCOUNT #"),
    }
    return {key: val for key, val in parsed.items() if val}



def get_account_name_from_summary_span(driver) -> str:
    """Read the canonical account name from the Account Summary header span.

    After clicking an account row, the portal renders:
      <span id="AccountSummary:accountForm:aname">Aaron Equipment Company</span>
    Returns "" when the span never appears (best effort, never raises).
    """
    span_id = "AccountSummary:accountForm:aname"
    try:
        # Give the page a moment to render this header
        wait_present(driver, By.ID, span_id, 10)
        header = driver.find_element(By.ID, span_id)
        return (header.text or "").strip()
    except Exception:
        return ""

def drop_hidden_product_columns(headers: List[str], rows: List[List[str]]) -> Tuple[List[str], List[List[str]]]:
    """Remove backend-only and blank-header columns from a scraped table.

    The Red Wing portal product table often carries hidden DOM columns such as
    'Product Id' and 'Filter Number'; dropping them makes output match the UI.
    If nothing would remain, the input is returned unchanged. Short rows are
    padded with "" for any kept index beyond their length.
    """
    hidden_names = {"product id", "filter number"}
    kept_indices: List[int] = [
        idx
        for idx, header in enumerate(headers or [])
        if (header or "").strip() and (header or "").strip().lower() not in hidden_names
    ]

    if not kept_indices:
        return headers, rows

    trimmed_headers = [headers[idx] for idx in kept_indices]
    trimmed_rows = [
        [(row[idx] if idx < len(row) else "") for idx in kept_indices]
        for row in (rows or [])
    ]
    return trimmed_headers, trimmed_rows


def get_all_accounts_from_results(driver) -> List[Tuple[LinkCell, Dict[str, str]]]:
    """Collect every account link + header->cell map across all result pages.

    De-duplicates rows by href and pages through the DataTable (bounded at
    300 pages as a safety stop).
    """
    wait_present(driver, By.ID, ACCOUNTS_TABLE_ID, 60)

    # Try to show all accounts on one page
    set_datatable_length_all(driver, ACCOUNTS_TABLE_ID)
    time.sleep(0.4)
    datatables_wait_idle(driver, 20)
    datatable_goto_first(driver, ACCOUNTS_TABLE_ID)

    collected: List[Tuple[LinkCell, Dict[str, str]]] = []
    seen_hrefs = set()

    for _page in range(300):
        snap = js_table_snapshot(driver, ACCOUNTS_TABLE_ID)
        if not snap or not snap.get("rows"):
            break

        headers: List[str] = list(snap.get("headers") or [])
        for row in snap.get("rows") or []:
            link = row.get("link") or {}
            href = (link.get("href") or "").strip()
            if not href or href in seen_hrefs:
                continue
            seen_hrefs.add(href)

            cells: List[str] = row.get("cells") or []
            row_map: Dict[str, str] = {
                (header or f"col_{idx}"): cells[idx]
                for idx, header in enumerate(headers)
                if idx < len(cells)
            }
            collected.append(
                (LinkCell(text=(link.get("text") or "").strip(), href=href), row_map)
            )

        if not datatable_click_next(driver, ACCOUNTS_TABLE_ID):
            break
        time.sleep(0.35)
        datatables_wait_idle(driver, 20)

    return collected


def js_program_list_snapshot(driver, table_id: str) -> List[Dict[str, object]]:
    """Snapshot program rows (link text/href + checkbox 'active' state) via JS.

    Returns [] on any scripting failure (best effort, never raises).
    """
    program_rows_js = r"""
        const table = document.getElementById(arguments[0]);
        if (!table) return [];
        const rows = Array.from(table.querySelectorAll('tbody tr')).map(tr => {
            const a = tr.querySelector('a[href]');
            if (!a) return null;
            const cb = tr.querySelector('input[type="checkbox"]');
            const active = cb ? (cb.checked || cb.getAttribute('checked') !== null) : false;
            return {text: (a.innerText||'').trim(), href: a.getAttribute('href')||'', active: active};
        }).filter(x => x && x.href);
        return rows;
    """
    try:
        raw = driver.execute_script(program_rows_js, table_id)
        return list(raw or [])
    except Exception:
        return []


def get_account_program_links(driver, only_active: bool = False) -> List[Dict[str, object]]:
    """Return program links from the 'List Of Account Related Programs' table.

    Each entry is {text, href, active}. When `only_active` is True, rows with
    an unchecked checkbox are skipped. Returns [] if the table never appears.
    """
    try:
        wait_present(driver, By.ID, PROGRAM_TABLE_ID, 30)
    except TimeoutException:
        return []

    # Try to show all programs on one page
    set_datatable_length_all(driver, PROGRAM_TABLE_ID)
    time.sleep(0.4)
    datatables_wait_idle(driver, 20)
    datatable_goto_first(driver, PROGRAM_TABLE_ID)

    programs: List[Dict[str, object]] = []
    seen_hrefs = set()

    for _page in range(200):
        for entry in js_program_list_snapshot(driver, PROGRAM_TABLE_ID):
            href = (entry.get("href") or "").strip()
            if not href or href in seen_hrefs:
                continue
            seen_hrefs.add(href)
            is_active = bool(entry.get("active"))
            if only_active and not is_active:
                continue
            programs.append({"text": (entry.get("text") or "").strip(), "href": href, "active": is_active})

        if not datatable_click_next(driver, PROGRAM_TABLE_ID):
            break
        time.sleep(0.35)
        datatables_wait_idle(driver, 20)

    return programs


def extract_program_summary_block(driver) -> str:
    """Best-effort extraction of the 'Program Summary' section text.

    Finds a header element titled 'Program Summary' and collects sibling text
    until the next header. Returns "" on any failure; never raises.
    """
    summary_js = r"""
        function norm(s){return (s||'').replace(/\s+/g,' ').trim();}
        const headers = Array.from(document.querySelectorAll('h1,h2,h3,h4,legend,strong'));
        let h = headers.find(x => norm(x.innerText).toLowerCase() === 'program summary');
        if (!h) {
            // fallback: contains
            h = headers.find(x => norm(x.innerText).toLowerCase().includes('program summary'));
        }
        if (!h) return '';

        // Walk forward through siblings collecting text until the next header-like element.
        let out = [];
        let n = h.nextElementSibling;
        let guard = 0;
        while (n && guard < 200) {
            const tag = (n.tagName||'').toLowerCase();
            const txt = norm(n.innerText);
            if (['h1','h2','h3','h4','legend'].includes(tag)) break;
            if (txt) out.push(txt);
            n = n.nextElementSibling;
            guard++;
        }
        return out.join('\n\n');
    """
    try:
        summary_text = driver.execute_script(summary_js)
    except Exception:
        return ""
    return (summary_text or "").strip()


def find_table_by_headers(driver, required_headers: Sequence[str]) -> Optional[str]:
    """Return the id of the first table whose <thead> contains every required
    header (case-insensitive substring match), or None if nothing matches."""
    wanted = [header.strip().lower() for header in required_headers]
    matcher_js = r"""
        const required = arguments[0];
        const tables = Array.from(document.querySelectorAll('table'));
        for (const t of tables) {
            const ths = Array.from(t.querySelectorAll('thead th')).map(th => (th.innerText||'').trim().toLowerCase());
            if (!ths.length) continue;
            let ok = true;
            for (const r of required) {
                if (!ths.some(h => h.includes(r))) { ok = false; break; }
            }
            if (ok) {
                return t.id || null;
            }
        }
        return null;
    """
    try:
        return driver.execute_script(matcher_js, wanted)
    except Exception:
        return None


def set_datatable_length_all(driver, table_id: str) -> None:
    """Switch the DataTable's page-length select to 'All' (-1) or its largest
    option and fire a change event. Best effort; never raises."""
    length_js = r"""
        const tableId = arguments[0];
        const sel = document.querySelector(`select[name='${tableId}_length'], select[name$='_length']`);
        if (!sel) return false;

        const opts = Array.from(sel.options);
        let chosen = opts.find(o => o.value === '-1') || opts.find(o => (o.text||'').toLowerCase().includes('all')) || opts[opts.length-1];
        if (!chosen) return false;
        sel.value = chosen.value;
        sel.dispatchEvent(new Event('change', {bubbles:true}));
        return true;
    """
    try:
        driver.execute_script(length_js, table_id)
    except Exception:
        pass


def datatable_click_next(driver, table_id: str) -> bool:
    """Click the DataTable's Next pagination button when enabled.

    Returns True only when a click was actually issued; False when the button
    is absent/disabled or any scripting error occurs.
    """
    next_js = r"""
        const tableId = arguments[0];
        const pag = document.getElementById(`${tableId}_paginate`);
        if (!pag) return false;
        const next = pag.querySelector('a.paginate_button.next');
        if (!next) return false;
        const cls = (next.className||'').toLowerCase();
        if (cls.includes('disabled') || next.getAttribute('aria-disabled') === 'true') return false;
        next.click();
        return true;
    """
    try:
        return bool(driver.execute_script(next_js, table_id))
    except Exception:
        return False

def datatable_goto_first(driver, table_id: str) -> None:
    """Best-effort: jump DataTables pagination back to page 1.

    Prefers a 'First' button, else the lowest numbered page button; after a
    successful click, waits briefly for the table to settle. Never raises.
    """
    first_page_js = r"""
        const tableId = arguments[0];
        const pag = document.getElementById(`${tableId}_paginate`);
        if (!pag) return false;

        const isDisabled = (a) => {
            if (!a) return true;
            const cls = (a.className||'').toLowerCase();
            if (cls.includes('disabled')) return true;
            if (a.getAttribute('aria-disabled') === 'true') return true;
            return false;
        };

        // Prefer "First" button if present
        const first = pag.querySelector('a.paginate_button.first');
        if (first && !isDisabled(first)) { first.click(); return true; }

        // Otherwise click the smallest numbered page button (usually page 1)
        const nums = Array.from(pag.querySelectorAll('a.paginate_button')).filter(a => {
            const cls = (a.className||'').toLowerCase();
            if (cls.includes('previous') || cls.includes('next') || cls.includes('first') || cls.includes('last')) return false;
            const txt = (a.innerText||'').trim();
            return /^\d+$/.test(txt);
        });
        if (nums.length > 0) {
            nums.sort((a,b) => parseInt(a.innerText.trim(),10) - parseInt(b.innerText.trim(),10));
            const btn = nums[0];
            if (!isDisabled(btn)) { btn.click(); return true; }
        }
        return false;
    """
    try:
        if bool(driver.execute_script(first_page_js, table_id)):
            time.sleep(0.25)
            datatables_wait_idle(driver, 15)
    except Exception:
        pass


def iter_two_letter_terms(start: str = "aa", end: str = "zz") -> List[str]:
    """Return all two-letter search terms from `start` through `end` inclusive.

    Inputs are normalized (stripped, lowercased; falsy values default to
    'aa'/'zz'). Terms come back in ascending alphabetical order; an inverted
    range yields [].
    """
    start = (start or "aa").strip().lower()
    end = (end or "zz").strip().lower()

    alphabet = "abcdefghijklmnopqrstuvwxyz"
    return [a + b for a in alphabet for b in alphabet if start <= a + b <= end]


def load_scrape_state(state_path: Path) -> Dict[str, set]:
    """Load scraper resume state from disk.

    Returns {'scanned_account_urls': set, 'completed_terms': set}; missing or
    corrupt files yield empty sets. Backward compatible with older state files
    that only stored `scanned_account_urls`. Completed terms are normalized
    (stripped, lowercased).
    """
    scanned: set = set()
    completed: set = set()
    try:
        if state_path.exists():
            data = json.loads(state_path.read_text(encoding="utf-8") or "{}")
            scanned = {str(url) for url in (data.get("scanned_account_urls") or [])}
            completed = {str(t).strip().lower() for t in (data.get("completed_terms") or [])}
    except Exception:
        pass
    return {"scanned_account_urls": scanned, "completed_terms": completed}


def _load_scrape_state_from_text(raw: str) -> Dict[str, set]:
    data = {}
    try:
        data = json.loads(raw or "{}")
    except Exception:
        data = {}
    return {
        "scanned_account_urls": set(str(x) for x in (data.get("scanned_account_urls") or [])),
        "completed_terms": set(str(x).strip().lower() for x in (data.get("completed_terms") or [])),
    }


def _state_payload(scanned: set, completed_terms: set) -> Dict[str, List[str]]:
    return {
        "scanned_account_urls": sorted(str(x) for x in scanned if x),
        "completed_terms": sorted(str(t).strip().lower() for t in completed_terms if t),
    }


def _lock_file_exclusive(fh) -> None:
    if fcntl is not None:
        fcntl.flock(fh.fileno(), fcntl.LOCK_EX)
        return
    fh.seek(0)
    while True:
        try:
            msvcrt.locking(fh.fileno(), msvcrt.LK_LOCK, 1)
            return
        except OSError:
            time.sleep(0.05)


def _unlock_file(fh) -> None:
    if fcntl is not None:
        fcntl.flock(fh.fileno(), fcntl.LOCK_UN)
        return
    fh.seek(0)
    try:
        msvcrt.locking(fh.fileno(), msvcrt.LK_UNLCK, 1)
    except OSError:
        pass


def _locked_read_scrape_state(state_path: Path) -> Dict[str, set]:
    """Read resume state under an exclusive file lock (parallel-worker safe).

    Creates the parent directory (and an empty state file via 'a+') if needed.
    """
    state_path.parent.mkdir(parents=True, exist_ok=True)
    with state_path.open("a+", encoding="utf-8") as handle:
        _lock_file_exclusive(handle)
        try:
            handle.seek(0)
            raw = handle.read()
        finally:
            _unlock_file(handle)
    # Parsing is pure, so it can safely happen outside the lock.
    return _load_scrape_state_from_text(raw)


def save_scrape_state(state_path: Path, scanned: set, completed_terms: set) -> None:
    """Persist resume state, merging with on-disk state (safe for parallel workers).

    Under an exclusive lock, the on-disk JSON is re-read, unioned with the
    caller's in-memory sets, and rewritten in place (truncate + fsync) so
    concurrent workers never lose each other's progress. Any failure is
    swallowed deliberately: a lost checkpoint only costs a re-scan.
    """
    try:
        state_path.parent.mkdir(parents=True, exist_ok=True)
        with state_path.open("a+", encoding="utf-8") as fh:
            _lock_file_exclusive(fh)
            try:
                fh.seek(0)
                on_disk = _load_scrape_state_from_text(fh.read())
                payload = _state_payload(
                    set(on_disk["scanned_account_urls"]) | set(scanned),
                    set(on_disk["completed_terms"]) | set(completed_terms),
                )
                fh.seek(0)
                fh.truncate()
                fh.write(json.dumps(payload, indent=2, sort_keys=True))
                fh.flush()
                os.fsync(fh.fileno())
            finally:
                _unlock_file(fh)
    except Exception:
        pass


def is_account_scanned(state_path: Path, account_url: str) -> bool:
    """Return True if *account_url* is recorded in the shared state file."""
    try:
        return account_url in _locked_read_scrape_state(state_path)["scanned_account_urls"]
    except Exception:
        # Unreadable state is treated as "not scanned"; worst case is a re-scan.
        return False


def _account_claim_path(base_dir: Path, account_url: str) -> Path:
    claims_dir = base_dir / ".voucher_account_claims"
    claims_dir.mkdir(parents=True, exist_ok=True)
    digest = hashlib.sha1(account_url.encode("utf-8", errors="ignore")).hexdigest()
    return claims_dir / f"{digest}.lock"


def try_claim_account(base_dir: Path, account_url: str, stale_seconds: int = 6 * 3600) -> Optional[Path]:
    """Best-effort account claim to reduce duplicate work across parallel workers.

    Atomically creates a lock file (O_CREAT|O_EXCL) recording pid, time and
    URL. If a claim already exists but is older than *stale_seconds* it is
    assumed abandoned: it is removed and creation is retried once. Returns
    the claim path on success, or None when another worker holds a fresh
    claim or any filesystem error occurs.
    """
    claim_path = _account_claim_path(base_dir, account_url)
    stamp = time.time()

    for _attempt in (0, 1):
        try:
            fd = os.open(str(claim_path), os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
            with os.fdopen(fd, "w", encoding="utf-8") as handle:
                handle.write(f"pid={os.getpid()}\n")
                handle.write(f"time={int(stamp)}\n")
                handle.write(f"url={account_url}\n")
            return claim_path
        except FileExistsError:
            stale = False
            try:
                stale = (stamp - claim_path.stat().st_mtime) > stale_seconds
                if stale:
                    claim_path.unlink(missing_ok=True)
            except Exception:
                # stat/unlink raced or failed: treat the claim as still held.
                stale = False
            if stale:
                continue  # stale claim cleared; retry the exclusive create
            return None
        except Exception:
            return None
    return None


def release_account_claim(claim_path: Optional[Path]) -> None:
    """Remove the claim file created by ``try_claim_account``.

    Accepts None as a no-op and swallows filesystem errors: a leftover
    claim file is eventually reaped as stale by the next claimant.
    """
    if claim_path:
        try:
            claim_path.unlink(missing_ok=True)
        except Exception:
            pass


def _norm_name(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "").strip()).lower()


def should_retry_term_in_slow_mode(accounts: List[Tuple[LinkCell, Dict[str, str]]]) -> bool:
    """Detect the known partial-load signature for a search term.

    A specific buggy page state returns exactly 9 accounts whose normalized
    names equal SPECIAL_SLOW_RETRY_ACCOUNTS; when seen, the caller redoes
    the term with slower waits.
    """
    if len(accounts) != 9:
        return False
    normalized = {_norm_name(cell.text) for cell, _row in accounts}
    return normalized == SPECIAL_SLOW_RETRY_ACCOUNTS


def term_assigned_to_worker(term: str, worker_count: int, worker_index: int, all_terms: List[str]) -> bool:
    """Return True if *term* belongs to this worker under round-robin sharding.

    A single worker owns every term; a term absent from *all_terms* belongs
    to nobody. Otherwise terms are dealt out by their position in
    *all_terms* modulo the worker count.
    """
    if worker_count <= 1:
        return True
    if term not in all_terms:
        return False
    return all_terms.index(term) % worker_count == worker_index



def extract_all_datatable_rows(driver, table_id: str) -> Tuple[List[str], List[List[str]]]:
    """Extract all rows from a DataTables table by snapshotting each page and paging Next.

    First asks DataTables to show everything on one page; if pagination
    remains, walks pages via the Next control. Rows are de-duplicated by a
    signature built from their first four cells, since redraws can repeat
    rows across page snapshots. Returns (headers, rows); both empty when no
    snapshot was ever obtained.
    """
    headers: List[str] = []
    collected: List[List[str]] = []

    # Try to collapse pagination by switching the page length to "All".
    set_datatable_length_all(driver, table_id)
    time.sleep(0.5)
    datatables_wait_idle(driver, 20)

    signatures = set()
    # Hard page cap guards against a Next control that never disables.
    for _page in range(500):
        snapshot = js_table_snapshot(driver, table_id)
        if not snapshot:
            break
        headers = list(snapshot.get("headers") or [])
        for row in (snapshot.get("rows") or []):
            cells = row.get("cells") or []
            sig = "|".join(cells[:4])
            if sig not in signatures:
                signatures.add(sig)
                collected.append(list(cells))

        # If length=All worked, Next is absent/disabled and we stop here.
        if not datatable_click_next(driver, table_id):
            break
        time.sleep(0.4)
        datatables_wait_idle(driver, 20)

    return headers, collected


def get_first_active_program_link(driver) -> Optional[LinkCell]:
    """If the first program row is active (checkbox checked), return its link.

    Returns None when the program table never appears, its first row has no
    checked checkbox or no link, or the in-page JS snapshot fails for any
    reason.
    """
    try:
        wait_present(driver, By.ID, PROGRAM_TABLE_ID, 30)
    except TimeoutException:
        return None

    script = r"""
        const table = document.getElementById(arguments[0]);
        if (!table) return null;
        const firstRow = table.querySelector('tbody tr');
        if (!firstRow) return null;
        const cb = firstRow.querySelector('input[type="checkbox"]');
        const active = cb ? (cb.checked || cb.getAttribute('checked') !== null) : false;
        if (!active) return null;
        const a = firstRow.querySelector('a[href]');
        if (!a) return null;
        return {text: (a.innerText||'').trim(), href: a.getAttribute('href')||''};
    """
    try:
        result = driver.execute_script(script, PROGRAM_TABLE_ID)
        if not result:
            return None
        text = (result.get("text") or "").strip()
        href = (result.get("href") or "").strip()
        return LinkCell(text=text, href=href)
    except Exception:
        return None


def write_markdown(output_folder: Path, md_name: str, content: str) -> Path:
    """Write *content* to ``<output_folder>/<sanitized md_name>.md``.

    Creates the folder if needed, overwrites any existing file of the same
    name, and returns the path that was written.
    """
    output_folder.mkdir(parents=True, exist_ok=True)
    target = output_folder / (_safe_fs_component(md_name) + ".md")
    target.write_text(content, encoding="utf-8")
    return target


def main() -> int:
    """CLI entry point: scrape accounts/programs into per-account Markdown files.

    Walks two-letter search terms (sharded round-robin across workers by
    ``--worker-index``), opens every matching account, and writes one
    Markdown file per active program — or a bare account summary when no
    program is active. Progress is checkpointed to a shared JSON state file
    so runs can be interrupted, resumed, and parallelized. Returns the
    process exit code (0 on completion).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--headless", action="store_true", help="Run Firefox headless")
    ap.add_argument("--worker-count", type=int, default=1, help="Total number of parallel workers/browsers")
    ap.add_argument("--worker-index", type=int, default=0, help="This worker index (0-based)")
    args = ap.parse_args()
    if args.worker_count < 1:
        ap.error("--worker-count must be >= 1")
    if args.worker_index < 0 or args.worker_index >= args.worker_count:
        ap.error("--worker-index must be between 0 and --worker-count-1")

    # Outputs, the resume-state file, and claim files all live next to this script.
    base_dir = Path(__file__).resolve().parent
    vouchers_root = base_dir / OUTPUT_ROOT_DIRNAME
    state_path = base_dir / "voucher_scanned_accounts.json"
    scrape_state = load_scrape_state(state_path)
    scanned_accounts = scrape_state["scanned_account_urls"]
    completed_terms = scrape_state["completed_terms"]

    driver = build_driver(headless=args.headless)
    try:
        login(driver)

        # Shard the full term list round-robin by position so workers don't overlap.
        terms = iter_two_letter_terms(SEARCH_TERM, "zz")
        worker_terms = [t for i, t in enumerate(terms) if (i % args.worker_count) == args.worker_index]
        print(f"Search terms: {terms[0]} .. {terms[-1]}  (total {len(terms)})")
        print(
            f"Worker {args.worker_index + 1}/{args.worker_count}: "
            f"{len(worker_terms)} assigned terms."
        )
        print(f"Resume (local snapshot): {len(completed_terms)} completed terms, {len(scanned_accounts)} scanned accounts.")
        if worker_terms:
            print(f"First assigned term: {worker_terms[0]}")
        else:
            print("No terms assigned to this worker.")

        for term in worker_terms:
            # Re-sync with the shared on-disk state so work finished by other
            # workers (or a previous run) is skipped.
            current_state = _locked_read_scrape_state(state_path)
            scanned_accounts |= current_state["scanned_account_urls"]
            completed_terms |= current_state["completed_terms"]
            if term in completed_terms:
                continue

            print("\n" + "=" * 80)
            print(f"SEARCH TERM: {term}")
            print("=" * 80)

            term_slow_mode = False
            # Apply the search filter; on failure, reload the page once and retry.
            try:
                filter_accounts_term(driver, term)
            except Exception:
                try:
                    driver.get(driver.current_url)
                    time.sleep(0.7)
                except Exception:
                    pass
                filter_accounts_term(driver, term)

            accounts_list_url = driver.current_url

            accounts = get_all_accounts_from_results(driver)
            # A specific 9-account result set is a known symptom of a partial
            # page load; redo the whole term with longer waits when detected.
            if should_retry_term_in_slow_mode(accounts):
                print(
                    "Detected known 9-account partial-load pattern. "
                    "Retrying same term with slower waits."
                )
                term_slow_mode = True
                try:
                    filter_accounts_term(driver, term, pre_enter_sleep=7.0, post_enter_sleep=1.2, final_sleep=3.0)
                except Exception:
                    try:
                        driver.get(accounts_list_url)
                        time.sleep(2.0)
                    except Exception:
                        pass
                    filter_accounts_term(driver, term, pre_enter_sleep=7.0, post_enter_sleep=1.2, final_sleep=3.0)
                accounts_list_url = driver.current_url
                time.sleep(2.0)
                accounts = get_all_accounts_from_results(driver)

            if not accounts:
                # An empty result still completes the term so it is never retried.
                print(f"No accounts returned for term '{term}'.")
                completed_terms.add(term)
                save_scrape_state(state_path, scanned_accounts, completed_terms)
                continue

            print(f"Accounts found for '{term}': {len(accounts)}")
            if term_slow_mode:
                print("Slow mode active for this term.")

            # Page-settle delays, stretched when this term needed slow mode.
            account_open_sleep = 1.8 if term_slow_mode else 0.7
            program_open_sleep = 1.8 if term_slow_mode else 0.9
            account_return_sleep = 1.0 if term_slow_mode else 0.5
            list_return_sleep = 1.4 if term_slow_mode else 0.6

            for idx, (account_link, row_map) in enumerate(accounts, start=1):
                account_url = urljoin(accounts_list_url, account_link.href)

                # Skip accounts already recorded locally or in the shared state.
                if account_url in scanned_accounts or is_account_scanned(state_path, account_url):
                    scanned_accounts.add(account_url)
                    print(f"[{idx}/{len(accounts)}] SKIP (already scanned): {account_link.text}")
                    continue
                # Claim the account so parallel workers don't double-scrape it.
                claim_path = try_claim_account(base_dir, account_url)
                if not claim_path:
                    print(f"[{idx}/{len(accounts)}] SKIP (claimed by another worker): {account_link.text}")
                    continue

                try:
                    print(f"[{idx}/{len(accounts)}] Account: {account_link.text}")

                    # Account number from the results row; header naming varies
                    # ("col_2" presumably a positional fallback — TODO confirm).
                    acct_num = row_map.get("Account #") or row_map.get("Account#") or row_map.get("col_2") or ""

                    driver.get(account_url)
                    time.sleep(account_open_sleep)

                    account_page_text = driver.find_element(By.TAG_NAME, "body").text
                    company_info = parse_company_information_from_text(account_page_text)

                    # Name preference: summary <span>, then "Company Information",
                    # then the link text from the results table.
                    span_name = get_account_name_from_summary_span(driver)
                    company_name = span_name or company_info.get("company_name") or account_link.text
                    account_number = company_info.get("account_number") or acct_num
                    parent_account = company_info.get("parent_account") or ""

                    # Folder naming rule (see file header):
                    # Name_Account#(_Parent Account if applicable).
                    folder_label = f"{company_name}_{account_number}" if account_number else f"{company_name}"
                    if parent_account:
                        folder_label = f"{folder_label}_{parent_account}"
                    out_folder = vouchers_root / _safe_fs_component(folder_label, max_len=200)
                    out_folder.mkdir(parents=True, exist_ok=True)

                    programs = get_account_program_links(driver, only_active=True)
                    active_programs = [p for p in programs if bool(p.get("active"))]
                    print(f"  Active programs: {len(active_programs)}")

                    if not active_programs:
                        # No active programs: still emit an account summary so
                        # the account leaves a record and is marked scanned.
                        lines: List[str] = []
                        lines.append(f"# {company_name}")
                        if account_number:
                            lines.append(f"\n- **Account #**: {account_number}")
                        if parent_account:
                            lines.append(f"- **Parent Account**: {parent_account}")
                        lines.append(f"- **Account URL**: {account_url}")
                        lines.append(f"- **Search Term**: {term}")

                        lines.append("\n## Account Row (from results table)")
                        for k, v in row_map.items():
                            lines.append(f"- **{k}**: {v}")

                        lines.append("\n## Account Page Text")
                        lines.append("```text")
                        lines.append(account_page_text)
                        lines.append("```")

                        md_content = "\n".join(lines) + "\n"
                        write_markdown(out_folder, "Account Summary", md_content)

                        scanned_accounts.add(account_url)
                        save_scrape_state(state_path, scanned_accounts, completed_terms)
                        continue

                    # One Markdown file per active program, named after the program.
                    for p in active_programs:
                        program_name = (p.get("text") or "").strip() or "Program"
                        program_href = (p.get("href") or "").strip()
                        program_url = urljoin(account_url, program_href)

                        print(f"    - {program_name}")

                        driver.get(program_url)
                        time.sleep(program_open_sleep)

                        # Fall back to the full page text if the summary block
                        # can't be isolated.
                        program_summary_text = extract_program_summary_block(driver)
                        if not program_summary_text:
                            program_summary_text = driver.find_element(By.TAG_NAME, "body").text

                        product_headers: List[str] = []
                        product_rows: List[List[str]] = []

                        # Locate the product table by its headers, with
                        # progressively looser matches.
                        prod_table_id = find_table_by_headers(driver, ["Style", "Product Name"])
                        if prod_table_id:
                            product_headers, product_rows = extract_all_datatable_rows(driver, prod_table_id)
                            product_headers, product_rows = drop_hidden_product_columns(product_headers, product_rows)
                        else:
                            prod_table_id = find_table_by_headers(driver, ["Style"]) or find_table_by_headers(driver, ["Product"])
                            if prod_table_id:
                                product_headers, product_rows = extract_all_datatable_rows(driver, prod_table_id)
                                product_headers, product_rows = drop_hidden_product_columns(product_headers, product_rows)

                        lines = []
                        lines.append(f"# {program_name}")
                        lines.append(f"\n- **Account Name**: {company_name}")
                        if account_number:
                            lines.append(f"- **Account #**: {account_number}")
                        if parent_account:
                            lines.append(f"- **Parent Account**: {parent_account}")
                        lines.append(f"- **Account URL**: {account_url}")
                        lines.append(f"- **Program URL**: {program_url}")
                        lines.append(f"- **Search Term**: {term}")

                        lines.append("\n## Program Summary")
                        lines.append("```text")
                        lines.append(program_summary_text)
                        lines.append("```")

                        lines.append(f"\n## Product List (all rows) — {len(product_rows)} rows")
                        if product_headers and product_rows:
                            # Commas inside cells are replaced with ';' so the
                            # embedded CSV stays one-cell-per-column.
                            lines.append("```csv")
                            lines.append(",".join(h.replace(",", ";") for h in product_headers))
                            for r in product_rows:
                                cleaned = [(c or "").replace("\n", " ").replace(",", ";") for c in r]
                                # Pad/trim each row to the header width.
                                if len(cleaned) < len(product_headers):
                                    cleaned += [""] * (len(product_headers) - len(cleaned))
                                if len(cleaned) > len(product_headers):
                                    cleaned = cleaned[: len(product_headers)]
                                lines.append(",".join(cleaned))
                            lines.append("```")
                        else:
                            lines.append("_No product table found (or it had no rows)._\n")

                        md_content = "\n".join(lines) + "\n"
                        write_markdown(out_folder, program_name, md_content)

                        driver.get(account_url)
                        time.sleep(account_return_sleep)

                    # Checkpoint only after every program file was written.
                    scanned_accounts.add(account_url)
                    save_scrape_state(state_path, scanned_accounts, completed_terms)

                    driver.get(accounts_list_url)
                    time.sleep(list_return_sleep)
                finally:
                    # Always release the claim — even when an exception aborts
                    # this account mid-scrape.
                    release_account_claim(claim_path)

            completed_terms.add(term)
            save_scrape_state(state_path, scanned_accounts, completed_terms)

        print("\nAll terms completed.")
        return 0

    finally:
        driver.quit()


if __name__ == "__main__":
    # Propagate main()'s return value to the shell as the process exit code.
    raise SystemExit(main())
