#!/usr/bin/env python3
"""
Run all RW scraper scripts and merge outputs by style number.

Merge rules:
- Match rows by Style # while ignoring leading zeros.
- Prefer RW_Orders_Site values for: Style #, Image, Brand, Resoleable.
- For all other columns, prefer RW_Site_Scrape (original scraper), then RW_Orders.
- Drop any row containing "Page Not Found" in any cell.
"""

from __future__ import annotations

import json
import re
import subprocess
import sys
from pathlib import Path
from typing import Dict, List, Tuple


# Directory this script lives in; all other paths are derived from it.
BASE_DIR = Path(__file__).resolve().parent
# Folder containing the RW_Site_Scraper*.py scripts and their markdown outputs.
SCRAPERS_DIR = BASE_DIR / "RW_Scrapers"
# Scrapers whose outputs feed the merge.  They are run first (see
# _discover_scrapers) and main() exits non-zero only when BOTH of them fail.
REQUIRED_FOR_MERGE = [
    "RW_Site_Scraper-Orders_Page.py",
    "RW_Site_Scraper-For_Buisness.py",  # NOTE(review): "Buisness" spelling — presumably matches the actual filename; confirm before "fixing"
]

ORDERS_MD = SCRAPERS_DIR / "RW_Orders_Site_Scrape.md"  # output of the Orders-page scraper
OTHER_MD = SCRAPERS_DIR / "RW_Site_Scrape.md"  # output of the original site scraper
MERGED_MD = SCRAPERS_DIR / "RW_Site_Scrape_Merged.md"  # merged table written by merge_tables()
PRICES_MD = SCRAPERS_DIR / "Prices.md"  # first-choice price source (Style # -> Price table)
# Second-choice price source: JSON cache produced by the Vouchers tooling.
VOUCHER_PRICE_CACHE = BASE_DIR.parent / "Vouchers" / "config" / "voucher_price_consensus_cache.json"


def _run_script(path: Path) -> int:
    """Run one scraper script with the current interpreter and return its exit code.

    The child process inherits stdout/stderr; cwd is SCRAPERS_DIR so the
    scraper writes its markdown output next to itself.
    """
    print(f"\n[run] {path.name}", flush=True)
    completed = subprocess.run([sys.executable, str(path)], cwd=str(SCRAPERS_DIR))
    print(f"[run] exit={completed.returncode} ({path.name})", flush=True)
    return int(completed.returncode)


def _discover_scrapers() -> List[Path]:
    """Return every RW_Site_Scraper*.py file, merge-required scripts first.

    Raises:
        RuntimeError: if SCRAPERS_DIR is missing or contains no scraper files.
    """
    if not SCRAPERS_DIR.exists():
        raise RuntimeError(f"Scrapers folder not found: {SCRAPERS_DIR}")

    scripts = [candidate for candidate in SCRAPERS_DIR.glob("RW_Site_Scraper*.py") if candidate.is_file()]
    if not scripts:
        raise RuntimeError(f"No RW_Site_Scraper*.py files found in: {SCRAPERS_DIR}")

    # Merge-required scripts sort to the front (rank 0, 1, ...); everything
    # else gets rank 999 and falls back to case-insensitive name order.
    rank = {name: position for position, name in enumerate(REQUIRED_FOR_MERGE)}
    return sorted(scripts, key=lambda script: (rank.get(script.name, 999), script.name.lower()))


def _split_md_row(line: str) -> List[str]:
    s = line.strip()
    if not s.startswith("|"):
        return []
    s = s.strip("|")
    parts = re.split(r"(?<!\\)\|", s)
    return [p.strip().replace(r"\|", "|") for p in parts]


def _parse_markdown_table(path: Path) -> Tuple[List[str], List[Dict[str, str]]]:
    """Parse the pipe-delimited markdown table found in *path*.

    Returns (headers, rows) where each row maps header name -> cell text.
    Rows shorter than the header are padded with "" and longer rows are
    truncated.  Returns ([], []) when the file is absent or no table is found.
    """
    if not path.exists():
        return [], []

    content = path.read_text(encoding="utf-8", errors="replace")
    table_lines = [line for line in content.splitlines() if line.strip().startswith("|")]
    if len(table_lines) < 2:
        return [], []

    headers = _split_md_row(table_lines[0])
    if not headers:
        return [], []

    width = len(headers)
    rows: List[Dict[str, str]] = []
    # table_lines[1] is assumed to be the |---|---| separator row and skipped.
    for line in table_lines[2:]:
        cells = _split_md_row(line)
        if not cells:
            continue
        # Pad/truncate to the header width so every row has every column.
        cells = (cells + [""] * width)[:width]
        rows.append(dict(zip(headers, cells)))

    return headers, rows


def _normalize_style(style: str) -> str:
    raw = (style or "").strip()
    if not raw:
        return ""
    m = re.search(r"\d+", raw)
    if m:
        return str(int(m.group(0)))
    return raw.lower()


def _row_has_page_not_found(row: Dict[str, str]) -> bool:
    needle = "page not found"
    for v in row.values():
        if needle in (v or "").lower():
            return True
    return False


def _style_sort_key(style_key: str):
    if style_key.isdigit():
        return (0, int(style_key))
    return (1, style_key)


def _normalize_image_name(image_value: str) -> str:
    # Remove trailing "_2" immediately before ".png" (case-insensitive).
    return re.sub(r"(?i)_2(?=\.png\b)", "", image_value or "")


def _load_prices_by_style() -> Dict[str, str]:
    """Load Prices.md into {normalized style key: price string}.

    Rows with a blank style or blank price are skipped; returns {} when the
    file is missing or lacks the "Style #" / "Price" columns.
    """
    headers, rows = _parse_markdown_table(PRICES_MD)
    if not headers or "Style #" not in headers or "Price" not in headers:
        return {}

    prices: Dict[str, str] = {}
    for row in rows:
        key = _normalize_style(row.get("Style #", ""))
        value = (row.get("Price", "") or "").strip()
        if key and value:
            prices[key] = value  # later rows win on duplicate styles
    return prices


def _load_voucher_prices_by_style() -> Dict[str, str]:
    """Load the voucher consensus cache into {normalized style key: "X.YY"}.

    Best-effort by design: returns {} when the cache is missing, unreadable,
    or its "prices" payload is not a dict.  Entries that are non-numeric or
    not strictly positive are skipped.
    """
    if not VOUCHER_PRICE_CACHE.exists():
        return {}
    try:
        payload = json.loads(VOUCHER_PRICE_CACHE.read_text(encoding="utf-8"))
    except Exception:
        # An unreadable/corrupt cache just means no voucher fallback prices.
        return {}

    prices = payload.get("prices", {})
    if not isinstance(prices, dict):
        return {}

    result: Dict[str, str] = {}
    for style, raw_price in prices.items():
        key = _normalize_style(str(style))
        if not key:
            continue
        try:
            amount = float(raw_price)
        except Exception:
            continue
        if amount > 0:
            result[key] = f"{amount:.2f}"
    return result


def merge_tables() -> Tuple[int, int, int]:
    """Merge the two scraper markdown tables into MERGED_MD.

    Rows are matched on the normalized Style # (leading zeros ignored).  Per
    the module docstring: the Orders table wins for Style #/Image/Brand, the
    other table wins for every remaining column, Resoleable is OR-ed, and any
    row containing "Page Not Found" is dropped.  Price is taken from
    Prices.md first, then the voucher cache, then whatever the merged row
    already carried.

    Returns:
        (orders_row_count, other_row_count, merged_row_count).

    Raises:
        RuntimeError: if neither markdown file yields a parsable header row.
    """
    orders_headers, orders_rows = _parse_markdown_table(ORDERS_MD)
    other_headers, other_rows = _parse_markdown_table(OTHER_MD)

    # Column order: orders headers (if any), then extra columns unique to the
    # other table, then a guaranteed "Price" column at the end.
    headers = orders_headers or other_headers
    if not headers:
        raise RuntimeError("Could not parse headers from either markdown output file.")

    for h in other_headers:
        if h not in headers:
            headers.append(h)
    if "Price" not in headers:
        headers.append("Price")

    # Index each table by normalized style key, dropping "Page Not Found"
    # rows up front.  Duplicate styles within a table: last row wins.
    by_style_orders: Dict[str, Dict[str, str]] = {}
    by_style_other: Dict[str, Dict[str, str]] = {}

    for r in orders_rows:
        if _row_has_page_not_found(r):
            continue
        k = _normalize_style(r.get("Style #", ""))
        if k:
            by_style_orders[k] = r

    for r in other_rows:
        if _row_has_page_not_found(r):
            continue
        k = _normalize_style(r.get("Style #", ""))
        if k:
            by_style_other[k] = r

    # Union of styles from both tables, numeric keys first in numeric order.
    all_styles = sorted(set(by_style_orders.keys()) | set(by_style_other.keys()), key=_style_sort_key)
    merged_rows: List[Dict[str, str]] = []
    prices_by_style = _load_prices_by_style()
    voucher_prices_by_style = _load_voucher_prices_by_style()

    for k in all_styles:
        ro = by_style_orders.get(k, {})  # orders-table row (may be empty)
        rt = by_style_other.get(k, {})  # other-table row (may be empty)
        merged: Dict[str, str] = {}

        for h in headers:
            if h == "Resoleable":
                # "1" if either source flags it; otherwise first non-empty
                # value with the orders table preferred.
                merged[h] = "1" if (ro.get(h, "").strip() == "1" or rt.get(h, "").strip() == "1") else (ro.get(h, "") or rt.get(h, "")).strip()
            elif h in ("Style #", "Image", "Brand"):
                # Orders table is authoritative for these columns.
                merged[h] = (ro.get(h, "") or rt.get(h, "")).strip()
            else:
                # Everything else: other (original scraper) table wins.
                merged[h] = (rt.get(h, "") or ro.get(h, "")).strip()

        # Canonicalize the image filename (drops a "_2" before ".png").
        if "Image" in merged:
            merged["Image"] = _normalize_image_name(merged.get("Image", "")).strip()

        # Defensive: a non-empty style key implies a source row had a Style #,
        # but fall back to the raw values just in case.
        if not merged.get("Style #"):
            merged["Style #"] = ro.get("Style #", "") or rt.get("Style #", "")
        # Price priority: Prices.md > voucher cache > value already merged.
        merged["Price"] = (
            prices_by_style.get(k)
            or voucher_prices_by_style.get(k)
            or (merged.get("Price", "") or "").strip()
        )

        # Defensive re-check; source rows were already filtered above.
        if _row_has_page_not_found(merged):
            continue
        merged_rows.append(merged)

    # Emit the merged markdown table: header, separator, then data rows with
    # literal pipes escaped so _split_md_row can round-trip them.
    out_lines: List[str] = []
    out_lines.append("|" + "|".join(headers) + "|")
    out_lines.append("|" + "|".join(["---"] * len(headers)) + "|")

    for row in merged_rows:
        vals = []
        for h in headers:
            cell = (row.get(h, "") or "").replace("|", r"\|").strip()
            vals.append(cell)
        out_lines.append("|" + "|".join(vals) + "|")

    MERGED_MD.write_text("\n".join(out_lines) + "\n", encoding="utf-8")
    return len(orders_rows), len(other_rows), len(merged_rows)


def main() -> None:
    """Run every discovered scraper, then merge and summarize their outputs.

    Individual scraper failures are only warnings; the merge still runs on
    whatever output files exist.  Exits non-zero solely when both
    merge-required scrapers failed.
    """
    scripts = _discover_scrapers()
    exit_codes: Dict[str, int] = {script.name: _run_script(script) for script in scripts}

    for name, code in exit_codes.items():
        if code != 0:
            print(f"[warn] Scraper exited non-zero: {name} ({code})", flush=True)

    orders_count, other_count, merged_count = merge_tables()
    print(
        f"[merge] orders_rows={orders_count}, other_rows={other_count}, merged_rows={merged_count}\n"
        f"[merge] wrote: {MERGED_MD}\n"
        f"[run] scripts_ran={len(scripts)} in {SCRAPERS_DIR}",
        flush=True,
    )

    # Non-zero only if both merge-required scrapers failed.
    if all(exit_codes.get(name, 1) != 0 for name in REQUIRED_FOR_MERGE):
        raise SystemExit(1)


if __name__ == "__main__":  # Entry point: run all scrapers, then merge.
    main()
