"""
redwing_scraper.py
-------------------

This script crawls the public Red Wing Shoes website to discover product pages
and extract the current selling price for each style.  It starts from the
``sitemap_index.xml`` advertised in the site's ``robots.txt`` and follows
each sitemap entry looking for product pages (anything ending in a ``.html``
with a numeric style code).  For every discovered product page the script
downloads the HTML, scans for the first occurrence of ``"Current Price:"``
and the associated dollar amount, and pairs it with the style number found in
the page.  The results are written to a Markdown file (``Prices.md``) with
two columns: ``Style #`` and ``Price``.

Usage:
    python redwing_scraper.py

The script outputs ``Prices.md`` next to this script by default.  You can
optionally set the output filename with the ``--output`` argument and the
sitemap URL with ``--sitemap-url``.  A polite delay between requests
ensures that the crawler does not overwhelm the server.

Note:
    This scraper uses only publicly available endpoints and does not attempt
    to bypass access controls or interact with a shopping cart.  Ensure you
    comply with the site's terms of service and adjust the request rate via
    the ``--delay`` parameter if necessary.
"""

from __future__ import annotations

import argparse
import re
import sys
import time
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Iterator, List, Optional
from urllib.parse import urljoin, urlparse

import requests


@dataclass
class ProductPrice:
    """A single scraped product: a style number paired with its price."""

    # Numeric style code as it appears on the site, e.g. "02440".
    style_number: str
    # Price as a plain number string without the dollar sign, e.g. "299.99".
    price: str

    def to_markdown_row(self) -> str:
        """Format this product as one row of a two-column Markdown table."""
        cells = (self.style_number, f"${self.price}")
        return "|" + "|".join(cells) + "|"


class RedWingScraper:
    """A helper class to scrape Red Wing Shoes product prices.

    The crawl proceeds in three stages:

    1. Fetch the sitemap index and every sitemap it lists.
    2. Collect product-detail-page (PDP) URLs, both directly from the
       sitemaps and by scanning listing pages for PDP links.
    3. Download each PDP and extract its style number and current price.
    """

    # Regular expressions to match product pages and extract style/price.
    PRODUCT_PAGE_RE = re.compile(r"(?:-|/)([0-9]{3,6})\.html(?:[?#].*)?$", re.IGNORECASE)
    PDP_HREF_RE = re.compile(r'href=["\']([^"\']*(?:-|/)[0-9]{3,6}\.html(?:\?[^"\']*)?)["\']', re.IGNORECASE)
    STYLE_RE = re.compile(
        r"(?:style(?:\s*(?:#|number))?[:\s\"'>]*|sku[:\s\"'>]*)(\d{3,6})",
        re.IGNORECASE,
    )
    # Tried in order; the first pattern that matches a page wins.
    PRICE_PATTERNS = [
        re.compile(r"itemprop=[\"']price[\"'][^>]{0,120}content=[\"']([0-9]+(?:\.[0-9]{2})?)[\"']", re.IGNORECASE),
        re.compile(r"Current\s+Price:\s*\$([0-9]+(?:\.[0-9]{2})?)", re.IGNORECASE),
        re.compile(r"price-sales[^$]{0,120}\$([0-9]+(?:\.[0-9]{2})?)", re.IGNORECASE),
        re.compile(r"\"price\"\s*:\s*\"?([0-9]+(?:\.[0-9]{2})?)\"?", re.IGNORECASE),
    ]

    def __init__(self, session: Optional[requests.Session] = None) -> None:
        """Create a scraper, optionally reusing an existing ``requests.Session``."""
        self.session = session or requests.Session()
        # Use a browser-like user agent and accept English pages.
        self.session.headers.update(
            {
                "User-Agent": (
                    "Mozilla/5.0 (X11; Linux x86_64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120 Safari/537.36"
                ),
                "Accept-Language": "en-US,en;q=0.9",
            }
        )

    def fetch_text(self, url: str) -> str:
        """Download *url* and return the response body as text.

        Retries up to three times with a linearly increasing pause between
        attempts.  Raises the final ``requests.RequestException`` when all
        attempts fail.
        """
        attempts = 3
        for attempt in range(attempts):
            try:
                resp = self.session.get(url, timeout=30)
                resp.raise_for_status()
                return resp.text
            except requests.RequestException:
                if attempt == attempts - 1:
                    raise
                # Back off a little more before each retry (0.8s, then 1.6s).
                time.sleep(0.8 * (attempt + 1))
        # Unreachable: the last attempt either returns or re-raises.
        raise RuntimeError(f"Failed to fetch URL: {url}")

    @staticmethod
    def extract_style_from_url(url: str) -> str:
        """Extract the numeric style code from a Red Wing PDP URL.

        Returns ``""`` when the URL does not look like a product page.
        Bare ``/NNNNN.html`` paths only count when they appear inside a
        known product section, to avoid matching unrelated numbered pages.
        """
        try:
            path = (urlparse(url).path or "").strip()
        except Exception:
            # Severely malformed input: fall back to matching the raw string.
            path = url
        path_l = path.lower()
        # Primary PDP pattern: "...-02440.html"
        m = re.search(r"-(\d{3,6})\.html$", path, flags=re.IGNORECASE)
        if not m:
            # Secondary pattern: "/02440.html" (only in product sections).
            m = re.search(r"/(\d{3,6})\.html$", path, flags=re.IGNORECASE)
            if not m:
                return ""
            if not any(
                token in path_l for token in ("/mens/", "/womens/", "/work/", "/heritage/", "/accessories/", "/worx/")
            ):
                return ""
        return m.group(1)

    def parse_sitemap(self, xml_text: str) -> List[str]:
        """Extract a list of ``loc`` values from a sitemap XML document.

        Raises ``xml.etree.ElementTree.ParseError`` for malformed XML;
        callers are expected to handle that per-sitemap.
        """
        root = ET.fromstring(xml_text)
        locs: List[str] = []
        for elem in root.iter():
            # Namespaces may decorate tags; match on the tag suffix only.
            if elem.tag.endswith("loc") and elem.text:
                locs.append(elem.text.strip())
        return locs

    def iter_product_urls(self, sitemap_index_url: str) -> Iterator[str]:
        """
        Iterate through all sitemaps listed in ``sitemap_index_url`` and
        yield unique URLs that look like product pages (numeric style code).

        Network failures and malformed sitemap XML are skipped per-sitemap
        rather than aborting the whole crawl.
        """
        try:
            index_xml = self.fetch_text(sitemap_index_url)
            sitemap_urls = self.parse_sitemap(index_xml)
        except (requests.RequestException, ET.ParseError):
            # Without a readable index there is nothing to crawl.
            return
        listing_urls: set[str] = set()
        yielded: set[str] = set()

        def _yield_if_product(raw_url: str) -> Iterator[str]:
            # Yield raw_url at most once, iff it carries a style code.
            style = self.extract_style_from_url(raw_url)
            if not style:
                return
            if raw_url in yielded:
                return
            yielded.add(raw_url)
            yield raw_url

        for sitemap_url in sitemap_urls:
            # Only process XML documents (ignore images etc.)
            if not sitemap_url.lower().endswith(".xml"):
                continue
            try:
                sm_xml = self.fetch_text(sitemap_url)
                urls = self.parse_sitemap(sm_xml)
            except (requests.RequestException, ET.ParseError):
                # One bad sitemap must not abort the rest of the crawl.
                continue
            for url in urls:
                yield from _yield_if_product(url)
                if url.endswith("/") and "redwingshoes.com" in url:
                    listing_urls.add(url)

        # Sitemaps currently omit many PDPs; discover links from listing pages.
        for list_url in sorted(listing_urls):
            try:
                page = self.fetch_text(list_url)
            except requests.RequestException:
                continue
            for m in self.PDP_HREF_RE.finditer(page):
                href = (m.group(1) or "").strip()
                if not href:
                    continue
                abs_url = urljoin(list_url, href)
                yield from _yield_if_product(abs_url)

    def extract_price_and_style(self, html: str, url: str = "") -> Optional[ProductPrice]:
        """Extract the style number and price from a product page's HTML.

        Falls back to the URL for the style number when the page itself
        does not reveal one.  Returns ``None`` if either field is missing.
        """
        style = ""
        style_match = self.STYLE_RE.search(html)
        if style_match:
            style = style_match.group(1)
        if not style:
            style = self.extract_style_from_url(url)

        price = ""
        for pat in self.PRICE_PATTERNS:
            m = pat.search(html)
            if m:
                price = m.group(1)
                break

        if not style or not price:
            return None
        return ProductPrice(style, price)

    def scrape_all_products(
        self,
        sitemap_index_url: str,
        delay: float = 1.0,
    ) -> List[ProductPrice]:
        """Scrape all product pages discovered via the sitemap index.

        Deduplicates results by style number and pauses ``delay`` seconds
        after every page request — successful or not — to stay polite.
        """
        results: List[ProductPrice] = []
        seen_styles: set[str] = set()
        # iter_product_urls already yields each URL at most once.
        for product_url in self.iter_product_urls(sitemap_index_url):
            try:
                html = self.fetch_text(product_url)
            except requests.RequestException:
                # Pause even after a failure so error bursts don't hammer
                # the server.
                time.sleep(delay)
                continue
            product = self.extract_price_and_style(html, product_url)
            if product and product.style_number not in seen_styles:
                seen_styles.add(product.style_number)
                results.append(product)
            # Be polite: wait between requests
            time.sleep(delay)
        return results

    @staticmethod
    def write_markdown(products: Iterable[ProductPrice], path: str) -> None:
        """Write the products to a Markdown file in table format."""
        with open(path, "w", encoding="utf-8") as md:
            md.write("|Style #|Price|\n")
            md.write("|---|---|\n")
            for product in products:
                md.write(product.to_markdown_row() + "\n")

def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point: crawl the sitemap and write the price table.

    *argv* defaults to ``sys.argv[1:]`` when ``None`` (argparse behavior).
    """
    parser = argparse.ArgumentParser(description="Scrape Red Wing Shoes prices")
    parser.add_argument(
        "--sitemap-url",
        default="https://www.redwingshoes.com/sitemap_index.xml",
        help="URL of the sitemap index to start crawling",
    )
    parser.add_argument("--output", default="", help="Output Markdown filename")
    parser.add_argument(
        "--delay",
        type=float,
        default=1.0,
        help="Delay between requests in seconds",
    )
    args = parser.parse_args(argv)

    # Progress messages go to stderr so stdout stays clean.
    crawler = RedWingScraper()
    print(f"Discovering products from {args.sitemap_url} …", file=sys.stderr)
    products = crawler.scrape_all_products(args.sitemap_url, delay=args.delay)
    print(f"Found {len(products)} products with prices.", file=sys.stderr)

    # Default output location is "Prices.md" next to this script.
    if args.output:
        out_path = Path(args.output).expanduser()
    else:
        out_path = Path(__file__).resolve().parent / "Prices.md"
    crawler.write_markdown(products, str(out_path))
    print(f"Wrote price table to {out_path}")


# Run the CLI only when executed as a script (not on import).
if __name__ == "__main__":
    main()
