+
    Ji(                        R t ^ RIHt ^ RIt^ RIt^ RIt^ RIt^ RIHu H	t
 ^ RIHt ^ RIHt ^ RIHtHtHtHt ^ RIHtHt ^ RIt] ! R R4      4       t ! R	 R
4      tRR R llt]R8X  d
   ]! 4        R# R# )a  
redwing_scraper.py
-------------------

This script crawls the public Red Wing Shoes website to discover product pages
and extract the current selling price for each style.  It starts from the
``sitemap_index.xml`` advertised in the site's ``robots.txt`` and follows
each sitemap entry looking for product pages (anything ending in a ``.html``
with a numeric style code).  For every discovered product page the script
downloads the HTML, scans for the first occurrence of ``"Current Price:"``
and the associated dollar amount, and pairs it with the style number found in
the page.  The results are written to a Markdown file (``Prices.md``) with
two columns: ``Style #`` and ``Price``.

Usage:
    python RW_Site_Scraper-Prices.py

The script outputs ``Prices.md`` next to this script by default.  You can
optionally set the output filename with the ``--output`` argument and the
sitemap URL with ``--sitemap-url``.  A polite delay between requests
ensures that the crawler does not overwhelm the server.

Note:
    This scraper uses only publicly available endpoints and does not attempt
    to bypass access controls or interact with a shopping cart.  Ensure you
    comply with the site's terms of service and adjust the request rate via
    the ``--delay`` parameter if necessary.
)annotationsN)	dataclass)Path)IterableIteratorListOptional)urljoinurlparsec                  <    ] tR t^-t$ RtR]R&   R]R&   R R ltRtR# )	ProductPricez>Simple structure to hold a product style number and its price.strstyle_numberpricec                   V ^8  d   QhRR/# )   returnr    )formats   "5.\Boot_Features\RW_Scrapers\RW_Site_Scraper-Prices.py__annotate__ProductPrice.__annotate__4   s     6 6 6    c                <    RV P                    RV P                   R2# )z+Return the product as a Markdown table row.|z|$)r   r   )selfs   &r   to_markdown_rowProductPrice.to_markdown_row4   s"    4$$%R

|155r   r   N)__name__
__module____qualname____firstlineno____doc____annotations__r   __static_attributes__r   r   r   r   r   -   s    HJ6 6r   r   c                  ,   ] tR t^9tRt]P                  ! R]P                  4      t]P                  ! R]P                  4      t	]P                  ! R]P                  4      t
]P                  ! R]P                  4      ]P                  ! R]P                  4      ]P                  ! R]P                  4      ]P                  ! R]P                  4      .tRR
 R lltR R lt]R R l4       tR R ltR R ltRR R lltRR R llt]R R l4       tRtR	# )RedWingScraperz7A helper class to scrape Red Wing Shoes product prices.z%(?:-|/)([0-9]{3,6})\.html(?:[?#].*)?$z=href=["\']([^"\']*(?:-|/)[0-9]{3,6}\.html(?:\?[^"\']*)?)["\']z>(?:style(?:\s*(?:#|number))?[:\s\"'>]*|sku[:\s\"'>]*)(\d{3,6})zLitemprop=[\"']price[\"'][^>]{0,120}content=[\"']([0-9]+(?:\.[0-9]{2})?)[\"']z,Current\s+Price:\s*\$([0-9]+(?:\.[0-9]{2})?)z/price-sales[^$]{0,120}\$([0-9]+(?:\.[0-9]{2})?)z-\"price\"\s*:\s*\"?([0-9]+(?:\.[0-9]{2})?)\"?Nc                    V ^8  d   QhRRRR/# )r   sessionzOptional[requests.Session]r   Noner   )r   s   "r   r   RedWingScraper.__annotate__J   s     
 
 : 
d 
r   c                	    T;'       g    \         P                  ! 4       V n        V P                  P                  P	                  R RRR/4       R# )z
User-Agentz_Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36zAccept-Languagezen-US,en;q=0.9N)requestsSessionr(   headersupdate)r   r(   s   &&r   __init__RedWingScraper.__init__J   sG    44("2"2"4##/ "#3		
r   c                    V ^8  d   QhRRRR/# r   urlr   r   r   )r   s   "r   r   r*   X   s     : :c :c :r   c                h   Rp\        ^4       F=  p V P                  P                  V^R7      pVP                  4        VP                  u # 	  Ve   Vh\        RV 24      h  \
        P                   d:   pTpT^8  d+   \        P                  ! RT^,           ,          4        Rp?K  h Rp?ii ; i)z<Download a URL and return its text, raising for HTTP errors.N)timeoutg?zFailed to fetch URL: )
ranger(   getraise_for_statustextr,   RequestExceptiontimesleepRuntimeError)r   r4   last_excattemptrespexcs   &&    r   
fetch_textRedWingScraper.fetch_textX   s    (,QxG	||''R'8%%'yy 	   N23%899 ,, Q;JJsgk23s   8A##B18-B,+B,,B1c                    V ^8  d   QhRRRR/# r3   r   )r   s   "r   r   r*   k   s      C C r   c                  a  \        V 4      P                  ;'       g    RP                  4       pVP	                  4       o\
        P                  ! RV\
        P                  R7      pV'       gt   \
        P                  ! RV\
        P                  R7      pV'       g   R# \        ;QJ d    V3R lR 4       F  '       g   K   RM	  RM! V3R lR 4       4      '       g   R# VP                  ^4      #   \         d    T p Li ; i)	z3Extract style number from modern Red Wing PDP URLs. z-(\d{3,6})\.html$)flagsz/(\d{3,6})\.html$c              3  ,   <"   T F	  qS9   x  K  	  R # 5iNr   ).0tokenpath_ls   & r   	<genexpr>8RedWingScraper.extract_style_from_url.<locals>.<genexpr>y   s      -vE-vs   TF)z/mens/z/womens/z/work/z
/heritage/z/accessories/z/worx/)
r
   pathstrip	Exceptionlowerresearch
IGNORECASEanygroup)r4   rP   mrM   s   &  @r   extract_style_from_url%RedWingScraper.extract_style_from_urlj   s    	SM&&,,"335D II*DF		.BMMJA3 -v333 -v   wwqz  	D	s   C3 C3 3DDc                    V ^8  d   QhRRRR/# )r   xml_textr   r   z	List[str]r   )r   s   "r   r   r*      s      c i r   c                $   \         P                  ! V4      p. pVP                  4        Fc  pVP                  P	                  R4      '       g   K&  VP
                  '       g   K:  VP                  VP
                  P                  4       4       Ke  	  V# )z=Extract a list of ``loc`` values from a sitemap XML document.loc)ET
fromstringitertagendswithr:   appendrQ   )r   r]   rootlocselems   &&   r   parse_sitemapRedWingScraper.parse_sitemap   s`    }}X&IIKDxx  ''DIIIDIIOO-.   r   c                    V ^8  d   QhRRRR/# )r   sitemap_index_urlr   r   Iterator[str]r   )r   s   "r   r   r*      s     1 13 1= 1r   c              #  t  a a"    S P                  V4      pS P                  T4      p\        4       p\        4       oR T T3R llpT F  pTP	                  4       P                  R4      '       g   K*   S P                  T4      pS P                  T4      pT FI  p	T! T	4       F  p
T
x  K	  	  T	P                  R4      '       g   K/  RT	9   g   K8  TP                  T	4       KK  	  K  	  \        T4       F  p S P                  T4      pS P                  P                  T4       FT  pTP                  ^4      ;'       g    RP                  4       pT'       g   K6  \        Y4      pT! T4       F  p
T
x  K	  	  KV  	  K  	  R#   \         d     R# i ; i  \         d     EKZ  i ; i  \         d     K  i ; i5i)z
Iterate through all sitemaps listed in the ``sitemap_index_url`` and
yield URLs that look like product pages (based on numeric style).
Nc                    V ^8  d   QhRRRR/# )r   raw_urlr   r   rm   r   )r   s   "r   r   6RedWingScraper.iter_product_urls.<locals>.__annotate__   s     	 	s 	} 	r   c              3     <"   SP                  V 4      pV'       g   R # V S9   d   R # SP                  V 4       V x  R # 5irJ   )rZ   add)rp   styler   yieldeds   & r   _yield_if_product;RedWingScraper.iter_product_urls.<locals>._yield_if_product   s8     //8E'!KK Ms   <?z.xml/zredwingshoes.comrG   )rC   rR   ri   setrS   rd   rs   sortedPDP_HREF_REfinditerrX   rQ   r	   )r   rl   	index_xmlsitemap_urlslisting_urlsrv   sitemap_urlsm_xmlurlsr4   plist_urlpagerY   hrefabs_urlru   s   f&              @r   iter_product_urls RedWingScraper.iter_product_urls   s    
	(9:I )))4!$E	 	 (K$$&//775 %%f-D*3/AG 0<<$$);s)B $$S)	  (  |,Hx0 %%..t4
((b//1!(1*73AG 4 5 -?  		*    s   F8F AF83F=F8F8'F87F&5F8>F8*F8FF8FF8F#F8"F##F8&F51F84F55F8c               $    V ^8  d   QhRRRRRR/# )r   htmlr   r4   r   zOptional[ProductPrice]r   )r   s   "r   r   r*      s"     * *C *c *CY *r   c                \   RpV P                   P                  V4      pV'       d   VP                  ^4      pV'       g   V P                  V4      pRpV P                   F/  pVP                  V4      pV'       g   K  VP                  ^4      p M	  V'       d	   V'       g   R# \        W54      # )z>Extract the style number and price from a product page's HTML.rG   N)STYLE_RErU   rX   rZ   PRICE_PATTERNSr   )r   r   r4   rt   style_matchr   patrY   s   &&&     r   extract_price_and_style&RedWingScraper.extract_price_and_style   s    mm**40%%a(E//4E&&C

4 Aq
	 ' EE))r   c               $    V ^8  d   QhRRRRRR/# )r   rl   r   delayfloatr   zList[ProductPrice]r   )r   s   "r   r   r*      s(        
	r   c                   . p\        4       p\        4       pV P                  V4       F  pWd9   d   K  VP                  V4        V P                  V4      pT P                  Yv4      pT'       d@   TP                  T9   d   KZ  TP                  TP                  4       TP                  T4       \        P                  ! T4       K  	  V#   \         d     K  i ; i)z:Scrape all product pages discovered via the sitemap index.)
ry   r   rs   rC   rR   r   r   re   r<   r=   )	r   rl   r   resultsseenseen_stylesproduct_urlr   products	   &&&      r   scrape_all_products"RedWingScraper.scrape_all_products   s     ') #112CDK"HH[!{3 224EG'';6 4 45w'JJu E    s   C

CCc               $    V ^8  d   QhRRRRRR/# )r   productszIterable[ProductPrice]rP   r   r   r)   r   )r   s   "r   r   r*      s"     ; ;!7 ;s ;t ;r   c                   \        VRRR7      ;_uu_ 4       pVP                  R4       VP                  R4       V  F)  pVP                  VP                  4       R,           4       K+  	  RRR4       R#   + '       g   i     R# ; i)z6Write the products to a Markdown file in table format.wzutf-8)encodingz|Style #|Price|
z
|---|---|

N)openwriter   )r   rP   mdr   s   &&  r   write_markdownRedWingScraper.write_markdown   sa     $g.."HH()HH]##002T9: $ /...s   AA33B	)r(   rJ   )rG   )      ?)r   r   r    r!   r"   rT   compilerV   PRODUCT_PAGE_REr{   r   r   r0   rC   staticmethodrZ   ri   r   r   r   r   r$   r   r   r   r&   r&   9   s    A jj!I2==YO**]_a_l_lmKzzI
H
 	

bdfdqdqr


BBMMR


Er}}U


CR]]S	N
:$  (1f*(6 ; ;r   r&   c                    V ^8  d   QhRRRR/# )r   argvzOptional[List[str]]r   r)   r   )r   s   "r   r   r      s     . ." .d .r   c                   \         P                  ! R R7      pVP                  RRRR7       VP                  RRRR7       VP                  R	\        R
RR7       VP	                  V 4      p\        4       p\        RVP                   R2\        P                  R7       VP                  VP                  VP                  R7      p\        R\        V4       R2\        P                  R7       VP                  '       d$   \        VP                  4      P                  4       M-\        \         4      P#                  4       P$                  R,          pVP'                  V\)        V4      4       \        RV 24       R# )zScrape Red Wing Shoes prices)descriptionz--sitemap-urlz.https://www.redwingshoes.com/sitemap_index.xmlz*URL of the sitemap index to start crawling)defaulthelpz--outputrG   zOutput Markdown filenamez--delayr   z!Delay between requests in seconds)typer   r   zDiscovering products from u    …)file)r   zFound z products with prices.z	Prices.mdzWrote price table to N)argparseArgumentParseradd_argumentr   
parse_argsr&   printr   sysstderrr   r   lenoutputr   
expanduser__file__resolveparentr   r   )r   parserargsscraperr   out_paths   &     r   mainr      s8   $$1OPF
@9  
 
B5OP
0	   T"DG	&t'7'7&8
=CJJO**4+;+;4::*NH	F3x=/!7
8szzJ15tDKK ++-4>CYCYC[CbCbepCpH8S]3	!(
,-r   __main__rJ   )r"   
__future__r   r   rT   r   r<   xml.etree.ElementTreeetreeElementTreer`   dataclassesr   pathlibr   typingr   r   r   r   urllib.parser	   r
   r,   r   r&   r   r   r   r   r   <module>r      sr   : #  	 
  " " !  5 5 *  6 6 6y; y;x.2 zF r   