
    ?i                    "   U d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlZddlZddlZddlZddlmZ ddlZddlmZ ddlmZmZmZmZmZ ddlmZmZ ddlm Z  dd	l!m"Z" dd
l#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z, ddl-m.Z/ ddl0m1Z1 dgZ2e2d   Z3dZ4de5d<    ee6      jo                         jp                  Z9ejt                  jw                  dd      jy                         j{                         dv Z>e9dz  Z?e9dz  Z@e9dz  ZAdZBdZCdZDdZEg dZF eGeF      ZHdZIdZJd ZKd!ZLd"ZMdZNd#ZOd$ZPd%ZQejt                  jw                  d&d      jy                         xs dZRddd'ZSded(ZTdfd)ZUdgd*ZVdhd+ZWdid,ZXdjd-ZYdkd.ZZ G d/ d0      Z[dld1Z\dmd2Z]dnd3Z^dod4Z_dpd5Z`	 dqd6deJdd7	 	 	 	 	 	 	 	 	 	 	 	 	 drd8Zadsd9ZbeJeLd$f	 	 	 	 	 	 	 	 	 dtd:Zcdud;Zddvdwd<Ze G d= d>e      Zfdxd?Zgdxd@ZhdxdAZidydBZjdsdCZkdzd{dDZldedEZmd|dFZnd}dGZod~dHZpddIZqddJZrddKZsddLZtddMZuddNZvddOZwddPZxddQZyddRZzdddSZ{dldTZ|ddUZ}dV Z~d|dWZddXZddYZddZZ	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 dd[Zddd\Zdd]dd^Zdd_Zdd`ZddaZdsdbZedck(  r e        yy)a9  
RW_Site_Scraper-For_Buisness.py
===============================

Scrapes Red Wing Safety "For Business" safety boots catalogue:
- https://www.redwingsafety.com/safety-boots/page-1/maxnum-0?catalog=international

Test mode:
- Set ONLY_STYLE at the top of this file to a style number (e.g. "400")
  to scrape just that single product.

Outputs (next to this script):
- RW_Site_Scrape.md
- RW_Site_Scraper_checkpoint.json   (resume state)
- RW_Site_Scraper_errors.txt        (links that failed repeatedly)

Stability features:
- `safe_get()` uses short timeouts + window.stop() so Selenium doesn't hang forever
- HTTP fallback (no Selenium) for the few product pages that still time out
- Optional salvage pass at the end to retry hard failures (HTTP-first)

Cross-platform (Windows + Linux Cinnamon):
- Headless Firefox (default)
- Geckodriver resolution "like Parts_Auto" (explicit Service path; no Selenium Manager):
    1) GECKODRIVER_PATH env var (file or directory)
    2) geckodriver(.exe) on PATH
    3) auto-download geckodriver (GitHub releases) into a user cache dir
    4) (optional) if double-click/no terminal and Tk is available, prompt to pick geckodriver

Feature columns are 1/0 (not Yes/No).
Includes Brand (string) + brand family flags (Red Wing / Irish Setter / Worx).

Dependencies:
- Python 3.9+
- Firefox installed
- Selenium installed:
    Linux Mint/Ubuntu: sudo apt install -y python3-selenium
    Windows: python -m pip install selenium

Notes for Linux Mint PEP 668:
- Prefer `python3-selenium` from apt (as above).
- This script does NOT require webdriver-manager.
    )annotationsN)
HTMLParser)Path)DictListOptionalSetTuple)Requesturlopen)BeautifulSoup)	webdriver)TimeoutExceptionWebDriverException)By)Options)Service)expected_conditions)WebDriverWaitzPhttps://www.redwingsafety.com/safety-boots/page-1/maxnum-0?catalog=international str
ONLY_STYLERW_REFRESH_LINKS1>   r   yonyestruezRW_Site_Scrape.mdzRW_Site_Scraper_checkpoint.jsonzRW_Site_Scraper_errors.txt   TRED WING FOR BUSINESSF)'zStyle #NameURLImageBrandMaleFemaleRed WingIrish SetterWorxz
Safety Toez	Steel ToezNon-Metallic ToezAluminum ToezMetatarsal GuardzSoft Toe
Waterproof
InsulationzSlip ResistantzElectrical HazardzPuncture ResistantzStatic DissipativezAnkle Protectionu   BOA® Lacing SystemDefined HeelzAll Leather UpperzOxford/AthleticChukkaHiker5"6"7"8"9"10"11"12"zBuilt in USAzMade in USA-      g      ?)g333333?g?Z         GECKODRIVER_PATHc                    | rdS dS )Nr   0 )vs    -Boot_Features/RW_Site_Scraper-For_Buisness.pyb01rB      s    3    c                |    | xs dj                  dd      } t        j                  dd|       j                         } | S )Nr   |z\|\s+ )replaceresubstrip)ss    rA   md_escape_cellrM      s8    	
b#u%A
vsA$$&AHrC   c                    | j                  | j                  dz         }|j                  |d       |j                  |        y )Nz.tmputf-8encoding)with_suffixsuffix
write_textrH   )pathtexttmps      rA   atomic_writerX      s7    


4;;/
0CNN4'N*KKrC   c                 4   t         j                         s(t        t        g g g i di g t	        j                         d dS 	 t        j                  t         j                  d            } t        | j                  dd            }|t        k7  rR	 t         j                  t         j                  d| d             t        t        g g g i t	        j                         d d	S | j                  dt               | j                  d
t               | j                  dg        | j                  dg        | j                  dg        | j                  di        | j                  dg        | j                  dt	        j                                | j                  dd        | S # t        $ r Y w xY w# t        $ rd 	 t         j                  t         j                  d             n# t        $ r Y nw xY wt        t        g g g i di g t	        j                         d dcY S w xY w)NF)versioncatalog_urlsproduct_links
done_linksrowspreferred_namesmedia_repair_donefail_countshard_failed_links
started_atgeckodriver_pathrO   rP   rZ   r   z.json.vz.bak)rZ   r[   r\   r]   r^   ra   rc   rd   r[   r\   r]   r^   ra   rb   rc   rd   z.json.corrupt)
CHECKPOINTexistsCHECKPOINT_VERSIONCATALOG_URLStimejsonloads	read_textintgetrH   rR   	Exception
setdefault)dataold_vers     rA   load_checkpointrs      s   )(!!&!#))+ $
 	
-
zz*...@Adhhy!,-(("":#9#9GG9D:Q#RS . ,!# !"iik$(	 	 		#565,b)#r*+R0diik2*D1+  ,  
	z55oFG 		 *(!!&!#))+ $
 	

sa   AF* ,F 8$F* B=F* 	F'$F* &F''F* *	H4(GH	G)&H(G))+HHc                P    t        t        t        j                  | dd             y )Nr:   T)indent	sort_keys)rX   re   rj   dumps)rq   s    rA   save_checkpointrx      s    TZZQ$GHrC   c                    t        t        dd      5 }|j                  | j                         dz          d d d        y # 1 sw Y   y xY w)NarO   rP   
)open
ERRORS_TXTwriterstrip)linefs     rA   write_errors_liner      s<    	j#	0 &A	$%& & &s	   #?Ac                 ,    	 dd l } y# t        $ r Y yw xY w)Nr   TF)tkinterro   )r   s    rA   _can_use_tkr      s     s    	c                   t         j                  j                         s
t               sy 	 dd l}ddlm}m} |j                         }|j                          |j                  dd       |j                  dd       |j                  |       }|j                          |xs dj                         }|r|S d S # t        $ r Y y w xY w)	Nr   )
filedialog
messageboxz-topmostTRW Site ScraperzCould not find geckodriver automatically.

Please select the geckodriver executable.
Windows: geckodriver.exe
Linux: geckodriver)titler   )sysstdoutisattyr   r   r   r   Tkwithdraw
attributesshowinfoaskopenfilenamedestroyrK   ro   )r   tkr   r   rootrU   s         rA   _tk_pick_filer      s    
zz+-2uuw
D);	
 )))6
!!#t%% s   B	B7 5B7 7	CCc                  *    e Zd ZdZddZdddZd	dZy)
ProgressReporterzDTTY progress bar, or Tk window if launched by double-click (no TTY).c                   t        t        |      d      | _        || _        t        j
                  j                         | _        d| _        d | _	        d | _
        d | _        | j                  sEt               r9	 dd l}ddlm} |j                         | _	        | j                  j!                  d       | j                  j#                  d       | j                  j%                  dd       |j'                  | j                  dd	      | _
        | j                  j)                  d
dd       |j+                  | j                  | j                  d      | _        | j                  j)                  dd       d| _        | j                  j-                          | j                  j/                          y y y # t0        $ r
 d| _        Y y w xY w)N   Fr   )ttkr   620x150zStarting...w)rV   anchorx   )r      )fillpadxpadyiD  )maximumlength)r   
   )r   r   T)maxrm   totalrc   r   r   r   use_ttygui_root_label_pbarr   r   r   r   r   geometry	resizableLabelpackProgressbarupdate_idletasksupdatero   )selfr   rc   r   r   s        rA   __init__zProgressReporter.__init__  sS   UQ'
$zz((*

+-!$'UUW


  !23

##I.

$$UE2 hhtzzchR  c A __TZZTW_X


Rg6

++-

!!#% #0&  ! !s   :D7F4 4GGc                >   t        dt        t        |      | j                              }| j                  r| j
                  r| d| j                   d| j                         }| j                  r| j                  j                  |       | j                  r|| j                  d<   | j
                  j                          | j
                  j                          y d}|| j                  z  }t        t        ||z              }d|z  d||z
  z  z   }t        t        j                         | j                  z
  d	      }||z  }	|	d
kD  r| j                  |z
  |	z  nd}
d| d| d| j                   d|dz  ddt        |
       d}|r|d| z  }t        j                   j#                  d|d d z          t        j                   j%                          || j                  k(  r>t        j                   j#                  d       t        j                   j%                          y y )Nr   /z  )rV   value    #-g-C6?g&.>        [z]  (d   z5.1fz%) ETA rL      r{   )r   minrm   r   r   r   rK   r   configr   r   r   roundri   rc   r   r   r~   flush)r   currentnotemsgwidthfracfilledbarelapsedrateetas              rA   r   zProgressReporter.update7  s   aS\4::6788

IQtzzl"TF399;C{{""",zz&-

7#JJ'')JJ#U4%<()FlSEFN33diikDOO3V< /3d{tzzG#t+#b	4::,bc$ws3xjPQRRv;C

DS	)*

djj JJT"JJ !rC   c                    | j                   r)| j                  r	 | j                  j                          y y y # t        $ r Y y w xY wN)r   r   r   ro   )r   s    rA   closezProgressReporter.closeW  s?    88



""$ #8  s   7 	AAN)r   rm   rc   floatr   )r   rm   r   r   returnNoner   r   )__name__
__module____qualname____doc__r   r   r   r?   rC   rA   r   r     s    N!>@rC   r   c                    t         j                  dk(  rnt         j                  j                  d      xs> t         j                  j                  d      xs t	        t        j                               } t        |       dz  }nOt         j                  j                  d      xs  t	        t        j                         dz        } t        |       dz  }|j                  dd	       |S )
NntLOCALAPPDATAAPPDATARW_Site_ScraperXDG_CACHE_HOMEz.cacherw_site_scraperTparentsexist_ok)osnameenvironrn   r   r   homemkdir)baseds     rA   
_cache_dirr   b  s    	ww$zz~~n-^	1J^cRVR[R[R]N^J**zz~~./N3tyy{X7M3NJ**GGD4G(HrC   c                 f   t         rct        t               } | j                         rt        |       S | j	                         r)dD ]$  }| |z  }|j                         st        |      c S  dD ](  }t
        |z  }|j                         st        |      c S  t        j                  dk7  rXt        j                         dz  t        j                         dz  dz  fD ]$  }|dz  }|j                         st        |      c S  t        j                  d      xs t        j                  d      } | r| S y )N)geckodriver.exegeckodriverr   binz.localr   r   )r<   r   is_filer   is_dirBASE_DIRr   r   r   shutilwhich)pr   candr   s       rA   _resolve_from_env_or_pathr   m  s   !"99;q6M88:: %4x<<>t9$% 3 $<<>t9 
ww$))+%tyy{X'='EF 	!A}$D||~4y 	! 	]#Fv||4E'FArC   c                 @   t         j                  j                         } t        j                         j                         }| j	                  d      r#dt        j
                         d   v }|rddfS ddfS | j	                  d      r
d|v sd	|v ry
y| dk(  r
d	|v sd|v ryyy)zO
    Returns (asset_contains, archive_type) matching geckodriver releases.
    win64r   win64win32ziplinuxaarch64arm64)zlinux-aarch64tar.gz)linux64r  darwin)zmacos-aarch64r  )macosr  )r   platformlowermachine
startswitharchitecture)sysplatmachis_64s      rA   _platform_asset_keyr    s     ll  "G##%D% --/22 u55gu55'"4.$(d?i4/." rC   c                   t               \  }}d}t        |ddi      }t        |d      5 }t        j                  |j                         j                  dd	            }d
d
d
       j                  dg       }d
}d
}	|D ]?  }
|
j                  dd      }||v s|j                  |      s,|
j                  d      }|}	 n |st        d| d| d      | |	z  }t        |ddi      }t        |d      5 }t        |d      5 }t        j                  ||       d
d
d
       d
d
d
       t        j                  dk(  rdnd}| |z  }|dk(  rqt        j                   |d      5 }|j#                         D ]>  }|j                  |      s|j%                  ||        | |z  }|j'                  |        n d
d
d
       nt)        j                  |d      5 }|j+                         D ]d  }|j                  j                  d|z         s|j                  |k(  s1|j%                  ||        | |j                  z  }|j'                  |        n d
d
d
       	 |j-                  d       t        j                  dk7  ri	 t        j0                  |      }t        j2                  ||j4                  t0        j6                  z  t0        j8                  z  t0        j:                  z         |j=                         st        d      |S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# t.        $ r Y w xY w# t.        $ r Y vw xY w)zh
    Downloads and extracts latest geckodriver into dest_dir.
    Returns path to extracted driver.
    z@https://api.github.com/repos/mozilla/geckodriver/releases/latest
User-AgentzRW_Site_Scraper/1.0headersr8   timeoutrO   rH   errorsNassetsr   r   browser_download_urlz'Could not find a geckodriver asset for r   z).<   wbr   r   r   r  r)rU   zr:gzr   T
missing_okz5Download succeeded but geckodriver was not extracted.)r  r   r   rj   rk   readdecodern   endswithRuntimeErrorr|   r   copyfileobjr   r   zipfileZipFilenamelistextractrH   tarfile
getmembersunlinkro   statchmodst_modeS_IXUSRS_IXGRPS_IXOTHrf   )dest_dir	asset_keyarchive_typeapireqr  rq   r  dl_urldl_namerz   r   archive_pathreq2r   driver_nameextracted_pathzmemberr   tsts                         rA   _download_latest_geckodriverrB    s5   
 23I|
LC
#.CD
EC	b	! FQzz!&&(//')/DEF XXh#FFG uuVR |!<UU12FG DYKrR^Q__abccg%L6L2G#HID	r	" !alD)A !Q1a ! ! (*ww$#MK+Nu__\3/ 	1**, ??;/IIf8I4 6)AIIn-	 	 \\,/ 	1,,. ;;''k(9:fkk[>XIIf8I4 6;;.AIIn-	t, 
ww$	(BHH^RZZ$,,%>%MPTP\P\%\]   "RSSqF F$! ! ! !	 		 	    		sy   5L 8LLL#%L'	,L'AL36L3L? ;A(M  L
L	LL$'L03L<?	M
M	MMc                   t               }|rt        |      j                         r|S | j                  d      xs dj	                         }|rt        |      j                         r|S 	 t               }t        |      }t        |      | d<   t        |        t        |      S # t        $ rP}t        d      }|r0t        |      j                         r|| d<   t        |        |cY d }~S t        d|       d }~ww xY w)Nrd   r   z$Select geckodriver / geckodriver.exezUnable to locate or install geckodriver.

Fix options:
  - Put geckodriver on PATH
  - OR set GECKODRIVER_PATH to the full driver path
  - OR install via package manager (Linux often: sudo apt install firefox-geckodriver)

Underlying error: )r   r   rf   rn   rK   r   rB  r   rx   ro   r   r$  )ckr   saveddestdriver_pathepickeds          rA   ensure_geckodriverrJ    s    !#AT!W^^ VV&'-2446Ee##%
|248!$[!1; 
EFd6l))+%+B!"BM!
 "#%
 	

s$   )8B" "	C;+7C6"C;(C66C;eager)page_load_strategyblock_imagespage_load_timeout
user_agentc                  t        |       }t               }|r|j                  d       |r|j                  d|       |j	                  dd       |j	                  dd       |j	                  dd       |j	                  d	d       |r|j	                  d
d       |r|j	                  d|       t        |      }t        j                  ||      }	|	j                  |       |	j                  t               |	S )aJ  Start Firefox using an explicit geckodriver path (no Selenium Manager).

    Args:
        page_load_strategy: "eager" returns after DOMContentLoaded. "normal" waits for full load.
        block_images: If True, blocks images to reduce load stalls/timeouts.
        page_load_timeout: Seconds for Selenium navigation timeout.
    z	-headlesspageLoadStrategyzdom.webnotifications.enabledFzmedia.volume_scalez0.0z!browser.privatebrowsing.autostartTznetwork.http.http3.enablezpermissions.default.imager:   zgeneral.useragent.override)executable_path)serviceoptions)rJ  r   add_argumentset_capabilityset_preferenceFirefoxServicer   Firefoxset_page_load_timeoutset_script_timeoutSCRIPT_TIMEOUT)
rD  headlessrL  rM  rN  rO  geckorT  rS  drivers
             rA   create_driverr`    s      r"EiG[)13EF 95A/7>E6>:A>;ZHU3Gw@F
  !23
n-MrC   c                     t         \  } }	 t        j                  t        j                  | |             y # t
        $ r t        j                  |        Y y w xY wr   )REQUEST_DELAY_RANGEri   sleeprandomuniformro   )lohis     rA   _polite_delayrh  >  sA     FB

6>>"b)* 

2s   )5 AAc                   d}t        d|dz         D ]  }	 | j                  |       t                	 t        | dd      }|~t	        |d      r3t	        |j
                  d      rt        |      dz   |j
                  _        t	        |d      r3t	        |j                  d      rt        |      dz   |j                  _        | j                  |       t        j                  |       	 | j                  t        j                  d	        y |r|y# t        $ r Y Zw xY w# t        $ r Y  yw xY w# t         $ ry}|}	 | j#                  d
       n# t        $ r Y nw xY wt        j                  |       	 | j                  t        j                  d	       Y d}~ y# t        $ r Y nw xY wY d}~nd}~wt$        $ r}|}Y d}~nd}~ww xY w	 | j                  d       n# t        $ r Y nw xY wt        j                  d|z         )zNavigate without getting stuck on pages that never fully finish loading.

    - Uses page_load_timeout
    - On timeout, calls window.stop() and continues if the DOM exists
    - Retries a couple times with light cleanup
    Nr   r:   command_executor_client_configr  r   _connbodyzwindow.stop();about:blankg      ?)rangerZ  rh  getattrhasattrrk  rm   r  rl  ro   rn   ri   rc  find_elementr   TAG_NAMEr   execute_scriptr   )	r_  urlr  settle	max_trieslast_excattemptcerH  s	            rA   safe_getr{  F  s    )-HIM* 1#)	((1O
V%7>>r#34ARART]9^47L24E))1r7+)0L+.w<"+<( JJsO JJv##BKK8 51#f  K       	H%%&67 JJv##BKK8 ! 	H		JJ}% 		

4'>"s   D-BD?&D-& D	DD-DD-	D*&D-)D**D--	G6F*9E
F*	EF*EF*0 F	F"F*!F""F**G6F==GG	G$#G$c                    g }| fD ]  }||vs|j                  |        d| v r*| j                  ddd      }||vr|j                  |       |S | j                  ddd      }||vr|j                  |       |S )zOReturn a small set of URL variants (www/non-www) to dodge occasional redirects.z//www.z//r   )appendrH   )ru  outsuu2s       rA   _url_variantsr    s    DU D=KKN 3[[4+T>KKO
 K [[x+T>KKOKrC   c                    t        | dddd      }t        ||      5 }|j                         }d d d        j                  dd	      S # 1 sw Y   xY w)
Nz_Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36z?text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8zen-US,en;q=0.9)r  AcceptzAccept-Languager  r  rO   rH   r  )r   r   r!  r"  )ru  r  r7  r  rq   s        rA   _fetch_htmlr    s`    
{W/
C 
g	& !vvx ;;wy;11 s   AAc                  <     e Zd ZdZh dZ fdZd Zd Zd Z xZ	S )_VisibleTextExtractorz;Very small HTML->visible text extractor (no external deps).>   r   brh1h2h3h4h5h6litdthtrdivfooterheaderarticlesectionc                0    t         |           g | _        y r   )superr   parts)r   	__class__s    rA   r   z_VisibleTextExtractor.__init__  s     "
rC   c                t    |j                         | j                  v r| j                  j                  d       y y Nr{   r
  _BLOCK_TAGSr  r}  )r   tagattrss      rA   handle_starttagz%_VisibleTextExtractor.handle_starttag  -    99;$***JJd# +rC   c                t    |j                         | j                  v r| j                  j                  d       y y r  r  )r   r  s     rA   handle_endtagz#_VisibleTextExtractor.handle_endtag  r  rC   c                b    |r-|j                         r| j                  j                  |       y y y r   )rK   r  r}  )r   rq   s     rA   handle_dataz!_VisibleTextExtractor.handle_data  s&    DJJLJJd# !4rC   )
r   r   r   r   r  r   r  r  r  __classcell__)r  s   @rA   r  r    s!    EK#$$$rC   r  c                4   t               }	 |j                  |        dj                  |j                        }t        j                  |      }t        j                  dd|      }t        j                  dd|      }|j                         S # t        $ r Y yw xY w)Nr   z\n{3,}z

z[\t\r]+rG   )
r  feedro   joinr  html_libunescaperI   rJ   rK   )htmlr   raws      rA   _html_to_textr    s    A	t ''!''
C


C
 C
&&FC
(C
&&S#
&C99;  s   B 	BBc                >   t        j                  d| t         j                  t         j                  z        }|sy|j	                  d      }t        j
                  dd|      }t        j                  |      }t        j
                  dd|      j                         }|S )Nz<h1\b[^>]*>(.*?)</h1>flagsr   r   <[^>]+>rG   rF   )	rI   search
IGNORECASEDOTALLgrouprJ   r  r  rK   )r  minners      rA   _extract_first_h1r    sw    
		*D		8QRAGGAJEFF:sE*Ee$EFF63&,,.ELrC   c                   dhh dd	d}d
fd}t        j                  d| t         j                  t         j                  z        }|r! ||j	                  d            } ||      r|S t        j
                  d| t         j                  t         j                  z        D ]%  } ||j	                  d            } ||      s#|c S  t        |       S )a  Extract a likely product title from HTML.

    Many Red Wing product pages place the actual product name in <h3>, while <h1>
    can be a site/banner header (e.g., 'RED WING FOR BUSINESS'). We therefore
    try meaningful <h3> first, then fall back to <h1>.
    r    >	   
SIZE & FITRELATED PRODUCTSCARESIZINGDETAILSREVIEWSFEATURES
TECHNOLOGYSPECIFICATIONSc                    t        j                  dd|       } t        j                  |       } t        j                  dd|       j	                         } | S )Nr  rG   rF   )rI   rJ   r  r  rK   )r  s    rA   _cleanz&_extract_first_heading.<locals>._clean  sD    z3.!!%(vsE*002rC   c                    | xs dj                         } | sy| j                         }|v s|v ryt        |      dk  ryt        j                  d|      syy)Nr   Fr   z[A-Z]T)rK   upperlenrI   r  )rL   upBADSTOPs     rA   _okz#_extract_first_heading.<locals>._ok  sV    W"OOWWY9d
r7a<yy2&rC   z<td[^>]*class=['\"][^'\"]*prTitle[^'\"]*['\"][^>]*>\s*Name\s*</td>\s*<td[^>]*class=['\"][^'\"]*prValue[^'\"]*['\"][^>]*>(.*?)</td>r  r   z<h3\b[^>]*>(.*?)</h3>)r  r   r   r   rL   r   r   bool)rI   r  r  r  r  finditerr  )r  r  r  mnamerL   mh3r  r  s         @@rA   _extract_first_headingr    s     #
#CD
 II	Immbii'	E 5;;q>"q6H {{3TQSQZQZAZ[ 399Q< q6H
 T""rC   c                t   t        | d      }|j                  dd      xs |j                  d      }|j                  d      }|j                  d      }|j                  d	d
      }t        j                  d|      }|r|j                  d      n|j                         }|j                  d	d
      }	||	|fS )zExtract (style_number, name, style_text) using the ONLY allowed method.

    IMPORTANT: Per user instruction, this function intentionally uses *only* the
    shoeguide/printSpacing -> h3 + strong logic (no fallbacks).
    zhtml.parserr  	shoeguide)class_printSpacing)idr  strongrG   T)rK   z	#\s*(\d+)r   )r   findget_textrI   r  r  rK   )
r  ru  soup	guide_divh3_tag
strong_tag
style_textmatchstyle_numberr   s
             rA    extract_style_and_name_from_htmlr    s     }-D		%	4T		^	8TI^^D!F)JD1JIIlJ/E%*5;;q>
0@0@0BLs$/Dz))rC   c                   g d}t        j                          dz   }t        j                          |k  rd}|D ]W  }	 | j                  t        j                  |      }|j	                         r$|j                         r|j                          d} nY |sy t        j                  d       t        j                          |k  ry y # t        $ r Y w xY w)N)zN//button[contains(.,'Agree') or contains(.,'Accept') or contains(.,'Proceed')]zI//a[contains(.,'Agree') or contains(.,'Accept') or contains(.,'Proceed')]uR   //button[contains(.,'Close') or contains(.,'×') or contains(@aria-label,'Close')]g       @FT皙?)	ri   rr  r   XPATHis_displayed
is_enabledclickro   rc  )r_  xpathsendclickedxpels         rA   dismiss_popupsr  *  s    F
 ))+
C
))+
 	B((26??$HHJ"G	 

3 ))+
  s   AC	CCc           	     x   d}d}t        |      D ]  }t        |        | j                  t        j                  d      }t        |D ch c]%  }|j                  d      s|j                  d      ' c}      }||k(  r|dz  }nd}|}|dk\  r y | j                  d       t        j                  d        y c c}w )	Nr   $//a[contains(@href,'/safety-boot/')]hrefr   r   z/window.scrollTo(0, document.body.scrollHeight);g      ?)
ro  r  find_elementsr   r  r  get_attributert  ri   rc  )r_  
max_roundsstable_rounds
last_count_linksrz   counts           rA   scroll_to_load_allr  A  s    MJ: v$$RXX/UVeWqv?VQ__V,WXJQMMJAOP

3 Xs   B7
B7
c                    | xs dj                         } t        j                  dd|       } | sy| j                         }|dv ryt        j                  d|       ryd| v ryt        |       dk  ry| S )Nr   rF   rG   >   
QUICK VIEWADD TO CARTVIEW DETAILSr  r  r  r  r    z\d{2,}$r   )rK   rI   rJ   r  	fullmatchr  )rL   r  s     rA   _clean_listing_namer  U  su    	
bA
vsAA		A 	 	 	||Iq!
ax
1vzHrC   c                x   g }	 |j                  | j                  d      xs d       	 |j                  | j                  d      xs d       	 |j                  | j                  xs d       g }|D ]B  }|st	        |      j                         D ]!  }t        |      }|s|j                  |       # D t               }g }|D ])  }||vs|j                  |       |j                  |       + |sydd}|j                  |d       |d   S # t        $ r Y w xY w# t        $ r Y w xY w# t        $ r Y w xY w)	zJBest-effort: read the product name as displayed on a catalog listing tile.z
aria-labelr   r   c                *    t               }t        j                  d       r|dz  }t        j                  d j                               r|dz  }t        j                  d j                               r|dz  }t	         fddD              r|d	z  }|S )
Nz[A-Za-z]   \bWOMEN'?S\br   
\bMEN'?S\br;   c              3  B   K   | ]  }|j                         v   y wr   )r
  .0r@  rL   s     rA   	<genexpr>z;_listing_name_from_anchor.<locals>.score.<locals>.<genexpr>  s     X!qAGGI~Xs   )viewquickcartcomparewishlistr8   )r  rI   r  r  any)rL   scs   ` rA   scorez(_listing_name_from_anchor.<locals>.score  sz    V99[!$"HB99_aggi0"HB99]AGGI.!GBX'WXX"HB	rC   T)keyreverser   )rL   r   r   rm   )
r}  r  ro   rV   r   
splitlinesr  setaddsort)rz   
candidateslinesclnseenuniqr  s           rA   _listing_name_from_anchorr!  o  sR   J!//,7=2>!//'28b9!&&,B' E !a&##% 	!B$R(BR 	!! 5DD T>HHRLKKO
 
 	II%I&7NY      s4   $D $D D- 	DD	D*)D*-	D98D9c                    | r|syd}| D ]l  }t        |t              rt        |      dk  r"|d   xs dj                         }|s<t	        |j                  |d            }|sZ|d   |k7  sc||d<   |dz  }n |S )zQOverwrite Name column using preferred mapping (by style). Returns number updated.r   r:   r   r   )
isinstancelistr  rK   r  rn   )r^   	preferredupdatedr  styleprefs         rA   apply_preferred_namesr)    s    yG 	!T"c!fqj1""$"9==#;<AaDDLAaDqLG	 NrC   c                   g }t               }t               }i }t        | d      }t        D ](  }t                t	        | |       |j                  t        j                  t        j                  df             t        |        t        |        | j                  t        j                  d      D ]  }|j                  d      }|sd|vr||v r!t        |      }	t        r&|	r$d|v r 	 t!        t#        |            }
|
r	|	|vr|
||	<   |	r'|	|v r|j'                  |       p|j'                  |	       |j'                  |       |j)                  |        + ||fS # t$        $ r Y _w xY w)zCollect unique product links from all configured catalog sections.

    Dedupes by style number when possible (preferred), otherwise by URL.
       r  r  z/safety-boot/zcatalog=international)r  r   rh   rh  r{  untilECpresence_of_element_locatedr   r  r  r  r  r  extract_style_from_url"PREFER_INTERNATIONAL_LISTING_NAMESr  r!  ro   r  r}  )r_  	all_links	seen_href
seen_styler_   waitru  rz   r  r'  nms              rA   collect_product_linksr6    si   
 I%I5J&(O$D   #

211288=c2defv6"%%bhh0VW 	#A??6*Dd*y *40E1e@W[^@^,-Fq-IJBe?:13. J&MM$'u%MM$T"3	# #D o%% ! s   (E	E'&E'c                    t        j                  d|       }|r|j                  d      S t        j                  d|       }|r|j                  d      S dS )Nz/safety-boot/(\d+)[-/]r   z/safety-boot/(\d+)r   )rI   r  r  )linkr  s     rA   r/  r/    sJ    
		+T2Awwqz
		'.A1771:""rC   c                      xs dj                         j                           syd}t         fd|D              ryt               dkD  ryy)Nr   T)zwindow.openzjavascript:zfacebook.comzhttp://zhttps://c              3  &   K   | ]  }|v  
 y wr   r?   r
  s     rA   r  z#_looks_like_junk.<locals>.<genexpr>  s     
'a16
'   r  F)rK   r
  r  r  )rL   junk_tokenss   ` rA   _looks_like_junkr=    sI    	
b!AWK

';
''
1v{rC   c                    	 | d   xs dj                         }| d   xs dj                         }|sy|j                         t        k(  ryy# t        $ r Y yw xY w)Nr   r   r   TF)rK   ro   r  BAD_NAME_SENTINEL)rowr'  r   s      rA   _is_bad_rowrA    sd    Q2$$&A"##% zz|((  s   .A 	AAc                   | j                  d      ry| j                  dg       xs g }t        | j                  dg       xs g       }d}t               }|D ]  }t        |t              r|s|d   xs dj	                         }|s0t        |      t        k  r|j                  |       Tt        |      dkD  r|d   xs dj	                         nd}t        |      dkD  r|d   xs dj	                         nd}|r|r|j                  |        |syt        |      D ]+  }	t        |	      }
|
s|
|v s|j                  |	       |dz  }- t        |      | d<   |S )	zIf checkpoint rows are missing URL/Image columns or have empty URL/Image, requeue those links.
    Runs at most once per checkpoint unless you delete/clear ck['media_repair_done'].
    r`   r   r^   r]   r   r:   r   r   )rn   r  r#  r$  rK   r  EXPECTED_COLSr  r/  discardsorted)rD  r^   r]   removedstyles_neededr  r'  url_cellimg_cellr  rA  s              rA   repair_missing_mediarJ    sZ    
vv!"66&"#DRVVL"-34JGEM %!T"!1""$q6M!e$+.q6A:AaDJB%%'2+.q6A:AaDJB%%'2xe$% Z  #D)"%t$qLG	 j)B|NrC   c                   	 t        | j                  dg       xs g       }t        | j                  dg       xs g       }|r|sy|D ch c]  }|st        |      s|d    }}|syd}t        |      D ]+  }t	        |      }|s||v s|j                  |       |dz  }- |rt        |      | d<   t        | j                  di       xs i       }t        |j                               D ]'  }	t	        |	      }|s||v s|j                  |	d       ) || d<   | j                  dg       xs g D 
cg c]  }
t	        |
      |vr|
 c}
| d<   t        |        |S c c}w c c}
w # t        $ r Y yw xY w)aa  If checkpoint contains obviously bad rows, un-mark those links as 'done'.

    This fixes the situation where a previous run captured the site header/ads into the Name/Brand
    columns and those rows are now 'stuck' because resume logic skips already-done links.

    Returns:
        Number of links that were re-queued (removed from done_links).
    r^   r]   r   r   ra   Nrb   )r$  rn   r  rA  r/  removerE  dictkeyspoprx   ro   )rD  r^   r]   r  
bad_stylesrF  r8  rA  fckr  s              rA   repair_bad_checkpoint_rowsrS  4  s   $BFF62&,"-b17R8
:$(AqA+a.adA
A$ 	D'-BbJ&!!$'1		 %j1B| bff]B/526B"'')_ $+A."
*FF1dO$ !#B} 4666:Mr3R3XVX 'VQ)?)B*)T () 'VB"# B9 B.'V  sa   AE& E& 	EEE$E& )E& E& A0E& =E& 1E& 3E!	E& 
E& &	E21E2c           
        | xs dj                         D cg c]  }|j                          }}t        |      D ]  \  }}|s	|j                  |      s|t	        |      d j                  d      }|r|c S t        |dz   t        |dz   t	        |                  D ]!  }||   xs dj                         }|s|c c S   y yc c}w )u	  Parse a simple field/value from extracted page text.

    Red Wing pages often render fields in tables so the extracted text looks like:

        Name
        DynaForce®

    We support both 'Name: DynaForce' and 'Name' on one line with the value on the next.
    r   Nz :	r   r   )r  rK   	enumerater  r  ro  r   )	
text_block
field_namer  r  ir   tailjnxts	            rA   parse_field_liner\  f  s     $.#3"?"?"ABBRXXZBEBU# 4??:&J()//7D1q5#a!eSZ"89 Qx~2,,.J   Cs   Cc                N   | xs dj                         }t        j                  d|t        j                        }|rz|j	                  d      j                         }|rYg }|j                         D ]3  }|j                  |j                         r|n|j                                5 dj                  |      S |j                         D ]  }t        j                  d|t        j                        s*t        j                  d|j                         t        j                        }|r||j	                  d      j                         }|r[g }|j                         D ]3  }|j                  |j                         r|n|j                                5 dj                  |      c S  y y)Nr   z^(.*?)\s+style\s*#\s*\d+r  r   rG   z\bstyle\s*#\s*\d+\bz^(.*?)\s+style\s*#\s*\d+\b)rK   rI   r  r  r  splitr}  isupper
capitalizer  r  )	header_text	body_texthtr  r  outr   r   m2s	            rA   extract_brandrf    sE   

	"	"	$B
		-rGAggaj CYY[ A

		1@A88C= $$& 
99+TG8$**,bmm\Bhhqk'')C YY[ I

		1HI88C=(
 rC   c                    |rM| j                         j                  d|       }|dk7  r&| ||dz    }t        |d      }|r|j                         S t        | d      }|r|j                         S dS )Nz
ABOUT THE r  i@  r!   r   )r  r  r\  rK   )rb  r'  idxchunkvals        rA   extract_about_namerk    st    oo$$z%%9:"9c3:.E"5&1Cyy{"
9f
-C399;%2%rC   c           	     "   t        d t        j                  d| j                               D              }|t        d t        j                  d| j                               D              z  }t	        dd      D ci c]
  }| d||v  c}S c c}w )Nc              3  2   K   | ]  }t        |        y wr   rm   r  r   s     rA   r  z parse_heights.<locals>.<genexpr>  s     V1AV   z\b(\d{1,2})\s*-\s*INCH\bc              3  2   K   | ]  }t        |        y wr   rn  ro  s     rA   r  z parse_heights.<locals>.<genexpr>  s     SAQSrp  z\b(\d{1,2})\s+INCH\b      ")r  rI   findallr  ro  )rV   foundrX  s      rA   parse_heightsrw    st    V

+F

 UVVE	SS,CTZZ\!RSSSE+0B<8aqcGa5j 888s   :Bc                    | xs dj                         }|j                  d      }|j                  d      }|j                  d      }d|v r|j                  d      sd|v rd}|||fS )Nr   zred wingzirish setterworxzby red wingT)r
  r  )	brand_strbis_rwis_isis_worxs        rA   classify_brand_familyr    si    	b!ALL$ELL(Ell6"G!all62!9K%  rC   c                   | xs dj                         }t        j                  dd|      }|j                  dd      }t        j                  dd|t        j                        }|j                         j                  dd      }t        j                  dd|      }|sd	}t        |      |kD  r|d | j                  d      }|S )
Nr   rF   rG      ®z[^\w\-\.\s]+r  r  z_+boot)rK   rI   rJ   rH   UNICODEr  r   )r   max_lenrL   s      rA   _safe_filename_from_namer    s    	A
vsAA			$A
ARZZ8A		#s#A
uc1A
1vhwKs#HrC   c                     t        j                         j                  dz  j                         } | j	                  dd       | S )NImagesTr   )r   cwdparentresolver   )r   s    rA   _images_dirr    s6    			X	%..0AGGD4G(HrC   c                6    t        t        d      dz  | z        S )Nz..r  )r   r   )fnames    rA   _rel_image_pathr    s    tDzH$u,--rC   c                6   	 | j                  t        j                  d      }|j                  t        j                  d      }|r|j	                         r|S 	 | j                  t        j                  d      }|s!	 | j                  t        j                  d      }d }d}|D ].  }	 |j	                         s|j                  xs i }t        |j                  d      xs d      }t        |j                  d      xs d      }	|dk  s|	dk  rm|j                  d	      xs d
j                         |j                  d      xs d
j                         |j                  d      xs d
j                         ||	z  }
t        fddD              r|
dz  }
t        fddD              r|
dz  }
t        fddD              r|
dz  }
|
|kD  r|
}|}1 |S # t
        $ r Y w xY w# t
        $ r g }Y {w xY w# t
        $ r g }Y jw xY w# t
        $ r Y tw xY w)NproductImageimgmain imgr   r   r   heightx   altr   classsrcc              3  &   K   | ]  }|v  
 y wr   r?   )r  rR  r  s     rA   r  z1_pick_best_product_img_element.<locals>.<genexpr>  s     P18Pr;  )r  shoechukkahikermocg333333?c              3  &   K   | ]  }|v  
 y wr   r?   )r  rR  clss     rA   r  z1_pick_best_product_img_element.<locals>.<genexpr>  s     M18Mr;  )productprimaryheroimagegffffff?c              3  &   K   | ]  }|v  
 y wr   r?   )r  rR  r  s     rA   r  z1_pick_best_product_img_element.<locals>.<genexpr>  s     R18Rr;  )z	/dw/imagescene7z/imagesstaticg?)rr  r   IDCSS_SELECTORr  ro   r  rs  sizer   rn   r  r
  r  )r_  	containerr  imgsbest
best_scorer  szr   hr  r  r  r  s              @@@rA   _pick_best_product_img_elementr    s   ''~>	$$R__e<3##%J
##BOOZ@ 	''U;D DJ 	??$BBbffWo*+AbffX&+!,A3w!c'##E*0b779C##G,299;C##E*0b779CEEP%OPPM%LMMR%QRRz!"
+0 KQ    
  	D	6  		sV   AG  G' = G9 (H9AHCH	G$#G$'G65G69HH	HHc                ~   	 | j                  t        j                  d      }	 |j                  t        j                  d      }|rP|j                  d      xs dj                         }|j                  d      xs dj                         }|s|r|xs |S dD ]u  }	 | j                  t        j                  |      }|j                  d      xs dj                         }|j                  d      xs dj                         }|s|r|xs |c S w y# t        $ r d }Y w xY w# t        $ r Y w xY w# t        $ r Y w xY w)Nr  r  zoomimgr   r  )z$li[data-orbit-slide="product-1"] imgzli.active imgzul#productImage li.active imgzmain li.active imgr  )rr  r   r  r  ro   r  rK   )r_  r  r  r>  rL   sels         rA   _extract_product_image_urlr    sC   ''~>		((%@C ""9-3::<A""5)/R668AAvA
 	%%boos;C""9-3::<A""5)/R668AAvA   7  	C	  "  		sH    D!  D AD! A/D0DD! DD! !	D-,D-0	D<;D<c                &   |xs dj                         }|syd| d}t               |z  }	 t        |       }|rl|j                  dd      }ddi}t	        j
                  ||d	      }|j                  r2|j                  r&|j                  |j                         t        |      S d
}	 t        |       }|sy	 | j                  d|       t        j                  d       	 t        |        	 |j!                  t#        |             t        |      S # t        $ r Y vw xY w# t        $ r d
}Y yw xY w# t        $ r Y ]w xY w# t        $ r Y `w xY w# t        $ r9 	 | j%                  t#        |             t        |      cY S # t        $ r Y Y yw xY ww xY w)a	  Save the main product image as a temp file (__<style>.png).

    Preferred: download the product image URL (zoomimg/src) for best quality.
    Fallback: element screenshot if download fails.
    Returns relative path like ../Images/__595.png (or empty string).
    r   __.pngz&amp;&r  zMozilla/5.0r8   )r  r  Nz?arguments[0].scrollIntoView({block:'center', inline:'center'});r  )rK   r  r  rH   requestsrn   okcontentwrite_bytesr  ro   r  rt  ri   rc  r  
screenshotr   save_screenshot)r_  r'  r  out_pathimg_urlr  r  img_els           rA   capture_product_image_tempr  )  s    [b!EtE}u$H
,V4oogs3G#]3GWgrBAtt		$$QYY/&u--
 F/7 _agh

3v#h-(u%%5      
    	""3x=1"5)) 			sx   A8D (D 7'D0 D? +$E 	DDD-,D-0	D<;D<?	E
E	F$E?<F?	FFFFc                   |syt        j                         |z  j                         }|j                         s|S t	        |      }| d}t               |z  }|j                         r`| d|  d}t               |z  }|j                         r;t        dd      D ],  }| d|  d| d}t               |z  }|j                         r, n 	 |j                  |       t        |      S # t        $ r |cY S w xY w)zNRename ../Images/__<style>.png -> ../Images/Name_of_Boot.png (collision-safe).r   r  r  r:   r   )
r   r  r  rf   r  r  ro  rH   ro   r  )r'  	boot_namerel_temp_pathtemp_absstemr  dest_absrX  s           rA   finalize_image_filenamer  d  s   
]*335H??#I.DfDME}u$H&%%=5(??1c] &%!D1&=50(	" 5!!  s   	C% %C32C3c                b   | xs dj                         }|xs dj                         }t        j                  d|      sd|v sd|v sd|v ryt        j                  d|      sd|v sd	|v sd
|v ryt	        t        j                  d|            }t	        t        j                  d|            }||fS )zInfer gender flags from URL + header (do NOT use body text; it frequently contains both words).

    Priority:
      1) URL slug/query (mens/womens)
      2) Header text (MEN'S / WOMEN'S)
      3) Otherwise: unknown -> (False, False)
    r   z/womens(?:[-/]|$)zwomens-zgender=womenzgender=female)FTz/mens(?:[-/]|$)zmens-z
gender=menzgender=male)TFr  r  )r  r
  rI   r  r  )r  
source_urlhuulmalefemales         rA   infer_genderr    s     ,B			B

	!	!	#B 
yy%r*i2oSUAUYhlnYn	yy#R(GrM\R=OS`dfSf		-,-D"))OR01F&>rC   c                	  / |xs d}|xs d}|j                         }|j                         }t        ||      }t        |      rd}t	        |      \  }	}
}|sO|j                         }|j                         }d|v sd|v rd}nd|v sd|v rd}n
d|v sd|v rd}t	        |      \  }	}
}|j                         }t        ||      \  }}d|v }d	|v xs d
|v }d|v xs
 d|v xs d|v }d|v xs
 |xs |xs |}| }d|v xs d|v }d|v }t        |d      }|ry|j                         j                         }t        j                  d|      rd}net        j                  d|      }|r	 t        |j                  d            dkD  }n.d|v xs d|v }n#t        j                  d|      rd}n
d|v xs d|v }d|v xs t        j                  d|      d u}d|v xs t        j                  d|      d u}d|v xs d |v }d!|v xs t        j                  d"|      d u}d#|v xs
 d$|v xr d%|v } d&|v }!t        |d'      }"|"r|"j                         j                  d(      nd)|v }#t        |d*      }$d}%|$r&|$j                         /t        /fd+d,D              sd}%|xs dj                         }&d-|&v xs
 d.|&v xs d/|&v }'d0|&v }(d1|&v })t        |d2z   |z         }*t        |d3      }+|+j                         },d4|,v xs d5|,v }-d6|,v xs d7|,v xs |-}.g t!        |       t!        |      t!        |      t!        |      t!        |      t#        |      t#        |      t#        |	      t#        |
      t#        |      t#        |      t#        |      t#        |      t#        |      t#        |      t#        |      t#        |      t#        |      t#        |      t#        |      t#        |      t#        |      t#        |       t#        |!      t#        |#      t#        |%      t#        |'      t#        |(      t#        |)      t#        |*d8         t#        |*d9         t#        |*d:         t#        |*d;         t#        |*d<         t#        |*d=         t#        |*d>         t#        |*d?         t#        |.      t#        |-      S # t        $ r d}Y %w xY w)@Nr   zIRISH SETTERr(   WORXr)   zRED WINGr'   z	STEEL TOEzALUMINUM TOEz	ALLOY TOEzNON-METALLIC TOEzNON METALLIC TOEzCOMPOSITE TOEz
SAFETY TOEzMETATARSAL GUARDz	MET GUARD
WATERPROOFr+   z$\b(non[-\s]?insulated|uninsulated)\bFz(\d+)\s*g\br   r   Tinsulat
thinsulatezSLIP RESISTANTz\bSR\bzELECTRICAL HAZARDz\bEH\bzPUNCTURE RESISTANTPUNCTUREzSTATIC DISSIPATIVEz\bSD\bzANKLE PROTECTIONankleprotectBOAr,   r   zDEFINED HEELzLeather Typec              3  &   K   | ]  }|v  
 y wr   r?   )r  r   lls     rA   r  z'_build_row_from_text.<locals>.<genexpr>  s     `q17`r;  )meshnylonfabrictextilepoly	syntheticoxfordathleticr  r  r  r{   zCountry of Originzmade in usazmade in the usazbuilt in usazassembled in the usar/   r0   r1   r2   r3   r4   r5   r6   )r  r
  rf  r=  r  r  r\  rK   rI   r  rm   r  ro   r  r  rw  rM   rB   )0r'  r   r  rb  r  image_rel_path
body_upper
body_lowerrz  r|  r}  r~  r  buheader_upperr  r  	steel_toealuminum_toenon_metal_toe
safety_toesoft_toe	met_guard
waterproofinsulation_lineil	insulatedr  slip_resistantelectrical_hazardpuncture_resistantstatic_dissipativeankle_protectionboadefined_heel_linedefined_heelleather_lineall_leather_upper
name_loweroxford_athleticr  r  heightsorigin_lineorigin_lowermade_in_usabuilt_in_usar  s0                                                  @rA   _build_row_from_textr    sK    RI\rF"J"Jfi0I 	"	1)<E5'\\^__R>R#7&Ir\Vr\I2r!1"I 5i @ug<<>L
3LD& z)I!Z/L;*3LL':5;MQ[;[apt~a~M*,[[l[mJ~H#z1Q{j7PI+J&y,?O""$**,99<bAI		."-A% #AGGAJ! 3I '"_E,"2D	99<jII"j0Qlj6PI&*4f"))Iy:Yae:eN,
:l		)U^@_gk@k.*<[*PZBZ.*<n"))IW`BaimBm*j8pg>S>oXaeoXoJC(NCBS$**,77>ZhlvZvL#I~>L!`$_`` $*"##%J:-f:3KfQW[eQeO#Fz!EFTMI56G"9.ABK$$&L L0X6G<6WK"l2n8NR^8^ncnL0u0t0 	z"0 	~&	0
 	y!0 	D	0 	F0 	E
0 	E
0 	G0 	J0 	I0  	M!0" 	L#0$ 	I%0& 	H'0* 	J+0, 	I-0. 	N/00 	102 	304 	506 	708 	C90< 	L=0> 	?0B 	OC0D 	FE0F 	E
G0J 	GDMK0L 	GDMM0N 	GDMO0P 	GDMQ0R 	GDMS0T 	GENU0V 	GENW0X 	GENY0\ 	L]0^ 	K_0 0O ! % $I%s   #S SSc           	         d }t        |       D ]A  }	 t        |d      }t        ||      \  }}}|}t        |      }	t	        ||||	| |      }
|
c S  |r|t        d      # t
        $ r}|}Y d }~cd }~ww xY w)Nr8   r  r  r  zHTTP fallback failed)r  r  r  r  r  ro   r$  )r8  image_temp_relrx  r  r  r'  r   r  r  rb  r@  rH  s               rA   _scrape_product_via_httpr  )  s    (,H4  		q"-D&FtQ&O#E4F%d+I&udFIRVguvCJ	 
-
..  	H	s   <A!!	A6*A11A6prefer_httpc               \   d}t        |      }|}t                |r	 t        ||      S 	 t	        | |       t        |        | j                  xs d}t        ||      \  }}}	 |rt        | |      }t        | d      j                  t        j                  t        j                  df             | j                  t        j                  d      j                   xs d}	|}
t#        |||
|	||      }|S # t        $ r Y w xY w# t        $ r d}Y w xY w# t        $ r t        ||      cY S w xY w)Nr   )r  r+  rm  r  )r/  rh  r  ro   r{  r  page_sourcer  r  r   r,  r-  r.  r   rs  rr  rV   r  )r_  r8  r	  r  r'  style_from_urlr  r   r  rb  r  r@  s               rA   scrape_productr  ;  s5   N"4(ENO	+DPP
Mv !!'R"B4"NtZ	 !;FE!J 	fb!''(F(FU[G\(]^''V<AAGR	"5$	dcqr
1  		   	 N	   M'^LLMsG   C2 7D #D 1B D 2	C>=C>DD DD D+*D+c           	     <   t         }dd}t        | |      }g }|j                  ddj                  |      z   dz          |j                  ddj                  dgt	        |      z        z   dz          |D ]  }t	        |      t	        |      k  r|dgt	        |      t	        |      z
  z  z   }n%t	        |      t	        |      kD  r|d t	        |       }|j                  ddj                  |      z   dz           t        t        dj                  |      dz          y )Nc                x    | d   }	 dt        t        j                  dd|            fS # t        $ r d|fcY S w xY w)Nr   z\Dr   r   )rm   rI   rJ   ro   )r  rL   s     rA   	style_keyz!write_markdown.<locals>.style_keye  sE    aD	s266%Q/011 	q6M	s   !) 99)r  rE   z---r   r{   )r  	List[str])
MD_HEADERSrE  r}  r  r  rX   OUT_MD)r^   r  r  sorted_rows	out_linesr  s         rA   write_markdownr  b  s   G 9-KIS388G,,s23S388UGc'l$:;;cAB 2q6CL RDCL3q6122AVc'l"-3w< Asxx{*S012 9-45rC   c                 0   t         j                         sg S g } t         j                  dd      j                         D ]U  }|j	                         }|st        j                  d|      }|s/|j                  d      }|| vsE| j                  |       W | S )NrO   rH   )rQ   r  zhttps?://\S+r   )	r}   rf   rl   r  rK   rI   r  r  r}  )urlsr   r  r  s       rA   _read_errors_urlsr  |  s    	D$$gi$HSSU 	zz|IIot,
A}A	 KrC   c                    | s	 t         j                  d       y | D cg c]  }dt         d|  }}t	        t         dj                  |      dz          y # t        $ r Y y w xY wc c}w )NTr  FAILED x: r{   )r}   r,  ro   MAX_FAILS_PER_LINKrX   r  )r  r  r  s      rA   _rewrite_errors_filer    sv    	. 	;?@aw)*#aS1@E@TYYu-45	  		 As   A A&	A#"A#c                    t               } t        | j                  dg             }t        | j                  di             }t	        | j                  dg             }t        | j                  dt        j                                     }t	        | j                  dg             }|s
t               }t        ryt        |       }|rlt        | j                  dg             }t        | j                  di             }t	        | j                  dg             }t	        | j                  dg             }d }d }	 t        | ddd      }| j                  d	      xs g }	| j                  d
      xs i }
t        s|	st        d|      }|j                  dd       t        |      \  }}|r|}	nt        j                   j#                  d       |r|}
	 t$        r4|
r2t'        ||
      }|r$|| d<   |
| d
<   t)        |        t+        d| d       |	| d	<   t$        r|
| d
<   t)        |        t$        rV|
sT	 t        |      \  }}
|
| d
<   t)        |        	 |
r2t'        ||
      }|r$|| d<   |
| d
<   t)        |        t+        d| d       t.        rHt1        |       }| j                  d      sd| d<   t)        |        |rt+        d| d       t)        |        t2        j5                         rFt2        j5                         }|	xs g D cg c]  }t7        |      |k(  s| }	}|	| d	<   t)        |        t9        |	      }|r|j;                          t        ||      }t9        |      }|j                  |d       t=        |      D ci c]  \  }}|s	|d   s|d   | }}}|	D ]  }||v r|dz  }|j                  |d       !d}	 	 t?        ||      }|d   stA        d|       |d   stA        d|d    d| d      t$        r0|
r.tC        |
j                  |d   d            }|rtE        |      |d<   t9        |      dkD  r|d   stE        |      |d<   	 t9        |      dkD  r'|d   r"tE        tG        |d   |d   |d               |d<   |d   |v r||||d      <   n"t9        |      ||d   <   |jI                  |       |jK                  |       tM        |      | d<   || d<   || d<   || d<   t)        |        tO        |       d}	 |dz  }|j                  ||rd&nd'        tO        |       |j                  |d(       tb        rT|rQt        j                   j#                  d)t9        |       d*       g }	 |r	 |ja                          t        | dd+dtd        ,      }t=        t	        |      d-      D ]  \  }}d}tg        th              D ]V  }	 t?        ||d.      }|rC|d   r>|d   r9|d   |v r||||d      <   n"t9        |      ||d   <   |jI                  |       d} nX |r3t        j                   j#                  d1| d#t9        |       d2| d%       |jI                  |        |}|| d<   || d<   t)        |        tO        |       tm        |       t        j                   j#                  d4tn         d%       tp        js                         r't        j                   j#                  d5tp         d%       	 |r|j;                          	 |r|ja                          y y # t,        $ r Y (w xY w# t,        $ r
 |
xs i }
Y w xY w# t,        $ r Y w xY wc c}w c c}}w # t,        $ r Y w xY w# t,        $ rx}tQ        |j                  |d            dz   ||<   tM        |      | d<   || d<   || d<   || d<   t)        |        ||   tR        k\  r^||vr%|jI                  |       tU        d ||    d!|        |jK                  |       tM        |      | d<   || d<   t)        |        Y d }~0t        j                   j#                  d"||    d#tR         d$| d%       t        j                   j#                  djW                  tY        jZ                  t]        |      ||j^                              d%z          	 |r|ja                          n# t,        $ r Y nw xY wt        | ddd      }Y d }~nd }~ww xY w=# t,        $ r Y }w xY w# t,        $ r; 	 |j                  d/       n# t,        $ r Y nw xY wt        jj                  d0       Y w xY w# t,        $ r}t        j                   j#                  d3       t        j                   j#                  djW                  tY        jZ                  t]        |      ||j^                              d%z          Y d }~kd }~ww xY w# t,        $ r Y w xY w# t,        $ r Y y w xY w# 	 |r|j;                          n# t,        $ r Y nw xY w	 |r|ja                          w w # t,        $ r Y w w xY wxY w)6Nr]   ra   r^   rc   rb   TrK  )r]  rL  rM  r\   r_   r   )r   rc   r   zRefreshing product links...)r   zN[warn] Link refresh discovered 0 links; falling back to checkpoint link list.
z)Applied preferred International names to z existing rowsr`   z
Re-queued z links due to missing URL/Imagestartingz	(resumed)Fz$Style number parsed empty for link: zName parsed empty for style r   )r   r:   r   r  r  z
Error scraping link (attempt r   z): r{   r  skippeddonez
Starting salvage pass for z failed links...
normal)r]  rL  rM  rN  )startr  rn  g      ?z	Salvaged z: z$
Salvage pass encountered an error:
z
DONE. Wrote: z#Some links failed repeatedly; see: ):rs   r  rn   rM  r$  r   ri   r  AUTO_REPAIR_BAD_ROWSrS  r`  REFRESH_PRODUCT_LINKS_EACH_RUNr   r   r6  r   stderrr~   r0  r)  rx   printro   AUTO_REPAIR_MISSING_MEDIArJ  r   rK   r/  r  r   rU  r  r$  r  rM   r  r}  r  rE  r  rm   r  r   r  	tracebackformat_exceptiontype__traceback__quitENABLE_SALVAGE_PASSSALVAGE_PAGE_LOAD_TIMEOUTro  SALVAGE_MAX_TRIES_PER_LINKrc  r  r  r}   rf   )rD  r]   ra   r^   rc   rb   repairedreporterr_  r\   r_   fresh_linksfresh_preferred_namesnupdr  rmwantr  r   	processedrX  r  style_to_idxr8  r  r@  r(  rH  	remainingsalvageds                                 rA   mainr>    sq
   		BrvvlB78J"&rvvmR'@"AK !34DrvvlDIIK89J#'/BB(G#H-/
 -b1RVVL"56JrvvmR89Kvr*+D $RVV,?%D E ,0HFwrDW[_`/52*,&&1B*C*Ir)'aJGHOOA$AOB1Fv1N.K. +

  e %"7	5/0GD%)6
0?,-'+ I$~^_ #0B1(7$%B .o8%:6%B"?(7$%#
	"0GD%)6
0?,-'+ I$~^_ %%b)B66-.*.&'#
2$&EFG# ##%D)6)<"c1AWXYAZ^bAbQcMc"/BBM"NN#%JG
O		
3 -6dOJDAqqQqT!aJJ! V	IDz!Q		<BKm(6Cq6*-QRVQW+XYYq6*-I#a&QSTXSYYZ+[\\ :o2?3F3Fs1vr3RS%3D%9CF 3x!|CF!/!5As8a<CF%34KCPQFTWXYTZ\_`a\b4c%dCF
 1v-58\#a&12/24ySV,C(NN4('-j'9B|$!%BvJ(3B}%.?B*+#B'"4(BD NIOOIRTYOHmV	Ir 	tF+ #4JJ;C@Q<R;SSefg#%I+j '!'/!&&?  ).?)@J /GAt$H"#=> ,,"04"PC"s1v#a&#&q6\#9ADDc!f)=$>;>t9LQ$8$(KK$4+/ %,$  

((9QCq=N9O8PPRSWRXXZ)[\!((.//2 %.! BvJ&7B"#B4  !23

?6("56JJB:,bQR	 	 q    8"1"7R8    d K@ % . ! m(+KOOD!,D(E(IK%'-j'9B|$!%BvJ(3B}%.?B*+#B'"4(,>>'88-44T:-D8I7J#dV.TU"t,+1*+=<(2C./'+JJ$$9+d:K9LAN`Maadeidjjlm JJ$$RWWY-G-GQQRTUTcTc-d%ehl%lm!"KKM$ *2QXgklF=m] | % .  ) ,% &

= 9#, % $% JJsO,  j

  !IJ

  )C)CDGQPQP_P_)`!adh!hiij$  		
  			  			  		s)  4Bg? :\" #g? *\2 	4] =A>g? ;]]A(g? ;
]]])g? >B]35]#A;]3?A6g? 6e 9c: 	Ae Ad
Ae -Bg? ;g  g0 "	\/+g? .\//g? 2]g? ]g? 	]g? ]g? #	]0,]3/]00]33
c5=B-c0*g? 0B	c0:cc0	cc0cc0+g? 0c55g? :	de de 
	ed&%e&	d2	/e1d2	2e
e ee 	gA8gg? gg?  	g-,g-0	g<;g<?ihi	h ih  i$h86i8	iiii__main__)r@   r  r   r   )rL   r   r   r   )rU   r   rV   r   r   r   )r   r   )rq   r   r   r   )r   r   r   r   )r   r  )r   r   r   Optional[str])r   r   )r   r@  )r   zTuple[str, str])r3  r   r   r   )rD  r   r   r   )T)rD  r   r]  r  rL  r   rM  r  rN  rm   rO  r@  r   zwebdriver.Firefoxr   )
ru  r   r  rm   rv  r   rw  rm   r   r   )ru  r   r   r  )r8   )ru  r   r  rm   r   r   )r  r   r   r   )r  r   ru  r   r   zTuple[str, str, str])P   )r  rm   r   r   )r   r   )r^   List[List[str]]r%  zDict[str, str]r   rm   )r   z Tuple[List[str], Dict[str, str]])r8  r   r   r   r  )r@  r  r   r  )rD  r   r   rm   )rV  r   rW  r   r   r   )ra  r   rb  r   r   r   )rb  r   r'  r   r   r   )rV   r   r   zDict[str, bool])rz  r   r   zTuple[bool, bool, bool])r  )r   r   r  rm   r   r   )r  r   r   r   )r'  r   r   r   )r'  r   r  r   r  r   r   r   )r  r   r  r   r   zTuple[bool, bool])r   r   )r'  r   r   r   r  r   rb  r   r  r   r  r   r   r  r   )r8  r   r  r   r   r  )r8  r   r	  r  r   r  )r^   rB  r   r   )r   r  )r  r  r   r   )r   
__future__r   rj   r   r	  rI   r   r-  r   r*  ri   r+  rd  r  r  r  html.parserr   r&  pathlibr   typingr   r   r   r	   r
   urllib.requestr   r   bs4r   seleniumr   selenium.common.exceptionsr   r   selenium.webdriver.common.byr   "selenium.webdriver.firefox.optionsr   "selenium.webdriver.firefox.servicer   rX  selenium.webdriver.supportr   r-  selenium.webdriver.support.uir   rh   CATALOG_URL_PRIMARYr   __annotations____file__r  r  r   r   rn   rK   r
  r'  r  re   r}   r  r&  r?  r0  r  r  rC  r*  PAGE_LOAD_TIMEOUTr\  NAV_SETTLE_SECONDSrb  r0  r1  r2  rg   r<   rB   rM   rX   rs   rx   r   r   r   r   r   r   r  rB  rJ  r`  rh  r{  r  r  r  r  r  r  r  r  r  r  r!  r)  r6  r/  r=  rA  rJ  rS  r\  rf  rk  rw  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r>  r   r?   rC   rA   <module>rU     s  *X #  	  	   
       "   3 3 +
   K + 6 H @ 7 W #1o  
C >!!#**!#0BC!H!N!N!P!V!V!X\{!{ 	'	'99
44
 
  +  &+ "
 J !     #       ::>>"4b9??AIT <
~I&
4G GZ@!0AH!
L * &. $*** 	*
 * * * *b %&C	C C 	C
 C 
CN"2$J $4
3#r* .(41h"/&d#"#J-d62	&9!./f!F8v"<4 OO
O O 	O
 O O Od/$ >C &MN64"6Rj zF rC   