8181 'video recording' : ":movie_camera:" ,
8282 'webpage' : ":earth_americas:" ,
8383}
# HTTP headers sent along with each URL check request ; they mimic a desktop Firefox browser and only ask for the
#  very first byte of the resource ("Range") over a connection that is not kept alive ("Connection")
URL_CHECK_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "close",
    "Range": "bytes=0-0",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:138.0) Gecko/20100101 Firefox/138.0",
}
8493URL_NO_CHECK = (
8594 "arxiv.org" ,
8695 "dial.uclouvain.be" ,
9099 "inria.hal.science" ,
91100 "iopscience.iop.org" ,
92101 "link.springer.com" ,
102+ "onlinelibrary.wiley.com" ,
93103 "researchportal.rma.ac.be" ,
94104 "scholar.dsu.edu" ,
95105 "web.archive.org" ,
101111 "www.sciencedirect.com" ,
102112 "www.scopus.com" ,
103113 "www.semanticscholar.org" ,
114+ "www.softpedia.com" ,
104115 "www.usenix.org" ,
105116)
106117
107118
108- _check_url = lambda url , nocheck = URL_NO_CHECK : url if url .split ("://" )[- 1 ].split ("/" )[0 ] not in nocheck and \
109- requests .head (url ).status_code >= 400 else None
def _check_url(url, nocheck=URL_NO_CHECK, timeout=10):
    """ Request a URL and return it when the server answers with an error status code.

    :param url:     URL to be checked ; when it holds no ":" at all, the "http" scheme is assumed
    :param nocheck: iterable of domains that shall never be requested
    :param timeout: number of seconds to wait for the server before giving up
    :return:        url when the response status code is 400 or above, None otherwise (also when the check is
                     skipped because of the scheme or the domain)
    :raises requests.exceptions.ConnectionError: when the server cannot be reached or does not answer in time
    """
    target = url
    try:
        scheme, domain = url.split(":", 1)
    except ValueError:
        # no scheme separator ; assume HTTP and request the completed URL as 'requests' refuses schemeless URLs
        scheme, domain, target = "http", url, f"http://{url}"
    # only keep the bare host: strip the leading slashes, the path, the credentials and the port
    domain = domain.lstrip("/").split("/")[0].split("@")[-1].split(":")[0]
    if scheme not in ["http", "https"] or ts.is_iterable(nocheck) and domain in nocheck:
        return
    try:
        # stream=True defers the download of the body ; the context manager releases the connection in any case
        with requests.get(target, headers=URL_CHECK_HEADERS, allow_redirects=True, stream=True,
                          timeout=timeout) as response:
            code = response.status_code
    except requests.exceptions.ReadTimeout as exc:
        # a read timeout is not a ConnectionError ; re-raise it as such so that callers handling connection
        #  failures also cover a server that accepts the connection but never answers
        raise requests.exceptions.ConnectionError(f"no answer within {timeout} seconds") from exc
    if code >= 400:
        return url
110132
111133
112134def _lower_title (title ):
@@ -610,14 +632,14 @@ def _marks(self, marker, filters=None, sort=None, desc=False, limit=None):
610632 json .dump (self .marks , f )
611633
@ts.try_or_die(exc=ValueError, trace=False)
def count(self, filters=None, **kw):
    """ Print the number of items matching the given filters.

    :param filters: list of filters to be applied to the items (None means no filter)
    :param kw:      extra keyword-arguments, accepted but not used here
    """
    # the "title" field is only requested to get one row per matching item ; the returned headers are not needed
    headers, rows = self._items(["title"], filters if filters else [])
    print(len(rows))
617639
618640 @ts .try_or_die (exc = ValueError , trace = False )
619641 def export (self , fields = None , filters = None , sort = None , desc = False , limit = None , line_format = None ,
620- output_format = "xlsx" , check_url = False ):
642+ output_format = "xlsx" , check_url = False , url_no_check = None , ** kw ):
621643 """ Export the selected fields of items while applying filters to an Excel file. """
622644 if "{stars}" in (line_format or "" ) and "rank" not in fields :
623645 fields .append ("rank" )
@@ -664,11 +686,13 @@ def export(self, fields=None, filters=None, sort=None, desc=False, limit=None, l
664686 for row in data :
665687 r = row [i_rank ]
666688 mr = max (mr , float (r if r != "-" else 0 ))
667- lines , url_check_executor , results = [], ThreadPoolExecutor (max_workers = 10 ), []
689+ lines = []
690+ if check_url :
691+ url_check_executor , results = ThreadPoolExecutor (max_workers = 10 ), {}
668692 for row in data :
669693 d = {k .lower (): v for k , v in zip (headers , row )}
670- if d ['url' ] not in ["" , "-" ]:
671- results . append ( url_check_executor .submit (_check_url , d ['url' ]))
694+ if check_url and d ['url' ] not in ["" , "-" ]:
695+ results [ url_check_executor .submit (_check_url , d ['url' ], url_no_check or URL_NO_CHECK )] = d [ 'url' ]
672696 if "Title" in headers and "Url" in headers :
673697 d ['lower_title' ] = t = _lower_title (d ['title' ])
674698 d ['link' ] = d ['title' ] if d ['url' ] in ["" , "-" ] else f"[{ d ['title' ]} ]({ d ['url' ]} )"
@@ -687,23 +711,25 @@ def export(self, fields=None, filters=None, sort=None, desc=False, limit=None, l
687711 s = " :star2:" if r == mr else " :star:"
688712 d ['stars' ] = "" if r < .35 else s if .35 <= r < .65 else 2 * s if .65 <= r < .85 else 3 * s
689713 lines .append (line_format .format (** d ))
690- for r in as_completed (results ):
691- try :
692- url = r .result ()
693- if url is not None :
714+ if check_url :
715+ for r in as_completed (results ):
716+ try :
717+ url = results [r ]
718+ if r .result () is None :
719+ continue
694720 logger .warning (f"Broken link: { url } " )
695- except requests .exceptions .ConnectionError :
696- logger . error ( f"Broken link: { url } (connection aborted))" )
697- except requests .exceptions .SSLError :
698- logger .error (f"Broken link: { url } (SSL certificate check failure))" )
699- url_check_executor .shutdown ()
721+ except requests .exceptions .ConnectionError :
722+ continue
723+ except requests .exceptions .SSLError :
724+ logger .error (f"Broken link: { url } (SSL certificate check failure))" )
725+ url_check_executor .shutdown ()
700726 r = Report (List (* lines ))
701727 r .filename = "export"
702728 logger .debug (f"Creating { output_format .upper ()} file..." )
703729 getattr (r , output_format )(save_to_file = True )
704730
705731 @ts .try_or_die (exc = ValueError , trace = False )
706- def list (self , field , filters = None , desc = False , limit = None ):
732+ def list (self , field , filters = None , desc = False , limit = None , ** kw ):
707733 """ List field's values while applying filters. """
708734 if field == "collections" :
709735 l = [c ['data' ]['name' ] for c in self .collections ]
@@ -731,12 +757,12 @@ def list(self, field, filters=None, desc=False, limit=None):
731757 print (ts .BorderlessTable ([[ZoteroCLI .header (field )]] + data ))
732758
@ts.try_or_die(exc=ValueError, trace=False)
def mark(self, marker, filters=None, sort=None, desc=False, limit=None, **kw):
    """ Apply a read/unread marker to the selected items.

    :param marker:  marker to be applied
    :param filters: filters selecting the items
    :param sort:    field to sort the items on
    :param desc:    whether sorting shall be done in descending order
    :param limit:   maximum number of items
    :param kw:      extra keyword-arguments, accepted but not used here
    """
    # thin public wrapper ; the whole job is delegated to the private _marks method
    self._marks(marker, filters=filters, sort=sort, desc=desc, limit=limit)
737763
738764 @ts .try_or_die (exc = ValueError , trace = False )
739- def plot (self , name , filters = None ):
765+ def plot (self , name , filters = None , ** kw ):
740766 """ Plot a chart given its slug. """
741767 if name == "software-in-time" :
742768 data = {}
@@ -752,7 +778,7 @@ def plot(self, name, filters=None):
752778 raise ValueError
753779
754780 @ts .try_or_die (exc = ValueError , trace = False )
755- def show (self , fields = None , filters = None , sort = None , desc = False , limit = None ):
781+ def show (self , fields = None , filters = None , sort = None , desc = False , limit = None , ** kw ):
756782 """ Show the selected fields of items while applying filters. """
757783 # ensure the 'key' field is included for filtering the items ; then do not keep it if not selected
758784 output_key = "key" in fields
@@ -774,7 +800,7 @@ def show(self, fields=None, filters=None, sort=None, desc=False, limit=None):
774800 print (table )
775801
776802 @ts .try_or_die (exc = ValueError , trace = False )
777- def view (self , name , value , fields = None ):
803+ def view (self , name , value , fields = None , ** kw ):
778804 """ View a single item given a field and its value. """
779805 headers , data = self ._items (fields , [f"{ name } :{ value } " ])
780806 for h , d in zip (headers , data [0 ]):
0 commit comments