Skip to content

Commit 9c4ca71

Browse files
committed
Improved URL check when exporting
1 parent f6f5bd1 commit 9c4ca71

3 files changed

Lines changed: 76 additions & 50 deletions

File tree

src/zotero/VERSION.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.7.1
1+
1.7.2

src/zotero/__init__.py

Lines changed: 47 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,15 @@
8181
'video recording': ":movie_camera:",
8282
'webpage': ":earth_americas:",
8383
}
84+
URL_CHECK_HEADERS = {
85+
'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
86+
'Accept-Language': "en-US,en;q=0.9",
87+
'Accept-Encoding': "gzip, deflate, br",
88+
'Connection': "close",
89+
'Range': "bytes=0-0",
90+
'Upgrade-Insecure-Requests': "1",
91+
'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:138.0) Gecko/20100101 Firefox/138.0",
92+
}
8493
URL_NO_CHECK = (
8594
"arxiv.org",
8695
"dial.uclouvain.be",
@@ -90,6 +99,7 @@
9099
"inria.hal.science",
91100
"iopscience.iop.org",
92101
"link.springer.com",
102+
"onlinelibrary.wiley.com",
93103
"researchportal.rma.ac.be",
94104
"scholar.dsu.edu",
95105
"web.archive.org",
@@ -101,12 +111,24 @@
101111
"www.sciencedirect.com",
102112
"www.scopus.com",
103113
"www.semanticscholar.org",
114+
"www.softpedia.com",
104115
"www.usenix.org",
105116
)
106117

107118

108-
_check_url = lambda url, nocheck=URL_NO_CHECK: url if url.split("://")[-1].split("/")[0] not in nocheck and \
109-
requests.head(url).status_code >= 400 else None
119+
def _check_url(url, nocheck=URL_NO_CHECK, timeout=10):
    """ Check whether a URL is broken.

    :param url:     URL to be checked ; when it holds no scheme, plain HTTP is assumed
    :param nocheck: iterable of domains for which no request shall be sent
    :param timeout: number of seconds to wait for the server before giving up
    :return:        the input URL if the server answered with a status code >= 400, None otherwise (including when
                     the URL is skipped because of its scheme or its domain)
    :raise requests.exceptions.ConnectionError: when the server cannot be reached or does not answer in time
    """
    scheme, separator, remainder = url.partition(":")
    if separator:
        target = url
    else:
        # no scheme at all ; requests refuses such URLs (MissingSchema), so build an explicit HTTP one for the request
        scheme, remainder, target = "http", url, "http://" + url.lstrip("/")
    # schemes and host names are case-insensitive
    scheme = scheme.lower()
    # keep the host only: drop the leading slashes, the path, the user information and the port
    domain = remainder.lstrip("/").split("/")[0].split("@")[-1].split(":")[0].lower()
    if scheme not in ("http", "https") or (ts.is_iterable(nocheck) and domain in nocheck):
        return None
    try:
        # stream=True avoids downloading the body ; the context manager releases the connection in every case
        with requests.get(target, headers=URL_CHECK_HEADERS, allow_redirects=True, stream=True,
                          timeout=timeout) as response:
            code = response.status_code
    except requests.exceptions.Timeout as exc:
        # read timeouts do not inherit from ConnectionError ; re-raise them as such so that callers handling
        #  unreachable hosts handle unresponsive ones the same way
        raise requests.exceptions.ConnectionError(f"timed out after {timeout}s: {url}") from exc
    return url if code >= 400 else None
110132

111133

112134
def _lower_title(title):
@@ -610,14 +632,14 @@ def _marks(self, marker, filters=None, sort=None, desc=False, limit=None):
610632
json.dump(self.marks, f)
611633

612634
@ts.try_or_die(exc=ValueError, trace=False)
613-
def count(self, filters=None):
635+
def count(self, filters=None, **kw):
614636
""" Count items while applying filters. """
615637
_, data = self._items(["title"], filters or [])
616638
print(len(data))
617639

618640
@ts.try_or_die(exc=ValueError, trace=False)
619641
def export(self, fields=None, filters=None, sort=None, desc=False, limit=None, line_format=None,
620-
output_format="xlsx", check_url=False):
642+
output_format="xlsx", check_url=False, url_no_check=None, **kw):
621643
""" Export the selected fields of items while applying filters to an Excel file. """
622644
if "{stars}" in (line_format or "") and "rank" not in fields:
623645
fields.append("rank")
@@ -664,11 +686,13 @@ def export(self, fields=None, filters=None, sort=None, desc=False, limit=None, l
664686
for row in data:
665687
r = row[i_rank]
666688
mr = max(mr, float(r if r != "-" else 0))
667-
lines, url_check_executor, results = [], ThreadPoolExecutor(max_workers=10), []
689+
lines = []
690+
if check_url:
691+
url_check_executor, results = ThreadPoolExecutor(max_workers=10), {}
668692
for row in data:
669693
d = {k.lower(): v for k, v in zip(headers, row)}
670-
if d['url'] not in ["", "-"]:
671-
results.append(url_check_executor.submit(_check_url, d['url']))
694+
if check_url and d['url'] not in ["", "-"]:
695+
results[url_check_executor.submit(_check_url, d['url'], url_no_check or URL_NO_CHECK)] = d['url']
672696
if "Title" in headers and "Url" in headers:
673697
d['lower_title'] = t = _lower_title(d['title'])
674698
d['link'] = d['title'] if d['url'] in ["", "-"] else f"[{d['title']}]({d['url']})"
@@ -687,23 +711,25 @@ def export(self, fields=None, filters=None, sort=None, desc=False, limit=None, l
687711
s = " :star2:" if r == mr else " :star:"
688712
d['stars'] = "" if r < .35 else s if .35 <= r < .65 else 2*s if .65 <= r < .85 else 3*s
689713
lines.append(line_format.format(**d))
690-
for r in as_completed(results):
691-
try:
692-
url = r.result()
693-
if url is not None:
714+
if check_url:
715+
for r in as_completed(results):
716+
try:
717+
url = results[r]
718+
if r.result() is None:
719+
continue
694720
logger.warning(f"Broken link: {url}")
695-
except requests.exceptions.ConnectionError:
696-
logger.error(f"Broken link: {url} (connection aborted))")
697-
except requests.exceptions.SSLError:
698-
logger.error(f"Broken link: {url} (SSL certificate check failure))")
699-
url_check_executor.shutdown()
721+
except requests.exceptions.ConnectionError:
722+
continue
723+
except requests.exceptions.SSLError:
724+
logger.error(f"Broken link: {url} (SSL certificate check failure))")
725+
url_check_executor.shutdown()
700726
r = Report(List(*lines))
701727
r.filename = "export"
702728
logger.debug(f"Creating {output_format.upper()} file...")
703729
getattr(r, output_format)(save_to_file=True)
704730

705731
@ts.try_or_die(exc=ValueError, trace=False)
706-
def list(self, field, filters=None, desc=False, limit=None):
732+
def list(self, field, filters=None, desc=False, limit=None, **kw):
707733
""" List field's values while applying filters. """
708734
if field == "collections":
709735
l = [c['data']['name'] for c in self.collections]
@@ -731,12 +757,12 @@ def list(self, field, filters=None, desc=False, limit=None):
731757
print(ts.BorderlessTable([[ZoteroCLI.header(field)]] + data))
732758

733759
@ts.try_or_die(exc=ValueError, trace=False)
734-
def mark(self, marker, filters=None, sort=None, desc=False, limit=None):
760+
def mark(self, marker, filters=None, sort=None, desc=False, limit=None, **kw):
735761
""" Mark the selected items as read/unread. """
736762
self._marks(marker, filters, sort, desc, limit)
737763

738764
@ts.try_or_die(exc=ValueError, trace=False)
739-
def plot(self, name, filters=None):
765+
def plot(self, name, filters=None, **kw):
740766
""" Plot a chart given its slug. """
741767
if name == "software-in-time":
742768
data = {}
@@ -752,7 +778,7 @@ def plot(self, name, filters=None):
752778
raise ValueError
753779

754780
@ts.try_or_die(exc=ValueError, trace=False)
755-
def show(self, fields=None, filters=None, sort=None, desc=False, limit=None):
781+
def show(self, fields=None, filters=None, sort=None, desc=False, limit=None, **kw):
756782
""" Show the selected fields of items while applying filters. """
757783
# ensure the 'key' field is included for filtering the items ; then do not keep it if not selected
758784
output_key = "key" in fields
@@ -774,7 +800,7 @@ def show(self, fields=None, filters=None, sort=None, desc=False, limit=None):
774800
print(table)
775801

776802
@ts.try_or_die(exc=ValueError, trace=False)
777-
def view(self, name, value, fields=None):
803+
def view(self, name, value, fields=None, **kw):
778804
""" View a single item given a field and its value. """
779805
headers, data = self._items(fields, [f"{name}:{value}"])
780806
for h, d in zip(headers, data[0]):

src/zotero/__main__.py

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
"""
2222
__examples__ = [
2323
"count -f \"collections:biblio\" -f \"rank:>1.0\"",
24+
"export title date url itemType -f \"collections:doc\" -s \"title\" -l \"{emoji} {link_lower}\" -o md --check-url",
2425
"export year title itemType numAuthors numPages zscc references what results comments -f \"collections:biblio\" " \
2526
"-s date -l \">rank:50\"",
2627
"list attachments",
@@ -31,11 +32,23 @@
3132
]
3233

3334

35+
def _domains(url_no_check):
36+
r = []
37+
for url in (url_no_check or []):
38+
if ts.is_file(url):
39+
with open(url) as f:
40+
for l in f:
41+
r.append(l.strip())
42+
else:
43+
r.append(url.strip())
44+
return r
45+
46+
3447
def _set_arg(subparser, arg, msg=None):
3548
""" Shortcut function to set arguments repeated for multiple subparsers. """
3649
if arg == "filter":
37-
subparser.add_argument("-f", "--filter", action="extend", nargs="*", default=[], note="format: [field]:[regex]",
38-
help=msg or "filter to be applied on field's value")
50+
subparser.add_argument("-f", "--filter", action="extend", dest="filters", nargs="*", default=[],
51+
note="format: [field]:[regex]", help=msg or "filter to be applied on field's value")
3952
elif arg == "limit":
4053
subparser.add_argument("-l", "--limit", help="limit the number of displayed records", note="format: either a "
4154
"number or [field]:[number]\n '<' and '>' respectively indicates ascending or "
@@ -76,11 +89,13 @@ def main():
7689
_set_arg(ccount, "filter", "filter to be applied while counting")
7790
_set_arg(ccount, "query")
7891
cexpt = sparsers.add_parser("export", help="export items to a file", category="manage")
79-
cexpt.add_argument("field", nargs="+", help="field to be shown")
92+
cexpt.add_argument("fields", nargs="+", help="field to be shown")
8093
cexpt.add_argument("-l", "--line-format", help="line's format string for outputting as a list")
8194
cexpt.add_argument("-o", "--output-format", default="xlsx", help="output format",
8295
choices=["csv", "html", "json", "md", "pdf", "rst", "xml", "xlsx", "yaml"])
8396
cexpt.add_argument("-u", "--check-url", action="store_true", help="check for broken URL's")
97+
cexpt.add_argument("--url-no-check", nargs="*", help="either a domain or a file with one domain per line for "
98+
"skipping URL check")
8499
_set_args(cexpt, "filter", "limit", "query", "sort")
85100
if __GPT:
86101
cingest = sparsers.add_parser("ingest", help="ingest Zotero documents", category="GPT")
@@ -105,17 +120,17 @@ def main():
105120
cselect = sparsers.add_parser("select", help="select a GPT model", category="GPT")
106121
cselect.add_argument("name", default=MODEL_DEFAULT_NAME, choices=MODELS, nargs="?", help="model name")
107122
cshow = sparsers.add_parser("show", help="show a list of items", category="read")
108-
cshow.add_argument("field", nargs="*", help="field to be shown")
123+
cshow.add_argument("fields", nargs="*", help="field to be shown")
109124
_set_args(cshow, "filter", "limit", "query", "sort")
110125
cview = sparsers.add_parser("view", help="view a single item", category="read")
111126
cview.add_argument("name", help="field name for selection")
112127
cview.add_argument("value", help="field value to be selected")
113-
cview.add_argument("field", nargs="+", help="field to be shown")
128+
cview.add_argument("fields", nargs="+", help="field to be shown")
114129
initialize()
115130
args.logger = logger
116131
if getattr(args, "query", None):
117-
if hasattr(args, "field") and args.field == ["-"]:
118-
args.field = QUERIES[args.query].get('fields', ["title"])
132+
if hasattr(args, "fields") and args.fields == ["-"]:
133+
args.fields = QUERIES[args.query].get('fields', ["title"])
119134
args.filter.extend(QUERIES[args.query].get('filter', []))
120135
if getattr(args, "limit", None) is None:
121136
args.limit = QUERIES[args.query].get('limit')
@@ -132,25 +147,10 @@ def main():
132147
if getattr(args, "reset_items", False) and k not in ["items"] + OBJECTS or k == "marks":
133148
continue
134149
CACHE_PATH.joinpath(k + ".json").remove(False)
135-
z = ZoteroCLI(args.id, ["user", "group"][args.group or GROUP_FILE.exists()], args.key, logger)
136-
if args.command == "count":
137-
z.count(args.filter)
138-
elif args.command == "export":
139-
z.export(args.field, args.filter, args.sort, args.desc, args.limit, args.line_format, args.output_format,
140-
args.check_url)
141-
elif args.command == "install":
142-
install()
143-
elif args.command == "list":
144-
z.list(args.field, args.filter, args.desc, args.limit)
145-
elif args.command == "mark":
146-
args.filter.append("numPages:>0")
147-
z.mark(args.marker, args.filter, args.sort, args.desc, args.limit)
148-
elif args.command == "plot":
149-
z.plot(args.chart)
150-
elif args.command == "show":
151-
z.show(args.field, args.filter, args.sort, args.desc, args.limit)
152-
elif args.command == "view":
153-
z.view(args.name, args.value, args.field)
154-
elif args.command != "reset": # handle commands from gpt.py
155-
globals()[args.command](**vars(args))
150+
if args.command != "reset":
151+
if args.command == "export":
152+
args.url_no_check = l = _domains(args.url_no_check)
153+
args.check_url |= len(l) > 0
154+
getattr(ZoteroCLI(args.id, ["user", "group"][args.group or GROUP_FILE.exists()], args.key, logger),
155+
args.command, globals().get(args.command))(**vars(args))
156156

0 commit comments

Comments
 (0)