-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtasks.py
More file actions
501 lines (404 loc) · 13.9 KB
/
Copy pathtasks.py
File metadata and controls
501 lines (404 loc) · 13.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
"""
Repo-level commands. Run with
pixi r invoke <task>
Note that when running tasks this way, underscores (_) in task names should be replaced with dashes (-)
"""
import datetime as _dt
import os
import sys
from pathlib import Path
import yaml
from invoke import task
from mecfs_bio.util.link_check_streak import (
dump_state,
load_state,
read_lychee_report,
update_streaks,
)
# Directory of tests handed to pytest and to the type checker.
NEW_UNIT_TEST_PATH = Path("test_mecfs_bio/unit")
# Root of the source package; target of typeguard, typos, lychee and the
# __init__.py checks.
SRC_PATH = Path("mecfs_bio")
# Documentation tree; scanned by typos, lychee and the markdown table checks.
DOCS_PATH = Path("docs")
# The inner double quotes are part of the value: the string is spliced
# verbatim into the lychee command line, where it must stay one argument.
USER_AGENT = '"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"'
# Scripts run by the ``pfig`` and ``publish_figures`` tasks respectively.
PULL_FIGURE_SCRIPT_PATH = Path("mecfs_bio/figures/key_scripts/pull_figures.py")
PUBLISH_FIGURES_SCRIPT_PATH = Path("mecfs_bio/figures/key_scripts/publish_figures.py")
FIGS_PATH = Path("docs/_figs/")
# Passed to lychee's --exclude so links matching the figures directory are skipped.
FIGS_PATTERN = "_figs"
# YAML file from which ``_set_gh_token`` reads the ``GH_TOKEN`` entry.
GITHUB_TOKEN_CONFIG = Path("gh_token_config.yaml")
# Default location for the streak-state file. The CI workflow downloads this
# from the prior run's artifact and uploads the updated copy after each run.
LINK_CHECK_STATE_PATH = Path("link_check_state.json")
# Where the streak-aware link check asks lychee to write its JSON report.
LINK_CHECK_REPORT_PATH = Path("lychee_report.json")
# Default number of consecutive failing runs before the streak-aware task fails.
LINK_CHECK_DEFAULT_THRESHOLD = 7
# Lychee flags shared by every link-checking task. Keep this in one place so
# the strict and streak-aware tasks stay in sync.
# - 403 in --accept tolerates anti-bot systems.
# - --cache-exclude-status 400..=999 means failure results are NOT cached,
#   so a previously-failing URL is always re-checked next run (essential
#   for streak recovery detection).
LYCHEE_SHARED_ARGS = (
    f"--insecure --cache "
    f"--accept 100..=103,200..=299,403 "
    f"--cache-exclude-status 400..=999 "
    f"--exclude {FIGS_PATTERN} "
    f"--user-agent {USER_AGENT}"
)
# helper
def _set_gh_token() -> None:
    """
    Export the GitHub token stored in ``GITHUB_TOKEN_CONFIG`` as ``GH_TOKEN``.

    If the config file does not exist, a message is printed and the
    environment is left untouched.

    Raises:
        ValueError: if the file exists but is not a YAML mapping that
            contains a ``GH_TOKEN`` key.
    """
    if not GITHUB_TOKEN_CONFIG.exists():
        print(f"No github token file found at {GITHUB_TOKEN_CONFIG}")
        return
    with open(GITHUB_TOKEN_CONFIG, encoding="utf-8") as infile:
        # NOTE(review): FullLoader is kept from the original; this presumes the
        # token file is local and developer-controlled. Prefer yaml.safe_load
        # if that ever stops being true.
        loaded = yaml.load(infile, Loader=yaml.FullLoader)
    # Validate explicitly rather than with `assert` (which is stripped under
    # `python -O`). An empty YAML file loads as None, so check the type too.
    if not isinstance(loaded, dict) or "GH_TOKEN" not in loaded:
        raise ValueError(
            f"{GITHUB_TOKEN_CONFIG} must be a YAML mapping containing a GH_TOKEN key"
        )
    print("setting github token environment variable")
    os.environ["GH_TOKEN"] = loaded["GH_TOKEN"]
def _set_use_gh_token() -> None:
    """Set the ``HAVE_GH_TOKEN`` environment variable to the string ``"True"``."""
    os.environ.update(HAVE_GH_TOKEN="True")
def _set_enable_api_autonav() -> None:
    """Set the ``ENABLE_API_AUTONAV`` environment variable to the string ``"True"``."""
    flag_name, flag_value = "ENABLE_API_AUTONAV", "True"
    os.environ[flag_name] = flag_value
# dev tasks
@task
def test(c):
    """
    Run the test suite with pytest, using testmon to skip un-needed tests.
    """
    print(
        "Running unit and integration tests with pytest, while skipping un-needed tests with testmon..."
    )
    # Typeguard instruments the source package; only the unit-test tree is collected.
    pytest_args = f"--testmon --typeguard-packages={SRC_PATH} {NEW_UNIT_TEST_PATH}"
    cmd = f" python -m pytest {pytest_args}"
    print(cmd)
    c.run(cmd, pty=True)
@task
def test_debug(c):
    """
    Run the tests with output capture disabled (-s) and typeguard's
    instrumentation debugging switched on. Handy when debugging.
    """
    print("Running unit and integration tests with pytest...")
    pytest_args = (
        f"-s --typeguard-debug-instrumentation "
        f"--typeguard-packages={SRC_PATH} {NEW_UNIT_TEST_PATH}"
    )
    cmd = f" python -m pytest {pytest_args}"
    print(cmd)
    c.run(cmd, pty=True)
@task
def format(c):
    """
    Reformat the code base in place with ruff.
    """
    print("Formatting with ruff...")
    cmd = " ruff format"
    c.run(cmd, pty=True)
@task
def formatcheck(c):
    """
    Check for format errors without modifying any files.

    Runs ``ruff format --check .``.
    """
    # The message previously named black, but the formatter actually run is ruff.
    print("Checking format with ruff...")
    c.run(" ruff format --check .")
@task
def lintfix(c):
    """
    Lint with ruff and apply its auto-fixes, including the unsafe ones.
    """
    print("linting and applying lint auto-fixes using ruff...")
    cmd = " ruff check --fix --unsafe-fixes"
    c.run(cmd)
@task
def lintcheck(c):
    """
    Lint with ruff without applying any fixes.
    """
    print("linting using ruff...")
    cmd = "ruff check"
    c.run(cmd)
@task
def typecheck(c):
    """
    Run the ty type checker over the source package and the unit tests.
    """
    print("Typechecking with ty...")
    targets = f"{SRC_PATH} {NEW_UNIT_TEST_PATH}"
    c.run(f"ty check {targets}", pty=True)
@task
def spellcheck_docs(c):
    """
    Spell-check the documentation tree with typos.
    Exceptions can be added by editing _typos.toml.
    """
    exceptions_hint = f"(Add exceptions to {DOCS_PATH}/_typos.toml)"
    print(f"Checking documentation spelling using typos... {exceptions_hint}")
    cmd = f"typos {DOCS_PATH}"
    c.run(cmd, pty=True)
@task
def spellcheck_src(c):
    """
    Spell-check the source package with typos.
    Exceptions can be added by editing _typos.toml.
    """
    exceptions_hint = f"(Add exceptions to {SRC_PATH}/_typos.toml)"
    print(f"Checking source spelling using typos... {exceptions_hint}")
    cmd = f"typos {SRC_PATH}"
    c.run(cmd, pty=True)
@task
def checkimports(c):
    """
    Enforce architectural constraints by running import-linter (``lint-imports``).
    """
    print("Checking architectural constraints using import-linter...")
    cmd = "lint-imports"
    c.run(cmd, pty=True)
def _missing_init_dirs() -> list[Path]:
    """
    Return, in sorted order, every directory below ``SRC_PATH`` that has no
    ``__init__.py``. Directories named ``__pycache__`` are ignored.
    """
    lacking: list[Path] = []
    for candidate in sorted(SRC_PATH.rglob("*")):
        # Only real directories other than bytecode caches are of interest.
        if not candidate.is_dir() or candidate.name == "__pycache__":
            continue
        if (candidate / "__init__.py").exists():
            continue
        lacking.append(candidate)
    return lacking
@task
def check_init_files(c):
    """
    Fail (exit status 1) if any directory under mecfs_bio/ lacks an __init__.py.
    """
    print(f"Checking that every directory under {SRC_PATH}/ has an __init__.py...")
    offenders = _missing_init_dirs()
    if not offenders:
        print("OK: all directories have __init__.py.")
        return
    print("ERROR: The following directories are missing __init__.py:")
    for directory in offenders:
        print(f" {directory}")
    sys.exit(1)
@task
def fix_init_files(c):
    """
    Create an empty __init__.py in each directory under mecfs_bio/ that lacks one.
    """
    print(f"Adding missing __init__.py files under {SRC_PATH}/...")
    offenders = _missing_init_dirs()
    if not offenders:
        print("Nothing to fix.")
        return
    for directory in offenders:
        init_file = directory / "__init__.py"
        init_file.touch()
        print(f" created {init_file}")
    print(f"Created {len(offenders)} file(s).")
@task
def check_all_links(c):
    """
    Run lychee over the source package and the docs, checking every link.

    The flags come from ``LYCHEE_SHARED_ARGS``; among them, 403 responses are
    accepted so that anti-bot systems do not cause failures.
    """
    print("Checking links with lychee...")
    cmd = f"lychee {LYCHEE_SHARED_ARGS} {SRC_PATH} {DOCS_PATH}"
    c.run(cmd)
@task(help={"threshold": "Consecutive failing runs required to fail the task."})
def check_all_links_with_streak(c, threshold=LINK_CHECK_DEFAULT_THRESHOLD):
    """
    Run lychee and update the per-URL consecutive-failure streak state.

    Behaves like ``check-all-links`` but only fails when a URL has been broken
    for ``threshold`` runs in a row. Transient single-day outages no longer
    fail the daily workflow.

    State is read from / written to ``link_check_state.json`` at the repo
    root. The CI workflow round-trips this file through an artifact between
    runs (see ``.github/workflows/check_links.yml``).

    Any report file left behind by an earlier run is deleted before lychee
    starts, so the "did lychee produce a report?" check cannot be satisfied
    by stale data.

    Exits with status 1 when lychee produces no report, or when at least one
    URL has reached the persistent-failure threshold.
    """
    # Coerce to int in case the value arrives as a string.
    threshold = int(threshold)
    state_path = LINK_CHECK_STATE_PATH
    report_path = LINK_CHECK_REPORT_PATH
    print(
        f"Checking links with lychee (streak-aware, threshold={threshold} consecutive runs)..."
    )
    # Remove a stale report first: otherwise, if lychee fails to run at all,
    # the existence check below would pass and old results would be folded
    # into the streak state.
    report_path.unlink(missing_ok=True)
    # --verbose is required for lychee to populate success_map in the JSON
    # report; without it we cannot tell ok URLs from URLs that simply weren't
    # seen this run. --no-progress keeps the CI log readable.
    cmd = (
        f"lychee {LYCHEE_SHARED_ARGS} "
        f"--format json --verbose --no-progress "
        f"--output {report_path} "
        f"{SRC_PATH} {DOCS_PATH}"
    )
    # lychee exits non-zero when any URL fails; we want the report regardless.
    c.run(cmd, warn=True)
    if not report_path.exists():
        print(f"ERROR: lychee did not produce a JSON report at {report_path}.")
        sys.exit(1)
    # read_lychee_report raises LycheeReportSchemaError on schema drift. We
    # let that propagate so a lychee upgrade that changes the report shape
    # surfaces as a loud CI failure rather than a silently-empty check.
    report = read_lychee_report(report_path)
    previous_state = load_state(state_path)
    result = update_streaks(
        report,
        previous_state,
        run_date=_dt.date.today(),
        threshold=threshold,
    )
    # Persist the new state before reporting, so it is saved even when the
    # task goes on to exit non-zero below.
    dump_state(state_path, result.new_state)
    print(
        f"\nlychee summary: total={report.total}, errors={report.errors}, "
        f"timeouts={report.timeouts}"
    )
    print(f"State written to {state_path} ({len(result.new_state)} URLs tracked).")
    if result.cleared:
        print(f"\nRecovered URLs (streak reset to 0): {len(result.cleared)}")
        for url in result.cleared:
            print(f" {url}")
    if result.new_failures:
        print(f"\nNew failures this run (streak=1): {len(result.new_failures)}")
        for url in result.new_failures:
            print(f" {url}")
    if result.all_failing:
        print(f"\nAll currently failing URLs ({len(result.all_failing)}):")
        for entry in result.all_failing:
            print(f" streak={entry.streak:>3} {entry.url}")
    if result.persistent_failures:
        print(
            f"\nERROR: {len(result.persistent_failures)} URL(s) have been failing for "
            f">= {threshold} consecutive runs and are likely permanently broken:"
        )
        for entry in result.persistent_failures:
            print(f" streak={entry.streak:>3} {entry.url}")
        sys.exit(1)
    print(
        f"\nOK: no URLs have crossed the {threshold}-run persistent-failure threshold."
    )
@task
def check_table_trailing_newlines(c):
    """
    Added by Claude:
    Verify that every markdown doc whose last non-blank line is a table row
    is followed by at least one blank line.
    Without the trailing blank line, the mkdocs-bibtex plugin breaks
    bibliography footnotes on pages that end with a markdown table.
    """
    print(
        "Checking for markdown files ending with a table but missing trailing newlines..."
    )
    offenders = []
    for page in sorted(DOCS_PATH.rglob("*.md")):
        text = page.read_text(encoding="utf-8")
        if not text:
            continue
        nonblank_lines = [ln for ln in text.splitlines() if ln.strip()]
        # A page "ends with a table" when its final non-blank line starts with a pipe.
        ends_with_table = bool(nonblank_lines) and nonblank_lines[-1].strip().startswith(
            "|"
        )
        if ends_with_table and not text.endswith("\n\n"):
            offenders.append(page)
    if not offenders:
        print("OK: all markdown files pass the table-trailing-newline check.")
        return
    print(
        "ERROR: The following markdown files end with a table but lack a trailing blank line.\n"
        "This breaks mkdocs-bibtex bibliography rendering. Add a blank line after the final table row."
    )
    for page in offenders:
        print(f" {page}")
    sys.exit(1)
@task
def fix_table_trailing_newlines(c):
    """
    Added by Claude:
    Rewrite markdown docs that end with a table row but have no trailing
    blank line, so that they end with exactly one blank line.
    The mkdocs-bibtex plugin breaks bibliography footnotes on pages that end
    with a table and no trailing blank line.
    """
    print("Fixing markdown files ending with a table but missing trailing newlines...")
    repaired = []
    for page in sorted(DOCS_PATH.rglob("*.md")):
        text = page.read_text(encoding="utf-8")
        if not text:
            continue
        nonblank_lines = [ln for ln in text.splitlines() if ln.strip()]
        # A page "ends with a table" when its final non-blank line starts with a pipe.
        ends_with_table = bool(nonblank_lines) and nonblank_lines[-1].strip().startswith(
            "|"
        )
        if not ends_with_table or text.endswith("\n\n"):
            continue
        # Normalise whatever newlines trail the content to exactly two.
        page.write_text(text.rstrip("\n") + "\n\n", encoding="utf-8")
        repaired.append(page)
    if not repaired:
        print("No files needed fixing.")
        return
    print(f"Fixed {len(repaired)} file(s):")
    for page in repaired:
        print(f" {page}")
@task
def check_local_links(c):
    """
    Run lychee in offline mode over the source package and the docs.
    """
    print("Checking offline links with lychee...")
    targets = f"{SRC_PATH} {DOCS_PATH}"
    cmd = f"lychee --exclude {FIGS_PATTERN} --offline {targets}"
    print(f"running {cmd}")
    c.run(cmd)
@task
def lint_actions(c):
    """
    Lint the GitHub Actions workflows with actionlint.
    """
    print("Linting github actions...")
    linter_cmd = "actionlint"
    print(f"running {linter_cmd}")
    c.run(linter_cmd)
@task(
    pre=[
        lintfix,
        format,
        spellcheck_docs,
        spellcheck_src,
        check_local_links,
        fix_table_trailing_newlines,
        checkimports,
        # fix_init_files,
        typecheck,
        lint_actions,
        test,
    ]
)
def green(c):
    """
    Run the full chain of fixers and checks listed as pre-tasks.

    All the work is done by the pre-tasks; the task body is intentionally empty.
    """
@task
def install_r_packages(c):
    """
    Install the R packages (e.g. TwoSampleMR and LAVA) via the pixi tasks
    ``install-mr`` and ``install-lava``, in that order.
    """
    for pixi_task in ("install-mr", "install-lava"):
        c.run(f"pixi r {pixi_task}", pty=True)
### Figures and Documentation
@task()
def pfig(c):
    """
    Populate the _figs directory by pulling figures from github.
    """
    cmd = f"python {PULL_FIGURE_SCRIPT_PATH}"
    c.run(cmd)
@task()
def publish_figures(c):
    """
    Run the end-to-end figure-system workflow script.

    Per the original description, the script generates any figures missing
    from ALL_FIGURE_TASKS, prunes orphan manifest entries (after confirming
    no docs still reference them), then updates the manifest and pushes new
    blobs to the GitHub release.
    """
    cmd = f"python {PUBLISH_FIGURES_SCRIPT_PATH}"
    c.run(cmd, pty=True)
@task
def serve_docs(
    c,
    strict: bool = True,
    include_authors: bool = False,
    enable_api_autonav: bool = False,
):
    """
    Serve the documentation locally with mkdocs.

    strict: append ``--strict`` to the mkdocs command.
    include_authors: load the GitHub token and set ``HAVE_GH_TOKEN``.
    enable_api_autonav: set ``ENABLE_API_AUTONAV``.
    """
    # Environment flags must be in place before mkdocs is launched.
    if include_authors:
        _set_gh_token()
        _set_use_gh_token()
    if enable_api_autonav:
        _set_enable_api_autonav()
    strict_flag = " --strict" if strict else ""
    cmd = " mkdocs serve" + strict_flag
    print("Serving documentation...")
    print(f"running {cmd}")
    c.run(cmd, pty=True)
@task(
    pre=[
        pfig,
    ]
)
def sdocs(c, include_authors: bool = False, enable_api_autonav: bool = False):
    """
    Pull figures (via the ``pfig`` pre-task), then serve the docs.
    """
    forwarded_options = {
        "include_authors": include_authors,
        "enable_api_autonav": enable_api_autonav,
    }
    serve_docs(c, **forwarded_options)
@task
def build_docs(c):
    """
    Build the documentation with mkdocs in strict mode.
    """
    build_cmd = "mkdocs build --strict"
    print(f"running {build_cmd}")
    c.run(build_cmd)
# initialization
@task(pre=[install_r_packages, pfig, green])
def init(c):
    """
    Initial repo setup.

    The pre-tasks install the R packages, pull figures, and run ``green``;
    the task body itself does nothing.
    """