|
with BUILDS_INPROGRESS.track_inprogress(): |
|
done = False |
|
failed = False |
|
|
|
def _check_result(future): |
|
nonlocal done |
|
nonlocal failed |
|
try: |
|
r = future.result() |
|
app_log.debug("task completed: %s", r) |
|
except Exception: |
|
app_log.error("task failed", exc_info=True) |
|
done = True |
|
failed = True |
|
# TODO: Propagate error to front-end |
|
|
|
build_starttime = time.perf_counter() |
|
pool = self.settings["build_pool"] |
|
# Start building |
|
submit_future = pool.submit(build.submit) |
|
submit_future.add_done_callback(_check_result) |
|
IOLoop.current().add_callback(lambda: submit_future) |
|
|
|
log_future = None |
|
|
|
# initial waiting event |
|
await self.emit( |
|
{ |
|
"phase": "waiting", |
|
"message": "Waiting for build to start...\n", |
|
} |
|
) |
|
|
|
while not done: |
|
progress = await q.get() |
|
# FIXME: If pod goes into an unrecoverable stage, such as ImagePullBackoff or |
|
# whatever, we should fail properly. |
|
if progress.kind == ProgressEvent.Kind.BUILD_STATUS_CHANGE: |
|
phase = progress.payload.value |
|
if progress.payload == ProgressEvent.BuildStatus.PENDING: |
|
# nothing to do, just waiting |
|
continue |
|
elif progress.payload == ProgressEvent.BuildStatus.BUILT: |
|
if build_only: |
|
message = "Done! Image built\n" |
|
phase = "ready" |
|
else: |
|
message = "Built image, launching...\n" |
|
event = { |
|
"phase": phase, |
|
"message": message, |
|
"imageName": image_name, |
|
} |
|
BUILD_TIME.labels(status="success").observe( |
|
time.perf_counter() - build_starttime |
|
) |
|
BUILD_COUNT.labels( |
|
status="success", **self.repo_metric_labels |
|
).inc() |
|
done = True |
|
elif progress.payload == ProgressEvent.BuildStatus.RUNNING: |
|
# start capturing build logs once the pod is running |
|
if log_future is None: |
|
log_future = pool.submit(build.stream_logs) |
|
log_future.add_done_callback(_check_result) |
|
continue |
|
elif progress.payload == ProgressEvent.BuildStatus.BUILT: |
|
# Do nothing, is ok! |
|
continue |
|
elif progress.payload == ProgressEvent.BuildStatus.FAILED: |
|
event = {"phase": phase} |
|
elif progress.payload == ProgressEvent.BuildStatus.UNKNOWN: |
|
event = {"phase": phase} |
|
else: |
|
raise ValueError( |
|
f"Found unknown phase {phase} in ProgressEvent" |
|
) |
|
elif progress.kind == ProgressEvent.Kind.LOG_MESSAGE: |
|
# The logs are coming out of repo2docker, so we expect |
|
# them to be JSON structured anyway |
|
event = progress.payload |
|
payload = json.loads(event) |
|
if payload.get("phase") in ("failure", "failed"): |
|
failed = True |
|
BUILD_TIME.labels(status="failure").observe( |
|
time.perf_counter() - build_starttime |
|
) |
|
BUILD_COUNT.labels( |
|
status="failure", **self.repo_metric_labels |
|
).inc() |
|
await self.emit(event) |
As reported downstream on jupyterhub/mybinder.org-deploy#3594, the
binderhub_inprogress_builds/BUILDS_INPROGRESSmetric is not working as expected.BUILDS_INPROGRESSis defined inbinderhub/binderhub/builder.py
Line 55 in 5532677
client_pythonGauge()can go up and down but from the observed Grafana visualisation, it is only going up.track_inprogress()is used inbinderhub/binderhub/builder.py
Lines 507 to 597 in 5532677
client_pythondocumentation.My search pointed me that the cause of this problem could be
withblock completesawaitinstruction@yuvipanda and @manics what does your experience says?
Should we replace the
BUILDS_INPROGRESSwith a downstream calculation ofBUILDS_STARTED - BUILDS_COMPLETED - BUILDS_FAILED?