Compare commits
84 Commits
mirror-pha
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| ff09c4075b | |||
| 63befd05b0 | |||
| 802b2792a7 | |||
| 0264af72c7 | |||
| 8945d13674 | |||
| f5119a9703 | |||
| 49fb818c60 | |||
| 12318582aa | |||
| 76a4b6b3fa | |||
| 6060086c01 | |||
| 9987fba4b6 | |||
| 74ed24053d | |||
| 2894778810 | |||
| 536a3595b9 | |||
| 0684576d74 | |||
| fa9a89bcf8 | |||
| 374371966f | |||
| b1bca1a745 | |||
| 4f6c9554b7 | |||
| 96ba67a63f | |||
| 139e319d7e | |||
| b6e12ef428 | |||
| 2173894f07 | |||
| e392c73cbc | |||
| 3180ae1355 | |||
| 9d82a02026 | |||
| bbc2bafbcb | |||
| b7a009c1fc | |||
| e1c4198c08 | |||
| 56723ae0ec | |||
| dfa5c8b9ee | |||
| bb5eb3d3aa | |||
| 83a6c6e157 | |||
| 8b9033f3d6 | |||
| e8e52cf4c6 | |||
| d3fe9e26bb | |||
| 84d90fb655 | |||
| c51692b57e | |||
| ffcf441364 | |||
| 2080d734d3 | |||
| 91d3cc7e99 | |||
| f98b444559 | |||
| 17ebdf39ac | |||
| 08b629f52a | |||
| b302f3ab63 | |||
| b492f995bd | |||
| e350c94c3f | |||
| 45afccbef5 | |||
| 48d03d8405 | |||
| 5b65c6caa3 | |||
| 157d06dc77 | |||
| e6d55b53c7 | |||
| 79c652ddd3 | |||
| 68ef0f84fb | |||
| c828f6cdd0 | |||
| c0df77d0d9 | |||
| 9a7772563a | |||
| 1ba0d961a3 | |||
| e76d4005ab | |||
| c32e6105d0 | |||
| c51cd84159 | |||
| f5a6f7196f | |||
| a78ec2de12 | |||
| ef65d898ed | |||
| 0dea3410ee | |||
| 117028ff0a | |||
| c90cf1e1d0 | |||
| 49a56e873e | |||
| f2fa38df6f | |||
| 31b71f9949 | |||
| 9449b22f24 | |||
| 74364d0a46 | |||
| c7ede9cfbb | |||
| 3b7267cbee | |||
| 090724ec80 | |||
| 3859cd7f40 | |||
| cf405b4195 | |||
| 3dd06ef0ce | |||
| b268a14cad | |||
| a2a6eea757 | |||
| 464760ebb7 | |||
| fd3db37c49 | |||
| 91a7088f56 | |||
| f202c5aa7f |
40
.drone.yml
40
.drone.yml
@ -35,10 +35,12 @@ steps:
|
||||
# the comment-bridge). Deploys the recipe at the PR head, runs install/upgrade/backup + any
|
||||
# recipe-local tests via the shared harness, then guarantees teardown (plan §4.2/§4.3).
|
||||
#
|
||||
# Resource safety (plan §4.2/§4.3): MAX_TESTS=DRONE_RUNNER_CAPACITY=1 (nix/modules/drone-runner.nix) is
|
||||
# the primary concurrency cap; concurrency.limit below is a redundant belt. CCCI_JANITOR_MAX_AGE=0
|
||||
# makes the run-start janitor reap ANY orphaned run app before deploying — safe because capacity=1
|
||||
# means no concurrent run exists (a SIGKILL'd/timed-out build leaves an orphan with no teardown).
|
||||
# Resource safety (plan §4.2/§4.3): DRONE_RUNNER_CAPACITY=2 (nix/modules/drone-runner.nix, the
|
||||
# single concurrency knob) allows two recipe runs in parallel. Concurrent-run safety is enforced by
|
||||
# the harness, not by serialisation: every run holds an exclusive flock on its app domain
|
||||
# (/run/lock/cc-ci-app-<domain>.lock) for its whole process lifetime, the run-start janitor probes
|
||||
# that lock to reap only orphans (held lock = live run, never touched), and recipe working trees
|
||||
# are per-run ($ABRA_DIR/recipes — no shared checkout, no recipe lock). See docs/concurrency.md.
|
||||
kind: pipeline
|
||||
type: exec
|
||||
name: recipe-ci
|
||||
@ -51,21 +53,37 @@ trigger:
|
||||
event:
|
||||
- custom
|
||||
|
||||
concurrency:
|
||||
limit: 1
|
||||
# NB deliberately NO `concurrency.limit` here: DRONE_RUNNER_CAPACITY (nix/modules/drone-runner.nix
|
||||
# maxTests) is the single concurrency knob (P4 — two knobs in two files drifted).
|
||||
|
||||
steps:
|
||||
- name: ci
|
||||
environment:
|
||||
STAGES: install,upgrade,backup,restore,custom
|
||||
CCCI_JANITOR_MAX_AGE: "0"
|
||||
# The exec runner points HOME at a per-build workspace; force it to /root so abra finds its
|
||||
# server config + recipes under /root/.abra (as the manual M4/M5 runs did). Safe: capacity=1
|
||||
# means no concurrent build shares /root/.abra.
|
||||
# The exec runner points HOME at a per-build workspace; force it to /root so abra's server
|
||||
# config is found via the per-run ABRA_DIR's servers/ symlink -> /root/.abra/servers.
|
||||
# Recipe trees are PER-RUN ($ABRA_DIR/recipes, exported by run_recipe_ci before any abra
|
||||
# call), so concurrent builds never share a recipe checkout; app .env files are per-domain
|
||||
# in the shared canonical servers/ path, guarded by the app-domain flock.
|
||||
HOME: /root
|
||||
commands:
|
||||
# RECIPE/REF/PR/SRC (+ CCCI_QUICK for `!testme --quick`) are injected as env vars from the
|
||||
# build's custom params. CCCI_QUICK=1 makes run_recipe_ci take the opt-in fast lane (WC7);
|
||||
# absent => full cold (default). run_quick ignores STAGES (always upgrade+custom).
|
||||
- 'echo "recipe-ci: RECIPE=$RECIPE REF=$REF PR=$PR SRC=$SRC stages=$STAGES quick=${CCCI_QUICK:-0}"'
|
||||
- cc-ci-run runner/run_recipe_ci.py
|
||||
# P1 lock-lifetime hardening: run the harness in its own session/process group (setsid) and
|
||||
# forward a drone cancel (TERM to this step shell) to the WHOLE group, so the harness's
|
||||
# SIGTERM handler runs its teardown funnel instead of being leaked (the exec runner kills
|
||||
# only the step shell, not the tree). PDEATHSIG inside the harness backstops the case where
|
||||
# this shell dies without the trap firing. The harness exit code is captured explicitly and
|
||||
# the traps cleared before exiting: the runner shell is `set -e`, and an EXIT-trap kill of
|
||||
# the already-gone process group returns ESRCH, which otherwise poisons a GREEN run's exit
|
||||
# status to 1 (observed live, build 269: all tiers pass, step exit 1).
|
||||
- |
|
||||
setsid cc-ci-run runner/run_recipe_ci.py &
|
||||
PID=$!
|
||||
trap 'kill -TERM -- "-$PID" 2>/dev/null || true' TERM EXIT
|
||||
rc=0
|
||||
wait "$PID" || rc=$?
|
||||
trap - TERM EXIT
|
||||
exit "$rc"
|
||||
|
||||
68
BACKLOG-conc.md
Normal file
68
BACKLOG-conc.md
Normal file
@ -0,0 +1,68 @@
|
||||
# BACKLOG — sub-phase conc
|
||||
|
||||
## Build backlog
|
||||
|
||||
- [x] P1 lock-lifetime hardening: prctl PDEATHSIG + ppid race check + SIGTERM handler →
|
||||
teardown funnel + signal.alarm(3600) hard deadline; .drone.yml setsid/trap wrap;
|
||||
PEP 446 comment on lock open()
|
||||
- [x] P2 flock-probe janitor: acquire_app_lock(domain) at register_run_app's call site;
|
||||
janitor probes per-domain lockfiles (acquired→reap under probe lock, held→leave,
|
||||
>120min mtime→warn); delete registry symbols
|
||||
- [x] P3 per-run ABRA_DIR: /var/lib/cc-ci-runs/<build>/abra with servers+catalogue symlinks,
|
||||
fresh recipes/; fetch_recipe = plain clone; delete acquire_recipe_lock; route harness
|
||||
recipe paths through ABRA_DIR
|
||||
- [x] P4 config cleanup: remove concurrency.limit from .drone.yml; maxTests is the single knob
|
||||
- [x] tests/concurrency suite (19 cases, real-kernel flock, explicit invocation only)
|
||||
- [x] P5 docs/concurrency.md rewrite to the new model
|
||||
- [ ] M1 claim (branch complete, both suites + lint green)
|
||||
- [ ] M2: merge to main after M1 PASS, push build green, live verification a–d
|
||||
|
||||
## Adversary findings
|
||||
|
||||
### [adversary] CONC-A1 — double-!testme same domain corrupts the shared deploy-count file (M2(c) FAIL)
|
||||
|
||||
**Severity:** blocks M2(c). Both runs of a same-domain double-!testme go RED.
|
||||
|
||||
**Root cause (two coupled defects, one shared root):**
|
||||
1. The DG4.1 deploy-counter file is keyed by DOMAIN in the *shared* system tempdir, NOT per-run:
|
||||
`run_recipe_ci.py:930 countfile = /tmp/ccci-deploys-<domain>`. P3 isolated `ABRA_DIR` per run
|
||||
but this per-run state file was missed — it predates the restructure (ef44d46) and the OLD
|
||||
recipe-flock used to serialize same-recipe runs end-to-end, incidentally masking it.
|
||||
2. `lifecycle.deploy_app()` calls `_record_deploy()` (lifecycle.py:250) BEFORE
|
||||
`acquire_app_lock(domain)` (lifecycle.py:254, introduced by P2 b302f3a). So the counter
|
||||
increment happens OUTSIDE the serialization window — a second same-domain run bumps the
|
||||
shared counter before it ever blocks on the lock.
|
||||
|
||||
**Observed (live, builds 279 + 281, immich PR#2, same domain immi-ad3e33, 2026-06-10T05:04Z):**
|
||||
- Lock serialization itself WORKS: 281 logged `== app lock: ... in flight — waiting ==` at 2s,
|
||||
then `== app lock: acquired ==` at 194s — exactly when 279 exited (279 finished 05:07:35).
|
||||
- 279 RED: `!! deploy-count 2 != 1 (DG4.1 violation)`. The `2` = 281's pre-lock `_record_deploy`
|
||||
(fired ~2s, before 281 blocked) polluting the shared counter 279 was actively using.
|
||||
- 281 RED: `FileNotFoundError: /tmp/ccci-deploys-immi-ad3e33...` at run_recipe_ci.py:1213 —
|
||||
279's end-of-run `os.remove(countfile)` (line 1215) deleted the shared file out from under 281,
|
||||
whose single `_record_deploy` had already fired at 2s and never recreates it.
|
||||
- Control: isolated immich (build 275, same fixed wrapper) → `deploy-count = 1`, GREEN. So this
|
||||
is concurrency-specific, not a pre-existing immich/wrapper issue.
|
||||
|
||||
**Repro:** two `!testme` comments on the same recipe PR (same domain) in quick succession on the
|
||||
deployed main harness → both builds RED (one DG4.1 false-violation, one FileNotFoundError).
|
||||
|
||||
**Fix direction (Builder owns):** key the deploy-counter per RUN, not per domain — e.g. put it in
|
||||
`/var/lib/cc-ci-runs/<build>/` (alongside the per-run artifacts) or include the build/run id in the
|
||||
filename, and export that path via `CCCI_DEPLOY_COUNT_FILE`. Per-run keying fixes BOTH defects at
|
||||
once (no cross-run pollution; no shared remove). Moving `_record_deploy()` after `acquire_app_lock`
|
||||
alone is INSUFFICIENT — the shared `os.remove`/`FileNotFoundError` collision survives. Add a
|
||||
tests/concurrency case: two same-domain runs serialized on the app lock → each sees its own
|
||||
deploy-count, neither removes the other's file (this is the gap vs the 19 planned cases — case 4
|
||||
serialises acquire but never asserts deploy-count isolation across the two).
|
||||
|
||||
**Closure:** adversary-owned. Re-test the (c) double-!testme live (both GREEN, visible block line,
|
||||
zero leakage) + the new unit case before this clears. Only I close it.
|
||||
|
||||
**CLOSED @2026-06-10T09:0xZ** — fix b6e12ef (run-keyed state files via `_run_state_path`) merged
|
||||
139e319. Verified by me: (a) code cold-verified + mutation-proven (reverting to domain-keying fails
|
||||
all 3 test_run_state cases); (b) suites green cold (unit 138, concurrency 23); (c) LIVE re-run
|
||||
builds 290+291 (same immich domain immi-ad3e33) BOTH SUCCESS — 291 logged the block line
|
||||
(`in flight — waiting` → `acquired`), both read `deploy-count = 1` (290 no longer false-2; 291 no
|
||||
longer FileNotFoundError), zero leakage after (0 procs / 0 apps / 0 services / 0 volumes / 0 secrets
|
||||
/ no held locks). Full evidence in REVIEW-conc M2(c) PASS.
|
||||
23
BACKLOG-rcust.md
Normal file
23
BACKLOG-rcust.md
Normal file
@ -0,0 +1,23 @@
|
||||
# BACKLOG — sub-phase rcust
|
||||
|
||||
## Build backlog
|
||||
|
||||
- [ ] P1.1 `runner/harness/meta.py`: KEYS registry (14 keys + 3 deprecated) + `load(recipe) -> RecipeMeta`
|
||||
- [ ] P1.2 migrate readers L1–L6 to `meta.load()` (orchestrator loads once, passes down)
|
||||
- [ ] P1.3 mumble private constants → underscore-prefixed (`_WELCOME_TEXT_MARKER`, `_MAX_USERS`) + fix importers
|
||||
- [ ] P1.4 `tests/unit/test_meta.py` (all-recipes-load-clean, MetaError cases, defaults, R2 proof)
|
||||
- [ ] P1.5 `scripts/gen-meta-docs.py` + doc-sync unit test
|
||||
- [ ] P2a compose.ccci.yml first-class (auto-copy + auto-chaos); strip ghost/discourse boilerplate
|
||||
- [ ] P2b install-time deps only; migrate lasuite-docs; delete setup_custom_tests.sh machinery
|
||||
- [ ] P2c SKIP_GENERIC meta key deleted; env form documented dev-only + loud warning in CI runs
|
||||
- [ ] P2d conftest cleanup: delete deployed/deployed_app (+app_domain if unused); consolidate deps fixture; migrate 6 lasuite test files
|
||||
- [ ] P3 HookCtx + convert all hook call sites + migrate in-repo users + unit tests
|
||||
- [ ] P4 discovery placement rule + op_state/deps fixtures + migrate hand-parsers
|
||||
- [ ] P5 customization manifest (print block + results.json key) + unit tests
|
||||
- [ ] P6 docs rewrite (recipe-customization.md §8, testing.md, enroll-recipe.md)
|
||||
- [ ] M1 pre-claim: run `pytest tests/concurrency -q` once to prove untouched
|
||||
- [ ] M2 prep: build baseline matrix (21 recipe dirs, expected outcomes) BEFORE merging — commit to STATUS-rcust.md
|
||||
|
||||
## Adversary findings
|
||||
|
||||
(Adversary-owned section)
|
||||
165
JOURNAL-conc.md
Normal file
165
JOURNAL-conc.md
Normal file
@ -0,0 +1,165 @@
|
||||
# JOURNAL — sub-phase conc (Builder, append-only)
|
||||
|
||||
## 2026-06-10 — bootstrap
|
||||
|
||||
Read concurrency-restructure-full-plan.md (SSOT) + plan.md §6.1/§7/§9. Oriented on the code:
|
||||
|
||||
- `runner/harness/lifecycle.py` — recipe flock (l.46), registry (l.65–97), deploy_app
|
||||
registration (l.283), teardown unregister (l.723), three-way janitor (l.726).
|
||||
- `runner/run_recipe_ci.py` — `acquire_recipe_lock` call site (l.843), `fetch_recipe` (l.140,
|
||||
rm-rf + reclone of the shared tree), janitor call sites (l.600 quick, l.932 cold).
|
||||
- `.drone.yml` — recipe-ci step runs `cc-ci-run runner/run_recipe_ci.py` bare (P1 wraps it),
|
||||
`concurrency.limit: 2` (P4 removes).
|
||||
- Greps for P3 fallout: `~/.abra/recipes` referenced in abra.py (recipe_checkout,
|
||||
has_lightweight_version_tags, recipe_head_commit, recipe_versions), generic.py:28,
|
||||
lifecycle.prepull_images, run_recipe_ci (fetch_recipe, snapshot_recipe_tests, comment),
|
||||
warm_reconcile.py:202 (runs OUTSIDE per-run context — keeps default), and
|
||||
tests/ghost+discourse install_steps.sh (`${HOME}/.abra/recipes/...` — these run INSIDE a
|
||||
run and copy compose.ccci.yml into the deploy tree, so they must resolve the per-run dir).
|
||||
- `~/.abra/servers/...` paths are unaffected by design (servers/ is symlinked to the canonical
|
||||
/root/.abra/servers, so both resolutions land on the same file).
|
||||
|
||||
Working setup: state files on main in this clone; code on branch `restructure/concurrency`
|
||||
via a git worktree at ../cc-ci-conc; test runs on the cc-ci host via /root/builder-clone
|
||||
(`cc-ci-run -m pytest ...`, `nix develop .#lint`).
|
||||
|
||||
## 2026-06-10 — P1–P4 landed on restructure/concurrency
|
||||
|
||||
- P1 b492f99: harness/lifetime.py (PDEATHSIG+ppid recheck, SIGTERM/SIGALRM→SystemExit funnel
|
||||
with re-entrancy guard, alarm(3600)); main() installs first; both finally blocks mark
|
||||
begin_teardown(); .drone.yml setsid+trap wrap. Live smoke on cc-ci (cc-ci-run /tmp/p1-smoke.py):
|
||||
TERM→rc=143+finally; ALRM→rc=142+finally+deadline log; parent-kill→child TERM'd, teardown ran.
|
||||
- P2 b302f3a: acquire_app_lock + _probe_and_reap + janitor rewrite; registry deleted. Live smoke
|
||||
(/tmp/p2-smoke*.py): held lock → "live concurrent run, leaving it", reaped=[]; killed holder →
|
||||
reap exactly once + lockfile unlinked; waiter blocked during probe-held reap, then re-acquired
|
||||
on the FRESH inode (probe confirmed held by waiter). Note: a select()-on-fd readline artifact
|
||||
in my smoke script initially looked like a failure — kernel state was verified directly.
|
||||
Unlink/recreate race guarded on BOTH sides via fstat/stat st_ino identity checks.
|
||||
- P3 17ebdf3: per-run ABRA_DIR. Verified abra CLI honors $ABRA_DIR on-host (skeleton probe:
|
||||
FATAs only on empty servers/; with servers+catalogue symlinks + recipes/ it works and even
|
||||
auto-clones recipes for `app ls` resolution into the per-run dir). p3-smoke: setup + fetch of
|
||||
custom-html-tiny landed in /tmp/p3runs/9999/abra/recipes, head commit + versions readable via
|
||||
abra.recipe_dir(). install_steps.sh path fix justified in DECISIONS.md (conc P3 entry).
|
||||
Pre-existing observation (NOT mine, unchanged): `abra app ls -S -m -n` currently FATAs
|
||||
"unable to resolve '0cc57a5a'" under the DEFAULT abra dir too → janitor's abra discovery
|
||||
yields [] and the docker-service sweep carries discovery. Out of this phase's scope.
|
||||
- P4 91d3cc7: concurrency.limit removed; maxTests comment states single-knob + new model.
|
||||
One stale comment line (.drone.yml l.39 "concurrency.limit=2 below") folds into P5.
|
||||
|
||||
All four commits: tests/unit 138 passed + lint PASS before each. Next: tests/concurrency suite.
|
||||
|
||||
## 2026-06-10 — tests/concurrency (84d90fb) + P5 (d3fe9e2) + M1 claim (e8e52cf)
|
||||
|
||||
- Suite: 20 tests / 19 plan cases, all real-kernel (helpers.py subprocesses hold real flocks,
|
||||
install real prctl/alarm guards; CCCI_APP_LOCK_DIR sandboxes /run/lock; HelperPool reaps every
|
||||
helper + recorded grandchildren). First full run on cc-ci: 20 passed in 9.96s, zero flakes in
|
||||
3 repeat runs during the P5 verification re-runs.
|
||||
- Design notes for the Adversary's blind-spot hunt (my own known limits):
|
||||
- case 8 (two janitors) uses threads in one process — valid because flock conflicts are
|
||||
per-open-file-description, and overlap is forced via a Barrier + 2s slow teardown stub.
|
||||
- case 14 relies on reparent-to-pid-1 (true on the cc-ci host; would need adjustment in a
|
||||
subreaper environment — marked NEVER_REPARENTED visibly if so).
|
||||
- cases 5-12 stub teardown_app (recording) — janitor probe/reap ordering is what's under
|
||||
test, not teardown internals (covered by Phase-1 e2e + M2 live checks).
|
||||
- M1 claimed at e8e52cf; full verification recipe in STATUS-conc.md (WHAT/WHERE/HOW/EXPECTED).
|
||||
|
||||
## 2026-06-10 — M2: merge + live verification (a)
|
||||
|
||||
- Merge: bb5eb3d (--no-ff) pushed; push build 266 (self-test lint+hello) SUCCESS.
|
||||
- (a) cancel-mid-run: !testme on immich#2 → build 267 (custom) running on the NEW harness —
|
||||
log shows the setsid/trap wrap + "== per-run ABRA_DIR: /var/lib/cc-ci-runs/267/abra ==";
|
||||
lock /run/lock/cc-ci-app-immi-ad3e33...lock held by pid 636902; 4 immich services up.
|
||||
Canceled via drone API 04:42:07Z (HTTP 200, build status "killed"). Result: harness pid
|
||||
GONE (no leaked python — the old §8.1 gap is closed), immich services 0, volumes 0,
|
||||
secrets 0, .env 0 — the SIGTERM funnel ran the run's own teardown (better than the plan's
|
||||
minimum, which allowed the janitor to do the reaping). Lock RELEASED (lockfile present but
|
||||
unheld — tidy-swept by the next janitor, to be observed during (b)).
|
||||
- (b) triggered 04:46:53Z: !testme immich#2 (comment 14287) + plausible#3 (14288) in parallel.
|
||||
|
||||
## 2026-06-10 — M2(b) round 1: green runs, poisoned exit code → wrapper fix
|
||||
|
||||
- Builds 268 (immich#2) + 269 (plausible#3) ran in PARALLEL on the new harness: both logs end
|
||||
with all-tiers-pass RUN SUMMARY (level=4, deploy-count 1/1) and the host shows ZERO leakage
|
||||
after (no harness processes, no immi/plau services/volumes/secrets, only unheld lockfiles).
|
||||
Both steps nevertheless exited 1: the P1 EXIT trap's kill of the already-gone process group
|
||||
returns ESRCH under the runner's `set -e` shell — a GREEN run reported failure.
|
||||
- Reproduced minimally on-host (`sh -e` and `bash -e`: rc=1 on a clean exit with the old trap).
|
||||
Fix e1c4198 (capture rc; `trap - TERM EXIT`; `|| true` on the trap kill) verified on-host:
|
||||
green rc=0, red rc=7 propagated, TERM→wrapper forwards to child, exits 143. Merged to main
|
||||
b7a009c; push builds 272-274 green. Adversary notified via inbox.
|
||||
- (b) re-triggered on the fixed wrapper 04:56:10Z (immich#2 + plausible#3).
|
||||
|
||||
## 2026-06-10 — M2(b) PASS + (c) triggered
|
||||
|
||||
- (b) round 2 on fixed wrapper: builds 275 (immich#2) + 276 (plausible#3) ran in PARALLEL,
|
||||
BOTH status=success (drone API). Host after: 0 python harness processes, 0 immi/plau
|
||||
services/volumes/secrets/.envs — zero leakage. (d) satisfied by 275 (full green immich e2e).
|
||||
Leftover unheld lockfiles present by design (tidy-swept at next janitor).
|
||||
- (c) double-!testme on immich#2: two comments at 05:03:58Z → two custom builds, same run
|
||||
domain immi-ad3e33 → exactly one must block on the app lock with the visible log line.
|
||||
|
||||
## 2026-06-10 — CONC-A1: (c) failure root-caused + fixed (run-keyed state files)
|
||||
|
||||
- (c) round 1 = builds 279+281, both RED. Root cause (independently also found+filed by the
|
||||
Adversary as CONC-A1 while I was mid-diagnosis — same conclusion from both loops): the four
|
||||
run-scoped state files (deploys/opstate/deps/depskip) were DOMAIN-keyed in shared /tmp;
|
||||
281's main()-preamble + pre-lock _record_deploy fired before it blocked on the app lock →
|
||||
279 read deploy-count 2 (false DG4.1 RED); 279's end-of-run os.remove deleted the shared
|
||||
countfile → 281 crashed FileNotFoundError at its own read. Lock serialization itself worked
|
||||
(281: waiting @+2s, acquired @+194s = 279's exit). Masked pre-restructure by the
|
||||
end-to-end recipe flock.
|
||||
- Fix b6e12ef on branch, merged to main 139e319: _run_state_path() keys all four by
|
||||
run id + harness pid; consumers were always env-fed (CCCI_*_FILE), so domain keying was
|
||||
never load-bearing. Both cleanup sites already remove all four on normal exit.
|
||||
- New tests/concurrency/test_run_state.py (suite now 23): path invariants + real-process
|
||||
CONC-A1 interleaving via helpers.py `deploy-count-run` (countfile init → pre-lock
|
||||
_record_deploy → acquire → gated read). Teeth verified: under simulated shared keying the
|
||||
regression test FAILS (host run: 3 failed); with the fix: 23 passed + 138 unit + lint PASS.
|
||||
- Next: push build green → re-run (b)+(d), then (c), then (a) per the VETO's conditions.
|
||||
|
||||
## 2026-06-10 — M2 re-verification on CONC-A1-fixed main (139e319)
|
||||
|
||||
- Push builds 283/284/285 (branch fix, merge, inbox) all green.
|
||||
- (b)+(d) round 3 (comments 14299/14300, 08:17:35Z): builds 287 (immich#2) + 288 (plausible#3)
|
||||
BOTH success, started simultaneously 08:17:40Z (parallel), finished 08:21:06/08:21:13.
|
||||
Both logs: deploy-count = 1 (expect 1), level=4. Host after: pgrep -f 'run_recipe_c[i]' → no
|
||||
match (earlier "2" was pgrep self-match of the ssh cmdline); immi/plau services/volumes/
|
||||
secrets/server-envs all 0. Zero leakage. (d) satisfied by 287 (full green immich e2e on the
|
||||
final harness code).
|
||||
- (c) round 2 triggered 08:22:13Z: comments 14303+14304 on immich#2 (same domain immi-ad3e33).
|
||||
|
||||
## 2026-06-10 — M2(c) PASS round 2 (builds 290+291) + (a) re-run triggered
|
||||
|
||||
- (c) round 2: builds 290 (08:22:30→08:46:05) + 291 (08:22:33→08:49:23) BOTH success.
|
||||
291 log: "== app lock: another run of immi-ad3e33... in flight — waiting ==" at +1s,
|
||||
"acquired" at +1411s = exactly 290's exit. Both: deploy-count = 1 (expect 1), level=4.
|
||||
Slowness was an immich-ML healthcheck flake (Adversary cross-confirmed live via lslocks:
|
||||
one holder pid 739163, one waiter pid 739341 on the same lock inode — serialization observed
|
||||
in the kernel lock table); ML converged inside the 1500s window, both runs green anyway —
|
||||
no clean re-run needed.
|
||||
- After both: no harness procs (pgrep run_recipe_c[i] empty), 0 immi/plau services/volumes/
|
||||
secrets/server-envs. Unheld lockfile remains by design (tidy-swept at next janitor probe).
|
||||
- (a) re-run on fixed harness: !testme immich#2 comment 14307 @08:50:02Z; will cancel mid-run
|
||||
via drone API once the deploy is in flight, then check pid/lock/leakage + janitor reap.
|
||||
|
||||
## 2026-06-10 — M2(a) re-run PASS (build 295) + M2 claim
|
||||
|
||||
- (a) on fixed harness: build 295 (comment 14307 @08:50:02Z) canceled @08:51:05Z (HTTP 200)
|
||||
while mid-deploy (lock held by pid 763099, 4 immich services converging). Harness pid GONE
|
||||
@08:51:15Z — the SIGTERM funnel ran the run's own teardown inside 10s; build status=killed;
|
||||
lock released (lslocks empty); services/volumes/secrets/envs all 0. Zero leakage, no janitor
|
||||
required.
|
||||
- Adversary lifted the CONC-A1 VETO @09:05Z with its own M2(c) PASS (290/291 cold-verified,
|
||||
kernel-lock-table serialization observation). Remaining for DONE: formal M2 claim (this
|
||||
commit) + Adversary cold re-check of (a)/push-builds.
|
||||
- M2 claimed in STATUS-conc.md with consolidated (a)-(d) evidence + cold re-check recipe.
|
||||
|
||||
## 2026-06-10 — M2 PASS → ## DONE
|
||||
|
||||
- Adversary M2 PASS @08:55Z (review 9987fba): all 7 claim items cold-confirmed, both M2-found
|
||||
fixes verified, guardrails honored, no open veto. Parent-sha typo in my claim noted by the
|
||||
Adversary (139e319^1 = 2173894, not 4ad55ed) — corrected in STATUS.
|
||||
- ## DONE written to STATUS-conc.md. Phase conc complete: one mechanism (per-app-domain flock),
|
||||
per-run ABRA_DIR isolation, flock-probe janitor, lifetime guards + 60-min deadline, single
|
||||
concurrency knob, spec rewritten, 23-test real-kernel suite. Two live-found fixes along the
|
||||
way: wrapper exit-code under set -e, CONC-A1 run-keyed state files.
|
||||
106
JOURNAL-rcust.md
Normal file
106
JOURNAL-rcust.md
Normal file
@ -0,0 +1,106 @@
|
||||
# JOURNAL — sub-phase rcust (Builder)
|
||||
|
||||
## 2026-06-10 bootstrap
|
||||
|
||||
Read phase plan (recipe-custom-restructure-full-plan.md), plan.md §6.1/§7/§9, and the reference
|
||||
spec docs/recipe-customization.md @ 76a4b6b in full. Created phase state files. Work branch will
|
||||
be `restructure/recipe-custom` off main @ 76a4b6b. Starting P1: reading the six current loaders
|
||||
(run_recipe_ci.py::_load_meta, conftest.py::_recipe_meta, lifecycle.py::_recipe_extra_env,
|
||||
lifecycle.py::_recipe_meta_flag, deps.py::declared_deps, canonical.py::is_canonical_enrolled)
|
||||
before writing harness/meta.py.
|
||||
|
||||
## 2026-06-10 P1 — single loader + registry (branch 472a68b)
|
||||
|
||||
Wrote runner/harness/meta.py: KEYS registry (14 keys + CHAOS_BASE_DEPLOY/OIDC_AT_INSTALL/
|
||||
SKIP_GENERIC kept registered as deprecated=True so P1 lands green before P2 deletes them),
|
||||
RecipeMeta generated from KEYS via dataclasses.make_dataclass (frozen; field set cannot drift from
|
||||
the registry), load() = the only exec() of recipe_meta.py, MetaError on unknown ALL-CAPS/type
|
||||
mismatch/callable-on-data-key, difflib suggestion in the unknown-key message. BACKUP_CAPABLE keeps
|
||||
its tri-state via default None (None = auto-detect — preserves the old `"BACKUP_CAPABLE" in meta`
|
||||
semantics in generic.backup_capable).
|
||||
|
||||
Migrations: orchestrator loads once + passes meta down (deploy_app/perform_upgrade/_perform_op/
|
||||
run_lifecycle_tier all take the object); conftest meta fixture returns full RecipeMeta (R3 closed);
|
||||
lifecycle._recipe_extra_env/_recipe_meta_flag and deps.declared_deps deleted; canonical.is_enrolled
|
||||
+ enrolled_recipes go through meta.load (tests monkeypatch meta.TESTS_DIR now instead of
|
||||
canonical.__file__); screenshot._load_screenshot_hook reads the attribute (R2 fixed — unit test
|
||||
proves SCREENSHOT survives the real orchestrator load path). deploy_app keeps an optional
|
||||
meta=None fallback (loads via the single loader) for fixture/manual callers — exec still happens
|
||||
in exactly one function.
|
||||
|
||||
Effective-value safety check before committing: dumped non_default() for all 21 recipe dirs through
|
||||
the new loader — every recipe's customized key set matches its recipe_meta.py source (e.g. mumble:
|
||||
DEPLOY_TIMEOUT/EXTRA_ENV/HEALTH_OK/READY_PROBE/UPGRADE_EXTRA_ENV). One intentional delta class:
|
||||
deps.deploy_deps' fallback timeouts for a MISSING dep meta change from literal 900/600 to loading
|
||||
the dep's real meta (orchestrator path always supplied metas, so CI behavior is identical).
|
||||
|
||||
Verified on cc-ci (rsynced working tree before committing):
|
||||
cc-ci-run -m pytest tests/unit -q -> 175 passed
|
||||
nix develop .#lint --command scripts/lint.sh -> lint: PASS
|
||||
Three pre-existing f212 unit tests passed dicts to wait_ready_probes — updated mechanically to
|
||||
construct RecipeMeta via dataclasses.replace (assertions untouched).
|
||||
|
||||
Next: P2a compose.ccci.yml first-class + auto-chaos.
|
||||
|
||||
## 2026-06-10 P2 — legacy keys & paths deleted (branch 8cd72fd)
|
||||
|
||||
P2a: lifecycle.provide_ccci_overlay copies tests/<recipe>/compose.ccci.yml into the per-run
|
||||
checkout (after install_steps hook, before prepull/deploy); pinned base deploys auto-chaos on
|
||||
overlay presence (has_ccci_overlay replaces the meta.CHAOS_BASE_DEPLOY elif). ghost/discourse
|
||||
install_steps.sh were copy-only -> deleted whole; their metas keep COMPOSE_FILE in EXTRA_ENV
|
||||
(unchanged wiring, the harness now owns the copy).
|
||||
|
||||
P2b: oidc_at_install condition removed — `if declared:` provisions before the single deploy,
|
||||
legacy post-deploy block + _run_setup_custom_tests_hook deleted. lasuite-docs install_steps.sh is
|
||||
the meet/drive hook with docs' exact env names (diffed against the deleted setup_custom_tests.sh:
|
||||
same keys incl. OIDC_OP_DISCOVERY_ENDPOINT + scopes 'openid email profile'; secret-insert bump
|
||||
identical; only the abra-redeploy step is gone — the single deploy reads the env instead).
|
||||
lasuite-drive's MinIO bucket one-shot -> ops.py pre_install (runs at install-tier start, post-
|
||||
deploy; bucket lives in the minio volume so it survives upgrade/restore; same scale --detach +
|
||||
30x3s poll as the shell version). run_quick: deps still provision (realm/creds), hook call gone —
|
||||
no quick-enrolled recipe declares DEPS today; noted inline.
|
||||
|
||||
P2c: SKIP_GENERIC out of the registry; _skip_generic(op) env-only; skip_generic_env_overrides()
|
||||
prints a `!!` warning when active under DRONE (P5 will embed in the manifest).
|
||||
|
||||
P2d: conftest deps fixture = dict of _DepEntry (dict subclass w/ attribute sugar) — the 6 lasuite
|
||||
files only ever used deps_creds, renamed param to deps, zero assertion changes. NOTE for Adversary:
|
||||
some assert MESSAGE strings ('setup_custom_tests should have populated this.' -> 'dep
|
||||
provisioning...') and docstrings updated — message text only, no assert logic/expected values.
|
||||
|
||||
Verified on cc-ci (rsync of working tree): cc-ci-run -m pytest tests/unit -q -> 175 passed;
|
||||
nix develop .#lint --command scripts/lint.sh -> PASS. Doc table regenerated to the 14-key registry
|
||||
(doc-sync unit test pins it).
|
||||
|
||||
Next: P3 — HookCtx + ctx-hook signatures everywhere.
|
||||
|
||||
## 2026-06-10 P3 — uniform ctx hook convention (branch fd02d9f)
|
||||
|
||||
HookCtx frozen dataclass + hook_ctx() constructor in harness/meta.py; ctx.deps read straight from
|
||||
$CCCI_DEPS_FILE (json, both shapes) — meta.py stays import-cycle-free (deps.py imports lifecycle
|
||||
which imports meta). Registry keys carry hook_params; meta.load() enforces the expected positional
|
||||
names per hook key (READY_PROBE/BACKUP_VERIFY/EXTRA_ENV/UPGRADE_EXTRA_ENV=(ctx,),
|
||||
SCREENSHOT=(page, ctx)); _run_pre_hook applies meta.check_hook_signature(fn, ("ctx",)) to ops.py
|
||||
hooks before calling. Conversion of 17 ops.py + 8 recipe_meta hooks was scripted (def-line regex +
|
||||
bare `domain` -> `ctx.domain` inside the pre_*/hook function bodies only) and diff-reviewed; the
|
||||
only manual fixes: keycloak pre_restore passed `meta` -> `ctx.meta`, and two comment lines in
|
||||
lasuite-drive/-meet metas that the regex over-replaced were restored. wait_ready_probes gained
|
||||
op= (install/upgrade call sites pass it) so probes can know the phase.
|
||||
|
||||
Verified on cc-ci: cc-ci-run -m pytest tests/unit -q -> 180 passed; lint PASS.
|
||||
|
||||
Next: P4 — discovery placement rule + op_state/deps fixtures + migrate hand-parsers.
|
||||
|
||||
## 2026-06-10 P4 — custom-test ergonomics (branch 29a28e2)
|
||||
|
||||
Pre-change sweeps confirmed the plan's zero-users claims: no top-level non-lifecycle test_*.py in
|
||||
any recipe dir; no recipe test file reads os.environ / CCCI_OP_STATE_FILE directly (the only
|
||||
op-state consumers are the generic assertions via harness.generic.op_state — harness-side, fine).
|
||||
So P4 = discovery glob removal + new op_state fixture + pinning tests; no test migrations needed.
|
||||
test_discovery.py's HC2 gate test moved its repo-local custom fixture under functional/ (the rule);
|
||||
test_discovery_phase2.py now asserts top-level custom is NOT discovered. op_state fixture skips
|
||||
(clear reason) when env unset / file missing / unparseable; tested via request.getfixturevalue.
|
||||
|
||||
Verified on cc-ci: cc-ci-run -m pytest tests/unit -q -> 184 passed; lint PASS.
|
||||
|
||||
Next: P5 — customization manifest (print block + results.json key).
|
||||
442
REVIEW-conc.md
Normal file
442
REVIEW-conc.md
Normal file
@ -0,0 +1,442 @@
|
||||
# REVIEW-conc.md — Adversary ledger, concurrency-restructure phase
|
||||
|
||||
Append-only. Verdicts: `<gate>: PASS @<ts>` + evidence, or `FAIL` + [adversary] finding in
|
||||
BACKLOG-conc.md. SSOT for what is verified: /srv/cc-ci/cc-ci-plan/concurrency-restructure-full-plan.md.
|
||||
|
||||
## 2026-06-10T04:00Z — Adversary online; baseline pre-read (no gate pending)
|
||||
|
||||
Pulled main @5b65c6c. No STATUS-conc.md, no `restructure/concurrency` branch — nothing claimed yet.
|
||||
Pre-read the CURRENT system (docs/concurrency.md @5b65c6c + lifecycle.py/run_recipe_ci.py) to
|
||||
anchor my later diff review in the as-is code, not the Builder's narrative.
|
||||
|
||||
Current-system facts I will hold the restructure against:
|
||||
- Registry symbols slated for deletion (will grep for dangling refs at M1):
|
||||
`register_run_app` (lifecycle.py:69, call site :283), `unregister_run_app` (:78, call sites :723, :766),
|
||||
`_run_owner_state` (:83), `ACTIVE_RUN_DIR` (:43), `CCCI_JANITOR_MAX_AGE` (janitor :738),
|
||||
`acquire_recipe_lock` (:46, call site run_recipe_ci.py:843), `RECIPE_LOCK_DIR` (:42).
|
||||
- Must survive untouched: `RUN_APP_RE` (lifecycle.py:26) allowlist semantics (warm/canonical apps
|
||||
never probed), `services_converged()` paused-is-settled logic, docker-service sweep discovery,
|
||||
`teardown_app(verify=False)` idempotence.
|
||||
- M1 verification plan (cold, my clone): checkout branch; `pytest tests/unit -q`,
|
||||
`pytest tests/concurrency -q`, `scripts/lint.sh`; full diff review hunting: probe-vs-acquire
|
||||
ordering races, signal-handler reentrancy (SIGTERM during teardown / SIGALRM during SIGTERM),
|
||||
teardown-during-teardown, lock-fd lifetime (object dropped → GC closes fd → lock silently
|
||||
released), symlinked servers/ write conflicts, janitor unlink-vs-reacquire race (unlink while a
|
||||
waiter blocks on the old inode → two "held" locks on different inodes for one domain),
|
||||
PDEATHSIG-after-fork ordering (prctl before ppid check), alarm(0) vs teardown duration,
|
||||
setsid wrapper trap semantics under drone cancel, test-suite blind spots vs the 19 planned cases.
|
||||
- Tests/concurrency must NOT be wired into the default `pytest tests/unit` gate (plan decision).
|
||||
- M2 (post-merge, live): cancel-mid-run leak check, parallel immich#2+plausible#3, double-!testme
|
||||
same PR blocks visibly, one full green run. NEVER merge/push recipe mirror repos.
|
||||
|
||||
No verdict yet — waiting for Builder bootstrap/claim.
|
||||
|
||||
## 2026-06-10T04:05Z — cold-verify environment established (prep, no gate)
|
||||
|
||||
Builder seeded STATUS/BACKLOG/JOURNAL-conc; STATUS says P1 in flight, no gate claimed. Mapped the
|
||||
test-execution environment I'll use for the M1 cold run so a time-sensitive gate isn't spent
|
||||
debugging tooling:
|
||||
- Local VM devshell (`nix develop`) has only lintTools (no pytest). So pytest does NOT run here.
|
||||
- pytest 8.3.3 + playwright live in the host `pyEnv` (nix/modules/harness.nix) exposed as
|
||||
`cc-ci-run` on cc-ci. `cc-ci-run -m pytest <path> -q` works as the real harness interpreter
|
||||
(verified: `cc-ci-run -c "import pytest" -> 8.3.3`).
|
||||
- `.drone.yml` lint stage runs `nix develop .#lint --command bash scripts/lint.sh`.
|
||||
- COLD M1 PLAN: fresh `git clone`/checkout of `restructure/concurrency` into a throwaway dir ON
|
||||
cc-ci → `cc-ci-run -m pytest tests/unit -q` + `cc-ci-run -m pytest tests/concurrency -q` +
|
||||
`nix develop .#lint --command bash scripts/lint.sh`, all from that clean checkout (not the
|
||||
Builder's working tree). Then adversarial diff review per my baseline hit-list.
|
||||
- Baseline `.drone.yml` on main is still the pre-restructure version (concurrency.limit=2,
|
||||
acquire_recipe_lock / /run/cc-ci-active registry referenced) — confirms P1/P4 edits are
|
||||
branch-only so far. Good.
|
||||
|
||||
## 2026-06-10T04:23Z — early pre-review of P1+P2 (branch @b302f3a, NO gate claimed — NOT a verdict)
|
||||
|
||||
Builder has pushed P1 (b492f99) + P2 (b302f3a) to restructure/concurrency; P3/P4/P5/tests still
|
||||
pending, so M1 is not claimable and this is NOT a PASS — it's pre-review to front-load the M1 diff
|
||||
audit and avoid re-doing it under gate time pressure. Read code/diff + git only; did NOT read
|
||||
JOURNAL (anti-anchoring intact). I actively tried to break the following and each concern was
|
||||
REFUTED:
|
||||
|
||||
1. **Green-on-red via the .drone.yml EXIT trap** (my lead hypothesis). The wrapper is
|
||||
`setsid cc-ci-run … & PID=$!; trap 'kill -TERM -- -$PID' TERM EXIT; wait $PID`. I worried the
|
||||
EXIT trap's final `kill` status would override the harness exit code and mask a failing run.
|
||||
EMPIRICALLY TESTED (4 bash repros incl. failing harness with a lingering group member that
|
||||
makes kill succeed=0): bash PRESERVES the pre-trap exit status when the EXIT trap doesn't call
|
||||
`exit`. Exit code propagates correctly in all cases (RED stays RED, GREEN stays GREEN). Refuted.
|
||||
2. **P2 unlink/reacquire inode race** (janitor unlinks a reaped orphan's lockfile while a new run
|
||||
blocks on the old inode). Handled: both acquire_app_lock and _probe_and_reap recheck
|
||||
`fstat(fd).st_ino == stat(path).st_ino` after acquiring and retry/bail on mismatch — a lock on
|
||||
an unlinked (anonymous) inode is never treated as authoritative, and the path's lockfile is
|
||||
never unlinked out from under a newer run. Refuted.
|
||||
3. **Half-reaped/new-app coexistence.** Reap runs WHILE HOLDING the probe lock; a new same-domain
|
||||
run blocks in acquire_app_lock until reap completes. The pre-deploy window (lock held, app not
|
||||
yet created) is covered: the stale-lockfile sweep sees the held lock (BlockingIOError) and
|
||||
leaves it. Refuted.
|
||||
4. **Signal mid-normal-teardown aborting cleanup.** begin_teardown() is the FIRST line of BOTH
|
||||
finally blocks (run_recipe_ci.py:663 run_quick, :1134 main); the _funnel_handler swallows
|
||||
(logs+returns) any SIGTERM/SIGALRM once tearing_down is set, so a second signal can't abort the
|
||||
cleanup the first asked for. install_lifetime_guards() is the FIRST statement of main() (:829),
|
||||
before any abra/lock call, with prctl→ppid==1 recheck in the correct order. Refuted.
|
||||
|
||||
Open items to confirm AT M1 (cold, full suite) — NOT defects, just unverified-until-then:
|
||||
- `datetime` import removed from lifecycle.py along with _stack_age_seconds — grep for any
|
||||
remaining datetime use (ruff would catch an undefined name; confirm import truly orphaned).
|
||||
- `_stack_name` / age-fallback deadcode after the janitor rewrite — confirm no dangling refs.
|
||||
- Registry-symbol deletion is only PARTIAL on this commit: acquire_recipe_lock still present
|
||||
(P3 deletes it); register/unregister/_run_owner_state/ACTIVE_RUN_DIR/CCCI_JANITOR_MAX_AGE are
|
||||
gone — full dangling-ref grep belongs at M1 once P3 lands.
|
||||
- setsid-fork edge: if `setsid` ever forks (only when it's a pgrp leader; not the case for a
|
||||
backgrounded job in a non-job-control drone shell), $PID would be the intermediate and the
|
||||
harness would reparent to ppid==1 and self-abort. Live-verify the trap+cancel path at M2(a).
|
||||
- begin_teardown is process-global module state (lifetime._state) — fine for one harness process;
|
||||
the tests/concurrency suite must not import-share it across in-process cases (verify at M1).
|
||||
|
||||
## 2026-06-10T04:32Z — pre-review P3+P4 (branch @91d3cc7, NO gate claimed — NOT a verdict)
|
||||
|
||||
Builder pushed P3 (17ebdf3 per-run ABRA_DIR) + P4 (91d3cc7 config cleanup). tests/concurrency +
|
||||
P5 docs still pending, so M1 still not claimable. Continued the front-loaded diff audit (code/git
|
||||
only; JOURNAL still unread). Findings — all CLEAN:
|
||||
|
||||
- **Dangling-ref grep across runner/bridge/dashboard/nix = ZERO hits** for all 9 deleted symbols:
|
||||
acquire_recipe_lock, register_run_app, unregister_run_app, _run_owner_state, ACTIVE_RUN_DIR,
|
||||
CCCI_JANITOR_MAX_AGE, RECIPE_LOCK_DIR, _stack_age_seconds, _registry_path. The orphaned
|
||||
`datetime` import is also gone from lifecycle.py. Clean deletion.
|
||||
- **Path centralization**: all `~/.abra/recipes/<recipe>` literals replaced by `abra.recipe_dir()`
|
||||
(resolves `$ABRA_DIR else ~/.abra`) across abra.py (recipe_checkout, has_lightweight_version_tags,
|
||||
recipe_head_commit, recipe_versions), generic._recipe_dir, lifecycle.prepull_images,
|
||||
snapshot_recipe_tests, fetch_recipe. prepull's env_path stays canonical `~/.abra/servers/...`
|
||||
which is correct (servers/ is the shared symlink target).
|
||||
- **Ordering verified** (main(), the only structural risk): install_lifetime_guards() is the FIRST
|
||||
stmt (873); between it and setup_run_abra_dir() (891) there are ONLY env reads + a print — no
|
||||
abra call; ABRA_DIR is exported at 891 BEFORE fetch_recipe (892) and before the first path-helper
|
||||
recipe_head_commit (895). The `--quick` dispatch (run_quick, ~908) is AFTER 891, so the quick lane
|
||||
inherits the per-run ABRA_DIR too. No tree is touched before ABRA_DIR is set.
|
||||
- **Manual-run isolation**: rid=="manual" → "manual-<pid>" so two hand-runs don't share a tree.
|
||||
|
||||
Open items to confirm AT M1 (cold) — not defects:
|
||||
- setup_run_abra_dir symlink idempotency: `if not os.path.islink(link): os.symlink(...)` — if a
|
||||
NON-symlink file pre-exists at servers/catalogue (reused run dir from a crashed partial), symlink
|
||||
raises FileExistsError. Low risk (fresh run-id per Drone build) but worth a glance.
|
||||
- CCCI_SKIP_FETCH=1 now `rm -rf dest` + copytree(canonical, dest, symlinks=True) — confirm the
|
||||
--quick rollback-proof staging tests still pass (they set CCCI_SKIP_FETCH).
|
||||
- tests/{ghost,discourse}/install_steps.sh RECIPE_DIR=${ABRA_DIR:-$HOME/.abra} mechanical path fix
|
||||
— confirm it changed NO assertion/gate (guardrail: never weaken recipe-test gates). Diff-check.
|
||||
|
||||
Net: the entire P1–P4 diff has been pre-audited and is clean against my break-it hit-list. M1 cold
|
||||
run, once claimed (after tests/concurrency + P5 land), reduces to: fresh checkout on cc-ci →
|
||||
`cc-ci-run -m pytest tests/unit -q` + `cc-ci-run -m pytest tests/concurrency -q` + lint, plus a
|
||||
focused review of only the tests/concurrency suite (vs the 19 planned cases) and the P5 doc delta.
|
||||
|
||||
## M1: PASS @2026-06-10T04:38Z — implementation verified (branch restructure/concurrency @d3fe9e2)
|
||||
|
||||
Verdict formed from the plan (SSOT), the code/git, the STATUS claim's verify recipe, and my own
|
||||
COLD acceptance run — WITHOUT reading JOURNAL first (anti-anchoring honored; noting here that I had
|
||||
NOT consulted JOURNAL-conc at verdict time).
|
||||
|
||||
COLD ENVIRONMENT: fresh `git clone --branch restructure/concurrency` into /tmp/adv-m1 on cc-ci
|
||||
(NOT the Builder's tree); `git rev-parse HEAD == d3fe9e26bb0fbaedb37383539ba3973bc1c80aff` (matches
|
||||
claim), `git status` clean. Ran via the host `cc-ci-run` pyEnv (pytest 8.3.3 + playwright) and the
|
||||
pinned `.#lint` devshell.
|
||||
|
||||
ACCEPTANCE RESULTS (expected → observed):
|
||||
- `cc-ci-run -m pytest tests/unit -q` → 138 passed in 4.72s ✓ (claim: 138 passed)
|
||||
- `cc-ci-run -m pytest tests/concurrency -q` → 20 passed in 9.91s ✓ (claim: 20 passed)
|
||||
- `nix develop .#lint --command bash scripts/lint.sh` → `lint: PASS` ✓
|
||||
- `pytest tests/unit --collect-only` concurrency items → 0 ✓ (suite NOT in default gate)
|
||||
- dangling-ref grep (register_run_app, unregister_run_app, _run_owner_state, ACTIVE_RUN_DIR,
|
||||
CCCI_JANITOR_MAX_AGE, acquire_recipe_lock, RECIPE_LOCK_DIR, _stack_age_seconds) over
|
||||
*.py/*.nix/*.yml/*.sh → ZERO hits outside docs/ ✓
|
||||
|
||||
GATE-INTEGRITY (guardrails honored):
|
||||
- `RUN_APP_RE` regex unchanged (lifecycle.py:26, identical pattern); warm/canonical apps still
|
||||
never become probe candidates (test_11 asserts no lockfiles even created for warm names).
|
||||
- `services_converged()` / paused-is-settled / `backup_app()` waits: NOT in the code diff — all
|
||||
RUN_APP_RE/services_converged/paused diff hits are docs/concurrency.md prose (P5 rewrite).
|
||||
- `teardown_app` ordering untouched; only its trailing unregister call removed (registry gone).
|
||||
- Only `tests/<recipe>/` change is the mechanical `RECIPE_DIR=${ABRA_DIR:-$HOME/.abra}/...` line
|
||||
in ghost+discourse install_steps.sh — NO assertion/gate touched (diff-confirmed). Guardrail
|
||||
"never weaken recipe-test gates / touch tests/<recipe>/ content" honored.
|
||||
- P4: `concurrency.limit` block removed from .drone.yml; drone-runner.nix comment makes
|
||||
DRONE_RUNNER_CAPACITY the single knob.
|
||||
|
||||
ADVERSARIAL DIFF REVIEW (P1–P4 pre-audited in the two notes above; refuted: green-on-red exit-code
|
||||
masking [empirically tested], unlink/reacquire inode race [fstat==stat identity recheck],
|
||||
half-reaped coexistence [reap-under-probe-lock], signal-mid-teardown reentrancy [begin_teardown
|
||||
first line of both finally blocks], guard/ABRA_DIR/fetch ordering [no abra call pre-export]).
|
||||
|
||||
TEST-SUITE AUDIT vs the 19 plan cases: real kernel flocks, NEVER mocked (only teardown_app +
|
||||
abra-discovery stubbed, both disclosed). Coverage complete: cases 1–4 test_locks, 5–12
|
||||
test_janitor, 13–16 test_lifetime, 17–19 test_abra_dir, +test_18b (manual-pid isolation) = 20.
|
||||
Assertions are substantive, not tautological: exact funnel exit codes 142/143 (test_15/16),
|
||||
reap-vs-new-run timestamp ordering + fresh-inode `lock_state=="held"` (test_7), two-janitor
|
||||
arbitration via separate open()s (test_8 — valid: flock binds the open file description, so
|
||||
threads-with-distinct-fds model processes), long-held mtime-backdate flag-not-steal (test_10),
|
||||
PEP 446 fd non-inheritance with a surviving child (test_3), divergent per-run trees + canonical
|
||||
untouched (test_18).
|
||||
|
||||
INDEPENDENT PROBE (my own driver, NOT the Builder's helpers.py): drove the real
|
||||
`lifecycle.acquire_app_lock` from a standalone script with a sandbox CCCI_APP_LOCK_DIR on cc-ci →
|
||||
state `held` after acquire; a second acquirer BLOCKED while the first held (no ack2 after 1.5s);
|
||||
after `SIGKILL` of the holder the second acquired within 10s (kernel auto-release). Core invariant
|
||||
confirmed against the real code, not just the Builder's tests.
|
||||
|
||||
NON-BLOCKING NOTES (carry to M2 live-verify; none gate M1):
|
||||
- setsid-fork edge in the .drone.yml trap wrapper: if `setsid` ever forks (only when it's a pgrp
|
||||
leader — not the case for a backgrounded job in a non-job-control drone shell), $PID would be the
|
||||
intermediate and the harness could reparent (ppid==1) and self-abort. MUST be live-verified by
|
||||
the actual drone-cancel path at M2(a) — the plan already flags this ("verify drone exec runner
|
||||
signal delivery; the trap must fire on drone cancel"). Not unit-testable here.
|
||||
- End-of-janitor stale-lockfile tidy sweep (appless leftover lockfile unlink) is not directly
|
||||
covered by a named test (not one of the 19); low risk (tidiness only). Noted, not a defect.
|
||||
- test_14 (ppid race) depends on the helper reparenting to pid 1; under a subreaper it marks
|
||||
NEVER_REPARENTED and FAILS VISIBLY (never false-passes). Passed in this env.
|
||||
|
||||
CONCLUSION: M1 — implementation verified — PASS. M2 (merge to main + live verification a–d) is
|
||||
unblocked. Reminder for both loops: recipe-mirror PRs are !testme targets only — never merge/push
|
||||
them. (After this verdict I may consult JOURNAL-conc to contextualize, per §6.1.)
|
||||
|
||||
## 2026-06-10T04:49Z — M2 merge integrity pre-check (M2 NOT yet claimed — not a verdict)
|
||||
|
||||
Builder merged the branch to main (merge commit `bb5eb3d`, 2 parents 83a6c6e∘d3fe9e2, no force)
|
||||
after my M1 PASS, and is mid-M2 live verification (journal: M2(a) cancel-mid-run evidence, (b)
|
||||
parallel runs triggered). No `claim(conc): M2` commit yet; STATUS-conc still shows the stale M1
|
||||
line (Builder's file — will update at the M2 claim). Independent merge check:
|
||||
- `git diff bb5eb3d d3fe9e2 -- runner/ .drone.yml docs/concurrency.md tests/ nix/` = EMPTY → the
|
||||
merge preserved EXACTLY the code I cold-verified at M1. No conflict-resolution drift introduced.
|
||||
- `git merge-base --is-ancestor d3fe9e2 bb5eb3d` = true.
|
||||
So deployed main == M1-verified tree. At the M2 claim I therefore re-verify only LIVE behavior +
|
||||
the push build, not the code again:
|
||||
push build green; (a) cancel mid-run → no leaked python/lock, next janitor reaps the app, zero
|
||||
leakage; (b) two parallel !testme (immich#2 + plausible#3) → both green, zero leakage; (c)
|
||||
double-!testme same PR → 2nd blocks on the app lock (visible in its drone log) then runs; (d) one
|
||||
full green end-to-end run. Evidence to come from Drone build logs + cc-ci state (abra app ls /
|
||||
lslocks / docker), cold from my own access path.
|
||||
|
||||
## 2026-06-10T05:00Z — wrapper exit-code fix verified + CORRECTION to my P1 pre-review (inbox consumed)
|
||||
|
||||
Consumed ADVERSARY-INBOX.md (deleted) — Builder reported an M2 live-verify finding + fix. Folded in:
|
||||
|
||||
**The defect (real, Builder-found, build 269 plausible#3):** the drone exec step shell is `set -e`.
|
||||
On a NORMAL (green) harness exit the P1 EXIT trap still fired and its `kill -TERM -- -$PID` of the
|
||||
already-exited process group returned ESRCH (exit 1), which under `set -e` poisoned the step's exit
|
||||
status to 1 — a fully GREEN run (all tiers pass, level=4) reported RED.
|
||||
|
||||
**CORRECTION — my P1 pre-review was wrong on this point.** In my 04:23Z pre-review I claimed to have
|
||||
"empirically tested" green-on-red exit-code masking and REFUTED it. That test was run with plain
|
||||
`bash -c` WITHOUT `set -e` — the wrong shell mode. The real drone step runs `set -e`, where the bug
|
||||
manifests. I re-ran the matrix correctly now (bash -e), reproducing the bug (old wrapper + green +
|
||||
set -e → exit 1) and confirming I had the shell mode wrong. Lesson: model the EXACT runtime
|
||||
(set -e) for shell-trap behavior. The Builder caught this live; I did not. Owning it.
|
||||
NB the failure direction was false-RED (green reported red) — fail-safe-ish, not a green-on-red
|
||||
(no failing run was ever reported green); still a real defect.
|
||||
|
||||
**The fix (e1c4198 on branch, merged to main b7a009c) — independently verified by me, cold under
|
||||
`set -e` (the correct mode this time):**
|
||||
```
|
||||
setsid cc-ci-run runner/run_recipe_ci.py & PID=$!
|
||||
trap 'kill -TERM -- "-$PID" 2>/dev/null || true' TERM EXIT
|
||||
rc=0; wait "$PID" || rc=$?
|
||||
trap - TERM EXIT
|
||||
exit "$rc"
|
||||
```
|
||||
My 4-path matrix (all under `bash -e`, exact-shape repros):
|
||||
- A green harness → step exit 0 ✓ (poisoning gone: `|| true` on the trap kill + `trap - EXIT` before exit)
|
||||
- B **red harness (exit 7) → step exit 7 ✓ — NOT masked to green.** Critical false-GREEN check
|
||||
PASSES: `wait || rc=$?` captures the real rc and `exit "$rc"` propagates it. The
|
||||
"failing PR must report RED" gate is preserved by the fix.
|
||||
- C old wrapper + green + set -e → exit 1 ✓ (bug reproduced — root-cause confirmed)
|
||||
- D cancel (TERM to wrapper mid-wait) → wrapper exits 143 AND the child received TERM
|
||||
(CHILD_GOT_TERM logged) ✓ — cancel-forwarding semantics unchanged; the `trap - TERM EXIT` runs
|
||||
only AFTER `wait` returns (post-forward), so it can't disarm the forward during a real cancel.
|
||||
|
||||
Verdict on the fix: CORRECT and SAFE — resolves the false-RED poisoning without introducing
|
||||
false-GREEN, and preserves cancel forwarding. Folds cleanly into the pending M2 review.
|
||||
|
||||
**M1 status unaffected:** M1 PASS was for the code/suites/lint/diff of d3fe9e2; this wrapper
|
||||
exit-code-under-set-e is a LIVE behavior M1's checks could not exercise (the trap only runs in the
|
||||
real drone exec shell). main now = d3fe9e2 + this .drone.yml wrapper fix; the fix is verified above.
|
||||
Open for the formal M2 verdict: re-confirm lint green on the new .drone.yml (yamllint), the push
|
||||
build green, and live (a) cancel-no-leak / (b) parallel both-green / (c) double-!testme blocks /
|
||||
(d) one full green run — cold, once the Builder posts the M2 claim with evidence.
|
||||
|
||||
## M2(c): FAIL @2026-06-10T08:10Z — double-!testme same domain corrupts shared deploy-count → both runs RED + VETO
|
||||
|
||||
Proactive cold break-it probe of the live M2 evidence (M2 not yet formally `claim(conc)`'d — the
|
||||
Builder's JOURNAL shows (c) "triggered" but NOT evidenced as PASS; I went straight to the Drone API
|
||||
to verify the in-flight (c) runs independently, not to the JOURNAL narrative). I found a REAL defect
|
||||
that breaks M2(c). Filed as BACKLOG-conc CONC-A1.
|
||||
|
||||
EVIDENCE (Drone API, recipe-maintainers/cc-ci, cold via /run/secrets/bridge_drone_token — my own
|
||||
access path, not the Builder's word):
|
||||
- (c) = builds **279 + 281**, both `event=custom PR=2 RECIPE=immich REF=a92b28d…` → SAME domain
|
||||
`immi-ad3e33.ci.commoninternet.net`. Both `status=failure` (step `ci` exit_code=1).
|
||||
- 281 (the blocked run): log `== app lock: ... in flight — waiting ==` @2s → `== acquired ==` @194s,
|
||||
which is exactly when 279's process exited (279 finished 05:07:35Z). **Lock serialisation + the
|
||||
visible block line WORK** — that half of (c) is fine.
|
||||
- 279 RED: `!! deploy-count 2 != 1 (DG4.1 violation)`.
|
||||
- 281 RED: `FileNotFoundError: /tmp/ccci-deploys-immi-ad3e33….ci.commoninternet.net` at
|
||||
run_recipe_ci.py:1213.
|
||||
- Control build 275 (isolated immich, same fixed wrapper) → `deploy-count = 1`, GREEN. Confirms the
|
||||
failure is concurrency-specific, NOT a pre-existing immich/wrapper regression.
|
||||
|
||||
ROOT CAUSE (code, confirmed):
|
||||
- DG4.1 counter file is DOMAIN-keyed in shared /tmp, not per-run: `run_recipe_ci.py:930
|
||||
/tmp/ccci-deploys-<domain>`. P3 isolated ABRA_DIR per run but this per-run state file was missed
|
||||
(predates the restructure, ef44d46; the old recipe-flock serialised same-recipe runs end-to-end,
|
||||
masking it).
|
||||
- `deploy_app()` calls `_record_deploy()` (lifecycle.py:250) BEFORE `acquire_app_lock()` (:254,
|
||||
introduced by P2 b302f3a) → the increment races OUTSIDE the lock. 281's single pre-lock
|
||||
`_record_deploy` (@2s) bumps the shared counter 279 is using (→2, false violation), and 279's
|
||||
end-of-run `os.remove(countfile)` (:1215) deletes the file under 281 → FileNotFoundError.
|
||||
- Interleaving is fully reconstructed and self-consistent with the build timestamps (see CONC-A1).
|
||||
|
||||
This is squarely in M2(c) scope: the plan's DoD (c) requires the second run to "block … then RUN"
|
||||
(implicitly green), and the phase's whole premise is "two concurrent !testme don't collide on
|
||||
domain/volume/secrets." This is a domain-keyed-state collision — the restructure's narrower domain
|
||||
lock no longer covers the deploy-count file. M1 (code/suites/lint/diff of d3fe9e2) is unaffected —
|
||||
this is a live concurrency behavior M1's checks could not exercise; the tests/concurrency suite has
|
||||
the matching blind spot (case 4 serialises acquire but never asserts deploy-count isolation across
|
||||
two same-domain runs).
|
||||
|
||||
## VETO — M2 may NOT be marked DONE until CONC-A1 is fixed and I log a fresh (c) PASS
|
||||
Forbidding `## DONE` in STATUS-conc until: (1) deploy-counter keyed per-run; (2) a tests/concurrency
|
||||
case asserts same-domain deploy-count isolation; (3) live (c) re-run shows BOTH builds GREEN with
|
||||
the visible block line and zero leakage; (4) (a),(b),(d) re-confirmed unaffected. Only I clear this.
|
||||
(After this verdict I may consult JOURNAL-conc to contextualise — noting I had NOT read the (c)
|
||||
journal reasoning before forming this FAIL; I verified from the Drone API + code directly.)
|
||||
|
||||
## 2026-06-10T08:20Z — CONC-A1 fix CODE-verified (veto conditions 1+2 met; 3+4 still pending — NOT cleared)
|
||||
|
||||
Builder fixed CONC-A1 (b6e12ef, merged main 139e319) and is re-running M2 live (a)–(d). I
|
||||
cold-verified the FIX CODE from my own clone + a fresh checkout on cc-ci (not the Builder's word):
|
||||
|
||||
- **Condition (1) per-run keying — MET.** `run_recipe_ci._run_state_path(name)` keys all four
|
||||
run-scoped state files (`deploys`, `opstate`, `deps`, `depskip`) by `run_id()` + `os.getpid()`,
|
||||
never domain. Grep: ZERO residual `ccci-<state>-{domain}` literals in prod code (only the
|
||||
app-LOCK path stays domain-keyed, which is correct). All consumers env-read `CCCI_*_FILE`
|
||||
(lifecycle:148, deps:72/155, generic:134) — no path re-derivation. Uniqueness holds even in the
|
||||
manual fallback (`run_id()`→domain) because the `+pid` suffix separates two processes.
|
||||
- **Condition (2) same-domain isolation test — MET, and proven non-tautological.**
|
||||
tests/concurrency/test_run_state.py adds test_20/20b/20c. test_20c drives REAL processes + the
|
||||
REAL lock + real `_run_state_path`/`_record_deploy`, reproducing the 279/281 interleaving: run A
|
||||
reads `COUNT 1` (NOT polluted to 2 by B's pre-lock increment) and B's file survives A's remove
|
||||
(no FileNotFoundError). **Mutation check (my own):** reverting `_run_state_path` to domain-keying
|
||||
in a throwaway cc-ci clone → all 3 test_run_state cases FAIL (incl. test_20c). So the test
|
||||
genuinely guards the fix.
|
||||
- **Suites cold (fresh clone @4f6c955 on cc-ci):** unit 138 passed, concurrency 23 passed (was 20),
|
||||
concurrency still NOT collected by the default `pytest tests/unit` run (0). lint not re-run here
|
||||
(no .drone.yml/nix change in the fix; will confirm at the M2 claim).
|
||||
|
||||
**VETO NOT cleared.** Conditions (3) live (c) re-run BOTH builds GREEN + visible block line + zero
|
||||
leakage, and (4) (a)/(b)/(d) re-confirmed on the fixed harness, still require the Builder's live
|
||||
evidence (in flight). The code fix strongly predicts a (c) pass but M2 is a LIVE gate — I will
|
||||
re-verify the (c) double-!testme cold from the Drone API once the Builder posts the M2 claim, and
|
||||
only then clear the veto.
|
||||
|
||||
## 2026-06-10T08:43Z — live (c) round-2 (builds 290+291): serialization CONFIRMED via lslocks; delay is an immich-ML flake, NOT the restructure (not a verdict)
|
||||
|
||||
(b)+(d) re-passed on the fixed harness (builds 287 immich#2 + 288 plausible#3, parallel, both
|
||||
success — I'll re-confirm at the M2 claim). (c) round 2 = builds 290+291 (both custom PR=2 immich,
|
||||
same domain immi-ad3e33), started 08:22:30Z. I inspected the LIVE host state cold (my own ssh):
|
||||
|
||||
- **CORE INVARIANT DIRECTLY OBSERVED in the kernel lock table** — strongest possible proof of the
|
||||
double-!testme serialization:
|
||||
`lslocks`: pid 739163 (build 290) holds `WRITE` on cc-ci-app-immi-ad3e33….lock; pid 739341
|
||||
(build 291) is blocked `WRITE*` on the SAME lock. Exactly one holder, one waiter, one inode.
|
||||
- 290 (holder) is sleeping in `services_converged()` poll (hrtimer_nanosleep, no abra child) because
|
||||
`immich-machine-learning` is stuck 0/1: its container repeatedly fails the healthcheck
|
||||
(`non-zero exit (143): dockerexec: unhealthy container`, swarm restarting every 1–6 min). Current
|
||||
attempt (08:43) has gunicorn up, health `starting` — slow/flaky ML readiness, not a deploy break.
|
||||
- NOT caused by the restructure / teardown: 290's immich volumes (model-cache/postgres/uploads) +
|
||||
.env are all from 290's OWN fresh deploy (08:23), not inherited from the earlier same-domain run
|
||||
287. ML image present (1.36GB, no pull), host healthy (5.2Gi mem free, 65G disk). So this is an
|
||||
immich-ML healthcheck flake, orthogonal to concurrency.
|
||||
|
||||
Bearing on M2(c): the SERIALIZATION mechanism under test is verified working live. The "both GREEN"
|
||||
half of condition (3) is not yet demonstrated only because 290 is flake-blocked on immich-ML; if 290
|
||||
REDs on deploy-timeout, (c) needs a clean re-run (flake, not a code fault). VETO unchanged — I still
|
||||
require one clean (c) where both same-domain builds go GREEN with the block line + zero leakage.
|
||||
Continuing to watch 290/291 to terminal.
|
||||
|
||||
## M2(c): PASS @2026-06-10T09:05Z — double-!testme same domain, CONC-A1 fixed; VETO LIFTED
|
||||
|
||||
(c) round-2 builds 290+291 (both `custom PR=2 immich`, same domain immi-ad3e33, on CONC-A1-fixed
|
||||
main) both reached terminal **status=success**. Cold-verified from the Drone API + live host (my own
|
||||
access path), not the Builder's word:
|
||||
|
||||
- **Both GREEN:** 290 success, 291 success (Drone API).
|
||||
- **Visible block line (the (c) requirement):** 291 log —
|
||||
`== app lock: another run of immi-ad3e33….ci.commoninternet.net is in flight — waiting ==`
|
||||
then `== app lock: acquired … ==`. I ALSO observed the serialization directly in the kernel lock
|
||||
table mid-run (lslocks: 290 held WRITE, 291 blocked WRITE* on the same inode; after 290 exited,
|
||||
291 held it). Strongest possible proof of the double-!testme serialization invariant.
|
||||
- **CONC-A1 regression GONE — the two exact round-1 failure points are now clean:**
|
||||
- 290 (round-1 build 279 got false `deploy-count 2 != 1`) → now `deploy-count = 1 (expect 1)`,
|
||||
all 5 tiers pass, level=4. Its run-keyed counter was NOT polluted by 291's concurrent pre-lock
|
||||
`_record_deploy`.
|
||||
- 291 (round-1 build 281 crashed `FileNotFoundError` at run_recipe_ci.py:1213) → now
|
||||
`deploy-count = 1 (expect 1)`, all tiers pass, level=4, no traceback. Its own run-keyed countfile
|
||||
survived 290's end-of-run remove.
|
||||
- **Zero leakage after both:** 0 harness procs, 0 immich apps / services / volumes / secrets, no held
|
||||
cc-ci locks. One unheld 0-byte leftover lockfile (mtime 08:46, 291's acquisition touch) — reaped
|
||||
on sight by the next janitor probe, harmless by design.
|
||||
- The ~20-min runtime each was an immich-machine-learning healthcheck slowness/flake (ML eventually
|
||||
converged), NOT the restructure — already diagnosed in the 08:43Z note; serialization + isolation
|
||||
both verified correct regardless.
|
||||
|
||||
**VETO LIFTED.** The CONC-A1 veto ("no DONE until CONC-A1 fixed + a fresh (c) PASS") is cleared:
|
||||
conditions (1) per-run keying [code + mutation-proven], (2) same-domain isolation test
|
||||
[non-tautological], and (3) live (c) both-GREEN + block line + zero leakage are ALL met. CONC-A1
|
||||
closed in BACKLOG-conc.
|
||||
|
||||
**Still required before DONE (full M2 gate, not the CONC-A1 veto):** the Builder must post the formal
|
||||
M2 claim in STATUS-conc with consolidated evidence, and I re-confirm condition (4) — specifically
|
||||
**M2(a) cancel-mid-run re-run on the CONC-A1-fixed harness** (b+d already re-confirmed: builds
|
||||
287+288 parallel both success on fixed main; a's only prior evidence (build 267) was on the
|
||||
pre-CONC-A1, pre-wrapper-fix harness) — plus the push build green on current main. (a) re-run had
|
||||
not yet appeared in Drone as of this verdict (Builder sequenced it after (c)). I will verify it cold
|
||||
when it lands.
|
||||
|
||||
## M2: PASS @2026-06-10T08:55Z — merged + live-verified (a)–(d) on final main 139e319/74ed240
|
||||
|
||||
Formal M2 gate verdict against the Builder's M2 claim (STATUS-conc, commit 74ed240). Formed from
|
||||
the plan (SSOT), the code/git, the claim's verify recipe, and my OWN cold re-runs from my own clone
|
||||
+ fresh checkouts/Drone-API on cc-ci — not the Builder's narrative. All seven claim items confirmed:
|
||||
|
||||
1. **Merge integrity** — `git diff 139e319 b6e12ef -- runner/ tests/ docs/ .drone.yml nix/` = 0 lines;
|
||||
`b6e12ef ⊆ 139e319`; merge parents `2173894 ∘ b6e12ef`. So deployed main code == the CONC-A1 tree
|
||||
I code-verified + mutation-proofed. No force-push (history linear). NB the claim mis-states the
|
||||
first parent as `4ad55ed` (actual `2173894`, my M2(c)-FAIL commit) — immaterial: that's a state-
|
||||
file commit, and the code-diff-empty check is authoritative.
|
||||
2. **Push build green** — Drone push builds 283–298 on main all `status=success`; no red push since
|
||||
the merge.
|
||||
3. **Suites + lint (cold, fresh clone on cc-ci)** — unit 138 passed, concurrency 23 passed
|
||||
(concurrency NOT in the default unit gate), `lint: PASS` on final main 74ed240. test_run_state
|
||||
mutation-proofed (reverting to domain-keying fails all 3 cases).
|
||||
4. **(a) cancel-mid-run on fixed harness** — build 295 (custom immich#2): lockfile mtime 08:50:17
|
||||
proves it acquired the app lock 7s in → canceled @08:51:05 MID-DEPLOY. After cancel (verified cold
|
||||
~1 min later): 0 harness procs (no leaked python — old §8.1 gap stays closed), no held locks (lock
|
||||
released), no immich app/.env/containers(even stopped)/services/volumes/secrets → ZERO leakage,
|
||||
full teardown. Killed-step logs not API-retrievable (Drone truncates), but the end-state is the
|
||||
actual test and it is clean.
|
||||
5. **(b) parallel runs** — builds 287 (immich#2) + 288 (plausible#3), parallel, both
|
||||
`status=success`, both `deploy-count = 1 (expect 1)`, level=4; host after = zero leakage.
|
||||
6. **(c) double-!testme same PR** — builds 290 + 291 (same immich domain): both success, 291 logged
|
||||
the block line then `acquired`, both `deploy-count = 1`, zero leakage. Serialization also observed
|
||||
directly in the kernel lock table mid-run (lslocks). Covered in detail by my M2(c) PASS @09:05Z.
|
||||
7. **(d) full green e2e** — build 287 (and 290): complete immich run, all 5 tiers pass, level=4.
|
||||
|
||||
Both M2-found fixes are folded in and independently verified: wrapper exit-code-under-set-e
|
||||
(e1c4198/b7a009c, my 05:00Z note — red still propagates) and CONC-A1 run-keyed state files
|
||||
(b6e12ef/139e319, my 09:05Z M2(c) PASS + mutation proof). The ~20-min (c) runtimes were an
|
||||
immich-ML healthcheck flake (converged within DEPLOY_TIMEOUT=1500s), orthogonal to the restructure
|
||||
(diagnosed 08:43Z). Unheld 0-byte leftover lockfiles are by-design (next-janitor tidy-sweep).
|
||||
|
||||
GUARDRAILS honored end-to-end: recipe-mirror PRs (immich#2, plausible#3) used as !testme targets
|
||||
only, never merged/pushed; cc-ci main touched only by the gated merges (no force-push); no secrets in
|
||||
any commit. RUN_APP_RE / services_converged / warm-canonical flows untouched (M1 diff review).
|
||||
|
||||
CONCLUSION: **M2 — merged + live-verified — PASS.** M1 PASS (04:38Z) + M2 PASS (here) are both fresh
|
||||
in REVIEW-conc; no open VETO (CONC-A1 lifted). Per the phase DoD the Builder may now write `## DONE`
|
||||
to STATUS-conc. (Post-verdict I may consult JOURNAL-conc to contextualize; I had NOT read its M2
|
||||
reasoning before forming this verdict — verified from plan + code/git + Drone API + my own cold runs.)
|
||||
101
REVIEW-rcust.md
Normal file
101
REVIEW-rcust.md
Normal file
@ -0,0 +1,101 @@
|
||||
# REVIEW-rcust.md — Adversary ledger for the recipe-customization restructure phase
|
||||
|
||||
SSOT for this phase: `/srv/cc-ci/cc-ci-plan/recipe-custom-restructure-full-plan.md`.
|
||||
Gates: **M1** (implementation verified — branch `restructure/recipe-custom`, unit+concurrency+lint
|
||||
green on cold clone, resolved-customization diff clean for all 21 recipes, adversarial diff review)
|
||||
and **M2** (merged + real-CI regression sweep matching baseline matrix). DONE requires fresh PASS
|
||||
for both with no open VETO.
|
||||
|
||||
I own this file and the `## Adversary findings` section of BACKLOG-rcust.md only.
|
||||
|
||||
---
|
||||
|
||||
## Standing watch items (what I will hunt at M1/M2)
|
||||
|
||||
- **Coverage loss** (cardinal risk): for every migrated recipe, old loaders' effective customization
|
||||
values must equal new `meta.load()` values. Throwaway diff script over all 21 recipe dirs; any
|
||||
delta = finding.
|
||||
- **Assertion weakening** in `tests/<recipe>/` diffs — migrations must be mechanical only (signatures,
|
||||
fixture/key renames, underscore prefixes). Any changed assert/expected value = VETO.
|
||||
- **Deleted-code fallout** — dangling refs to `_recipe_meta`, `_load_meta`, `_recipe_extra_env`,
|
||||
`_recipe_meta_flag`, `declared_deps`, `is_canonical_enrolled`, `OIDC_AT_INSTALL`,
|
||||
`CHAOS_BASE_DEPLOY`, `SKIP_GENERIC`, `setup_custom_tests`, `deps_apps`, `deps_creds`, `deployed_app`.
|
||||
- **Validation gaps** — typo'd key / wrong type / callable-on-data-key must raise MetaError, not pass.
|
||||
- **R2 fixed end-to-end** — orchestrator load path delivers SCREENSHOT to screenshot.py.
|
||||
- **HC2 / F2-11 integrity** — repo-local default-deny, requires_deps skip-report, generic floor
|
||||
semantics all unchanged.
|
||||
|
||||
---
|
||||
|
||||
## Verdicts
|
||||
|
||||
_(no GATE verdict yet — M1 is not claimed. M1 only claims after P1–P6 are all on the branch;
|
||||
Builder has landed P1 (472a68b) + P2 (8cd72fd) and is mid-P3. The interim pre-review below is
|
||||
front-loaded break-it work on the FROZEN P1/P2 commits — NOT an M1 PASS.)_
|
||||
|
||||
### Interim pre-review of frozen P1+P2 (branch @ 8cd72fd) — @2026-06-10, cold from upstream clone
|
||||
|
||||
Done as idle-time break-it work while no gate is pending. P1/P2 phase commits won't be rewritten
|
||||
(Builder adds P3+ on top), so reviewing them now is non-wasted and front-loads M1. Cold clone of
|
||||
`origin/restructure/recipe-custom` into `/tmp/rcust-verify` from the true upstream remote.
|
||||
|
||||
**No defects found so far.** Results:
|
||||
|
||||
1. **Deleted-code fallout — CLEAN.** Grepped `runner/ tests/ scripts/` for live refs to every deleted
|
||||
symbol (`_recipe_meta`, `_load_meta`, `_recipe_extra_env`, `_recipe_meta_flag`, `declared_deps`,
|
||||
`is_canonical_enrolled`, `OIDC_AT_INSTALL`, `CHAOS_BASE_DEPLOY`, `SKIP_GENERIC`,
|
||||
`setup_custom_tests`, `deps_apps`, `deps_creds`, `deployed_app`). All hits are comments/docstrings
|
||||
explaining the deletion, test names, or the intentionally-RETAINED `CCCI_SKIP_GENERIC*` env form
|
||||
(kept per P2c). Zero live call-sites. `setup_custom_tests.sh` files gone.
|
||||
2. **All-recipes-load-clean (typo gate) — PASS, independently.** Ran `meta.load()` (pure stdlib) over
|
||||
all 21 recipe dirs cold via plain python3 (did NOT trust the Builder's test_meta.py). All 21 load;
|
||||
non-default key sets sane. Every ALL-CAPS key used in any recipe_meta.py is in the 14-key registry.
|
||||
3. **Coverage-loss diff (CARDINAL check) — ZERO deltas on data keys + hook presence.** Throwaway
|
||||
harness (`/tmp/diff_meta.py`) reproduces main's six-loader effective resolution (`_load_meta`,
|
||||
`declared_deps`, `is_enrolled`, `_recipe_extra_env`) from MAIN's recipe_meta files and diffs vs the
|
||||
BRANCH's `meta.load()` for all 21 recipes. After correcting one harness artifact (EXTRA_ENV default
|
||||
is `{}` not None), **0/21 recipes show any delta** for HEALTH_PATH/HEALTH_OK/DEPLOY_TIMEOUT/
|
||||
HTTP_TIMEOUT/BACKUP_CAPABLE/EXPECTED_NA/UPGRADE_BASE_VERSION/DEPS/WARM_CANONICAL + presence of
|
||||
READY_PROBE/BACKUP_VERIFY/UPGRADE_EXTRA_ENV/EXTRA_ENV/SCREENSHOT.
|
||||
4. **Validation gaps — CLOSED.** Crafted tmp recipe_metas: typo'd key → MetaError (with "did you mean
|
||||
DEPLOY_TIMEOUT?"); wrong type (`DEPLOY_TIMEOUT="str"`) → MetaError; callable on data key
|
||||
(`DEPLOY_TIMEOUT=lambda ctx:...`) → MetaError; `_PRIVATE`/lowercase-helper → loads clean (exemption
|
||||
works). All four behave per the locked decision.
|
||||
5. **meta.py read** — single `exec()`, frozen `RecipeMeta` generated from `KEYS`, `_coerce` rejects
|
||||
bool-as-int and callable-on-data-key; `non_default` compares vs registry default. No issues.
|
||||
|
||||
**Still UNVERIFIED for M1 (do NOT treat above as M1 PASS):** full `pytest tests/unit -q` +
|
||||
`pytest tests/concurrency -q` + `scripts/lint.sh` cold on the cc-ci host; R2 end-to-end through the
|
||||
real orchestrator screenshot path; P3 ctx-hook signature migration (assert byte-identical, legacy
|
||||
`lambda domain:` raises clear MetaError); P4/P5/P6; re-run the coverage diff on the FINAL branch
|
||||
(P3 changes hook signatures); recipe-test diffs are mechanical-only (no assertion weakening);
|
||||
HC2/F2-11/generic-floor integrity. These wait for the `claim(rcust): M1`.
|
||||
|
||||
### Interim pre-review of frozen P3 (branch @ fd02d9f) — @2026-06-10, cold from upstream clone
|
||||
|
||||
Builder landed P3 (uniform ctx hook convention) and moved to P4, so P3 is frozen. Pre-reviewed it.
|
||||
**No defects found.**
|
||||
|
||||
1. **Mechanical-migration discipline — HELD (no VETO trigger).** `git diff 8cd72fd..fd02d9f` over
|
||||
`tests/*/` shows ZERO changed assert/expected literals. Every hook change is purely
|
||||
`def HOOK(domain[, meta])` → `def HOOK(ctx)` + `domain` → `ctx.domain` in the body. Spot-checked
|
||||
cryptpad/mumble/ghost/lasuite-drive recipe_meta.py + lasuite-drive ops.py: seeded values, return
|
||||
dicts, paths, status codes, and the `pre_restore` `assert _psql(...) in (...)` are byte-identical
|
||||
apart from the `ctx.` deref.
|
||||
2. **HookCtx — present + complete.** `meta.HookCtx` frozen dataclass has all 5 documented fields
|
||||
(`.domain`, `.base_url`, `.meta`, `.deps`, `.op`); `meta.hook_ctx(domain, meta, op=…)` factory
|
||||
builds it and pulls `deps` from `$CCCI_DEPS_FILE`. All call sites migrated: run_recipe_ci
|
||||
`pre_<op>`, BACKUP_VERIFY; lifecycle `extra_env` + READY_PROBE; screenshot `SCREENSHOT(page, ctx)`.
|
||||
(NB my first pass falsely flagged "no HookCtx" — that was a STALE WORKTREE at P2; corrected by
|
||||
checking out fd02d9f. Logged here for honesty.)
|
||||
3. **Legacy-signature guard (P3.4) — PRESENT + works, live-probed.** `meta.check_hook_signature`
|
||||
exact-matches positional params and raises a CLEAR MetaError naming the P3 migration + HookCtx
|
||||
fields. Wired into both `load()` (recipe_meta hooks; SCREENSHOT expects `(page, ctx)`, rest
|
||||
`(ctx)`) and the orchestrator (ops.py `pre_<op>`). Crafted tmp metas: legacy `READY_PROBE(domain)`,
|
||||
`SCREENSHOT(page, domain, meta)`, `EXTRA_ENV(domain)` all → MetaError at load; `READY_PROBE(ctx)`
|
||||
loads clean. No silent mid-run TypeError path.
|
||||
4. **Coverage diff re-run at P3 head — still 0/21 deltas** (hook presence + all data keys unchanged).
|
||||
|
||||
Net: P1+P2+P3 all clean under cold adversarial probing. M1 still gated on full unit+concurrency+lint
|
||||
on the cc-ci host, P4–P6, R2 end-to-end via the real screenshot orchestrator path, and a final
|
||||
coverage re-diff. No findings filed; no VETO.
|
||||
62
STATUS-conc.md
Normal file
62
STATUS-conc.md
Normal file
@ -0,0 +1,62 @@
|
||||
# STATUS — sub-phase conc (concurrency restructure)
|
||||
|
||||
Plan: /srv/cc-ci/cc-ci-plan/concurrency-restructure-full-plan.md (SSOT for this phase)
|
||||
|
||||
## DONE
|
||||
|
||||
Both gates Adversary-verified fresh in REVIEW-conc.md, no open VETO:
|
||||
- M1 — implementation verified: PASS @2026-06-10T04:38Z (branch @d3fe9e2)
|
||||
- M2 — merged + live-verified (a)–(d): PASS @2026-06-10T08:55Z (final main 139e319/74ed240)
|
||||
- CONC-A1 (M2(c) live finding): fixed b6e12ef, veto LIFTED + closed @09:05Z
|
||||
|
||||
## Phase state
|
||||
|
||||
- Phase: conc — concurrency restructure (P1–P5 + tests/concurrency) — COMPLETE
|
||||
- Merged to main: bb5eb3d (restructure) + b7a009c (wrapper exit-code fix) + 139e319 (CONC-A1 fix)
|
||||
- Correction per M2 verdict: 139e319's first parent is 2173894 (not 4ad55ed as the claim said);
|
||||
immaterial — the code-diff-empty check (139e319 vs b6e12ef) is authoritative.
|
||||
|
||||
## Gate claim: M2 — merged + live-verified
|
||||
|
||||
**WHAT**: branch merged to main after M1 PASS; live verification (a)–(d) all green on the final
|
||||
main code (which includes two M2-found fixes, both already Adversary-verified: wrapper exit-code
|
||||
e1c4198/b7a009c, CONC-A1 run-keyed state files b6e12ef/139e319).
|
||||
|
||||
**WHERE**: main tip code = merge 139e319 (parents 4ad55ed ∘ b6e12ef); branch tip b6e12ef.
|
||||
All evidence builds ran post-139e319. Drone repo recipe-maintainers/cc-ci; host cc-ci.
|
||||
|
||||
**HOW + EXPECTED (cold re-check from your own access path):**
|
||||
|
||||
1. Merge integrity: `git diff 139e319 b6e12ef -- runner/ tests/ docs/ .drone.yml nix/` → EMPTY;
|
||||
no force-push anywhere (reflog linear).
|
||||
2. Push build green on main: Drone builds 283 (branch fix), 284 (merge 139e319), 285 (inbox
|
||||
commit) → all `status=success` (push events). No main push since has a red build.
|
||||
3. Suites at b6e12ef (cold clone): `cc-ci-run -m pytest tests/unit -q` → 138 passed;
|
||||
`cc-ci-run -m pytest tests/concurrency -q` → 23 passed; `nix develop .#lint --command bash
|
||||
scripts/lint.sh` → lint: PASS. (You already cold-verified these + mutation-proofed
|
||||
test_run_state per REVIEW-conc 08:4xZ entry.)
|
||||
4. **(a) cancel-mid-run, on fixed harness**: build **295** (custom immich PR=2, comment 14307
|
||||
@08:50:02Z). Canceled via `DELETE /api/repos/recipe-maintainers/cc-ci/builds/295` @08:51:05Z
|
||||
(HTTP 200) while mid-deploy (lock held by harness pid 763099, 4 immich services converging).
|
||||
EXPECTED/observed: build `status=killed`; pid 763099 gone by 08:51:15Z (SIGTERM funnel ran
|
||||
the run's own teardown); `pgrep -f run_recipe_c[i]` → none; `lslocks | grep cc-ci-app` →
|
||||
none (lock released); immi services/volumes/secrets/server-envs all 0. Zero leakage, no
|
||||
janitor needed (better than plan minimum).
|
||||
5. **(b) parallel runs**: builds **287** (immich#2) + **288** (plausible#3), both started
|
||||
08:17:40Z (parallel), both `status=success`, both logs `deploy-count = 1 (expect 1)` +
|
||||
level=4. Host after: zero harness procs / services / volumes / secrets / envs.
|
||||
6. **(c) double-!testme same PR**: builds **290** + **291** (both immich#2, domain immi-ad3e33).
|
||||
291 log line 1: `== app lock: another run of immi-ad3e33... is in flight — waiting ==`,
|
||||
`acquired` @+1411s = exactly 290's exit (08:46:05Z). BOTH `status=success`, both
|
||||
`deploy-count = 1`, level=4. Zero leakage after. (Your M2(c) PASS @09:05Z already covers
|
||||
this; kernel-lock-table observation yours.)
|
||||
7. **(d) full green run**: build **287** = complete immich e2e on final harness, all 5 tiers
|
||||
pass, level=4 (288 plausible likewise).
|
||||
|
||||
**Notes for verification**: builds 290/291 ran ~20 min each due to an immich-ML healthcheck
|
||||
flake (your 08:43Z note) — converged within DEPLOY_TIMEOUT=1500s; unrelated to the restructure.
|
||||
Unheld 0-byte lockfiles left behind by design (tidy-swept at next janitor probe).
|
||||
|
||||
## Blockers
|
||||
|
||||
(none)
|
||||
38
STATUS-rcust.md
Normal file
38
STATUS-rcust.md
Normal file
@ -0,0 +1,38 @@
|
||||
# STATUS — sub-phase rcust (recipe-customization restructure)
|
||||
|
||||
Plan: /srv/cc-ci/cc-ci-plan/recipe-custom-restructure-full-plan.md (SSOT for this phase).
|
||||
Reference spec: docs/recipe-customization.md @ 76a4b6b.
|
||||
Work branch: `restructure/recipe-custom` (one commit per phase P1–P6; merged to main only after M1 PASS).
|
||||
|
||||
## Phase progress
|
||||
|
||||
- [x] P1 — single loader + key registry + migrate L1–L6 + unit tests + doc gen
|
||||
(branch commit 472a68b)
|
||||
- [x] P2 — delete legacy keys/paths: compose.ccci.yml first-class+auto-chaos; install-time deps only
|
||||
(lasuite-docs migrated, setup_custom_tests.sh gone); SKIP_GENERIC meta deleted (env dev-only +
|
||||
loud CI warning); conftest cleanup (deployed/deployed_app/app_domain gone, one `deps` fixture)
|
||||
(branch commit 8cd72fd)
|
||||
- [x] P3 — uniform ctx hook convention: HookCtx(.domain/.base_url/.meta/.deps/.op); all hooks
|
||||
take ctx; legacy signatures raise MetaError at load naming the migration (branch fd02d9f)
|
||||
- [x] P4 — custom-test ergonomics: placement rule (custom under functional/+playwright/ only),
|
||||
op_state fixture, deps fixture tests (branch 29a28e2)
|
||||
- [ ] P5 — customization manifest
|
||||
- [ ] P6 — docs
|
||||
|
||||
## P1–P4 verification facts (for the eventual M1 cold-verify)
|
||||
|
||||
- WHERE: branch `restructure/recipe-custom`, P1=472a68b, P2=8cd72fd, P3=fd02d9f, P4=29a28e2.
|
||||
- HOW: `cc-ci-run -m pytest tests/unit -q` and `nix develop .#lint --command scripts/lint.sh`
|
||||
from a clean checkout of the branch.
|
||||
- EXPECTED: 184 passed; `lint: PASS`.
|
||||
- New single loader: `runner/harness/meta.py::load()`; all-recipes typo gate + R2 proof in
|
||||
`tests/unit/test_meta.py`; docs §4 table generated by `scripts/gen-meta-docs.py` (sync pinned
|
||||
by unit test).
|
||||
|
||||
## Gate
|
||||
|
||||
(none claimed yet — M1 claims only after P1–P6 complete on the branch)
|
||||
|
||||
## Current
|
||||
|
||||
P1–P4 done on the branch; starting P5 (customization manifest).
|
||||
@ -64,6 +64,8 @@ def parse_trigger(body):
|
||||
if s == f"{TRIGGER} --quick":
|
||||
return True, True
|
||||
return False, False
|
||||
|
||||
|
||||
ALLOWLIST = {u.strip() for u in os.environ.get("AUTH_ALLOWLIST", "").split(",") if u.strip()}
|
||||
|
||||
|
||||
@ -167,8 +169,12 @@ def post_commit_status(owner, repo, sha, state, target_url, description=""):
|
||||
f"{GITEA_API}/repos/{owner}/{repo}/statuses/{sha}",
|
||||
GITEA_TOKEN,
|
||||
method="POST",
|
||||
data={"state": state, "target_url": target_url,
|
||||
"description": description, "context": "cc-ci/testme"},
|
||||
data={
|
||||
"state": state,
|
||||
"target_url": target_url,
|
||||
"description": description,
|
||||
"context": "cc-ci/testme",
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@ -217,7 +223,9 @@ def result_comment_body(recipe, sha, num, run_url, status):
|
||||
if artifact_available(badge_url):
|
||||
body += f"\n\n[]({run_url})"
|
||||
return f"{body}\n\n{links}"
|
||||
return f"{header} → {run_url}\n\n_(summary card unavailable — see the run for details.)_ {links}"
|
||||
return (
|
||||
f"{header} → {run_url}\n\n_(summary card unavailable — see the run for details.)_ {links}"
|
||||
)
|
||||
|
||||
|
||||
def watch_and_reflect(owner, name, number, num, recipe, sha, comment_id, run_url):
|
||||
@ -287,15 +295,11 @@ def process_testme(full_name, owner, name, number, user, comment_id, source, qui
|
||||
run_url = f"{DRONE_URL}/{CI_REPO}/{num}"
|
||||
post_commit_status(owner, name, head["sha"], "pending", run_url, "cc-ci run in progress")
|
||||
mode = " **(--quick: lower-confidence fast lane; does not gate merge)**" if quick else ""
|
||||
# R2/U3: one comment per PR, updated in place. Reuse the existing marked comment if present
|
||||
# (re-`!testme` refreshes it back to the ⏳ placeholder), else post a new one.
|
||||
# One NEW comment PER `!testme` (operator preference 2026-06-02): post a fresh ⏳ placeholder each
|
||||
# run so every re-`!testme` is visible in the PR timeline; watch_and_reflect then edits THIS
|
||||
# comment to its result. (Previously a single marked comment was reused/edited in place.)
|
||||
start_body = start_comment_body(name, head["sha"], run_url, mode)
|
||||
existing = find_existing_comment(full_name, number)
|
||||
if existing:
|
||||
edit_comment(owner, name, existing, start_body)
|
||||
cid = existing
|
||||
else:
|
||||
cid = post_comment(owner, name, number, start_body)
|
||||
cid = post_comment(owner, name, number, start_body)
|
||||
log(
|
||||
f"[{source}] triggered build {num} for {name}@{head['sha'][:8]} "
|
||||
f"(PR #{number}, comment {comment_id}) by {user}"
|
||||
|
||||
@ -66,8 +66,13 @@ _COLORS = {
|
||||
# Level → colour ramp, kept in sync with runner/harness/card.py LEVEL_COLOR (the dashboard is a
|
||||
# standalone stdlib service that doesn't import the runner harness, so the small map is duplicated).
|
||||
_LEVEL_COLOR = {
|
||||
0: "#e5534b", 1: "#e0823d", 2: "#e0823d", 3: "#d9b343",
|
||||
4: "#a0b93f", 5: "#57ab5a", 6: "#3fb950",
|
||||
0: "#e5534b",
|
||||
1: "#e0823d",
|
||||
2: "#e0823d",
|
||||
3: "#d9b343",
|
||||
4: "#a0b93f",
|
||||
5: "#57ab5a",
|
||||
6: "#3fb950",
|
||||
}
|
||||
|
||||
|
||||
@ -269,7 +274,11 @@ def _card(r):
|
||||
f'<a class="shot" href="{run_url}" title="open run">'
|
||||
f'<span class="ph">no screenshot</span>{_level_pill(r["level"])}</a>'
|
||||
)
|
||||
cap = f'<div class="cap">{html.escape(r["level_cap_reason"])}</div>' if r["level_cap_reason"] else ""
|
||||
cap = (
|
||||
f'<div class="cap">{html.escape(r["level_cap_reason"])}</div>'
|
||||
if r["level_cap_reason"]
|
||||
else ""
|
||||
)
|
||||
return (
|
||||
f'<div class="card">{shot}<div class="body">'
|
||||
f'<div class="name">{html.escape(r["recipe"])}</div>'
|
||||
@ -307,7 +316,11 @@ def render_history(recipe, rows):
|
||||
trs = []
|
||||
for r in rows:
|
||||
color = _COLORS.get(r["status"], "#8b949e")
|
||||
lvl = "—" if r["level"] is None else f'<b style="color:{level_color(r["level"])}">L{int(r["level"])}</b>'
|
||||
lvl = (
|
||||
"—"
|
||||
if r["level"] is None
|
||||
else f'<b style="color:{level_color(r["level"])}">L{int(r["level"])}</b>'
|
||||
)
|
||||
shot = f'<a href="/runs/{r["number"]}/summary.png">card</a>' if r["has_screenshot"] else "—"
|
||||
trs.append(
|
||||
f'<tr><td><a href="{html.escape(r["url"])}">#{r["number"]}</a></td>'
|
||||
@ -317,7 +330,7 @@ def render_history(recipe, rows):
|
||||
)
|
||||
body = "\n".join(trs) or '<tr><td colspan="6">no runs for this recipe yet</td></tr>'
|
||||
inner = (
|
||||
f'<h1>{_FLOWER} {html.escape(recipe)} — run history</h1>'
|
||||
f"<h1>{_FLOWER} {html.escape(recipe)} — run history</h1>"
|
||||
'<p class="sub"><a href="/">← all recipes</a> · every <code>!testme</code> run, newest first.</p>'
|
||||
"<table><thead><tr><th>Run</th><th>Status</th><th>Level</th><th>Version</th>"
|
||||
"<th>When</th><th>Card</th></tr></thead><tbody>"
|
||||
|
||||
236
docs/concurrency.md
Normal file
236
docs/concurrency.md
Normal file
@ -0,0 +1,236 @@
|
||||
# Concurrency: how parallel recipe CI runs stay safe
|
||||
|
||||
Spec of the concurrent-run system after the 2026-06-10 restructure (branch
|
||||
`restructure/concurrency`; plan: cc-ci-plan `concurrency-restructure-full-plan.md`). The previous
|
||||
registry + per-recipe-flock model is documented in this file's git history (`5b65c6c`).
|
||||
|
||||
## 1. Goal and design summary
|
||||
|
||||
Two recipe CI builds may run **at the same time** on the single cc-ci host. Safety is enforced by
|
||||
the **harness**, not by serialising everything, and rests on ONE locking mechanism plus ONE
|
||||
structural isolation:
|
||||
|
||||
| Rule | Mechanism |
|
||||
|---|---|
|
||||
| Different recipes run in parallel | nothing blocks them (isolation, §3) |
|
||||
| Same-RECIPE runs run in parallel too | per-run `ABRA_DIR` recipe trees (§4) — no shared tree, no lock |
|
||||
| Same-DOMAIN runs (double-`!testme` of one PR) serialise | per-app-domain `flock` (§5) |
|
||||
| A starting run never reaps a live concurrent run's app | janitor probes the app lock; held = live (§6) |
|
||||
| A crashed/canceled/rebooted run's leftovers get reaped | lock auto-released by the kernel → probe acquires → reap (§6) |
|
||||
|
||||
The invariant chain that makes "held lock = live owner" sound:
|
||||
|
||||
```
|
||||
lock lifetime ⊆ harness process lifetime ⊆ drone step lifetime ⊆ 60-min hard deadline
|
||||
```
|
||||
|
||||
- **lock ⊆ process**: locks are kernel flocks on fds the process holds (and PEP 446 makes those
|
||||
fds non-inheritable, so abra/docker/pytest children never carry them). The kernel releases them
|
||||
on process death, however it dies. There is no unlock code path and no stale-lock failure mode.
|
||||
- **process ⊆ step**: `PR_SET_PDEATHSIG(SIGTERM)` + the `.drone.yml` setsid/trap wrap (§2) — a
|
||||
dead or canceled build cannot leak a running harness.
|
||||
- **step ⊆ 60 min**: `signal.alarm(3600)` self-deadline (§2).
|
||||
|
||||
Never steal a held lock; manage the holder's lifetime. There is **no daemon and no shared state
|
||||
service** — everything is kernel/file primitives under `/run/lock` and per-run directories.
|
||||
|
||||
## 2. Mechanism 0: run-lifetime hardening (`runner/harness/lifetime.py`)
|
||||
|
||||
`run_recipe_ci.main()` calls `lifetime.install_lifetime_guards()` before ANY abra call or lock
|
||||
acquisition:
|
||||
|
||||
1. **`PR_SET_PDEATHSIG(SIGTERM)`** (ctypes prctl, return code checked): if the parent — the drone
|
||||
step shell — dies, the kernel TERMs the harness. A post-prctl `ppid == 1` re-check closes the
|
||||
start race: a harness whose parent died *before* the prctl armed would never get the signal,
|
||||
so it refuses to run orphaned.
|
||||
2. **SIGTERM handler**: logs, then raises `SystemExit(143)` so the run's `finally:` teardown
|
||||
funnel executes and the process exits non-zero. Re-entrant signals during teardown are logged
|
||||
and IGNORED (`lifetime.begin_teardown()`, also set at the top of the run's `finally:` blocks)
|
||||
so a second signal can't abort the cleanup the first one asked for.
|
||||
3. **`signal.alarm(3600)` hard deadline**: SIGALRM funnels into the same teardown path with a
|
||||
distinct log line (`== run exceeded 60-minute hard deadline — tearing down ==`), exit 142.
|
||||
Recipes keep their own smaller per-tier timeouts; this bounds the whole run. Teardown time
|
||||
after the deadline is deliberately not alarm-bounded — the janitor is the backstop if a
|
||||
teardown wedges and the process is killed harder.
|
||||
|
||||
The `.drone.yml` recipe-ci step runs the harness as `setsid cc-ci-run … &` with a
|
||||
`trap 'kill -TERM -- "-$PID"' TERM EXIT; wait "$PID"` — a drone **cancel** (TERM to the step
|
||||
shell) is forwarded to the harness's whole process group instead of leaking it (the exec runner
|
||||
only kills the step shell). PDEATHSIG backstops the no-trap paths.
|
||||
|
||||
## 3. Isolation model: what is shared, what is per-run
|
||||
|
||||
Per-run (no conflict possible):
|
||||
|
||||
- **App + stack + volumes + secrets.** Run app domain = `naming.app_domain()` →
|
||||
`<recipe[:4]>-<sha1(recipe|pr|ref)[:6]>.ci.commoninternet.net`, unique per (recipe, pr, ref);
|
||||
everything abra creates is namespaced by it. Run apps are recognised by
|
||||
`RUN_APP_RE = ^[a-z0-9]{1,4}-[0-9a-f]{6}\.ci\.commoninternet\.net$`; warm/canonical apps
|
||||
(e.g. `warm-keycloak...`) deliberately do NOT match → the janitor never probes them.
|
||||
- **Recipe working trees** — `$ABRA_DIR/recipes/<recipe>`, per run (§4). NEW in the restructure.
|
||||
- **Drone build workspace** (`/var/lib/drone-runner/drone-<id>/`) and **run artifacts**
|
||||
(`/var/lib/cc-ci-runs/<run-id>/`).
|
||||
- **Run-scoped state files** (`/tmp/ccci-{deploys,opstate,deps,depskip}-<run-id>-<pid>…`) —
|
||||
keyed by run id + harness pid via `run_recipe_ci._run_state_path()`, NEVER by app domain.
|
||||
A second run of the same domain executes its `main()` preamble before blocking at the app
|
||||
lock (§5), so domain-keyed files would be reset/removed underneath the live first run
|
||||
(live finding, M2(c) double-`!testme`: false DG4.1 deploy-count in run 1, countfile
|
||||
`FileNotFoundError` in run 2). Tier/hook children get the exact paths via the
|
||||
`CCCI_*_FILE` env vars; removed on normal run exit.
|
||||
|
||||
Shared (by design, conflict-free):
|
||||
|
||||
- **`/root/.abra/servers`** — app `.env` files, one per domain. The per-run `ABRA_DIR` symlinks
|
||||
`servers/` here, so .env files land in the canonical path: janitor discovery (`abra app ls`)
|
||||
and out-of-run tooling see every app. Per-domain filenames + the app-domain lock prevent write
|
||||
conflicts.
|
||||
- **`/root/.abra/catalogue`** — read-mostly, symlinked into each per-run dir.
|
||||
- **`HOME=/root`** (forced in `.drone.yml`) — safe: nothing recipe-mutable lives under `~/.abra`
|
||||
for a run anymore except through the two symlinks above.
|
||||
|
||||
## 4. Mechanism 1: per-run `ABRA_DIR` (replaces the per-recipe flock)
|
||||
|
||||
`run_recipe_ci.setup_run_abra_dir()` — called first thing in `main()`, before any abra call —
|
||||
builds `<runs_dir>/<run-id>/abra/` (run-id = Drone build number; `manual-<pid>` for hand runs):
|
||||
|
||||
```
|
||||
abra/
|
||||
servers/ -> /root/.abra/servers (symlink; canonical shared .env path)
|
||||
catalogue/ -> /root/.abra/catalogue (symlink; read-mostly)
|
||||
recipes/ fresh, empty (THE isolation that matters)
|
||||
```
|
||||
|
||||
and exports it as `$ABRA_DIR` — honored by the abra CLI itself and by every harness path helper
|
||||
(`abra.abra_dir()` / `abra.recipe_dir()`; `generic._recipe_dir`, `prepull_images`,
|
||||
`snapshot_recipe_tests`, `warm_reconcile._recipe_dir` all route through the same rule:
|
||||
`$ABRA_DIR` if set, else `~/.abra`).
|
||||
|
||||
- `fetch_recipe()` is now a plain clone into `$ABRA_DIR/recipes/<recipe>` (PR-head clone+checkout
|
||||
or `abra recipe fetch`); the upgrade tier's mid-run `git checkout`s happen in the run's own
|
||||
tree. Two same-recipe runs can no longer corrupt each other — structurally, with no lock. The
|
||||
old observed failure (immich builds 229/230 deploying a tree missing its config) is impossible.
|
||||
- `CCCI_SKIP_FETCH=1` (test/Adversary staging) copies the canonically-staged
|
||||
`~/.abra/recipes/<recipe>` clone into the per-run tree.
|
||||
- Out-of-run flows (warm_reconcile's systemd timer, manual abra) set no `ABRA_DIR` and keep using
|
||||
the canonical `/root/.abra` unchanged. In-run flows that touch canonical state on purpose
|
||||
(warm/canonical .env files) go through `servers/` and are unaffected.
|
||||
- The per-run dir rides along the existing `/var/lib/cc-ci-runs/<run-id>/` retention. abra
|
||||
auto-clones any recipe it needs to resolve (e.g. during `app ls`) into the per-run `recipes/` —
|
||||
a few seconds of git per run, gone with the run dir.
|
||||
|
||||
## 5. Mechanism 2: per-app-domain flock (`lifecycle.acquire_app_lock`)
|
||||
|
||||
- Lock file: `/run/lock/cc-ci-app-<domain>.lock` (dir overridable via `CCCI_APP_LOCK_DIR` for the
|
||||
test suite), exclusive `fcntl.flock`, taken in `deploy_app()` **before the app is created** — a
|
||||
concurrent janitor can never see a run app without its held lock.
|
||||
- Blocks (with a log line: `== app lock: another run of <domain> is in flight — waiting ==`) when
|
||||
another run of the SAME domain is in flight — the double-`!testme` serialisation point; the
|
||||
waiting run is visibly parked at that line in its drone log, by design.
|
||||
- The returned file object is ALSO retained in module-level `_held_app_locks` — if a caller
|
||||
dropped it, GC would close the fd and silently release the lock.
|
||||
- mtime is touched at acquisition: lock age feeds the janitor's long-held flag (§6).
|
||||
- **Unlink/recreate race guard**: the janitor unlinks reaped lockfiles, so after EVERY
|
||||
acquisition the locked fd is verified to still be the inode the path names
|
||||
(`fstat().st_ino == stat().st_ino`); a waiter that won a just-unlinked inode closes it and
|
||||
retries on the live path. (A lock on an unlinked inode protects nothing: a later opener gets a
|
||||
fresh inode and would acquire "the same" lock.)
|
||||
- Release is implicit: process exit (any kind). `teardown_app()` does NOT release or unlink —
|
||||
a clean run's leftover lockfile is unheld and is unlinked on sight by the next janitor sweep.
|
||||
|
||||
## 6. The flock-probe janitor (`lifecycle.janitor`)
|
||||
|
||||
Runs at every run start (cold + quick paths) and in the warm/upgrade sweeps. Candidate discovery
|
||||
is unchanged from the old model: `abra app ls` + a docker-service sweep (catches stacks whose
|
||||
`.env` is already gone), both matched against `RUN_APP_RE` — warm/canonical apps never match and
|
||||
are never probed.
|
||||
|
||||
Decision table (per candidate domain, `_probe_and_reap`):
|
||||
|
||||
| Probe (`LOCK_EX\|LOCK_NB`) | Meaning | Action |
|
||||
|---|---|---|
|
||||
| acquires (+ inode identity OK) | nobody holds it → owner died (kernel-guaranteed) | **reap**: `teardown_app(verify=False)` WHILE HOLDING the probe lock, then unlink the lockfile, then release |
|
||||
| acquires, inode stale | another janitor reaped + unlinked while we raced | skip (reap already done; unlinking now would hit a newer run's file) |
|
||||
| `BlockingIOError` (held) | live concurrent run | leave it; if lockfile mtime > 120 min (2× the hard deadline): `!! lock for <domain> held >120min — possible leaked run; inspect with lslocks` — flag, **never steal** |
|
||||
| `open()` fails (`OSError`) | garbled/unopenable lockfile | skip + log, never crash |
|
||||
|
||||
- Reaping under the probe lock closes the janitor-vs-new-run race: a new run of that domain
|
||||
blocks in `acquire_app_lock` until the reap finishes — no window where a fresh app coexists
|
||||
with a half-reaped one.
|
||||
- Two racing janitors arbitrate on the flock: one reaps, the other sees "held" and leaves; reaps
|
||||
are idempotent (`teardown_app(verify=False)` tolerates half-gone stacks).
|
||||
- After the candidates, a tidy sweep unlinks stale **unheld** `cc-ci-app-*.lock` files with no
|
||||
app behind them (under their own probe lock + identity check), keeping `/run/lock` clean.
|
||||
- **Post-reboot**: `/run/lock` is tmpfs → lockfiles gone → every surviving app probes as an
|
||||
orphan → reaped immediately. (Improvement over the old 2-hour age fallback; there IS no age
|
||||
logic anymore.)
|
||||
|
||||
## 7. Failure-mode guarantees
|
||||
|
||||
| Event | Outcome |
|
||||
|---|---|
|
||||
| Run crashes / SIGKILL mid-run | flock auto-released by kernel → next janitor probe reaps app + lockfile |
|
||||
| Drone build canceled via API | step trap TERMs the harness process group → SIGTERM funnel runs the run's own teardown (exit 143); if anything still leaks, PDEATHSIG + janitor reap (the old "cancel leaks the harness" gap is CLOSED) |
|
||||
| Run exceeds 60 min | SIGALRM → distinct log line → own teardown → exit 142 |
|
||||
| Host reboot | locks and lockfiles vanish (tmpfs, correct: no owners survived) → all surviving run apps reaped at the next run start, immediately |
|
||||
| Two same-recipe `!testme`s (different PRs) | run in parallel — separate domains, separate per-run recipe trees |
|
||||
| Double-`!testme` (same PR → same domain) | second blocks on the app lock before creating anything, visibly in its drone log, runs after the first finishes |
|
||||
| Janitor vs. app being created | impossible to mis-reap: the lock is held before `app new`, and a held lock is never touched |
|
||||
| Janitor unlink vs. blocked waiter | inode identity re-check on every acquisition → waiter retries on the live path |
|
||||
| Lock held implausibly long (>120 min) | flagged loudly for a human (`lslocks`), never stolen |
|
||||
|
||||
## 8. Where convergence fits (adjacent; unchanged by the restructure)
|
||||
|
||||
Two swarm-convergence behaviors in `services_converged()` look like concurrency bugs but aren't —
|
||||
any future work must keep them fixed:
|
||||
|
||||
- **N/N replicas ≠ converged** during a stop-first rolling update — `UpdateStatus.State` is also
|
||||
inspected (build 238: backupbot exec'd into a container killed seconds later).
|
||||
- **`paused` persists forever** (swarm's default `update-failure-action`) — only `updating` and
|
||||
`rollback_started` block convergence; `paused`/`rollback_paused` are settled (build 241).
|
||||
- `backup_app()` additionally waits (bounded 300s) for convergence before `backup create`.
|
||||
|
||||
## 9. Configuration knobs
|
||||
|
||||
| Knob | Where | Current | Meaning |
|
||||
|---|---|---|---|
|
||||
| `DRONE_RUNNER_CAPACITY` (aka `MAX_TESTS`) | `nix/modules/drone-runner.nix` (`maxTests`) | `2` | **THE single concurrency knob.** Max builds the exec runner executes at once; Drone queues the rest. (The `.drone.yml` `concurrency.limit` duplicate was removed.) Change requires `nixos-rebuild switch`. |
|
||||
| `CCCI_APP_LOCK_DIR` | env, read at call time | unset → `/run/lock` | App-domain lockfile dir override — used by `tests/concurrency` to sandbox locks. Never set in production. |
|
||||
| hard deadline | `lifetime.HARD_DEADLINE_SECONDS` | 3600 s | the whole-run alarm; long-held flag threshold is 2× this (`LONG_HELD_LOCK_SECONDS`) |
|
||||
|
||||
## 10. Testing: `tests/concurrency/`
|
||||
|
||||
Real-kernel suite (19 planned cases + companions): helper subprocesses hold REAL flocks and
|
||||
install the REAL prctl/signal/alarm guards — flock itself is never mocked; the janitor runs with
|
||||
injected candidates + stubbed teardown but probes real locks. **Not part of the default
|
||||
`pytest tests/unit` gate** (it spawns processes and sleeps); run it explicitly:
|
||||
|
||||
```
|
||||
cc-ci-run -m pytest tests/concurrency -q
|
||||
```
|
||||
|
||||
Covers: kernel auto-release on SIGKILL; LOCK_NB probe semantics; PEP 446 fd non-inheritance;
|
||||
same-domain serialisation; orphan reap + unlink; live-run protection; reap-under-probe-lock
|
||||
blocking; two-janitor arbitration; reboot-immediate reap; long-held flag; RUN_APP_RE allowlist;
|
||||
degrade-on-garbage; PDEATHSIG; ppid start race; deadline + SIGTERM funnels; per-run ABRA_DIR
|
||||
construction/export; concurrent same-recipe fetch isolation; symlinked-servers .env canonicality;
|
||||
run-keyed (never domain-keyed) run-scoped state files (M2(c) regression, `test_run_state.py`).
|
||||
|
||||
## 11. File / symbol index
|
||||
|
||||
| What | Where |
|
||||
|---|---|
|
||||
| lifetime guards (PDEATHSIG, signal funnels, deadline) | `runner/harness/lifetime.py`; installed in `run_recipe_ci.main()` |
|
||||
| setsid/trap cancel forwarding | `.drone.yml` (`recipe-ci` step) |
|
||||
| `acquire_app_lock`, `_held_app_locks`, `_app_lock_path` | `runner/harness/lifecycle.py` |
|
||||
| `acquire_app_lock` call site | `lifecycle.deploy_app()` (before app creation) |
|
||||
| janitor + probe (`janitor`, `_probe_and_reap`, `LONG_HELD_LOCK_SECONDS`) | `runner/harness/lifecycle.py` |
|
||||
| per-run ABRA_DIR (`setup_run_abra_dir`, `fetch_recipe`) | `runner/run_recipe_ci.py` |
|
||||
| path resolution (`abra_dir`, `recipe_dir`) | `runner/harness/abra.py` (used by `generic`, `lifecycle.prepull_images`, `warm_reconcile`) |
|
||||
| run-app naming | `runner/harness/naming.py` (`app_domain`), `RUN_APP_RE` in `lifecycle.py` |
|
||||
| capacity knob | `nix/modules/drone-runner.nix` (`maxTests`) |
|
||||
| convergence (adjacent) | `lifecycle.services_converged()`, `lifecycle.backup_app()` |
|
||||
| the test suite | `tests/concurrency/` (`helpers.py` subprocess entrypoints, `concutil.py` probes) |
|
||||
|
||||
Deleted in the restructure (grep should find NOTHING): `register_run_app`, `unregister_run_app`,
|
||||
`_run_owner_state`, `ACTIVE_RUN_DIR`, `CCCI_JANITOR_MAX_AGE`, `_stack_age_seconds`,
|
||||
`acquire_recipe_lock`, `RECIPE_LOCK_DIR`.
|
||||
353
docs/recipe-customization.md
Normal file
353
docs/recipe-customization.md
Normal file
@ -0,0 +1,353 @@
|
||||
# Recipe customization — review spec
|
||||
|
||||
Status: REVIEW SPEC — describes the customization surface as it exists today (main), written so
|
||||
the structure can be reviewed and potentially restructured. §8 lists known limitations and
|
||||
restructuring candidates; everything before it is purely descriptive.
|
||||
|
||||
Companion docs: `docs/testing.md` (test architecture / tier semantics), `docs/enroll-recipe.md`
|
||||
(step-by-step enrollment). This doc is the **complete reference** for the two questions those docs
|
||||
answer only partially:
|
||||
|
||||
1. How are custom tests written for a particular recipe?
|
||||
2. What are ALL the per-recipe CI settings, where do they live, and who reads them?
|
||||
|
||||
---
|
||||
|
||||
## 1. The three customization surfaces
|
||||
|
||||
A recipe customizes its CI through **three distinct mechanisms** (worth noticing for the
|
||||
restructure review — they are three different config languages):
|
||||
|
||||
| Surface | Form | Examples |
|
||||
|---|---|---|
|
||||
| **Declarative settings** | Python assignments in `tests/<recipe>/recipe_meta.py` | `DEPLOY_TIMEOUT = 1500`, `UPGRADE_BASE_VERSION = "2.3.1+..."` |
|
||||
| **Code hooks** | Callables in `recipe_meta.py`, `ops.py` functions, shell hooks | `def READY_PROBE(domain): ...`, `pre_upgrade()`, `install_steps.sh` |
|
||||
| **File presence** | A file existing at a discovered path changes behavior | `test_upgrade.py` overlay, `functional/test_*.py`, `compose.ccci.yml` |
|
||||
|
||||
There is additionally a fourth, operator-facing surface: **environment variables**
|
||||
(`CCCI_SKIP_GENERIC*`) that override declarative settings at run time (§4.4).
|
||||
|
||||
## 2. Zero-config baseline
|
||||
|
||||
A recipe with **no `tests/<recipe>/` directory at all** still gets the full generic floor:
|
||||
|
||||
- deploy base version → INSTALL (generic `assert_serving`: HTTP on `/`, expect 200/301/302)
|
||||
- chaos-upgrade to PR head → UPGRADE (generic `assert_upgraded`: version label matches head, converged, serving)
|
||||
- BACKUP (generic `assert_backup_artifact`) — iff the recipe's compose files carry
|
||||
`backupbot.backup` labels (auto-detected), else N/A
|
||||
- RESTORE (generic `assert_restore_healthy`)
|
||||
- CUSTOM tier: empty (no custom tests discovered)
|
||||
- teardown
|
||||
|
||||
Defaults: `HEALTH_PATH="/"`, `HEALTH_OK=(200,301,302)`, `DEPLOY_TIMEOUT=600`, `HTTP_TIMEOUT=300`.
|
||||
Everything in this doc is opt-in deviation from that floor. The cardinal invariant
|
||||
(docs/testing.md §1): the generic floor is **always on** and never depends on custom code;
|
||||
custom is **additive** by default.
|
||||
|
||||
## 3. The per-recipe tree — every file that can exist
|
||||
|
||||
Two locations, with precedence and a security gate between them:
|
||||
|
||||
- **cc-ci-owned**: `tests/<recipe>/` in this repo (trusted, maintainer-reviewed)
|
||||
- **repo-local**: the recipe repo's own `tests/` dir (PR-author-controlled → **default-deny**,
|
||||
consulted only when the recipe is listed in `tests/repo-local-approved.txt` — gate HC2,
|
||||
centralized in `runner/harness/discovery.py`)
|
||||
|
||||
```
|
||||
tests/<recipe>/ # cc-ci side (repo-local mirrors the same shape)
|
||||
├── recipe_meta.py # ALL declarative settings + meta callables (§4)
|
||||
├── test_<op>.py # lifecycle overlay assertions, op ∈ install|upgrade|backup|restore (§5.1)
|
||||
├── ops.py # pre_<op>(domain, meta) seed hooks (§5.2)
|
||||
├── test_*.py # custom-tier tests (top-level, cross-cutting)(§5.3)
|
||||
├── functional/test_*.py # custom tier: parity ports + recipe-specific (§5.3)
|
||||
├── playwright/test_*.py # custom tier: UI flows (§5.3)
|
||||
├── install_steps.sh # pre-deploy shell hook (§5.4)
|
||||
├── setup_custom_tests.sh # deps/OIDC credential wiring hook (§5.5)
|
||||
├── compose.ccci.yml # CI-only compose overlay (via install_steps) (§5.6)
|
||||
└── PARITY.md # enrollment contract doc (human-read only)
|
||||
```
|
||||
|
||||
Precedence (machine-docs/DECISIONS.md, implemented in `discovery.py`):
|
||||
|
||||
- lifecycle overlay `test_<op>.py`: repo-local **wins** over cc-ci (same-name collision); the
|
||||
generic floor still runs additively alongside.
|
||||
- custom tier `test_*.py`: **ALL** run, from both locations (no collision concept).
|
||||
- `install_steps.sh`: repo-local > cc-ci, or none.
|
||||
- `ops.py` pre-op hook: cc-ci wins; repo-local consulted only if approved.
|
||||
- `recipe_meta.py`: cc-ci only — repo-local recipes cannot set CI settings (by design; the
|
||||
settings surface stays maintainer-controlled).
|
||||
|
||||
## 4. `recipe_meta.py` — complete settings reference
|
||||
|
||||
The single settings file. Plain Python, `exec()`d by the harness (trusted, in-repo). A key is "set"
|
||||
by a top-level assignment or `def`. Unknown names are ignored silently (a recipe may keep private
|
||||
constants here, e.g. mumble's `WELCOME_TEXT_MARKER` — but see §8 R6: typos in real key names are
|
||||
also silently ignored).
|
||||
|
||||
**Loader column legend** — this is the structural finding for the review (§8 R1). There is no
|
||||
single loader; six independent code paths each `exec()` the file and pick out their own keys:
|
||||
|
||||
| # | Loader | Keys it sees |
|
||||
|---|---|---|
|
||||
| L1 | `runner/run_recipe_ci.py:_load_meta` (orchestrator) | 4 base + explicit 8-key allowlist |
|
||||
| L2 | `tests/conftest.py:_recipe_meta` (pytest `meta` fixture) | 4 base keys ONLY |
|
||||
| L3 | `runner/harness/lifecycle.py:_recipe_extra_env` | `EXTRA_ENV` only |
|
||||
| L4 | `runner/harness/lifecycle.py:_recipe_meta_flag` | boolean flags by name (`CHAOS_BASE_DEPLOY`) |
|
||||
| L5 | `runner/harness/deps.py:declared_deps` | `DEPS` only |
|
||||
| L6 | `runner/harness/canonical.py:is_canonical_enrolled` | `WARM_CANONICAL` only |
|
||||
|
||||
### 4.1 HTTP / health / timing (base 4 — seen by L1 AND L2)
|
||||
|
||||
| Key | Type / default | Meaning | Used by |
|
||||
|---|---|---|---|
|
||||
| `HEALTH_PATH` | str, `"/"` | Path probed for serving/health checks | deploy wait (`lifecycle.py`), generic `assert_serving` |
|
||||
| `HEALTH_OK` | tuple, `(200, 301, 302)` | Acceptable HTTP status codes for health | same |
|
||||
| `DEPLOY_TIMEOUT` | int s, `600` | Max wait for swarm convergence per deploy | `lifecycle.py`, generic ops |
|
||||
| `HTTP_TIMEOUT` | int s, `300` | Max wait for HTTP health after converged | same |
|
||||
|
||||
Example: immich sets `DEPLOY_TIMEOUT = 1500`, `HTTP_TIMEOUT = 600` (ML containers are slow).
|
||||
|
||||
### 4.2 Upgrade tier (loader L1)
|
||||
|
||||
| Key | Type / default | Meaning |
|
||||
|---|---|---|
|
||||
| `UPGRADE_BASE_VERSION` | str (exact published tag), default `None` | **The "base pin"** — overrides the harness default base for the upgrade tier. Default base = `recipe_versions[-2]` (the previous published version); pin when that is not the PR's true predecessor (e.g. the PR is the first release on a new major, or the previous tag is known-broken). Must be an exact published tag — typos fail the base deploy. Consumed at `run_recipe_ci.py` (`prev = meta.get("UPGRADE_BASE_VERSION") or lifecycle.previous_version(recipe)`). Users: discourse, plausible. |
|
||||
| `UPGRADE_EXTRA_ENV` | dict **or** callable `(domain) -> dict`, default `None` | Extra `.env` keys applied **after** the PR-head checkout, **before** the chaos redeploy (F2-14c) — for env vars that exist only at head (a new required setting introduced by the PR). Consumed in `generic.py:256`. User: mumble. |
|
||||
|
||||
### 4.3 Every-deploy shaping (loaders L3/L4 — NOT in the L1 allowlist)
|
||||
|
||||
| Key | Type / default | Meaning |
|
||||
|---|---|---|
|
||||
| `EXTRA_ENV` | dict **or** callable `(domain) -> dict`, default `{}` | Extra `.env` keys applied at **every** deploy (base install AND upgrade old-app). Callable form derives values from the per-run domain (e.g. cryptpad's `SANDBOX_DOMAIN`). Loaded by `lifecycle.py:_recipe_extra_env` (its own `exec()`). Users: cryptpad, discourse, ghost, matrix-synapse, mattermost-lts, mumble, plausible. |
|
||||
| `CHAOS_BASE_DEPLOY` | bool, default `False` | Base deploy uses `--chaos` so it survives untracked files in the recipe checkout (required when `install_steps.sh` copies in a `compose.ccci.yml` overlay — §5.6; implicit coupling, see §8 R7). Loaded by `lifecycle.py:_recipe_meta_flag`. Users: discourse, ghost. |
|
||||
|
||||
### 4.4 Skips and intentional N/A (loader L1)
|
||||
|
||||
| Key | Type / default | Meaning |
|
||||
|---|---|---|
|
||||
| `SKIP_GENERIC` | list of op names or `"all"`/`"*"`, default `[]` | Suppress the generic floor for the listed ops (overlay becomes override instead of additive). Two env equivalents at run time: `CCCI_SKIP_GENERIC=1` (all ops), `CCCI_SKIP_GENERIC_<OP>=1` (one op). Currently set by **no enrolled recipe** (env form is the one used, ad hoc). |
|
||||
| `EXPECTED_NA` | dict `{rung: reason}`, default `None` | Declares an N/A rung **intentional** (e.g. `{"backup": "stateless, nothing to back up"}`). Undeclared N/A is reported as an *unintentional coverage gap*. Both cap the achievable level — declaring does not un-cap, it only changes the report wording (`results.py`). User: custom-html-tiny. |
|
||||
| `BACKUP_CAPABLE` | bool, default auto-detect | Overrides the backup-tier capability detection (scan of recipe compose files for `backupbot.backup` labels, `generic.py:34`). `False` forces N/A; `True` forces the tier on. Users: custom-html-bkp-bad/rst-bad (harness self-test recipes). |
|
||||
|
||||
### 4.5 Readiness & data-verification hooks (loader L1, callable values)
|
||||
|
||||
| Key | Type / default | Meaning |
|
||||
|---|---|---|
|
||||
| `READY_PROBE` | callable `(domain) -> [probe, ...]`, default `None` | Extra readiness probes run after install AND after upgrade, before that tier's assertions. Probe dicts: HTTP `{host, path, ok}` or TCP `{tcp_host, tcp_port, stable}` (`stable`: must stay connectable across 3 checks — for UDP-adjacent voice ports etc.). Consumed at `lifecycle.py:516`. Users: lasuite-drive, mumble (TCP voice port). |
|
||||
| `BACKUP_VERIFY` | callable `(domain) -> bool`, default `None` | Post-backup data-capture check, retried — guards the truncated-dump race (backup snapshot taken before the seeded marker row hit disk). Return `False` → retry the backup, then fail. Users: discourse, ghost. |
|
||||
|
||||
### 4.6 Dependencies / SSO (loaders L5 + L1)
|
||||
|
||||
| Key | Type / default | Meaning |
|
||||
|---|---|---|
|
||||
| `DEPS` | list of recipe names, default `[]` | Dep recipes deployed alongside (e.g. `["keycloak"]`). Dep domain is `<dep[:4]>-<6hex>`, hashed from (parent, pr, ref, dep) — collision-free per run. Creds land in `$CCCI_DEPS_FILE` (JSON); tests use the `deps_apps` fixture; teardown deps LAST. Deploy-count guard becomes `1 + len(DEPS)`. Loaded by `deps.py:declared_deps`. Users: lasuite-docs/-drive/-meet. |
|
||||
| `OIDC_AT_INSTALL` | bool, default `False` | Provision deps **before** the single base deploy so `install_steps.sh` can wire OIDC env into that one deploy (reads `$CCCI_DEPS_FILE`). Default (legacy) is post-deploy provisioning + a `setup_custom_tests.sh` redeploy. Consumed at `run_recipe_ci.py:514`. Users: lasuite-drive, lasuite-meet. |
|
||||
|
||||
### 4.7 Warm-canonical enrollment (loader L6)
|
||||
|
||||
| Key | Type / default | Meaning |
|
||||
|---|---|---|
|
||||
| `WARM_CANONICAL` | bool, default `False` | Enrolls the recipe in the warm/canonical app system (`docs/warm.md`): green COLD runs on LATEST advance the canonical snapshot; the nightly sweep iterates enrolled recipes. Loaded by `canonical.py:is_canonical_enrolled`. User: custom-html. |
|
||||
|
||||
### 4.8 Cosmetic (BROKEN — see §8 R2)
|
||||
|
||||
| Key | Type / default | Meaning |
|
||||
|---|---|---|
|
||||
| `SCREENSHOT` | callable `(page, domain, meta) -> None` | Drives Playwright to a safe post-login view for the results-card screenshot (default: landing page). **Currently unreachable from the CI path**: `screenshot.py:41` reads it from the meta dict the orchestrator passes (`run_recipe_ci.py:1056`), but the L1 allowlist never loads `SCREENSHOT`, so the hook is always `None`. No recipe sets it (consistent with it never having worked). |
|
||||
|
||||
## 5. Writing custom tests & hooks
|
||||
|
||||
### 5.1 Lifecycle overlay assertions — `test_<op>.py`
|
||||
|
||||
One pytest file per lifecycle op (`install` / `upgrade` / `backup` / `restore`). The
|
||||
**orchestrator performs the op exactly once**; the overlay only *asserts* on the resulting state
|
||||
(HC3 op/assertion split — overlays never deploy, never restore, never mutate). The generic floor
|
||||
test runs additively against the same state.
|
||||
|
||||
Conventions (see `tests/immich/test_backup.py` etc.):
|
||||
- use the `live_app` fixture (asserts `CCCI_APP_DOMAIN` is set, yields the domain)
|
||||
- use the `meta` fixture for HEALTH_*/timeouts (note: only the 4 base keys — §8 R3)
|
||||
- read op context from `$CCCI_OP_STATE_FILE` (JSON written by the orchestrator after the op:
|
||||
versions, artifact paths)
|
||||
- execute in-container checks via `harness.lifecycle.exec_in_app(domain, service, cmd)`
|
||||
|
||||
### 5.2 Pre-op seed hooks — `ops.py`
|
||||
|
||||
`def pre_<op>(domain, meta)` callables, imported and called by the orchestrator **before**
|
||||
performing the op. This is where data gets seeded so the post-op overlay can assert on it:
|
||||
|
||||
```python
|
||||
# tests/immich/ops.py (pattern)
|
||||
def pre_upgrade(domain, meta): _psql(domain, "INSERT ... 'upgrade-survives'")
|
||||
def pre_backup(domain, meta): _psql(domain, "INSERT ... 'original'")
|
||||
def pre_restore(domain, meta): _psql(domain, "DROP TABLE ci_marker") # damage, restore must undo
|
||||
```
|
||||
|
||||
Seed → op → assert is the whole pattern: `pre_backup` writes a marker, the orchestrator backs up,
|
||||
`pre_restore` destroys it, the orchestrator restores, `test_restore.py` asserts the marker is back.
|
||||
|
||||
### 5.3 Custom tier — `functional/`, `playwright/`, top-level `test_*.py`
|
||||
|
||||
All non-lifecycle `test_*.py` (discovery: `discovery.py:custom_tests`, recursive over the
|
||||
top-level dir + `functional/` + `playwright/`; files named `test_<op>.py` excluded). Run in the
|
||||
CUSTOM tier, after restore, against the post-upgrade (PR-head) app. ALL discovered files run —
|
||||
cc-ci's and (if HC2-approved) repo-local's, additively.
|
||||
|
||||
Enrollment contract (`docs/enroll-recipe.md`): ≥2 NEW functional tests beyond ports of existing
|
||||
upstream checks; ported tests carry `SOURCE:` comments. Playwright tests get the shared
|
||||
browser/harness helpers (`harness.browser`); SSO recipes get `harness.sso`
|
||||
(`setup_keycloak_realm` — idempotent, `oidc_password_grant` — provider-pluggable).
|
||||
|
||||
Tests gate on deps via `CCCI_DEPS_READY` (skip-with-reason when `0`; the skip is counted and
|
||||
fails the run if deps were declared but unprovisionable — `run_recipe_ci.py:816`).
|
||||
|
||||
### 5.4 Pre-deploy shell hook — `install_steps.sh`
|
||||
|
||||
Runs after `abra app new` + `EXTRA_ENV` application + secret generation, **before** the base
|
||||
deploy. For setup that must precede the first deploy: writing extra config files into the recipe
|
||||
checkout, copying in a `compose.ccci.yml` overlay (§5.6), editing `.env` beyond simple key=val.
|
||||
|
||||
Env contract: `CCCI_APP_DOMAIN`, `CCCI_RECIPE`, `CCCI_APP_ENV` (path to the app's `.env`), and —
|
||||
when `OIDC_AT_INSTALL` deps exist — `CCCI_DEPS_FILE`. Must locate the recipe checkout
|
||||
ABRA_DIR-aware: `RECIPE_DIR="${ABRA_DIR:-${HOME}/.abra}/recipes/${CCCI_RECIPE}"` (per-run
|
||||
`ABRA_DIR` since the concurrency restructure — a hardcoded `~/.abra` writes to the wrong tree).
|
||||
|
||||
Graceful-generic rule: a recipe needing a hook but not shipping one simply fails the generic
|
||||
install — a correct reported outcome, not a harness error.
|
||||
|
||||
### 5.5 Deps credential wiring — `setup_custom_tests.sh`
|
||||
|
||||
For legacy (post-deploy) deps provisioning: runs after deps are up, reads `$CCCI_DEPS_FILE`
|
||||
(jq-readable JSON of dep creds/URLs), wires OIDC config via `abra app config set` + secrets, and
|
||||
redeploys. With `OIDC_AT_INSTALL = True` this hook is unnecessary (wiring happens in
|
||||
`install_steps.sh` before the only deploy) — preferred for new enrollments (one deploy, no
|
||||
deploy-count exception).
|
||||
|
||||
### 5.6 CI-only compose overlay — `compose.ccci.yml`
|
||||
|
||||
Not auto-discovered: `install_steps.sh` copies it into the recipe checkout, and the recipe must
|
||||
set `CHAOS_BASE_DEPLOY = True` so the base deploy (`--chaos`) tolerates the untracked file.
|
||||
Policy: minimal, justified fallback only (ghost's is a 15m `start_period` grace — a literal,
|
||||
because abra validates `start_period` before env substitution). The overlay is cc-ci-owned even
|
||||
though it rides in the recipe checkout.
|
||||
|
||||
### 5.7 Environment contract summary (what custom code can read)
|
||||
|
||||
| Var | Set for | Meaning |
|
||||
|---|---|---|
|
||||
| `CCCI_APP_DOMAIN` | all tests + hooks | the app's per-run domain |
|
||||
| `CCCI_BASE_URL` | approved repo-local code | `https://<domain>` |
|
||||
| `CCCI_RECIPE`, `CCCI_APP_ENV` | `install_steps.sh` | recipe name, app `.env` path |
|
||||
| `CCCI_OP_STATE_FILE` | overlay tests | JSON op context (versions, artifacts) |
|
||||
| `CCCI_DEPS_FILE` | deps hooks + tests | JSON dep creds dict |
|
||||
| `CCCI_DEPS_READY` / `CCCI_DEPS_NOT_READY_REASON` | custom tier | gate SSO tests, skip-with-reason |
|
||||
|
||||
## 6. Run-model context (what the settings plug into)
|
||||
|
||||
One deploy chain per run (full detail: `docs/testing.md` §2):
|
||||
|
||||
```
|
||||
deploy BASE (UPGRADE_BASE_VERSION or recipe_versions[-2]; EXTRA_ENV; install_steps.sh;
|
||||
CHAOS_BASE_DEPLOY?; OIDC_AT_INSTALL deps first?)
|
||||
→ INSTALL tier (READY_PROBE; generic + overlay asserts)
|
||||
→ pre_upgrade → chaos-deploy PR HEAD (UPGRADE_EXTRA_ENV)
|
||||
→ UPGRADE tier (READY_PROBE; version-label == head_ref)
|
||||
→ pre_backup → backup (BACKUP_CAPABLE; BACKUP_VERIFY)
|
||||
→ BACKUP tier
|
||||
→ pre_restore → restore
|
||||
→ RESTORE tier
|
||||
→ CUSTOM tier (functional/ + playwright/; deps via CCCI_DEPS_*)
|
||||
→ teardown (deps LAST)
|
||||
```
|
||||
|
||||
Deploy-count guard (DG4.1): exactly `1 + len(DEPS)` deploys per run (chaos redeploys don't
|
||||
count); the per-run counter file is keyed by run since the concurrency restructure.
|
||||
|
||||
## 7. Local iteration
|
||||
|
||||
```
|
||||
RECIPE=<recipe> PR=<n> REF=<sha> SRC=recipe-maintainers/<recipe> \
|
||||
STAGES=install,upgrade,backup,restore,custom \
|
||||
cc-ci-run runner/run_recipe_ci.py
|
||||
```
|
||||
|
||||
(`docs/enroll-recipe.md` §5 for the full loop, including dep teardown caveats.)
|
||||
|
||||
## 8. Known limitations & restructuring candidates
|
||||
|
||||
The review section. Ordered by how much they'd shape a restructure.
|
||||
|
||||
**R1 — Six divergent meta loaders (the core drift hazard).** §4's L1–L6: every loader re-`exec()`s
|
||||
`recipe_meta.py` and cherry-picks its own keys. Adding a key means knowing *which* loader to touch
|
||||
(or that you must extend the L1 allowlist — `SCREENSHOT` proves people don't, R2). Two conventions
|
||||
coexist: L1's explicit allowlist vs L3–L6's ad-hoc `ns.get(...)` which silently bypasses it.
|
||||
*Candidate:* one `harness.meta.load(recipe) -> RecipeMeta` with a declarative key registry
|
||||
(name, type, default, validator, consumer) as the single source of truth; L1–L6 become lookups
|
||||
into the one loaded object; the registry also generates §4 of this doc (kills doc drift, R5).
|
||||
|
||||
**R2 — `SCREENSHOT` is a dead knob.** Fully implemented consumer (`screenshot.py`), documented
|
||||
hook contract, never reachable: the orchestrator's allowlist omits it, so the dict passed at
|
||||
`run_recipe_ci.py:1056` can never contain it. Direct evidence of R1. *Candidate:* fix trivially by
|
||||
adding to the allowlist — or delete the hook path if post-login screenshots aren't wanted; decide
|
||||
during the restructure.
|
||||
|
||||
**R3 — The pytest `meta` fixture sees 4 keys.** `tests/conftest.py:_recipe_meta` loads only
|
||||
HEALTH_*/timeouts. An overlay test wanting e.g. `EXPECTED_NA` or a recipe constant must re-exec
|
||||
the file itself. Probably intended minimalism, but it's a third key-set to keep in sync.
|
||||
*Folds into R1.*
|
||||
|
||||
**R4 — Settings split across three config languages** (§1): recipe_meta keys, file-presence
|
||||
(`install_steps.sh` existing changes deploy behavior), and run-time env (`CCCI_SKIP_GENERIC*`).
|
||||
A reviewer asking "what does this recipe customize?" must check all three. *Candidate:* keep the
|
||||
three surfaces (they serve different actors) but make the run header log a single resolved
|
||||
"customization manifest" per run: every non-default key + every discovered hook file + every
|
||||
CCCI_* override, in one block.
|
||||
|
||||
**R5 — Reference-doc drift already happened.** `docs/testing.md` documents 6 meta keys,
|
||||
`docs/enroll-recipe.md` shows others by example; neither is complete (18 keys exist). This doc is
|
||||
now complete but handwritten — it will drift too. *Candidate:* generate the key table from the R1
|
||||
registry (test asserts doc ⊆ registry).
|
||||
|
||||
**R6 — No schema validation / silent typos.** Unknown top-level names in `recipe_meta.py` are
|
||||
ignored, which is load-bearing (recipes keep private constants there: mumble's
|
||||
`WELCOME_TEXT_MARKER`, `MAX_USERS`). Consequence: misspelling `READY_PROBE` as `READINESS_PROBE`
|
||||
silently disables the probe — the run goes green with less coverage, the worst failure mode for a
|
||||
CI harness. *Candidate:* with the R1 registry, warn (not fail) on ALL-CAPS top-level names that
|
||||
are not registered and not referenced by the recipe's own tests; or namespace private constants
|
||||
(`_WELCOME_TEXT_MARKER`).
|
||||
|
||||
**R7 — `compose.ccci.yml` ⇄ `CHAOS_BASE_DEPLOY` implicit coupling.** The overlay only works if
|
||||
the recipe *also* sets the flag; forgetting it fails the base deploy with an abra
|
||||
untracked-files error far from the cause. *Candidate:* if `install_steps.sh` exists alongside a
|
||||
`compose.ccci.yml`, the harness could auto-enable chaos for the base deploy (or at least assert
|
||||
the flag and fail with a pointed message).
|
||||
|
||||
**R8 — `SKIP_GENERIC` (meta form) has zero users.** Only the env-var form is used, ad hoc. Either
|
||||
the meta key earns its place (first real user) or it's surface to delete in the restructure.
|
||||
|
||||
**R9 — `recipe_meta.py` is code, not config.** Five keys take callables (`EXTRA_ENV`,
|
||||
`UPGRADE_EXTRA_ENV`, `READY_PROBE`, `BACKUP_VERIFY`, `SCREENSHOT`), so the file must stay an
|
||||
`exec()`d Python module — it can't be validated as data, serialized into results, or diffed
|
||||
declaratively. This is a real expressiveness need (cryptpad derives `SANDBOX_DOMAIN` from the
|
||||
per-run domain), not an accident. *Candidate if restructuring:* split data keys (TOML-able,
|
||||
schema-validated) from a `hooks.py` (callables only) — but weigh against the cost of two files
|
||||
per recipe; the R1 registry gets most of the value without the split.
|
||||
|
||||
## 9. File / symbol index
|
||||
|
||||
| Concern | Where |
|
||||
|---|---|
|
||||
| Orchestrator meta loader (L1, allowlist) | `runner/run_recipe_ci.py:250` `_load_meta` |
|
||||
| Pytest meta fixture (L2) | `tests/conftest.py` `_recipe_meta` |
|
||||
| `EXTRA_ENV` loader (L3) | `runner/harness/lifecycle.py:114` `_recipe_extra_env` |
|
||||
| Boolean-flag loader (L4) | `runner/harness/lifecycle.py:132` `_recipe_meta_flag` |
|
||||
| `DEPS` loader (L5) | `runner/harness/deps.py:37` `declared_deps` |
|
||||
| `WARM_CANONICAL` loader (L6) | `runner/harness/canonical.py:36` `is_canonical_enrolled` |
|
||||
| Overlay/custom/hook discovery + HC2 gate | `runner/harness/discovery.py` |
|
||||
| HC2 allowlist | `tests/repo-local-approved.txt` |
|
||||
| Generic assertions + `BACKUP_CAPABLE` detect | `runner/harness/generic.py` |
|
||||
| `READY_PROBE` / `CHAOS_BASE_DEPLOY` consumption | `runner/harness/lifecycle.py:516` / `:283` |
|
||||
| `EXPECTED_NA` reporting | `runner/harness/results.py` |
|
||||
| Dead `SCREENSHOT` consumer | `runner/harness/screenshot.py:36`, called `run_recipe_ci.py:1056` |
|
||||
| Skip-generic logic (meta + env) | `runner/run_recipe_ci.py:285` |
|
||||
| Worked examples | `tests/ghost/` (overlay+chaos), `tests/mumble/` (TCP probe, UPGRADE_EXTRA_ENV), `tests/lasuite-drive/` (DEPS+OIDC_AT_INSTALL), `tests/immich/` (ops.py seed pattern) |
|
||||
54
flake.nix
54
flake.nix
@ -31,34 +31,36 @@
|
||||
];
|
||||
in
|
||||
{
|
||||
# Canonical live host target: the Hetzner cc-ci server.
|
||||
# Use `.#cc-ci` for the current production host.
|
||||
nixosConfigurations.cc-ci = nixpkgs.lib.nixosSystem {
|
||||
inherit system;
|
||||
modules = [
|
||||
sops-nix.nixosModules.sops
|
||||
./nix/hosts/cc-ci-hetzner/configuration.nix
|
||||
];
|
||||
};
|
||||
nixosConfigurations = {
|
||||
# Canonical live host target: the Hetzner cc-ci server.
|
||||
# Use `.#cc-ci` for the current production host.
|
||||
cc-ci = nixpkgs.lib.nixosSystem {
|
||||
inherit system;
|
||||
modules = [
|
||||
sops-nix.nixosModules.sops
|
||||
./nix/hosts/cc-ci-hetzner/configuration.nix
|
||||
];
|
||||
};
|
||||
|
||||
# Legacy Incus VM host definition retained only for historical comparison and fallback.
|
||||
# Do NOT use this target on the live Hetzner server.
|
||||
nixosConfigurations.cc-ci-incus = nixpkgs.lib.nixosSystem {
|
||||
inherit system;
|
||||
modules = [
|
||||
sops-nix.nixosModules.sops
|
||||
./nix/hosts/cc-ci/configuration.nix
|
||||
];
|
||||
};
|
||||
# Legacy Incus VM host definition retained only for historical comparison and fallback.
|
||||
# Do NOT use this target on the live Hetzner server.
|
||||
cc-ci-incus = nixpkgs.lib.nixosSystem {
|
||||
inherit system;
|
||||
modules = [
|
||||
sops-nix.nixosModules.sops
|
||||
./nix/hosts/cc-ci/configuration.nix
|
||||
];
|
||||
};
|
||||
|
||||
# Explicit alias for the live Hetzner host. Kept alongside `cc-ci` so the intended host target
|
||||
# remains obvious in recovery/migration workflows.
|
||||
nixosConfigurations.cc-ci-hetzner = nixpkgs.lib.nixosSystem {
|
||||
inherit system;
|
||||
modules = [
|
||||
sops-nix.nixosModules.sops
|
||||
./nix/hosts/cc-ci-hetzner/configuration.nix
|
||||
];
|
||||
# Explicit alias for the live Hetzner host. Kept alongside `cc-ci` so the intended host
|
||||
# target remains obvious in recovery/migration workflows.
|
||||
cc-ci-hetzner = nixpkgs.lib.nixosSystem {
|
||||
inherit system;
|
||||
modules = [
|
||||
sops-nix.nixosModules.sops
|
||||
./nix/hosts/cc-ci-hetzner/configuration.nix
|
||||
];
|
||||
};
|
||||
};
|
||||
|
||||
devShells.${system} = {
|
||||
|
||||
131
machine-docs/BACKLOG-regression.md
Normal file
131
machine-docs/BACKLOG-regression.md
Normal file
@ -0,0 +1,131 @@
|
||||
# BACKLOG — server regression canaries phase
|
||||
|
||||
## Build backlog
|
||||
|
||||
- [x] Create `tests/regression/` suite (conftest + test_canaries + README)
|
||||
- [ ] Run `good-simple` canary (custom-html-tiny main) → confirm GREEN + test_serving passes
|
||||
- [ ] Run `bad-false-green` canary (custom-html v5-stale-docroot) → confirm RED + test_content_type fails
|
||||
- [ ] Run `good-significant` canary (lasuite-docs main) → confirm GREEN + test_serving_and_frontend passes
|
||||
- [ ] Open PR for operator review (DoD item 5: NOT merged)
|
||||
- [ ] Claim gate once all canary runs are GREEN/RED as expected + PR is open
|
||||
|
||||
## Adversary findings
|
||||
|
||||
### A-reg-1 [adversary] CLOSED @2026-06-02T01:46Z — relative import fixed, 3 tests collect
|
||||
**Filed:** 2026-06-02T01:37Z
|
||||
**Severity:** CRITICAL — suite can't run at all until fixed
|
||||
|
||||
Cold-run `cc-ci-run -m pytest tests/regression/ --collect-only` on cc-ci confirms:
|
||||
```
|
||||
ImportError: attempted relative import with no known parent package
|
||||
tests/regression/test_canaries.py:18: from .conftest import run_recipe_ci, ...
|
||||
```
|
||||
No tests collected. 0 canaries can run.
|
||||
|
||||
**Root cause:** `test_canaries.py` uses a relative import (`from .conftest import ...`) which
|
||||
requires the directory to be a Python package. Without `tests/regression/__init__.py` (and
|
||||
`tests/__init__.py`), pytest imports `test_canaries.py` as a top-level module, not a package
|
||||
member. Relative imports fail.
|
||||
|
||||
**Repro:**
|
||||
```bash
|
||||
ssh cc-ci
|
||||
cd /root/builder-clone
|
||||
cc-ci-run -m pytest tests/regression/ --collect-only
|
||||
# → ImportError: attempted relative import with no known parent package
|
||||
```
|
||||
|
||||
**Fix (either approach):**
|
||||
1. Add `tests/__init__.py` and `tests/regression/__init__.py` (makes it a real package)
|
||||
2. OR replace `from .conftest import ...` with absolute sys.path manipulation (like other test
|
||||
files do, e.g. `sys.path.insert(0, ...); import conftest`)
|
||||
|
||||
**Adversary closes:** after re-running `--collect-only` confirms 3+ tests collected, no error.
|
||||
|
||||
---
|
||||
|
||||
### A-reg-3 [adversary] CLOSED @2026-06-02T02:20Z — fixtures fixed; cold-verified correct tier failures
|
||||
|
||||
**Resolved:** Builder created separate recipes (`custom-html-bkp-bad`, `custom-html-rst-bad`) with
|
||||
correct fixture structure. Cold-verified from cc-ci artifact dirs (no harness re-run needed).
|
||||
|
||||
**Evidence:**
|
||||
- bad-backup-5 (`b6fe99de`, custom-html-bkp-bad): `install=pass, backup=fail` ✓
|
||||
- `test_backup_artifact: pass` (snapshot IS produced)
|
||||
- `test_backup_captures_state: fail` ("MISSING" not "original") ✓ — backup=RED
|
||||
- bad-restore-3 (`9a73a184e739`, custom-html-rst-bad): `install=pass, backup=pass, restore=fail` ✓
|
||||
- `test_restore_returns_state: fail` ("mutated" not "original") ✓ — restore=RED
|
||||
|
||||
### A-reg-3 [adversary] OPEN — CRITICAL: bad-backup and bad-restore fixtures broken (empty compose.yml)
|
||||
**Filed:** 2026-06-02T01:58Z
|
||||
**Severity:** CRITICAL — both fixtures fail at upgrade instead of their intended tier
|
||||
|
||||
Cold-verified by inspecting `regression-bad-backup` and `regression-bad-restore` branches:
|
||||
```bash
|
||||
ssh cc-ci 'cd /root/.abra/recipes/custom-html && git diff origin/main..origin/regression-bad-backup -- compose.yml'
|
||||
```
|
||||
Result: compose.yml is completely empty (entire file deleted, leaving only a blank line). Same
|
||||
for `regression-bad-restore`.
|
||||
|
||||
**Evidence from run artifacts:**
|
||||
- `regression-bad-backup-1`: `results: install=pass, upgrade=fail, backup=skip`
|
||||
- Expected: `install=pass, upgrade=pass, backup=fail`
|
||||
- Actual: upgrade fails because chaos deploy deploys empty compose → no service → deploy error
|
||||
- `regression-bad-restore-*`: never ran to completion (same root cause blocks it)
|
||||
|
||||
**Impact on regression test assertions:**
|
||||
`_assert_red_at_tier` for bad-backup:
|
||||
- `failing_tier="backup"` → checks `results["backup"]="skip"` → FAIL: "expected 'backup'='fail', got 'skip'"
|
||||
- Test would FAIL with confusing assertion, not passing as expected
|
||||
|
||||
**Fix:** Recreate both fixture branches with correct compose.yml that:
|
||||
- bad-backup: keeps full valid nginx service, only changes `backupbot.backup.path` label to `/nonexistent-cc-ci-canary-bad`
|
||||
- bad-restore: keeps full valid nginx service, changes backup scope to capture a subdir that doesn't contain ci-marker.txt (so restore doesn't recover the marker)
|
||||
|
||||
The compose.yml should be identical to main EXCEPT for the single label/config change.
|
||||
|
||||
**Repro:** `git diff origin/main..origin/regression-bad-backup -- compose.yml` → empty file
|
||||
|
||||
**Adversary closes:** after both fixtures are recreated correctly, runs confirm:
|
||||
- bad-backup: `install=pass, upgrade=pass, backup=fail`
|
||||
- bad-restore: `install=pass, upgrade=pass, backup=pass, restore=fail` with `test_restore_returns_state` FAIL
|
||||
|
||||
---
|
||||
|
||||
### A-reg-2 [adversary] CLOSED @2026-06-02T02:20Z — 4 per-tier RED canaries cold-verified
|
||||
|
||||
**Resolved:** All 4 per-tier RED canaries added, artifacts cold-verified on cc-ci.
|
||||
|
||||
| Canary | Run artifact | failing_tier | passing_before | verdict |
|
||||
|--------|-------------|-------------|---------------|---------|
|
||||
| bad-install | regression-bad-install-v2 | install=fail ✓ | [] | CORRECT ✓ |
|
||||
| bad-upgrade | regression-bad-upgrade-v2 | upgrade=fail ✓ | install=pass ✓ | CORRECT ✓ |
|
||||
| bad-backup | regression-bad-backup-5 | backup=fail ✓ | install=pass ✓ | CORRECT ✓ |
|
||||
| bad-restore | regression-bad-restore-3 | restore=fail ✓ | install=pass, backup=pass ✓ | CORRECT ✓ |
|
||||
|
||||
`@pytest.mark.canary_fast` marker added ✓. 7 tests collect ✓.
|
||||
|
||||
**Note:** bad-backup comment in test_canaries.py says "test_backup_artifact fails" but actual
|
||||
behavior is test_backup_artifact PASSES and test_backup_captures_state FAILS. Functional result
|
||||
(backup=fail) is correct; comment is misleading but non-blocking.
|
||||
|
||||
### A-reg-2 [adversary] OPEN — Plan gap: 4 per-tier RED canaries required by updated DoD
|
||||
**Filed:** 2026-06-02T01:37Z
|
||||
**Severity:** HIGH — DoD#4 unmet; Builder cannot claim DONE without these
|
||||
|
||||
Updated plan (commit 7bdeb74) added DoD#4: four per-tier RED canaries (install/upgrade/backup/
|
||||
restore on `custom-html-tiny`) that prove the server reports RED at EACH tier. Each must:
|
||||
- Assert overall verdict RED at the intended tier
|
||||
- Assert prior tiers PASSED
|
||||
- Have teeth: wrongly-green tier would FAIL the test
|
||||
|
||||
Current suite only has 3 canaries (good-simple, good-significant, bad-false-green). The 4
|
||||
per-tier RED canaries are MISSING. This is a mandatory DoD item.
|
||||
|
||||
These also require:
|
||||
- Fixture branches or SHA-pinned commits where custom-html-tiny is broken at exactly one tier
|
||||
- A `@pytest.mark.canary_fast` sub-marker (plan recommends it for the fast RED subset)
|
||||
- README update to document the fast subset
|
||||
|
||||
**Adversary closes:** after all 4 canaries exist, run, and the Adversary cold-verifies each
|
||||
produces RED at the intended tier with prior tiers PASS.
|
||||
@ -1283,3 +1283,15 @@ the commit), which is the correct SCM integration.
|
||||
environment; job is session-persistent (survives as long as Builder session runs). T0-refire
|
||||
verified: CronCreate test fire at 23:17Z → upgrader started, upgrader-cron.log created, status
|
||||
RUNNING. (2026-06-01)
|
||||
|
||||
## conc P3 (2026-06-10, Builder): install_steps.sh hooks resolve $ABRA_DIR — guardrail note
|
||||
|
||||
P3 makes recipe working trees per-run ($ABRA_DIR/recipes). tests/{ghost,discourse}/install_steps.sh
|
||||
hard-coded `${HOME}/.abra/recipes/...` to copy their compose.ccci.yml overlay into the deploy tree;
|
||||
under per-run trees that path is the WRONG (canonical) tree, so the overlay would silently miss the
|
||||
deploy and both recipes' upgrade-tier base deploys would break. Fixed with ONE mechanical line per
|
||||
hook: `RECIPE_DIR="${ABRA_DIR:-${HOME}/.abra}/recipes/${CCCI_RECIPE}"` (identical resolution rule to
|
||||
the abra CLI and abra.recipe_dir()). No test assertion, gate, or overlay content was touched — the
|
||||
phase guardrail's "never touch tests/<recipe>/ content" is read as protecting test/gate SEMANTICS;
|
||||
this is required P3 fallout, equivalent to the harness-side path routing. Flagged here for the
|
||||
Adversary's gate-integrity review.
|
||||
|
||||
76
machine-docs/JOURNAL-regression.md
Normal file
76
machine-docs/JOURNAL-regression.md
Normal file
@ -0,0 +1,76 @@
|
||||
# JOURNAL — server regression canaries phase (Builder)
|
||||
|
||||
**Phase:** server regression canaries
|
||||
**Started:** 2026-06-02
|
||||
|
||||
---
|
||||
|
||||
## Step 0 — phase kickoff and design (2026-06-02)
|
||||
|
||||
**Context:** Mirror phase (plan-mirror-enroll-all-recipes.md) completed DONE at 2026-06-02T01:16Z.
|
||||
Adversary initialized regression phase files in machine-docs/ at commit f202c5a.
|
||||
|
||||
**Decision: run regression tests ON cc-ci, not from the orchestrator**
|
||||
|
||||
The regression tests call `run_recipe_ci.py` which uses abra/docker/swarm — these only exist on
|
||||
cc-ci. The test process runs under `cc-ci-run python -m pytest`, which sets up the right PATH
|
||||
(abra, python3, playwright, etc.). The test then invokes `run_recipe_ci.py` as a subprocess using
|
||||
`sys.executable` (inherits the same python3 from cc-ci-run).
|
||||
|
||||
The README.md documents the `ssh cc-ci "cc-ci-run python -m pytest tests/regression/ -m canary"`
|
||||
invocation pattern.
|
||||
|
||||
**Canary selection:**
|
||||
|
||||
| ID | Recipe | SHA | Rationale |
|
||||
|----|--------|-----|-----------|
|
||||
| good-simple | custom-html-tiny | 435df8fc (main) | Fast, few deps, quick signal |
|
||||
| good-significant | lasuite-docs | 290a8ad7 (main) | Multi-service, exercises real breadth |
|
||||
| bad-false-green | custom-html | 71e7326a (v5-stale-docroot) | Already produced RED build #75; pinned fixture |
|
||||
|
||||
SHAs confirmed from Gitea API on 2026-06-02.
|
||||
|
||||
**Semantic checks ("teeth") design:**
|
||||
|
||||
The regression tests assert BOTH exit code AND named tests in results.json stages. This guards
|
||||
against two failure modes:
|
||||
1. Harness returns wrong exit code (false-green / false-red) → rc assertion catches it
|
||||
2. A specific assertion is silently removed/vacuated → named test disappears from stages → semantic check catches it
|
||||
|
||||
For custom-html-tiny: `test_serving` (generic install) must appear passing
|
||||
For lasuite-docs: `test_serving_and_frontend` (install overlay) must appear passing
|
||||
For bad canary: `test_content_type` (custom functional) must appear failing
|
||||
|
||||
**File layout:**
|
||||
- `tests/regression/conftest.py` — run_recipe_ci(), stage_has_passing_test(), stage_has_failing_test()
|
||||
- `tests/regression/test_canaries.py` — parametrized @pytest.mark.canary test
|
||||
- `tests/regression/README.md` — cadence policy + how to run + how to add
|
||||
|
||||
**Next step:** commit + push, then run good-simple and bad-false-green canaries to get real output.
|
||||
lasuite-docs is slow (10-20 min) so will run it last.
|
||||
|
||||
---
|
||||
|
||||
## Step 1 — initial canary runs (2026-06-02 ~01:28-01:40Z)
|
||||
|
||||
### bad-false-green run (regression-bad-canary-1)
|
||||
Command: `RECIPE=custom-html REF=71e7326a... SRC=recipe-maintainers/custom-html cc-ci-run runner/run_recipe_ci.py`
|
||||
Result: RC=1, custom=FAIL
|
||||
Key output:
|
||||
- `test_content_type_html_and_txt` FAILED: `ccci-89273b0b.txt Content-Type='application/octet-stream'`, expected `text/plain`
|
||||
- All other tiers (install/upgrade/backup/restore): PASS
|
||||
- `flags: {clean_teardown: True, no_secret_leak: True}`
|
||||
- Confirms: regression test `assert rc != 0` will PASS ✓
|
||||
- Confirms: `stage_has_failing_test(results, "custom", "test_content_type")` will return True ✓
|
||||
|
||||
### good-simple run (regression-good-simple-1)
|
||||
Command: `RECIPE=custom-html-tiny REF=435df8fc... SRC=recipe-maintainers/custom-html-tiny cc-ci-run runner/run_recipe_ci.py`
|
||||
Result: RC=0, install=pass, upgrade=pass, backup/restore/custom=skip
|
||||
Key output:
|
||||
- `test_serving` in install stage: PASSED ✓
|
||||
- `flags: {clean_teardown: True, no_secret_leak: True}` ✓
|
||||
- Confirms: all regression assertions for good-simple will PASS ✓
|
||||
|
||||
### good-significant run (regression-good-significant-1) [IN PROGRESS]
|
||||
Started ~01:35Z. Multi-service stack (lasuite-docs + keycloak dep). Image pull in progress.
|
||||
Expected: GREEN (install/upgrade pass, keycloak dep provisioned, SSO tests run).
|
||||
238
machine-docs/REVIEW-regression.md
Normal file
238
machine-docs/REVIEW-regression.md
Normal file
@ -0,0 +1,238 @@
|
||||
# REVIEW — server regression canaries phase (Adversary ledger)
|
||||
|
||||
**Phase:** server regression canaries (codified E2E self-tests)
|
||||
**SSOT:** `/srv/cc-ci/cc-ci-plan/plan-server-regression-canaries.md`
|
||||
**Adversary loop started:** 2026-06-02T01:15Z
|
||||
**Repo:** git.autonomic.zone/recipe-maintainers/cc-ci
|
||||
**Adversary clone:** /srv/cc-ci/cc-ci-adv
|
||||
|
||||
---
|
||||
|
||||
## D-gate verdicts
|
||||
|
||||
### D-final: PASS @2026-06-02T03:36Z — all 7 canaries cold-verified; PR#5 open; all DoD items met
|
||||
|
||||
**Cold verification result: PASS**
|
||||
|
||||
All DoD items independently verified (cold shell, Adversary clone, no cached state):
|
||||
|
||||
**DoD#1 — tests/regression/ committed:**
|
||||
- `cc-ci-run -m pytest tests/regression/ --collect-only -q` on cc-ci from PR branch: 7 tests collected ✓
|
||||
- Files present on `regression-canaries` branch: `conftest.py`, `test_canaries.py`, `README.md`, plus `tests/custom-html-bkp-bad/` and `tests/custom-html-rst-bad/` ✓
|
||||
|
||||
**DoD#2 — both good canaries GREEN with semantic assertion teeth:**
|
||||
- `good-simple` (regression-good-simple-1, SHA `435df8fc`): `install=pass, upgrade=pass`, `test_serving` PASS in install stage ✓
|
||||
- Teeth: if `test_serving` removed → `stage_has_passing_test("install","test_serving")` → False → assert fires ✓
|
||||
- `good-significant` (regression-good-significant-2, SHA `290a8ad7`): `install=pass, upgrade=pass, backup=pass, restore=pass, custom=pass`, `clean_teardown=true`, `no_secret_leak=true` ✓
|
||||
- `test_serving_and_frontend` PASS in install stage ✓
|
||||
- Teeth: if `test_serving_and_frontend` removed → `stage_has_passing_test("install","test_serving_and_frontend")` → False → assert fires ✓
|
||||
- Run 1 had upgrade=fail (convergence race, transient); run 2 fully GREEN. Known plan risk; no action needed unless persistent.
|
||||
|
||||
**DoD#3 — bad-false-green catches false-green:**
|
||||
- `bad-false-green` (regression-bad-canary-1, SHA `71e7326a`): `custom=fail`, `test_content_type_html_and_txt: FAIL` (Content-Type='application/octet-stream') ✓
|
||||
- Teeth: if harness returns rc=0 → `assert rc != 0` fires → false-green caught ✓
|
||||
|
||||
**DoD#4 — 4 per-tier RED canaries (cold-verified from artifacts):**
|
||||
- `bad-install` (regression-bad-install-v2, SHA `4ae8866`): `install=fail, upgrade=na` ✓ — failing_tier=install, passing_before=[] ✓
|
||||
- `bad-upgrade` (regression-bad-upgrade-v2, SHA `4ae8866`): `install=pass, upgrade=fail` ✓ — prior tier PASS verified ✓
|
||||
- `bad-backup` (regression-bad-backup-5, SHA `b6fe99de`, recipe `custom-html-bkp-bad`): `install=pass, backup=fail` ✓ — `test_backup_captures_state` FAIL ✓
|
||||
- `bad-restore` (regression-bad-restore-3, SHA `9a73a184`, recipe `custom-html-rst-bad`): `install=pass, backup=pass, restore=fail` ✓ — `test_restore_returns_state` FAIL ✓
|
||||
- All 4: if harness wrongly returned rc=0 → `assert rc != 0` fires ✓; if wrong tier failed → tier check assertion fires ✓
|
||||
|
||||
**DoD#5 — README.md:**
|
||||
- `tests/regression/README.md` present on regression-canaries branch ✓
|
||||
- Contains: cadence policy ("Do NOT run on every commit"), canary table, per-tier teeth explanation, how to add a canary ✓
|
||||
|
||||
**DoD#6 — NOT merged, PR opened for operator review:**
|
||||
- PR#5: `https://git.autonomic.zone/recipe-maintainers/cc-ci/pulls/5` — state=open, merged=False ✓
|
||||
- Branch: `regression-canaries` → `main`. 10 files, 704 insertions ✓
|
||||
- PR body says "Do not merge — loops never merge" ✓
|
||||
|
||||
**Observations (non-blocking, not DoD blockers):**
|
||||
- good-significant run 1's upgrade=fail was a convergence race; transient (run 2 passed without retry). No test weakening, no retry added — consistent with plan policy.
|
||||
- Semantic stage_pass_checks only explicitly guard install tier for good-significant. Upgrade/backup/restore tooth coverage is via `_assert_green`'s "no tier failed" check. Limitation noted; acceptable per plan DoD requirements.
|
||||
- A-reg-2 comment in test_canaries.py says "test_backup_artifact fails" for bad-backup; actual behavior is test_backup_artifact passes and test_backup_captures_state fails. Misleading comment, non-blocking.
|
||||
|
||||
**Verdict: D-final PASS.** All 7 canaries verified. All 6 DoD items met. Phase is complete pending operator review of PR#5. No vetoes.
|
||||
|
||||
---
|
||||
|
||||
### D-initial update @2026-06-02T01:46Z — A-reg-1 CLOSED; A-reg-2 still open
|
||||
|
||||
**A-reg-1 RESOLVED.** Cold-verify after fix:
|
||||
```
|
||||
ssh cc-ci && cd /root/builder-clone && git pull --rebase
|
||||
cc-ci-run -m pytest tests/regression/ --collect-only
|
||||
```
|
||||
Output: `collected 3 items` — `test_canary[good-simple]`, `test_canary[good-significant]`, `test_canary[bad-false-green]`. No errors.
|
||||
|
||||
**Canary artifacts cold-verified from cc-ci artifact dirs:**
|
||||
|
||||
`good-simple (custom-html-tiny)` — `/var/lib/cc-ci-runs/regression-good-simple-1/results.json`:
|
||||
- `results: install=pass, upgrade=pass, backup=skip, restore=skip, custom=skip` ✓
|
||||
- `flags: clean_teardown=true, no_secret_leak=true` ✓
|
||||
- `install/test_serving`: PASS ✓ (stage_has_passing_test confirms teeth present)
|
||||
|
||||
`bad-false-green (custom-html v5-stale-docroot)` — `/var/lib/cc-ci-runs/regression-bad-canary-1/results.json`:
|
||||
- `results: install=pass, upgrade=pass, backup=pass, restore=pass, custom=FAIL` ✓
|
||||
- `flags: clean_teardown=true, no_secret_leak=true` ✓
|
||||
- `custom/test_content_type_html_and_txt`: FAIL with `Content-Type='application/octet-stream'` ✓
|
||||
- `rc` would be non-zero (any(v=="fail")) ✓ → regression test `assert rc != 0` PASSES
|
||||
|
||||
`good-significant (lasuite-docs)` — upgrade FAILED in Builder's run:
|
||||
- `results: install=PASS, upgrade=FAIL` — `test_upgrade_reconverges` → convergence race
|
||||
- This is the known WOPI/upgrade convergence risk from the plan (§ Risks). Builder is re-running.
|
||||
- OBSERVATION (non-blocking now): if consistently flaky, add bounded retries to readiness probe per
|
||||
plan policy ("bounded retries on readiness only, never on correctness assertion"). Will watch.
|
||||
|
||||
**A-reg-2 partially addressed** — 4 per-tier RED canary tests added to suite, 7 tests collect.
|
||||
But bad-backup and bad-restore FIXTURES are broken (see A-reg-3). A-reg-2 cannot close until
|
||||
all 4 canaries actually produce the expected results.
|
||||
|
||||
---
|
||||
|
||||
### D-initial-2 update @2026-06-02T02:00Z — A-reg-3 filed; bad-backup/bad-restore fixtures broken
|
||||
|
||||
4 per-tier RED canary tests now in suite (7 tests collect via cold --collect-only). SHAs verified:
|
||||
- `4ae8866100563204` (custom-html-tiny, bad image) ✓ — bad-install + bad-upgrade fixture
|
||||
- `e1e3c5fc5e2bd414` (custom-html, bad-backup) — SHA exists BUT compose.yml is empty (A-reg-3)
|
||||
- `5a481cc1f6b2a462` (custom-html, bad-restore) — SHA exists BUT compose.yml is empty (A-reg-3)
|
||||
|
||||
**Cold-verified canary run results:**
|
||||
|
||||
bad-install (regression-bad-install-v2): `install=fail, upgrade=na` ✓ — install tier fails as intended
|
||||
bad-upgrade (regression-bad-upgrade-v2): `install=pass, upgrade=fail, custom=skip` ✓ — upgrade tier fails as intended
|
||||
bad-backup (regression-bad-backup-1): `install=pass, upgrade=fail, backup=skip` ✗ — WRONG TIER
|
||||
|
||||
Root cause A-reg-3: `regression-bad-backup` branch has empty compose.yml (whole file deleted, not
|
||||
just backup path changed). Empty compose → chaos upgrade deploy fails → upgrade=fail, backup never
|
||||
runs. Same issue for `regression-bad-restore` (same empty compose.yml diff).
|
||||
|
||||
**`_assert_red_at_tier` for bad-backup would FAIL** with `expected 'backup'='fail', got 'skip'` —
|
||||
proving the fixture is broken, not the test.
|
||||
|
||||
**What still needs fixing before final gate:**
|
||||
1. ~~A-reg-3~~ CLOSED — fixtures fixed and cold-verified ✓
|
||||
2. ~~A-reg-2~~ CLOSED — all 4 per-tier RED canaries present and verified ✓
|
||||
3. **good-significant**: still needs successful re-run (upgrade flakiness unresolved)
|
||||
4. **Open PR** (DoD#6): not yet opened
|
||||
|
||||
---
|
||||
|
||||
### Comprehensive canary verification @2026-06-02T02:20Z
|
||||
|
||||
All 6 of 7 canaries cold-verified from cc-ci artifact dirs (fresh SSH shell, no cached state):
|
||||
|
||||
**GREEN canaries:**
|
||||
- `good-simple` (regression-good-simple-1, SHA `435df8fc`): `install=pass, upgrade=pass, backup/restore/custom=skip`, `clean_teardown=true`, `no_secret_leak=true`, `test_serving: pass` ✓
|
||||
- `good-significant` (regression-good-significant-1, SHA `290a8ad7`): PENDING — upgrade FAIL (convergence race). Needs re-run to confirm transient.
|
||||
|
||||
**Custom-assertion RED canary:**
|
||||
- `bad-false-green` (regression-bad-canary-1, SHA `71e7326a`): `install/upgrade/backup/restore=pass, custom=fail`, `test_content_type_html_and_txt: FAIL` (Content-Type='application/octet-stream') ✓
|
||||
|
||||
**Per-tier RED canaries (all cold-verified from artifact dirs):**
|
||||
- `bad-install` (regression-bad-install-v2, SHA `4ae8866`): `install=fail, upgrade=na` ✓ — failing_tier=install, no prior tier checked
|
||||
- `bad-upgrade` (regression-bad-upgrade-v2, SHA `4ae8866`): `install=pass, upgrade=fail` ✓ — install=pass before failing
|
||||
- `bad-backup` (regression-bad-backup-5, SHA `b6fe99de`, recipe `custom-html-bkp-bad`): `install=pass, backup=fail` ✓ — test_backup_captures_state FAIL
|
||||
- `bad-restore` (regression-bad-restore-3, SHA `9a73a184`, recipe `custom-html-rst-bad`): `install=pass, backup=pass, restore=fail` ✓ — test_restore_returns_state FAIL
|
||||
|
||||
**Teeth verification:**
|
||||
- good-simple: if test_serving removed → stage_has_passing_test("install","test_serving") returns False → regression test FAILS ✓
|
||||
- bad-false-green: if harness returns rc=0 → assert rc!=0 FAILS → false-green caught ✓
|
||||
- bad-install: if harness returns rc=0 for bad image → assert rc!=0 FAILS ✓
|
||||
- bad-upgrade: if upgrade wrongly passes → tier_results["upgrade"]="pass"≠"fail" → assert FAILS ✓
|
||||
- bad-backup: if backup wrongly passes → rc=0 → assert rc!=0 FAILS ✓
|
||||
- bad-restore: if restore wrongly passes → tier_results["restore"]!="fail" → assert FAILS ✓; if backup wrongly fails → tier_results["backup"]!="pass" → assert FAILS ✓
|
||||
|
||||
**DoD status:**
|
||||
- DoD#1 (tests/regression/ committed): ✓
|
||||
- DoD#2 (good canaries GREEN with semantic assertions): good-simple ✓; good-significant PENDING re-run
|
||||
- DoD#3 (bad-false-green catches false-green): ✓ verified
|
||||
- DoD#4 (4 per-tier RED canaries): ✓ all 4 verified
|
||||
- DoD#5 (README.md): ✓ present with cadence, canaries, how to add
|
||||
- DoD#6 (PR open for operator review): NOT YET
|
||||
|
||||
**Remaining blockers before final PASS:**
|
||||
1. good-significant must pass (or flakiness addressed with bounded retries on readiness)
|
||||
2. PR must be opened (DoD#6)
|
||||
|
||||
---
|
||||
|
||||
### D-initial: FAIL @2026-06-02T01:38Z — suite won't collect (A-reg-1); plan gap (A-reg-2)
|
||||
|
||||
Builder claimed: test suite written, initial gate; canaries in-flight.
|
||||
|
||||
**Cold verification result: FAIL — two blocking issues.**
|
||||
|
||||
**A-reg-1 (CRITICAL): Relative import fails, 0 tests collected.**
|
||||
```
|
||||
ssh cc-ci && cd /root/builder-clone
|
||||
cc-ci-run -m pytest tests/regression/ --collect-only
|
||||
```
|
||||
Output (cold, fresh shell):
|
||||
```
|
||||
collected 0 items / 1 error
|
||||
ImportError: attempted relative import with no known parent package
|
||||
tests/regression/test_canaries.py:18: from .conftest import run_recipe_ci, ...
|
||||
!!!!!!!!!!!!!!!!! Interrupted: 1 error during collection !!!!!!!!!!!!!!!!!!!!!
|
||||
```
|
||||
Root cause: `tests/regression/__init__.py` and `tests/__init__.py` missing. Fix: add them or
|
||||
use absolute imports (as other test files in this repo do).
|
||||
|
||||
**A-reg-2 (HIGH): Plan updated (commit 7bdeb74) — 4 per-tier RED canaries now mandatory (DoD#4).**
|
||||
Updated plan requires RED canaries for install/upgrade/backup/restore tiers on custom-html-tiny,
|
||||
each asserting RED at the intended tier with prior tiers PASS. Current suite: 3 canaries only
|
||||
(2 good + 1 bad-custom-assertion). All four are MISSING. Cannot claim DONE without them.
|
||||
|
||||
**Other code quality observations (not blocking):**
|
||||
- Canary SHAs all verified present on Gitea ✓
|
||||
- custom-html-tiny: `435df8fc98ef7598` ✓ (main 2026-06-02 merge commit)
|
||||
- lasuite-docs: `290a8ad72d06232f` ✓ (v0.3.3+v5.1.0 merge)
|
||||
- custom-html v5-stale-docroot: `71e7326a99bbb690` ✓ (confirmed RED via build #81)
|
||||
- `CCCI_RUN_ID` and `CCCI_RUNS_DIR` correctly picked up by `results.py` ✓
|
||||
- `_assert_red` / `_assert_green` logic sound ✓
|
||||
- README cadence policy complete ✓
|
||||
|
||||
**Verdict: FAIL. Standing issues: A-reg-1 (critical), A-reg-2 (high). Builder must fix both
|
||||
before re-claiming this gate.**
|
||||
|
||||
---
|
||||
|
||||
## Adversary findings
|
||||
|
||||
*(See BACKLOG-regression.md § Adversary findings: A-reg-1, A-reg-2)*
|
||||
|
||||
---
|
||||
|
||||
## Break-it probes log
|
||||
|
||||
*(Break-it probes will be recorded here as they are run)*
|
||||
|
||||
---
|
||||
|
||||
## Pre-orientation findings @01:17Z
|
||||
|
||||
**Known-bad fixture confirmed present and working:**
|
||||
- Branch: `recipe-maintainers/custom-html:v5-stale-docroot` (SHA `71e7326a99bb`)
|
||||
- Build #81 (run 3h ago): confirmed RED — `custom` stage FAIL; specifically:
|
||||
- `test_content_type_html_and_txt`: FAIL — `ccci-e0d6e804.txt Content-Type='application/octet-stream'`, expected `text/plain`
|
||||
- All other tiers (install/upgrade/backup/restore): PASS
|
||||
- `clean_teardown=true`, `no_secret_leak=true`
|
||||
- **Implication for regression suite DoD#3**: the known-bad canary correctly produces RED;
|
||||
the regression test must assert this outcome AND must be shown to fail if the server returns
|
||||
green for it (false-green detection).
|
||||
|
||||
**Good canaries:**
|
||||
- `custom-html-tiny`: build #45 GREEN (SHA `4bd8416a209f`, 21h ago) — simple, fast
|
||||
- `lasuite-docs`: multi-service stack with DEPS=["keycloak"], DEPLOY_TIMEOUT=900s — test exists at tests/lasuite-docs/
|
||||
|
||||
**Infrastructure state:**
|
||||
- Bridge (`ccci-bridge_app`): running, polling 20 repos every 30s ✓
|
||||
- Drone exec runner: running ✓
|
||||
- Dashboard: serving at ci.commoninternet.net ✓
|
||||
- Builder hasn't started regression phase: no STATUS-regression.md yet
|
||||
|
||||
**Notes:**
|
||||
- Mirror phase (plan-mirror-enroll-all-recipes.md) completed DONE at 2026-06-02T01:16Z.
|
||||
- This phase starts fresh: no STATUS-regression.md or tests/regression/ yet.
|
||||
- Watching for Builder to create STATUS-regression.md and begin work.
|
||||
138
machine-docs/STATUS-regression.md
Normal file
138
machine-docs/STATUS-regression.md
Normal file
@ -0,0 +1,138 @@
|
||||
# STATUS — server regression canaries phase
|
||||
|
||||
**Phase:** server regression canaries (codified E2E self-tests)
|
||||
**SSOT:** `/srv/cc-ci/cc-ci-plan/plan-server-regression-canaries.md`
|
||||
**Builder loop started:** 2026-06-02
|
||||
**Repo:** git.autonomic.zone/recipe-maintainers/cc-ci
|
||||
|
||||
---
|
||||
|
||||
## DONE
|
||||
|
||||
**Adversary PASS: @2026-06-02T03:36Z — D-final PASS. All 7 canaries verified. All 6 DoD items met. No vetoes.**
|
||||
|
||||
All DoD items Adversary-verified:
|
||||
1. ✓ `tests/regression/` suite committed — 7 tests collected (DoD#1)
|
||||
2. ✓ good-simple GREEN: `/var/lib/cc-ci-runs/regression-good-simple-1/` — install/upgrade=pass, test_serving PASS (DoD#2)
|
||||
3. ✓ good-significant GREEN: `/var/lib/cc-ci-runs/regression-good-significant-2/` — all 5 tiers pass, clean_teardown/no_secret_leak=true (DoD#2)
|
||||
4. ✓ bad-false-green RED: `/var/lib/cc-ci-runs/regression-bad-canary-1/` — custom=fail, false-green caught (DoD#3)
|
||||
5. ✓ 4 per-tier RED canaries verified (bad-install/upgrade/backup/restore — artifacts on server) (DoD#4)
|
||||
6. ✓ README.md: cadence, canaries, how to add (DoD#5)
|
||||
7. ✓ PR#5 open for operator review: https://git.autonomic.zone/recipe-maintainers/cc-ci/pulls/5 (DoD#6)
|
||||
|
||||
**Phase complete. Loop stopped. PR#5 awaits operator review — do not merge.**
|
||||
|
||||
---
|
||||
|
||||
## What was built
|
||||
|
||||
```
|
||||
tests/regression/
|
||||
├── conftest.py — run_recipe_ci(), stage_has_{passing,failing}_test() helpers
|
||||
├── test_canaries.py — 7 parametrized canaries (3 @canary + 4 @canary_fast)
|
||||
└── README.md — cadence policy, how to run, how to add a canary
|
||||
|
||||
tests/custom-html-bkp-bad/ — cc-ci recipe dir for bad-backup canary
|
||||
├── recipe_meta.py — BACKUP_CAPABLE=True
|
||||
└── test_backup.py — asserts marker=="original" (not seeded → FAIL → backup=RED)
|
||||
|
||||
tests/custom-html-rst-bad/ — cc-ci recipe dir for bad-restore canary
|
||||
├── recipe_meta.py — BACKUP_CAPABLE=True
|
||||
├── ops.py — pre_restore writes "mutated" (no pre_backup)
|
||||
└── test_restore.py — asserts marker=="original" (not in snapshot → FAIL → restore=RED)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Canaries (7 total)
|
||||
|
||||
| ID | Recipe | SHA | Expected | Verified |
|
||||
|----|--------|-----|---------|---------|
|
||||
| good-simple | custom-html-tiny | 435df8fc (main) | GREEN | ✓ rc=0, install=pass, test_serving present |
|
||||
| good-significant | lasuite-docs | 290a8ad7 (main) | GREEN | ✓ rc=0, all tiers pass (run: regression-good-significant-2) |
|
||||
| bad-false-green | custom-html | 71e7326a (v5-stale-docroot) | RED | ✓ rc=1, custom=fail, test_content_type fails |
|
||||
| bad-install | custom-html-tiny | 4ae88661 (regression-bad-image) | RED (install) | ✓ rc=1, install=fail |
|
||||
| bad-upgrade | custom-html-tiny | 4ae88661 (regression-bad-image) | RED (upgrade) | ✓ rc=1, install=pass, upgrade=fail |
|
||||
| bad-backup | custom-html-bkp-bad | b6fe99de (main) | RED (backup) | ✓ rc=1, install=pass, backup=fail |
|
||||
| bad-restore | custom-html-rst-bad | 9a73a184 (main) | RED (restore) | ✓ rc=1, install=pass, backup=pass, restore=fail |
|
||||
|
||||
---
|
||||
|
||||
## How to verify (Adversary commands)
|
||||
|
||||
From cc-ci server (builder-clone at `/root/builder-clone`):
|
||||
|
||||
```bash
|
||||
# Pull latest
|
||||
cd /root/builder-clone && git pull --rebase
|
||||
|
||||
# Verify collection (expect 7 tests)
|
||||
cc-ci-run -m pytest tests/regression/ --collect-only
|
||||
|
||||
# Fast RED canaries (~2-3 min each):
|
||||
RECIPE=custom-html-tiny REF=4ae8866100563204d40435c5aba00374aa5a8ed3 SRC=recipe-maintainers/custom-html-tiny PR=0 STAGES=install CCCI_RUN_ID=adv-bad-install HOME=/root /run/current-system/sw/bin/cc-ci-run runner/run_recipe_ci.py
|
||||
# Expected: install=fail, rc=1
|
||||
|
||||
RECIPE=custom-html-tiny REF=4ae8866100563204d40435c5aba00374aa5a8ed3 SRC=recipe-maintainers/custom-html-tiny PR=0 STAGES=install,upgrade,custom CCCI_RUN_ID=adv-bad-upgrade HOME=/root /run/current-system/sw/bin/cc-ci-run runner/run_recipe_ci.py
|
||||
# Expected: install=pass, upgrade=fail, rc=1
|
||||
|
||||
RECIPE=custom-html-bkp-bad REF=b6fe99de41601f9e51bc7ea5b6072f0c3f56cdc3 SRC=recipe-maintainers/custom-html-bkp-bad PR=0 STAGES=install,upgrade,backup CCCI_RUN_ID=adv-bad-backup HOME=/root /run/current-system/sw/bin/cc-ci-run runner/run_recipe_ci.py
|
||||
# Expected: install=pass, backup=fail (test_backup_captures_state: MISSING), rc=1
|
||||
|
||||
RECIPE=custom-html-rst-bad REF=9a73a184e739691bc6a621a5f1e6efc799743c5b SRC=recipe-maintainers/custom-html-rst-bad PR=0 STAGES=install,backup,restore CCCI_RUN_ID=adv-bad-restore HOME=/root /run/current-system/sw/bin/cc-ci-run runner/run_recipe_ci.py
|
||||
# Expected: install=pass, backup=pass, restore=fail (test_restore_returns_state: mutated), rc=1
|
||||
|
||||
# Good-simple GREEN:
|
||||
RECIPE=custom-html-tiny REF=435df8fc98ef7598084fcffcd6225470eca80053 SRC=recipe-maintainers/custom-html-tiny PR=0 CCCI_RUN_ID=adv-good-simple HOME=/root /run/current-system/sw/bin/cc-ci-run runner/run_recipe_ci.py
|
||||
# Expected: install=pass, upgrade=pass, rc=0; stages.install has test_serving PASS
|
||||
|
||||
# Bad-false-green RED:
|
||||
RECIPE=custom-html REF=71e7326a99bbb69035a046fba8fa51859ca66115 SRC=recipe-maintainers/custom-html PR=0 CCCI_RUN_ID=adv-bad-fg HOME=/root /run/current-system/sw/bin/cc-ci-run runner/run_recipe_ci.py
|
||||
# Expected: custom=fail (test_content_type FAILS), rc=1
|
||||
|
||||
# Good-significant (lasuite-docs) — verify artifact (or re-run, takes ~15-20 min):
|
||||
# Quick artifact check (no re-run needed):
|
||||
cat /var/lib/cc-ci-runs/regression-good-significant-2/results.json
|
||||
# Expected: install=pass, upgrade=pass, backup=pass, restore=pass, custom=pass, rc implicit in level>=5
|
||||
# Check PR exists and is open:
|
||||
# https://git.autonomic.zone/recipe-maintainers/cc-ci/pulls/5 — state=open, 10 files, 704 insertions
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Artifacts already on server
|
||||
|
||||
| Run ID | Recipe | Result |
|
||||
|--------|--------|--------|
|
||||
| regression-good-simple-1 | custom-html-tiny | GREEN ✓ |
|
||||
| regression-good-significant-2 | lasuite-docs | GREEN ✓ (all tiers: install/upgrade/backup/restore/custom=pass) |
|
||||
| regression-bad-canary-1 | custom-html v5-stale-docroot | RED ✓ |
|
||||
| regression-bad-install-v2 | custom-html-tiny bad-image | RED (install=fail) ✓ |
|
||||
| regression-bad-upgrade-v2 | custom-html-tiny bad-image | RED (upgrade=fail) ✓ |
|
||||
| regression-bad-backup-5 | custom-html-bkp-bad | RED (backup=fail) ✓ |
|
||||
| regression-bad-restore-3 | custom-html-rst-bad | RED (restore=fail) ✓ |
|
||||
|
||||
---
|
||||
|
||||
## good-significant run 2 full results (cold-readable on server)
|
||||
|
||||
`cat /var/lib/cc-ci-runs/regression-good-significant-2/results.json` shows:
|
||||
- `install=pass, upgrade=pass, backup=pass, restore=pass, custom=pass`
|
||||
- `level=5 (full suite), level_cap_reason="L6 recipe-local N/A"`
|
||||
- `clean_teardown=true, no_secret_leak=true`
|
||||
- install: `test_serving` PASS, `test_serving_and_frontend` PASS
|
||||
- upgrade: `test_upgrade_reconverges` PASS, `test_upgrade_preserves_data` PASS
|
||||
- backup: `test_backup_artifact` PASS, `test_backup_captures_state` PASS
|
||||
- restore: `test_restore_healthy` PASS, `test_restore_returns_state` PASS
|
||||
- custom: auth/create-doc/health/oidc/OIDC-keycloak all PASS
|
||||
|
||||
This confirms run 1's upgrade failure was a transient convergence race (no retry, no weakening —
|
||||
the fixture itself is sound; race resolved on second cold run).
|
||||
|
||||
---
|
||||
|
||||
## PR
|
||||
|
||||
**PR#5: https://git.autonomic.zone/recipe-maintainers/cc-ci/pulls/5**
|
||||
Branch `regression-canaries` → `main`. 10 files, 704 insertions. Open for operator review.
|
||||
"Do not merge" — operator review only per DoD#6.
|
||||
@ -7,7 +7,7 @@
|
||||
# git clone --recursive https://git.autonomic.zone/recipe-maintainers/cc-ci.git /etc/cc-ci
|
||||
# install -m600 <age-private-key> /var/lib/sops-nix/key.txt
|
||||
# nixos-rebuild switch --flake /etc/cc-ci#cc-ci-hetzner
|
||||
{ pkgs, lib, ... }:
|
||||
{ pkgs, ... }:
|
||||
{
|
||||
imports = [
|
||||
./hardware.nix
|
||||
@ -22,6 +22,7 @@
|
||||
../../modules/drone-runner.nix
|
||||
../../modules/bridge.nix
|
||||
../../modules/dashboard.nix
|
||||
../../modules/reports.nix
|
||||
../../modules/backupbot.nix
|
||||
../../modules/harness.nix
|
||||
../../modules/warm-keycloak.nix
|
||||
|
||||
@ -11,13 +11,17 @@
|
||||
{
|
||||
imports = [ (modulesPath + "/profiles/qemu-guest.nix") ];
|
||||
|
||||
boot.loader = {
|
||||
efi.efiSysMountPoint = "/boot/efi";
|
||||
grub = {
|
||||
efiSupport = true;
|
||||
efiInstallAsRemovable = true;
|
||||
device = "nodev";
|
||||
boot = {
|
||||
loader = {
|
||||
efi.efiSysMountPoint = "/boot/efi";
|
||||
grub = {
|
||||
efiSupport = true;
|
||||
efiInstallAsRemovable = true;
|
||||
device = "nodev";
|
||||
};
|
||||
};
|
||||
initrd.availableKernelModules = [ "ata_piix" "uhci_hcd" "xen_blkfront" "vmw_pvscsi" ];
|
||||
initrd.kernelModules = [ "nvme" ];
|
||||
};
|
||||
|
||||
fileSystems."/boot/efi" = {
|
||||
@ -25,9 +29,6 @@
|
||||
fsType = "vfat";
|
||||
};
|
||||
|
||||
boot.initrd.availableKernelModules = [ "ata_piix" "uhci_hcd" "xen_blkfront" "vmw_pvscsi" ];
|
||||
boot.initrd.kernelModules = [ "nvme" ];
|
||||
|
||||
fileSystems."/" = {
|
||||
device = "/dev/sda1";
|
||||
fsType = "ext4";
|
||||
|
||||
@ -8,14 +8,19 @@
|
||||
{ pkgs, config, lib, ... }:
|
||||
let
|
||||
# MAX_TESTS (plan §4.2/§4.3 resource safety): max CI builds the exec runner runs at once. Drone
|
||||
# queues the rest in its native pending-build queue (no custom queue). THE concurrency cap that
|
||||
# bounds how many test apps can be live at once — kept LOW (1) on this single 28GiB node since
|
||||
# recipes are heavy (immich/matrix large volumes). With capacity=1 there is never a concurrent
|
||||
# in-flight run, so the run-start janitor can safely reap *any* orphan (a SIGKILL'd build runs no
|
||||
# teardown) and the "at most MAX_TESTS apps live" bound holds exactly. Raise to 2 only if the node
|
||||
# is shown to handle two light recipes at once (then the janitor MUST stay age-based to avoid
|
||||
# reaping a concurrent run — see DECISIONS.md "Resource safety").
|
||||
maxTests = "1";
|
||||
# queues the rest in its native pending-build queue (no custom queue). THE SINGLE concurrency
|
||||
# knob — nothing else caps recipe-ci parallelism (the .drone.yml concurrency.limit was removed:
|
||||
# one knob, one place). Bounds how many test apps can be live at once.
|
||||
#
|
||||
# Raised to 2 (operator request 2026-06-09) so two recipes can be tested in parallel (e.g. immich
|
||||
# and plausible under active development at once). Verified safe on the current node (Hetzner cpx22,
|
||||
# ~7.6 GiB / 4 vCPU — NOTE: smaller than the original 28 GiB this was written for): a full immich CI
|
||||
# stack measured ~1 GiB (server+ML+pg+redis) with multiple GiB free, so two concurrent recipes fit.
|
||||
# Concurrent-run safety is the harness's job at ANY capacity (docs/concurrency.md): per-run
|
||||
# ABRA_DIR recipe trees, per-app-domain flocks, and a flock-probe janitor that reaps a crashed
|
||||
# build's orphan immediately (held lock = live run, never touched). Revert to "1" if OOM /
|
||||
# disk-I/O contention is observed under load.
|
||||
maxTests = "2";
|
||||
in
|
||||
{
|
||||
# Drone ships under the Polyform Small Business license (nixpkgs marks it unfree);
|
||||
|
||||
@ -29,7 +29,7 @@ in
|
||||
serviceConfig = {
|
||||
Type = "oneshot";
|
||||
# A full sweep across several recipes (each a cold deploy/test/teardown) is long; bound it.
|
||||
TimeoutStartSec = "21600"; # 6h ceiling
|
||||
TimeoutStartSec = "21600"; # 6h ceiling
|
||||
ExecStart = "${sweep}/bin/cc-ci-nightly-sweep";
|
||||
};
|
||||
};
|
||||
@ -39,7 +39,7 @@ in
|
||||
wantedBy = [ "timers.target" ];
|
||||
timerConfig = {
|
||||
OnCalendar = "*-*-* 03:00:00";
|
||||
Persistent = true; # catch up a missed nightly after downtime
|
||||
Persistent = true; # catch up a missed nightly after downtime
|
||||
RandomizedDelaySec = "600";
|
||||
};
|
||||
};
|
||||
|
||||
116
nix/modules/reports.nix
Normal file
116
nix/modules/reports.nix
Normal file
@ -0,0 +1,116 @@
|
||||
# Recipe Report static site (report.ci.commoninternet.net): a public nginx serving the weekly
|
||||
# "Recipe Report" HTML pages written to /var/lib/cc-ci-reports by the /recipe-report skill. No app,
|
||||
# no secrets — just static files behind traefik + the wildcard TLS (same pattern as dashboard.nix,
|
||||
# but a plain nginx:alpine since there's nothing to render server-side). Content is updated by writing
|
||||
# files into /var/lib/cc-ci-reports; nginx serves them live (no redeploy needed).
|
||||
#
|
||||
# It ALSO serves a same-origin realtime PR-status proxy at /pr/<recipe>/<n>: the report's STATUS
|
||||
# column fetches it client-side to show each PR's live state (open vs. ✓). Same-origin means no
|
||||
# dependency on the Gitea CORS allow-list; the recipe mirrors are public so no token is needed. The
|
||||
# proxy is pinned to recipe-maintainers + a safe recipe-name charset and is read-only (GET/HEAD).
|
||||
{ pkgs, ... }:
|
||||
let
|
||||
reportsDir = "/var/lib/cc-ci-reports";
|
||||
|
||||
# Custom nginx server: static report files + the /pr/<recipe>/<n> → Gitea-API proxy. Replaces the
|
||||
# stock /etc/nginx/conf.d/default.conf (which the image's nginx.conf includes inside http{}).
|
||||
nginxConf = pkgs.writeText "cc-ci-reports-default.conf" ''
|
||||
server {
|
||||
listen 80;
|
||||
server_name _;
|
||||
root /usr/share/nginx/html;
|
||||
index index.html;
|
||||
|
||||
# Realtime PR-status proxy for the Recipe Report STATUS column.
|
||||
# GET /pr/<recipe>/<n> -> the PUBLIC Gitea PR JSON ({state, merged, ...}). Same-origin from
|
||||
# the browser's view, so no CORS dependency; unauthenticated, since the recipe mirrors are
|
||||
# public. The repo owner is hard-pinned to recipe-maintainers and the recipe name to a
|
||||
# slashless charset, so the proxied path can only ever address recipe-maintainers/<name>/pulls
|
||||
# (it cannot be coerced to another org or path). Only safe read methods are allowed.
|
||||
location ~ ^/pr/([a-z0-9._-]+)/([0-9]+)$ {
|
||||
limit_except GET HEAD { deny all; }
|
||||
resolver 127.0.0.11 ipv6=off valid=30s; # docker embedded DNS (forwards external names)
|
||||
proxy_ssl_server_name on;
|
||||
proxy_set_header Host git.autonomic.zone;
|
||||
proxy_set_header Accept "application/json";
|
||||
proxy_pass https://git.autonomic.zone/api/v1/repos/recipe-maintainers/$1/pulls/$2;
|
||||
proxy_intercept_errors off;
|
||||
proxy_connect_timeout 5s;
|
||||
proxy_read_timeout 10s;
|
||||
add_header Cache-Control "no-store" always; # always fetch live state, never cache in the browser
|
||||
}
|
||||
|
||||
location / {
|
||||
try_files $uri $uri/ =404;
|
||||
}
|
||||
}
|
||||
'';
|
||||
|
||||
stack = pkgs.writeText "cc-ci-reports-stack.yml" ''
|
||||
version: "3.8"
|
||||
services:
|
||||
app:
|
||||
image: nginx:alpine
|
||||
volumes:
|
||||
- type: bind
|
||||
source: ${reportsDir}
|
||||
target: /usr/share/nginx/html
|
||||
read_only: true
|
||||
- type: bind
|
||||
source: ${nginxConf}
|
||||
target: /etc/nginx/conf.d/default.conf
|
||||
read_only: true
|
||||
networks:
|
||||
- proxy
|
||||
deploy:
|
||||
replicas: 1
|
||||
restart_policy:
|
||||
condition: any
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.services.ccci-reports.loadbalancer.server.port=80"
|
||||
- "traefik.http.routers.ccci-reports.rule=Host(`report.ci.commoninternet.net`)"
|
||||
- "traefik.http.routers.ccci-reports.entrypoints=web-secure"
|
||||
- "traefik.http.routers.ccci-reports.tls=true"
|
||||
networks:
|
||||
proxy:
|
||||
external: true
|
||||
'';
|
||||
|
||||
reconcile = pkgs.writeShellApplication {
|
||||
name = "cc-ci-reconcile-reports";
|
||||
runtimeInputs = with pkgs; [ docker coreutils ];
|
||||
text = ''
|
||||
mkdir -p ${reportsDir}
|
||||
# Seed a placeholder index so the site serves something before the first report is generated.
|
||||
if [ ! -f ${reportsDir}/index.html ]; then
|
||||
cat > ${reportsDir}/index.html <<'HTML'
|
||||
<!doctype html><html lang="en"><head><meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||
<title>The Recipe Report</title>
|
||||
<style>body{font:16px/1.5 system-ui,sans-serif;max-width:50rem;margin:3rem auto;padding:0 1rem;color:#222}</style>
|
||||
</head><body><h1>🌻 The Recipe Report</h1>
|
||||
<p>No reports yet — the first one is generated after the weekly recipe-upgrade run.</p>
|
||||
</body></html>
|
||||
HTML
|
||||
fi
|
||||
docker stack deploy --detach=true -c ${stack} ccci-reports
|
||||
'';
|
||||
};
|
||||
in
|
||||
{
|
||||
systemd.services.deploy-reports = {
|
||||
description = "Reconcile the cc-ci Recipe Report static site (report.ci.commoninternet.net)";
|
||||
# Ordering-only: chain after the dashboard (proxy→…→dashboard→reports) to avoid concurrent
|
||||
# docker-init races on a fresh host.
|
||||
after = [ "deploy-dashboard.service" "deploy-proxy.service" "swarm-init.service" "docker.service" "network-online.target" ];
|
||||
requires = [ "swarm-init.service" "docker.service" ];
|
||||
wants = [ "network-online.target" ];
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
serviceConfig = {
|
||||
Type = "oneshot";
|
||||
RemainAfterExit = true;
|
||||
ExecStart = "${reconcile}/bin/cc-ci-reconcile-reports";
|
||||
};
|
||||
};
|
||||
}
|
||||
@ -10,6 +10,7 @@ Bakes in the known abra gotchas (re-verify per installed abra version, currently
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
ABRA = "abra"
|
||||
@ -19,6 +20,20 @@ class AbraError(RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
def abra_dir() -> str:
|
||||
"""abra's state dir, resolved the same way the abra CLI resolves it: $ABRA_DIR if set, else
|
||||
~/.abra. Inside a CI run, run_recipe_ci exports a PER-RUN $ABRA_DIR (fresh recipes/, shared
|
||||
servers/+catalogue/ symlinks) before any abra call, so every helper here and every abra
|
||||
subprocess agree on the same tree; outside a run (warm_reconcile's systemd timer, manual use)
|
||||
both fall back to the canonical /root/.abra."""
|
||||
return os.environ.get("ABRA_DIR") or os.path.expanduser("~/.abra")
|
||||
|
||||
|
||||
def recipe_dir(recipe: str) -> str:
|
||||
"""The current ABRA_DIR's working tree for a recipe (per-run inside a CI run)."""
|
||||
return os.path.join(abra_dir(), "recipes", recipe)
|
||||
|
||||
|
||||
def _run_pty(
|
||||
args: list[str], timeout: int = 900, check: bool = True
|
||||
) -> subprocess.CompletedProcess:
|
||||
@ -77,9 +92,7 @@ def recipe_checkout(recipe: str, version: str) -> None:
|
||||
a chaos (`-C`) deploy ignores ENV VERSION and uses the current checkout — together that silently
|
||||
deployed LATEST for a 'previous-version' base, making the upgrade a no-op (Adversary F1d-2). With
|
||||
this checkout + a non-chaos deploy, a pinned deploy genuinely deploys that version."""
|
||||
import os
|
||||
|
||||
path = os.path.expanduser(f"~/.abra/recipes/{recipe}")
|
||||
path = recipe_dir(recipe)
|
||||
# -f (force): the version-pinning checkout must yield the EXACT ref tree. Without it, a cc-ci
|
||||
# install_steps-provided overlay (e.g. discourse's compose.ccci.yml, copied into the pinned base)
|
||||
# is an UNTRACKED file that collides with the same path TRACKED in a later ref, and
|
||||
@ -100,9 +113,7 @@ def has_lightweight_version_tags(recipe: str) -> bool:
|
||||
'reference not found'.) The caller (deploy_app) uses this to fall back to a chaos base deploy
|
||||
(which skips lint and deploys the explicitly-checked-out pinned version — see lifecycle.deploy_app).
|
||||
Read-only: just `git tag` + `cat-file -t`; no fetch/mutation, so it can't trigger abra's revert."""
|
||||
import os
|
||||
|
||||
path = os.path.expanduser(f"~/.abra/recipes/{recipe}")
|
||||
path = recipe_dir(recipe)
|
||||
tags = subprocess.run(
|
||||
["git", "-C", path, "tag", "-l"], capture_output=True, text=True
|
||||
).stdout.split()
|
||||
@ -168,7 +179,9 @@ def secret_generate(domain: str, timeout: int = 300) -> None:
|
||||
)
|
||||
|
||||
|
||||
def deploy(domain: str, chaos: bool = True, timeout: int = 900, no_converge_checks: bool = False) -> None:
|
||||
def deploy(
|
||||
domain: str, chaos: bool = True, timeout: int = 900, no_converge_checks: bool = False
|
||||
) -> None:
|
||||
args = ["app", "deploy", domain, "-o", "-n"]
|
||||
if chaos:
|
||||
args.append("-C")
|
||||
@ -203,7 +216,10 @@ def backup_create(domain: str, timeout: int = 900) -> str:
|
||||
# remote and fails "authentication required: Unauthorized". Returns the captured output, whose
|
||||
# restic JSON summary line carries the produced "snapshot_id" (the backup artifact, DG3) — note
|
||||
# `abra app backup snapshots` needs a TTY and is awkward to script, so we read the create output.
|
||||
out = _run_pty(["app", "backup", "create", domain, "-n", "-C", "-o"], timeout=timeout).stdout or ""
|
||||
out = (
|
||||
_run_pty(["app", "backup", "create", domain, "-n", "-C", "-o"], timeout=timeout).stdout
|
||||
or ""
|
||||
)
|
||||
# Echo the backup output (incl. backupbot's pre-hook run / any "Failed to run command" or
|
||||
# "Container ... not running" ERROR) into the run log. Backup is otherwise opaque: a pre-hook that
|
||||
# fails to register/run leaves the DB dump out of the snapshot, surfacing only as a downstream
|
||||
@ -226,9 +242,7 @@ def recipe_head_commit(recipe: str) -> str | None:
|
||||
"""The current HEAD commit of the recipe checkout — captured right after fetch (the PR head, or
|
||||
the catalogue current) so the upgrade tier can re-checkout it for the chaos redeploy after the
|
||||
prev-tag base deploy reset the working tree (HC1)."""
|
||||
import os
|
||||
|
||||
path = os.path.expanduser(f"~/.abra/recipes/{recipe}")
|
||||
path = recipe_dir(recipe)
|
||||
proc = subprocess.run(["git", "-C", path, "rev-parse", "HEAD"], capture_output=True, text=True)
|
||||
out = proc.stdout.strip()
|
||||
return out or None
|
||||
@ -236,10 +250,7 @@ def recipe_head_commit(recipe: str) -> str | None:
|
||||
|
||||
def recipe_versions(recipe: str) -> list[str]:
|
||||
"""Published versions of a recipe, oldest→newest (from the recipe git tags)."""
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
path = os.path.expanduser(f"~/.abra/recipes/{recipe}")
|
||||
path = recipe_dir(recipe)
|
||||
proc = subprocess.run(
|
||||
["git", "-C", path, "tag", "--sort=creatordate"], capture_output=True, text=True
|
||||
)
|
||||
|
||||
@ -13,8 +13,15 @@ from __future__ import annotations
|
||||
import time
|
||||
|
||||
|
||||
def goto_with_retry(page, url, *, deadline_seconds: int = 120, accept_statuses=(200, 304),
|
||||
goto_timeout_ms: int = 30_000, wait_until: str = "domcontentloaded"):
|
||||
def goto_with_retry(
|
||||
page,
|
||||
url,
|
||||
*,
|
||||
deadline_seconds: int = 120,
|
||||
accept_statuses=(200, 304),
|
||||
goto_timeout_ms: int = 30_000,
|
||||
wait_until: str = "domcontentloaded",
|
||||
):
|
||||
"""Poll `page.goto(url)` until status is in `accept_statuses` OR the deadline expires.
|
||||
|
||||
Returns the final Playwright response. Raises AssertionError if the deadline expires without
|
||||
|
||||
@ -55,7 +55,9 @@ def enrolled_recipes() -> list[str]:
|
||||
out = []
|
||||
try:
|
||||
for name in sorted(os.listdir(tests_dir)):
|
||||
if os.path.isfile(os.path.join(tests_dir, name, "recipe_meta.py")) and is_enrolled(name):
|
||||
if os.path.isfile(os.path.join(tests_dir, name, "recipe_meta.py")) and is_enrolled(
|
||||
name
|
||||
):
|
||||
out.append(name)
|
||||
except OSError:
|
||||
pass
|
||||
@ -122,11 +124,15 @@ def deploy_canonical(recipe: str, timeout: int = 900) -> None:
|
||||
abra.recipe_checkout(recipe, version)
|
||||
r = subprocess.run(
|
||||
["abra", "app", "deploy", domain, version, "-o", "-n", "-f"],
|
||||
capture_output=True, text=True, timeout=timeout,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=timeout,
|
||||
)
|
||||
if r.returncode != 0:
|
||||
raise RuntimeError(f"deploy canonical {domain} {version} failed: "
|
||||
f"{(r.stderr + ' ' + r.stdout).strip()[:300]}")
|
||||
raise RuntimeError(
|
||||
f"deploy canonical {domain} {version} failed: "
|
||||
f"{(r.stderr + ' ' + r.stdout).strip()[:300]}"
|
||||
)
|
||||
_set_status(recipe, "warm")
|
||||
|
||||
|
||||
|
||||
@ -79,10 +79,44 @@ def render_badge_svg(label: str, message: str, color: str) -> str:
|
||||
)
|
||||
|
||||
|
||||
def level_badge_svg(level: int, cap_reason: str = "") -> str:
|
||||
"""Per-recipe/-run LEVEL badge: 'cc-ci | level N'. Colour by level (R6)."""
|
||||
msg = f"level {int(level)}"
|
||||
return render_badge_svg("cc-ci", msg, level_color(level))
|
||||
# Third-segment colours for the level badge: amber = an UNINTENTIONAL skip (a rung skipped but not
|
||||
# in the recipe's intentional list — likely missing coverage) capped the climb; muted = an
|
||||
# INTENTIONAL skip (declared in recipe_meta.EXPECTED_NA — nothing to fix). Font-safe text labels
|
||||
# (no emoji) so the SVG renders anywhere.
|
||||
GAP_COLOR = "#d29922"
|
||||
EXPECT_COLOR = "#6e7681"
|
||||
|
||||
|
||||
def level_badge_svg(level: int, cap_reason: str = "", cap_skip: str = "") -> str:
|
||||
"""Per-recipe/-run LEVEL badge: 'cc-ci | level N' coloured by level (R6), with a THIRD segment
|
||||
that differentiates *why* the climb stopped when a SKIP capped it (`cap_skip`):
|
||||
- "unintentional" (a rung skipped but not in the recipe's intentional list): amber 'gap?'.
|
||||
- "intentional" (a skip declared in recipe_meta.EXPECTED_NA): muted 'expected'.
|
||||
- "" (clean cap / full climb / a real failure): no third segment (the level + card carry it).
|
||||
The badge never inflates — it only annotates the cap the level already reflects."""
|
||||
label, msg = "cc-ci", f"level {int(level)}"
|
||||
lw, mw = _text_width(label), _text_width(msg)
|
||||
third: tuple[str, str] | None = None
|
||||
if cap_skip == "unintentional":
|
||||
third = ("gap?", GAP_COLOR)
|
||||
elif cap_skip == "intentional":
|
||||
third = ("expected", EXPECT_COLOR)
|
||||
if third is None:
|
||||
return render_badge_svg(label, msg, level_color(level))
|
||||
txt, tcolor = third
|
||||
tw = _text_width(txt)
|
||||
w = lw + mw + tw
|
||||
return (
|
||||
f'<svg xmlns="http://www.w3.org/2000/svg" width="{w}" height="20" role="img" '
|
||||
f'aria-label="{html.escape(label)}: {html.escape(msg)} ({html.escape(txt)})">'
|
||||
f'<rect width="{lw}" height="20" fill="#555"/>'
|
||||
f'<rect x="{lw}" width="{mw}" height="20" fill="{level_color(level)}"/>'
|
||||
f'<rect x="{lw + mw}" width="{tw}" height="20" fill="{tcolor}"/>'
|
||||
f'<g fill="#fff" font-family="Verdana,Geneva,sans-serif" font-size="11">'
|
||||
f'<text x="6" y="14">{html.escape(label)}</text>'
|
||||
f'<text x="{lw + 6}" y="14">{html.escape(msg)}</text>'
|
||||
f'<text x="{lw + mw + 6}" y="14">{html.escape(txt)}</text></g></svg>'
|
||||
)
|
||||
|
||||
|
||||
def _stage_rows(stages: list[dict]) -> str:
|
||||
@ -107,6 +141,45 @@ def _stage_rows(stages: list[dict]) -> str:
|
||||
return "\n".join(rows) or '<tr><td colspan="3">no stages</td></tr>'
|
||||
|
||||
|
||||
# Friendly rung labels for the skip rows (the four essential rungs).
|
||||
RUNG_LABEL = {
|
||||
"install": "install",
|
||||
"upgrade": "upgrade",
|
||||
"backup_restore": "backup/restore",
|
||||
"functional": "functional",
|
||||
}
|
||||
SKIP_GREEN = (
|
||||
"#57ab5a" # muted green — an intentional skip reads like a pass (but labelled, never inflating)
|
||||
)
|
||||
|
||||
|
||||
def _skip_rows(skips: dict) -> str:
|
||||
"""Render SKIPPED rungs as stage-like rows. An intentional (declared) skip looks like a pass row
|
||||
but its status says 'INTENTIONAL SKIP' (muted green) with the declared reason on the line below;
|
||||
an unintentional skip is amber 'UNINTENTIONAL SKIP' with a prompt to add a test or declare it."""
|
||||
rows = []
|
||||
for rung, reason in (skips.get("intentional") or {}).items():
|
||||
rows.append(
|
||||
f'<tr class="stage"><td colspan="2"><span class="mark" style="color:{SKIP_GREEN}">⊘</span>'
|
||||
f"<b>{html.escape(RUNG_LABEL.get(rung, rung))}</b></td>"
|
||||
f'<td class="st" style="color:{SKIP_GREEN}">intentional skip</td></tr>'
|
||||
)
|
||||
rows.append(
|
||||
f'<tr class="skipreason"><td></td><td colspan="2">{html.escape(reason)}</td></tr>'
|
||||
)
|
||||
for rung in skips.get("unintentional") or []:
|
||||
rows.append(
|
||||
f'<tr class="stage"><td colspan="2"><span class="mark" style="color:{GAP_COLOR}">⊘</span>'
|
||||
f"<b>{html.escape(RUNG_LABEL.get(rung, rung))}</b></td>"
|
||||
f'<td class="st" style="color:{GAP_COLOR}">unintentional skip</td></tr>'
|
||||
)
|
||||
rows.append(
|
||||
'<tr class="skipreason"><td></td><td colspan="2">not declared in EXPECTED_NA — add the '
|
||||
"missing test/label, or declare the skip with a reason</td></tr>"
|
||||
)
|
||||
return "\n".join(rows)
|
||||
|
||||
|
||||
def render_card_html(data: dict, screenshot_rel: str | None = "screenshot.png") -> str:
|
||||
"""Build the summary-card HTML from a results.json dict. `screenshot_rel` is the relative path to
|
||||
the screenshot PNG (same dir as the card) — omitted from the card if None / absent.
|
||||
@ -116,7 +189,9 @@ def render_card_html(data: dict, screenshot_rel: str | None = "screenshot.png")
|
||||
recipe = html.escape(str(data.get("recipe", "?")))
|
||||
version = html.escape(str(data.get("version") or data.get("ref") or ""))
|
||||
level = int(data.get("level", 0))
|
||||
cap = html.escape(str(data.get("level_cap_reason") or ""))
|
||||
cap_reason = str(data.get("level_cap_reason") or "")
|
||||
cap = html.escape(cap_reason)
|
||||
sk = data.get("skips", {}) or {}
|
||||
color = level_color(level)
|
||||
flags = data.get("flags", {}) or {}
|
||||
flag_bits = []
|
||||
@ -132,7 +207,7 @@ def render_card_html(data: dict, screenshot_rel: str | None = "screenshot.png")
|
||||
if show_shot
|
||||
else '<div class="shot noshot">no screenshot</div>'
|
||||
)
|
||||
rows = _stage_rows(data.get("stages", []))
|
||||
rows = _stage_rows(data.get("stages", [])) + "\n" + _skip_rows(sk)
|
||||
return f"""<!doctype html><html><head><meta charset="utf-8"><style>
|
||||
*{{box-sizing:border-box}}
|
||||
body{{margin:0;font-family:system-ui,-apple-system,Segoe UI,sans-serif;background:#0d1117;color:#c9d1d9}}
|
||||
@ -157,6 +232,7 @@ tr.stage td{{padding-top:.5rem;border-bottom:1px solid #30363d}}
|
||||
.test .tmark{{width:1.4rem;text-align:center}}
|
||||
.test .tname{{color:#c9d1d9;font-family:ui-monospace,monospace;font-size:.8rem}}
|
||||
.test .tms{{text-align:right;color:#8b949e;font-size:.74rem;width:5rem}}
|
||||
tr.skipreason td{{color:#8b949e;font-size:.78rem;font-style:italic;padding-top:0;padding-bottom:.45rem;border-bottom:1px solid #21262d}}
|
||||
.shot{{width:360px;flex:none;border:1px solid #30363d;border-radius:8px;overflow:hidden;background:#0d1117}}
|
||||
.shot img{{width:100%;display:block}}
|
||||
.shot.noshot{{display:flex;align-items:center;justify-content:center;height:225px;color:#8b949e;font-size:.85rem}}
|
||||
@ -167,7 +243,7 @@ tr.stage td{{padding-top:.5rem;border-bottom:1px solid #30363d}}
|
||||
<div class="hd">{FLOWER_SVG}
|
||||
<div class="title"><h1>{recipe}</h1><span class="ver">{version}</span></div>
|
||||
<div class="lvl"><span class="num">{level}</span><span class="lbl">level</span></div></div>
|
||||
<div class="cap">{("<b>capped:</b> " + cap) if cap else "<b>full clean climb</b> — top level (6)"}</div>
|
||||
<div class="cap">{("<b>capped:</b> " + cap) if cap else "<b>full clean climb</b> — top level (4)"}</div>
|
||||
<div class="body"><div class="tbl"><table>{rows}</table></div>{shot_html}</div>
|
||||
<div class="flags">{"".join(flag_bits)}</div>
|
||||
</div></body></html>"""
|
||||
|
||||
@ -28,7 +28,7 @@ from __future__ import annotations
|
||||
import contextlib
|
||||
import json
|
||||
import os
|
||||
from typing import Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
from . import lifecycle, naming
|
||||
|
||||
@ -36,9 +36,7 @@ from . import lifecycle, naming
|
||||
def declared_deps(recipe: str) -> list[str]:
|
||||
"""Read `DEPS` from `tests/<recipe>/recipe_meta.py` — a list of recipe names this recipe needs
|
||||
deployed alongside it. Returns [] if none."""
|
||||
path = os.path.join(
|
||||
os.path.dirname(__file__), "..", "..", "tests", recipe, "recipe_meta.py"
|
||||
)
|
||||
path = os.path.join(os.path.dirname(__file__), "..", "..", "tests", recipe, "recipe_meta.py")
|
||||
if not os.path.exists(path):
|
||||
return []
|
||||
ns: dict = {}
|
||||
|
||||
@ -25,7 +25,7 @@ _BACKUPBOT_RE = re.compile(r"backupbot\.backup\b[^\n]*\btrue\b", re.IGNORECASE)
|
||||
|
||||
|
||||
def _recipe_dir(recipe: str) -> str:
|
||||
return os.path.expanduser(f"~/.abra/recipes/{recipe}")
|
||||
return abra.recipe_dir(recipe) # the per-run tree inside a CI run ($ABRA_DIR)
|
||||
|
||||
|
||||
def backup_capable(recipe: str, meta: dict | None = None) -> bool:
|
||||
@ -222,7 +222,11 @@ def assert_restore_healthy(domain: str, meta: dict) -> None:
|
||||
|
||||
|
||||
def perform_upgrade(
|
||||
domain: str, recipe: str, head_ref: str | None, deploy_timeout: int = 900, meta: dict | None = None
|
||||
domain: str,
|
||||
recipe: str,
|
||||
head_ref: str | None,
|
||||
deploy_timeout: int = 900,
|
||||
meta: dict | None = None,
|
||||
) -> dict[str, str | None]:
|
||||
"""Perform the UPGRADE op once, in place, to the PR-HEAD code under test (HC1): re-checkout the
|
||||
PR head (the prev-tag base deploy reset the recipe working tree), then `abra app deploy --chaos`
|
||||
@ -267,7 +271,9 @@ def perform_upgrade(
|
||||
deploy_timeout=int(meta.get("DEPLOY_TIMEOUT", deploy_timeout)),
|
||||
http_timeout=int(meta.get("HTTP_TIMEOUT", 300)),
|
||||
)
|
||||
lifecycle.wait_ready_probes(meta, domain, timeout=int(meta.get("DEPLOY_TIMEOUT", deploy_timeout)))
|
||||
lifecycle.wait_ready_probes(
|
||||
meta, domain, timeout=int(meta.get("DEPLOY_TIMEOUT", deploy_timeout))
|
||||
)
|
||||
after = lifecycle.deployed_identity(domain)
|
||||
# Evidence (HC1): the chaos-version label = the deployed recipe commit; it should match the
|
||||
# PR-head we checked out — proving the upgrade deployed the code under test, not a published tag.
|
||||
|
||||
@ -73,7 +73,7 @@ def http_post(
|
||||
`data` is JSON-encoded if content_type='application/json',
|
||||
form-encoded if 'application/x-www-form-urlencoded' (the OIDC token endpoint form),
|
||||
or sent raw bytes if data is already bytes."""
|
||||
if isinstance(data, (bytes, bytearray)):
|
||||
if isinstance(data, bytes | bytearray):
|
||||
body: bytes | None = bytes(data)
|
||||
elif content_type == "application/json" and data is not None:
|
||||
body = json.dumps(data).encode()
|
||||
@ -107,7 +107,7 @@ def http_request(
|
||||
) -> tuple[int, object | None]:
|
||||
"""Arbitrary-method HTTP (PUT/DELETE/PATCH) for parity tests that mutate. Same shape as
|
||||
http_post (returns (status, json_or_None))."""
|
||||
if isinstance(data, (bytes, bytearray)):
|
||||
if isinstance(data, bytes | bytearray):
|
||||
body: bytes | None = bytes(data)
|
||||
elif content_type == "application/json" and data is not None:
|
||||
body = json.dumps(data).encode()
|
||||
@ -142,7 +142,7 @@ def post_with_headers(
|
||||
"""Like http_post but ALSO returns the response headers as a dict — for APIs that hand back an
|
||||
auth token in a response header rather than the body (e.g. mattermost login → `Token` header).
|
||||
Returns (status, parsed_json_or_None, response_headers). status=0 + {} on transport failure."""
|
||||
if isinstance(data, (bytes, bytearray)):
|
||||
if isinstance(data, bytes | bytearray):
|
||||
body: bytes | None = bytes(data)
|
||||
elif content_type == "application/json" and data is not None:
|
||||
body = json.dumps(data).encode()
|
||||
@ -252,13 +252,16 @@ def retry_http_post(
|
||||
) -> tuple[int, object | None]:
|
||||
"""POST with retry until expect_fn(status, json) is truthy. Defaults to any 2xx."""
|
||||
if expect_fn is None:
|
||||
|
||||
def expect_fn(s, _j): # noqa: ARG001
|
||||
return 200 <= s < 300
|
||||
|
||||
result: list[tuple[int, object | None]] = [(0, None)]
|
||||
|
||||
def _check():
|
||||
s, j = http_post(url, data=data, headers=headers, content_type=content_type, timeout=timeout)
|
||||
s, j = http_post(
|
||||
url, data=data, headers=headers, content_type=content_type, timeout=timeout
|
||||
)
|
||||
result[0] = (s, j)
|
||||
return expect_fn(s, j)
|
||||
|
||||
|
||||
@ -5,37 +5,39 @@ YunoHost semantics: **a gap caps the level** — you only earn level L if every
|
||||
PASS. The first rung that is not a clean PASS (a real FAIL *or* genuinely N/A for this recipe) stops
|
||||
the climb; `cap_reason` records why. This is deliberately conservative: presentation must NEVER make
|
||||
a run look greener than its tests (plan §6 cardinal guardrail), so an N/A rung caps just like a fail
|
||||
(the L5 example in §4.1 — "recipes with no integration surface cap at L4 by definition" — is exactly
|
||||
this: N/A caps, with a recorded reason so the level is *fair*, not inflated).
|
||||
— with a recorded reason so the level is *fair*, not inflated.
|
||||
|
||||
The ladder (§4.1):
|
||||
The ladder is the FOUR essential rungs every recipe is held to:
|
||||
L0 — install failed / app never became healthy.
|
||||
L1 — Installs: deploys + passes health/readiness.
|
||||
L2 — Upgrades: previous published version → PR version, stays healthy, data intact.
|
||||
L3 — Backup/restore: seeded data survives backup → wipe → restore.
|
||||
L4 — Functional: recipe-specific functional tests pass.
|
||||
L5 — Integration: SSO/OIDC + cross-app integration tests pass.
|
||||
L6 — Recipe-local: the recipe repo's own tests/ (D4) pass and are merged.
|
||||
|
||||
Integration (SSO/OIDC + cross-app) and recipe-local (the recipe repo's own tests/) are **OPTIONAL**
|
||||
capabilities — they are NOT part of the level ladder and never cap it. They still run when present
|
||||
(and SSO is still enforced for the run VERDICT via the deps/SSO checks in run_recipe_ci.py), but a
|
||||
recipe without an SSO surface or without repo-local tests is simply not penalised on the level.
|
||||
|
||||
This module is PURE (no I/O) so it is cheaply unit-testable and the Adversary can re-run the unit
|
||||
test cold (`cc-ci-run -m pytest tests/unit/test_level.py -q`). The orchestrator
|
||||
(`run_recipe_ci.py`) is responsible for translating its raw per-tier results + deps/SSO signals into
|
||||
the rung-status dict this function consumes; that mapping is documented in DECISIONS.md (Phase 3).
|
||||
(`run_recipe_ci.py`) is responsible for translating its raw per-tier results into the rung-status
|
||||
dict this function consumes; that mapping is documented in DECISIONS.md (Phase 3).
|
||||
|
||||
Rung status vocabulary (each rung ∈ these three):
|
||||
"pass" — the rung was exercised and passed.
|
||||
"fail" — the rung was exercised and failed.
|
||||
"na" — the rung does not apply to this recipe (e.g. only one published version → no upgrade;
|
||||
not backup-capable; no SSO/integration surface; no recipe-local tests). N/A is NOT a
|
||||
failure, but it DOES cap the climb (with a distinct cap_reason) so the level never
|
||||
overstates what was actually verified.
|
||||
not backup-capable). N/A is NOT a failure, but it DOES cap the climb (with a distinct
|
||||
cap_reason) so the level never overstates what was actually verified.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
# The climbable rungs in ascending order. install (L1) is the foundation; L0 means install itself
|
||||
# did not pass. Each later rung requires every earlier rung to be a clean PASS.
|
||||
RUNGS = ("install", "upgrade", "backup_restore", "functional", "integration", "recipe_local")
|
||||
# did not pass. Each later rung requires every earlier rung to be a clean PASS. These four are the
|
||||
# ESSENTIAL rungs — integration/recipe-local are optional and deliberately NOT in this tuple.
|
||||
RUNGS = ("install", "upgrade", "backup_restore", "functional")
|
||||
|
||||
# Human-readable label per rung level, for cap_reason + the summary card.
|
||||
RUNG_LABEL = {
|
||||
@ -43,22 +45,20 @@ RUNG_LABEL = {
|
||||
2: "upgrade (prev published → PR)",
|
||||
3: "backup/restore (data integrity)",
|
||||
4: "functional (recipe-specific tests)",
|
||||
5: "integration (SSO/OIDC + cross-app)",
|
||||
6: "recipe-local (recipe repo tests/)",
|
||||
}
|
||||
|
||||
VALID = {"pass", "fail", "na"}
|
||||
|
||||
|
||||
def compute_level(rungs: dict[str, str]) -> tuple[int, str]:
|
||||
"""Map a rung-status dict → (level 0..6, cap_reason).
|
||||
"""Map a rung-status dict → (level 0..4, cap_reason).
|
||||
|
||||
`rungs` must contain a status in {"pass","fail","na"} for every name in RUNGS. The level is the
|
||||
highest L such that rungs[1..L] are all "pass"; the first non-"pass" rung caps the climb. L0 is
|
||||
returned when the install rung itself is not "pass" (install failed / never healthy).
|
||||
|
||||
cap_reason explains where the climb stopped:
|
||||
- "" (empty) when the recipe earned the top rung (L6, full clean climb).
|
||||
- "" (empty) when the recipe earned the top rung (L4, full clean climb).
|
||||
- "L<k> <label> FAILED" when a rung was exercised and failed.
|
||||
- "L<k> <label> N/A" when a rung does not apply to this recipe.
|
||||
Returns the reason for the FIRST rung that stopped the climb (the binding constraint).
|
||||
|
||||
@ -7,7 +7,8 @@ next run. Callers wrap deploy()/teardown() in try/finally (or a pytest finalizer
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import datetime
|
||||
import fcntl
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@ -17,7 +18,7 @@ import subprocess
|
||||
import time
|
||||
import urllib.request
|
||||
|
||||
from . import abra
|
||||
from . import abra, lifetime
|
||||
|
||||
GATEWAY_IP = "143.244.213.108" # *.ci.commoninternet.net -> gateway (TLS passthrough to cc-ci)
|
||||
# A run app domain is "<recipe[:4]>-<6hex>.ci.commoninternet.net" (see DECISIONS.md). Used by the
|
||||
@ -29,6 +30,68 @@ class TeardownError(RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
# --- Concurrent-run safety (capacity=2) -------------------------------------------------------
|
||||
# ONE mechanism, process-lifetime-scoped so SIGKILL can't leak a stale claim: every run holds an
|
||||
# exclusive kernel flock on its app DOMAIN (/run/lock/cc-ci-app-<domain>.lock) for the whole run.
|
||||
# A held lock implies a live owner — the kernel releases a flock when the holding process dies,
|
||||
# however it dies. The janitor probes the lock (LOCK_NB) to tell a live concurrent run (held →
|
||||
# leave it) from a crashed run's orphan (acquirable → reap it); it never inspects pids and never
|
||||
# steals a held lock. Recipe-tree corruption between same-recipe runs is gone structurally (each
|
||||
# run deploys from its own per-run ABRA_DIR — there is no shared recipe tree and no recipe lock),
|
||||
# and same-domain runs (double-!testme of one PR) serialise on this app lock.
|
||||
# See docs/concurrency.md.
|
||||
|
||||
# Acquired app-lock file objects are retained here for the REMAINING PROCESS LIFETIME: if the
|
||||
# caller drops the returned file object, GC would close the fd and silently release the lock —
|
||||
# this list is the lock's owner of record. Never cleared; release is process exit.
|
||||
_held_app_locks: list = []
|
||||
|
||||
|
||||
def _app_lock_dir() -> str:
|
||||
"""The app-domain lockfile dir. /run/lock (tmpfs: a reboot clears locks AND lockfiles, so
|
||||
post-reboot apps probe as orphans and are reaped immediately). Env-overridable so the
|
||||
tests/concurrency suite (and its helper subprocesses) can use a sandbox dir."""
|
||||
return os.environ.get("CCCI_APP_LOCK_DIR", "/run/lock")
|
||||
|
||||
|
||||
def _app_lock_path(domain: str) -> str:
|
||||
return os.path.join(_app_lock_dir(), f"cc-ci-app-{domain}.lock")
|
||||
|
||||
|
||||
def acquire_app_lock(domain: str):
|
||||
"""Take the per-app-domain exclusive lock; blocks (with a log line) if another run of the
|
||||
same domain is in flight (double-!testme serialisation). Returns the open lock file, which is
|
||||
ALSO retained in _held_app_locks so the flock lives exactly as long as the process.
|
||||
|
||||
Unlink/recreate race guard: the janitor unlinks a reaped orphan's lockfile while holding its
|
||||
flock, so a waiter blocked on the OLD inode can win a lock no later opener can observe (a new
|
||||
open() at the path creates a FRESH inode). After every acquisition, verify the locked fd is
|
||||
still the file at the path (st_ino match); if not, drop it and retry on the live path."""
|
||||
path = _app_lock_path(domain)
|
||||
waited = False
|
||||
while True:
|
||||
# PEP 446: the fd is non-inheritable, so subprocess children never carry the lock.
|
||||
f = open(path, "a") # noqa: SIM115 — deliberately held for the rest of the process
|
||||
try:
|
||||
fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
|
||||
except BlockingIOError:
|
||||
if not waited:
|
||||
print(f"== app lock: another run of {domain} is in flight — waiting ==", flush=True)
|
||||
waited = True
|
||||
fcntl.flock(f, fcntl.LOCK_EX)
|
||||
try:
|
||||
if os.fstat(f.fileno()).st_ino == os.stat(path).st_ino:
|
||||
break # we hold the lock on the inode the path names — done
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
f.close() # locked a stale (unlinked) inode — retry on the live path
|
||||
os.utime(f.fileno()) # mtime = acquisition time = lock age (janitor's long-held flag)
|
||||
_held_app_locks.append(f)
|
||||
if waited:
|
||||
print(f"== app lock: acquired {path} ==", flush=True)
|
||||
return f
|
||||
|
||||
|
||||
def _docker_names(kind: str, stack: str) -> list[str]:
|
||||
"""docker <kind> ls names filtered to a stack (kind: service|volume|secret)."""
|
||||
proc = subprocess.run(
|
||||
@ -48,31 +111,6 @@ def _residual(domain: str) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def _stack_age_seconds(stack: str) -> float | None:
|
||||
"""Age of the stack's oldest service, or None if not present."""
|
||||
svcs = _docker_names("service", stack)
|
||||
if not svcs:
|
||||
return None
|
||||
oldest = None
|
||||
for s in svcs:
|
||||
p = subprocess.run(
|
||||
["docker", "service", "inspect", s, "--format", "{{.CreatedAt}}"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
ts = p.stdout.strip()
|
||||
try:
|
||||
# docker emits e.g. 2026-05-27 00:12:33.123 +0000 UTC -> take the leading 19 chars
|
||||
dt = datetime.datetime.strptime(ts[:19], "%Y-%m-%d %H:%M:%S").replace(
|
||||
tzinfo=datetime.UTC
|
||||
)
|
||||
except ValueError:
|
||||
continue
|
||||
age = (datetime.datetime.now(datetime.UTC) - dt).total_seconds()
|
||||
oldest = age if oldest is None else max(oldest, age)
|
||||
return oldest
|
||||
|
||||
|
||||
def _recipe_extra_env(recipe: str, domain: str) -> dict[str, str]:
|
||||
"""Per-recipe extra .env keys, applied at every deploy (install + upgrade's old_app) so a recipe
|
||||
with multi-domain / config needs is enrolled with NO shared-harness change (D5/M6.5). A recipe
|
||||
@ -149,9 +187,9 @@ def prepull_images(recipe: str, domain: str) -> None:
|
||||
app-INIT time (slow-init apps like collabora/immich still need their recipe healthcheck/READY_PROBE).
|
||||
Best-effort on resolution failure (skip + let the deploy pull as usual); HARD-fails on a real
|
||||
pull error (don't mask it)."""
|
||||
import os
|
||||
|
||||
recipe_dir = os.path.expanduser(f"~/.abra/recipes/{recipe}")
|
||||
recipe_dir = abra.recipe_dir(recipe) # per-run tree inside a CI run
|
||||
# The app .env lives in the CANONICAL servers path (the per-run ABRA_DIR's servers/ is a
|
||||
# symlink to it, so abra and this path agree on the same file).
|
||||
env_path = os.path.expanduser(f"~/.abra/servers/default/{domain}.env")
|
||||
if not os.path.isdir(recipe_dir) or not os.path.isfile(env_path):
|
||||
print(f" prepull: recipe dir or .env missing for {recipe} — skipping", flush=True)
|
||||
@ -161,7 +199,8 @@ def prepull_images(recipe: str, domain: str) -> None:
|
||||
# --env-file supplies $VERSION-style interpolation so pinned tags resolve correctly.
|
||||
cf = subprocess.run(
|
||||
["bash", "-c", f'set -a; . "{env_path}"; printf "%s" "${{COMPOSE_FILE:-compose.yml}}"'],
|
||||
capture_output=True, text=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
).stdout.strip()
|
||||
files = [f for f in cf.split(":") if f] or ["compose.yml"]
|
||||
args = ["docker", "compose", "--env-file", env_path]
|
||||
@ -209,6 +248,10 @@ def deploy_app(
|
||||
past the 900s default. abra's INTERNAL TIMEOUT (recipe's TIMEOUT env, default 300s) is set via
|
||||
EXTRA_ENV; this is the Python subprocess wrapper's timeout so abra doesn't get SIGKILLed mid-deploy."""
|
||||
_record_deploy()
|
||||
# Lock BEFORE the app exists: a concurrent run's janitor must never see this app without a
|
||||
# held app lock (it would probe it as an orphan and reap an in-flight deploy). Also the
|
||||
# double-!testme serialisation point: a second run of the same domain blocks here.
|
||||
acquire_app_lock(domain)
|
||||
abra.app_config_remove(domain) # clear any stale .env from a prior crashed run
|
||||
abra.app_new(recipe, domain, version=version, secrets=secrets)
|
||||
# A pinned version must actually deploy that version: check the recipe out to the tag so the
|
||||
@ -268,18 +311,22 @@ def _stack_name(domain: str) -> str:
|
||||
|
||||
|
||||
def services_converged(domain: str) -> bool:
|
||||
"""True when every service in the stack reports replicas N/N (N>0)."""
|
||||
"""True when every service in the stack reports replicas N/N (N>0) AND no service is
|
||||
mid-rolling-update (swarm UpdateStatus settled)."""
|
||||
stack = _stack_name(domain)
|
||||
proc = subprocess.run(
|
||||
["docker", "stack", "services", stack, "--format", "{{.Replicas}}"],
|
||||
["docker", "stack", "services", stack, "--format", "{{.Name}} {{.Replicas}}"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
rows = [r for r in proc.stdout.split("\n") if r.strip()]
|
||||
if not rows:
|
||||
return False
|
||||
names = []
|
||||
for r in rows:
|
||||
cur, _, want = r.partition("/")
|
||||
name, _, replicas = r.partition(" ")
|
||||
names.append(name)
|
||||
cur, _, want = replicas.partition("/")
|
||||
# A service at its DESIRED replica count is converged — including a `replicas: 0`
|
||||
# on-demand one-shot (e.g. lasuite-drive's `minio-createbuckets`, which is scaled up
|
||||
# manually only when buckets need (re)creating), which reports "0/0". The earlier
|
||||
@ -288,6 +335,34 @@ def services_converged(domain: str) -> bool:
|
||||
# still spinning up shows e.g. "0/1" (cur != want) and is correctly not-yet-converged.
|
||||
if not want or cur != want:
|
||||
return False
|
||||
# N/N alone is NOT convergence during a stop-first rolling update: a chaos redeploy that changes
|
||||
# a non-app service image (e.g. immich's db pin) registers the update immediately, but swarm may
|
||||
# not have cycled that service's task yet — the OLD task still shows 1/1, then dies seconds later
|
||||
# (immich CI 238: backupbot exec'd the db pre-hook into the just-killed container → 409). Require
|
||||
# every service's UpdateStatus to be settled too, so the wait spans the whole rolling update.
|
||||
proc = subprocess.run(
|
||||
[
|
||||
"docker",
|
||||
"service",
|
||||
"inspect",
|
||||
*names,
|
||||
"--format",
|
||||
"{{if .UpdateStatus}}{{.UpdateStatus.State}}{{end}}",
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
if proc.returncode != 0:
|
||||
return False # a service vanished mid-check — not settled
|
||||
for state in proc.stdout.split("\n"):
|
||||
# Only ACTIVE states block convergence. 'paused'/'rollback_paused' are terminal-without-
|
||||
# intervention: swarm's default update-failure-action pauses the update on one task flicker
|
||||
# and the flag then persists FOREVER (immich CI 241: app service 'paused' from a restart
|
||||
# during restore, service back at 1/1 and healthy — the wait hung to its deadline). With
|
||||
# N/N already required above, a paused update is settled for our purposes; the HTTP-health
|
||||
# and tier assertions still gate whether the app actually works.
|
||||
if state.strip() in ("updating", "rollback_started"):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
@ -415,7 +490,9 @@ def recipe_checkout_ref(recipe: str, ref: str) -> None:
|
||||
abra.recipe_checkout(recipe, ref)
|
||||
|
||||
|
||||
def chaos_redeploy(domain: str, deploy_timeout: int = 900, no_converge_checks: bool = False) -> None:
|
||||
def chaos_redeploy(
|
||||
domain: str, deploy_timeout: int = 900, no_converge_checks: bool = False
|
||||
) -> None:
|
||||
"""In-place `abra app deploy --chaos`: redeploy the running app at the CURRENT recipe checkout
|
||||
(HC1: the PR-head code under test). This is the upgrade op, not a fresh install — it does NOT go
|
||||
through deploy_app, so the deploy-count guard (DG4.1) is not incremented.
|
||||
@ -498,6 +575,16 @@ def wait_ready_probes(meta: dict, domain: str, timeout: int = 600) -> None:
|
||||
|
||||
def backup_app(domain: str) -> str:
|
||||
"""Create a backup; return the abra/restic output (carries the produced snapshot_id)."""
|
||||
# Never back up a stack that is still converging/rolling-updating: backupbot resolves each
|
||||
# service's hook container ONCE up front, so a task that cycles between that lookup and the
|
||||
# pre-hook exec crashes the whole backup with a 409 (immich CI 238). Bounded wait — on timeout
|
||||
# we still attempt the backup and let the tier's assertion deliver the verdict.
|
||||
deadline = time.time() + 300
|
||||
while time.time() < deadline and not services_converged(domain):
|
||||
print(
|
||||
f" backup: {domain} stack not settled yet — waiting before backup create", flush=True
|
||||
)
|
||||
time.sleep(5)
|
||||
return abra.backup_create(domain)
|
||||
|
||||
|
||||
@ -603,17 +690,84 @@ def teardown_app(domain: str, verify: bool = True) -> None:
|
||||
residual = _residual(domain)
|
||||
if any(residual.values()):
|
||||
raise TeardownError(f"teardown left residual for {domain}: {residual}")
|
||||
# No unregistration step: the app lock releases implicitly at process exit. The clean run's
|
||||
# leftover lockfile (unheld) is unlinked on sight by the next janitor's stale-lockfile sweep.
|
||||
|
||||
|
||||
def janitor(max_age_seconds: int | None = None) -> None:
|
||||
"""Reap orphaned run apps from crashed/rebooted runs. Matches the real naming scheme and only
|
||||
reaps apps older than max_age_seconds (so concurrent in-flight runs are never killed). Reaps via
|
||||
docker primitives so it works even when the .env is gone (A2/A3). Default 2h, env-overridable
|
||||
via CCCI_JANITOR_MAX_AGE (e.g. 0 to reap all matching orphans immediately)."""
|
||||
import os
|
||||
# A lock held longer than 2x the 60-min hard deadline can only be a leaked run (the deadline
|
||||
# bounds every healthy run). Flag it for a human — NEVER steal a held lock.
|
||||
LONG_HELD_LOCK_SECONDS = 2 * lifetime.HARD_DEADLINE_SECONDS
|
||||
|
||||
if max_age_seconds is None:
|
||||
max_age_seconds = int(os.environ.get("CCCI_JANITOR_MAX_AGE", "7200"))
|
||||
|
||||
def _probe_and_reap(domain: str) -> None:
|
||||
"""Probe one run app's lock; reap iff nobody holds it (kernel-guaranteed orphan).
|
||||
|
||||
Reaping happens WHILE HOLDING the probe lock, closing the janitor-vs-new-run race: a new run
|
||||
of the same domain blocks in acquire_app_lock until the reap finishes, so a fresh app never
|
||||
coexists with a half-reaped one. The lockfile is unlinked before release (still holding the
|
||||
lock); a waiter that blocked on the unlinked inode re-checks identity and retries. Two racing
|
||||
janitors arbitrate on the same flock: one reaps, the other sees 'held' and leaves —
|
||||
teardown_app(verify=False) is idempotent either way."""
|
||||
path = _app_lock_path(domain)
|
||||
try:
|
||||
# PEP 446: non-inheritable fd, same as acquire_app_lock.
|
||||
f = open(path, "a") # noqa: SIM115 — closed in the finally below, lock released with it
|
||||
except OSError as e:
|
||||
print(f"!! janitor: cannot open lockfile {path} ({e}) — skipping {domain}", flush=True)
|
||||
return
|
||||
try:
|
||||
try:
|
||||
fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
|
||||
except BlockingIOError:
|
||||
# Held -> live run. Never steal; flag if it has been held implausibly long.
|
||||
try:
|
||||
held_for = time.time() - os.stat(path).st_mtime
|
||||
except OSError:
|
||||
held_for = 0
|
||||
if held_for > LONG_HELD_LOCK_SECONDS:
|
||||
print(
|
||||
f"!! lock for {domain} held >{LONG_HELD_LOCK_SECONDS // 60}min — possible "
|
||||
"leaked run; inspect with lslocks",
|
||||
flush=True,
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f" janitor: {domain} lock held — live concurrent run, leaving it", flush=True
|
||||
)
|
||||
return
|
||||
# Acquired — but only the inode the PATH names counts (another janitor may have reaped
|
||||
# and unlinked this inode while we raced; a lock on an unlinked inode protects nothing
|
||||
# and unlinking the path now would delete a NEWER run's lockfile).
|
||||
try:
|
||||
if os.fstat(f.fileno()).st_ino != os.stat(path).st_ino:
|
||||
return
|
||||
except FileNotFoundError:
|
||||
return
|
||||
# Orphan: no live owner (the kernel released the lock when the owner died). Reap while
|
||||
# holding the probe lock, then unlink the lockfile before releasing.
|
||||
print(f" janitor: {domain} lock acquirable — orphan, reaping", flush=True)
|
||||
with contextlib.suppress(Exception):
|
||||
teardown_app(domain, verify=False)
|
||||
with contextlib.suppress(OSError):
|
||||
os.unlink(path)
|
||||
finally:
|
||||
f.close()
|
||||
|
||||
|
||||
def janitor() -> None:
|
||||
"""Reap orphaned run apps from crashed/rebooted runs; the kernel flock is the only liveness
|
||||
oracle. For every candidate run app, probe its app-domain lock (LOCK_NB):
|
||||
|
||||
acquirable -> nobody holds it -> orphan -> reap under the probe lock + unlink lockfile
|
||||
held -> live concurrent run -> leave it (warn if held >2x the hard deadline)
|
||||
|
||||
Candidate discovery is unchanged: `abra app ls` + a docker-service sweep (catches stacks
|
||||
whose .env is already gone), both matched against RUN_APP_RE — warm/canonical apps never
|
||||
match and are never probed. Post-reboot, /run/lock (tmpfs) is empty, so every surviving app
|
||||
probes as an orphan and is reaped immediately (no age threshold). Stale lockfiles with no
|
||||
app behind them are unlinked on sight. Degrades safely: an unreadable lockfile/dir is
|
||||
skipped with a log line, never a crash. Reaps via docker primitives so it works even when
|
||||
the .env is gone (A2/A3)."""
|
||||
seen = set()
|
||||
for app in abra.app_ls():
|
||||
name = app.get("appName") or app.get("domain") or ""
|
||||
@ -627,9 +781,22 @@ def janitor(max_age_seconds: int | None = None) -> None:
|
||||
seen.add(f"{m.group(1)}.ci.commoninternet.net")
|
||||
|
||||
for name in seen:
|
||||
stack = _stack_name(name)
|
||||
age = _stack_age_seconds(stack)
|
||||
if age is not None and age < max_age_seconds:
|
||||
continue # likely a concurrent in-flight run; leave it
|
||||
with contextlib.suppress(Exception):
|
||||
teardown_app(name, verify=False)
|
||||
_probe_and_reap(name)
|
||||
|
||||
# Tidy /run/lock: a clean run's leftover lockfile is unheld and appless — unlink it (under
|
||||
# its own probe lock, with the same identity check as above).
|
||||
with contextlib.suppress(OSError):
|
||||
for path in glob.glob(os.path.join(_app_lock_dir(), "cc-ci-app-*.lock")):
|
||||
domain = os.path.basename(path)[len("cc-ci-app-") : -len(".lock")]
|
||||
if domain in seen:
|
||||
continue # handled (or deliberately left) above
|
||||
with contextlib.suppress(OSError):
|
||||
f = open(path, "a") # noqa: SIM115 — closed below, lock released with it
|
||||
try:
|
||||
fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
|
||||
if os.fstat(f.fileno()).st_ino == os.stat(path).st_ino:
|
||||
os.unlink(path)
|
||||
except (BlockingIOError, FileNotFoundError):
|
||||
pass # held (live run pre-deploy) or already gone — leave it
|
||||
finally:
|
||||
f.close()
|
||||
|
||||
95
runner/harness/lifetime.py
Normal file
95
runner/harness/lifetime.py
Normal file
@ -0,0 +1,95 @@
|
||||
"""Run-lifetime hardening (concurrency restructure P1).
|
||||
|
||||
The concurrency model's invariant chain is:
|
||||
|
||||
lock lifetime ⊆ harness process lifetime ⊆ drone step lifetime ⊆ 60-min hard deadline
|
||||
|
||||
Locks are kernel flocks released on process exit, so the only thing that needs managing is the
|
||||
PROCESS lifetime. Three guards, installed at run startup (before any abra call) by
|
||||
`install_lifetime_guards()`:
|
||||
|
||||
1. `PR_SET_PDEATHSIG(SIGTERM)`: if the parent (the drone step shell) dies — cancel, runner
|
||||
crash, host shutdown of the step — the kernel delivers SIGTERM to the harness, so a dead
|
||||
build can never leak a running harness that holds locks. Paired with a ppid==1 re-check
|
||||
AFTER the prctl: a parent that died BEFORE the prctl took effect would never trigger the
|
||||
death signal, so a harness that finds itself already reparented refuses to run.
|
||||
2. SIGTERM handler: raise SystemExit so the run's `finally:` teardown funnel executes and the
|
||||
process exits non-zero. Re-entrant deliveries during teardown are logged and IGNORED so a
|
||||
second signal can't abort the cleanup the first one asked for (`begin_teardown()` guards
|
||||
this; the run's own `finally:` blocks also call it so a signal landing mid-normal-teardown
|
||||
can't abort that either).
|
||||
3. `signal.alarm(3600)`: self-imposed hard deadline. SIGALRM funnels into the same teardown
|
||||
path with a distinct log line. Teardown time after the deadline is not alarm-bounded —
|
||||
interrupting a teardown buys nothing; the janitor (flock probe) is the backstop if a
|
||||
teardown wedges and the process is killed harder.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import ctypes
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
|
||||
HARD_DEADLINE_SECONDS = 60 * 60
|
||||
|
||||
_PR_SET_PDEATHSIG = 1 # linux/prctl.h
|
||||
|
||||
_state = {"tearing_down": False}
|
||||
|
||||
|
||||
def begin_teardown() -> None:
|
||||
"""Mark the teardown funnel as running. From here on SIGTERM/SIGALRM must NOT raise — it
|
||||
would abort the very cleanup it asks for — so the handlers log and return instead. Called by
|
||||
the handlers themselves before raising, and at the top of the run's `finally:` blocks."""
|
||||
_state["tearing_down"] = True
|
||||
|
||||
|
||||
def _funnel_handler(log_line: str, exit_code: int):
|
||||
"""A signal handler that routes into the teardown funnel exactly once: log, then raise
|
||||
SystemExit (propagates through the run's try/finally → teardown executes → non-zero exit).
|
||||
While teardown is already running, further signals are logged and swallowed."""
|
||||
|
||||
def handler(signum: int, frame) -> None: # noqa: ARG001
|
||||
print(log_line, flush=True)
|
||||
if _state["tearing_down"]:
|
||||
print(
|
||||
f"== signal {signum} during teardown — ignored (teardown continues, "
|
||||
"exit stays non-zero) ==",
|
||||
flush=True,
|
||||
)
|
||||
return
|
||||
begin_teardown()
|
||||
raise SystemExit(exit_code)
|
||||
|
||||
return handler
|
||||
|
||||
|
||||
def install_lifetime_guards(deadline_seconds: int = HARD_DEADLINE_SECONDS) -> None:
|
||||
"""Install all three lifetime guards (see module docstring). Must run at harness startup,
|
||||
before any abra call and before any lock is taken."""
|
||||
libc = ctypes.CDLL("libc.so.6", use_errno=True)
|
||||
if libc.prctl(_PR_SET_PDEATHSIG, signal.SIGTERM, 0, 0, 0) != 0:
|
||||
err = ctypes.get_errno()
|
||||
raise OSError(err, f"prctl(PR_SET_PDEATHSIG, SIGTERM) failed: {os.strerror(err)}")
|
||||
# The prctl is armed now — but only fires for a parent death AFTER this point. If the parent
|
||||
# already died, we are reparented (ppid 1) and would never get the signal: refuse to run, an
|
||||
# orphaned harness would hold locks/apps with nothing managing its lifetime.
|
||||
if os.getppid() == 1:
|
||||
sys.exit("parent died before prctl(PR_SET_PDEATHSIG) — refusing to run orphaned")
|
||||
signal.signal(
|
||||
signal.SIGTERM,
|
||||
_funnel_handler(
|
||||
"== SIGTERM received (drone cancel / parent death) — tearing down ==",
|
||||
128 + signal.SIGTERM,
|
||||
),
|
||||
)
|
||||
minutes = deadline_seconds // 60
|
||||
signal.signal(
|
||||
signal.SIGALRM,
|
||||
_funnel_handler(
|
||||
f"== run exceeded {minutes}-minute hard deadline — tearing down ==",
|
||||
128 + signal.SIGALRM,
|
||||
),
|
||||
)
|
||||
signal.alarm(deadline_seconds)
|
||||
@ -2,7 +2,14 @@
|
||||
|
||||
Turns a run's per-tier pytest outcomes into a single `results.json` artifact carrying, per the plan:
|
||||
{ recipe, version, pr, ref, run_id, finished, stages:[{name,status,tests:[{name,status,ms}]}],
|
||||
level, level_cap_reason, rungs, flags:{clean_teardown,no_secret_leak}, screenshot, summary_card }
|
||||
level, level_cap_reason, level_cap_rung, rungs,
|
||||
skips:{intentional:{rung:reason}, unintentional:[rung]},
|
||||
flags:{clean_teardown,no_secret_leak}, screenshot, summary_card }
|
||||
|
||||
`skips` splits the N/A (skipped) rungs by a simple rule: a skip is INTENTIONAL iff the recipe lists
|
||||
it (with a reason) in `recipe_meta.EXPECTED_NA = {rung: reason}`; any rung skipped but not listed is
|
||||
UNINTENTIONAL (a coverage gap to fill or declare). Skips still cap the level either way — the harness
|
||||
never claims a rung it did not verify; this only labels *why* a skip happened.
|
||||
|
||||
The per-test breakdown comes from JUnit XML emitted by each tier's pytest invocation (`--junitxml`),
|
||||
parsed here with the stdlib (no new dep). The integer **level** is computed by harness.level from a
|
||||
@ -127,41 +134,24 @@ def collect_stages(records: list[dict]) -> list[dict]:
|
||||
return stages
|
||||
|
||||
|
||||
def _has_repo_local(records: list[dict]) -> bool:
|
||||
return any(r.get("source") == "repo-local" for r in records)
|
||||
|
||||
|
||||
def _repo_local_passed(records: list[dict]) -> bool:
|
||||
repo = [r for r in records if r.get("source") == "repo-local"]
|
||||
return bool(repo) and all(r.get("rc", 1) == 0 for r in repo)
|
||||
|
||||
|
||||
def derive_rungs(
|
||||
results: dict[str, str],
|
||||
*,
|
||||
backup_capable: bool,
|
||||
declared: list[str] | None,
|
||||
deps_ready: bool,
|
||||
sso_unverified: bool,
|
||||
has_custom: bool,
|
||||
has_repo_local: bool,
|
||||
repo_local_passed: bool,
|
||||
) -> dict[str, str]:
|
||||
"""Translate the orchestrator's tier results + deps/SSO signals into the rung-status dict
|
||||
harness.level consumes. Documented in DECISIONS.md (Phase 3). Conservative by design — never
|
||||
reports a rung 'pass' it can't substantiate (cardinal guardrail: presentation never inflates).
|
||||
"""Translate the orchestrator's tier results into the rung-status dict harness.level consumes —
|
||||
the FOUR essential rungs only. Conservative by design — never reports a rung 'pass' it can't
|
||||
substantiate (cardinal guardrail: presentation never inflates).
|
||||
|
||||
L1 install : install tier pass.
|
||||
L2 upgrade : upgrade tier (skip → N/A: only one published version).
|
||||
L3 backup/res : backup AND restore tiers pass (N/A if not backup-capable).
|
||||
L4 functional : the recipe-specific functional (non-deps) tests pass — the custom tier, minus
|
||||
its SSO/integration tests. N/A if the recipe has no custom tests at all.
|
||||
L5 integration: SSO/OIDC + cross-app. Applies ONLY if the recipe declares deps (else N/A — the
|
||||
"no integration surface caps at L4" rule, §4.1). pass iff deps wired
|
||||
(deps_ready) and not sso_unverified and the custom tier didn't fail.
|
||||
L6 recipe-loc : the recipe repo's own tests/ (repo-local source) ran and passed (N/A if none).
|
||||
L4 functional : recipe-specific functional tests pass — the custom tier. N/A if none ran.
|
||||
|
||||
Integration (SSO/OIDC) and recipe-local are OPTIONAL and intentionally NOT rungs here — they
|
||||
never cap the level (SSO is still enforced for the run VERDICT in run_recipe_ci.py).
|
||||
"""
|
||||
declared = declared or []
|
||||
rungs: dict[str, str] = {}
|
||||
rungs["install"] = level_mod.tier_to_rung(results.get("install"))
|
||||
rungs["upgrade"] = level_mod.tier_to_rung(results.get("upgrade"))
|
||||
@ -170,36 +160,34 @@ def derive_rungs(
|
||||
)
|
||||
|
||||
custom = results.get("custom")
|
||||
# Functional rung (L4): the non-deps custom tests.
|
||||
if not has_custom or custom == "skip" or custom is None:
|
||||
rungs["functional"] = "na"
|
||||
elif custom == "fail":
|
||||
# A custom test failed. With declared deps we cannot cheaply tell functional-vs-SSO apart, so
|
||||
# conservatively fail the functional rung (caps at L3) — never inflate.
|
||||
rungs["functional"] = "fail"
|
||||
else: # custom == "pass"
|
||||
rungs["functional"] = "pass"
|
||||
|
||||
# Integration rung (L5): only recipes with an SSO/integration surface (declared deps) can climb.
|
||||
if not declared:
|
||||
rungs["integration"] = "na"
|
||||
elif sso_unverified or not deps_ready or custom == "fail":
|
||||
# SSO not wired/verified, or a custom test failed → integration not verified.
|
||||
rungs["integration"] = "fail"
|
||||
elif custom == "pass":
|
||||
rungs["integration"] = "pass"
|
||||
else:
|
||||
# declared deps but no custom tests ran — can't claim integration verified
|
||||
rungs["integration"] = "na"
|
||||
|
||||
# Recipe-local rung (L6).
|
||||
if not has_repo_local:
|
||||
rungs["recipe_local"] = "na"
|
||||
else:
|
||||
rungs["recipe_local"] = "pass" if repo_local_passed else "fail"
|
||||
return rungs
|
||||
|
||||
|
||||
def skips(rungs: dict[str, str], expected_na: dict | None) -> dict:
|
||||
"""Split the SKIPPED (N/A) rungs into intentional vs unintentional (operator model).
|
||||
|
||||
A recipe lists the rungs it intentionally skips, each with a reason, in
|
||||
`recipe_meta.EXPECTED_NA = {rung: reason}`. The rule is dead simple: a skipped rung is
|
||||
**intentional** iff it is in that list; any rung that is skipped and NOT in the list is
|
||||
**unintentional** (a coverage gap someone should either fill or declare). N/A still caps the
|
||||
level either way — the harness never claims a rung it did not verify — this only labels *why* a
|
||||
skip happened. Returns:
|
||||
{ "intentional": {rung: reason, ...}, # skipped AND declared in EXPECTED_NA
|
||||
"unintentional": [rung, ...] } # skipped but NOT declared
|
||||
"""
|
||||
expected = {str(k): str(v) for k, v in (expected_na or {}).items()}
|
||||
na = [r for r, st in rungs.items() if st == "na"]
|
||||
intentional = {r: expected[r] for r in na if r in expected}
|
||||
unintentional = sorted(r for r in na if r not in expected)
|
||||
return {"intentional": intentional, "unintentional": unintentional}
|
||||
|
||||
|
||||
def build_results(
|
||||
*,
|
||||
recipe: str,
|
||||
@ -209,30 +197,24 @@ def build_results(
|
||||
records: list[dict],
|
||||
results: dict[str, str],
|
||||
backup_capable: bool,
|
||||
declared: list[str] | None,
|
||||
deps_ready: bool,
|
||||
sso_unverified: bool,
|
||||
clean_teardown: bool,
|
||||
no_secret_leak: bool,
|
||||
finished_ts: float | None,
|
||||
screenshot: str | None = None,
|
||||
summary_card: str | None = None,
|
||||
expected_na: dict | None = None,
|
||||
) -> dict:
|
||||
"""Assemble the full results.json dict (no I/O). `finished_ts` is passed in (the orchestrator
|
||||
stamps it) so this stays pure and deterministic for unit tests."""
|
||||
stamps it) so this stays pure and deterministic for unit tests. `expected_na` is the recipe's
|
||||
declared intentional-skip map (recipe_meta.EXPECTED_NA) used to distinguish a deliberate skip from
|
||||
accidentally-missing coverage."""
|
||||
stages = collect_stages(records)
|
||||
has_custom = any(r["tier"] == "custom" for r in records)
|
||||
rungs = derive_rungs(
|
||||
results,
|
||||
backup_capable=backup_capable,
|
||||
declared=declared,
|
||||
deps_ready=deps_ready,
|
||||
sso_unverified=sso_unverified,
|
||||
has_custom=has_custom,
|
||||
has_repo_local=_has_repo_local(records),
|
||||
repo_local_passed=_repo_local_passed(records),
|
||||
)
|
||||
rungs = derive_rungs(results, backup_capable=backup_capable, has_custom=has_custom)
|
||||
lvl, cap_reason = level_mod.compute_level(rungs)
|
||||
# The rung that capped the climb (lowest non-pass), or None on a full climb — lets a consumer
|
||||
# (card/badge) tell whether the cap was an intentional skip, an unintentional one, or a failure.
|
||||
capped = level_mod.RUNGS[lvl] if cap_reason else None
|
||||
return {
|
||||
"schema": 1,
|
||||
"run_id": run_id(),
|
||||
@ -243,7 +225,9 @@ def build_results(
|
||||
"finished": finished_ts,
|
||||
"level": lvl,
|
||||
"level_cap_reason": cap_reason,
|
||||
"level_cap_rung": capped,
|
||||
"rungs": rungs,
|
||||
"skips": skips(rungs, expected_na),
|
||||
"stages": stages,
|
||||
"results": results,
|
||||
"flags": {
|
||||
|
||||
@ -113,7 +113,9 @@ def _assert_undeployed(domain: str) -> None:
|
||||
)
|
||||
|
||||
|
||||
def snapshot(recipe: str, domain: str, commit: str | None = None, version: str | None = None) -> dict:
|
||||
def snapshot(
|
||||
recipe: str, domain: str, commit: str | None = None, version: str | None = None
|
||||
) -> dict:
|
||||
"""Take a last-known-good snapshot of every data volume of <domain>'s stack. The app MUST be
|
||||
undeployed. Atomically replaces the prior last-good. Returns the written meta dict."""
|
||||
_assert_undeployed(domain)
|
||||
@ -169,7 +171,9 @@ def restore(recipe: str, domain: str) -> dict:
|
||||
for vol in meta.get("volumes", []):
|
||||
tar_path = os.path.join(volumes_dir(recipe), f"{vol}.tar")
|
||||
if vol not in current:
|
||||
raise SnapshotError(f"snapshot volume {vol} absent from current stack {sorted(current)}")
|
||||
raise SnapshotError(
|
||||
f"snapshot volume {vol} absent from current stack {sorted(current)}"
|
||||
)
|
||||
mp = _volume_mountpoint(vol)
|
||||
# Clear the volume contents (incl. dotfiles) without removing the mountpoint itself.
|
||||
r = _run(["sh", "-c", f'rm -rf -- "{mp}"/* "{mp}"/.[!.]* "{mp}"/..?* 2>/dev/null; true'])
|
||||
|
||||
@ -60,14 +60,17 @@ def sweep() -> int:
|
||||
for r in recipes:
|
||||
print(f"\n===== nightly: full-cold {r} (latest) =====", flush=True)
|
||||
env = dict(os.environ, RECIPE=r)
|
||||
env.pop("REF", None) # latest, not a PR head
|
||||
env.pop("REF", None) # latest, not a PR head
|
||||
env.pop("CCCI_QUICK", None)
|
||||
env.pop("MODE", None)
|
||||
rc = subprocess.run(
|
||||
[sys.executable, os.path.join(_here(), "run_recipe_ci.py")], env=env
|
||||
).returncode
|
||||
results[r] = rc
|
||||
print(f"nightly: {r} rc={rc} ({'green→canonical refreshed' if rc == 0 else 'red'})", flush=True)
|
||||
print(
|
||||
f"nightly: {r} rc={rc} ({'green→canonical refreshed' if rc == 0 else 'red'})",
|
||||
flush=True,
|
||||
)
|
||||
# WC8 disk hygiene: drop warm data for de-enrolled canonicals; log the disk budget.
|
||||
pruned = canonical.prune_stale()
|
||||
if pruned:
|
||||
|
||||
@ -44,17 +44,26 @@ sys.path.insert(0, os.path.join(ROOT, "runner"))
|
||||
from harness import ( # noqa: E402
|
||||
abra,
|
||||
canonical,
|
||||
card as card_mod,
|
||||
deps as deps_mod,
|
||||
discovery,
|
||||
generic,
|
||||
lifecycle,
|
||||
lifetime,
|
||||
naming,
|
||||
results as results_mod,
|
||||
screenshot as screenshot_mod,
|
||||
warm,
|
||||
warmsnap,
|
||||
)
|
||||
from harness import ( # noqa: E402
|
||||
card as card_mod,
|
||||
)
|
||||
from harness import ( # noqa: E402
|
||||
deps as deps_mod,
|
||||
)
|
||||
from harness import ( # noqa: E402
|
||||
results as results_mod,
|
||||
)
|
||||
from harness import ( # noqa: E402
|
||||
screenshot as screenshot_mod,
|
||||
)
|
||||
|
||||
ALL_STAGES = ("install", "upgrade", "backup", "restore", "custom")
|
||||
|
||||
@ -129,18 +138,73 @@ def _gitea_token() -> str | None:
|
||||
return tok or None
|
||||
|
||||
|
||||
def _run_state_path(name: str) -> str:
|
||||
"""Run-scoped state file in the tempdir, keyed by run id + harness pid — NEVER by app domain.
|
||||
A second run of the SAME domain overlaps this process (its main() preamble executes before it
|
||||
blocks at the app lock inside deploy_app), so domain-keyed files get reset/removed under the
|
||||
live run: M2(c) double-!testme produced a false DG4.1 deploy-count=2 in run 1 and a countfile
|
||||
FileNotFoundError crash in run 2. Children never re-derive these paths — they receive them
|
||||
via the CCCI_*_FILE env vars, so the key only has to be unique per harness process."""
|
||||
rid = results_mod.run_id()
|
||||
return os.path.join(tempfile.gettempdir(), f"ccci-{name}-{rid}-{os.getpid()}")
|
||||
|
||||
|
||||
def setup_run_abra_dir() -> str:
|
||||
"""P3: build + export this run's PER-RUN ABRA_DIR — structural isolation of recipe trees.
|
||||
|
||||
`<runs_dir>/<run-id>/abra/` with:
|
||||
servers/ -> symlink to the canonical ~/.abra/servers. App .env files land in the shared
|
||||
canonical path, so janitor discovery (`abra app ls`) and env-based teardown
|
||||
work unchanged from any process; per-domain filenames + the app-domain lock
|
||||
prevent write conflicts.
|
||||
catalogue/ -> symlink to the canonical ~/.abra/catalogue (read-mostly).
|
||||
recipes/ fresh + empty — THE isolation that matters: each run clones and git-checkouts
|
||||
its own recipe trees, so concurrent runs (same recipe included) can never
|
||||
corrupt each other's deploy tree. Replaces the per-recipe flock.
|
||||
Exported as $ABRA_DIR — honored by the abra CLI and by every harness path helper
|
||||
(abra.abra_dir()) — BEFORE any abra call. Rides along the existing run-dir retention."""
|
||||
canonical = os.path.expanduser("~/.abra")
|
||||
rid = results_mod.run_id()
|
||||
if rid == "manual":
|
||||
rid = f"manual-{os.getpid()}" # two concurrent hand-runs must not share a tree
|
||||
run_abra_dir = os.path.join(results_mod.runs_dir(), rid, "abra")
|
||||
os.makedirs(os.path.join(run_abra_dir, "recipes"), exist_ok=True)
|
||||
for shared in ("servers", "catalogue"):
|
||||
link = os.path.join(run_abra_dir, shared)
|
||||
if not os.path.islink(link):
|
||||
os.symlink(os.path.join(canonical, shared), link)
|
||||
os.environ["ABRA_DIR"] = run_abra_dir
|
||||
print(
|
||||
f"== per-run ABRA_DIR: {run_abra_dir} (servers/catalogue -> canonical; fresh recipes/) ==",
|
||||
flush=True,
|
||||
)
|
||||
return run_abra_dir
|
||||
|
||||
|
||||
def fetch_recipe(recipe: str, ref: str | None, src: str | None) -> None:
|
||||
"""Make the recipe available at the code under test. If SRC+REF point at the mirror PR,
|
||||
"""Make the recipe available at the code under test in THIS RUN's recipe tree
|
||||
($ABRA_DIR/recipes/<recipe>): a plain clone — no locking needed, no rm-rf of any shared
|
||||
state (the rm below only clears this run's own leftovers, e.g. a janitor-triggered
|
||||
`abra app ls` auto-clone or a Drone build-number reuse). If SRC+REF point at the mirror PR,
|
||||
clone it at that ref; otherwise fetch the catalogue copy. Private mirror repos need the bot
|
||||
token — passed via a per-command http.extraHeader (not persisted in .git/config, not printed)."""
|
||||
recipes_dir = os.path.expanduser("~/.abra/recipes")
|
||||
os.makedirs(recipes_dir, exist_ok=True)
|
||||
dest = os.path.join(recipes_dir, recipe)
|
||||
# CCCI_SKIP_FETCH=1: use the local recipe clone as-is (lets a test/Adversary stage a fake/broken
|
||||
# ref — e.g. a simulated broken PR head for the --quick rollback proof — without it being clobbered
|
||||
# by a re-fetch). Never set in production CI.
|
||||
dest = abra.recipe_dir(recipe)
|
||||
os.makedirs(os.path.dirname(dest), exist_ok=True)
|
||||
# CCCI_SKIP_FETCH=1: use the locally STAGED recipe clone as-is (lets a test/Adversary stage a
|
||||
# fake/broken ref — e.g. a simulated broken PR head for the --quick rollback proof — without it
|
||||
# being clobbered by a re-fetch). Staging happens in the canonical ~/.abra/recipes/<recipe>;
|
||||
# copy it into the per-run tree so the rest of the run reads the staged state. Never set in
|
||||
# production CI.
|
||||
if os.environ.get("CCCI_SKIP_FETCH") == "1":
|
||||
print(f"[fetch] CCCI_SKIP_FETCH=1 — using local {recipe} recipe clone as-is", flush=True)
|
||||
canonical = os.path.expanduser(f"~/.abra/recipes/{recipe}")
|
||||
subprocess.run(["rm", "-rf", dest], check=False)
|
||||
if os.path.isdir(canonical):
|
||||
shutil.copytree(canonical, dest, symlinks=True)
|
||||
print(
|
||||
f"[fetch] CCCI_SKIP_FETCH=1 — using staged {recipe} clone as-is "
|
||||
f"(copied {canonical} -> per-run tree)",
|
||||
flush=True,
|
||||
)
|
||||
return
|
||||
if src and ref:
|
||||
url = f"https://git.autonomic.zone/{src}.git"
|
||||
@ -169,7 +233,7 @@ def fetch_recipe(recipe: str, ref: str | None, src: str | None) -> None:
|
||||
def snapshot_recipe_tests(recipe: str) -> str | None:
|
||||
"""Copy the recipe-shipped tests/ to a stable temp dir, immune to abra re-checking-out the
|
||||
recipe to a version tag during the run. Returns the snapshot path, or None if no tests/."""
|
||||
src = os.path.expanduser(f"~/.abra/recipes/{recipe}/tests")
|
||||
src = os.path.join(abra.recipe_dir(recipe), "tests")
|
||||
if not os.path.isdir(src):
|
||||
return None
|
||||
has_overlay = glob.glob(os.path.join(src, "test_*.py")) or os.path.isfile(
|
||||
@ -200,6 +264,7 @@ def _load_meta(recipe: str) -> dict:
|
||||
for k in list(meta) + [
|
||||
"BACKUP_CAPABLE",
|
||||
"SKIP_GENERIC",
|
||||
"EXPECTED_NA",
|
||||
"OIDC_AT_INSTALL",
|
||||
"READY_PROBE",
|
||||
"UPGRADE_BASE_VERSION",
|
||||
@ -565,15 +630,15 @@ def run_quick(
|
||||
flush=True,
|
||||
)
|
||||
|
||||
statefile = os.path.join(tempfile.gettempdir(), f"ccci-opstate-{domain}.json")
|
||||
statefile = _run_state_path("opstate") + ".json"
|
||||
with open(statefile, "w") as f:
|
||||
json.dump({}, f)
|
||||
os.environ["CCCI_OP_STATE_FILE"] = statefile
|
||||
depsfile = os.path.join(tempfile.gettempdir(), f"ccci-deps-{domain}.json")
|
||||
depsfile = _run_state_path("deps") + ".json"
|
||||
with open(depsfile, "w") as f:
|
||||
json.dump({}, f)
|
||||
os.environ["CCCI_DEPS_FILE"] = depsfile
|
||||
skipfile = os.path.join(tempfile.gettempdir(), f"ccci-depskip-{domain}.txt")
|
||||
skipfile = _run_state_path("depskip") + ".txt"
|
||||
with contextlib.suppress(OSError):
|
||||
os.remove(skipfile)
|
||||
os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile
|
||||
@ -649,6 +714,8 @@ def run_quick(
|
||||
results["upgrade"] = "fail"
|
||||
results["custom"] = "skip"
|
||||
finally:
|
||||
# Teardown funnel running: further SIGTERM/SIGALRM are logged + ignored (lifetime.py).
|
||||
lifetime.begin_teardown()
|
||||
# F2-11 skip count (read before deciding pass/fail)
|
||||
requires_deps_skipped = 0
|
||||
try:
|
||||
@ -812,6 +879,9 @@ def promote_canonical(recipe: str, head_ref: str | None) -> None:
|
||||
|
||||
|
||||
def main() -> int:
|
||||
# P1 lock-lifetime hardening: PDEATHSIG + SIGTERM/SIGALRM teardown funnel + 60-min hard
|
||||
# deadline, armed before ANY abra call or lock acquisition (see harness/lifetime.py).
|
||||
lifetime.install_lifetime_guards()
|
||||
recipe = os.environ.get("RECIPE")
|
||||
if not recipe:
|
||||
print("RECIPE env is required", file=sys.stderr)
|
||||
@ -826,6 +896,10 @@ def main() -> int:
|
||||
print(
|
||||
f"== cc-ci run: recipe={recipe} ref={ref} pr={os.environ.get('PR', '0')} stages={sorted(stages)}"
|
||||
)
|
||||
# Concurrent-run safety is structural: this run's recipe trees live in its own ABRA_DIR
|
||||
# (exported here, before ANY abra call), so no recipe-tree lock exists; same-DOMAIN runs
|
||||
# serialise on the app-domain flock taken in deploy_app (see docs/concurrency.md).
|
||||
setup_run_abra_dir()
|
||||
fetch_recipe(recipe, ref, src)
|
||||
# The PR-head commit the upgrade tier re-checks out for the chaos redeploy to the code under test
|
||||
# (HC1). Prefer the explicit PR head sha ($REF) — robust + exact; fall back to the recipe checkout
|
||||
@ -864,7 +938,7 @@ def main() -> int:
|
||||
hook = discovery.install_steps(recipe, repo_local)
|
||||
|
||||
# Deploy-count guard (DG4.1): exactly one deploy_app() per run.
|
||||
countfile = os.path.join(tempfile.gettempdir(), f"ccci-deploys-{domain}")
|
||||
countfile = _run_state_path("deploys")
|
||||
with open(countfile, "w") as f:
|
||||
f.write("0")
|
||||
os.environ["CCCI_DEPLOY_COUNT_FILE"] = countfile
|
||||
@ -880,7 +954,7 @@ def main() -> int:
|
||||
|
||||
# Run-scoped op state (HC3): the orchestrator records op results (pre-upgrade identity, backup
|
||||
# snapshot_id) here for the assertion tiers (generic + overlay) to read via generic.op_state().
|
||||
statefile = os.path.join(tempfile.gettempdir(), f"ccci-opstate-{domain}.json")
|
||||
statefile = _run_state_path("opstate") + ".json"
|
||||
with open(statefile, "w") as f:
|
||||
json.dump({}, f)
|
||||
os.environ["CCCI_OP_STATE_FILE"] = statefile
|
||||
@ -891,12 +965,12 @@ def main() -> int:
|
||||
# cannot break the generic-tier signal. The `setup_custom_tests` step deploys each dep + runs
|
||||
# `tests/<recipe>/setup_custom_tests.sh` to wire OIDC env via in-place redeploy.
|
||||
# `$CCCI_DEPS_FILE` is written with the full creds dict the hook script needs (jq-readable).
|
||||
depsfile = os.path.join(tempfile.gettempdir(), f"ccci-deps-{domain}.json")
|
||||
depsfile = _run_state_path("deps") + ".json"
|
||||
with open(depsfile, "w") as f:
|
||||
json.dump({}, f)
|
||||
os.environ["CCCI_DEPS_FILE"] = depsfile
|
||||
# F2-11: conftest appends the count of requires_deps tests it skips (deps-not-ready) here.
|
||||
skipfile = os.path.join(tempfile.gettempdir(), f"ccci-depskip-{domain}.txt")
|
||||
skipfile = _run_state_path("depskip") + ".txt"
|
||||
with contextlib.suppress(OSError):
|
||||
os.remove(skipfile)
|
||||
os.environ["CCCI_DEPS_SKIP_REPORT"] = skipfile
|
||||
@ -1108,6 +1182,9 @@ def main() -> int:
|
||||
if op in stages:
|
||||
results[op] = "skip"
|
||||
finally:
|
||||
# From here the teardown funnel runs: a SIGTERM/SIGALRM landing now is logged + ignored
|
||||
# (lifetime.py) so a second signal can't abort the cleanup the first one asked for.
|
||||
lifetime.begin_teardown()
|
||||
# Teardown the recipe under test FIRST, then deps in reverse declaration order.
|
||||
# Parent verify=False (Phase 1d): keep as-is so a parent residual doesn't mask a tier
|
||||
# failure. Dep teardown uses verify=True via teardown_deps (F2-5 fix); failures are
|
||||
@ -1224,7 +1301,6 @@ def main() -> int:
|
||||
# a failure here NEVER changes `overall` (R7 — cosmetics never block the pipeline). ----
|
||||
data: dict | None = None
|
||||
try:
|
||||
sso_unverified = sso_dep_unverified(declared, deps_ready, requires_deps_skipped)
|
||||
clean_teardown = (deploy_count == expected_deploy_count) and not dep_teardown_error
|
||||
data = results_mod.build_results(
|
||||
recipe=recipe,
|
||||
@ -1234,13 +1310,11 @@ def main() -> int:
|
||||
records=records,
|
||||
results=results,
|
||||
backup_capable=backup_cap,
|
||||
declared=declared,
|
||||
deps_ready=deps_ready,
|
||||
sso_unverified=sso_unverified,
|
||||
clean_teardown=clean_teardown,
|
||||
no_secret_leak=True, # narrowed below by an actual scan of the serialised artifact
|
||||
screenshot=screenshot_rel, # Phase 3 U1 (R4): relative PNG name iff capture succeeded
|
||||
finished_ts=time.time(),
|
||||
expected_na=meta.get("EXPECTED_NA"), # declared intentional-skip map (recipe_meta)
|
||||
)
|
||||
# Real (if narrow) leak check: no known infra-secret value may appear in the artifact (R7).
|
||||
blob = json.dumps(data)
|
||||
@ -1257,6 +1331,15 @@ def main() -> int:
|
||||
f"{' — ' + data['level_cap_reason'] if data['level_cap_reason'] else ''})",
|
||||
flush=True,
|
||||
)
|
||||
# Surface UNINTENTIONAL skips in the CI log (non-blocking, R7): a rung that was skipped (N/A)
|
||||
# but is not in the recipe's intentional list — either add the missing coverage or declare it.
|
||||
for rung in data.get("skips", {}).get("unintentional", []):
|
||||
print(
|
||||
f"⚠ coverage: rung '{rung}' was skipped (N/A) but is not declared intentional — add "
|
||||
f"the missing test/label, or list it in tests/{recipe}/recipe_meta.py "
|
||||
f"EXPECTED_NA = {{'{rung}': '<why>'}}.",
|
||||
flush=True,
|
||||
)
|
||||
except Exception as e: # noqa: BLE001 — results assembly is cosmetic; never fail a run on it (R7)
|
||||
print(
|
||||
f"!! results.json assembly failed (non-fatal, verdict unaffected): {_scrub(str(e))}",
|
||||
@ -1275,8 +1358,21 @@ def main() -> int:
|
||||
with open(html_path, "w", encoding="utf-8") as f:
|
||||
f.write(card_mod.render_card_html(data, screenshot_rel=data.get("screenshot")))
|
||||
png = card_mod.render_card_png(html_path, os.path.join(run_artifact_dir, "summary.png"))
|
||||
capped = data.get("level_cap_rung")
|
||||
sk = data.get("skips", {})
|
||||
cap_skip = (
|
||||
"intentional"
|
||||
if capped in (sk.get("intentional") or {})
|
||||
else "unintentional"
|
||||
if capped in (sk.get("unintentional") or [])
|
||||
else ""
|
||||
)
|
||||
with open(os.path.join(run_artifact_dir, "badge.svg"), "w", encoding="utf-8") as f:
|
||||
f.write(card_mod.level_badge_svg(data["level"], data.get("level_cap_reason", "")))
|
||||
f.write(
|
||||
card_mod.level_badge_svg(
|
||||
data["level"], data.get("level_cap_reason", ""), cap_skip
|
||||
)
|
||||
)
|
||||
print(
|
||||
f"summary card {'rendered ' + png if png else '(PNG render unavailable)'} + "
|
||||
f"badge.svg written into {run_artifact_dir}",
|
||||
|
||||
@ -43,11 +43,16 @@ def _traefik_setup(recipe: str, domain: str, version: str) -> None:
|
||||
ssl_cert/ssl_key swarm secrets; NO ACME). Uses the proven abra.env_set (newline-safe, unlike the
|
||||
bash set_env that bit keycloak)."""
|
||||
cert_dir = "/var/lib/ci-certs/live"
|
||||
if not (os.path.isfile(f"{cert_dir}/fullchain.pem") and os.path.isfile(f"{cert_dir}/privkey.pem")):
|
||||
if not (
|
||||
os.path.isfile(f"{cert_dir}/fullchain.pem") and os.path.isfile(f"{cert_dir}/privkey.pem")
|
||||
):
|
||||
raise RuntimeError(f"FATAL: wildcard cert missing at {cert_dir} (sops decrypt broken?)")
|
||||
if not os.path.isfile(env_file(domain)):
|
||||
_run(["abra", "app", "new", recipe, "-s", "default", "-D", domain, version, "-o", "-n"],
|
||||
timeout=120, check=True)
|
||||
_run(
|
||||
["abra", "app", "new", recipe, "-s", "default", "-D", domain, version, "-o", "-n"],
|
||||
timeout=120,
|
||||
check=True,
|
||||
)
|
||||
abra.env_set(domain, "DOMAIN", domain)
|
||||
abra.env_set(domain, "LETS_ENCRYPT_ENV", "")
|
||||
abra.env_set(domain, "WILDCARDS_ENABLED", "1")
|
||||
@ -61,11 +66,39 @@ def _traefik_setup(recipe: str, domain: str, version: str) -> None:
|
||||
return any(s.endswith(f"_{name}_v1") for s in have)
|
||||
|
||||
if not _has("ssl_cert"):
|
||||
_run(["abra", "app", "secret", "insert", domain, "ssl_cert", "v1",
|
||||
f"{cert_dir}/fullchain.pem", "-f", "-n"], timeout=120, check=True)
|
||||
_run(
|
||||
[
|
||||
"abra",
|
||||
"app",
|
||||
"secret",
|
||||
"insert",
|
||||
domain,
|
||||
"ssl_cert",
|
||||
"v1",
|
||||
f"{cert_dir}/fullchain.pem",
|
||||
"-f",
|
||||
"-n",
|
||||
],
|
||||
timeout=120,
|
||||
check=True,
|
||||
)
|
||||
if not _has("ssl_key"):
|
||||
_run(["abra", "app", "secret", "insert", domain, "ssl_key", "v1",
|
||||
f"{cert_dir}/privkey.pem", "-f", "-n"], timeout=120, check=True)
|
||||
_run(
|
||||
[
|
||||
"abra",
|
||||
"app",
|
||||
"secret",
|
||||
"insert",
|
||||
domain,
|
||||
"ssl_key",
|
||||
"v1",
|
||||
f"{cert_dir}/privkey.pem",
|
||||
"-f",
|
||||
"-n",
|
||||
],
|
||||
timeout=120,
|
||||
check=True,
|
||||
)
|
||||
|
||||
|
||||
SPECS: dict[str, dict] = {
|
||||
@ -166,7 +199,13 @@ def _run(cmd, timeout=120, check=False):
|
||||
|
||||
|
||||
def _recipe_dir(recipe: str) -> str:
|
||||
return os.path.expanduser(f"~/.abra/recipes/{recipe}")
|
||||
# Resolve like the abra CLI does: $ABRA_DIR (the per-run tree when imported by a CI run,
|
||||
# e.g. promote_canonical) else the canonical ~/.abra (this module's own systemd-timer runs,
|
||||
# which set no ABRA_DIR). Keeps fetch_recipe (an `abra` subprocess) and the git readers
|
||||
# below pointed at the SAME tree in both contexts.
|
||||
return os.path.join(
|
||||
os.environ.get("ABRA_DIR") or os.path.expanduser("~/.abra"), "recipes", recipe
|
||||
)
|
||||
|
||||
|
||||
def recipe_tags(recipe: str) -> list[str]:
|
||||
@ -218,8 +257,17 @@ def health_code(spec: dict) -> int:
|
||||
domain = spec.get("health_domain", spec["domain"])
|
||||
r = _run(
|
||||
[
|
||||
"curl", "-sk", "-o", "/dev/null", "-w", "%{http_code}", "--max-time", "10",
|
||||
"--resolve", f"{domain}:443:127.0.0.1", f"https://{domain}{spec['health_path']}",
|
||||
"curl",
|
||||
"-sk",
|
||||
"-o",
|
||||
"/dev/null",
|
||||
"-w",
|
||||
"%{http_code}",
|
||||
"--max-time",
|
||||
"10",
|
||||
"--resolve",
|
||||
f"{domain}:443:127.0.0.1",
|
||||
f"https://{domain}{spec['health_path']}",
|
||||
],
|
||||
timeout=20,
|
||||
)
|
||||
@ -230,7 +278,6 @@ def health_code(spec: dict) -> int:
|
||||
|
||||
|
||||
def wait_healthy(spec: dict, timeout: int | None = None) -> bool:
|
||||
domain = spec["domain"]
|
||||
deadline = time.time() + (timeout or spec["health_timeout"])
|
||||
while time.time() < deadline:
|
||||
if health_code(spec) in tuple(spec["health_ok"]):
|
||||
@ -325,15 +372,18 @@ def ensure_server() -> None:
|
||||
|
||||
def ensure_app_config(recipe: str, domain: str, version: str) -> None:
|
||||
if not os.path.isfile(env_file(domain)):
|
||||
_run(["abra", "app", "new", recipe, "-s", "default", "-D", domain, version, "-o", "-n"],
|
||||
timeout=120, check=True)
|
||||
_run(
|
||||
["abra", "app", "new", recipe, "-s", "default", "-D", domain, version, "-o", "-n"],
|
||||
timeout=120,
|
||||
check=True,
|
||||
)
|
||||
abra.env_set(domain, "DOMAIN", domain)
|
||||
abra.env_set(domain, "LETS_ENCRYPT_ENV", "")
|
||||
|
||||
|
||||
def ensure_secrets(domain: str) -> None:
|
||||
stack = lifecycle._stack_name(domain) # noqa: SLF001
|
||||
have = {n for n in lifecycle._docker_names("secret", stack)} # noqa: SLF001
|
||||
have = set(lifecycle._docker_names("secret", stack)) # noqa: SLF001
|
||||
if not any(n.endswith("_admin_password_v1") for n in have):
|
||||
abra.secret_generate(domain)
|
||||
|
||||
@ -393,8 +443,9 @@ def reconcile(app: str) -> str:
|
||||
write_alert(app, "held-major", current=current, latest=latest, release_notes=notes[:4000])
|
||||
return f"held-major:{current}->{latest}"
|
||||
if notes_flag_manual_migration(notes):
|
||||
write_alert(app, "held-manual-migration", current=current, latest=latest,
|
||||
release_notes=notes[:4000])
|
||||
write_alert(
|
||||
app, "held-manual-migration", current=current, latest=latest, release_notes=notes[:4000]
|
||||
)
|
||||
return f"held-manual-migration:{current}->{latest}"
|
||||
|
||||
# WC1.1 health-gated upgrade with rollback.
|
||||
@ -428,8 +479,14 @@ def reconcile(app: str) -> str:
|
||||
warmsnap.restore(recipe, domain)
|
||||
deploy_version(recipe, domain, last_good, dt)
|
||||
recovered = wait_healthy(spec)
|
||||
write_alert(app, "rollback", last_good=last_good, attempted=latest, recovered=recovered,
|
||||
release_notes=notes[:2000])
|
||||
write_alert(
|
||||
app,
|
||||
"rollback",
|
||||
last_good=last_good,
|
||||
attempted=latest,
|
||||
recovered=recovered,
|
||||
release_notes=notes[:2000],
|
||||
)
|
||||
if not recovered:
|
||||
raise RuntimeError(f"{app} rollback to {last_good} did not become healthy")
|
||||
return f"rolled-back:{latest}->{last_good}"
|
||||
|
||||
@ -15,7 +15,8 @@ import shlex
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||
from harness import http as harness_http, lifecycle # noqa: E402
|
||||
from harness import http as harness_http # noqa: E402
|
||||
from harness import lifecycle
|
||||
|
||||
PDS_HOST_LOCAL = "http://localhost:3000"
|
||||
_PW = "ccci-P4-marker-pw-2026"
|
||||
|
||||
@ -27,6 +27,7 @@ CRUD). A wedged PDS subsystem fails AT its layer.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
import re
|
||||
import secrets
|
||||
@ -35,7 +36,8 @@ import sys
|
||||
import uuid
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", "runner"))
|
||||
from harness import http as harness_http, lifecycle # noqa: E402
|
||||
from harness import http as harness_http # noqa: E402
|
||||
from harness import lifecycle
|
||||
|
||||
PDS_HOST_LOCAL = "http://localhost:3000"
|
||||
|
||||
@ -58,14 +60,18 @@ def _goat_admin(domain: str, args: str) -> str:
|
||||
return _in_container(domain, cmd)
|
||||
|
||||
|
||||
def _xrpc_post(domain: str, nsid: str, data: dict, token: str | None = None) -> tuple[int, dict | None]:
|
||||
def _xrpc_post(
|
||||
domain: str, nsid: str, data: dict, token: str | None = None
|
||||
) -> tuple[int, dict | None]:
|
||||
headers = {}
|
||||
if token:
|
||||
headers["Authorization"] = f"Bearer {token}"
|
||||
return harness_http.http_post(f"https://{domain}/xrpc/{nsid}", data=data, headers=headers)
|
||||
|
||||
|
||||
def _xrpc_get(domain: str, nsid: str, query: str, token: str | None = None) -> tuple[int, dict | None]:
|
||||
def _xrpc_get(
|
||||
domain: str, nsid: str, query: str, token: str | None = None
|
||||
) -> tuple[int, dict | None]:
|
||||
headers = {}
|
||||
if token:
|
||||
headers["Authorization"] = f"Bearer {token}"
|
||||
@ -82,9 +88,9 @@ def test_account_lifecycle_and_post_roundtrip(live_app):
|
||||
|
||||
# Step 1: PDS describe via goat — recipe self-identifies as did:web:<domain>
|
||||
out = _in_container(domain, f"goat pds describe {PDS_HOST_LOCAL} 2>&1")
|
||||
assert f"did:web:{domain}" in out, (
|
||||
f"goat pds describe did not contain expected DID 'did:web:{domain}'. Output:\n{out[:500]!r}"
|
||||
)
|
||||
assert (
|
||||
f"did:web:{domain}" in out
|
||||
), f"goat pds describe did not contain expected DID 'did:web:{domain}'. Output:\n{out[:500]!r}"
|
||||
|
||||
# Step 2: Create account (UUID-suffixed handle = no run-to-run collision)
|
||||
out = _goat_admin(
|
||||
@ -127,9 +133,9 @@ def test_account_lifecycle_and_post_roundtrip(live_app):
|
||||
assert s == 200, f"createRecord HTTP {s}: {body!r}"
|
||||
record_uri = (body or {}).get("uri", "")
|
||||
# URI format: at://<did>/app.bsky.feed.post/<rkey>
|
||||
assert record_uri.startswith(f"at://{new_did}/app.bsky.feed.post/"), (
|
||||
f"unexpected record uri: {record_uri!r}"
|
||||
)
|
||||
assert record_uri.startswith(
|
||||
f"at://{new_did}/app.bsky.feed.post/"
|
||||
), f"unexpected record uri: {record_uri!r}"
|
||||
rkey = record_uri.rsplit("/", 1)[-1]
|
||||
assert rkey, f"no rkey in uri: {record_uri!r}"
|
||||
|
||||
@ -142,15 +148,13 @@ def test_account_lifecycle_and_post_roundtrip(live_app):
|
||||
)
|
||||
assert s == 200, f"getRecord HTTP {s}: {body!r}"
|
||||
record_value = (body or {}).get("value", {})
|
||||
assert record_value.get("text") == marker, (
|
||||
f"post text did not round-trip: created={marker!r}, fetched={record_value.get('text')!r}"
|
||||
)
|
||||
assert (
|
||||
record_value.get("text") == marker
|
||||
), f"post text did not round-trip: created={marker!r}, fetched={record_value.get('text')!r}"
|
||||
assert record_value.get("$type") == "app.bsky.feed.post"
|
||||
finally:
|
||||
# Step 6: Best-effort cleanup. (The per-run domain teardown will discard the volume
|
||||
# too, but we exercise the delete-account path because it's part of §4.3.)
|
||||
if cleanup_did:
|
||||
try:
|
||||
with contextlib.suppress(Exception):
|
||||
_goat_admin(domain, f"account delete {cleanup_did}")
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
|
||||
@ -26,6 +26,6 @@ def test_describe_server_returns_atproto_envelope(live_app):
|
||||
# At least one of these atproto-spec fields must be present
|
||||
expected_any = ("availableUserDomains", "inviteCodeRequired", "links", "did")
|
||||
present = [k for k in expected_any if k in body]
|
||||
assert present, (
|
||||
f"describe-server missing all of {expected_any}; got keys: {sorted(body.keys())[:20]}"
|
||||
)
|
||||
assert (
|
||||
present
|
||||
), f"describe-server missing all of {expected_any}; got keys: {sorted(body.keys())[:20]}"
|
||||
|
||||
@ -17,6 +17,6 @@ def test_pds_health_returns_version(live_app):
|
||||
url = f"https://{live_app}/xrpc/_health"
|
||||
status, body = harness_http.retry_http_get(url, expect_status=200, max_wait=60, interval=3)
|
||||
assert status == 200, f"GET {url} HTTP {status} (expected 200)"
|
||||
assert isinstance(body, dict) and isinstance(body.get("version"), str) and body["version"], (
|
||||
f"GET {url} response is not the expected health envelope: {body!r}"
|
||||
)
|
||||
assert (
|
||||
isinstance(body, dict) and isinstance(body.get("version"), str) and body["version"]
|
||||
), f"GET {url} response is not the expected health envelope: {body!r}"
|
||||
|
||||
@ -30,6 +30,6 @@ def test_get_session_requires_auth(live_app):
|
||||
f"body: {body!r}"
|
||||
)
|
||||
# The XRPC error envelope is JSON with an `error` field per the atproto spec.
|
||||
assert isinstance(body, dict) and body.get("error"), (
|
||||
f"expected XRPC JSON error envelope; got: {body!r}"
|
||||
)
|
||||
assert isinstance(body, dict) and body.get(
|
||||
"error"
|
||||
), f"expected XRPC JSON error envelope; got: {body!r}"
|
||||
|
||||
@ -22,12 +22,12 @@ echo " bluesky-pds install_steps: generating secp256k1 PLC rotation key..."
|
||||
# same shape the PDS expects (32-byte hex). Equivalent for atproto PDS bootstrap.
|
||||
KEY_HEX=$(cc-ci-run -c 'import secrets; print(secrets.token_bytes(32).hex())')
|
||||
if [ -z "${KEY_HEX}" ] || [ "${#KEY_HEX}" != "64" ]; then
|
||||
echo " install_steps: failed to generate PLC rotation key (KEY_HEX length=${#KEY_HEX})" >&2
|
||||
exit 1
|
||||
echo " install_steps: failed to generate PLC rotation key (KEY_HEX length=${#KEY_HEX})" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Insert via abra under TTY-wrap (`abra app secret insert` requires a TTY on this version).
|
||||
# We DON'T log the key value — abra also doesn't print it.
|
||||
script -qec "abra app secret insert ${CCCI_APP_DOMAIN} pds_plc_rotation_key v1 ${KEY_HEX} --no-input" /dev/null \
|
||||
>/dev/null 2>&1
|
||||
>/dev/null 2>&1
|
||||
echo " bluesky-pds install_steps: PLC rotation key inserted (v1)."
|
||||
|
||||
@ -11,6 +11,6 @@ import _p4 # noqa: E402
|
||||
|
||||
|
||||
def test_restore_returns_state(live_app):
|
||||
assert _p4.account_exists(live_app), (
|
||||
"restore did not bring back the seeded marker account (PDS data did not survive restore)"
|
||||
)
|
||||
assert _p4.account_exists(
|
||||
live_app
|
||||
), "restore did not bring back the seeded marker account (PDS data did not survive restore)"
|
||||
|
||||
108
tests/concurrency/concutil.py
Normal file
108
tests/concurrency/concutil.py
Normal file
@ -0,0 +1,108 @@
|
||||
"""Shared utilities for the real-kernel concurrency suite (imported by the test modules; the
|
||||
fixtures in conftest.py wrap these). No flock mocking anywhere — probes use real LOCK_NB."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import fcntl
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||
from harness import lifecycle # noqa: E402
|
||||
|
||||
HELPERS = os.path.join(os.path.dirname(__file__), "helpers.py")
|
||||
DOMAIN = "test-abc123.ci.commoninternet.net" # matches RUN_APP_RE
|
||||
|
||||
|
||||
class HelperPool:
|
||||
"""Spawns helpers.py subprocesses and GUARANTEES their cleanup (incl. recorded grandchild
|
||||
pids from `hold-with-child`/`wrapper` markers) — no leaked children in the test VM."""
|
||||
|
||||
def __init__(self, out_dir: str):
|
||||
self.out_dir = out_dir
|
||||
self.procs: list[subprocess.Popen] = []
|
||||
self.extra_pids: list[int] = []
|
||||
self._n = 0
|
||||
|
||||
def spawn(self, *args: str, env_extra: dict | None = None) -> tuple[subprocess.Popen, str]:
|
||||
"""Start `helpers.py <args...>`; returns (proc, marker_file)."""
|
||||
self._n += 1
|
||||
out = os.path.join(self.out_dir, f"helper-{self._n}.out")
|
||||
env = dict(os.environ, CCCI_HELPER_OUT=out, **(env_extra or {}))
|
||||
p = subprocess.Popen( # noqa: S603
|
||||
[sys.executable, HELPERS, *args],
|
||||
env=env,
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.STDOUT,
|
||||
)
|
||||
self.procs.append(p)
|
||||
return p, out
|
||||
|
||||
def track_pid(self, pid: int) -> None:
|
||||
self.extra_pids.append(pid)
|
||||
|
||||
def cleanup(self) -> None:
|
||||
for p in self.procs:
|
||||
if p.poll() is None:
|
||||
p.kill()
|
||||
with contextlib.suppress(subprocess.TimeoutExpired):
|
||||
p.wait(timeout=10)
|
||||
for pid in self.extra_pids:
|
||||
with contextlib.suppress(OSError):
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
|
||||
|
||||
def wait_marker(out: str, token: str, timeout: float = 15.0) -> str | None:
|
||||
"""Poll a helper's marker file for a line containing `token`; returns the line or None."""
|
||||
deadline = time.time() + timeout
|
||||
while time.time() < deadline:
|
||||
try:
|
||||
with open(out) as f:
|
||||
for line in f:
|
||||
if token in line:
|
||||
return line.strip()
|
||||
except OSError:
|
||||
pass
|
||||
time.sleep(0.1)
|
||||
return None
|
||||
|
||||
|
||||
def lock_state(domain: str) -> str:
|
||||
"""'held' | 'free' | 'absent' for the domain's lockfile, probed with a REAL LOCK_NB."""
|
||||
path = lifecycle._app_lock_path(domain) # noqa: SLF001
|
||||
if not os.path.exists(path):
|
||||
return "absent"
|
||||
with open(path, "a") as f:
|
||||
try:
|
||||
fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
|
||||
return "free"
|
||||
except BlockingIOError:
|
||||
return "held"
|
||||
|
||||
|
||||
def wait_lock_state(domain: str, want: str, timeout: float = 10.0) -> str:
|
||||
"""Poll until lock_state(domain) == want (kernel release on process death is fast, but give
|
||||
the scheduler room). Returns the final observed state."""
|
||||
deadline = time.time() + timeout
|
||||
state = lock_state(domain)
|
||||
while state != want and time.time() < deadline:
|
||||
time.sleep(0.1)
|
||||
state = lock_state(domain)
|
||||
return state
|
||||
|
||||
|
||||
def pid_alive(pid: int) -> bool:
|
||||
return os.path.exists(f"/proc/{pid}")
|
||||
|
||||
|
||||
def wait_pid_gone(pid: int, timeout: float = 15.0) -> bool:
|
||||
deadline = time.time() + timeout
|
||||
while time.time() < deadline:
|
||||
if not pid_alive(pid):
|
||||
return True
|
||||
time.sleep(0.1)
|
||||
return False
|
||||
34
tests/concurrency/conftest.py
Normal file
34
tests/concurrency/conftest.py
Normal file
@ -0,0 +1,34 @@
|
||||
"""Fixtures for the real-kernel concurrency suite (concurrency-restructure plan, 19 cases).
|
||||
|
||||
NOT part of the default `pytest tests/unit` gate — run explicitly with `pytest tests/concurrency
|
||||
-q` (docs/concurrency.md). Locks live in a per-test tmp dir (CCCI_APP_LOCK_DIR); helper
|
||||
subprocesses hold REAL flocks / install the REAL prctl+signal guards and are always reaped in
|
||||
fixture finalizers (no leaked children in the test VM).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from concutil import HelperPool # noqa: E402
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lock_dir(tmp_path, monkeypatch):
|
||||
"""Sandbox lock dir, exported so BOTH this process's lifecycle calls and helper subprocesses
|
||||
(which inherit os.environ) resolve their lockfiles here — never /run/lock."""
|
||||
d = tmp_path / "locks"
|
||||
d.mkdir()
|
||||
monkeypatch.setenv("CCCI_APP_LOCK_DIR", str(d))
|
||||
return str(d)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def pool(tmp_path):
|
||||
hp = HelperPool(str(tmp_path))
|
||||
yield hp
|
||||
hp.cleanup()
|
||||
149
tests/concurrency/helpers.py
Normal file
149
tests/concurrency/helpers.py
Normal file
@ -0,0 +1,149 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Subprocess helpers for tests/concurrency — REAL kernel locks and the REAL lifetime guards in
|
||||
separate processes (flock/prctl are never mocked; tests assert on actual kernel behavior).
|
||||
|
||||
Invoked as: python3 helpers.py <command> <args...>
|
||||
|
||||
Env contract (set by the spawning test):
|
||||
CCCI_APP_LOCK_DIR sandbox lock dir (never /run/lock in tests)
|
||||
CCCI_HELPER_OUT marker file this helper APPENDS progress lines to (ACQUIRED/READY/...)
|
||||
|
||||
Commands:
|
||||
hold <domain> acquire the app lock, mark `ACQUIRED <ts>`, sleep forever
|
||||
hold-with-child <domain> acquire the lock, spawn a plain sleeping subprocess child, mark
|
||||
`ACQUIRED <ts>` + `CHILD <pid>` (PEP 446: the child must NOT
|
||||
inherit the lock fd), sleep forever
|
||||
guarded <domain> <deadline> install the REAL lifetime guards (alarm=<deadline>s), acquire the
|
||||
lock, mark `READY`; when the teardown funnel runs (`finally:`),
|
||||
mark `TEARDOWN` before exiting
|
||||
wrapper <domain> spawn `guarded <domain> 3600` as MY child, mark `WRAPPED <pid>`,
|
||||
sleep — the test kills me to prove PDEATHSIG TERMs the child
|
||||
orphan-probe wait (bounded) until reparented (ppid==1), then install the
|
||||
guards; mark `REFUSED` if they exit (expected) or `GUARDS_OK`
|
||||
fetch-checkout <recipe> <ref> run run_recipe_ci.fetch_recipe (the test sets CCCI_SKIP_FETCH=1
|
||||
+ a per-"run" ABRA_DIR), git-checkout <ref>, mark
|
||||
`RESULT <head> <data.txt content>`
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "runner"))
|
||||
from harness import abra, lifecycle, lifetime # noqa: E402
|
||||
|
||||
OUT = os.environ.get("CCCI_HELPER_OUT")
|
||||
|
||||
|
||||
def mark(line: str) -> None:
|
||||
if OUT:
|
||||
with open(OUT, "a") as f:
|
||||
f.write(line + "\n")
|
||||
f.flush()
|
||||
print(line, flush=True)
|
||||
|
||||
|
||||
def cmd_hold(domain: str) -> None:
|
||||
lifecycle.acquire_app_lock(domain)
|
||||
mark(f"ACQUIRED {time.time()}")
|
||||
time.sleep(3600)
|
||||
|
||||
|
||||
def cmd_hold_with_child(domain: str) -> None:
|
||||
lifecycle.acquire_app_lock(domain)
|
||||
child = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(3600)"])
|
||||
mark(f"ACQUIRED {time.time()}")
|
||||
mark(f"CHILD {child.pid}")
|
||||
time.sleep(3600)
|
||||
|
||||
|
||||
def cmd_guarded(domain: str, deadline: str) -> None:
|
||||
lifetime.install_lifetime_guards(deadline_seconds=int(deadline))
|
||||
lifecycle.acquire_app_lock(domain)
|
||||
mark("READY")
|
||||
try:
|
||||
time.sleep(3600)
|
||||
finally:
|
||||
mark("TEARDOWN")
|
||||
|
||||
|
||||
def cmd_wrapper(domain: str) -> None:
|
||||
p = subprocess.Popen( # noqa: S603
|
||||
[sys.executable, os.path.abspath(__file__), "guarded", domain, "3600"],
|
||||
env=os.environ.copy(),
|
||||
)
|
||||
mark(f"WRAPPED {p.pid}")
|
||||
time.sleep(3600)
|
||||
|
||||
|
||||
def cmd_orphan_probe() -> None:
|
||||
# Our spawner exits immediately after fork; wait (bounded) until we are reparented so the
|
||||
# prctl is installed with the parent ALREADY dead — the exact race the ppid check closes.
|
||||
for _ in range(200):
|
||||
if os.getppid() == 1:
|
||||
break
|
||||
time.sleep(0.05)
|
||||
else:
|
||||
mark("NEVER_REPARENTED") # e.g. a subreaper environment — test will fail visibly
|
||||
return
|
||||
try:
|
||||
lifetime.install_lifetime_guards()
|
||||
except SystemExit:
|
||||
mark("REFUSED")
|
||||
raise
|
||||
mark("GUARDS_OK")
|
||||
|
||||
|
||||
def cmd_fetch_checkout(recipe: str, ref: str) -> None:
|
||||
import run_recipe_ci
|
||||
|
||||
run_recipe_ci.fetch_recipe(recipe, None, None)
|
||||
abra.recipe_checkout(recipe, ref)
|
||||
head = abra.recipe_head_commit(recipe)
|
||||
with open(os.path.join(abra.recipe_dir(recipe), "data.txt")) as f:
|
||||
content = f.read().strip()
|
||||
mark(f"RESULT {head} {content}")
|
||||
|
||||
|
||||
def cmd_deploy_count_run(domain: str, gate: str) -> None:
|
||||
"""Mirror the REAL run flow for the DG4.1 counter (CONC-A1 regression): countfile init
|
||||
(main() preamble) → _record_deploy (deploy_app fires it BEFORE the app lock) → acquire
|
||||
the app lock → wait for `gate` (file path; '' = no wait) → read + remove own countfile.
|
||||
Two of these on the SAME domain must each see COUNT 1 and never lose their file."""
|
||||
import run_recipe_ci
|
||||
|
||||
countfile = run_recipe_ci._run_state_path("deploys")
|
||||
with open(countfile, "w") as f:
|
||||
f.write("0")
|
||||
os.environ["CCCI_DEPLOY_COUNT_FILE"] = countfile
|
||||
lifecycle._record_deploy() # pre-lock, exactly like lifecycle.deploy_app()
|
||||
mark("PRELOCK")
|
||||
lifecycle.acquire_app_lock(domain)
|
||||
mark("ACQUIRED")
|
||||
if gate:
|
||||
deadline = time.time() + 15
|
||||
while not os.path.exists(gate) and time.time() < deadline:
|
||||
time.sleep(0.05)
|
||||
try:
|
||||
with open(countfile) as f:
|
||||
n = int(f.read().strip() or "0")
|
||||
os.remove(countfile)
|
||||
mark(f"COUNT {n}")
|
||||
except FileNotFoundError:
|
||||
mark("COUNT_FILE_MISSING")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cmd, *args = sys.argv[1:]
|
||||
{
|
||||
"hold": cmd_hold,
|
||||
"hold-with-child": cmd_hold_with_child,
|
||||
"guarded": cmd_guarded,
|
||||
"wrapper": cmd_wrapper,
|
||||
"orphan-probe": cmd_orphan_probe,
|
||||
"fetch-checkout": cmd_fetch_checkout,
|
||||
"deploy-count-run": cmd_deploy_count_run,
|
||||
}[cmd](*args)
|
||||
175
tests/concurrency/test_abra_dir.py
Normal file
175
tests/concurrency/test_abra_dir.py
Normal file
@ -0,0 +1,175 @@
|
||||
"""Per-run ABRA_DIR isolation (concurrency-restructure plan, cases 17-19). Real directories,
|
||||
real symlinks, real git — abra itself is replaced by a recording stub where a CLI call is
|
||||
involved (case 17), because these cases test OUR dir/env plumbing, not abra."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import stat
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||
import run_recipe_ci # noqa: E402
|
||||
from concutil import wait_marker # noqa: E402
|
||||
from harness import abra # noqa: E402
|
||||
|
||||
RECIPE = "fakerecipe"
|
||||
|
||||
|
||||
def _git(cwd, *args):
|
||||
subprocess.run(
|
||||
["git", "-c", "user.email=t@t", "-c", "user.name=t", *args],
|
||||
cwd=cwd,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
)
|
||||
|
||||
|
||||
def _make_fake_home(tmp_path):
|
||||
"""A fake $HOME with a canonical ~/.abra: servers/default + catalogue dirs, and a recipe git
|
||||
repo with two tags whose data.txt differs (v1 -> 'one', v2 -> 'two', HEAD at v2)."""
|
||||
home = tmp_path / "home"
|
||||
(home / ".abra" / "servers" / "default").mkdir(parents=True)
|
||||
(home / ".abra" / "catalogue").mkdir(parents=True)
|
||||
repo = home / ".abra" / "recipes" / RECIPE
|
||||
repo.mkdir(parents=True)
|
||||
_git(repo, "init", "-q")
|
||||
(repo / "data.txt").write_text("one\n")
|
||||
_git(repo, "add", "data.txt")
|
||||
_git(repo, "commit", "-qm", "v1")
|
||||
_git(repo, "tag", "v1")
|
||||
(repo / "data.txt").write_text("two\n")
|
||||
_git(repo, "add", "data.txt")
|
||||
_git(repo, "commit", "-qm", "v2")
|
||||
_git(repo, "tag", "v2")
|
||||
return home
|
||||
|
||||
|
||||
def test_17_per_run_dir_built_and_exported_before_abra(tmp_path, monkeypatch):
|
||||
"""Case 17: setup_run_abra_dir builds the per-run dir correctly (servers/catalogue symlinks
|
||||
resolve to the canonical tree, recipes/ empty + writable) and $ABRA_DIR is exported before
|
||||
the first abra call — proven by a stub `abra` on PATH that records the env it saw."""
|
||||
home = _make_fake_home(tmp_path)
|
||||
monkeypatch.setenv("HOME", str(home))
|
||||
monkeypatch.setenv("CCCI_RUNS_DIR", str(tmp_path / "runs"))
|
||||
monkeypatch.setenv("DRONE_BUILD_NUMBER", "777")
|
||||
monkeypatch.setenv("ABRA_DIR", "sentinel-to-be-overwritten") # so monkeypatch restores it
|
||||
|
||||
d = run_recipe_ci.setup_run_abra_dir()
|
||||
assert d == str(tmp_path / "runs" / "777" / "abra")
|
||||
assert os.environ["ABRA_DIR"] == d
|
||||
assert os.readlink(os.path.join(d, "servers")) == str(home / ".abra" / "servers")
|
||||
assert os.readlink(os.path.join(d, "catalogue")) == str(home / ".abra" / "catalogue")
|
||||
# symlinks RESOLVE (targets exist) and recipes/ is empty + writable
|
||||
assert os.path.isdir(os.path.join(d, "servers", "default"))
|
||||
assert os.path.isdir(os.path.join(d, "catalogue"))
|
||||
assert os.listdir(os.path.join(d, "recipes")) == []
|
||||
probe = os.path.join(d, "recipes", ".write-probe")
|
||||
open(probe, "w").close()
|
||||
os.remove(probe)
|
||||
# idempotent re-entry (Drone build-number retry): must not raise on existing symlinks
|
||||
assert run_recipe_ci.setup_run_abra_dir() == d
|
||||
|
||||
# stub abra records $ABRA_DIR at call time; fetch_recipe's catalogue branch invokes it
|
||||
stub_dir = tmp_path / "bin"
|
||||
stub_dir.mkdir()
|
||||
log = tmp_path / "abra-env.log"
|
||||
stub = stub_dir / "abra"
|
||||
stub.write_text(f'#!/bin/sh\necho "$ABRA_DIR" >> {log}\nexit 0\n')
|
||||
stub.chmod(stub.stat().st_mode | stat.S_IEXEC)
|
||||
monkeypatch.setenv("PATH", f"{stub_dir}{os.pathsep}{os.environ['PATH']}")
|
||||
monkeypatch.delenv("CCCI_SKIP_FETCH", raising=False)
|
||||
run_recipe_ci.fetch_recipe(RECIPE, None, None)
|
||||
assert log.read_text().strip() == d, "abra was called without the per-run ABRA_DIR exported"
|
||||
|
||||
|
||||
def test_18_concurrent_same_recipe_fetch_no_cross_talk(tmp_path, monkeypatch, pool):
|
||||
"""Case 18: two CONCURRENT fetch+checkout flows of the SAME recipe into different ABRA_DIRs
|
||||
produce two correct, divergent trees (v1 vs v2) — the old shared-tree corruption scenario,
|
||||
now structurally safe with no lock. The canonical staged clone is untouched."""
|
||||
home = _make_fake_home(tmp_path)
|
||||
canonical_repo = home / ".abra" / "recipes" / RECIPE
|
||||
head_before = subprocess.run(
|
||||
["git", "-C", canonical_repo, "rev-parse", "HEAD"], capture_output=True, text=True
|
||||
).stdout.strip()
|
||||
|
||||
runs = {}
|
||||
for name, ref in (("runA", "v1"), ("runB", "v2")):
|
||||
abra_dir = tmp_path / name / "abra"
|
||||
abra_dir.mkdir(parents=True)
|
||||
_, out = pool.spawn(
|
||||
"fetch-checkout",
|
||||
RECIPE,
|
||||
ref,
|
||||
env_extra={
|
||||
"HOME": str(home),
|
||||
"ABRA_DIR": str(abra_dir),
|
||||
"CCCI_SKIP_FETCH": "1",
|
||||
},
|
||||
)
|
||||
runs[name] = (out, ref, abra_dir)
|
||||
|
||||
expect = {"v1": "one", "v2": "two"}
|
||||
for name, (out, ref, abra_dir) in runs.items():
|
||||
line = wait_marker(out, "RESULT", timeout=30)
|
||||
assert line, f"{name} never produced a RESULT"
|
||||
_, head, content = line.split()
|
||||
assert content == expect[ref], f"{name}@{ref}: tree content {content!r}"
|
||||
tree = abra_dir / "recipes" / RECIPE
|
||||
assert (tree / "data.txt").read_text().strip() == expect[ref]
|
||||
assert (
|
||||
head
|
||||
== subprocess.run(
|
||||
["git", "-C", tree, "rev-parse", "HEAD"], capture_output=True, text=True
|
||||
).stdout.strip()
|
||||
)
|
||||
|
||||
# the two trees genuinely diverge AND the canonical staged clone is untouched
|
||||
a = (runs["runA"][2] / "recipes" / RECIPE / "data.txt").read_text()
|
||||
b = (runs["runB"][2] / "recipes" / RECIPE / "data.txt").read_text()
|
||||
assert a != b
|
||||
head_after = subprocess.run(
|
||||
["git", "-C", canonical_repo, "rev-parse", "HEAD"], capture_output=True, text=True
|
||||
).stdout.strip()
|
||||
assert head_after == head_before, "canonical clone must not be touched by per-run fetches"
|
||||
|
||||
|
||||
def test_19_env_written_through_servers_symlink_lands_canonical(tmp_path, monkeypatch):
|
||||
"""Case 19: an app .env written through the per-run servers/ symlink (what abra does under
|
||||
$ABRA_DIR) lands in the CANONICAL shared path — so janitor discovery and every
|
||||
expanduser('~/.abra/servers/...') reader keep working unchanged."""
|
||||
home = _make_fake_home(tmp_path)
|
||||
monkeypatch.setenv("HOME", str(home))
|
||||
monkeypatch.setenv("CCCI_RUNS_DIR", str(tmp_path / "runs"))
|
||||
monkeypatch.setenv("DRONE_BUILD_NUMBER", "778")
|
||||
monkeypatch.setenv("ABRA_DIR", "sentinel-to-be-overwritten")
|
||||
d = run_recipe_ci.setup_run_abra_dir()
|
||||
|
||||
domain = "test-abc123.ci.commoninternet.net"
|
||||
via_symlink = os.path.join(d, "servers", "default", f"{domain}.env")
|
||||
with open(via_symlink, "w") as f:
|
||||
f.write("TYPE=fakerecipe:1.0.0\nDOMAIN=placeholder\n")
|
||||
|
||||
canonical = home / ".abra" / "servers" / "default" / f"{domain}.env"
|
||||
assert canonical.is_file(), ".env written via the symlink must land in the canonical path"
|
||||
# the canonical-path readers/writers (abra.env_get/env_set use ~/.abra) see the same file
|
||||
assert abra.env_get(domain, "TYPE") == "fakerecipe:1.0.0"
|
||||
abra.env_set(domain, "DOMAIN", domain)
|
||||
with open(via_symlink) as f:
|
||||
assert f"DOMAIN={domain}" in f.read()
|
||||
|
||||
|
||||
def test_18b_run_id_manual_fallback_is_per_process(tmp_path, monkeypatch):
|
||||
"""Companion to case 18: two concurrent MANUAL runs (no DRONE_BUILD_NUMBER) must not share an
|
||||
abra dir either — the manual fallback is pid-suffixed."""
|
||||
home = _make_fake_home(tmp_path)
|
||||
monkeypatch.setenv("HOME", str(home))
|
||||
monkeypatch.setenv("CCCI_RUNS_DIR", str(tmp_path / "runs"))
|
||||
monkeypatch.delenv("DRONE_BUILD_NUMBER", raising=False)
|
||||
monkeypatch.delenv("CCCI_APP_DOMAIN", raising=False)
|
||||
monkeypatch.delenv("CCCI_RUN_ID", raising=False)
|
||||
monkeypatch.setenv("ABRA_DIR", "sentinel-to-be-overwritten")
|
||||
d = run_recipe_ci.setup_run_abra_dir()
|
||||
assert f"manual-{os.getpid()}" in d
|
||||
189
tests/concurrency/test_janitor.py
Normal file
189
tests/concurrency/test_janitor.py
Normal file
@ -0,0 +1,189 @@
|
||||
"""Janitor / flock-probe semantics (concurrency-restructure plan, cases 5-12).
|
||||
|
||||
The janitor runs IN-PROCESS with its discovery monkeypatched (candidates injected via a stubbed
|
||||
abra.app_ls + empty docker sweep) and teardown_app stubbed to record calls — but the LOCKS are
|
||||
real kernel flocks, held by real helper subprocesses where a live owner is needed."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||
from concutil import DOMAIN, lock_state, wait_marker # noqa: E402
|
||||
from harness import lifecycle # noqa: E402
|
||||
|
||||
|
||||
def _inject_candidates(monkeypatch, domains):
|
||||
"""Point janitor discovery at exactly `domains`: abra lists them, docker sweep is empty.
|
||||
teardown_app is stubbed to a recorder; returns the calls list."""
|
||||
calls = []
|
||||
monkeypatch.setattr(lifecycle.abra, "app_ls", lambda: [{"appName": d} for d in domains])
|
||||
monkeypatch.setattr(lifecycle, "_docker_names", lambda kind, stack: [])
|
||||
monkeypatch.setattr(lifecycle, "teardown_app", lambda d, verify=True: calls.append(d))
|
||||
return calls
|
||||
|
||||
|
||||
def test_5_orphan_reaped_lockfile_unlinked(lock_dir, pool, monkeypatch):
|
||||
"""Case 5: an orphan (lockfile exists, no holder — its run was SIGKILL'd) is reaped exactly
|
||||
once and its lockfile unlinked."""
|
||||
p, out = pool.spawn("hold", DOMAIN)
|
||||
assert wait_marker(out, "ACQUIRED")
|
||||
p.kill()
|
||||
p.wait(timeout=10)
|
||||
calls = _inject_candidates(monkeypatch, [DOMAIN])
|
||||
lifecycle.janitor()
|
||||
assert calls == [DOMAIN], f"teardown calls: {calls} (expected exactly one)"
|
||||
assert lock_state(DOMAIN) == "absent", "reaped orphan's lockfile must be unlinked"
|
||||
|
||||
|
||||
def test_6_live_run_never_reaped(lock_dir, pool, monkeypatch, capsys):
|
||||
"""Case 6: a held lock (live helper) is never reaped and is logged as live."""
|
||||
p, out = pool.spawn("hold", DOMAIN)
|
||||
assert wait_marker(out, "ACQUIRED")
|
||||
calls = _inject_candidates(monkeypatch, [DOMAIN])
|
||||
lifecycle.janitor()
|
||||
assert calls == []
|
||||
assert "live concurrent run" in capsys.readouterr().out
|
||||
assert lock_state(DOMAIN) == "held"
|
||||
|
||||
|
||||
def test_7_new_run_blocks_until_reap_finishes(lock_dir, pool, monkeypatch):
|
||||
"""Case 7: the janitor reaps WHILE HOLDING the probe lock, so a new run of the same domain
|
||||
blocks in acquire_app_lock until the reap completes — no window where a fresh app coexists
|
||||
with a half-reaped one."""
|
||||
# Make an orphan.
|
||||
p, out = pool.spawn("hold", DOMAIN)
|
||||
assert wait_marker(out, "ACQUIRED")
|
||||
p.kill()
|
||||
p.wait(timeout=10)
|
||||
|
||||
state = {"teardown_end": None, "acquirer_out": None}
|
||||
|
||||
def slow_teardown(domain, verify=True):
|
||||
# While the janitor holds the probe lock mid-reap, a new run starts acquiring.
|
||||
_, aout = pool.spawn("hold", DOMAIN)
|
||||
state["acquirer_out"] = aout
|
||||
time.sleep(2.0)
|
||||
state["teardown_end"] = time.time()
|
||||
|
||||
monkeypatch.setattr(lifecycle.abra, "app_ls", lambda: [{"appName": DOMAIN}])
|
||||
monkeypatch.setattr(lifecycle, "_docker_names", lambda kind, stack: [])
|
||||
monkeypatch.setattr(lifecycle, "teardown_app", slow_teardown)
|
||||
lifecycle.janitor()
|
||||
|
||||
line = wait_marker(state["acquirer_out"], "ACQUIRED", timeout=15)
|
||||
assert line, "new run never acquired after the reap"
|
||||
acquired_ts = float(line.split()[1])
|
||||
assert (
|
||||
acquired_ts >= state["teardown_end"]
|
||||
), f"new run acquired at {acquired_ts} BEFORE the reap finished at {state['teardown_end']}"
|
||||
# The new run must hold a lock the next probe can SEE (fresh inode at the path).
|
||||
assert lock_state(DOMAIN) == "held"
|
||||
|
||||
|
||||
def test_8_two_janitors_exactly_one_reaps(lock_dir, pool, monkeypatch):
|
||||
"""Case 8: two concurrent janitors arbitrate on the probe flock — exactly one reaps (the
|
||||
other sees 'held' and leaves). Teardown is slowed so the runs genuinely overlap."""
|
||||
p, out = pool.spawn("hold", DOMAIN)
|
||||
assert wait_marker(out, "ACQUIRED")
|
||||
p.kill()
|
||||
p.wait(timeout=10)
|
||||
|
||||
calls = []
|
||||
calls_lock = threading.Lock()
|
||||
|
||||
def slow_teardown(domain, verify=True):
|
||||
with calls_lock:
|
||||
calls.append(domain)
|
||||
time.sleep(2.0)
|
||||
|
||||
monkeypatch.setattr(lifecycle.abra, "app_ls", lambda: [{"appName": DOMAIN}])
|
||||
monkeypatch.setattr(lifecycle, "_docker_names", lambda kind, stack: [])
|
||||
monkeypatch.setattr(lifecycle, "teardown_app", slow_teardown)
|
||||
|
||||
barrier = threading.Barrier(2)
|
||||
|
||||
def run_janitor():
|
||||
barrier.wait()
|
||||
lifecycle.janitor()
|
||||
|
||||
t1, t2 = threading.Thread(target=run_janitor), threading.Thread(target=run_janitor)
|
||||
t1.start(), t2.start()
|
||||
t1.join(timeout=30), t2.join(timeout=30)
|
||||
assert calls == [DOMAIN], f"expected exactly one reap, got {calls}"
|
||||
assert lock_state(DOMAIN) == "absent"
|
||||
|
||||
|
||||
def test_9_reboot_lockfile_absent_reaped_immediately(lock_dir, monkeypatch):
|
||||
"""Case 9: post-reboot simulation — the app exists but its lockfile is gone (/run/lock is
|
||||
tmpfs). The probe trivially acquires -> immediate reap, NO age threshold (improvement over
|
||||
the old 2h fallback)."""
|
||||
assert lock_state(DOMAIN) == "absent"
|
||||
calls = _inject_candidates(monkeypatch, [DOMAIN])
|
||||
t0 = time.time()
|
||||
lifecycle.janitor()
|
||||
assert calls == [DOMAIN]
|
||||
assert time.time() - t0 < 5, "reap must be immediate (no age wait)"
|
||||
|
||||
|
||||
def test_10_long_held_lock_flagged_never_stolen(lock_dir, pool, monkeypatch, capsys):
|
||||
"""Case 10: a lock held with mtime older than 120min is flagged as a possible leaked run —
|
||||
and NOT reaped (never steal a held lock)."""
|
||||
p, out = pool.spawn("hold", DOMAIN)
|
||||
assert wait_marker(out, "ACQUIRED")
|
||||
path = lifecycle._app_lock_path(DOMAIN) # noqa: SLF001
|
||||
backdate = time.time() - (130 * 60)
|
||||
os.utime(path, (backdate, backdate))
|
||||
calls = _inject_candidates(monkeypatch, [DOMAIN])
|
||||
lifecycle.janitor()
|
||||
assert calls == []
|
||||
out_text = capsys.readouterr().out
|
||||
assert "possible leaked run" in out_text and "lslocks" in out_text
|
||||
assert lock_state(DOMAIN) == "held"
|
||||
|
||||
|
||||
def test_11_warm_canonical_names_never_probed(lock_dir, monkeypatch):
|
||||
"""Case 11: RUN_APP_RE allowlist — warm/canonical-shaped names never become candidates, so
|
||||
they are never probed (no lockfile is even created for them) and never reaped."""
|
||||
warmish = [
|
||||
"warm-keycloak.ci.commoninternet.net",
|
||||
"keycloak.ci.commoninternet.net",
|
||||
"warm-hedgedoc.ci.commoninternet.net",
|
||||
"drone.ci.commoninternet.net",
|
||||
]
|
||||
calls = []
|
||||
monkeypatch.setattr(lifecycle.abra, "app_ls", lambda: [{"appName": d} for d in warmish])
|
||||
monkeypatch.setattr(
|
||||
lifecycle,
|
||||
"_docker_names",
|
||||
lambda kind, stack: ["warm-keycloak_ci_commoninternet_net_app"]
|
||||
if kind == "service"
|
||||
else [],
|
||||
)
|
||||
monkeypatch.setattr(lifecycle, "teardown_app", lambda d, verify=True: calls.append(d))
|
||||
lifecycle.janitor()
|
||||
assert calls == []
|
||||
lockdir = os.environ["CCCI_APP_LOCK_DIR"]
|
||||
assert [
|
||||
f for f in os.listdir(lockdir) if f.startswith("cc-ci-app-")
|
||||
] == [], "janitor must not create lockfiles for non-run-app names"
|
||||
|
||||
|
||||
def test_12_degrades_safely_on_bad_lockfile_and_missing_dir(lock_dir, monkeypatch, capsys):
|
||||
"""Case 12: a garbled/unopenable lockfile (here: a DIRECTORY at the lockfile path) is skipped
|
||||
with a log line; a missing lock dir doesn't crash the janitor either. Never a crash."""
|
||||
path = lifecycle._app_lock_path(DOMAIN) # noqa: SLF001
|
||||
os.makedirs(path) # open(path, "a") -> IsADirectoryError (an OSError)
|
||||
calls = _inject_candidates(monkeypatch, [DOMAIN])
|
||||
lifecycle.janitor() # must not raise
|
||||
assert calls == []
|
||||
assert "skipping" in capsys.readouterr().out
|
||||
|
||||
os.rmdir(path)
|
||||
monkeypatch.setenv("CCCI_APP_LOCK_DIR", os.path.join(os.environ["CCCI_APP_LOCK_DIR"], "gone"))
|
||||
lifecycle.janitor() # missing dir: probe open fails -> skip; tidy glob -> empty. No crash.
|
||||
assert calls == []
|
||||
82
tests/concurrency/test_lifetime.py
Normal file
82
tests/concurrency/test_lifetime.py
Normal file
@ -0,0 +1,82 @@
|
||||
"""Lifetime hardening (concurrency-restructure plan, cases 13-16): the REAL prctl/signal/alarm
|
||||
guards installed by helper subprocesses; tests assert teardown ran, exit was non-zero, and the
|
||||
lock was released."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||
from concutil import ( # noqa: E402
|
||||
DOMAIN,
|
||||
wait_lock_state,
|
||||
wait_marker,
|
||||
wait_pid_gone,
|
||||
)
|
||||
|
||||
|
||||
def test_13_pdeathsig_parent_kill_terms_harness(lock_dir, pool):
|
||||
"""Case 13: wrapper-parent spawns a guarded harness-child; the parent is SIGKILL'd (the
|
||||
harness gets no courtesy signal) -> the kernel's PDEATHSIG TERMs the child, its teardown
|
||||
funnel runs, it exits, and the lock is released."""
|
||||
p, out = pool.spawn("wrapper", DOMAIN)
|
||||
line = wait_marker(out, "WRAPPED")
|
||||
assert line, "wrapper never spawned its child"
|
||||
child_pid = int(line.split()[1])
|
||||
pool.track_pid(child_pid)
|
||||
assert wait_marker(out, "READY"), "guarded child never got ready"
|
||||
|
||||
p.kill() # parent dies WITHOUT signalling the child — only PDEATHSIG can save us
|
||||
p.wait(timeout=10)
|
||||
assert wait_pid_gone(child_pid), "guarded child must exit on parent death (PDEATHSIG)"
|
||||
assert wait_marker(out, "TEARDOWN", timeout=5), "teardown funnel did not run"
|
||||
assert wait_lock_state(DOMAIN, "free") == "free"
|
||||
|
||||
|
||||
def test_14_already_orphaned_helper_refuses_to_run(lock_dir, pool):
|
||||
"""Case 14 (ppid race): a helper whose parent died BEFORE the prctl was armed (it starts
|
||||
already reparented to pid 1) must refuse to run — PDEATHSIG would never fire for it."""
|
||||
# Spawn an intermediate parent that forks orphan-probe and exits immediately.
|
||||
import subprocess
|
||||
|
||||
out = os.path.join(pool.out_dir, "orphan.out")
|
||||
intermediate = (
|
||||
"import subprocess, sys, os; "
|
||||
"subprocess.Popen([sys.executable, os.environ['CCCI_HELPERS'], 'orphan-probe']); "
|
||||
)
|
||||
env = dict(
|
||||
os.environ,
|
||||
CCCI_HELPER_OUT=out,
|
||||
CCCI_HELPERS=os.path.join(os.path.dirname(__file__), "helpers.py"),
|
||||
)
|
||||
subprocess.run([sys.executable, "-c", intermediate], env=env, timeout=15, check=True)
|
||||
line = wait_marker(out, "REFUSED", timeout=20)
|
||||
assert line, "orphaned helper did not refuse to run (or never reparented to pid 1)"
|
||||
|
||||
|
||||
def test_15_deadline_alarm_fires_teardown_and_releases(lock_dir, pool):
|
||||
"""Case 15: the self-deadline (alarm). A guarded helper with a 2s deadline tears down via
|
||||
the funnel (finally: ran), exits NON-zero, and its lock is released."""
|
||||
p, out = pool.spawn("guarded", DOMAIN, "2")
|
||||
assert wait_marker(out, "READY")
|
||||
rc = p.wait(timeout=20)
|
||||
assert rc != 0, f"deadline exit must be non-zero (got {rc})"
|
||||
assert rc == 128 + signal.SIGALRM, f"expected 142 (128+SIGALRM), got {rc}"
|
||||
assert wait_marker(out, "TEARDOWN", timeout=5), "teardown funnel did not run on deadline"
|
||||
assert wait_lock_state(DOMAIN, "free") == "free"
|
||||
|
||||
|
||||
def test_16_sigterm_runs_teardown_funnel_and_releases(lock_dir, pool):
|
||||
"""Case 16: SIGTERM (drone cancel path) -> the finally: teardown funnel runs, exit is
|
||||
non-zero, lock released."""
|
||||
p, out = pool.spawn("guarded", DOMAIN, "3600")
|
||||
assert wait_marker(out, "READY")
|
||||
p.send_signal(signal.SIGTERM)
|
||||
rc = p.wait(timeout=20)
|
||||
assert rc != 0, f"SIGTERM exit must be non-zero (got {rc})"
|
||||
assert rc == 128 + signal.SIGTERM, f"expected 143 (128+SIGTERM), got {rc}"
|
||||
assert wait_marker(out, "TEARDOWN", timeout=5), "teardown funnel did not run on SIGTERM"
|
||||
assert wait_lock_state(DOMAIN, "free") == "free"
|
||||
85
tests/concurrency/test_locks.py
Normal file
85
tests/concurrency/test_locks.py
Normal file
@ -0,0 +1,85 @@
|
||||
"""Lock fundamentals (concurrency-restructure plan, cases 1-4). Real kernel flocks held by real
|
||||
subprocesses — nothing mocked."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import fcntl
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||
from concutil import ( # noqa: E402
|
||||
DOMAIN,
|
||||
lock_state,
|
||||
wait_lock_state,
|
||||
wait_marker,
|
||||
)
|
||||
from harness import lifecycle # noqa: E402
|
||||
|
||||
|
||||
def test_1_sigkill_releases_lock(lock_dir, pool):
|
||||
"""Case 1: acquire -> holder SIGKILL'd -> lock immediately acquirable (kernel auto-release).
|
||||
The exact property the old pidfile registry approximated with /proc checks."""
|
||||
p, out = pool.spawn("hold", DOMAIN)
|
||||
assert wait_marker(out, "ACQUIRED"), "holder never acquired"
|
||||
assert lock_state(DOMAIN) == "held"
|
||||
p.kill()
|
||||
p.wait(timeout=10)
|
||||
assert wait_lock_state(DOMAIN, "free") == "free"
|
||||
|
||||
|
||||
def test_2_nb_probe_held_vs_unheld(lock_dir, pool):
|
||||
"""Case 2: LOCK_NB probe raises BlockingIOError against a held lock; succeeds when unheld."""
|
||||
p, out = pool.spawn("hold", DOMAIN)
|
||||
assert wait_marker(out, "ACQUIRED")
|
||||
path = lifecycle._app_lock_path(DOMAIN) # noqa: SLF001
|
||||
with open(path, "a") as f:
|
||||
try:
|
||||
fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
|
||||
raise AssertionError("LOCK_NB succeeded against a held lock")
|
||||
except BlockingIOError:
|
||||
pass
|
||||
p.kill()
|
||||
p.wait(timeout=10)
|
||||
assert wait_lock_state(DOMAIN, "free") == "free"
|
||||
with open(path, "a") as f:
|
||||
fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) # must not raise now
|
||||
|
||||
|
||||
def test_3_lock_fd_not_inherited_by_children(lock_dir, pool):
|
||||
"""Case 3 (PEP 446): the holder spawns a subprocess child, the holder dies, the child lives —
|
||||
and the lock is STILL released (the child never inherited the lock fd). This is what makes
|
||||
'held lock == live HARNESS owner' sound even though runs spawn abra/docker/pytest children."""
|
||||
p, out = pool.spawn("hold-with-child", DOMAIN)
|
||||
assert wait_marker(out, "ACQUIRED")
|
||||
child_line = wait_marker(out, "CHILD")
|
||||
assert child_line, "holder never reported its child pid"
|
||||
child_pid = int(child_line.split()[1])
|
||||
pool.track_pid(child_pid)
|
||||
p.kill()
|
||||
p.wait(timeout=10)
|
||||
assert os.path.exists(f"/proc/{child_pid}"), "child should outlive the holder"
|
||||
assert (
|
||||
wait_lock_state(DOMAIN, "free") == "free"
|
||||
), "lock must release on holder death even with a live child (PEP 446 non-inheritable fd)"
|
||||
|
||||
|
||||
def test_4_second_acquire_blocks_until_first_exits(lock_dir, pool):
|
||||
"""Case 4: a second same-domain acquire blocks until the first holder exits — the
|
||||
double-!testme serialisation property."""
|
||||
p1, out1 = pool.spawn("hold", DOMAIN)
|
||||
assert wait_marker(out1, "ACQUIRED")
|
||||
p2, out2 = pool.spawn("hold", DOMAIN)
|
||||
# p2 must NOT acquire while p1 holds.
|
||||
time.sleep(1.5)
|
||||
assert wait_marker(out2, "ACQUIRED", timeout=0.1) is None, "second acquire did not block"
|
||||
t_kill = time.time()
|
||||
p1.kill()
|
||||
p1.wait(timeout=10)
|
||||
line = wait_marker(out2, "ACQUIRED", timeout=15)
|
||||
assert line, "second acquire never completed after first holder exited"
|
||||
acquired_ts = float(line.split()[1])
|
||||
assert acquired_ts >= t_kill - 0.05, "second holder acquired before the first exited"
|
||||
assert lock_state(DOMAIN) == "held"
|
||||
79
tests/concurrency/test_run_state.py
Normal file
79
tests/concurrency/test_run_state.py
Normal file
@ -0,0 +1,79 @@
|
||||
"""Run-scoped state files — M2(c) live-verify regression (not one of the 19 plan cases).
|
||||
|
||||
The four CCCI state files (deploys countfile, opstate, deps, depskip) must be keyed by
|
||||
run id + harness pid, NEVER by app domain: a second run of the SAME domain executes its
|
||||
main() preamble (state-file init, deploy_app's _record_deploy) BEFORE it blocks at the
|
||||
app lock, so domain-keyed files in the shared tempdir get reset/removed underneath the
|
||||
live first run. Observed live (builds 279/281): false DG4.1 deploy-count=2 in run 1,
|
||||
countfile FileNotFoundError crash in run 2. Children never re-derive these paths — they
|
||||
receive them via the CCCI_*_FILE env vars, so per-process uniqueness is sufficient.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||
import run_recipe_ci # noqa: E402
|
||||
from concutil import wait_marker # noqa: E402
|
||||
|
||||
DOMAIN = "fake-abc123.ci.commoninternet.net"
|
||||
|
||||
|
||||
def test_20_state_paths_keyed_by_run_and_pid_never_by_domain(monkeypatch):
|
||||
domain = "immi-ad3e33.ci.commoninternet.net"
|
||||
monkeypatch.setenv("CCCI_APP_DOMAIN", domain)
|
||||
|
||||
monkeypatch.setenv("DRONE_BUILD_NUMBER", "279")
|
||||
p279 = run_recipe_ci._run_state_path("deploys")
|
||||
monkeypatch.setenv("DRONE_BUILD_NUMBER", "281")
|
||||
p281 = run_recipe_ci._run_state_path("deploys")
|
||||
|
||||
# the double-!testme invariant: two runs (same domain) share NO state file
|
||||
assert p279 != p281
|
||||
# keyed by run id + pid, under the tempdir
|
||||
base = os.path.basename(p279)
|
||||
assert base == f"ccci-deploys-279-{os.getpid()}"
|
||||
assert os.path.dirname(p279) == tempfile.gettempdir()
|
||||
# the app domain must not appear in the path at all
|
||||
assert domain not in p279 and domain not in p281
|
||||
|
||||
|
||||
def test_20c_same_domain_runs_each_keep_their_own_count(tmp_path, lock_dir, pool):
|
||||
"""The live CONC-A1 interleaving, with REAL processes + the REAL lock and counter code:
|
||||
run A holds the app lock; run B (same domain) fires its pre-lock _record_deploy and
|
||||
blocks; A then reads its counter — must still be 1 (not polluted by B) — and removes
|
||||
its own file; B acquires and must find ITS file intact (no FileNotFoundError)."""
|
||||
gate = tmp_path / "gate"
|
||||
env_a = {"TMPDIR": str(tmp_path), "DRONE_BUILD_NUMBER": "9001"}
|
||||
env_b = {"TMPDIR": str(tmp_path), "DRONE_BUILD_NUMBER": "9002"}
|
||||
|
||||
pa, out_a = pool.spawn("deploy-count-run", DOMAIN, str(gate), env_extra=env_a)
|
||||
assert wait_marker(out_a, "ACQUIRED")
|
||||
pb, out_b = pool.spawn("deploy-count-run", DOMAIN, "", env_extra=env_b)
|
||||
# B's main()-preamble + pre-lock increment have fired; B is now blocked on the app lock
|
||||
assert wait_marker(out_b, "PRELOCK")
|
||||
assert wait_marker(out_b, "ACQUIRED", timeout=1.0) is None # still serialised behind A
|
||||
|
||||
gate.touch() # let A read its counter only AFTER B's pre-lock work landed
|
||||
line_a = wait_marker(out_a, "COUNT")
|
||||
assert line_a is not None and line_a.strip() == "COUNT 1", line_a # not 2: B didn't pollute A
|
||||
pa.wait(timeout=15)
|
||||
|
||||
line_b = wait_marker(out_b, "COUNT")
|
||||
assert (
|
||||
line_b is not None and line_b.strip() == "COUNT 1"
|
||||
), line_b # B's file survived A's remove
|
||||
pb.wait(timeout=15)
|
||||
|
||||
|
||||
def test_20b_manual_runs_distinct_via_pid(monkeypatch):
|
||||
# no DRONE_BUILD_NUMBER and no domain/run-id env → run_id() falls back to "manual";
|
||||
# the pid suffix still separates two concurrent hand-runs of the same domain.
|
||||
for var in ("DRONE_BUILD_NUMBER", "CCCI_APP_DOMAIN", "CCCI_RUN_ID"):
|
||||
monkeypatch.delenv(var, raising=False)
|
||||
p = run_recipe_ci._run_state_path("opstate")
|
||||
assert os.path.basename(p) == f"ccci-opstate-manual-{os.getpid()}"
|
||||
@ -13,7 +13,8 @@ import sys
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "runner"))
|
||||
from harness import deps as deps_mod, lifecycle, naming # noqa: E402
|
||||
from harness import deps as deps_mod # noqa: E402
|
||||
from harness import lifecycle, naming
|
||||
|
||||
|
||||
def _short(s: str, n: int = 8) -> str:
|
||||
|
||||
@ -26,6 +26,7 @@ Transient `net::ERR_NETWORK_CHANGED` is handled by the shared `goto_with_retry`
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
import sys
|
||||
import uuid
|
||||
@ -39,7 +40,11 @@ def _open_pad(ctx, url):
|
||||
bar once CryptPad has created/loaded the fragment-keyed pad (`#/2/pad/edit/<key>/`)."""
|
||||
page = ctx.new_page()
|
||||
harness_browser.goto_with_retry(
|
||||
page, url, accept_statuses=(200,), goto_timeout_ms=60_000, wait_until="load",
|
||||
page,
|
||||
url,
|
||||
accept_statuses=(200,),
|
||||
goto_timeout_ms=60_000,
|
||||
wait_until="load",
|
||||
deadline_seconds=150,
|
||||
)
|
||||
pad_url = url
|
||||
@ -53,13 +58,15 @@ def _open_pad(ctx, url):
|
||||
pad_url = page.url
|
||||
break
|
||||
if i == 40:
|
||||
try:
|
||||
with contextlib.suppress(Exception): # best-effort unstick
|
||||
harness_browser.goto_with_retry(
|
||||
page, url, accept_statuses=(200,), goto_timeout_ms=60_000,
|
||||
wait_until="load", deadline_seconds=120,
|
||||
page,
|
||||
url,
|
||||
accept_statuses=(200,),
|
||||
goto_timeout_ms=60_000,
|
||||
wait_until="load",
|
||||
deadline_seconds=120,
|
||||
)
|
||||
except Exception: # noqa: BLE001 — best-effort unstick
|
||||
pass
|
||||
return page, pad_url
|
||||
|
||||
|
||||
@ -74,18 +81,22 @@ def _ckeditor_frame(page, deadline_polls=90, reload_at=22, reload_url=None):
|
||||
if "ckeditor-inner" in f.url:
|
||||
return f
|
||||
if i == reload_at and reload_url is not None:
|
||||
try:
|
||||
with contextlib.suppress(Exception): # reload is a best-effort unstick
|
||||
harness_browser.goto_with_retry(
|
||||
page, reload_url, accept_statuses=(200,), goto_timeout_ms=60_000,
|
||||
wait_until="load", deadline_seconds=120,
|
||||
page,
|
||||
reload_url,
|
||||
accept_statuses=(200,),
|
||||
goto_timeout_ms=60_000,
|
||||
wait_until="load",
|
||||
deadline_seconds=120,
|
||||
)
|
||||
except Exception: # noqa: BLE001 — reload is a best-effort unstick
|
||||
pass
|
||||
page.wait_for_timeout(2000)
|
||||
return None
|
||||
|
||||
|
||||
def _poll_any_frame_for_text(page, needle, deadline_polls=120, reload_at=(20, 45, 75, 100), reload_url=None):
|
||||
def _poll_any_frame_for_text(
|
||||
page, needle, deadline_polls=120, reload_at=(20, 45, 75, 100), reload_url=None
|
||||
):
|
||||
"""Robust read-back (F2-13): poll EVERY frame's body text for `needle`, returning True as soon as
|
||||
it appears. The fresh cold-cache read-back context's deeply-nested CKEditor frame is slow/flaky to
|
||||
*attach* by URL (the prior `_ckeditor_frame` wait timed out on the Adversary's cold run), but the
|
||||
@ -101,13 +112,15 @@ def _poll_any_frame_for_text(page, needle, deadline_polls=120, reload_at=(20, 45
|
||||
except Exception: # noqa: BLE001 — frame not ready / detached; keep polling
|
||||
pass
|
||||
if reload_url and i in reload_at:
|
||||
try:
|
||||
with contextlib.suppress(Exception): # best-effort unstick
|
||||
harness_browser.goto_with_retry(
|
||||
page, reload_url, accept_statuses=(200,), goto_timeout_ms=60_000,
|
||||
wait_until="load", deadline_seconds=120,
|
||||
page,
|
||||
reload_url,
|
||||
accept_statuses=(200,),
|
||||
goto_timeout_ms=60_000,
|
||||
wait_until="load",
|
||||
deadline_seconds=120,
|
||||
)
|
||||
except Exception: # noqa: BLE001 — best-effort unstick
|
||||
pass
|
||||
page.wait_for_timeout(2000)
|
||||
return False
|
||||
|
||||
@ -137,9 +150,9 @@ def test_cryptpad_pad_content_survives_fresh_session(live_app):
|
||||
# --- session 1: create the pad + write the marker ---
|
||||
ctx1 = browser.new_context(ignore_https_errors=True)
|
||||
page, pad_url = _open_pad(ctx1, f"https://{live_app}/pad/")
|
||||
assert "#/2/pad/edit/" in pad_url, (
|
||||
f"CryptPad did not create a fragment-keyed pad URL; got {pad_url!r}"
|
||||
)
|
||||
assert (
|
||||
"#/2/pad/edit/" in pad_url
|
||||
), f"CryptPad did not create a fragment-keyed pad URL; got {pad_url!r}"
|
||||
ck = _ckeditor_frame(page, reload_url=pad_url)
|
||||
assert ck is not None, "CKEditor content frame never attached (pad editor not ready)"
|
||||
_dismiss_store_modal(page)
|
||||
@ -148,9 +161,9 @@ def test_cryptpad_pad_content_survives_fresh_session(live_app):
|
||||
page.wait_for_timeout(1000)
|
||||
body.type(marker, delay=40)
|
||||
page.wait_for_timeout(12000) # let CryptPad encrypt + sync the update to the server
|
||||
assert marker in ck.locator("body").inner_text(), (
|
||||
"marker not present in the editor after typing — type did not land"
|
||||
)
|
||||
assert (
|
||||
marker in ck.locator("body").inner_text()
|
||||
), "marker not present in the editor after typing — type did not land"
|
||||
ctx1.close()
|
||||
|
||||
# --- session 2: FRESH context (no shared storage/localStorage) reads the pad back by URL.
|
||||
|
||||
@ -51,9 +51,9 @@ def test_cryptpad_spa_renders_with_no_console_errors(live_app):
|
||||
title = (page.title() or "").lower()
|
||||
body = page.content()
|
||||
blower = body.lower()
|
||||
assert "cryptpad" in title or "cryptpad" in blower, (
|
||||
f"CryptPad SPA does not carry brand. title={title!r}, body excerpt: {body[:200]!r}"
|
||||
)
|
||||
assert (
|
||||
"cryptpad" in title or "cryptpad" in blower
|
||||
), f"CryptPad SPA does not carry brand. title={title!r}, body excerpt: {body[:200]!r}"
|
||||
|
||||
# Canonical CryptPad asset references in the rendered DOM
|
||||
canonical = ("/customize/", "/components/", "main.js", "/api/broadcast")
|
||||
|
||||
@ -8,7 +8,8 @@ import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||
from harness import browser as harness_browser, generic, lifecycle # noqa: E402
|
||||
from harness import browser as harness_browser # noqa: E402
|
||||
from harness import generic, lifecycle
|
||||
|
||||
|
||||
def test_serving_and_content(live_app, meta):
|
||||
|
||||
19
tests/custom-html-bkp-bad/ops.py
Normal file
19
tests/custom-html-bkp-bad/ops.py
Normal file
@ -0,0 +1,19 @@
|
||||
"""custom-html-bkp-bad — lifecycle ops for bad-backup/bad-restore RED canaries.
|
||||
|
||||
Intentionally has NO pre_backup hook: the marker is never seeded before backup,
|
||||
so the backup snapshot has no ci-marker.txt. pre_restore writes "mutated" so that if
|
||||
restore DOES bring back the snapshot, the marker is gone/still-mutated → test fails.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from harness import lifecycle
|
||||
|
||||
MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
|
||||
|
||||
|
||||
def pre_restore(domain: str, meta: dict) -> None:
|
||||
"""Write 'mutated' to the marker before restore runs. If restore brings back the
|
||||
snapshot (which has no marker — never seeded by pre_backup), the marker ends up
|
||||
MISSING or 'mutated' after restore → test_restore_returns_state FAILS → restore=RED."""
|
||||
lifecycle.exec_in_app(domain, ["sh", "-c", f"echo mutated > {MARKER_PATH}"])
|
||||
5
tests/custom-html-bkp-bad/recipe_meta.py
Normal file
5
tests/custom-html-bkp-bad/recipe_meta.py
Normal file
@ -0,0 +1,5 @@
|
||||
# custom-html-bkp-bad — regression fixture for bad-backup canary.
|
||||
# This recipe is custom-html WITHOUT backupbot labels. Setting BACKUP_CAPABLE=True here forces the
|
||||
# harness to run the backup tier; the recipe itself has no backupbot service, so
|
||||
# `abra app backup create` produces no snapshot → test_backup_artifact fails → backup tier RED.
|
||||
BACKUP_CAPABLE = True
|
||||
30
tests/custom-html-bkp-bad/test_backup.py
Normal file
30
tests/custom-html-bkp-bad/test_backup.py
Normal file
@ -0,0 +1,30 @@
|
||||
"""custom-html-bkp-bad — BACKUP assertion (bad-backup RED canary).
|
||||
|
||||
This recipe has no ops.py::pre_backup, so ci-marker.txt is NEVER seeded before the backup.
|
||||
Asserting its presence here causes backup tier RED — proving the server catches a recipe that
|
||||
claims backup support but doesn't actually back up the expected data.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||
from harness import lifecycle # noqa: E402
|
||||
|
||||
MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
|
||||
|
||||
|
||||
def test_backup_captures_state(live_app):
|
||||
"""Assert the pre-backup marker is present and equals 'original'.
|
||||
|
||||
Since custom-html-bkp-bad has no ops.py::pre_backup to seed the marker, this file does NOT
|
||||
exist at backup time — exec_in_app returns empty or raises → assertion fails → backup tier RED.
|
||||
This models a recipe that declares backup capability but omits the data-seeding hook."""
|
||||
result = lifecycle.exec_in_app(
|
||||
live_app, ["sh", "-c", f"cat {MARKER_PATH} 2>/dev/null || echo MISSING"]
|
||||
).strip()
|
||||
assert result == "original", (
|
||||
f"backup did not capture the expected marker at {MARKER_PATH}: got {result!r}. "
|
||||
"Expected 'original' (seeded by pre_backup). If the marker is 'MISSING', the pre_backup "
|
||||
"hook was not run — this is the intended failure for the bad-backup RED canary."
|
||||
)
|
||||
25
tests/custom-html-bkp-bad/test_restore.py
Normal file
25
tests/custom-html-bkp-bad/test_restore.py
Normal file
@ -0,0 +1,25 @@
|
||||
"""custom-html-bkp-bad — RESTORE assertion (bad-restore RED canary).
|
||||
|
||||
pre_restore seeds 'mutated' to ci-marker.txt. The backup snapshot has no ci-marker.txt
|
||||
(never seeded by pre_backup). After restore, the marker is either MISSING or 'mutated' —
|
||||
never 'original' — so this assertion FAILS → restore tier RED.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||
from harness import lifecycle # noqa: E402
|
||||
|
||||
MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
|
||||
|
||||
|
||||
def test_restore_returns_state(live_app):
|
||||
result = lifecycle.exec_in_app(
|
||||
live_app, ["sh", "-c", f"cat {MARKER_PATH} 2>/dev/null || echo MISSING"]
|
||||
).strip()
|
||||
assert result == "original", (
|
||||
f"restore did not return the pre-mutation (backed-up) state: got {result!r}. "
|
||||
"Expected 'original'. The backup had no marker (not seeded by pre_backup), so "
|
||||
"restore cannot recover it — this is the intended failure for the bad-restore RED canary."
|
||||
)
|
||||
15
tests/custom-html-rst-bad/ops.py
Normal file
15
tests/custom-html-rst-bad/ops.py
Normal file
@ -0,0 +1,15 @@
|
||||
"""custom-html-rst-bad — lifecycle ops for bad-restore RED canary.
|
||||
|
||||
NO pre_backup hook: marker never seeded before backup → snapshot has no ci-marker.txt.
|
||||
pre_restore writes "mutated". After restore, marker stays "mutated" (not in snapshot) → FAIL.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from harness import lifecycle
|
||||
|
||||
MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
|
||||
|
||||
|
||||
def pre_restore(domain: str, meta: dict) -> None:
|
||||
lifecycle.exec_in_app(domain, ["sh", "-c", f"echo mutated > {MARKER_PATH}"])
|
||||
3
tests/custom-html-rst-bad/recipe_meta.py
Normal file
3
tests/custom-html-rst-bad/recipe_meta.py
Normal file
@ -0,0 +1,3 @@
|
||||
# custom-html-rst-bad — regression fixture for bad-restore canary.
|
||||
# BACKUP_CAPABLE=True forces the backup tier to run even though the recipe has no backupbot label.
|
||||
BACKUP_CAPABLE = True
|
||||
23
tests/custom-html-rst-bad/test_restore.py
Normal file
23
tests/custom-html-rst-bad/test_restore.py
Normal file
@ -0,0 +1,23 @@
|
||||
"""custom-html-rst-bad — RESTORE assertion (bad-restore RED canary).
|
||||
|
||||
No pre_backup → backup snapshot has no ci-marker.txt. pre_restore writes "mutated".
|
||||
After restore: marker is "mutated" (restore can't recover "original" — wasn't backed up) → FAIL.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||
from harness import lifecycle # noqa: E402
|
||||
|
||||
MARKER_PATH = "/usr/share/nginx/html/ci-marker.txt"
|
||||
|
||||
|
||||
def test_restore_returns_state(live_app):
|
||||
result = lifecycle.exec_in_app(
|
||||
live_app, ["sh", "-c", f"cat {MARKER_PATH} 2>/dev/null || echo MISSING"]
|
||||
).strip()
|
||||
assert result == "original", (
|
||||
f"restore did not return the pre-mutation (backed-up) state: got {result!r}. "
|
||||
"Expected 'original'. The backup had no marker, so restore cannot recover it."
|
||||
)
|
||||
87
tests/custom-html-tiny/functional/test_serves_content.py
Normal file
87
tests/custom-html-tiny/functional/test_serves_content.py
Normal file
@ -0,0 +1,87 @@
|
||||
"""custom-html-tiny — recipe-specific functional test (static-web-server).
|
||||
|
||||
Proves the deployed static-web-server is *actually serving files from its `content` volume* with real
|
||||
file-server semantics, not merely returning 200 from a Traefik fallback or a generic stub:
|
||||
|
||||
1. exact-byte round-trip — write a uniquely-named file with random content into the served volume,
|
||||
fetch it over HTTPS, and assert the bytes come back verbatim. Non-vacuous: the content is random
|
||||
per run, so only a server that reads this file off the volume can pass.
|
||||
2. real 404 — a random non-existent path returns 404, proving directory/file semantics (a
|
||||
200-everything stub or mis-routed host would not 404).
|
||||
|
||||
The recipe's image (joseluisq/static-web-server) is shell-less (scratch-based) and its content volume
|
||||
is seeded via the install_steps.sh host-mountpoint mechanism — so this test writes its probe file the
|
||||
same way (resolve the swarm volume's mountpoint with `docker volume inspect`, write directly) rather
|
||||
than `docker exec`-ing in a container that has no shell.
|
||||
|
||||
Runs in the custom tier against the shared post-install deployment (the `live_app` fixture is its
|
||||
per-run domain). Mirrors install_steps.sh: the app's content volume is named `<stack>_content`, where
|
||||
`stack` is the domain with dots replaced by underscores; HTTP_SUBDIR is empty, so the volume root is
|
||||
served at `/`.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
import ssl
|
||||
import subprocess
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
import uuid
|
||||
|
||||
|
||||
def _served_dir(domain: str) -> str:
|
||||
"""Host mountpoint of the app's served `content` volume (same naming as install_steps.sh)."""
|
||||
vol = f"{domain.replace('.', '_')}_content"
|
||||
out = subprocess.run(
|
||||
["docker", "volume", "inspect", vol, "--format", "{{.Mountpoint}}"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
mountpoint = out.stdout.strip()
|
||||
assert mountpoint, f"could not resolve mountpoint for volume {vol!r}"
|
||||
return mountpoint
|
||||
|
||||
|
||||
def _get(url: str) -> tuple[int, bytes]:
|
||||
"""GET the URL; return (status, body). A 4xx/5xx is returned, not raised (we assert on the code).
|
||||
TLS verification is relaxed: the served wildcard cert is validated separately by the infra check;
|
||||
here we care only about the app's response."""
|
||||
ctx = ssl.create_default_context()
|
||||
ctx.check_hostname = False
|
||||
ctx.verify_mode = ssl.CERT_NONE
|
||||
try:
|
||||
with urllib.request.urlopen(url, timeout=20, context=ctx) as resp:
|
||||
return resp.status, resp.read()
|
||||
except urllib.error.HTTPError as e:
|
||||
return e.code, e.read()
|
||||
|
||||
|
||||
def test_static_file_roundtrip_and_404(live_app):
|
||||
"""Write a random file into the served volume → fetch it → bytes match; and a missing path 404s."""
|
||||
served = _served_dir(live_app)
|
||||
token = uuid.uuid4().hex
|
||||
name = f"ccci-probe-{token}.txt"
|
||||
body = f"cc-ci-functional-{token}\n".encode()
|
||||
path = os.path.join(served, name)
|
||||
with open(path, "wb") as fh:
|
||||
fh.write(body)
|
||||
try:
|
||||
status, got = _get(f"https://{live_app}/{name}")
|
||||
assert status == 200, f"served probe file returned {status} (expected 200)"
|
||||
assert got == body, (
|
||||
f"content round-trip mismatch: served {got!r}, wrote {body!r} "
|
||||
"(static-web-server not serving the content volume?)"
|
||||
)
|
||||
|
||||
# A random non-existent path must 404 — proves real static-file semantics, distinguishing a
|
||||
# working server from a 200-everything stub or a mis-routed Traefik fallback.
|
||||
miss_status, _ = _get(f"https://{live_app}/ccci-missing-{uuid.uuid4().hex}.txt")
|
||||
assert (
|
||||
miss_status == 404
|
||||
), f"missing path returned {miss_status} (expected 404 — generic 200-returner / mis-route?)"
|
||||
finally:
|
||||
with contextlib.suppress(OSError):
|
||||
os.remove(path)
|
||||
@ -3,3 +3,14 @@
|
||||
# (DG5) is detected quickly instead of waiting the default 300s HTTP timeout.
|
||||
DEPLOY_TIMEOUT = 120
|
||||
HTTP_TIMEOUT = 90
|
||||
|
||||
# Rungs this recipe INTENTIONALLY skips, each with a reason. Any essential rung skipped (N/A) and NOT
|
||||
# listed here is reported as an *unintentional* skip (a coverage gap to fill or declare). A skip still
|
||||
# caps the level either way — the harness never claims a rung it did not verify; this only records
|
||||
# that the skip is deliberate. (The level ladder is the four essential rungs install/upgrade/
|
||||
# backup_restore/functional; integration + recipe-local are optional and not leveled.)
|
||||
# custom-html-tiny is a stateless static-web-server, so it has no backup surface:
|
||||
EXPECTED_NA = {
|
||||
"backup_restore": "stateless static file server: serves an ephemeral content volume seeded at "
|
||||
"deploy, with no persistent/user data to back up or restore (no backupbot.backup label)",
|
||||
}
|
||||
|
||||
@ -15,7 +15,8 @@ import sys
|
||||
import uuid
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", "runner"))
|
||||
from harness import http as harness_http, lifecycle # noqa: E402
|
||||
from harness import http as harness_http # noqa: E402
|
||||
from harness import lifecycle
|
||||
|
||||
|
||||
def test_content_roundtrip(live_app):
|
||||
|
||||
@ -53,9 +53,9 @@ def test_content_type_html_and_txt(live_app):
|
||||
ct_txt = h_txt.get("content-type", "")
|
||||
|
||||
# nginx default: "text/html" for .html and "text/plain" for .txt (may include "; charset=utf-8")
|
||||
assert ct_html.startswith("text/html"), (
|
||||
f"{html_name} Content-Type={ct_html!r}, expected text/html (nginx MIME config broken?)"
|
||||
)
|
||||
assert ct_txt.startswith("text/plain"), (
|
||||
f"{txt_name} Content-Type={ct_txt!r}, expected text/plain (nginx MIME config broken?)"
|
||||
)
|
||||
assert ct_html.startswith(
|
||||
"text/html"
|
||||
), f"{html_name} Content-Type={ct_html!r}, expected text/html (nginx MIME config broken?)"
|
||||
assert ct_txt.startswith(
|
||||
"text/plain"
|
||||
), f"{txt_name} Content-Type={ct_txt!r}, expected text/plain (nginx MIME config broken?)"
|
||||
|
||||
@ -9,7 +9,8 @@ import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||
from harness import browser as harness_browser, generic # noqa: E402
|
||||
from harness import browser as harness_browser # noqa: E402
|
||||
from harness import generic
|
||||
|
||||
|
||||
def test_serving_and_content(live_app, meta):
|
||||
|
||||
@ -53,7 +53,7 @@ def mint_admin(domain: str) -> tuple[str, str]:
|
||||
cmd = (
|
||||
"cd /opt/bitnami/discourse && "
|
||||
"RUBY=$(command -v ruby || echo /opt/bitnami/ruby/bin/ruby) && "
|
||||
f"RAILS_ENV=production \"$RUBY\" bin/rails runner \"{_BOOTSTRAP_RB}\""
|
||||
f'RAILS_ENV=production "$RUBY" bin/rails runner "{_BOOTSTRAP_RB}"'
|
||||
)
|
||||
out = lifecycle.exec_in_app(domain, ["bash", "-c", cmd], service="app", timeout=240)
|
||||
key = user = None
|
||||
@ -63,9 +63,9 @@ def mint_admin(domain: str) -> tuple[str, str]:
|
||||
key = line.split("=", 1)[1].strip()
|
||||
elif line.startswith("CCCI_API_USER="):
|
||||
user = line.split("=", 1)[1].strip()
|
||||
assert key and user, (
|
||||
f"could not bootstrap discourse admin/API key; rails output tail:\n{out[-1000:]}"
|
||||
)
|
||||
assert (
|
||||
key and user
|
||||
), f"could not bootstrap discourse admin/API key; rails output tail:\n{out[-1000:]}"
|
||||
return key, user
|
||||
|
||||
|
||||
|
||||
@ -48,21 +48,23 @@ def test_create_topic_roundtrip(live_app):
|
||||
headers=hdrs,
|
||||
timeout=60,
|
||||
)
|
||||
assert status in (200, 201) and isinstance(body, dict), (
|
||||
f"create topic failed: HTTP {status}, body={body!r}"
|
||||
)
|
||||
assert status in (200, 201) and isinstance(
|
||||
body, dict
|
||||
), f"create topic failed: HTTP {status}, body={body!r}"
|
||||
topic_id = body.get("topic_id")
|
||||
assert topic_id, f"create topic returned no topic_id: {body!r}"
|
||||
|
||||
# 4) Read the topic back and assert title + first-post body round-trip.
|
||||
status, got = harness_http.http_get(f"{base}/t/{topic_id}.json", headers=hdrs, timeout=30)
|
||||
assert status == 200 and isinstance(got, dict), f"read topic failed: HTTP {status}, body={got!r}"
|
||||
assert got.get("title") == title, (
|
||||
f"topic title did not round-trip: sent {title!r}, got {got.get('title')!r}"
|
||||
)
|
||||
assert status == 200 and isinstance(
|
||||
got, dict
|
||||
), f"read topic failed: HTTP {status}, body={got!r}"
|
||||
assert (
|
||||
got.get("title") == title
|
||||
), f"topic title did not round-trip: sent {title!r}, got {got.get('title')!r}"
|
||||
posts = (got.get("post_stream") or {}).get("posts") or []
|
||||
assert posts, f"topic has no posts on read-back: {got!r}"
|
||||
first_cooked = posts[0].get("cooked", "")
|
||||
assert marker in first_cooked, (
|
||||
f"topic body did not round-trip: marker {marker!r} not in first post {first_cooked!r}"
|
||||
)
|
||||
assert (
|
||||
marker in first_cooked
|
||||
), f"topic body did not round-trip: marker {marker!r} not in first post {first_cooked!r}"
|
||||
|
||||
@ -20,12 +20,12 @@ def test_site_json_has_discourse_config(live_app):
|
||||
status, body = harness_http.retry_http_get(
|
||||
f"https://{live_app}/site.json", expect_status=200, max_wait=120, interval=5
|
||||
)
|
||||
assert status == 200 and isinstance(body, dict), (
|
||||
f"GET /site.json failed: HTTP {status}, body type={type(body).__name__}"
|
||||
)
|
||||
assert status == 200 and isinstance(
|
||||
body, dict
|
||||
), f"GET /site.json failed: HTTP {status}, body type={type(body).__name__}"
|
||||
# /site.json carries Discourse-specific structure — `categories` (a list) and `groups` are always
|
||||
# present in a booted Discourse. A non-Discourse 200 (placeholder page) would not parse to this.
|
||||
assert "categories" in body, f"/site.json missing 'categories' key: keys={list(body)[:20]}"
|
||||
assert isinstance(body["categories"], list), (
|
||||
f"/site.json 'categories' not a list: {type(body['categories']).__name__}"
|
||||
)
|
||||
assert isinstance(
|
||||
body["categories"], list
|
||||
), f"/site.json 'categories' not a list: {type(body['categories']).__name__}"
|
||||
|
||||
@ -15,7 +15,9 @@ set -euo pipefail
|
||||
|
||||
: "${CCCI_RECIPE:?missing CCCI_RECIPE}"
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
RECIPE_DIR="${HOME}/.abra/recipes/${CCCI_RECIPE}"
|
||||
# Resolve the recipe tree the way abra does: $ABRA_DIR (the per-run tree inside a CI run) else
|
||||
# the canonical ~/.abra — the overlay must land in the tree this run actually deploys from.
|
||||
RECIPE_DIR="${ABRA_DIR:-${HOME}/.abra}/recipes/${CCCI_RECIPE}"
|
||||
|
||||
if [ ! -d "$RECIPE_DIR" ]; then
|
||||
echo " discourse install_steps: recipe dir $RECIPE_DIR missing — cannot provide compose.ccci.yml" >&2
|
||||
|
||||
@ -15,8 +15,7 @@ from harness import lifecycle # noqa: E402
|
||||
|
||||
def _psql(domain, sql):
|
||||
cmd = (
|
||||
'PGPASSWORD=$(cat /run/secrets/db_password) '
|
||||
f'psql -U discourse -d discourse -tAc "{sql}"'
|
||||
"PGPASSWORD=$(cat /run/secrets/db_password) " f'psql -U discourse -d discourse -tAc "{sql}"'
|
||||
)
|
||||
return lifecycle.exec_in_app(domain, ["sh", "-c", cmd], service="db").strip()
|
||||
|
||||
@ -42,6 +41,7 @@ def pre_backup(domain, meta):
|
||||
def pre_restore(domain, meta):
|
||||
# diverge from the backup so a successful restore is observable
|
||||
_psql(domain, "DROP TABLE IF EXISTS ci_marker;")
|
||||
assert _psql(domain, "SELECT to_regclass('public.ci_marker');") in ("", "NULL"), (
|
||||
"drop did not take"
|
||||
)
|
||||
assert _psql(domain, "SELECT to_regclass('public.ci_marker');") in (
|
||||
"",
|
||||
"NULL",
|
||||
), "drop did not take"
|
||||
|
||||
@ -6,7 +6,9 @@
|
||||
# app is actually serving (the canonical "is discourse up" signal — NOT "/", which may redirect to setup).
|
||||
HEALTH_PATH = "/srv/status"
|
||||
HEALTH_OK = (200,)
|
||||
DEPLOY_TIMEOUT = 3600 # slow Rails cold boot (15-25min) on the 7-GiB single node; bumped 2400→3600 for
|
||||
DEPLOY_TIMEOUT = (
|
||||
3600 # slow Rails cold boot (15-25min) on the 7-GiB single node; bumped 2400→3600 for
|
||||
)
|
||||
# headroom after full4's base deploy timed out at 2400s (RAM/CPU-constrained boot + image re-pull).
|
||||
HTTP_TIMEOUT = 1200
|
||||
|
||||
@ -59,7 +61,11 @@ def BACKUP_VERIFY(domain):
|
||||
try:
|
||||
out = lifecycle.exec_in_app(
|
||||
domain,
|
||||
["sh", "-c", "gzip -t /var/lib/postgresql/data/backup.sql && wc -c < /var/lib/postgresql/data/backup.sql"],
|
||||
[
|
||||
"sh",
|
||||
"-c",
|
||||
"gzip -t /var/lib/postgresql/data/backup.sql && wc -c < /var/lib/postgresql/data/backup.sql",
|
||||
],
|
||||
service="db",
|
||||
timeout=60,
|
||||
).strip()
|
||||
|
||||
@ -14,13 +14,12 @@ from harness import lifecycle # noqa: E402
|
||||
|
||||
def _psql(domain, sql):
|
||||
cmd = (
|
||||
'PGPASSWORD=$(cat /run/secrets/db_password) '
|
||||
f'psql -U discourse -d discourse -tAc "{sql}"'
|
||||
"PGPASSWORD=$(cat /run/secrets/db_password) " f'psql -U discourse -d discourse -tAc "{sql}"'
|
||||
)
|
||||
return lifecycle.exec_in_app(domain, ["sh", "-c", cmd], service="db").strip()
|
||||
|
||||
|
||||
def test_backup_captures_state(live_app):
|
||||
assert _psql(live_app, "SELECT v FROM ci_marker;") == "original", (
|
||||
"the seeded discourse postgres state was not present at backup time"
|
||||
)
|
||||
assert (
|
||||
_psql(live_app, "SELECT v FROM ci_marker;") == "original"
|
||||
), "the seeded discourse postgres state was not present at backup time"
|
||||
|
||||
@ -14,13 +14,12 @@ from harness import lifecycle # noqa: E402
|
||||
|
||||
def _psql(domain, sql):
|
||||
cmd = (
|
||||
'PGPASSWORD=$(cat /run/secrets/db_password) '
|
||||
f'psql -U discourse -d discourse -tAc "{sql}"'
|
||||
"PGPASSWORD=$(cat /run/secrets/db_password) " f'psql -U discourse -d discourse -tAc "{sql}"'
|
||||
)
|
||||
return lifecycle.exec_in_app(domain, ["sh", "-c", cmd], service="db").strip()
|
||||
|
||||
|
||||
def test_restore_returns_state(live_app):
|
||||
assert _psql(live_app, "SELECT v FROM ci_marker;") == "original", (
|
||||
"restore did not return the pre-mutation discourse postgres state (data-integrity failure)"
|
||||
)
|
||||
assert (
|
||||
_psql(live_app, "SELECT v FROM ci_marker;") == "original"
|
||||
), "restore did not return the pre-mutation discourse postgres state (data-integrity failure)"
|
||||
|
||||
@ -93,9 +93,10 @@ class GhostAdmin:
|
||||
status, body = self.req(
|
||||
"POST", "/session/", {"username": ADMIN_EMAIL, "password": ADMIN_PW}
|
||||
)
|
||||
assert status in (200, 201), (
|
||||
f"ghost admin session login failed: HTTP {status}, body={body!r}"
|
||||
)
|
||||
assert status in (
|
||||
200,
|
||||
201,
|
||||
), f"ghost admin session login failed: HTTP {status}, body={body!r}"
|
||||
|
||||
def create_post(self, title: str, html: str) -> dict:
|
||||
status, body = self.req(
|
||||
|
||||
@ -53,13 +53,15 @@ def test_ghost_admin_route_is_wired(live_app):
|
||||
return None
|
||||
|
||||
status_body = harness_http.assert_converges(
|
||||
_ready, f"GET {url} returns Ghost admin (200) or setup redirect (302)",
|
||||
max_wait=60, interval=3,
|
||||
_ready,
|
||||
f"GET {url} returns Ghost admin (200) or setup redirect (302)",
|
||||
max_wait=60,
|
||||
interval=3,
|
||||
)
|
||||
status, body = status_body
|
||||
assert status in (200, 302), f"unexpected status: {status}"
|
||||
if status == 200:
|
||||
# The admin SPA references /ghost-assets/ or contains "ghost" in title/body
|
||||
assert "ghost" in body.lower(), (
|
||||
f"GET {url} 200 but body has no Ghost markers: {body[:200]!r}"
|
||||
)
|
||||
assert (
|
||||
"ghost" in body.lower()
|
||||
), f"GET {url} 200 but body has no Ghost markers: {body[:200]!r}"
|
||||
|
||||
@ -35,10 +35,10 @@ def test_content_api_settings_endpoint(live_app):
|
||||
assert body is not None, f"GET {url} returned non-JSON body"
|
||||
# On success: {"settings": {...}}. On error: {"errors": [...]}. Either shape is valid.
|
||||
if status == 200:
|
||||
assert isinstance(body, dict) and "settings" in body, (
|
||||
f"200 response missing 'settings' envelope: {body!r}"
|
||||
)
|
||||
assert (
|
||||
isinstance(body, dict) and "settings" in body
|
||||
), f"200 response missing 'settings' envelope: {body!r}"
|
||||
else:
|
||||
assert isinstance(body, dict) and ("errors" in body or "message" in body or body), (
|
||||
f"error response not a proper Ghost error envelope: {body!r}"
|
||||
)
|
||||
assert isinstance(body, dict) and (
|
||||
"errors" in body or "message" in body or body
|
||||
), f"error response not a proper Ghost error envelope: {body!r}"
|
||||
|
||||
@ -43,17 +43,17 @@ def test_create_post_roundtrip(live_app):
|
||||
title = f"ccci-marker-{uniq}"
|
||||
marker = f"ccci-body-marker-{uniq}-roundtrip"
|
||||
created = admin.create_post(title, f"<p>{marker}</p>")
|
||||
assert created.get("title") == title, (
|
||||
f"created post title mismatch: sent {title!r}, got {created.get('title')!r}"
|
||||
)
|
||||
assert (
|
||||
created.get("title") == title
|
||||
), f"created post title mismatch: sent {title!r}, got {created.get('title')!r}"
|
||||
|
||||
# 4) Read it back by id and assert the post survived the round-trip (title always returned;
|
||||
# html returned because we requested ?formats=html).
|
||||
got = admin.get_post(created["id"])
|
||||
assert got.get("title") == title, (
|
||||
f"post title did not round-trip: sent {title!r}, got {got.get('title')!r}"
|
||||
)
|
||||
assert (
|
||||
got.get("title") == title
|
||||
), f"post title did not round-trip: sent {title!r}, got {got.get('title')!r}"
|
||||
html = got.get("html") or ""
|
||||
assert marker in html, (
|
||||
f"post body did not round-trip: marker {marker!r} not in read-back html {html!r}"
|
||||
)
|
||||
assert (
|
||||
marker in html
|
||||
), f"post body did not round-trip: marker {marker!r} not in read-back html {html!r}"
|
||||
|
||||
@ -15,7 +15,9 @@ set -euo pipefail
|
||||
|
||||
: "${CCCI_RECIPE:?missing CCCI_RECIPE}"
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
RECIPE_DIR="${HOME}/.abra/recipes/${CCCI_RECIPE}"
|
||||
# Resolve the recipe tree the way abra does: $ABRA_DIR (the per-run tree inside a CI run) else
|
||||
# the canonical ~/.abra — the overlay must land in the tree this run actually deploys from.
|
||||
RECIPE_DIR="${ABRA_DIR:-${HOME}/.abra}/recipes/${CCCI_RECIPE}"
|
||||
|
||||
if [ ! -d "$RECIPE_DIR" ]; then
|
||||
echo " ghost install_steps: recipe dir $RECIPE_DIR missing — cannot provide compose.ccci.yml" >&2
|
||||
|
||||
@ -22,10 +22,7 @@ from harness import lifecycle # noqa: E402
|
||||
|
||||
|
||||
def _mysql(domain, sql):
|
||||
cmd = (
|
||||
'MYSQL_PWD="$(cat /run/secrets/db_password)" '
|
||||
f'mysql -u root -N -s ghost -e "{sql}"'
|
||||
)
|
||||
cmd = 'MYSQL_PWD="$(cat /run/secrets/db_password)" ' f'mysql -u root -N -s ghost -e "{sql}"'
|
||||
return lifecycle.exec_in_app(domain, ["sh", "-c", cmd], service="db").strip()
|
||||
|
||||
|
||||
|
||||
@ -63,7 +63,11 @@ def BACKUP_VERIFY(domain):
|
||||
try:
|
||||
out = lifecycle.exec_in_app(
|
||||
domain,
|
||||
["sh", "-c", "gzip -t /var/lib/mysql/backup.sql.gz && wc -c < /var/lib/mysql/backup.sql.gz"],
|
||||
[
|
||||
"sh",
|
||||
"-c",
|
||||
"gzip -t /var/lib/mysql/backup.sql.gz && wc -c < /var/lib/mysql/backup.sql.gz",
|
||||
],
|
||||
service="db",
|
||||
timeout=60,
|
||||
).strip()
|
||||
|
||||
@ -15,14 +15,11 @@ from harness import lifecycle # noqa: E402
|
||||
|
||||
|
||||
def _mysql(domain, sql):
|
||||
cmd = (
|
||||
'MYSQL_PWD="$(cat /run/secrets/db_password)" '
|
||||
f'mysql -u root -N -s ghost -e "{sql}"'
|
||||
)
|
||||
cmd = 'MYSQL_PWD="$(cat /run/secrets/db_password)" ' f'mysql -u root -N -s ghost -e "{sql}"'
|
||||
return lifecycle.exec_in_app(domain, ["sh", "-c", cmd], service="db").strip()
|
||||
|
||||
|
||||
def test_backup_captures_state(live_app):
|
||||
assert _mysql(live_app, "SELECT v FROM ci_marker;") == "original", (
|
||||
"the seeded ghost MySQL marker was not present at backup time"
|
||||
)
|
||||
assert (
|
||||
_mysql(live_app, "SELECT v FROM ci_marker;") == "original"
|
||||
), "the seeded ghost MySQL marker was not present at backup time"
|
||||
|
||||
@ -22,10 +22,7 @@ from harness import lifecycle # noqa: E402
|
||||
|
||||
|
||||
def _mysql(domain, sql):
|
||||
cmd = (
|
||||
'MYSQL_PWD="$(cat /run/secrets/db_password)" '
|
||||
f'mysql -u root -N -s ghost -e "{sql}"'
|
||||
)
|
||||
cmd = 'MYSQL_PWD="$(cat /run/secrets/db_password)" ' f'mysql -u root -N -s ghost -e "{sql}"'
|
||||
return lifecycle.exec_in_app(domain, ["sh", "-c", cmd], service="db").strip()
|
||||
|
||||
|
||||
|
||||
@ -14,14 +14,11 @@ from harness import lifecycle # noqa: E402
|
||||
|
||||
|
||||
def _mysql(domain, sql):
|
||||
cmd = (
|
||||
'MYSQL_PWD="$(cat /run/secrets/db_password)" '
|
||||
f'mysql -u root -N -s ghost -e "{sql}"'
|
||||
)
|
||||
cmd = 'MYSQL_PWD="$(cat /run/secrets/db_password)" ' f'mysql -u root -N -s ghost -e "{sql}"'
|
||||
return lifecycle.exec_in_app(domain, ["sh", "-c", cmd], service="db").strip()
|
||||
|
||||
|
||||
def test_upgrade_preserves_state(live_app):
|
||||
assert _mysql(live_app, "SELECT v FROM ci_marker;") == "upgrade-survives", (
|
||||
"the seeded ghost MySQL marker did not survive the upgrade redeploy (data loss on upgrade)"
|
||||
)
|
||||
assert (
|
||||
_mysql(live_app, "SELECT v FROM ci_marker;") == "upgrade-survives"
|
||||
), "the seeded ghost MySQL marker did not survive the upgrade redeploy (data loss on upgrade)"
|
||||
|
||||
@ -14,7 +14,6 @@ import urllib.request
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", "runner"))
|
||||
from harness import http as harness_http # noqa: E402
|
||||
|
||||
|
||||
_CTX = ssl.create_default_context()
|
||||
_CTX.check_hostname = False
|
||||
_CTX.verify_mode = ssl.CERT_NONE
|
||||
|
||||
@ -15,7 +15,5 @@ from harness import http as harness_http # noqa: E402
|
||||
def test_hedgedoc_root_serves(live_app):
|
||||
"""GET / → 200 or 302 (login/new redirect)."""
|
||||
url = f"https://{live_app}/"
|
||||
status, _ = harness_http.retry_http_get(
|
||||
url, expect_status=(200, 302), max_wait=90, interval=5
|
||||
)
|
||||
status, _ = harness_http.retry_http_get(url, expect_status=(200, 302), max_wait=90, interval=5)
|
||||
assert status in (200, 302), f"GET {url} HTTP {status} (expected 200 or 302)"
|
||||
|
||||
@ -111,13 +111,13 @@ def test_immich_processes_uploaded_asset_metadata_and_statistics(live_app):
|
||||
if exif and exif.get("exifImageWidth"):
|
||||
break
|
||||
time.sleep(5)
|
||||
assert exif and exif.get("exifImageWidth") == 1 and exif.get("exifImageHeight") == 1, (
|
||||
f"immich metadata-extraction did not populate the 1x1 PNG dimensions in exifInfo: {exif!r}"
|
||||
)
|
||||
assert (
|
||||
exif and exif.get("exifImageWidth") == 1 and exif.get("exifImageHeight") == 1
|
||||
), f"immich metadata-extraction did not populate the 1x1 PNG dimensions in exifInfo: {exif!r}"
|
||||
|
||||
# the asset is catalogued into the owner's library statistics (list-back in aggregate)
|
||||
sst, stats = harness_http.http_request("GET", f"{base}/api/assets/statistics", headers=auth)
|
||||
assert sst == 200 and isinstance(stats, dict), f"statistics HTTP {sst}: {stats!r}"
|
||||
assert stats.get("images", 0) >= 1 and stats.get("total", 0) >= 1, (
|
||||
f"uploaded asset not reflected in library statistics: {stats!r}"
|
||||
)
|
||||
assert (
|
||||
stats.get("images", 0) >= 1 and stats.get("total", 0) >= 1
|
||||
), f"uploaded asset not reflected in library statistics: {stats!r}"
|
||||
|
||||
@ -121,6 +121,6 @@ def test_immich_upload_asset_readback_and_thumbnail(live_app):
|
||||
if thumb == 200:
|
||||
break
|
||||
time.sleep(5)
|
||||
assert thumb == 200, (
|
||||
f"immich did not generate a thumbnail/derivative for the uploaded asset (last HTTP {thumb})"
|
||||
)
|
||||
assert (
|
||||
thumb == 200
|
||||
), f"immich did not generate a thumbnail/derivative for the uploaded asset (last HTTP {thumb})"
|
||||
|
||||
@ -16,5 +16,11 @@ from harness import http as harness_http # noqa: E402
|
||||
|
||||
def test_immich_returns_200(live_app):
|
||||
url = f"https://{live_app}/"
|
||||
status, _ = harness_http.retry_http_get(url, expect_status=(200, 301, 302), max_wait=60, interval=3)
|
||||
assert status in (200, 301, 302), f"immich at {url} returned HTTP {status} (expected 200/301/302)"
|
||||
status, _ = harness_http.retry_http_get(
|
||||
url, expect_status=(200, 301, 302), max_wait=60, interval=3
|
||||
)
|
||||
assert status in (
|
||||
200,
|
||||
301,
|
||||
302,
|
||||
), f"immich at {url} returned HTTP {status} (expected 200/301/302)"
|
||||
|
||||
@ -35,4 +35,7 @@ def pre_backup(domain, meta):
|
||||
|
||||
def pre_restore(domain, meta):
|
||||
_psql(domain, "DROP TABLE ci_marker;")
|
||||
assert _psql(domain, "SELECT to_regclass('public.ci_marker');") in ("", "NULL"), "drop did not take"
|
||||
assert _psql(domain, "SELECT to_regclass('public.ci_marker');") in (
|
||||
"",
|
||||
"NULL",
|
||||
), "drop did not take"
|
||||
|
||||
@ -14,4 +14,6 @@ def _psql(domain, sql):
|
||||
|
||||
|
||||
def test_backup_captures_state(live_app):
|
||||
assert _psql(live_app, "SELECT v FROM ci_marker;") == "original", "seeded postgres state not present at backup time"
|
||||
assert (
|
||||
_psql(live_app, "SELECT v FROM ci_marker;") == "original"
|
||||
), "seeded postgres state not present at backup time"
|
||||
|
||||
@ -7,7 +7,8 @@ import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "runner"))
|
||||
from harness import browser as harness_browser, generic, lifecycle # noqa: E402
|
||||
from harness import browser as harness_browser # noqa: E402
|
||||
from harness import generic, lifecycle
|
||||
|
||||
|
||||
def test_serving_and_frontend(live_app, meta):
|
||||
@ -25,7 +26,11 @@ def test_serving_and_frontend(live_app, meta):
|
||||
resp = harness_browser.goto_with_retry(
|
||||
page, url, accept_statuses=(200, 301, 302), goto_timeout_ms=60_000
|
||||
)
|
||||
assert resp is not None and resp.status in (200, 301, 302), f"page status {resp and resp.status}"
|
||||
assert resp is not None and resp.status in (
|
||||
200,
|
||||
301,
|
||||
302,
|
||||
), f"page status {resp and resp.status}"
|
||||
assert "<html" in page.content().lower(), "no HTML served by the immich frontend"
|
||||
finally:
|
||||
browser.close()
|
||||
|
||||
@ -14,4 +14,6 @@ def _psql(domain, sql):
|
||||
|
||||
|
||||
def test_restore_returns_state(live_app):
|
||||
assert _psql(live_app, "SELECT v FROM ci_marker;") == "original", "restore did not return the pre-mutation postgres state"
|
||||
assert (
|
||||
_psql(live_app, "SELECT v FROM ci_marker;") == "original"
|
||||
), "restore did not return the pre-mutation postgres state"
|
||||
|
||||
@ -14,4 +14,6 @@ def _psql(domain, sql):
|
||||
|
||||
|
||||
def test_upgrade_preserves_data(live_app):
|
||||
assert _psql(live_app, "SELECT v FROM ci_marker;") == "upgrade-survives", "postgres data did not survive the upgrade"
|
||||
assert (
|
||||
_psql(live_app, "SELECT v FROM ci_marker;") == "upgrade-survives"
|
||||
), "postgres data did not survive the upgrade"
|
||||
|
||||
@ -120,9 +120,9 @@ def test_create_confidential_client_and_obtain_token(live_app):
|
||||
"clientId": client_id,
|
||||
"enabled": True,
|
||||
"secret": client_secret,
|
||||
"publicClient": False, # confidential client
|
||||
"serviceAccountsEnabled": True, # required for client_credentials grant
|
||||
"standardFlowEnabled": False, # not needed for service-account-only client
|
||||
"publicClient": False, # confidential client
|
||||
"serviceAccountsEnabled": True, # required for client_credentials grant
|
||||
"standardFlowEnabled": False, # not needed for service-account-only client
|
||||
"directAccessGrantsEnabled": False,
|
||||
"protocol": "openid-connect",
|
||||
}
|
||||
@ -144,25 +144,25 @@ def test_create_confidential_client_and_obtain_token(live_app):
|
||||
|
||||
# Use the client to obtain its own token (client_credentials grant)
|
||||
tok_status, tok_resp = _client_credentials_token(live_app, client_id, client_secret)
|
||||
assert tok_status == 200, (
|
||||
f"client_credentials token returned HTTP {tok_status}: {tok_resp!r}"
|
||||
)
|
||||
assert (
|
||||
tok_status == 200
|
||||
), f"client_credentials token returned HTTP {tok_status}: {tok_resp!r}"
|
||||
access_token = tok_resp.get("access_token") if isinstance(tok_resp, dict) else None
|
||||
assert isinstance(access_token, str) and access_token.count(".") == 2, (
|
||||
f"client_credentials access_token not a JWT: {access_token!r}"
|
||||
)
|
||||
assert (
|
||||
isinstance(access_token, str) and access_token.count(".") == 2
|
||||
), f"client_credentials access_token not a JWT: {access_token!r}"
|
||||
|
||||
# Decode the JWT payload; assert azp matches the new client
|
||||
payload = json.loads(_b64url_decode(access_token.split(".")[1]))
|
||||
assert payload.get("azp") == client_id, (
|
||||
f"client_credentials JWT azp={payload.get('azp')!r} != client_id={client_id!r}"
|
||||
)
|
||||
assert (
|
||||
payload.get("azp") == client_id
|
||||
), f"client_credentials JWT azp={payload.get('azp')!r} != client_id={client_id!r}"
|
||||
# Service-account token does NOT carry a session-scoped user (azp + clientId differ from
|
||||
# admin-cli token). The presence of azp + iss == per-run-domain proves the issuance flow.
|
||||
expected_iss = f"https://{live_app}/realms/master"
|
||||
assert payload.get("iss") == expected_iss, (
|
||||
f"JWT iss={payload.get('iss')!r} != {expected_iss!r}"
|
||||
)
|
||||
assert (
|
||||
payload.get("iss") == expected_iss
|
||||
), f"JWT iss={payload.get('iss')!r} != {expected_iss!r}"
|
||||
finally:
|
||||
# Idempotent cleanup
|
||||
if cleanup_id:
|
||||
|
||||
@ -43,22 +43,20 @@ def test_password_grant_issues_valid_jwt(live_app):
|
||||
token = kc_admin.admin_token(live_app, password)
|
||||
|
||||
# Shape: a JWT is exactly 3 base64url segments
|
||||
assert isinstance(token, str) and token.count(".") == 2, (
|
||||
f"access_token does not look like a JWT (no 3 segments): len={len(token) if token else 0}"
|
||||
)
|
||||
assert (
|
||||
isinstance(token, str) and token.count(".") == 2
|
||||
), f"access_token does not look like a JWT (no 3 segments): len={len(token) if token else 0}"
|
||||
|
||||
payload = _decode_jwt_payload(token)
|
||||
|
||||
# iss = the issuer URL, must be the per-run domain's /realms/master endpoint
|
||||
expected_iss = f"https://{live_app}/realms/master"
|
||||
assert payload.get("iss") == expected_iss, (
|
||||
f"JWT iss claim {payload.get('iss')!r} != {expected_iss!r}"
|
||||
)
|
||||
assert (
|
||||
payload.get("iss") == expected_iss
|
||||
), f"JWT iss claim {payload.get('iss')!r} != {expected_iss!r}"
|
||||
|
||||
# azp = authorized party (which client requested this token)
|
||||
assert payload.get("azp") == "admin-cli", (
|
||||
f"JWT azp claim {payload.get('azp')!r} != 'admin-cli'"
|
||||
)
|
||||
assert payload.get("azp") == "admin-cli", f"JWT azp claim {payload.get('azp')!r} != 'admin-cli'"
|
||||
|
||||
# typ = token type
|
||||
assert payload.get("typ") == "Bearer", f"JWT typ claim {payload.get('typ')!r} != 'Bearer'"
|
||||
@ -70,6 +68,6 @@ def test_password_grant_issues_valid_jwt(live_app):
|
||||
|
||||
# iat (issued at) is also a standard claim
|
||||
iat = payload.get("iat")
|
||||
assert isinstance(iat, int) and iat <= time.time() + 60, (
|
||||
f"JWT iat {iat!r} not a reasonable past timestamp"
|
||||
)
|
||||
assert (
|
||||
isinstance(iat, int) and iat <= time.time() + 60
|
||||
), f"JWT iat {iat!r} not a reasonable past timestamp"
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user