# Captured once at import time; trigger comments created at or before this instant are treated as
# already-seen by the poller rather than as fresh triggers.
_PROCESS_STARTED_AT = datetime.now(timezone.utc)


def _is_preexisting_comment(comment) -> bool:
    """Treat trigger comments older than this bridge process as already-seen.

    This closes the reopened-PR hole where a PR was CLOSED during bridge startup, so its old
    `!testme` comments were never marked seen by the first poll pass; when that PR is later reopened,
    the poller must not replay those historical comments as fresh triggers.

    `comment` is the comment payload (a dict carrying an ISO-8601 `created_at` string) or None.

    Returns True only when `created_at` parses and is at or before `_PROCESS_STARTED_AT`. Returns
    False -- i.e. "not provably old" -- when the payload is not a dict, the field is missing/empty or
    not a string, or the timestamp cannot be parsed. Never raises on malformed input, so one odd
    payload cannot take down the caller's loop.
    """
    if not isinstance(comment, dict):
        return False
    created = comment.get("created_at")
    if not created or not isinstance(created, str):
        return False
    try:
        # Older Pythons reject a trailing "Z"; normalise it to an explicit UTC offset first.
        created_at = datetime.fromisoformat(created.replace("Z", "+00:00"))
    except ValueError:
        return False
    if created_at.tzinfo is None:
        # An offset-less timestamp parses as naive, and naive <= aware raises TypeError.
        # NOTE(review): assumes an offset-less value was meant as UTC -- confirm against the API.
        created_at = created_at.replace(tzinfo=timezone.utc)
    return created_at <= _PROCESS_STARTED_AT
Returns (run_url|None, reason).""" @@ -389,7 +408,7 @@ def poll_loop(): if not is_trigger: continue cid = c.get("id") - if first: + if first or _is_preexisting_comment(c): _claim(cid) # mark pre-existing comments seen; don't fire on startup continue user = (c.get("user") or {}).get("login", "") diff --git a/machine-docs/JOURNAL-cfold.md b/machine-docs/JOURNAL-cfold.md index 5348215..a2156b4 100644 --- a/machine-docs/JOURNAL-cfold.md +++ b/machine-docs/JOURNAL-cfold.md @@ -343,3 +343,52 @@ Takeaways: - There is also likely a separate trigger dedupe problem: one `!testme` comment spawned runs `568`, `569`, and `570`. I did not broaden into a D1 investigation in this loop step because cfold M2 is already hard-blocked by Ghost's repeated upgrade failures, but the evidence is now recorded. + +## 2026-06-13 — Root-caused Ghost triple-trigger replay; bridge fix authored with unit coverage + +Pulled the Adversary's latest cfold audit (`review(cfold)` `ddefc96`). It was not an M2 verdict or a +finding; it confirmed the sweep is still unclaimable while teardown remains clean (`live_pr_apps=0`). + +I then closed out the duplicate-run side observation from the Ghost PR #3 retrigger. + +Evidence: + +```bash +$ ssh cc-ci 'docker logs --since "2026-06-13T00:07:30" --until "2026-06-13T00:08:30" c54c433972ac 2>&1' +[poll] triggered build 568 for ghost@720faa0b (PR #3, comment 14029) by autonomic-bot +[poll] triggered build 569 for ghost@720faa0b (PR #3, comment 14032) by autonomic-bot +[poll] triggered build 570 for ghost@720faa0b (PR #3, comment 14497) by autonomic-bot + +$ ssh cc-ci 'docker service ps ccci-bridge_app --no-trunc' +# single running replica only; no restart near the incident + +$ ssh cc-ci 'docker ps --format "{{.ID}} {{.Names}} {{.Status}}" | grep ccci-bridge || true' +c54c433972ac ccci-bridge_app.1.u5msezm603izeyf7kizqxq97j Up 22 hours +``` + +Conclusion: this was NOT one comment id deduped incorrectly inside a single process. 
It was the poller +correctly treating THREE distinct comment ids as unseen after PR #3 was reopened: + +- `14029` and `14032` were historical `!testme` comments from when PR #3 had been open earlier. +- PR #3 was closed when the current bridge process started, so those comments were not covered by the + startup pass that marks pre-existing comments seen. +- When PR #3 was reopened, the poller saw those old comments for the first time and replayed them, then + also processed the fresh comment `14497`. + +Repo fix authored: + +- `bridge/bridge.py`: added `_PROCESS_STARTED_AT` and `_is_preexisting_comment()` so the poller now marks + any trigger comment older than the current bridge process as already-seen, even if the PR was closed at + startup and only becomes visible later via reopen. +- `tests/unit/test_bridge_trigger.py`: added focused tests for pre-start vs post-start comment handling. + +Verification: + +```bash +$ nix shell nixpkgs#python311Packages.pytest -c pytest tests/unit/test_bridge_trigger.py -q +.......... [100%] +10 passed in 0.04s +``` + +This fix addresses the replay hole exposed during cfold's Ghost retrigger. It does not change the cfold +bottom line: Ghost's upgrade tier remains the lone M2 blocker, while custom discovery continues to pass. 
diff --git a/machine-docs/STATUS-cfold.md b/machine-docs/STATUS-cfold.md index f328104..0086aea 100644 --- a/machine-docs/STATUS-cfold.md +++ b/machine-docs/STATUS-cfold.md @@ -60,8 +60,9 @@ Current work item: a cfold-neutral upgrade regression on the recipe/environment side - fresh follow-up probes now show the Ghost upgrade failure is not confined to PR #4 / PR #5: a reopened PR #3 at ref `720faa0b` also re-failed twice post-cfold (`568`, `569`) with the same shape -- one fresh `!testme` comment on Ghost PR #3 also spawned multiple runs (`568`, `569`, `570`), so there - is likely a duplicate-trigger side issue to investigate separately from cfold itself +- the Ghost duplicate-trigger side issue is now root-caused in the bridge source: reopened PRs can replay + old pre-bridge-start `!testme` comments that were never seen during startup because the PR was closed + at that time; repo fix landed locally and is being carried through deployment verification ### M2 baseline matrix (built from live PR heads + fresh post-cfold evidence) @@ -102,8 +103,12 @@ Current work item: - The fresh PR #3 rerun adds a second previously-green Ghost upgrade head that now fails the same way, so the blocker is broader than a single Ghost branch and still points away from cfold itself. - Side observation from the PR #3 retrigger: a single `!testme` comment at `2026-06-13T00:07:50Z` spawned - three new Ghost runs (`568`, `569`, `570`). `568` and `569` are already red with the same upgrade-only - failure; `570` was still in flight at the time of this status update. + three new Ghost runs (`568`, `569`, `570`). All three are now red with the same upgrade-only + failure. +- Root cause of the triple-trigger: bridge logs show those three runs were tied to three distinct comment + ids on the reopened PR (`14029`, `14032`, `14497`), not one comment processed three times. 
The poller + replayed two historical `!testme` comments that predated the current bridge process because PR #3 was + closed during bridge startup and only became visible to the poller after reopen. - Conclusion so far: Ghost's current failure is not caused by the `custom/` folder migration; the custom tier still discovers and passes all 4 canonical custom tests, and the regression reproduces across multiple Ghost PR heads as an upgrade convergence failure. @@ -111,6 +116,8 @@ Current work item: ### Fresh Adversary state - `REVIEW-cfold.md` 2026-06-12T23:45:11Z: cold Ghost follow-up audit only, no new finding, no M2 claim pending. +- `REVIEW-cfold.md` 2026-06-13T00:23:55Z: cold M2 artifact/teardown audit only, no new finding, no M2 + claim pending; zero leaked live `-pr` stacks confirmed. --- diff --git a/tests/unit/test_bridge_trigger.py b/tests/unit/test_bridge_trigger.py index 8b9ffbb..c0c65a6 100644 --- a/tests/unit/test_bridge_trigger.py +++ b/tests/unit/test_bridge_trigger.py @@ -8,6 +8,7 @@ from __future__ import annotations import os import sys +from datetime import timedelta # bridge.py reads HMAC/DRONE/GITEA secret FILES at import; point them at /dev/null (readable, empty) # so the import works in a unit context — parse_trigger doesn't use any of them. 
def test_find_existing_comment_none_when_absent(monkeypatch):
    # Stub the comment listing so the only comment on the PR is an unrelated "hello".
    def fake_list_comments(fn, n):
        return [{"id": 1, "body": "hello"}]

    monkeypatch.setattr(bridge, "list_comments", fake_list_comments)
    found = bridge.find_existing_comment("org/repo", 5)
    assert found is None


def test_preexisting_comment_from_before_bridge_start_is_ignored():
    # A comment stamped five minutes before the bridge process started, in Z-suffixed form.
    stamp = bridge._PROCESS_STARTED_AT - timedelta(minutes=5)
    payload = {"created_at": stamp.isoformat().replace("+00:00", "Z")}
    assert bridge._is_preexisting_comment(payload) is True


def test_comment_after_bridge_start_is_not_treated_as_preexisting():
    # A comment stamped five minutes after the bridge process started, in Z-suffixed form.
    stamp = bridge._PROCESS_STARTED_AT + timedelta(minutes=5)
    payload = {"created_at": stamp.isoformat().replace("+00:00", "Z")}
    assert bridge._is_preexisting_comment(payload) is False