fix(bridge): ignore pre-start trigger comments

2026-06-13 00:27:22 +00:00
parent ddefc96eef
commit 23f1861b7a
4 changed files with 91 additions and 5 deletions
--- a/bridge/bridge.py
+++ b/bridge/bridge.py
@ -37,6 +37,7 @@ import time
 import urllib.error
 import urllib.parse
 import urllib.request
 from datetime import datetime, timezone
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
 GITEA_API = os.environ.get("GITEA_API", "https://git.autonomic.zone/api/v1")
@ -81,6 +82,7 @@ GITEA_TOKEN = _read(os.environ["GITEA_TOKEN_FILE"])
 # Shared dedup across the poll + webhook paths: a comment id triggers at most one run.
 _PROCESSED: set = set()
 _PROCESSED_LOCK = threading.Lock()
 _PROCESS_STARTED_AT = datetime.now(timezone.utc)
 def log(*a):
@ -277,6 +279,23 @@ def _claim(comment_id) -> bool:
        return True
 def _is_preexisting_comment(comment) -> bool:
    """Report whether a trigger comment was created at or before this bridge process started.

    Such comments are treated as already-seen. This closes the reopened-PR hole where a PR was
    CLOSED during bridge startup, so its old `!testme` comments were never marked seen by the
    first poll pass; when that PR is later reopened, the poller must not replay those historical
    comments as fresh triggers.

    `comment` is a comment mapping (or None) whose `created_at` is expected to be an ISO-8601
    string; a trailing `Z` is accepted as UTC. Returns False when the timestamp is missing, not a
    string, or unparseable, so an unreadable comment is never silently classified as old.
    """
    created = (comment or {}).get("created_at")
    # A non-string value would raise AttributeError on .replace(); treat it as unparseable.
    if not created or not isinstance(created, str):
        return False
    try:
        created_at = datetime.fromisoformat(created.replace("Z", "+00:00"))
    except ValueError:
        return False
    # A stamp without an offset parses to a naive datetime, and comparing naive with the aware
    # _PROCESS_STARTED_AT raises TypeError. Interpret such stamps as UTC instead of crashing.
    if created_at.utcoffset() is None:
        created_at = created_at.replace(tzinfo=timezone.utc)
    return created_at <= _PROCESS_STARTED_AT
 def process_testme(full_name, owner, name, number, user, comment_id, source, quick=False):
    """Shared by both paths. Dedupes by comment id, checks authorization, resolves the PR head,
    triggers the build, comments the run link. Returns (run_url|None, reason)."""
@ -389,7 +408,7 @@ def poll_loop():
                    if not is_trigger:
                        continue
                    cid = c.get("id")
-                    if first:
+                    if first or _is_preexisting_comment(c):
                        _claim(cid)  # mark pre-existing comments seen; don't fire on startup
                        continue
                    user = (c.get("user") or {}).get("login", "")
--- a/machine-docs/JOURNAL-cfold.md
+++ b/machine-docs/JOURNAL-cfold.md
@ -343,3 +343,52 @@ Takeaways:
 - There is also likely a separate trigger dedupe problem: one `!testme` comment spawned runs `568`, `569`,
  and `570`. I did not broaden into a D1 investigation in this loop step because cfold M2 is already
  hard-blocked by Ghost's repeated upgrade failures, but the evidence is now recorded.
 ## 2026-06-13 — Root-caused Ghost triple-trigger replay; bridge fix authored with unit coverage
 Pulled the Adversary's latest cfold audit (`review(cfold)` `ddefc96`). It was not an M2 verdict or a
 finding; it confirmed the sweep is still unclaimable while teardown remains clean (`live_pr_apps=0`).
 I then closed out the duplicate-run side observation from the Ghost PR #3 retrigger.
 Evidence:
 ```bash
 $ ssh cc-ci 'docker logs --since "2026-06-13T00:07:30" --until "2026-06-13T00:08:30" c54c433972ac 2>&1'
 [poll] triggered build 568 for ghost@720faa0b (PR #3, comment 14029) by autonomic-bot
 [poll] triggered build 569 for ghost@720faa0b (PR #3, comment 14032) by autonomic-bot
 [poll] triggered build 570 for ghost@720faa0b (PR #3, comment 14497) by autonomic-bot
 $ ssh cc-ci 'docker service ps ccci-bridge_app --no-trunc'
 # single running replica only; no restart near the incident
 $ ssh cc-ci 'docker ps --format "{{.ID}} {{.Names}} {{.Status}}" | grep ccci-bridge || true'
 c54c433972ac ccci-bridge_app.1.u5msezm603izeyf7kizqxq97j Up 22 hours
 ```
 Conclusion: this was NOT one comment id deduped incorrectly inside a single process. It was the poller
 correctly treating THREE distinct comment ids as unseen after PR #3 was reopened:
 - `14029` and `14032` were historical `!testme` comments from when PR #3 had been open earlier.
 - PR #3 was closed when the current bridge process started, so those comments were not covered by the
  startup pass that marks pre-existing comments seen.
 - When PR #3 was reopened, the poller saw those old comments for the first time and replayed them, then
  also processed the fresh comment `14497`.
 Repo fix authored:
 - `bridge/bridge.py`: added `_PROCESS_STARTED_AT` and `_is_preexisting_comment()` so the poller now marks
  any trigger comment older than the current bridge process as already-seen, even if the PR was closed at
  startup and only becomes visible later via reopen.
 - `tests/unit/test_bridge_trigger.py`: added focused tests for pre-start vs post-start comment handling.
 Verification:
 ```bash
 $ nix shell nixpkgs#python311Packages.pytest -c pytest tests/unit/test_bridge_trigger.py -q
 ..........                                                               [100%]
 10 passed in 0.04s
 ```
 This fix addresses the replay hole exposed during cfold's Ghost retrigger. It does not change the cfold
 bottom line: Ghost's upgrade tier remains the lone M2 blocker, while custom discovery continues to pass.
--- a/machine-docs/STATUS-cfold.md
+++ b/machine-docs/STATUS-cfold.md
@ -60,8 +60,9 @@ Current work item:
  a cfold-neutral upgrade regression on the recipe/environment side
 - fresh follow-up probes now show the Ghost upgrade failure is not confined to PR #4 / PR #5: a reopened
  PR #3 at ref `720faa0b` also re-failed twice post-cfold (`568`, `569`) with the same shape
- one fresh `!testme` comment on Ghost PR #3 also spawned multiple runs (`568`, `569`, `570`), so there
+- the Ghost duplicate-trigger side issue is now root-caused in the bridge source: reopened PRs can replay
-  is likely a duplicate-trigger side issue to investigate separately from cfold itself
+  old pre-bridge-start `!testme` comments that were never seen during startup because the PR was closed
  at that time; repo fix landed locally and is being carried through deployment verification
 ### M2 baseline matrix (built from live PR heads + fresh post-cfold evidence)
@ -102,8 +103,12 @@ Current work item:
 - The fresh PR #3 rerun adds a second previously-green Ghost upgrade head that now fails the same way,
  so the blocker is broader than a single Ghost branch and still points away from cfold itself.
 - Side observation from the PR #3 retrigger: a single `!testme` comment at `2026-06-13T00:07:50Z` spawned
-  three new Ghost runs (`568`, `569`, `570`). `568` and `569` are already red with the same upgrade-only
+  three new Ghost runs (`568`, `569`, `570`). All three are now red with the same upgrade-only
-  failure; `570` was still in flight at the time of this status update.
+  failure.
 - Root cause of the triple-trigger: bridge logs show those three runs were tied to three distinct comment
  ids on the reopened PR (`14029`, `14032`, `14497`), not one comment processed three times. The poller
  replayed two historical `!testme` comments that predated the current bridge process because PR #3 was
  closed during bridge startup and only became visible to the poller after reopen.
 - Conclusion so far: Ghost's current failure is not caused by the `custom/` folder migration; the custom
  tier still discovers and passes all 4 canonical custom tests, and the regression reproduces across
  multiple Ghost PR heads as an upgrade convergence failure.
@ -111,6 +116,8 @@ Current work item:
 ### Fresh Adversary state
 - `REVIEW-cfold.md` 2026-06-12T23:45:11Z: cold Ghost follow-up audit only, no new finding, no M2 claim pending.
 - `REVIEW-cfold.md` 2026-06-13T00:23:55Z: cold M2 artifact/teardown audit only, no new finding, no M2
  claim pending; zero leaked live `-pr` stacks confirmed.
 ---
--- a/tests/unit/test_bridge_trigger.py
+++ b/tests/unit/test_bridge_trigger.py
@ -8,6 +8,7 @@ from __future__ import annotations
 import os
 import sys
 from datetime import timedelta
 # bridge.py reads HMAC/DRONE/GITEA secret FILES at import; point them at /dev/null (readable, empty)
 # so the import works in a unit context — parse_trigger doesn't use any of them.
@ -93,3 +94,13 @@ def test_find_existing_comment_matches_marker(monkeypatch):
 def test_find_existing_comment_none_when_absent(monkeypatch):
    """find_existing_comment yields None when the only listed comment has the body "hello"."""

    def _listing_stub(fn, n):
        # Fresh list on every call, exactly like the lambda it stands in for.
        return [{"id": 1, "body": "hello"}]

    monkeypatch.setattr(bridge, "list_comments", _listing_stub)
    found = bridge.find_existing_comment("org/repo", 5)
    assert found is None
 def test_preexisting_comment_from_before_bridge_start_is_ignored():
    """A comment stamped five minutes before process start is classified as pre-existing."""
    five_minutes_before = bridge._PROCESS_STARTED_AT - timedelta(minutes=5)
    # Render the stamp with a trailing "Z" instead of "+00:00".
    stamp = five_minutes_before.isoformat().replace("+00:00", "Z")
    comment = {"created_at": stamp}
    assert bridge._is_preexisting_comment(comment) is True
 def test_comment_after_bridge_start_is_not_treated_as_preexisting():
    """A comment stamped five minutes after process start is not classified as pre-existing."""
    five_minutes_after = bridge._PROCESS_STARTED_AT + timedelta(minutes=5)
    # Render the stamp with a trailing "Z" instead of "+00:00".
    stamp = five_minutes_after.isoformat().replace("+00:00", "Z")
    comment = {"created_at": stamp}
    assert bridge._is_preexisting_comment(comment) is False