fix(bridge): ignore pre-start trigger comments
Some checks failed
continuous-integration/drone/push Build is failing

This commit is contained in:
autonomic-bot
2026-06-13 00:27:22 +00:00
parent ddefc96eef
commit 23f1861b7a
4 changed files with 91 additions and 5 deletions

View File

@ -37,6 +37,7 @@ import time
import urllib.error import urllib.error
import urllib.parse import urllib.parse
import urllib.request import urllib.request
from datetime import datetime, timezone
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
GITEA_API = os.environ.get("GITEA_API", "https://git.autonomic.zone/api/v1") GITEA_API = os.environ.get("GITEA_API", "https://git.autonomic.zone/api/v1")
@ -81,6 +82,7 @@ GITEA_TOKEN = _read(os.environ["GITEA_TOKEN_FILE"])
# Shared dedup across the poll + webhook paths: a comment id triggers at most one run. # Shared dedup across the poll + webhook paths: a comment id triggers at most one run.
_PROCESSED: set = set() _PROCESSED: set = set()
_PROCESSED_LOCK = threading.Lock() _PROCESSED_LOCK = threading.Lock()
_PROCESS_STARTED_AT = datetime.now(timezone.utc)
def log(*a): def log(*a):
@ -277,6 +279,23 @@ def _claim(comment_id) -> bool:
return True return True
def _is_preexisting_comment(comment) -> bool:
    """Return True when *comment* was created at or before this bridge process started.

    Trigger comments that predate the bridge process are treated as already-seen.
    This closes the reopened-PR hole where a PR was CLOSED during bridge startup, so its old
    `!testme` comments were never marked seen by the first poll pass; when that PR is later
    reopened, the poller must not replay those historical comments as fresh triggers.

    Args:
        comment: Comment mapping; only its ``created_at`` value is read. ``None`` or an
            empty mapping is tolerated.

    Returns:
        True if ``created_at`` parses as an ISO-8601 timestamp that is not later than
        ``_PROCESS_STARTED_AT``. False when the field is missing, is not a string, or cannot
        be parsed, so such comments are not classified as pre-existing.
    """
    created = (comment or {}).get("created_at")
    # A non-string value has no ISO form to parse; treat it like a missing timestamp
    # instead of letting `.replace`/`.endswith` raise AttributeError.
    if not created or not isinstance(created, str):
        return False
    # datetime.fromisoformat() only understands a trailing "Z" from Python 3.11 on, so
    # rewrite that suffix (and only the suffix) into an explicit UTC offset first.
    if created.endswith(("Z", "z")):
        created = created[:-1] + "+00:00"
    try:
        created_at = datetime.fromisoformat(created)
    except ValueError:
        return False
    if created_at.utcoffset() is None:
        # Comparing a naive datetime with the timezone-aware _PROCESS_STARTED_AT raises
        # TypeError. NOTE(review): an offset-less timestamp is assumed to be UTC here.
        created_at = created_at.replace(tzinfo=timezone.utc)
    return created_at <= _PROCESS_STARTED_AT
def process_testme(full_name, owner, name, number, user, comment_id, source, quick=False): def process_testme(full_name, owner, name, number, user, comment_id, source, quick=False):
"""Shared by both paths. Dedupes by comment id, checks authorization, resolves the PR head, """Shared by both paths. Dedupes by comment id, checks authorization, resolves the PR head,
triggers the build, comments the run link. Returns (run_url|None, reason).""" triggers the build, comments the run link. Returns (run_url|None, reason)."""
@ -389,7 +408,7 @@ def poll_loop():
if not is_trigger: if not is_trigger:
continue continue
cid = c.get("id") cid = c.get("id")
if first: if first or _is_preexisting_comment(c):
_claim(cid) # mark pre-existing comments seen; don't fire on startup _claim(cid) # mark pre-existing comments seen; don't fire on startup
continue continue
user = (c.get("user") or {}).get("login", "") user = (c.get("user") or {}).get("login", "")

View File

@ -343,3 +343,52 @@ Takeaways:
- There is also likely a separate trigger dedupe problem: one `!testme` comment spawned runs `568`, `569`, - There is also likely a separate trigger dedupe problem: one `!testme` comment spawned runs `568`, `569`,
and `570`. I did not broaden into a D1 investigation in this loop step because cfold M2 is already and `570`. I did not broaden into a D1 investigation in this loop step because cfold M2 is already
hard-blocked by Ghost's repeated upgrade failures, but the evidence is now recorded. hard-blocked by Ghost's repeated upgrade failures, but the evidence is now recorded.
## 2026-06-13 — Root-caused Ghost triple-trigger replay; bridge fix authored with unit coverage
Pulled the Adversary's latest cfold audit (`review(cfold)` `ddefc96`). It was not an M2 verdict or a
finding; it confirmed the sweep is still unclaimable while teardown remains clean (`live_pr_apps=0`).
I then closed out the duplicate-run side observation from the Ghost PR #3 retrigger.
Evidence:
```bash
$ ssh cc-ci 'docker logs --since "2026-06-13T00:07:30" --until "2026-06-13T00:08:30" c54c433972ac 2>&1'
[poll] triggered build 568 for ghost@720faa0b (PR #3, comment 14029) by autonomic-bot
[poll] triggered build 569 for ghost@720faa0b (PR #3, comment 14032) by autonomic-bot
[poll] triggered build 570 for ghost@720faa0b (PR #3, comment 14497) by autonomic-bot
$ ssh cc-ci 'docker service ps ccci-bridge_app --no-trunc'
# single running replica only; no restart near the incident
$ ssh cc-ci 'docker ps --format "{{.ID}} {{.Names}} {{.Status}}" | grep ccci-bridge || true'
c54c433972ac ccci-bridge_app.1.u5msezm603izeyf7kizqxq97j Up 22 hours
```
Conclusion: this was NOT one comment id deduped incorrectly inside a single process. It was the poller
correctly treating THREE distinct comment ids as unseen after PR #3 was reopened:
- `14029` and `14032` were historical `!testme` comments from when PR #3 had been open earlier.
- PR #3 was closed when the current bridge process started, so those comments were not covered by the
startup pass that marks pre-existing comments seen.
- When PR #3 was reopened, the poller saw those old comments for the first time and replayed them, then
also processed the fresh comment `14497`.
Repo fix authored:
- `bridge/bridge.py`: added `_PROCESS_STARTED_AT` and `_is_preexisting_comment()` so the poller now marks
any trigger comment created at or before the current bridge process's start time as already-seen, even
if the PR was closed at startup and only becomes visible later via reopen.
- `tests/unit/test_bridge_trigger.py`: added focused tests for pre-start vs post-start comment handling.
Verification:
```bash
$ nix shell nixpkgs#python311Packages.pytest -c pytest tests/unit/test_bridge_trigger.py -q
.......... [100%]
10 passed in 0.04s
```
This fix addresses the replay hole exposed during cfold's Ghost retrigger. It does not change the cfold
bottom line: Ghost's upgrade tier remains the lone M2 blocker, while custom discovery continues to pass.

View File

@ -60,8 +60,9 @@ Current work item:
a cfold-neutral upgrade regression on the recipe/environment side a cfold-neutral upgrade regression on the recipe/environment side
- fresh follow-up probes now show the Ghost upgrade failure is not confined to PR #4 / PR #5: a reopened - fresh follow-up probes now show the Ghost upgrade failure is not confined to PR #4 / PR #5: a reopened
PR #3 at ref `720faa0b` also re-failed twice post-cfold (`568`, `569`) with the same shape PR #3 at ref `720faa0b` also re-failed twice post-cfold (`568`, `569`) with the same shape
- one fresh `!testme` comment on Ghost PR #3 also spawned multiple runs (`568`, `569`, `570`), so there - the Ghost duplicate-trigger side issue is now root-caused in the bridge source: reopened PRs can replay
is likely a duplicate-trigger side issue to investigate separately from cfold itself old pre-bridge-start `!testme` comments that were never seen during startup because the PR was closed
at that time; repo fix landed locally and is being carried through deployment verification
### M2 baseline matrix (built from live PR heads + fresh post-cfold evidence) ### M2 baseline matrix (built from live PR heads + fresh post-cfold evidence)
@ -102,8 +103,12 @@ Current work item:
- The fresh PR #3 rerun adds a second previously-green Ghost upgrade head that now fails the same way, - The fresh PR #3 rerun adds a second previously-green Ghost upgrade head that now fails the same way,
so the blocker is broader than a single Ghost branch and still points away from cfold itself. so the blocker is broader than a single Ghost branch and still points away from cfold itself.
- Side observation from the PR #3 retrigger: a single `!testme` comment at `2026-06-13T00:07:50Z` spawned - Side observation from the PR #3 retrigger: a single `!testme` comment at `2026-06-13T00:07:50Z` spawned
three new Ghost runs (`568`, `569`, `570`). `568` and `569` are already red with the same upgrade-only three new Ghost runs (`568`, `569`, `570`). All three are now red with the same upgrade-only
failure; `570` was still in flight at the time of this status update. failure.
- Root cause of the triple-trigger: bridge logs show those three runs were tied to three distinct comment
ids on the reopened PR (`14029`, `14032`, `14497`), not one comment processed three times. The poller
replayed two historical `!testme` comments that predated the current bridge process because PR #3 was
closed during bridge startup and only became visible to the poller after reopen.
- Conclusion so far: Ghost's current failure is not caused by the `custom/` folder migration; the custom - Conclusion so far: Ghost's current failure is not caused by the `custom/` folder migration; the custom
tier still discovers and passes all 4 canonical custom tests, and the regression reproduces across tier still discovers and passes all 4 canonical custom tests, and the regression reproduces across
multiple Ghost PR heads as an upgrade convergence failure. multiple Ghost PR heads as an upgrade convergence failure.
@ -111,6 +116,8 @@ Current work item:
### Fresh Adversary state ### Fresh Adversary state
- `REVIEW-cfold.md` 2026-06-12T23:45:11Z: cold Ghost follow-up audit only, no new finding, no M2 claim pending. - `REVIEW-cfold.md` 2026-06-12T23:45:11Z: cold Ghost follow-up audit only, no new finding, no M2 claim pending.
- `REVIEW-cfold.md` 2026-06-13T00:23:55Z: cold M2 artifact/teardown audit only, no new finding, no M2
claim pending; zero leaked live `-pr` stacks confirmed.
--- ---

View File

@ -8,6 +8,7 @@ from __future__ import annotations
import os import os
import sys import sys
from datetime import timedelta
# bridge.py reads HMAC/DRONE/GITEA secret FILES at import; point them at /dev/null (readable, empty) # bridge.py reads HMAC/DRONE/GITEA secret FILES at import; point them at /dev/null (readable, empty)
# so the import works in a unit context — parse_trigger doesn't use any of them. # so the import works in a unit context — parse_trigger doesn't use any of them.
@ -93,3 +94,13 @@ def test_find_existing_comment_matches_marker(monkeypatch):
def test_find_existing_comment_none_when_absent(monkeypatch):
    """find_existing_comment returns None for the single "hello" comment served here."""

    def _only_hello_comment(fn, n):
        # Stand-in for bridge.list_comments: always serves one plain comment.
        return [{"id": 1, "body": "hello"}]

    monkeypatch.setattr(bridge, "list_comments", _only_hello_comment)
    found = bridge.find_existing_comment("org/repo", 5)
    assert found is None
def test_preexisting_comment_from_before_bridge_start_is_ignored():
    """A comment stamped five minutes before bridge start is classified as pre-existing."""
    before_start = bridge._PROCESS_STARTED_AT - timedelta(minutes=5)
    # Render the timestamp with a trailing "Z" in place of the "+00:00" offset.
    stamp = before_start.isoformat().replace("+00:00", "Z")
    comment = {"created_at": stamp}
    assert bridge._is_preexisting_comment(comment) is True
def test_comment_after_bridge_start_is_not_treated_as_preexisting():
    """A comment stamped five minutes after bridge start is not classified as pre-existing."""
    after_start = bridge._PROCESS_STARTED_AT + timedelta(minutes=5)
    # Render the timestamp with a trailing "Z" in place of the "+00:00" offset.
    stamp = after_start.isoformat().replace("+00:00", "Z")
    comment = {"created_at": stamp}
    assert bridge._is_preexisting_comment(comment) is False