Retry builds on buildkite failure and support rebuilds

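The CI service went down when buildkite was having an issue, because a
failed build trigger was fatal.  Instead of crashing, we should log the
failure and retry the trigger every 30 seconds until it succeeds.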

Also, support retesting using the "rebuild" button on buildkite.  This
should be rarely used, but it is helpful.

Change-Id: Ib8cdd2459988ef49b9da77144d324628c35fff58
diff --git a/tools/ci/buildkite_gerrit_trigger.go b/tools/ci/buildkite_gerrit_trigger.go
index eb31a92..7d789f0 100644
--- a/tools/ci/buildkite_gerrit_trigger.go
+++ b/tools/ci/buildkite_gerrit_trigger.go
@@ -14,6 +14,7 @@
 	"regexp"
 	"strings"
 	"sync"
+	"time"
 )
 
 type Commit struct {
@@ -87,6 +88,7 @@
 type EventInfo struct {
 	Author         *User      `json:"author"`
 	Uploader       *User      `json:"uploader"`
+	Reviewer       *User      `json:"reviewer"`
 	Submitter      User       `json:"submitter,omitempty"`
 	Approvals      []Approval `json:"approvals,omitempty"`
 	Comment        string     `json:"comment,omitempty"`
@@ -124,95 +126,108 @@
 	log.Printf("Got a matching change of %s %s %d,%d\n",
 		eventInfo.Change.ID, eventInfo.PatchSet.Revision, eventInfo.Change.Number, eventInfo.PatchSet.Number)
 
-	// Triggering a build creates a UUID, and we can see events back from the webhook before the command returns.  Lock across the command so nothing access Commits while the new UUID is being added.
-	s.mu.Lock()
+	for {
 
-	var user *User
-	if eventInfo.Author != nil {
-		user = eventInfo.Author
-	} else if eventInfo.Uploader != nil {
-		user = eventInfo.Uploader
-	} else {
-		log.Fatalf("Failed to find Author or Uploader")
-	}
+		// Triggering a build creates a UUID, and we can see events back from the webhook before the command returns.  Lock across the command so nothing accesses Commits while the new UUID is being added.
+		s.mu.Lock()
 
-	// Trigger the build.
-	if build, _, err := client.Builds.Create(
-		"spartan-robotics", "971-robot-code", &buildkite.CreateBuild{
-			Commit: eventInfo.PatchSet.Revision,
-			Branch: eventInfo.Change.ID,
-			Author: buildkite.Author{
-				Name:  user.Name,
-				Email: user.Email,
-			},
-			Env: map[string]string{
-				"GERRIT_CHANGE_NUMBER": fmt.Sprintf("%d", eventInfo.Change.Number),
-				"GERRIT_PATCH_NUMBER":  fmt.Sprintf("%d", eventInfo.PatchSet.Number),
-			},
-		}); err == nil {
-
-		if build.ID != nil {
-			log.Printf("Scheduled build %s\n", *build.ID)
-			s.Commits[*build.ID] = Commit{
-				Sha1:         eventInfo.PatchSet.Revision,
-				ChangeId:     eventInfo.Change.ID,
-				ChangeNumber: eventInfo.Change.Number,
-				Patchset:     eventInfo.PatchSet.Number,
-			}
-		}
-		s.mu.Unlock()
-
-		if data, err := json.MarshalIndent(build, "", "\t"); err != nil {
-			log.Fatalf("json encode failed: %s", err)
+		var user *User
+		if eventInfo.Author != nil {
+			user = eventInfo.Author
+		} else if eventInfo.Uploader != nil {
+			user = eventInfo.Uploader
 		} else {
-			log.Printf("%s\n", string(data))
+			log.Fatalf("Failed to find Author or Uploader")
 		}
 
-		// Now remove the verified from Gerrit and post the link.
-		cmd := exec.Command("ssh",
-			"-p",
-			"29418",
-			"-i",
-			s.Key,
-			s.User+"@software.frc971.org",
-			"gerrit",
-			"review",
-			"-m",
-			fmt.Sprintf("\"Build Started: %s\"", *build.WebURL),
-			"--verified",
-			"0",
-			fmt.Sprintf("%d,%d", eventInfo.Change.Number, eventInfo.PatchSet.Number))
+		// Trigger the build.
+		if build, _, err := client.Builds.Create(
+			"spartan-robotics", "971-robot-code", &buildkite.CreateBuild{
+				Commit: eventInfo.PatchSet.Revision,
+				Branch: eventInfo.Change.ID,
+				Author: buildkite.Author{
+					Name:  user.Name,
+					Email: user.Email,
+				},
+				Env: map[string]string{
+					"GERRIT_CHANGE_NUMBER": fmt.Sprintf("%d", eventInfo.Change.Number),
+					"GERRIT_PATCH_NUMBER":  fmt.Sprintf("%d", eventInfo.PatchSet.Number),
+				},
+			}); err == nil {
 
-		log.Printf("Running 'ssh -p 29418 -i %s %s@software.frc971.org gerrit review -m '\"Build Started: %s\"' --verified 0 %d,%d' and waiting for it to finish...",
-			s.Key, s.User,
-			*build.WebURL, eventInfo.Change.Number, eventInfo.PatchSet.Number)
-		if err := cmd.Run(); err != nil {
-			log.Printf("Command failed with error: %v", err)
+			if build.ID != nil {
+				log.Printf("Scheduled build %s\n", *build.ID)
+				s.Commits[*build.ID] = Commit{
+					Sha1:         eventInfo.PatchSet.Revision,
+					ChangeId:     eventInfo.Change.ID,
+					ChangeNumber: eventInfo.Change.Number,
+					Patchset:     eventInfo.PatchSet.Number,
+				}
+			}
+			s.mu.Unlock()
+
+			if data, err := json.MarshalIndent(build, "", "\t"); err != nil {
+				log.Fatalf("json encode failed: %s", err)
+			} else {
+				log.Printf("%s\n", string(data))
+			}
+
+			// Now remove the verified from Gerrit and post the link.
+			cmd := exec.Command("ssh",
+				"-p",
+				"29418",
+				"-i",
+				s.Key,
+				s.User+"@software.frc971.org",
+				"gerrit",
+				"review",
+				"-m",
+				fmt.Sprintf("\"Build Started: %s\"", *build.WebURL),
+				"--verified",
+				"0",
+				fmt.Sprintf("%d,%d", eventInfo.Change.Number, eventInfo.PatchSet.Number))
+
+			log.Printf("Running 'ssh -p 29418 -i %s %s@software.frc971.org gerrit review -m '\"Build Started: %s\"' --verified 0 %d,%d' and waiting for it to finish...",
+				s.Key, s.User,
+				*build.WebURL, eventInfo.Change.Number, eventInfo.PatchSet.Number)
+			if err := cmd.Run(); err != nil {
+				log.Printf("Command failed with error: %v", err)
+			}
+			return
+		} else {
+			s.mu.Unlock()
+			log.Printf("Failed to trigger build: %s", err)
+			log.Printf("Trying again in 30 seconds")
+			time.Sleep(30 * time.Second)
 		}
-	} else {
-		s.mu.Unlock()
-		log.Fatalf("Failed to trigger build: %s", err)
 	}
 
 }
 
+type BuildkiteChange struct {
+	ID     string `json:"id,omitempty"`
+	Number int    `json:"number,omitempty"`
+	URL    string `json:"url,omitempty"`
+}
+
 type Build struct {
-	ID           string `json:"id,omitempty"`
-	GraphqlId    string `json:"graphql_id,omitempty"`
-	URL          string `json:"url,omitempty"`
-	WebURL       string `json:"web_url,omitempty"`
-	Number       int    `json:"number,omitempty"`
-	State        string `json:"state,omitempty"`
-	Blocked      bool   `json:"blocked,omitempty"`
-	BlockedState string `json:"blocked_state,omitempty"`
-	Message      string `json:"message,omitempty"`
-	Commit       string `json:"commit"`
-	Branch       string `json:"branch"`
-	Source       string `json:"source,omitempty"`
-	CreatedAt    string `json:"created_at,omitempty"`
-	ScheduledAt  string `json:"scheduled_at,omitempty"`
-	StartedAt    string `json:"started_at,omitempty"`
-	FinishedAt   string `json:"finished_at,omitempty"`
+	ID           string           `json:"id,omitempty"`
+	GraphqlId    string           `json:"graphql_id,omitempty"`
+	URL          string           `json:"url,omitempty"`
+	WebURL       string           `json:"web_url,omitempty"`
+	Number       int              `json:"number,omitempty"`
+	State        string           `json:"state,omitempty"`
+	Blocked      bool             `json:"blocked,omitempty"`
+	BlockedState string           `json:"blocked_state,omitempty"`
+	Message      string           `json:"message,omitempty"`
+	Commit       string           `json:"commit"`
+	Branch       string           `json:"branch"`
+	Source       string           `json:"source,omitempty"`
+	CreatedAt    string           `json:"created_at,omitempty"`
+	ScheduledAt  string           `json:"scheduled_at,omitempty"`
+	StartedAt    string           `json:"started_at,omitempty"`
+	FinishedAt   string           `json:"finished_at,omitempty"`
+	RebuiltFrom  *BuildkiteChange `json:"rebuilt_from,omitempty"`
 }
 
 type BuildkiteWebhook struct {
@@ -252,13 +267,45 @@
 
 		// We've successfully received the webhook.  Spawn a goroutine in case the mutex is blocked so we don't block this thread.
 		f := func() {
-			if webhook.Event == "build.finished" {
+			if webhook.Event == "build.running" {
+				if webhook.Build.RebuiltFrom != nil {
+					s.mu.Lock()
+					if c, ok := s.Commits[webhook.Build.RebuiltFrom.ID]; ok {
+						log.Printf("Detected a rebuild of %s for build %s", webhook.Build.RebuiltFrom.ID, webhook.Build.ID)
+						s.Commits[webhook.Build.ID] = c
+
+						// And now remove the vote since the rebuild started.
+						cmd := exec.Command("ssh",
+							"-p",
+							"29418",
+							"-i",
+							s.Key,
+							s.User+"@software.frc971.org",
+							"gerrit",
+							"review",
+							"-m",
+							fmt.Sprintf("\"Build Started: %s\"", webhook.Build.WebURL),
+							"--verified",
+							"0",
+							fmt.Sprintf("%d,%d", c.ChangeNumber, c.Patchset))
+
+						log.Printf("Running 'ssh -p 29418 -i %s %s@software.frc971.org gerrit review -m '\"Build Started: %s\"' --verified 0 %d,%d' and waiting for it to finish...",
+							s.Key, s.User,
+							webhook.Build.WebURL, c.ChangeNumber, c.Patchset)
+						if err := cmd.Run(); err != nil {
+							log.Printf("Command failed with error: %v", err)
+						}
+					}
+					s.mu.Unlock()
+				}
+			} else if webhook.Event == "build.finished" {
 				var commit *Commit
 				{
 					s.mu.Lock()
 					if c, ok := s.Commits[webhook.Build.ID]; ok {
 						commit = &c
-						delete(s.Commits, webhook.Build.ID)
+						// While we *should* delete this now from the map, that will prevent rebuilds from being mapped correctly.
+						// Instead, leave it in the map indefinitely.  For the number of builds we do, it should take quite a while to use enough ram to matter.  If that becomes an issue, we can either clean the list when a commit is submitted, or keep a fixed number of builds in the list and expire the oldest ones when it is time.
 					}
 					s.mu.Unlock()
 				}