Created
March 7, 2019 20:10
-
-
Save d-kuro/f9f2e4504a876f675f91e24983488dcb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// from: https://github.com/kubernetes/kubernetes/blob/87f9429087d4e31201412548517d36e83abebc8d/pkg/controller/cronjob/cronjob_controller.go#L190-L344
// syncOne reconciles a CronJob with a list of any Jobs that it created. | |
// All known jobs created by "sj" should be included in "js". | |
// The current time is passed in to facilitate testing. | |
// It has no receiver, to facilitate testing. | |
func syncOne(sj *batchv1beta1.CronJob, js []batchv1.Job, now time.Time, jc jobControlInterface, sjc sjControlInterface, recorder record.EventRecorder) { | |
nameForLog := fmt.Sprintf("%s/%s", sj.Namespace, sj.Name) | |
childrenJobs := make(map[types.UID]bool) | |
for _, j := range js { | |
childrenJobs[j.ObjectMeta.UID] = true | |
found := inActiveList(*sj, j.ObjectMeta.UID) | |
if !found && !IsJobFinished(&j) { | |
recorder.Eventf(sj, v1.EventTypeWarning, "UnexpectedJob", "Saw a job that the controller did not create or forgot: %v", j.Name) | |
// We found an unfinished job that has us as the parent, but it is not in our Active list. | |
// This could happen if we crashed right after creating the Job and before updating the status, | |
// or if our jobs list is newer than our sj status after a relist, or if someone intentionally created | |
// a job that they wanted us to adopt. | |
// TODO: maybe handle the adoption case? Concurrency/suspend rules will not apply in that case, obviously, since we can't | |
// stop users from creating jobs if they have permission. It is assumed that if a | |
// user has permission to create a job within a namespace, then they have permission to make any scheduledJob | |
// in the same namespace "adopt" that job. ReplicaSets and their Pods work the same way. | |
// TBS: how to update sj.Status.LastScheduleTime if the adopted job is newer than any we knew about? | |
} else if found && IsJobFinished(&j) { | |
deleteFromActiveList(sj, j.ObjectMeta.UID) | |
// TODO: event to call out failure vs success. | |
recorder.Eventf(sj, v1.EventTypeNormal, "SawCompletedJob", "Saw completed job: %v", j.Name) | |
} | |
} | |
// Remove any job reference from the active list if the corresponding job does not exist any more. | |
// Otherwise, the cronjob may be stuck in active mode forever even though there is no matching | |
// job running. | |
for _, j := range sj.Status.Active { | |
if found := childrenJobs[j.UID]; !found { | |
recorder.Eventf(sj, v1.EventTypeNormal, "MissingJob", "Active job went missing: %v", j.Name) | |
deleteFromActiveList(sj, j.UID) | |
} | |
} | |
updatedSJ, err := sjc.UpdateStatus(sj) | |
if err != nil { | |
klog.Errorf("Unable to update status for %s (rv = %s): %v", nameForLog, sj.ResourceVersion, err) | |
return | |
} | |
*sj = *updatedSJ | |
if sj.DeletionTimestamp != nil { | |
// The CronJob is being deleted. | |
// Don't do anything other than updating status. | |
return | |
} | |
if sj.Spec.Suspend != nil && *sj.Spec.Suspend { | |
klog.V(4).Infof("Not starting job for %s because it is suspended", nameForLog) | |
return | |
} | |
times, err := getRecentUnmetScheduleTimes(*sj, now) | |
if err != nil { | |
recorder.Eventf(sj, v1.EventTypeWarning, "FailedNeedsStart", "Cannot determine if job needs to be started: %v", err) | |
klog.Errorf("Cannot determine if %s needs to be started: %v", nameForLog, err) | |
return | |
} | |
// TODO: handle multiple unmet start times, from oldest to newest, updating status as needed. | |
if len(times) == 0 { | |
klog.V(4).Infof("No unmet start times for %s", nameForLog) | |
return | |
} | |
if len(times) > 1 { | |
klog.V(4).Infof("Multiple unmet start times for %s so only starting last one", nameForLog) | |
} | |
scheduledTime := times[len(times)-1] | |
tooLate := false | |
if sj.Spec.StartingDeadlineSeconds != nil { | |
tooLate = scheduledTime.Add(time.Second * time.Duration(*sj.Spec.StartingDeadlineSeconds)).Before(now) | |
} | |
if tooLate { | |
klog.V(4).Infof("Missed starting window for %s", nameForLog) | |
recorder.Eventf(sj, v1.EventTypeWarning, "MissSchedule", "Missed scheduled time to start a job: %s", scheduledTime.Format(time.RFC1123Z)) | |
// TODO: Since we don't set LastScheduleTime when not scheduling, we are going to keep noticing | |
// the miss every cycle. In order to avoid sending multiple events, and to avoid processing | |
// the sj again and again, we could set a Status.LastMissedTime when we notice a miss. | |
// Then, when we call getRecentUnmetScheduleTimes, we can take max(creationTimestamp, | |
// Status.LastScheduleTime, Status.LastMissedTime), and then so we won't generate | |
// and event the next time we process it, and also so the user looking at the status | |
// can see easily that there was a missed execution. | |
return | |
} | |
if sj.Spec.ConcurrencyPolicy == batchv1beta1.ForbidConcurrent && len(sj.Status.Active) > 0 { | |
// Regardless which source of information we use for the set of active jobs, | |
// there is some risk that we won't see an active job when there is one. | |
// (because we haven't seen the status update to the SJ or the created pod). | |
// So it is theoretically possible to have concurrency with Forbid. | |
// As long the as the invocations are "far enough apart in time", this usually won't happen. | |
// | |
// TODO: for Forbid, we could use the same name for every execution, as a lock. | |
// With replace, we could use a name that is deterministic per execution time. | |
// But that would mean that you could not inspect prior successes or failures of Forbid jobs. | |
klog.V(4).Infof("Not starting job for %s because of prior execution still running and concurrency policy is Forbid", nameForLog) | |
return | |
} | |
if sj.Spec.ConcurrencyPolicy == batchv1beta1.ReplaceConcurrent { | |
for _, j := range sj.Status.Active { | |
klog.V(4).Infof("Deleting job %s of %s that was still running at next scheduled start time", j.Name, nameForLog) | |
job, err := jc.GetJob(j.Namespace, j.Name) | |
if err != nil { | |
recorder.Eventf(sj, v1.EventTypeWarning, "FailedGet", "Get job: %v", err) | |
return | |
} | |
if !deleteJob(sj, job, jc, recorder) { | |
return | |
} | |
} | |
} | |
jobReq, err := getJobFromTemplate(sj, scheduledTime) | |
if err != nil { | |
klog.Errorf("Unable to make Job from template in %s: %v", nameForLog, err) | |
return | |
} | |
jobResp, err := jc.CreateJob(sj.Namespace, jobReq) | |
if err != nil { | |
recorder.Eventf(sj, v1.EventTypeWarning, "FailedCreate", "Error creating job: %v", err) | |
return | |
} | |
klog.V(4).Infof("Created Job %s for %s", jobResp.Name, nameForLog) | |
recorder.Eventf(sj, v1.EventTypeNormal, "SuccessfulCreate", "Created job %v", jobResp.Name) | |
// ------------------------------------------------------------------ // | |
// If this process restarts at this point (after posting a job, but | |
// before updating the status), then we might try to start the job on | |
// the next time. Actually, if we re-list the SJs and Jobs on the next | |
// iteration of syncAll, we might not see our own status update, and | |
// then post one again. So, we need to use the job name as a lock to | |
// prevent us from making the job twice (name the job with hash of its | |
// scheduled time). | |
// Add the just-started job to the status list. | |
ref, err := getRef(jobResp) | |
if err != nil { | |
klog.V(2).Infof("Unable to make object reference for job for %s", nameForLog) | |
} else { | |
sj.Status.Active = append(sj.Status.Active, *ref) | |
} | |
sj.Status.LastScheduleTime = &metav1.Time{Time: scheduledTime} | |
if _, err := sjc.UpdateStatus(sj); err != nil { | |
klog.Infof("Unable to update status for %s (rv = %s): %v", nameForLog, sj.ResourceVersion, err) | |
} | |
return | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment