diff --git a/README.md b/README.md index 3d5ada4..1a46df9 100644 --- a/README.md +++ b/README.md @@ -1,32 +1,121 @@ # Jobwatch -## (C) 2022, Michael Höß, MIT-Licensed +> (C) 2022, Michael Höß, hoess@gmx.net, MIT-Licensed -### What is jobwatch +## What is jobwatch If you just want to execute a simple shell-script via cron every couple of hour, often only a few lines of code are required. -But if you want to ensure the script really run, in the required interval, completed -without error, you have to add a lot of plumping code. +But if you want to ensure +- the script really run +- in the required interval +- completed without error/with the expected exitcode +- has the expected output +- only one instances executes in parallel + +and want all this to be monitored with your CheckMK, you have to add a lot of plumbing code. jobwatch helps to migitate this problem. It wraps the execution of your script, and does -all the rest for you, by providing output, which can be fed to CheckMK then -Agent-Plugin-Mechanism. +all the rest for you, by providing output, which can be fed to CheckMK by the +Agent-Plugin-mechanism. jobwatch -- checks for required exit-codes -- searches through the output of your script via regex to classify certain - keywords as WARN or CRIT -- let you define the required run-interval +- checks for required _exit-codes_, and maps them to _OK, WARN, CRIT, UNKN (0-3)_ +- _searches_ through the _console-output_ of your script via _regex_ to classify certain + keywords as OK, WARN or CRIT (0, 1, 2) +- sends previous regex matches to _logwatch_ +- prevents running your job in multiple instances (previous job took longer than expected) +- let you define the required run-intervals -Simply add a .job-file into /etc/jobwatch.d where you defined all of this, +Simply add a .job-file into /etc/jobwatch.d where you defines all of this, call you script via jobwatch -j job in the crontab and you are done. -TBD: -- document deployment -- document job-file (see included sample) -- feed logoutput to CheckMK +## Sample-Job `/etc/jobwatch.d/sample.job.yml`: +``` +cmd: /usr/bin/w +args: + - -i +exitcode_map: + - from: 23 # Map + to: 3 + - from: -1 # Map all nonzero-exit codes to 1=WARN + to: 1 + - from: 0 # Map exitcode 0 also to 1=WARN + to: 1 +log_matches: + - regex: .*192.168.*.* + state: 1 + - regex: ".*tty[0-9].+-.*" # 2=CRIT when sombody is logged on the console + state: 2 + alt_msg: "%v -> a user is logged in at console" + - regex: .+ # Include All other lines as OK + state: 0 +hide_output: False # Dont't hide output, but pass through +last_run_warn: # How often is this job to be expected to be executed + val: 8 + unit: "h" +last_run_crit: + val: 16 + ``` + +> Note: job-names may only consists of chars `a-z`,`0-9` and `- ` + +The config-config dir is When run as +- root: `/etc/jobwatch.d` +- user `$HOME/etc/jobwatch.d` + +## Deploying jobwatch +- Compile the go program, if required, the src-dir: `go get .;go build .` +- `cp jobwatch /usr/local/bin` +- `ln -s /usr/local/bin/jobwatch /usr/lib/check_agent/plugins` + +On the check-mk server just deploy the provede .mkp, no further +config there is currently needed + +## Invoking Jobwatch + +``` +jobwatch -h +Usage of jobwatch: + -d Debug + -i string + JobId Instance. Default '' + -j string + JobId. reads $jobDir/$job.job.yml Default '' + -jd string + JobDir. Default: $HOME/etc/jobwatch.d /etc/jobwatch.d' + -- After this parameter every further parameter is passed to the + called program + +Without any parameters CheckMK-Agent-Output is generated +``` + +Example1 from an crontab +``` +10 9 * * 0 root /usr/local/bin/jobwatch -j bu-dupl +``` + +Example2 from an crontab, here we use the same job for +different instances. (In this e.g. we have backup-sets) +``` +10 5 * * 0,3 root /usr/local/bin/jobwatch -j bu-borg -i b01-main -- /opt/borg/B01_main.borgjob +10 5 * * 1,4 root /usr/local/bin/jobwatch -j bu-borg -i b10-dvp -- /opt/borg/B10_dvp.borgjob +``` + +### Instances/Multiuser + +See "Invoking jobwatch above" + +> Note instance-names have the same restrictions as job-names + +When using instances, the final job name in the output is `[userid]/[job-name]_[instance-name]`, otherwise only `[userid]/[job-name]` + + + + +## TBD: - redirect script-output to a logfile via Config +- job-timeout - more metrics (last-success-ful run ) diff --git a/app/go.mod b/app/go.mod index 633de7c..c71474c 100644 --- a/app/go.mod +++ b/app/go.mod @@ -2,4 +2,7 @@ module jobwatch go 1.18 -require gopkg.in/yaml.v2 v2.4.0 +require ( + github.com/nightlyone/lockfile v1.0.0 + gopkg.in/yaml.v2 v2.4.0 +) diff --git a/app/go.sum b/app/go.sum index 7534661..9f40921 100644 --- a/app/go.sum +++ b/app/go.sum @@ -1,3 +1,5 @@ +github.com/nightlyone/lockfile v1.0.0 h1:RHep2cFKK4PonZJDdEl4GmkabuhbsRMgk/k3uAmxBiA= +github.com/nightlyone/lockfile v1.0.0/go.mod h1:rywoIealpdNse2r832aiD9jRk8ErCatROs6LzC841CI= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= diff --git a/app/main.go b/app/main.go index a8e358a..a95dbc3 100644 --- a/app/main.go +++ b/app/main.go @@ -21,15 +21,15 @@ import ( // Jobs root-idfile -func evalOutput(job types.Job, output []string) (types.CMKState, error) { +func evalOutput(job types.Job, output []string) (types.CMKState, []state.LogEntry, error) { var res types.CMKState = types.OK - var logLines []string + var logLines []state.LogEntry var rcs []*regexp.Regexp for _, m := range job.LogMatches { r, err := regexp.Compile(m.Regex) if err != nil { - return 0, err + return 0, logLines, err } else { rcs = append(rcs, r) } @@ -47,20 +47,12 @@ func evalOutput(job types.Job, output []string) (types.CMKState, error) { v = fmt.Sprintf(m.AltMsg, v) } - var p = "" - switch s := m.State; s { - case types.OK: - p = "O" - case types.WARN: - p = "W" - case types.CRIT: - p = "C" - case types.UNKNOWN: - p = "U" - } - p += ": " - logLines = append(logLines, time.Now().Format("2006-01-2 15:04:05")+" "+p+v) - + var le state.LogEntry + le.State = state.ToLogState(m.State) + le.Date = time.Now().Format("2006-01-2 15:04:05") + le.Text = v + logLines = append(logLines, le) + break } } } @@ -68,10 +60,10 @@ func evalOutput(job types.Job, output []string) (types.CMKState, error) { log.Printf("- Matched LogLine %v\n", oln) } - return res, nil + return res, logLines, nil } -func runJob(job types.Job) (state.State, error) { +func runJob(job types.Job) (state.State, []state.LogEntry, error) { st := state.StateFromJob(job) st.LastExitCode = -1 @@ -96,7 +88,7 @@ func runJob(job types.Job) (state.State, error) { stcode = exitError.ProcessState.ExitCode() } else { // cmd not found etc - return st, err + return st, []state.LogEntry{}, err } if stcode == 0 { // Exec failed, but exitcode, be sure to get error! stcode = int(types.UNKNOWN) @@ -119,7 +111,7 @@ func runJob(job types.Job) (state.State, error) { log.Printf("- State after ExitCodeMapping: %v", stcode) outText := string(stdcombinedBuf.Bytes()) - log_state, err := evalOutput(job, strings.Split(outText, "\n")) + log_state, logLines, err := evalOutput(job, strings.Split(outText, "\n")) if int(log_state) > stcode { stcode = int(log_state) } @@ -130,7 +122,7 @@ func runJob(job types.Job) (state.State, error) { st.LastState = stcode st.LastRun = time.Now().Format(time.RFC3339) - return st, err + return st, logLines, err } func setup() (*types.Job, error) { @@ -172,6 +164,7 @@ func setup() (*types.Job, error) { job.InstId = job_instance return job, nil } else { + log.Printf("! Error loading job : %v\n", err) return nil, err } } @@ -182,26 +175,50 @@ func main() { if job == nil && err == nil { fmt.Println("<<>>") state.ShowAll() + fmt.Println("<<>>") + state.ShowAllLogs() fmt.Println("<<<>>>") } else { fmt.Fprintln(os.Stderr, "Jobwatch 0.1 (C) 2022 hoess@gmx.net") + var exitCode = 0 + var logLines []state.LogEntry var res state.State - if err == nil { - res, err = runJob(*job) - } - res.ReEval() - res.SaveState() + defer func() { + os.Exit(exitCode) + }() + + lock, err := state.Lock(job.GetJobBaseFileName(), "JOB") + log.Println("Locked on job") + if err == nil { + defer func() { + log.Println("Unlocking job") + state.Unlock(lock) + }() + + if err != nil { + log.Println("Can't acquire lock, skipping") + } + + if err == nil { + res, logLines, err = runJob(*job) + res.ReEval() + res.SaveState() + } + + if err == nil { + err = state.AppendLog(*job, logLines) + } + } if err != nil { fmt.Printf("! Error running job %+v\n", err) - os.Exit(int(types.UNKNOWN)) + exitCode = int(types.UNKNOWN) } else { - os.Exit(int(res.LastState)) + exitCode = int(res.LastState) } - state.WriteLog(*job) } } diff --git a/app/sample.job.yml b/app/sample.job.yml index 3ed19ea..9482983 100644 --- a/app/sample.job.yml +++ b/app/sample.job.yml @@ -4,18 +4,20 @@ args: exitcode_map: - from: 23 to: 3 - - from: -1 + - from: -1 # Map all nonzero-exit codes to 1=WARN to: 1 - - from: 0 + - from: 0 # Map exitcode 0 also to 1=WARN to: 1 log_matches: - regex: .*192.168.*.* state: 1 - - regex: "-" + - regex: ".*tty[0-9].+-.*" # 2=CRIT when sombody is logged on the console state: 2 - alt_msg: "User logged in at console (%v)" -hide_output: True -last_run_warn: + alt_msg: "%v -> a user is logged in at console" + - regex: .+ # Include All other lines as OK + state: 0 +hide_output: False # Dont't hide output, but pass through +last_run_warn: # How often is this job to be expected to be executed val: 8 unit: "h" last_run_crit: diff --git a/app/state/lock.go b/app/state/lock.go new file mode 100644 index 0000000..d900e10 --- /dev/null +++ b/app/state/lock.go @@ -0,0 +1,45 @@ +package state + +import ( + "fmt" + "log" + "os" + "path/filepath" + "time" + + "github.com/nightlyone/lockfile" +) + +const LOCk_WAIT_MILLIS = 30000 + +func Lock(jobFullName string, lockType string) (*lockfile.Lockfile, error) { + + fn := fmt.Sprintf("%v_%v", jobFullName, lockType) + lock, err := lockfile.New(filepath.Join(os.TempDir(), fn)) + if err != nil { + return nil, fmt.Errorf("Cannot init lock. reason: %v", err) + } + + var retries = 300 + for retries > 0 { + err = lock.TryLock() + if err == nil { + log.Printf("Locked via lockfile %v", lock) + return &lock, nil + } + if _, ok := err.(interface{ Temporary() bool }); ok { + retries-- + time.Sleep(time.Millisecond * 100) + } else { + retries = -1 + } + } + return nil, fmt.Errorf("Timeout acquiring lock %v/%v", jobFullName, lockType) +} + +func Unlock(lock *lockfile.Lockfile) { + if lock != nil { + log.Printf("Unlock lockfile %v", *lock) + lock.Unlock() + } +} diff --git a/app/state/log.go b/app/state/log.go new file mode 100644 index 0000000..2d9c1e7 --- /dev/null +++ b/app/state/log.go @@ -0,0 +1,188 @@ +package state + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "jobwatch/types" + "log" + "os" + "path/filepath" + "strings" +) + +type LogEntryState string + +const MAX_LOG_ENTRIES = 2500 +const MAX_LOG_SEND = 250 + +const ( + LOG_OK LogEntryState = "O" + LOG_WARN LogEntryState = "W" + LOG_CRIT LogEntryState = "C" + LOG_UNKWN LogEntryState = "U" +) + +type LogEntry struct { + Id int `json:"id"` + Date string `json:"date"` + State LogEntryState `json:"state"` + Text string `json:"text"` +} + +type LogDb struct { + JobFullName string `json:"job_full_name"` + MaxSentByRemote map[string]int `json:"max_sent_by_remote"` + Entries []LogEntry `json:"entries"` +} + +func LoadLog(logPath string) (*LogDb, error) { + + if _, err := os.Stat(logPath); err != nil { + log.Println("Existing log not readable, creating new") + return &LogDb{}, nil + } + + data, err := os.ReadFile(logPath) + if err != nil { + return nil, err + } + + var logDb LogDb + err = json.Unmarshal(data, &logDb) + if err != nil { + return nil, fmt.Errorf("Error unmarshaling state: %v", err) + } + + log.Println("existing log loaded") + return &logDb, nil +} + +func (logDb LogDb) Save(logPath string) error { + + data, err := json.MarshalIndent(logDb, " ", " ") + if err != nil { + return fmt.Errorf("Error marshaling state: %v", err) + } + + if err := os.WriteFile(logPath, data, 0600); err != nil { + return fmt.Errorf("Error writing log file %v: %v", logPath, err) + } + + log.Println("Log written") + return nil +} + +func AppendLog(job types.Job, entries []LogEntry) error { + + lock, err := Lock(job.GetJobBaseFileName(), "LOG") + if err != nil { + return err + } + defer func() { + Unlock(lock) + }() + + logDir, err := getLibDir("states") + if err != nil { + return err + } + var logPath = filepath.FromSlash(fmt.Sprintf("%v/%v.log.json", logDir, job.GetJobBaseFileName())) + + logDb, err := LoadLog(logPath) + if err != nil { + return err + } + logDb.JobFullName = job.GetJobBaseFileName() + var maxId int = 1 + if len(logDb.Entries) > 0 { + maxId = logDb.Entries[len(logDb.Entries)-1].Id + } + + for _, e := range entries { + maxId++ + e.Id = maxId + logDb.Entries = append(logDb.Entries, e) + } + + for len(logDb.Entries) > MAX_LOG_ENTRIES { + logDb.Entries = logDb.Entries[:1] + } + + return logDb.Save(logPath) +} + +func ShowAllLogs() { + dirs := findAllStateDirs() + + var remote = os.Getenv("REMOTE") + if remote == "" { + remote = "-no-remote-" + } + + for _, d := range dirs { + fsi, err := ioutil.ReadDir(d.Path) + log.Printf("Scanning path %v", d.Path) + if err != nil { + continue + } + + for _, e := range fsi { + if strings.LastIndex(e.Name(), ".log.json") < 0 { + continue + } + p := filepath.FromSlash(d.Path + "/" + e.Name()) + + jfn := strings.Replace(e.Name(), ".log.json", "", -1) + lock, err := Lock(jfn, "LOG") + if err != nil { + log.Printf("Can't lock log %v: %v, skipping!", p, err) + continue + } + defer func() { + Unlock(lock) + }() + + log.Printf("Processing log file %v", p) + logDb, err := LoadLog(p) + if err != nil { + log.Printf("Can't read log %v: %v, skipping!", p, err) + continue + } + var maxSent = logDb.MaxSentByRemote[remote] + + fmt.Printf("[[[JobWatch %v/%v]]]\n", d.UserId, logDb.JobFullName) + var sentRem = MAX_LOG_SEND + for _, l := range logDb.Entries { + if l.Id <= maxSent { + continue + } + + fmt.Printf("%v %v: %v\n", l.State, l.Date, l.Text) + maxSent = l.Id + sentRem-- + if sentRem <= 0 { + break + } + } + if logDb.MaxSentByRemote == nil { + logDb.MaxSentByRemote = map[string]int{} + } + logDb.MaxSentByRemote[remote] = maxSent + logDb.Save(p) + } + } +} + +func ToLogState(s types.CMKState) LogEntryState { + var p = LOG_UNKWN + switch s { + case types.OK: + p = LOG_OK + case types.WARN: + p = LOG_WARN + case types.CRIT: + p = LOG_CRIT + } + return p +} diff --git a/app/state/state.go b/app/state/state.go index 53698ad..a603d3d 100644 --- a/app/state/state.go +++ b/app/state/state.go @@ -45,21 +45,29 @@ type summary struct { Entries []summaryPerUser `json:"all"` } -func getStateDir() (string, error) { +func getLibDir(subDir string) (string, error) { var dir string - dir = "/var/lib/jobwatch/states" + dir = "/var/lib/jobwatch" if os.Getuid() > 0 { if od, err := os.UserHomeDir(); err == nil { - dir = od + "/.jobwatch/states" + dir = od + "/.jobwatch" } } + if len(subDir) > 0 { + dir = filepath.FromSlash(dir + "/" + subDir) + } + err := os.MkdirAll(dir, 0770) return dir, err } +func getStateDir() (string, error) { + return getLibDir("states") +} + func findAllStateDirs() []summaryPerUser { var dirs []summaryPerUser var res []summaryPerUser @@ -115,10 +123,6 @@ func LoadState(path string) (*State, error) { return &res, nil } -func WriteLog(job types.Job, line ...string) { - fmt.Println("B") -} - func (state *State) ReEval() error { lr, err := time.Parse(time.RFC3339, state.LastRun) if err != nil { @@ -184,6 +188,10 @@ func ShowAll() error { } for _, e := range fsi { + if strings.LastIndex(e.Name(), ".state.json") < 0 { + continue + } + p := filepath.FromSlash(d.Path + "/" + e.Name()) log.Printf("Processing state file %v", p) diff --git a/app/types/types.go b/app/types/types.go index 95208f3..edf35a9 100644 --- a/app/types/types.go +++ b/app/types/types.go @@ -50,7 +50,7 @@ func LoadJob(jobdir string, jobid string) (*Job, error) { dirs = append(dirs, jobdir) } else { if hd, err := os.UserHomeDir(); err == nil { - dirs = append(dirs, filepath.FromSlash(hd)+"/jobwatch.d") + dirs = append(dirs, filepath.FromSlash(hd)+"/etc/jobwatch.d") } dirs = append(dirs, "/etc/jobwatch.d") } @@ -78,8 +78,18 @@ func LoadJob(jobdir string, jobid string) (*Job, error) { if err == nil { log.Printf("%+v\n", job) } else { - return nil, fmt.Errorf("! Error reading job %v: %+v\n", jobfile, err) + s := fmt.Sprintf("! Error reading job %v: %+v\n", jobfile, err) + return nil, fmt.Errorf(s) } job.Id = jobid return &job, nil } + +func (j Job) GetJobBaseFileName() string { + if j.InstId == "" { + return filepath.FromSlash(fmt.Sprintf("%v", j.Id)) + } else { + return filepath.FromSlash(fmt.Sprintf("%v_%v", j.Id, j.InstId)) + } + +}