From d4c130360bce85d4fbf749cb6f529917f211ff3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20H=C3=B6=C3=9F?= Date: Wed, 23 Feb 2022 00:29:40 +0100 Subject: [PATCH 1/3] Added log/logwatch-features. Locking. Other features --- README.md | 18 +++-- app/go.mod | 5 +- app/go.sum | 2 + app/main.go | 77 +++++++++++-------- app/sample.job.yml | 6 +- app/state/lock.go | 45 +++++++++++ app/state/log.go | 188 +++++++++++++++++++++++++++++++++++++++++++++ app/state/state.go | 22 ++++-- app/types/types.go | 12 ++- 9 files changed, 326 insertions(+), 49 deletions(-) create mode 100644 app/state/lock.go create mode 100644 app/state/log.go diff --git a/README.md b/README.md index 3d5ada4..db12b0d 100644 --- a/README.md +++ b/README.md @@ -11,22 +11,24 @@ But if you want to ensure the script really run, in the required interval, compl without error, you have to add a lot of plumping code. jobwatch helps to migitate this problem. It wraps the execution of your script, and does -all the rest for you, by providing output, which can be fed to CheckMK then -Agent-Plugin-Mechanism. +all the rest for you, by providing output, which can be fed to CheckMK by the +Agent-Plugin-mechanism. jobwatch -- checks for required exit-codes -- searches through the output of your script via regex to classify certain - keywords as WARN or CRIT -- let you define the required run-interval +- checks for required _exit-codes_, and maps them to _OK, WARN, CRIT, UNKN (0-3)_ +- _searches_ through the _console-output_ of your script via _regex_ to classify certain + keywords as OK, WARN or CRIT (0, 1, 2) +- sends previous regex matches to _logwatch_ +- prevents running your job in multiple instances (previous job took longer than expected) +- let you define the required run-intervals -Simply add a .job-file into /etc/jobwatch.d where you defined all of this, +Simply add a .job-file into /etc/jobwatch.d where you defines all of this, call you script via jobwatch -j job in the crontab and you are done. TBD: - document deployment - document job-file (see included sample) -- feed logoutput to CheckMK - redirect script-output to a logfile via Config +- job-timeout - more metrics (last-success-ful run ) diff --git a/app/go.mod b/app/go.mod index 633de7c..c71474c 100644 --- a/app/go.mod +++ b/app/go.mod @@ -2,4 +2,7 @@ module jobwatch go 1.18 -require gopkg.in/yaml.v2 v2.4.0 +require ( + github.com/nightlyone/lockfile v1.0.0 + gopkg.in/yaml.v2 v2.4.0 +) diff --git a/app/go.sum b/app/go.sum index 7534661..9f40921 100644 --- a/app/go.sum +++ b/app/go.sum @@ -1,3 +1,5 @@ +github.com/nightlyone/lockfile v1.0.0 h1:RHep2cFKK4PonZJDdEl4GmkabuhbsRMgk/k3uAmxBiA= +github.com/nightlyone/lockfile v1.0.0/go.mod h1:rywoIealpdNse2r832aiD9jRk8ErCatROs6LzC841CI= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= diff --git a/app/main.go b/app/main.go index a8e358a..a95dbc3 100644 --- a/app/main.go +++ b/app/main.go @@ -21,15 +21,15 @@ import ( // Jobs root-idfile -func evalOutput(job types.Job, output []string) (types.CMKState, error) { +func evalOutput(job types.Job, output []string) (types.CMKState, []state.LogEntry, error) { var res types.CMKState = types.OK - var logLines []string + var logLines []state.LogEntry var rcs []*regexp.Regexp for _, m := range job.LogMatches { r, err := regexp.Compile(m.Regex) if err != nil { - return 0, err + return 0, logLines, err } else { rcs = append(rcs, r) } @@ -47,20 +47,12 @@ func evalOutput(job types.Job, output []string) (types.CMKState, error) { v = fmt.Sprintf(m.AltMsg, v) } - var p = "" - switch s := m.State; s { - case types.OK: - p = "O" - case types.WARN: - p = "W" - case types.CRIT: - p = "C" - case types.UNKNOWN: - p = "U" - } - p += ": " - logLines = append(logLines, time.Now().Format("2006-01-2 15:04:05")+" "+p+v) - + var le state.LogEntry + le.State = state.ToLogState(m.State) + le.Date = time.Now().Format("2006-01-2 15:04:05") + le.Text = v + logLines = append(logLines, le) + break } } } @@ -68,10 +60,10 @@ func evalOutput(job types.Job, output []string) (types.CMKState, error) { log.Printf("- Matched LogLine %v\n", oln) } - return res, nil + return res, logLines, nil } -func runJob(job types.Job) (state.State, error) { +func runJob(job types.Job) (state.State, []state.LogEntry, error) { st := state.StateFromJob(job) st.LastExitCode = -1 @@ -96,7 +88,7 @@ func runJob(job types.Job) (state.State, error) { stcode = exitError.ProcessState.ExitCode() } else { // cmd not found etc - return st, err + return st, []state.LogEntry{}, err } if stcode == 0 { // Exec failed, but exitcode, be sure to get error! stcode = int(types.UNKNOWN) @@ -119,7 +111,7 @@ func runJob(job types.Job) (state.State, error) { log.Printf("- State after ExitCodeMapping: %v", stcode) outText := string(stdcombinedBuf.Bytes()) - log_state, err := evalOutput(job, strings.Split(outText, "\n")) + log_state, logLines, err := evalOutput(job, strings.Split(outText, "\n")) if int(log_state) > stcode { stcode = int(log_state) } @@ -130,7 +122,7 @@ func runJob(job types.Job) (state.State, error) { st.LastState = stcode st.LastRun = time.Now().Format(time.RFC3339) - return st, err + return st, logLines, err } func setup() (*types.Job, error) { @@ -172,6 +164,7 @@ func setup() (*types.Job, error) { job.InstId = job_instance return job, nil } else { + log.Printf("! Error loading job : %v\n", err) return nil, err } } @@ -182,26 +175,50 @@ func main() { if job == nil && err == nil { fmt.Println("<<>>") state.ShowAll() + fmt.Println("<<>>") + state.ShowAllLogs() fmt.Println("<<<>>>") } else { fmt.Fprintln(os.Stderr, "Jobwatch 0.1 (C) 2022 hoess@gmx.net") + var exitCode = 0 + var logLines []state.LogEntry var res state.State - if err == nil { - res, err = runJob(*job) - } - res.ReEval() - res.SaveState() + defer func() { + os.Exit(exitCode) + }() + + lock, err := state.Lock(job.GetJobBaseFileName(), "JOB") + log.Println("Locked on job") + if err == nil { + defer func() { + log.Println("Unlocking job") + state.Unlock(lock) + }() + + if err != nil { + log.Println("Can't acquire lock, skipping") + } + + if err == nil { + res, logLines, err = runJob(*job) + res.ReEval() + res.SaveState() + } + + if err == nil { + err = state.AppendLog(*job, logLines) + } + } if err != nil { fmt.Printf("! Error running job %+v\n", err) - os.Exit(int(types.UNKNOWN)) + exitCode = int(types.UNKNOWN) } else { - os.Exit(int(res.LastState)) + exitCode = int(res.LastState) } - state.WriteLog(*job) } } diff --git a/app/sample.job.yml b/app/sample.job.yml index 3ed19ea..a3ff92e 100644 --- a/app/sample.job.yml +++ b/app/sample.job.yml @@ -13,8 +13,10 @@ log_matches: state: 1 - regex: "-" state: 2 - alt_msg: "User logged in at console (%v)" -hide_output: True + alt_msg: "%v -> a user is logged in at console" + - regex: .+ + state: 0 +hide_output: False last_run_warn: val: 8 unit: "h" diff --git a/app/state/lock.go b/app/state/lock.go new file mode 100644 index 0000000..d900e10 --- /dev/null +++ b/app/state/lock.go @@ -0,0 +1,45 @@ +package state + +import ( + "fmt" + "log" + "os" + "path/filepath" + "time" + + "github.com/nightlyone/lockfile" +) + +const LOCk_WAIT_MILLIS = 30000 + +func Lock(jobFullName string, lockType string) (*lockfile.Lockfile, error) { + + fn := fmt.Sprintf("%v_%v", jobFullName, lockType) + lock, err := lockfile.New(filepath.Join(os.TempDir(), fn)) + if err != nil { + return nil, fmt.Errorf("Cannot init lock. reason: %v", err) + } + + var retries = 300 + for retries > 0 { + err = lock.TryLock() + if err == nil { + log.Printf("Locked via lockfile %v", lock) + return &lock, nil + } + if _, ok := err.(interface{ Temporary() bool }); ok { + retries-- + time.Sleep(time.Millisecond * 100) + } else { + retries = -1 + } + } + return nil, fmt.Errorf("Timeout acquiring lock %v/%v", jobFullName, lockType) +} + +func Unlock(lock *lockfile.Lockfile) { + if lock != nil { + log.Printf("Unlock lockfile %v", *lock) + lock.Unlock() + } +} diff --git a/app/state/log.go b/app/state/log.go new file mode 100644 index 0000000..2d9c1e7 --- /dev/null +++ b/app/state/log.go @@ -0,0 +1,188 @@ +package state + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "jobwatch/types" + "log" + "os" + "path/filepath" + "strings" +) + +type LogEntryState string + +const MAX_LOG_ENTRIES = 2500 +const MAX_LOG_SEND = 250 + +const ( + LOG_OK LogEntryState = "O" + LOG_WARN LogEntryState = "W" + LOG_CRIT LogEntryState = "C" + LOG_UNKWN LogEntryState = "U" +) + +type LogEntry struct { + Id int `json:"id"` + Date string `json:"date"` + State LogEntryState `json:"state"` + Text string `json:"text"` +} + +type LogDb struct { + JobFullName string `json:"job_full_name"` + MaxSentByRemote map[string]int `json:"max_sent_by_remote"` + Entries []LogEntry `json:"entries"` +} + +func LoadLog(logPath string) (*LogDb, error) { + + if _, err := os.Stat(logPath); err != nil { + log.Println("Existing log not readable, creating new") + return &LogDb{}, nil + } + + data, err := os.ReadFile(logPath) + if err != nil { + return nil, err + } + + var logDb LogDb + err = json.Unmarshal(data, &logDb) + if err != nil { + return nil, fmt.Errorf("Error unmarshaling state: %v", err) + } + + log.Println("existing log loaded") + return &logDb, nil +} + +func (logDb LogDb) Save(logPath string) error { + + data, err := json.MarshalIndent(logDb, " ", " ") + if err != nil { + return fmt.Errorf("Error marshaling state: %v", err) + } + + if err := os.WriteFile(logPath, data, 0600); err != nil { + return fmt.Errorf("Error writing log file %v: %v", logPath, err) + } + + log.Println("Log written") + return nil +} + +func AppendLog(job types.Job, entries []LogEntry) error { + + lock, err := Lock(job.GetJobBaseFileName(), "LOG") + if err != nil { + return err + } + defer func() { + Unlock(lock) + }() + + logDir, err := getLibDir("states") + if err != nil { + return err + } + var logPath = filepath.FromSlash(fmt.Sprintf("%v/%v.log.json", logDir, job.GetJobBaseFileName())) + + logDb, err := LoadLog(logPath) + if err != nil { + return err + } + logDb.JobFullName = job.GetJobBaseFileName() + var maxId int = 1 + if len(logDb.Entries) > 0 { + maxId = logDb.Entries[len(logDb.Entries)-1].Id + } + + for _, e := range entries { + maxId++ + e.Id = maxId + logDb.Entries = append(logDb.Entries, e) + } + + for len(logDb.Entries) > MAX_LOG_ENTRIES { + logDb.Entries = logDb.Entries[:1] + } + + return logDb.Save(logPath) +} + +func ShowAllLogs() { + dirs := findAllStateDirs() + + var remote = os.Getenv("REMOTE") + if remote == "" { + remote = "-no-remote-" + } + + for _, d := range dirs { + fsi, err := ioutil.ReadDir(d.Path) + log.Printf("Scanning path %v", d.Path) + if err != nil { + continue + } + + for _, e := range fsi { + if strings.LastIndex(e.Name(), ".log.json") < 0 { + continue + } + p := filepath.FromSlash(d.Path + "/" + e.Name()) + + jfn := strings.Replace(e.Name(), ".log.json", "", -1) + lock, err := Lock(jfn, "LOG") + if err != nil { + log.Printf("Can't lock log %v: %v, skipping!", p, err) + continue + } + defer func() { + Unlock(lock) + }() + + log.Printf("Processing log file %v", p) + logDb, err := LoadLog(p) + if err != nil { + log.Printf("Can't read log %v: %v, skipping!", p, err) + continue + } + var maxSent = logDb.MaxSentByRemote[remote] + + fmt.Printf("[[[JobWatch %v/%v]]]\n", d.UserId, logDb.JobFullName) + var sentRem = MAX_LOG_SEND + for _, l := range logDb.Entries { + if l.Id <= maxSent { + continue + } + + fmt.Printf("%v %v: %v\n", l.State, l.Date, l.Text) + maxSent = l.Id + sentRem-- + if sentRem <= 0 { + break + } + } + if logDb.MaxSentByRemote == nil { + logDb.MaxSentByRemote = map[string]int{} + } + logDb.MaxSentByRemote[remote] = maxSent + logDb.Save(p) + } + } +} + +func ToLogState(s types.CMKState) LogEntryState { + var p = LOG_UNKWN + switch s { + case types.OK: + p = LOG_OK + case types.WARN: + p = LOG_WARN + case types.CRIT: + p = LOG_CRIT + } + return p +} diff --git a/app/state/state.go b/app/state/state.go index 53698ad..a603d3d 100644 --- a/app/state/state.go +++ b/app/state/state.go @@ -45,21 +45,29 @@ type summary struct { Entries []summaryPerUser `json:"all"` } -func getStateDir() (string, error) { +func getLibDir(subDir string) (string, error) { var dir string - dir = "/var/lib/jobwatch/states" + dir = "/var/lib/jobwatch" if os.Getuid() > 0 { if od, err := os.UserHomeDir(); err == nil { - dir = od + "/.jobwatch/states" + dir = od + "/.jobwatch" } } + if len(subDir) > 0 { + dir = filepath.FromSlash(dir + "/" + subDir) + } + err := os.MkdirAll(dir, 0770) return dir, err } +func getStateDir() (string, error) { + return getLibDir("states") +} + func findAllStateDirs() []summaryPerUser { var dirs []summaryPerUser var res []summaryPerUser @@ -115,10 +123,6 @@ func LoadState(path string) (*State, error) { return &res, nil } -func WriteLog(job types.Job, line ...string) { - fmt.Println("B") -} - func (state *State) ReEval() error { lr, err := time.Parse(time.RFC3339, state.LastRun) if err != nil { @@ -184,6 +188,10 @@ func ShowAll() error { } for _, e := range fsi { + if strings.LastIndex(e.Name(), ".state.json") < 0 { + continue + } + p := filepath.FromSlash(d.Path + "/" + e.Name()) log.Printf("Processing state file %v", p) diff --git a/app/types/types.go b/app/types/types.go index 95208f3..3ea47a4 100644 --- a/app/types/types.go +++ b/app/types/types.go @@ -78,8 +78,18 @@ func LoadJob(jobdir string, jobid string) (*Job, error) { if err == nil { log.Printf("%+v\n", job) } else { - return nil, fmt.Errorf("! Error reading job %v: %+v\n", jobfile, err) + s := fmt.Sprintf("! Error reading job %v: %+v\n", jobfile, err) + return nil, fmt.Errorf(s) } job.Id = jobid return &job, nil } + +func (j Job) GetJobBaseFileName() string { + if j.InstId == "" { + return filepath.FromSlash(fmt.Sprintf("%v", j.Id)) + } else { + return filepath.FromSlash(fmt.Sprintf("%v_%v", j.Id, j.InstId)) + } + +} From 825701e285bd7d29433c617f846851184f54c9a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20H=C3=B6=C3=9F?= Date: Wed, 23 Feb 2022 12:30:36 +0100 Subject: [PATCH 2/3] Improve README/Docs, Minor fix --- README.md | 101 +++++++++++++++++++++++++++++++++++++++++---- app/sample.job.yml | 12 +++--- app/types/types.go | 2 +- 3 files changed, 101 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index db12b0d..eb80223 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,20 @@ # Jobwatch -## (C) 2022, Michael Höß, MIT-Licensed +> (C) 2022, Michael Höß, hoess@gmx.net, MIT-Licensed -### What is jobwatch +## What is jobwatch If you just want to execute a simple shell-script via cron every couple of hour, often only a few lines of code are required. -But if you want to ensure the script really run, in the required interval, completed -without error, you have to add a lot of plumping code. +But if you want to ensure +- the script really run +- in the required interval +- completed without error/with the expected exitcode +- has the expected output +- only one instances executes in parallel + +and want all this to be monitored with your CheckMK, you have to add a lot of plumping code. jobwatch helps to migitate this problem. It wraps the execution of your script, and does all the rest for you, by providing output, which can be fed to CheckMK by the @@ -25,9 +31,90 @@ jobwatch Simply add a .job-file into /etc/jobwatch.d where you defines all of this, call you script via jobwatch -j job in the crontab and you are done. -TBD: -- document deployment -- document job-file (see included sample) +## Sample-Job `/etc/jobwatch.d/sample.job.yml`: +``` +cmd: /usr/bin/w +args: + - -i +exitcode_map: + - from: 23 # Map + to: 3 + - from: -1 # Map all nonzero-exit codes to 1=WARN + to: 1 + - from: 0 # Map exitcode 0 also to 1=WARN + to: 1 +log_matches: + - regex: .*192.168.*.* + state: 1 + - regex: ".*tty[0-9].+-.*" # 2=CRIT when sombody is logged on the console + state: 2 + alt_msg: "%v -> a user is logged in at console" + - regex: .+ # Include All other lines as OK + state: 0 +hide_output: False # Dont't hide output, but pass through +last_run_warn: # How often is this job to be expected to be executed + val: 8 + unit: "h" +last_run_crit: + val: 16 + ``` + +> Note: job-names may only consists of chars `a-z`,`0-9` and `- ` + +The config-config dir is When run as +- root: `/etc/jobwatch.d` +- user `$HOME/etc/jobwatch.d` + +## Deploying jobwatch +- Compile the go program, if required, the src-dir: `go get .;go build .` +- `cp jobwatch /usr/loca/bin` +- `ln -s /usr/local/bin /usr/lib/check_agent/plugins` + +On the check-mk server just deploy the provede .mkp, no further +config there is currently needed + +## Invoking Jobwatch + +``` +jobwatch -h +Usage of jobwatch: + -d Debug + -i string + JobId Instance. Default '' + -j string + JobId. reads $jobDir/$job.job.yml Default '' + -jd string + JobDir. Default: $HOME/etc/jobwatch.d /etc/jobwatch.d' + -- After this parameter every further parameter is passed to the + called program + +Without any parameters CheckMK-Agent-Output is generated +``` + +Example1 from an crontab +``` +10 9 * * 0 root /usr/local/bin/jobwatch -j bu-dupl +``` + +Example2 from an crontab, here we use the same job for +different instances. (In this e.g. we have backup-sets) +``` +10 5 * * 0,3 root /usr/local/bin/jobwatch -j bu-borg -i b01-main -- /opt/borg/B01_main.borgjob +10 5 * * 1,4 root /usr/local/bin/jobwatch -j bu-borg -i b10-dvp -- /opt/borg/B10_dvp.borgjob +``` + +### Instances/Multiuser + +See "Invoking jobwatch above" + +> Note instance-names have the same restrictions as job-names + +When using instances, the final job name in the output is `[userid]/[job-name]_[instance-name]`, otherwise only `[userid]/[job-name]` + + + + +## TBD: - redirect script-output to a logfile via Config - job-timeout - more metrics (last-success-ful run ) diff --git a/app/sample.job.yml b/app/sample.job.yml index a3ff92e..9482983 100644 --- a/app/sample.job.yml +++ b/app/sample.job.yml @@ -4,20 +4,20 @@ args: exitcode_map: - from: 23 to: 3 - - from: -1 + - from: -1 # Map all nonzero-exit codes to 1=WARN to: 1 - - from: 0 + - from: 0 # Map exitcode 0 also to 1=WARN to: 1 log_matches: - regex: .*192.168.*.* state: 1 - - regex: "-" + - regex: ".*tty[0-9].+-.*" # 2=CRIT when sombody is logged on the console state: 2 alt_msg: "%v -> a user is logged in at console" - - regex: .+ + - regex: .+ # Include All other lines as OK state: 0 -hide_output: False -last_run_warn: +hide_output: False # Dont't hide output, but pass through +last_run_warn: # How often is this job to be expected to be executed val: 8 unit: "h" last_run_crit: diff --git a/app/types/types.go b/app/types/types.go index 3ea47a4..edf35a9 100644 --- a/app/types/types.go +++ b/app/types/types.go @@ -50,7 +50,7 @@ func LoadJob(jobdir string, jobid string) (*Job, error) { dirs = append(dirs, jobdir) } else { if hd, err := os.UserHomeDir(); err == nil { - dirs = append(dirs, filepath.FromSlash(hd)+"/jobwatch.d") + dirs = append(dirs, filepath.FromSlash(hd)+"/etc/jobwatch.d") } dirs = append(dirs, "/etc/jobwatch.d") } From 778fda7aea63058558664b3d707b66cc141229b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20H=C3=B6=C3=9F?= Date: Wed, 23 Feb 2022 14:58:40 +0100 Subject: [PATCH 3/3] README fixes --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index eb80223..1a46df9 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ But if you want to ensure - has the expected output - only one instances executes in parallel -and want all this to be monitored with your CheckMK, you have to add a lot of plumping code. +and want all this to be monitored with your CheckMK, you have to add a lot of plumbing code. jobwatch helps to migitate this problem. It wraps the execution of your script, and does all the rest for you, by providing output, which can be fed to CheckMK by the @@ -67,8 +67,8 @@ The config-config dir is When run as ## Deploying jobwatch - Compile the go program, if required, the src-dir: `go get .;go build .` -- `cp jobwatch /usr/loca/bin` -- `ln -s /usr/local/bin /usr/lib/check_agent/plugins` +- `cp jobwatch /usr/local/bin` +- `ln -s /usr/local/bin/jobwatch /usr/lib/check_agent/plugins` On the check-mk server just deploy the provede .mkp, no further config there is currently needed