Use case: Prometheus alerting
=============================

- The Prometheus server is the core component of the Prometheus stack. It monitors your services by
  scraping them for metrics, and it lets you define alerting rules for those metrics: "if this metric
  looks like something is wrong -> raise an alert".

- However, Prometheus only **raises** alerts. It does not filter or transport them. The Prometheus
  authors wisely kept this modular and separated those concerns into another project:
  [prom-alertmanager](https://prometheus.io/docs/alerting/alertmanager/).

- Our lambda-alertmanager is a simple replacement for prom-alertmanager that runs entirely on AWS.


Configure Prometheus to send alarms to lambda-alertmanager
----------------------------------------------------------

Edit `prometheus.conf`:

```
global:
  ... snipped

# most verbose way of specifying 'https://REDACTED.execute-api.us-east-1.amazonaws.com/prod/prometheus-alertmanager'
# Prometheus will do an HTTP POST to /prod/prometheus-alertmanager/api/v1/alerts
alerting:
  alertmanagers:
  - scheme: 'https'
    path_prefix: '/prod/prometheus-alertmanager'
    static_configs:
    - targets:
      - 'REDACTED.execute-api.us-east-1.amazonaws.com'

scrape_configs:
  ... snipped
```
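
For reference, the body of that POST is a JSON array of alerts in Alertmanager's v1 API format. A single
firing alert would look roughly like this (the timestamp and generator URL here are illustrative, not
captured from a real setup):

```
[
  {
    "labels": {
      "alertname": "dummy_service_down",
      "job": "prometheus-dummy-service"
    },
    "annotations": {},
    "startsAt": "2017-01-01T12:00:00Z",
    "generatorURL": "http://your-prometheus/graph..."
  }
]
```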


Have a Prometheus-enabled service you want to monitor/graph
-----------------------------------------------------------

In our example we have a service `http://prometheus-dummy-service`.
Its Prometheus-scrapable metrics live at `http://prometheus-dummy-service/metrics`.

The response looks like this:

```
# this is a fictional value
fictional_healthmeter 100

```

Prometheus metrics can have a much [richer data structure](https://prometheus.io/docs/concepts/data_model/)
than this, but this is the simplest possible example.
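
To make the example concrete, here's a minimal sketch of a service exposing that metric. The choice of
Python and the official `prometheus_client` library is purely an assumption for illustration - the real
service could be implemented in anything that can serve this text format:

```
# Sketch only: assumes Python with the prometheus_client library installed
# (pip install prometheus_client).
import time

from prometheus_client import Gauge, start_http_server

# a Gauge is a metric whose value can go up and down arbitrarily
fictional_healthmeter = Gauge(
    'fictional_healthmeter',
    'Fictional health of the dummy service (100 = everything OK)')

if __name__ == '__main__':
    start_http_server(8000)  # serves GET /metrics on port 8000
    fictional_healthmeter.set(100)
    while True:
        time.sleep(60)  # keep the process alive so Prometheus can keep scraping
```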

Prometheus [autodiscovers](https://prometheus.io/docs/operating/configuration/) our services,
and will scrape those metrics automatically.
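
If you're not using autodiscovery, the equivalent static scrape configuration would look something like
this (the job name matches our example; the port is hypothetical):

```
scrape_configs:
  - job_name: 'prometheus-dummy-service'
    static_configs:
      - targets:
        - 'prometheus-dummy-service:80'
```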

Now we can graph that metric inside Prometheus:

The metric is reporting a constant `100`, which in our fictional case means everything is OK.


Configure an alert in Prometheus
--------------------------------

We'll decide that the metric `fictional_healthmeter` signals an error if it dips below `50`.
Add this to Prometheus' alerting rules:

```
ALERT dummy_service_down
  IF fictional_healthmeter{job="prometheus-dummy-service"} < 50
```
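
In practice you'll usually also want the condition to hold for some time before firing, plus a
human-readable summary for your notifications. In the same rule syntax, that could look like this (the
`FOR` duration and annotation text are just examples):

```
ALERT dummy_service_down
  IF fictional_healthmeter{job="prometheus-dummy-service"} < 50
  FOR 5m
  ANNOTATIONS {
    summary = "prometheus-dummy-service healthmeter has been below 50 for 5 minutes"
  }
```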

Now, when that happens (`fictional_healthmeter` dips to `20`):

Prometheus will submit this alert to lambda-alertmanager - you'll get a notification via your configured transports:


Why did we replace Prometheus' AlertManager?
--------------------------------------------

- Prometheus' AlertManager would have to run on your own infrastructure - more stuff for you to operate and worry about.

- Reliability. If AlertManager goes down, you are not going to be alerted. AlertManager is in a sense
  the most critical part of your infrastructure, as you have to trust it to work when shit hits the fan.
  You don't want your customers to call you because you yourself don't know that your servers are down.
  I.e. if monitoring goes down, who monitors the monitoring? I have great confidence in letting all of this
  run on AWS' well-managed environment.


But what if Prometheus goes down?
---------------------------------

Okay, we've established that lambda-alertmanager is in charge of reliably delivering alerts. But since
Prometheus is the one that **raises** those alerts, what if Prometheus itself goes down, so there's nobody
to alert us that monitoring is down?

For this case I advise you to have AlertManager-Canary monitor your Prometheus. Just configure an HTTP check
in Canary to alert you if Prometheus goes down. That way, as long as AWS stays up, you'll be notified even
if your entire cluster dies at the exact same moment.