Commit 40a001dc by Duchaj János

Added GPU usage monitoring and all disk I/O read/write monitoring.

parent 951b1c5c
pika==1.2.0 pika==1.2.0
psutil==2.1.1 psutil==2.1.1
pynvml==11.5.0
...@@ -9,6 +9,8 @@ import pika ...@@ -9,6 +9,8 @@ import pika
import psutil import psutil
import time import time
import re import re
import subprocess
from pynvml import *
logging.basicConfig() logging.basicConfig()
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -138,6 +140,18 @@ class Client: ...@@ -138,6 +140,18 @@ class Client:
'bytes_sent', 'bytes_recv'): 'bytes_sent', 'bytes_recv'):
metrics['network.%s-%s' % metrics['network.%s-%s' %
(metric, interface)] = getattr(data, metric) (metric, interface)] = getattr(data, metric)
try:
for deviceCounter in range(nvmlDeviceGetCount()):
handle = nvmlDeviceGetHandleByIndex(deviceCounter)
deviceName = nvmlDeviceGetName(handle).replace(" ", "_")
deviceMemoryInfos = nvmlDeviceGetMemoryInfo(handle)
gpu_percent = deviceMemoryInfos.used / deviceMemoryInfos.total * 100
gpu_used_bytes = deviceMemoryInfos.used
metrics['gpu.percent.%s' % deviceName] = gpu_percent
metrics['gpu.used_bytes.%s' % deviceName] = gpu_used_bytes
except NVMLError as error:
logger.error('Something went wrong with GPU Monitoring:')
logger.error('Error: %s' % error)
return ['%(host)s.%(name)s %(val)f %(time)d' % {'host': self.name, return ['%(host)s.%(name)s %(val)f %(time)d' % {'host': self.name,
'name': name, 'name': name,
...@@ -218,6 +232,36 @@ class Client: ...@@ -218,6 +232,36 @@ class Client:
return metrics return metrics
def startIOmonitor(self):
    """Spawn the background iotop pipeline and return its process handle.

    The pipeline runs ``iotop`` under ``sudo`` in batch mode for two
    iterations with a 9.5 second delay, then post-processes the text with
    ``awk``, ``cut`` and ``sort``. Standard output is captured through a
    pipe so the caller can read it once the process has finished.

    NOTE(review): the ``cut`` stages presumably leave three columns per
    line (a name followed by read and write figures) -- confirm against
    the iotop version actually deployed.

    Returns:
        The ``subprocess.Popen`` object for the running ``sh -c`` pipeline.
    """
    # The whole pipeline is handed to ``sh -c`` as a single string because
    # it relies on shell pipes between the individual tools.
    pipeline = "sudo iotop -ao -qqq -b -k --iter=2 -d9.5 | awk '{$1=$1};1' | cut -d'%' -f1 | cut -d' ' -f3,4,6 | sort"
    shell_argv = ["sh", "-c", pipeline]
    return subprocess.Popen(shell_argv, stdout=subprocess.PIPE)
def collect_node_IO(self, completedIOShell):
    """Convert the output of a finished IO monitor process into metric lines.

    Each usable output line is expected to hold three whitespace-separated
    fields: a name, a read value and a write value. Values that share the
    same name are summed. For every name two metric strings are produced,
    one for ``read`` and one for ``write``, in the format
    ``<host>.io.<read|write>.<name> <value> <timestamp>``.

    Lines that do not have at least three fields, or whose second and
    third fields are not numeric, are skipped instead of aborting the
    whole collection cycle.

    Args:
        completedIOShell: Process-like object whose ``communicate()``
            returns a ``(stdout, stderr)`` tuple with ``stdout`` as bytes.

    Returns:
        A list of formatted metric strings. It is empty when the process
        produced no usable output.
    """
    now = time.time()
    raw_out, _ = completedIOShell.communicate()
    # ``communicate()`` yields None for stdout when no pipe was attached;
    # undecodable bytes are replaced rather than raising mid-collection.
    text = (raw_out or b"").decode(errors="replace")

    # name -> [accumulated read, accumulated write]
    totals = {}
    for line in text.splitlines():
        fields = line.split()
        if len(fields) < 3:
            # Blank or truncated line: nothing to account for.
            continue
        try:
            read_val = float(fields[1])
            write_val = float(fields[2])
        except ValueError:
            # Non-numeric columns (e.g. a stray message from the pipeline).
            continue
        bucket = totals.setdefault(fields[0], [0.0, 0.0])
        bucket[0] += read_val
        bucket[1] += write_val

    metrics = []
    for name, (read_total, write_total) in totals.items():
        # Pair each value with its label explicitly; read is emitted first.
        for rw, value in (("read", read_total), ("write", write_total)):
            metrics.append('%(host)s.io.%(rw)s.%(name)s %(val)f %(time)d' % {
                'host': self.name,
                'name': name,
                'rw': rw,
                'val': value,
                'time': now})
    return metrics
@staticmethod @staticmethod
def _chunker(seq, size): def _chunker(seq, size):
"""Yield seq in size-long chunks. """Yield seq in size-long chunks.
...@@ -233,9 +277,14 @@ class Client: ...@@ -233,9 +277,14 @@ class Client:
""" """
self.connect() self.connect()
self.processes = {} self.processes = {}
nvmlInit()
try: try:
runningIOshell = self.startIOmonitor()
while True: while True:
metrics = self.collect_node() + self.collect_vms() metrics = self.collect_node() + self.collect_vms()
if runningIOshell.poll() != None:
metrics += self.collect_node_IO(runningIOshell)
runningIOshell = self.startIOmonitor()
if metrics: if metrics:
for chunk in self._chunker(metrics, 100): for chunk in self._chunker(metrics, 100):
self.send(chunk) self.send(chunk)
...@@ -244,4 +293,5 @@ class Client: ...@@ -244,4 +293,5 @@ class Client:
except KeyboardInterrupt: except KeyboardInterrupt:
logger.info("Reporting has stopped by the user. Exiting...") logger.info("Reporting has stopped by the user. Exiting...")
finally: finally:
nvmlShutdown()
self.disconnect() self.disconnect()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment