monitor igrida
The snippet can be accessed without any authentication.
Authored by
CHOI Kwon-Young
Monitor jobs on igrida
Requirements
- pandas
Description
This script does the same job as oarstat
but can gather more useful
information such as the gpu model for a job or the name of the node which runs
the job. Try oarstat -f -j JOB_ID
for a complete list of information that
you can gather for a job. You can also use oarstat -p -j JOB_ID
to gather
additional properties for running jobs.
Here is an example of how to use this tool:
$ python ~/prog/monitor_igrida/monitor_igrida.py -a 8248866 -p assigned_network_address gpu_model command
Duration array_index assigned_network_address command gpu_model job_user message state
Job_Id
8248866 27:23:016 1 [igrida-abacus11.irisa.fr] [seg_gan_singleclass, 36, 361] RTX 2080 Ti kchoi R=10,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Terminated
8248867 31:03:025 2 [igrida-abacus11.irisa.fr] [seg_gan_singleclass, 36, 362] RTX 2080 Ti kchoi R=10,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Terminated
8248868 30:24:027 3 [igrida-abacus4.irisa.fr] [seg_gan_singleclass, 36, 363] Tesla P100 kchoi R=20,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Terminated
8248869 22:26:024 4 [igrida-abacus9.irisa.fr] [seg_gan_singleclass, 36, 364] Tesla V100 kchoi R=10,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Terminated
8248870 24:32:014 5 [igrida-abacus9.irisa.fr] [seg_gan_singleclass, 36, 365] Tesla V100 kchoi R=10,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Terminated
8248871 23:22:010 6 [igrida-abacus9.irisa.fr] [seg_gan_singleclass, 36, 366] Tesla V100 kchoi R=10,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Terminated
8248872 30:11:013 7 [igrida-abacus11.irisa.fr] [seg_gan_flat, 36, 361] RTX 2080 Ti kchoi R=10,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Running
8248873 26:24:044 8 [igrida-abacus4.irisa.fr] [seg_gan_flat, 36, 362] Tesla P100 kchoi R=20,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Running
8248874 26:18:022 9 [igrida-abacus9.irisa.fr] [seg_gan_flat, 36, 363] Tesla V100 kchoi R=10,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Running
8248875 25:11:042 10 [igrida-abacus11.irisa.fr] [seg_gan_flat, 36, 364] RTX 2080 Ti kchoi R=10,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Running
8248876 23:16:010 11 [igrida-abacus9.irisa.fr] [seg_gan_flat, 36, 365] Tesla V100 kchoi R=10,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Running
8248877 22:35:044 12 [igrida-abacus11.irisa.fr] [seg_gan_flat, 36, 366] RTX 2080 Ti kchoi R=10,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Running
8248878 21:36:042 13 [igrida-abacus4.irisa.fr] [seg_gan_natural, 36, 361] Tesla P100 kchoi R=20,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Running
8248879 18:35:019 14 [igrida-abacus11.irisa.fr] [seg_gan_natural, 36, 362] RTX 2080 Ti kchoi R=10,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Error
8248880 16:44:002 15 [igrida-abacus9.irisa.fr] [seg_gan_natural, 36, 363] Tesla V100 kchoi R=10,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Error
8248881 00:00:000 16 [] [seg_gan_natural, 36, 364] NaN kchoi R=10,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Hold
8248882 00:00:000 17 [] [seg_gan_natural, 36, 365] NaN kchoi R=20,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Hold
8248883 00:00:000 18 [] [seg_gan_natural, 36, 366] NaN kchoi R=10,W=80:0:0,J=B,N=singleclass,Q=gpu (Karma=0... Hold
monitor_igrida.py 4.20 KiB
"""
File: monitor_igrida.py
Author: Kwon-Young Choi
Email: kwon-young.choi@irisa.fr
Date: 2020-04-24
Description: Monitor jobs on igrida.
This script does the same job as `oarstat` but can gather more useful
information such as the gpu model for a job or the name of the node which runs
the job. Try `oarstat -f -j JOB_ID` for a complete list of information that
you can gather for a job. You can also use `oarstat -p -j JOB_ID` to gather
additional properties for running jobs.
"""
import argparse
from subprocess import run, PIPE
import shlex
import time
import json
import pandas as pd
def run_command(command):
    """Run a shell-style command line and return its stdout parsed as JSON.

    Args:
        command: Command line as a single string; it is tokenised with
            ``shlex.split`` and executed without a shell.

    Returns:
        The decoded JSON value, or an empty dict when stdout is not valid
        JSON (a notice is printed in that case).
    """
    argv = shlex.split(command)
    completed = run(argv, stdout=PIPE)
    try:
        return json.loads(completed.stdout)
    except json.decoder.JSONDecodeError:
        # Nothing decodable came back: report it and fall back to "no data".
        print(f"Empty results for `{' '.join(argv)}`")
        return {}
def get_job_array_info(job_array_id):
    """Return the parsed output of ``oarstat -J --array <id> -f``.

    Args:
        job_array_id: Identifier of the job array to query.

    Returns:
        Whatever ``run_command`` yields for that command line.
    """
    return run_command(f'oarstat -J --array {job_array_id} -f')
def get_job_info(job_array_id):
    """Return the parsed output of ``oarstat -J -j <id> -f``.

    Args:
        job_array_id: Despite its name, this value is passed to the ``-j``
            option, i.e. it is used as a single job id.

    Returns:
        Whatever ``run_command`` yields for that command line.
    """
    return run_command(f'oarstat -J -j {job_array_id} -f')
def get_user_job_info(user):
    """Return the parsed output of ``oarstat -J -f -u <user>``.

    Args:
        user: User name whose jobs are queried.

    Returns:
        Whatever ``run_command`` yields for that command line.
    """
    return run_command(f'oarstat -J -f -u {user}')
def get_job_property(job_id):
    """Collect the properties reported by ``oarstat -j <job_id> -p``.

    Only the first line of the command output is parsed. It is treated as a
    comma-separated list of ``key = 'value'`` items.

    Args:
        job_id: Identifier of the job to query.

    Returns:
        A dict mapping property names to their values with surrounding
        whitespace and single quotes removed. Empty when the command prints
        nothing.
    """
    command = shlex.split(f'oarstat -j {job_id} -p')
    output = run(command, stdout=PIPE).stdout.decode()
    lines = output.splitlines()
    if not lines:
        return {}
    data = {}
    for key_value in lines[0].split(','):
        # Split on the first '=' only, so values that themselves contain '='
        # are kept whole instead of raising on unpacking.
        key, separator, value = key_value.partition('=')
        key = key.strip()
        if not separator or not key:
            # Fragment without a "key=" part (blank line, stray text):
            # skip it rather than abort the whole listing.
            continue
        data[key] = value.strip().strip("'")
    return data
def get_Durations(job_data):
    """Add a ``'Duration'`` entry (in seconds) to every job record.

    Each record must provide integer ``'startTime'`` and ``'stopTime'``
    entries. A value of 0 is treated as "not set":

    * start and stop set: ``stopTime - startTime``;
    * start set, stop 0: elapsed time up to now;
    * start 0: duration 0, whatever the stop value is, so a record with no
      start time never yields a meaningless epoch-sized duration.

    Args:
        job_data: Dict mapping job ids to per-job dicts; modified in place.

    Returns:
        The same ``job_data`` dict.
    """
    now = int(time.time())
    for job in job_data.values():
        start = job['startTime']
        end = job['stopTime']
        if start == 0:
            duration = 0
        else:
            if end == 0:
                end = now
            duration = end - start
        job['Duration'] = duration
    return job_data
# Maps a derived property name to the function that computes it.
prop_func_map = {"Duration": get_Durations}
def print_data(job_data, properties):
    """Print the selected job properties as a table indexed by job id.

    Args:
        job_data: Dict mapping job ids to per-job dicts.
        properties: Names of the columns to show. Names that are not present
            in the data are ignored; the remaining ones are sorted.

    The ``Duration`` column (seconds) is rendered as ``HH:MM:SS`` and the
    ``command`` column is split on whitespace with its first token dropped.
    """
    if not job_data:
        # Without rows there are no columns to format or select.
        print("No jobs found.")
        return

    def format_duration(seconds):
        minutes, secs = divmod(int(seconds), 60)
        hours, minutes = divmod(minutes, 60)
        # Two-digit seconds: a width of 3 rendered e.g. "27:23:016".
        return f'{hours:02}:{minutes:02}:{secs:02}'

    df = pd.DataFrame.from_dict(job_data, orient='index')
    if 'Duration' in df.columns:
        df['Duration'] = df['Duration'].map(format_duration)
    if 'command' in df.columns:
        df['command'] = df['command'].map(lambda x: x.split()[1:])
    properties = df.columns.intersection(properties).sort_values()
    df = df[properties]
    df.index.name = "Job_Id"
    print(df)
    return
def main(args):
    """Gather job information for the requested users/arrays/jobs and print it.

    Args:
        args: Parsed command-line namespace providing the ``user``, ``array``,
            ``job`` and ``properties`` lists.
    """
    job_data = {}
    # Query order matters when the same job id is returned twice: later
    # sources overwrite earlier ones (users, then arrays, then single jobs).
    sources = (
        (get_user_job_info, args.user),
        (get_job_array_info, args.array),
        (get_job_info, args.job),
    )
    for fetch, identifiers in sources:
        for identifier in identifiers:
            job_data.update(fetch(identifier))

    # Enrich every job record with its `oarstat -p` properties.
    for job_id, info in job_data.items():
        info.update(get_job_property(job_id))

    job_data = get_Durations(job_data)

    columns = [
        "array_index",
        "state",
        "job_user",
        "Duration",
        "message",
        *args.properties,
    ]
    print_data(job_data, columns)
if __name__ == "__main__":
    # Command-line entry point: every option takes zero or more values and
    # defaults to an empty list, so any combination of -u/-j/-a may be given.
    parser = argparse.ArgumentParser(
        description="""Monitor jobs on igrida.
This scripts does the same jobs as `oarstat` but can gather more useful
information such as the gpu model for a job or the name of the node which runs
the job. Try `oarstat -f -j JOB_ID` for a complete list of information that
you can gather for a job. You can also use `oarstat -p -j JOB_ID` to gather
additional properties for running jobs.""",
        # Several properties go after a single -p: the option uses the
        # default "store" action, so a repeated -p would replace the
        # earlier list instead of extending it.
        usage="python monitor_igrida.py -j JOB_ID -a JOB_ARRAY_ID "
              "-p gpu_model command")
    parser.add_argument("-u", "--user", action='store', help="user",
                        type=str, nargs='*', default=[])
    parser.add_argument("-j", "--job", action='store', help="job id",
                        type=int, nargs='*', default=[])
    parser.add_argument("-a", "--array", action='store', help="job array id",
                        type=int, nargs='*', default=[])
    parser.add_argument("-p", "--properties", help="additional job properties",
                        action='store', type=str, nargs='*', default=[])
    args = parser.parse_args()
    main(args)
Please register or sign in to comment