wiki:SrmProbes
Last modified 4 years ago Last modified on 10/13/14 11:15:50

====================================================

SRM-probes Readme

====================================================

How to set up these probes

1.) Install nagios on your platform. For SL5 and SL6: yum install nagios.x86_64

2.) When you read this you will have installed: emi.dcache.srm-probes-1.0.0-1.noarch

This package will install the following to your system:

/usr/lib/python2.6/site-packages/gridmetrics/__init__.py
/usr/lib/python2.6/site-packages/gridmetrics/__init__.pyc
/usr/lib/python2.6/site-packages/gridmetrics/__init__.pyo
/usr/lib/python2.6/site-packages/gridmetrics/srmmetrics.py
/usr/lib/python2.6/site-packages/gridmetrics/srmmetrics.pyc
/usr/lib/python2.6/site-packages/gridmetrics/srmmetrics.pyo
/usr/share/nagios/plugins/contrib/srm/CHANGES
/usr/share/nagios/plugins/contrib/srm/README
/usr/share/nagios/plugins/contrib/srm/SRM-probe
/usr/share/nagios/plugins/contrib/srm/gridmetrics
/usr/share/nagios/plugins/contrib/srm/gridmetrics/__init__.py
/usr/share/nagios/plugins/contrib/srm/gridmetrics/__init__.pyc
/usr/share/nagios/plugins/contrib/srm/gridmetrics/__init__.pyo
/usr/share/nagios/plugins/contrib/srm/gridmetrics/srmmetrics.py
/usr/share/nagios/plugins/contrib/srm/gridmetrics/srmmetrics.pyc
/usr/share/nagios/plugins/contrib/srm/gridmetrics/srmmetrics.pyo
/usr/share/nagios/plugins/contrib/srm/setup.py
/usr/share/nagios/plugins/contrib/srm/setup.pyc
/usr/share/nagios/plugins/contrib/srm/setup.pyo

3.) Create a directory structure that suits your needs. We recommend:

   |-- SE
   |   |-- group.cfg
   |   |-- services.cfg
   |   |-- template.cfg
   |   `-- host.cfg
   |-- commands.cfg

The files contain the following:

commands.cfg:

	###############
	#
	#
	# SRM-probes related commands
	#
	#
	################################

	define command{
	       command_name  ncg_check_native
	       command_line     $ARG1$ -H $HOSTNAME$ -t $ARG2$ $ARG3$
	}

	define command{
	       command_name  ncg_check_passive
	       command_line     $USER1$/check_dummy 3 "$ARG1$"
	}

host.cfg:

	###############################################################################
	# HOST DEFINITION
	###############################################################################

	define host{
	    use         storage-element
	    host_name   <hostname>
	    alias       SL 5 dCache Storage Element
	    address     <host ip address>
	}

The placeholders - <hostname> and <host ip address> - need to be replaced by the hostname and IP address of the machine that is to be monitored. You can define as many hosts as you like.

group.cfg:

        ###############################################################################
	# HOST GROUP DEFINITION
	###############################################################################

	define hostgroup{
            hostgroup_name      dCacheNodes_dteam
            alias              	dCache Storage Elements
       	    members            	<hostname>
	}

services.cfg:

        ####################
        ## SRM Probes
        ####################
        define service{
            use                   generic-service
            hostgroup_name        dCacheNodes_dteam
            service_description   org.sam.SRM-All-testers.eu-emi.eu
            check_command         ncg_check_native!/usr/share/nagios/plugins/contrib/srm/SRM-probe!600!--vo <VO name> --ldap-uri <BDII server url>:2170 --pass-check-dest nagcmd --nagcmdfile /usr/local/nagios/var/rw/nagios.cmd -x /tmp/x509up_<userid> --work-dir <working directory>
            normal_check_interval 8
            retry_check_interval  4
            max_check_attempts    4
        }

        define service {
        use                    generic-service
        hostgroup_name         dCacheNodes_dteam
        service_description    org.sam.SRM-Put-dteam
        check_command          ncg_check_passive!This metric is part of the org.sam.SRM-All bundle and cannot be executed independently.
        active_checks_enabled  0
        passive_checks_enabled 1
        _vo                    dteam
        _vo_fqan               dteam
        _service_flavour       dCache
        _server                <hostname>
       _metric_name           org.sam.SRM-Put
}


define service {
        use                    generic-service
        hostgroup_name         dCacheNodes_dteam
        service_description    org.sam.SRM-GetSURLs-dteam
        check_command          ncg_check_passive!This metric is part of the org.sam.SRM-All bundle and cannot be executed independently.
        active_checks_enabled  0
        passive_checks_enabled 1
        _vo                    dteam
        _vo_fqan               dteam
        _service_flavour       dCache
        _server                <hostname>
        _metric_name           org.sam.SRM-GetSURLs
}


define service {
        use                    generic-service
        hostgroup_name         dCacheNodes_dteam
        service_description    org.sam.SRM-LsDir-dteam
        check_command          ncg_check_passive!This metric is part of the org.sam.SRM-All bundle and cannot be executed independently.
        active_checks_enabled  0
        passive_checks_enabled 1
        _vo                    dteam
        _vo_fqan               dteam
        _service_flavour       dCache
        _server                <hostname>
        _metric_name           org.sam.SRM-LsDir
}

define service {
        use                    generic-service
        hostgroup_name         dCacheNodes_dteam
        service_description    org.sam.SRM-Ls-dteam
        check_command          ncg_check_passive!This metric is part of the org.sam.SRM-All bundle and cannot be executed independently.
        active_checks_enabled  0
        passive_checks_enabled 1
        _vo                    dteam
        _vo_fqan               dteam
        _service_flavour       dCache
        _server                <hostname>
        _metric_name           org.sam.SRM-Ls
}

define service {
        use                    generic-service
        hostgroup_name         dCacheNodes_dteam
        service_description    org.sam.SRM-GetTURLs-dteam
        check_command          ncg_check_passive!This metric is part of the org.sam.SRM-All bundle and cannot be executed independently.
        active_checks_enabled  0
        passive_checks_enabled 1
        _vo                    dteam
        _vo_fqan               dteam
        _service_flavour       dCache
        _server                <hostname>
        _metric_name           org.sam.SRM-GetTURLs
}

define service {
        use                    generic-service
        hostgroup_name         dCacheNodes_dteam
        service_description    org.sam.SRM-Get-dteam
        check_command          ncg_check_passive!This metric is part of the org.sam.SRM-All bundle and cannot be executed independently.
        active_checks_enabled  0
        passive_checks_enabled 1
        _vo                    dteam
        _vo_fqan               dteam
        _service_flavour       dCache
        _server                <hostname>
        _metric_name           org.sam.SRM-Get
}

define service {
        use                    generic-service
        hostgroup_name         dCacheNodes_dteam
        service_description    org.sam.SRM-Del-dteam
        check_command          ncg_check_passive!This metric is part of the org.sam.SRM-All bundle and cannot be executed independently.
        active_checks_enabled  0
        passive_checks_enabled 1
        _vo                    dteam
        _vo_fqan               dteam
        _service_flavour       dCache
        _server                <hostname>
        _metric_name           org.sam.SRM-Del
        }

The following placeholders have to be replaced with their respective values: <VO name>, <BDII server url>, <userid>, <working directory>

template.cfg:

    #########################################################################################
    # Storage Element Template Definition
    #########################################################################################

    define host{
         name                       storage-element
         use                        generic-host
         check_period               24x7
         check_interval             5
         retry_interval             1
         max_check_attempts         10
         check_command              check-host-alive
         notification_period        24x7
         notification_interval      30
         notification_options       d,r
         contact_groups             admins        
         register                   0
    }

4.) Add the path of the created directory structure to /etc/nagios/nagios.cfg, for example:

	    cfg_dir=/etc/nagios/objects/grid

5.) You will also need a VOMS proxy for the VO you are using. Install the voms-clients on your platform, then create a proxy (e.g. with voms-proxy-init --voms <VO name>) and make sure the path given after "-x" in the services.cfg file ("-x /tmp/x509up_<userid>") points to that proxy file.

6.) Now restart nagios. SL5 and SL6: service nagios restart