#!/usr/bin/env python
# Tai Sakuma <sakuma@cern.ch>
import os, sys
import argparse
import subprocess
import collections
import time
import textwrap
import getpass
import re
import logging
import alphatwirl
##__________________________________________________________________||
# https://htcondor-wiki.cs.wisc.edu/index.cgi/wiki?p=MagicNumbers
# Map from the integer JobStatus codes reported by HTCondor (e.g. by
# condor_q) to human-readable names; used to build the log summary in
# HTCondorJobSubmitter.poll().
HTCONDOR_JOBSTATUS = {
    0: "Unexpanded",
    1: "Idle",
    2: "Running",
    3: "Removed",
    4: "Completed",
    5: "Held",
    6: "Transferring_Output",
    7: "Suspended"
}
##__________________________________________________________________||
class HTCondorJobSubmitter(object):
    """Submit jobs to HTCondor and keep track of their cluster IDs.

    Jobs are submitted with ``condor_submit`` and monitored with
    ``condor_q``; outstanding and finished cluster IDs are kept in
    ``self.clusterids_outstanding`` and ``self.clusterids_finished``.
    """

    def __init__(self, job_desc_extra=None):
        """
        Parameters
        ----------
        job_desc_extra : list of str, optional
            Extra job-description lines, inserted just before the final
            ``queue`` statement of the template.
        """
        # NOTE: the default used to be a mutable list literal; use None
        # to avoid the shared-mutable-default-argument pitfall.
        self.job_desc_template = textwrap.dedent("""
        Executable = {job_script}
        output = {out}
        error = {error}
        log = {log}
        {args}
        should_transfer_files = YES
        when_to_transfer_output = ON_EXIT
        transfer_input_files = {input_files}
        transfer_output_files = {output_files}
        Universe = vanilla
        notification = Error
        # Initialdir = {initialdir}
        getenv = True
        queue 1
        """).strip()
        if job_desc_extra:
            # splice the extra lines in just before the trailing 'queue 1'
            job_desc_list = self.job_desc_template.split('\n')
            job_desc_list[-1:-1] = job_desc_extra
            self.job_desc_template = '\n'.join(job_desc_list)

        self.clusterids_outstanding = [ ]  # jobs believed to still be in the queue
        self.clusterids_finished = [ ]     # jobs that have left the queue

    def run(self, workingArea, package_index):
        """submit one job for the package and return its cluster ID

        Parameters
        ----------
        workingArea : working area object with ``path`` and ``package_path()``
        package_index : int

        Returns
        -------
        str
            the HTCondor cluster ID of the submitted job
        """
        cwd = os.getcwd()
        os.chdir(workingArea.path)
        try:
            package_path = workingArea.package_path(package_index)

            # strip two extensions, e.g. 'task_00.p.gz' -> 'task_00'
            resultdir_basename = os.path.splitext(package_path)[0]
            resultdir_basename = os.path.splitext(resultdir_basename)[0]
            resultdir = os.path.join('results', resultdir_basename)
            alphatwirl.mkdir_p(resultdir)

            # only transfer files that actually exist
            input_files = [package_path, 'python_modules.tar.gz']
            input_files = [f for f in input_files if os.path.exists(f)]

            job_desc = self.job_desc_template.format(
                job_script='run.py',
                out=os.path.join(resultdir, 'stdout.txt'),
                error=os.path.join(resultdir, 'stderr.txt'),
                log=os.path.join(resultdir, 'log.txt'),
                args='Arguments = {}'.format(package_path),
                input_files=', '.join(input_files),
                output_files='results',
                initialdir='to be determined',
            )

            procargs = [
                '/usr/bin/condor_submit',
                '-append', 'accounting_group=group_physics.hep',
                '-append', 'accounting_group_user={}'.format(getpass.getuser()),
            ]
            proc = subprocess.Popen(
                procargs,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                # text mode so communicate() accepts/returns str on Python 3 as well
                universal_newlines=True
            )
            stdout, stderr = proc.communicate(job_desc)

            # raw string: '\d' in a plain literal is an invalid escape sequence
            regex = re.compile(r"submitted to cluster (\d*)", re.MULTILINE)
            clusterid = regex.search(stdout).group(1)
            self.clusterids_outstanding.append(clusterid)
            change_job_priority([clusterid], 10)  ## need to make configurable
        finally:
            # restore the working directory even if submission fails
            os.chdir(cwd)
        return clusterid

    def poll(self):
        """check if the jobs are running and return a list of cluster IDs for
        finished jobs
        """
        clusterid_status_list = query_status_for(self.clusterids_outstanding)
        # e.g., [['1730126', 2], ['1730127', 2], ['1730129', 1], ['1730130', 1]]

        if clusterid_status_list:
            clusterids, statuses = zip(*clusterid_status_list)
        else:
            clusterids, statuses = (), ()

        # jobs no longer reported by condor_q are considered finished
        clusterids_finished = [i for i in self.clusterids_outstanding if i not in clusterids]
        self.clusterids_finished.extend(clusterids_finished)
        self.clusterids_outstanding[:] = clusterids

        # log a summary, e.g. "Running: 2, Idle: 2, Finished 3"
        counter = collections.Counter(statuses)
        messages = [ ]
        if counter:
            messages.append(', '.join(['{}: {}'.format(HTCONDOR_JOBSTATUS[k], counter[k]) for k in counter.keys()]))
        if self.clusterids_finished:
            messages.append('Finished {}'.format(len(self.clusterids_finished)))
        logger = logging.getLogger(__name__)
        logger.info(', '.join(messages))

        return clusterids_finished

    def wait(self):
        """wait until all jobs finish and return a list of cluster IDs
        """
        sleep = 5  # polling interval in seconds
        while self.clusterids_outstanding:
            self.poll()
            time.sleep(sleep)
        return self.clusterids_finished

    def failed_runids(self, runids):
        """remove failed cluster IDs from the list of finished jobs

        so that len(self.clusterids_finished) becomes the number of
        successfully finished jobs
        """
        for i in runids:
            try:
                self.clusterids_finished.remove(i)
            except ValueError:
                # not in the list; nothing to remove
                pass

    def terminate(self):
        """remove all outstanding jobs with condor_rm, in batches of 500"""
        n_at_a_time = 500
        ids_split = [self.clusterids_outstanding[i:(i + n_at_a_time)] for i in range(0, len(self.clusterids_outstanding), n_at_a_time)]
        for ids_sub in ids_split:
            try_executing_until_succeed(['condor_rm'] + ids_sub)
##__________________________________________________________________||
[docs]def try_executing_until_succeed(procargs):
sleep = 2
logger = logging.getLogger(__name__)
while True:
# logging
ellipsis = '...(({} letters))...'
nfirst = 50
nlast = 50
command_display = '{} {}'.format(procargs[0], ' '.join([repr(a) for a in procargs[1:]]))
if len(command_display) > nfirst + len(ellipsis) + nlast:
command_display = '{}...(({} letters))...{}'.format(
command_display[:nfirst],
len(command_display) - (nfirst + nlast),
command_display[-nlast:]
)
logger.debug('execute: {}'.format(command_display))
#
proc = subprocess.Popen(
procargs,
stdout = subprocess.PIPE, stderr = subprocess.PIPE
)
stdout, stderr = proc.communicate()
success = not (proc.returncode or stderr)
#
if success: break
#
if stderr: logger.warning(stderr.strip())
logger.warning('the command failed: {}. will try again in {} seconds'.format(command_display, sleep))
#
time.sleep(sleep)
if not stdout: return [ ]
return stdout.rstrip().split('\n')
##__________________________________________________________________||
def query_status_for(ids):
    """query condor_q for the given cluster IDs and return their statuses

    Returns a list of ``[clusterid, status]`` pairs, where ``status`` is
    an integer HTCondor JobStatus code,
    e.g. ``[['688244', 1], ['688245', 1], ['688246', 2]]``.
    """
    chunk_size = 500  # query at most this many IDs per condor_q call
    raw_lines = [ ]
    for start in range(0, len(ids), chunk_size):
        chunk = ids[start:(start + chunk_size)]
        procargs = ['condor_q'] + chunk + ['-format', '%-2s ', 'ClusterId', '-format', '%-2s\n', 'JobStatus']
        raw_lines.extend(try_executing_until_succeed(procargs))
    # each line looks like '688244 1 '
    ret = [ ]
    for line in raw_lines:
        fields = line.strip().split()
        ret.append([fields[0], int(fields[1])])
    return ret
##__________________________________________________________________||
def change_job_priority(ids, priority = 10):
    """set the priority of the given jobs with condor_prio

    http://research.cs.wisc.edu/htcondor/manual/v7.8/2_6Managing_Job.html#sec:job-prio
    """
    chunk_size = 500  # change at most this many IDs per condor_prio call
    for start in range(0, len(ids), chunk_size):
        chunk = ids[start:(start + chunk_size)]
        try_executing_until_succeed(['condor_prio', '-p', str(priority)] + chunk)
##__________________________________________________________________||
def sample_ids(n = -1):
    # to be deleted
    """return up to n cluster IDs currently known to condor_q
    (all of them if n is -1)
    """
    stdout = try_executing_until_succeed(
        ['condor_q', '-format', '%-2s\n', 'ClusterId']
    )
    # local renamed so it no longer shadows the function name
    ids = [l.strip() for l in stdout]
    if n == -1:
        return ids
    return ids[0:n] if len(ids) >= n else ids
##__________________________________________________________________||