Source code for threeML.parallel.parallel_client

import math
import shutil
import signal
import subprocess
import sys
import time
import warnings
from contextlib import contextmanager
from pathlib import Path
from typing import Optional

from threeML.config.config import threeML_config
from threeML.io.logging import setup_logger
from threeML.utils.progress_bar import tqdm


log = setup_logger(__name__)

try:
    from subprocess import DEVNULL  # py3k
except ImportError:
    import os

    DEVNULL = open(os.devnull, "wb")

# Check whether we have a parallel system or not

has_parallel = False

try:

    from ipyparallel import Client

except ImportError:

    has_parallel = False

else:

    has_parallel = True


def get_base_prefix_compat() -> str:
    """Get base/real prefix, or sys.prefix if there is none."""
    return (
        getattr(sys, "base_prefix", None)
        or getattr(sys, "real_prefix", None)
        or sys.prefix
    )

def in_virtualenv() -> bool:
    return get_base_prefix_compat() != sys.prefix
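
# Illustrative sketch of the two helpers above (not part of this module):
# inside a virtualenv, sys.prefix points at the environment while the
# base/real prefix points at the interpreter it was created from, so the
# two differ.
#
#   >>> in_virtualenv()   # system Python
#   False
#   >>> in_virtualenv()   # inside an activated virtualenv
#   True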

# Custom warning
class NoParallelEnvironment(UserWarning):
    pass

# Set up the warnings module to always display our custom warning
# (otherwise it would only be displayed once)
warnings.simplefilter("always", NoParallelEnvironment)
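
# Illustrative sketch (not part of this module): with the "always" filter
# installed above, repeated warnings of this category are all displayed,
# instead of only the first occurrence.
#
#   warnings.warn("no parallel environment", NoParallelEnvironment)
#   warnings.warn("no parallel environment", NoParallelEnvironment)  # shown again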

@contextmanager
def parallel_computation(
    profile: Optional[str] = None,
    start_cluster: bool = True,
    n_jobs: Optional[int] = None,
) -> None:
    """
    A context manager which turns on parallel execution temporarily

    :param profile: the profile to use, if different from the default
    :param start_cluster: True or False. Whether to start a new cluster. If False, try to use an
        existing one for the same profile
    :param n_jobs: number of engines to start (None to use the ipcluster default)
    :return:
    """

    # Memorize the state of the use-parallel config

    old_state = bool(threeML_config.parallel.use_parallel)
    old_profile = str(threeML_config.parallel.profile_name)

    # Set the use_parallel feature on, if available

    if has_parallel:

        threeML_config.parallel.use_parallel = True

    else:

        # No parallel environment available. Issue a warning and continue with serial computation

        log.warning(
            "You requested parallel computation, but no parallel environment is available. You need "
            "to install the ipyparallel package. Continuing with serial computation...",
        )

        threeML_config.parallel.use_parallel = False

    # Now use the specified profile (if any), otherwise the default one

    if profile is not None:

        threeML_config.parallel.profile_name = str(profile)

    # Here is where the content of the "with parallel_computation" statement gets
    # executed

    # See if we need to start the ipyparallel cluster first

    if start_cluster:

        # Put the command line together.
        # First find the path of the ipcluster executable:
        # if we are in a virtual env, prefer the one installed there

        if in_virtualenv():

            ipcluster_path = Path(sys.prefix) / "bin" / "ipcluster"

            if not ipcluster_path.exists():

                log.warning(f"you are using the virtualenv {sys.prefix}")
                log.warning("but no ipcluster executable was found!")

                ipcluster_path = shutil.which("ipcluster")

                log.warning(f"using {ipcluster_path} instead")

        else:

            ipcluster_path = shutil.which("ipcluster")

        cmd_line = [str(ipcluster_path), "start"]

        if profile is not None:

            cmd_line.append(f"--profile={profile}")

        if n_jobs is not None:

            # Pass the flag and its value as separate arguments, so that
            # ipcluster parses them correctly
            cmd_line.extend(["-n", str(n_jobs)])

        # Start process asynchronously with Popen, suppressing all output

        log.info("Starting ipyparallel cluster with this command line:")
        log.info(" ".join(cmd_line))

        ipycluster_process = subprocess.Popen(
            cmd_line, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT
        )

        rc = Client(profile=profile)

        # Wait for the engines to become available

        while True:

            try:

                view = rc[:]

            except Exception:

                log.info("waiting on cluster to start")

                time.sleep(0.5)

                continue

            else:

                log.info(f"{len(view)} engines are active")

                break

        # Do whatever we need to do

        try:

            yield

        finally:

            # This gets executed in any case, even if there is an exception

            log.info("\nShutting down ipcluster...")

            ipycluster_process.send_signal(signal.SIGINT)
            ipycluster_process.wait()

    else:

        # Using an already started cluster

        yield

    # Revert back

    threeML_config.parallel.use_parallel = old_state
    threeML_config.parallel.profile_name = old_profile
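
# A minimal usage sketch for the context manager above (illustrative only,
# not part of this module; the analysis code inside the block is
# hypothetical). Any 3ML computation run inside the "with" block uses the
# ipyparallel engines, and the previous configuration is restored on exit.
#
#   from threeML.parallel.parallel_client import parallel_computation
#
#   with parallel_computation(profile="default", n_jobs=4):
#       ...  # run a 3ML analysis here; it executes on the cluster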

def is_parallel_computation_active() -> bool:

    return bool(threeML_config.parallel.use_parallel)

if has_parallel:

    class ParallelClient(Client):
        def __init__(self, *args, **kwargs) -> None:
            """
            Wrapper around the IPython Client class, which forces the use of dill for object serialization

            :param args: same as IPython Client
            :param kwargs: same as IPython Client
            :return:
            """

            # Just a wrapper around the IPython Client class
            # forcing the use of dill for object serialization
            # (more robust, and allows for serialization of class
            # methods)

            if "profile" not in kwargs.keys():

                kwargs["profile"] = threeML_config.parallel.profile_name

            super(ParallelClient, self).__init__(*args, **kwargs)

            # This will propagate the use_dill to all running
            # engines
            _ = self.direct_view().use_dill()

        def get_number_of_engines(self):

            return len(self.direct_view())

        def _interactive_map(
            self, worker, items_to_process, ordered=True, chunk_size=None
        ):
            """
            Subdivide the work among the active engines, taking care of dividing it among them

            :param worker: the function to be applied
            :param items_to_process: the items to apply the function to
            :param ordered: whether to keep the order of the output (default: True). Using False
                can be much faster, but you need a way to re-establish the order after the fact,
                if you care about it.
            :param chunk_size: how many items an engine should process before reporting back.
                Use None for an automatic choice.
            :return: an AsyncResult object
            """

            # Split the work evenly between the engines

            n_total_engines = self.get_number_of_engines()

            n_items = len(items_to_process)

            # Get a load-balanced view with the appropriate number of engines

            if n_items < n_total_engines:

                log.warning("More engines than items to process")

                # Limit the view to the needed engines

                lview = self.load_balanced_view(range(n_items))

                n_active_engines = n_items

                chunk_size = 1

            else:

                # Use all engines

                lview = self.load_balanced_view()

                n_active_engines = n_total_engines

                if chunk_size is None:

                    chunk_size = int(
                        math.ceil(n_items / float(n_active_engines) / 20)
                    )

            # We need this to keep the instance alive

            self._current_amr = lview.imap(
                worker,
                items_to_process,
                # chunksize=chunk_size,
                ordered=ordered,
            )

            return self._current_amr

        def execute_with_progress_bar(
            self, worker, items, chunk_size=None, name="progress"
        ):

            # Let's make a wrapper which will allow us to recover the order

            def wrapper(x):

                (id, item) = x

                return (id, worker(item))

            items_wrapped = [(i, item) for i, item in enumerate(items)]

            amr = self._interactive_map(
                wrapper, items_wrapped, ordered=False, chunk_size=chunk_size
            )

            results = []

            for res in tqdm(amr, desc=name):

                results.append(res)

            # Reorder the list according to the id

            return list(
                map(lambda x: x[1], sorted(results, key=lambda x: x[0]))
            )

else:

    # NO parallel environment available. Make a dumb object to avoid import problems, but this
    # object will never really be used, because the context manager will not activate the
    # parallel mode (see above)

    class ParallelClient(object):
        def __init__(self, *args, **kwargs):

            raise RuntimeError(
                "No parallel environment and attempted to use the ParallelClient class, which should "
                "never happen. Please open an issue at https://github.com/giacomov/3ML/issues"
            )
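
# A minimal usage sketch for ParallelClient (illustrative only, not part of
# this module), assuming ipyparallel is installed and a cluster is already
# running for the active profile. The worker function is hypothetical.
# Results are returned in the input order even though the engines process
# items out of order.
#
#   def square(x):
#       return x * x
#
#   client = ParallelClient()
#   print(client.get_number_of_engines())
#
#   results = client.execute_with_progress_bar(
#       square, list(range(100)), name="squares"
#   )
#   # results == [0, 1, 4, 9, ...]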