# microproduct/surfaceRoughness_oh2004/dist/distributed/distributed.yaml

distributed:
  version: 2

  # logging:
  #   distributed: info
  #   distributed.client: warning
  #   bokeh: critical
  #   # http://stackoverflow.com/questions/21234772/python-tornado-disable-logging-to-stderr
  #   tornado: critical
  #   tornado.application: error

  scheduler:
    allowed-failures: 3     # number of retries before a task is considered bad
    bandwidth: 100000000    # 100 MB/s estimated worker-worker bandwidth
    blocked-handlers: []
    default-data-size: 1kiB
    # How long to wait until workers or clients are removed from the events log
    # after they have been removed from the scheduler
    events-cleanup-delay: 1h
    idle-timeout: null      # Shut down after this duration, like "1h" or "30 minutes"
    transition-log-length: 100000
    events-log-length: 100000
    work-stealing: True     # workers should steal tasks from each other
    work-stealing-interval: 100ms  # Callback time for work stealing
    worker-ttl: null        # like '60s'. Time to live for workers. They must heartbeat faster than this
    pickle: True            # Is the scheduler allowed to deserialize arbitrary bytestrings
    preload: []             # Run custom modules with Scheduler
    preload-argv: []        # See https://docs.dask.org/en/latest/setup/custom-startup.html
    unknown-task-duration: 500ms  # Default duration for all tasks with unknown durations ("15m", "2h")
    default-task-durations:  # How long we expect function names to run ("1h", "1s") (helps for long tasks)
      rechunk-split: 1us
      split-shuffle: 1us
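    # The duration hints above can also be supplied at runtime instead of in this
    # file. A minimal sketch in Python, assuming a hypothetical task-name prefix
    # "my_function" (the scheduler matches hints against task-name prefixes):
    #
    #     import dask
    #     dask.config.set(
    #         {"distributed.scheduler.default-task-durations": {"my_function": "10 minutes"}}
    #     )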
    validate: False         # Check scheduler state at every step for debugging
    dashboard:
      status:
        task-stream-length: 1000
      tasks:
        task-stream-length: 100000
      tls:
        ca-file: null
        key: null
        cert: null
      bokeh-application:  # keywords to pass to BokehTornado application
        allow_websocket_origin: ["*"]
        keep_alive_milliseconds: 500
        check_unused_sessions_milliseconds: 500
    locks:
      lease-validation-interval: 10s  # The interval in which the scheduler validates staleness of all acquired leases. Must always be smaller than the lease-timeout itself.
      lease-timeout: 30s  # Maximum interval to wait for a Client refresh before a lease is invalidated and released.

    http:
      routes:
        - distributed.http.scheduler.prometheus
        - distributed.http.scheduler.info
        - distributed.http.scheduler.json
        - distributed.http.health
        - distributed.http.proxy
        - distributed.http.statics

    allowed-imports:
      - dask
      - distributed

    active-memory-manager:
      # Set to true to auto-start the Active Memory Manager on Scheduler start; if false
      # you'll have to either manually start it with client.scheduler.amm_start() or run
      # it once with client.scheduler.amm_run().
      start: false
      # Once started, run the AMM cycle every <interval>
      interval: 2s
      policies:
        # Policies that should be executed at every cycle. Any additional keys in each
        # object are passed as keyword arguments to the policy constructor.
        - class: distributed.active_memory_manager.ReduceReplicas
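      # If `start` is left as false, the AMM can be driven from a client session using
      # the calls named in the comment above. A minimal sketch, assuming a scheduler
      # already running at the hypothetical address tcp://127.0.0.1:8786 and using
      # client.sync to await the RPC from a synchronous client:
      #
      #     from distributed import Client
      #     client = Client("tcp://127.0.0.1:8786")
      #     client.sync(client.scheduler.amm_start)   # begin the periodic AMM cycle
      #     client.sync(client.scheduler.amm_run)     # or trigger a single cycle instead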

  worker:
    blocked-handlers: []
    multiprocessing-method: spawn
    use-file-locking: True
    connections:            # Maximum concurrent connections for data
      outgoing: 50          # This helps to control network saturation
      incoming: 10
    preload: []             # Run custom modules with Worker
    preload-argv: []        # See https://docs.dask.org/en/latest/setup/custom-startup.html
    daemon: True
    validate: False         # Check worker state at every step for debugging
    resources: {}           # Key: value pairs specifying worker resources.
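    # `resources` declares abstract worker resources that tasks can request. A minimal
    # sketch, assuming a worker started with `dask-worker ... --resources "GPU=2"`
    # (the resource name "GPU" and the function/data names below are hypothetical):
    #
    #     from distributed import Client
    #     client = Client("tcp://127.0.0.1:8786")                  # hypothetical scheduler address
    #     fut = client.submit(train, data, resources={"GPU": 1})   # run only where a GPU slot is free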
    lifetime:
      duration: null        # Time after which to gracefully shut down the worker
      stagger: 0 seconds    # Random amount by which to stagger lifetimes
      restart: False        # Do we resurrect the worker after the lifetime deadline?
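    # A common use of the lifetime block is rolling worker restarts to contain slow
    # memory leaks. A minimal sketch of equivalent settings applied from Python; the
    # values are illustrative, and they must be in effect in the worker's own process
    # (e.g. via its config file or environment) before the worker starts:
    #
    #     import dask
    #     dask.config.set({
    #         "distributed.worker.lifetime.duration": "4 hours",
    #         "distributed.worker.lifetime.stagger": "10 minutes",
    #         "distributed.worker.lifetime.restart": True,
    #     })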

    profile:
      interval: 10ms        # Time between statistical profiling queries
      cycle: 1000ms         # Time between starting new profile
      low-level: False      # Whether or not to include low-level functions
                            # Requires https://github.com/numba/stacktrace

    memory:
      # When there is an increase in process memory (as observed by the operating
      # system) that is not accounted for by the dask keys stored on the worker, ignore
      # it for this long before considering it in non-critical memory measures.
      # This should be set to be longer than the duration of most dask tasks.
      recent-to-old-time: 30s

      rebalance:
        # Memory measure to rebalance upon. Possible choices are:
        # process
        #     Total process memory, as measured by the OS.
        # optimistic
        #     Managed by dask (instantaneous) + unmanaged (without any increases that
        #     happened in the last <distributed.worker.memory.recent-to-old-time>).
        #     Recommended for use on CPython with large (2 MiB+) numpy-based data chunks.
        # managed_in_memory
        #     Only consider the data allocated by dask in RAM. Recommended if RAM is not
        #     released in a timely fashion back to the OS after the Python objects are
        #     dereferenced, but remains available for reuse by PyMalloc.
        #
        #     If this is your problem on Linux, you should alternatively consider
        #     setting the MALLOC_TRIM_THRESHOLD_ environment variable (note the final
        #     underscore) to a low value; refer to the mallopt man page and to the
        #     comments about M_TRIM_THRESHOLD on
        #     https://sourceware.org/git/?p=glibc.git;a=blob;f=malloc/malloc.c
        # managed
        #     Only consider data allocated by dask, including that spilled to disk.
        #     Recommended if disk occupation of the spill file is an issue.
        measure: optimistic
        # Fraction of worker process memory at which we start potentially sending
        # data to other workers. Ignored when max_memory is not set.
        sender-min: 0.30
        # Fraction of worker process memory at which we stop potentially accepting
        # data from other workers. Ignored when max_memory is not set.
        recipient-max: 0.60
        # Fraction of worker process memory, around the cluster mean, where a worker is
        # neither a sender nor a recipient of data during a rebalance operation. E.g. if
        # the mean cluster occupation is 50%, sender-recipient-gap=0.10 means that only
        # nodes above 55% will donate data and only nodes below 45% will receive it.
        # This helps avoid data bouncing around the cluster repeatedly.
        # Ignored when max_memory is not set.
        sender-recipient-gap: 0.10

      # Fractions of worker process memory at which we take action to avoid memory
      # blowup. Set any of the values to False to turn off the behavior entirely.
      target: 0.60     # target fraction to stay below
      spill: 0.70      # fraction at which we spill to disk
      pause: 0.80      # fraction at which we pause worker threads
      terminate: 0.95  # fraction at which we terminate the worker
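      # Worked example: with a 16 GiB worker memory limit, the fractions above mean
      # the worker tries to stay below ~9.6 GiB (target), starts spilling to disk at
      # ~11.2 GiB (spill), pauses execution of new tasks at ~12.8 GiB (pause), and is
      # terminated by its nanny at ~15.2 GiB (terminate).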

    http:
      routes:
        - distributed.http.worker.prometheus
        - distributed.http.health
        - distributed.http.statics

  nanny:
    preload: []             # Run custom modules with Nanny
    preload-argv: []        # See https://docs.dask.org/en/latest/setup/custom-startup.html
    environ:
      MALLOC_TRIM_THRESHOLD_: 65536
      OMP_NUM_THREADS: 1
      MKL_NUM_THREADS: 1

  client:
    heartbeat: 5s                 # Interval between client heartbeats
    scheduler-info-interval: 2s   # Interval between scheduler-info updates

  deploy:
    lost-worker-timeout: 15s      # Interval after which to hard-close a lost worker job
    cluster-repr-interval: 500ms  # Interval between calls to update cluster-repr for the widget

  adaptive:
    interval: 1s            # Interval between scaling evaluations
    target-duration: 5s     # Time an entire graph calculation is desired to take ("1m", "30m")
    minimum: 0              # Minimum number of workers
    maximum: .inf           # Maximum number of workers
    wait-count: 3           # Number of times a worker should be suggested for removal before removing it
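  # The adaptive defaults above are consumed by cluster managers that support
  # adaptive scaling. A minimal sketch, using a LocalCluster purely for illustration:
  #
  #     from distributed import Client, LocalCluster
  #     cluster = LocalCluster(n_workers=0)
  #     cluster.adapt(minimum=0, maximum=10)   # scale between 0 and 10 workers on demand
  #     client = Client(cluster)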

  comm:
    retry:          # some operations (such as gathering data) are subject to re-tries with the parameters below
      count: 0      # the maximum retry attempts. 0 disables re-trying.
      delay:
        min: 1s     # the first non-zero delay between re-tries
        max: 20s    # the maximum delay between re-tries
    compression: auto
    shard: 64MiB
    offload: 10MiB  # Size after which we choose to offload serialization to another thread
    default-scheme: tcp
    socket-backlog: 2048
    recent-messages-log-length: 0  # number of messages to keep for debugging
    ucx:
      cuda_copy: False        # enable cuda-copy
      tcp: False              # enable tcp
      nvlink: False           # enable cuda_ipc
      infiniband: False       # enable Infiniband
      rdmacm: False           # enable RDMACM
      net-devices: null       # define which interface to use for UCX comm
      reuse-endpoints: null   # enable endpoint reuse

    zstd:
      level: 3      # Compression level, between 1 and 22.
      threads: 0    # Threads to use. 0 for single-threaded, -1 to infer from cpu count.

    timeouts:
      connect: 30s  # time before connecting fails
      tcp: 30s      # time before calling an unresponsive connection dead

    require-encryption: null  # Whether to require encryption on non-local comms

    tls:
      ciphers: null   # Allowed ciphers, specified as an OpenSSL cipher string.
      ca-file: null   # Path to a CA file, in pem format, optional
      scheduler:
        cert: null    # Path to certificate file for scheduler.
        key: null     # Path to key file for scheduler. Alternatively, the key
                      # can be appended to the cert file above, and this field
                      # left blank.
      worker:
        key: null
        cert: null
      client:
        key: null
        cert: null
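    # Example of wiring up mutual TLS with hypothetical certificate paths; a minimal
    # sketch using the Security helper, which mirrors the tls keys above:
    #
    #     from distributed.security import Security
    #     sec = Security(
    #         tls_ca_file="/certs/ca.pem",            # hypothetical paths
    #         tls_client_cert="/certs/client.pem",
    #         tls_client_key="/certs/client-key.pem",
    #         require_encryption=True,
    #     )
    #     # Client("tls://scheduler-host:8786", security=sec)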

    websockets:
      shard: 8MiB

  diagnostics:
    nvml: True
    computations:
      max-history: 100
      ignore-modules:
        - distributed
        - dask
        - xarray
        - cudf
        - cuml
        - prefect
        - xgboost

  ###################
  # Bokeh dashboard #
  ###################

  dashboard:
    link: "{scheme}://{host}:{port}/status"
    export-tool: False
    graph-max-items: 5000   # maximum number of tasks to try to plot in graph view
    prometheus:
      namespace: "dask"

  ##################
  # Administrative #
  ##################

  admin:
    tick:
      interval: 20ms        # time between event loop health checks
      limit: 3s             # time allowed before triggering a warning
    max-error-length: 10000  # Maximum length of traceback to return after an error
    log-length: 10000        # default number of log entries to keep in memory
    log-format: '%(name)s - %(levelname)s - %(message)s'
    pdb-on-err: False        # enter debug mode on scheduling error
    system-monitor:
      interval: 500ms
    event-loop: tornado

rmm:
  pool-size: null
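
# Any value in this file can be overridden without editing it, either with
# dask.config.set from Python or with environment variables that use the DASK_
# prefix and "__" to separate nesting levels ("-" and "_" are interchangeable in
# config keys). A minimal sketch with an illustrative key:
#
#     import dask
#     dask.config.set({"distributed.scheduler.work-stealing": False})
#
#     # equivalent environment variable, read when the dask config is (re)loaded:
#     # export DASK_DISTRIBUTED__SCHEDULER__WORK_STEALING=False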